From 8a38c1f1a1c41aa221d1f4de11bb3e5b104eeb75 Mon Sep 17 00:00:00 2001 From: Robert Date: Fri, 22 Nov 2024 15:39:10 -0500 Subject: [PATCH] mibios/query: Add ChainedQuerySet container/wrapper The ChainedQuerySet hooks into the QuerySet.iterate() machinery via the new QuerySet.split_by_fk() method to be used as an alternative to a filter(relation__in=subquery) when the subquery prevents an index scan (the first application will be data export of ReadAbundance for a selection of samples.) This is part 1 of 3 of a series of commits to further address slow data export of very large tables. --- mibios/query.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/mibios/query.py b/mibios/query.py index 8715abb..43d58bc 100644 --- a/mibios/query.py +++ b/mibios/query.py @@ -1,6 +1,7 @@ from collections import OrderedDict from decimal import Decimal from inspect import getgeneratorstate, getgeneratorlocals +from itertools import chain import json from logging import getLogger from operator import attrgetter, itemgetter @@ -1043,3 +1044,110 @@ def iterate(self, chunk_size=None, cache=None): return ModelIterable(self, chunk_size, cache) else: return ValuesListIterable(self, chunk_size, cache) + + def split_by_fk(self, fk_field, subquery): + return ChainedQuerySet(self, fk_field, subquery) + + +class ChainedQuerySet: + """ + Chain querysets, split along relation to avoid WHERE w/subquery. + + This is a wrapper/container for querysets. It replaces a QuerySet that had + filter(relation__in=subquery) applied with a chain of querysets with + filter(relation__pk=a_pk) for each object in the subquery. The resulting + objects can with limitations be used like the original queryset. + + It is primarily meant to have iterate() applied further splitting the + queries in order to take advantage of indexes and avoid sequential scans + on very large tables. 
+ """ + def __init__(self, queryset, fk_field, subqueryset): + model = queryset.model + if isinstance(fk_field, str): + fk_field = model._meta.get_field(fk_field) + else: + try: + _fk_field = model._meta.get_field(fk_field.name) + except FieldDoesNotExist: + raise ValueError('field does not belong to model of queryset') + else: + if _fk_field is not fk_field: + raise ValueError('not a field of model (but name match?)') + if not fk_field.many_to_one: + raise ValueError('only ForeignKey fields are admitted here') + + if fk_field.related_model is not subqueryset.model: + raise ValueError('related model does not match subquery\'s model') + + self.model = model + self.base_qs = queryset + self.fk_field = fk_field + self.iterate_sortkey = None + self.subqueryset = subqueryset + + def _clone(self): + """ + Make a copy of us ready for other operations on our querysets + """ + obj = self.__class__( + self.base_qs._chain(), + self.fk_field, + self.subqueryset._chain(), + ) + obj.iterate_sortkey = self.iterate_sortkey + return obj + + def _apply_queryset_method(self, meth, *args, **kwargs): + """ + Apply a queryset method to all members of the chain. + + meth: str, name of queryset method + + The args and kwargs are passed to the method. Only methods that also + return a QuerySet should be used, else the result is unlikely to do any + good. Returns a new instance. + """ + obj = self._clone() + qs_meth = getattr(obj.base_qs, meth) + obj.base_qs = qs_meth(*args, **kwargs) + return obj + + def get_split_querysets(self): + """ + Generate the queryset iterables. + """ + for pk in self.subqueryset.values_list('pk', flat=True): + yield self.base_qs.filter(**{self.fk_field.attname: pk}) + + # Below we implement a few of the usual queryset methods + + def iterate(self, **kwargs): + """ + Chains QuerySet.iterate() calls along split querysets + + kwargs are passed to each QuerySet.iterate(). 
+ """ + kwargs.setdefault('sortkey', self.iterate_sortkey) + it = ((qs.iterate(**kwargs) for qs in self.get_split_querysets())) + return chain.from_iterable(it) + + def values_list(self, *args, **kwargs): + return self._apply_queryset_method('values_list', *args, **kwargs) + + def prefetch_related(self, *args, **kwargs): + return self._apply_queryset_method('prefetch_related', *args, **kwargs) + + def count(self): + """ + Not implemented but declared here to make class pass django_tables2's + TableQuerysetData.validate(). + """ + raise NotImplementedError + + def order_by(self, *field_names): + """ + Not implemented but declared here to make class pass django_tables2's + TableQuerysetData.validate(). + """ + raise NotImplementedError