From bf869d9c7ef257e6cca99095b74e6693a03e0c89 Mon Sep 17 00:00:00 2001 From: Robert Date: Tue, 26 Nov 2024 15:36:48 -0500 Subject: [PATCH] glamr: Improve speed of certain data downloads Adds some infrastructure to pass parameters from view to QuerySet.iterate() calls in glamr.tables() * new glamr.tables.Table.export_sortkey attribute * new ChainedQuerySet.iterate_kw attribute Faster queries for: * ToManyListView with a sample's read/abundance listing export * tax abundance tables + reduce number of queries Closes #56. --- mibios/glamr/tables.py | 25 +++++++++++++++++++++---- mibios/glamr/views.py | 20 ++++++++++++-------- mibios/query.py | 18 +++++++++++------- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/mibios/glamr/tables.py b/mibios/glamr/tables.py index 1c3a4ec..a03f991 100644 --- a/mibios/glamr/tables.py +++ b/mibios/glamr/tables.py @@ -47,6 +47,9 @@ def __init__(self, data=None, view=None, exclude=None, **kwargs): exclude += ((i for i in self._meta.model.get_internal_fields() if i not in exclude)) + self.export_sortkey = None + """ c.f. 
order_by, set via customize_queryset() """ + if data is None: data = self._meta.model.objects.all() elif isinstance(data, ChainedQuerySet): @@ -165,7 +168,7 @@ def as_values(self, exclude_columns=None): self.data.data = (self.data.data .values_list(*select_names) - .iterate(cache=True)) + .iterate(cache=True, sortkey=self.export_sortkey)) yield headers yield from self.as_values_bottom(columns) @@ -344,7 +347,11 @@ def customize_queryset(self, qs): if qs.fk_field is self._meta.model._meta.get_field('sample'): # use sample+ref uniq constraint index, so existing order # by ref_id - qs.iterate_sortkey = 'ref_id' + self.export_sortkey = 'ref_id' + elif hasattr(self.view, 'obj_model'): + if self.view.obj_model._meta.model_name == 'sample': + # assume ToManyListview with sample as object + self.export_sortkey = 'ref_id' else: # For normal HTML table get the function names, don't need or want # those for export @@ -416,8 +423,18 @@ class Meta: order_by = ['-tpm'] def customize_queryset(self, qs): - qs = qs.select_related('taxon') - qs = qs.only('sample_id', 'taxon__taxid', 'taxon__rank', 'tpm') + qs = qs.select_related('taxon', 'sample') + qs = qs.only('sample_id', 'sample__sample_id', 'sample__sample_name', + 'taxon__taxid', 'taxon__rank', 'taxon__name', 'tpm') + + if self.is_for_export(): + if isinstance(qs, ChainedQuerySet): + if qs.fk_field is self._meta.model._meta.get_field('sample'): + # use sample+tax uniq constraint index, so existing order + # by taxon_id + # NOTE: splitting queries by sample this seems slower than + # the alternative (regular qs with iterate()) + self.export_sortkey = 'taxon_id' return qs def render_taxon(self, value): diff --git a/mibios/glamr/views.py b/mibios/glamr/views.py index 7830ec8..3e10bac 100644 --- a/mibios/glamr/views.py +++ b/mibios/glamr/views.py @@ -189,19 +189,23 @@ def get_export_queryset(self, export_option): """ if export_option is self.EXPORT_TABLE: return self.get_queryset() - else: - # Try for related data export (or get 
404 if this fails), other - # more intricate options would need to be implemented by inheriting - # views. - remote_field = self.get_export_remote_field(export_option) - if self.model is Sample: + + # Try for related data export (or get 404 if this fails), other + # more intricate options would need to be implemented by inheriting + # views. + remote_field = self.get_export_remote_field(export_option) + + match self.model._meta.model_name, export_option: + case 'sample', 'functional_abundance': return remote_field.model.objects.all().split_by_fk( remote_field, self.get_queryset(), + iterate_kw=dict(chunk_size=200000), ) - f = {remote_field.name + '__in': self.get_queryset()} - return remote_field.model.objects.filter(**f) + case _: + f = {remote_field.name + '__in': self.get_queryset()} + return remote_field.model.objects.filter(**f) def get_export_remote_field(self, export_option): """ diff --git a/mibios/query.py b/mibios/query.py index 4b2fcfb..59f5a17 100644 --- a/mibios/query.py +++ b/mibios/query.py @@ -1056,8 +1056,8 @@ def iterate(self, cache=None, chunk_size=None, sortkey=None): else: return ValuesListIterable(self, cache, chunk_size, sortkey) - def split_by_fk(self, fk_field, subquery): - return ChainedQuerySet(self, fk_field, subquery) + def split_by_fk(self, fk_field, subquery, iterate_kw=None): + return ChainedQuerySet(self, fk_field, subquery, iterate_kw=iterate_kw) class ChainedQuerySet: @@ -1073,7 +1073,7 @@ class ChainedQuerySet: queries in order to take advantange of indexes and avoiding sequence scan on very large tables.. 
""" - def __init__(self, queryset, fk_field, subqueryset): + def __init__(self, queryset, fk_field, subqueryset, iterate_kw=None): model = queryset.model if isinstance(fk_field, str): fk_field = model._meta.get_field(fk_field) @@ -1091,10 +1091,13 @@ def __init__(self, queryset, fk_field, subqueryset): if fk_field.related_model is not subqueryset.model: raise ValueError('related model does not match subquery\'s model') + if iterate_kw is None: + iterate_kw = {} + self.model = model self.base_qs = queryset self.fk_field = fk_field - self.iterate_sortkey = None + self.iterate_kw = iterate_kw self.subqueryset = subqueryset def _clone(self): @@ -1105,8 +1108,8 @@ def _clone(self): self.base_qs._chain(), self.fk_field, self.subqueryset._chain(), + iterate_kw=self.iterate_kw, ) - obj.iterate_sortkey = self.iterate_sortkey return obj def _apply_queryset_method(self, meth, *args, **kwargs): @@ -1139,8 +1142,9 @@ def iterate(self, **kwargs): kwargs are passed to each Queryset.iterate(). """ - kwargs.setdefault('sortkey', self.iterate_sortkey) - it = ((qs.iterate(**kwargs) for qs in self.get_split_querysets())) + it_kw = dict(self.iterate_kw) + it_kw.update(kwargs) + it = ((qs.iterate(**it_kw) for qs in self.get_split_querysets())) return chain.from_iterable(it) def values_list(self, *args, **kwargs):