glamr: Improve speed of certain data downloads
Adds some infrastructure to pass parameters from the view to
QuerySet.iterate() calls in glamr.tables; a sketch of this flow follows below.

   * new glamr.tables.Table.export_sortkey attribute
   * new ChainedQuerySet.iterate_kw attribute

Faster queries for:
   * ToManyListView with a sample's read/abundance listing export
   * tax abundance tables, plus a reduced number of queries

Closes #56.
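
For orientation, a minimal, self-contained sketch of the parameter flow this commit sets up: the table layer records an export sort key in customize_queryset() and hands it to iterate() during export. DemoTable and DemoData are illustrative stand-ins only, not the real Table or QuerySet classes.

class DemoData:
    """Toy data source whose iterate() takes a sortkey, like QuerySet.iterate()."""

    def __init__(self, rows):
        self.rows = rows

    def iterate(self, sortkey=None):
        # sort only if a key was requested, then yield row by row
        rows = sorted(self.rows, key=lambda r: r[sortkey]) if sortkey else self.rows
        return iter(rows)


class DemoTable:
    """Toy stand-in for glamr.tables.Table (illustrative only)."""

    def __init__(self, data):
        self.data = data
        # set via customize_queryset(), consumed by the export path
        self.export_sortkey = None

    def customize_queryset(self):
        # e.g. ref_id, so the DB can walk the existing sample+ref index
        self.export_sortkey = 'ref_id'

    def as_values(self):
        # the real export path calls .values_list(...).iterate(cache=True, sortkey=...)
        yield from self.data.iterate(sortkey=self.export_sortkey)


table = DemoTable(DemoData([{'ref_id': 2, 'tpm': 0.1}, {'ref_id': 1, 'tpm': 0.5}]))
table.customize_queryset()
print(list(table.as_values()))   # rows come out ordered by ref_id

In the real code the sort key is chosen so the database can use an existing unique-constraint index (e.g. sample+ref) instead of sorting a very large result set.
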
robert102 committed Nov 26, 2024
1 parent 26743ed commit bf869d9
Showing 3 changed files with 44 additions and 19 deletions.
25 changes: 21 additions & 4 deletions mibios/glamr/tables.py
@@ -47,6 +47,9 @@ def __init__(self, data=None, view=None, exclude=None, **kwargs):
exclude += ((i for i in self._meta.model.get_internal_fields()
if i not in exclude))

self.export_sortkey = None
""" c.f. order_by, set via customize_queryset() """

if data is None:
data = self._meta.model.objects.all()
elif isinstance(data, ChainedQuerySet):
@@ -165,7 +168,7 @@ def as_values(self, exclude_columns=None):

self.data.data = (self.data.data
.values_list(*select_names)
.iterate(cache=True))
.iterate(cache=True, sortkey=self.export_sortkey))

yield headers
yield from self.as_values_bottom(columns)
@@ -344,7 +347,11 @@ def customize_queryset(self, qs):
if qs.fk_field is self._meta.model._meta.get_field('sample'):
# use sample+ref uniq constraint index, so existing order
# by ref_id
qs.iterate_sortkey = 'ref_id'
self.export_sortkey = 'ref_id'
elif hasattr(self.view, 'obj_model'):
if self.view.obj_model._meta.model_name == 'sample':
# assume ToManyListView with sample as object
self.export_sortkey = 'ref_id'
else:
# For normal HTML table get the function names, don't need or want
# those for export
@@ -416,8 +423,18 @@ class Meta:
order_by = ['-tpm']

def customize_queryset(self, qs):
qs = qs.select_related('taxon')
qs = qs.only('sample_id', 'taxon__taxid', 'taxon__rank', 'tpm')
qs = qs.select_related('taxon', 'sample')
qs = qs.only('sample_id', 'sample__sample_id', 'sample__sample_name',
'taxon__taxid', 'taxon__rank', 'taxon__name', 'tpm')

if self.is_for_export():
if isinstance(qs, ChainedQuerySet):
if qs.fk_field is self._meta.model._meta.get_field('sample'):
# use sample+tax uniq constraint index, so existing order
# by taxon_id
# NOTE: splitting queries by sample seems slower than
# the alternative (a regular qs with iterate())
self.export_sortkey = 'taxon_id'
return qs

def render_taxon(self, value):
20 changes: 12 additions & 8 deletions mibios/glamr/views.py
@@ -189,19 +189,23 @@ def get_export_queryset(self, export_option):
"""
if export_option is self.EXPORT_TABLE:
return self.get_queryset()
else:
# Try for related data export (or get 404 if this fails), other
# more intricate options would need to be implemented by inheriting
# views.
remote_field = self.get_export_remote_field(export_option)
if self.model is Sample:

# Try for related data export (or get 404 if this fails), other
# more intricate options would need to be implemented by inheriting
# views.
remote_field = self.get_export_remote_field(export_option)

match self.model._meta.model_name, export_option:
case 'sample', 'functional_abundance':
return remote_field.model.objects.all().split_by_fk(
remote_field,
self.get_queryset(),
iterate_kw=dict(chunk_size=200000),
)

f = {remote_field.name + '__in': self.get_queryset()}
return remote_field.model.objects.filter(**f)
case _:
f = {remote_field.name + '__in': self.get_queryset()}
return remote_field.model.objects.filter(**f)

def get_export_remote_field(self, export_option):
"""
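
To make the dispatch above easier to follow, a hedged, standalone sketch of the match/case routing: only the large functional-abundance export for samples goes through the split-by-foreign-key path with a big chunk size, everything else falls back to a plain filtered query. demo_get_export_queryset, its dict return values, and 'some_other_export' are illustrative, not the real view API.

def demo_get_export_queryset(model_name, export_option, sample_ids):
    """Illustrative only; mirrors the shape of the match/case dispatch above."""
    match model_name, export_option:
        case 'sample', 'functional_abundance':
            # very large related table: split per sample, stream in big chunks
            return {'strategy': 'split_by_fk',
                    'fk_field': 'sample',
                    'subquery': sample_ids,
                    'iterate_kw': {'chunk_size': 200000}}
        case _:
            # default: a single filtered query over the related model
            return {'strategy': 'filter',
                    'filter': {'sample__in': sample_ids}}


print(demo_get_export_queryset('sample', 'functional_abundance', [1, 2, 3]))
print(demo_get_export_queryset('sample', 'some_other_export', [1, 2, 3]))
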
18 changes: 11 additions & 7 deletions mibios/query.py
@@ -1056,8 +1056,8 @@ def iterate(self, cache=None, chunk_size=None, sortkey=None):
else:
return ValuesListIterable(self, cache, chunk_size, sortkey)

def split_by_fk(self, fk_field, subquery):
return ChainedQuerySet(self, fk_field, subquery)
def split_by_fk(self, fk_field, subquery, iterate_kw=None):
return ChainedQuerySet(self, fk_field, subquery, iterate_kw=iterate_kw)


class ChainedQuerySet:
@@ -1073,7 +1073,7 @@ class ChainedQuerySet:
queries in order to take advantage of indexes and avoid sequence scans
on very large tables.
"""
def __init__(self, queryset, fk_field, subqueryset):
def __init__(self, queryset, fk_field, subqueryset, iterate_kw=None):
model = queryset.model
if isinstance(fk_field, str):
fk_field = model._meta.get_field(fk_field)
@@ -1091,10 +1091,13 @@ def __init__(self, queryset, fk_field, subqueryset):
if fk_field.related_model is not subqueryset.model:
raise ValueError('related model does not match subquery\'s model')

if iterate_kw is None:
iterate_kw = {}

self.model = model
self.base_qs = queryset
self.fk_field = fk_field
self.iterate_sortkey = None
self.iterate_kw = iterate_kw
self.subqueryset = subqueryset

def _clone(self):
@@ -1105,8 +1108,8 @@ def _clone(self):
self.base_qs._chain(),
self.fk_field,
self.subqueryset._chain(),
iterate_kw=self.iterate_kw,
)
obj.iterate_sortkey = self.iterate_sortkey
return obj

def _apply_queryset_method(self, meth, *args, **kwargs):
@@ -1139,8 +1142,9 @@ def iterate(self, **kwargs):
kwargs are passed to each QuerySet.iterate().
"""
kwargs.setdefault('sortkey', self.iterate_sortkey)
it = ((qs.iterate(**kwargs) for qs in self.get_split_querysets()))
it_kw = dict(self.iterate_kw)
it_kw.update(kwargs)
it = ((qs.iterate(**it_kw) for qs in self.get_split_querysets()))
return chain.from_iterable(it)

def values_list(self, *args, **kwargs):
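
Finally, a small self-contained sketch of the ChainedQuerySet behaviour shown above: work is split into one sub-iteration per foreign-key value (so each piece can use an index), and the per-instance iterate_kw defaults are merged with call-time arguments, which win. DemoChainedQuerySet and get_split_rows are toy stand-ins, not the real class; the chunk_size default is ignored here, whereas in the real code it sizes the database fetches.

from itertools import chain


class DemoChainedQuerySet:
    """Toy stand-in: one sub-iteration per foreign-key value, with iterate defaults."""

    def __init__(self, rows, fk_name, fk_values, iterate_kw=None):
        self.rows = rows                      # list of row dicts
        self.fk_name = fk_name                # e.g. 'sample_id'
        self.fk_values = fk_values            # values from the subquery
        self.iterate_kw = {} if iterate_kw is None else iterate_kw

    def get_split_rows(self):
        # one "query" per foreign-key value, like get_split_querysets()
        for value in self.fk_values:
            yield [r for r in self.rows if r[self.fk_name] == value]

    def iterate(self, **kwargs):
        it_kw = dict(self.iterate_kw)   # instance defaults, e.g. chunk_size
        it_kw.update(kwargs)            # call-time arguments override them
        sortkey = it_kw.get('sortkey')
        parts = (
            sorted(part, key=lambda r: r[sortkey]) if sortkey else part
            for part in self.get_split_rows()
        )
        return chain.from_iterable(parts)


rows = [
    {'sample_id': 1, 'ref_id': 3},
    {'sample_id': 2, 'ref_id': 2},
    {'sample_id': 1, 'ref_id': 1},
]
cqs = DemoChainedQuerySet(rows, 'sample_id', [1, 2],
                          iterate_kw={'chunk_size': 200000})
for row in cqs.iterate(sortkey='ref_id'):
    print(row)
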
