Skip to content

Commit

Permalink
Merge pull request #649 from City-of-Helsinki/link-1408-optimize-dist…
Browse files Browse the repository at this point in the history
…inct

Remove use of distinct in event listing
  • Loading branch information
voneiden authored Aug 8, 2023
2 parents 6c8e8fd + 3324685 commit c1d128a
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 86 deletions.
209 changes: 123 additions & 86 deletions events/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from django.core.cache import cache
from django.core.exceptions import PermissionDenied
from django.db import transaction
from django.db.models import Count, F, Prefetch, Q
from django.db.models import Count, Exists, F, OuterRef, Prefetch, Q, QuerySet
from django.db.models.functions import Greatest
from django.db.utils import IntegrityError
from django.http import Http404, HttpResponsePermanentRedirect
Expand Down Expand Up @@ -2511,15 +2511,47 @@ def _get_queryset_from_cache_many(params, param, cache_name, operator, queryset)
return queryset


def _find_keyword_replacements(keyword_ids: list[str]) -> tuple[list[Keyword], bool]:
"""
Convert a list of keyword ids into keyword objects with
replacements applied.
:return: a pair containing a list of keywords and a boolean indicating
whether all keywords were found
"""
keywords = Keyword.objects.filter(id__in=keyword_ids).select_related("replaced_by")
found_all_keywords = len(keyword_ids) == len(keywords)
replaced_keywords = [keyword.get_replacement() or keyword for keyword in keywords]
return list(set(replaced_keywords)), found_all_keywords


def _filter_events_keyword_or(queryset: QuerySet, keyword_ids: list[str]) -> QuerySet:
"""
Given an Event queryset, apply OR filter on keyword ids
"""
keywords, __ = _find_keyword_replacements(keyword_ids)
keyword_ids = [keyword.pk for keyword in keywords]

kw_qs = Event.keywords.through.objects.filter(
event=OuterRef("pk"), keyword__in=keyword_ids
)
audience_qs = Event.audience.through.objects.filter(
event=OuterRef("pk"), keyword__in=keyword_ids
)
return queryset.filter(Exists(kw_qs) | Exists(audience_qs))


def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
"""
Filter events queryset by params
(e.g. self.request.query_params ingit EventViewSet)
"""
# Filter by string (case insensitive). This searches from all fields
# which are marked translatable in translation.py
# Please keep in mind that .distinct() will absolutely kill
# performance of event queries. To avoid duplicate rows in filters
# consider using Exists instead.
# Filtering against the through table can also provide
# benefits (see _filter_events_keyword_or)

# filter to get events with or without a registration.
val = params.get("registration", None)
if val and parse_bool(val, "registration"):
queryset = queryset.exclude(registration=None)
Expand Down Expand Up @@ -2617,21 +2649,29 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
vals = params.get("keyword_set_AND", None)
if vals:
vals = vals.split(",")
keyword_sets = KeywordSet.objects.filter(id__in=vals)
for keyword_set in keyword_sets:
keywords = keyword_set.keywords.all()
qset = Q(keywords__in=keywords)
queryset = queryset.filter(qset)
kw_sets = KeywordSet.objects.filter(id__in=vals).prefetch_related(
Prefetch("keywords", Keyword.objects.all().only("id"))
)
for kw_set in kw_sets:
kw_ids = [kw.id for kw in kw_set.keywords.all()]
subqs = Event.objects.filter(id=OuterRef("pk"), keywords__in=kw_ids)
queryset = queryset.filter(Exists(subqs))

vals = params.get("keyword_set_OR", None)
if vals:
vals = vals.split(",")
keyword_sets = KeywordSet.objects.filter(id__in=vals)
keyword_sets = KeywordSet.objects.filter(id__in=vals).prefetch_related(
Prefetch("keywords", Keyword.objects.all().only("id"))
)
all_keywords = set()
for keyword_set in keyword_sets:
keywords = keyword_set.keywords.all()
all_keywords.update(keywords)
queryset = queryset.filter(keywords__in=all_keywords)

kw_qs = Event.keywords.through.objects.filter(
event=OuterRef("pk"), keyword__in=all_keywords
)
queryset = queryset.filter(Exists(kw_qs))

if "local_ongoing_OR_set" in "".join(params):
count = 1
Expand Down Expand Up @@ -2694,8 +2734,11 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
for i in all_sets:
val = params.get(i, None)
if val:
val = val.split(",")
queryset = queryset.filter(keywords__pk__in=val)
vals = val.split(",")
kw_qs = Event.keywords.through.objects.filter(
event=OuterRef("pk"), keyword__in=vals
)
queryset = queryset.filter(Exists(kw_qs))

val = params.get("internet_based", None)
if val and parse_bool(val, "internet_based"):
Expand All @@ -2707,39 +2750,44 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
val = params.get("combined_text", None)
if val:
val = val.lower()
qset = Q()
vals = val.split(",")
qsets = []

combined_q = Q()
for val in vals:
val_q = Q()
# Free string search from all translated event fields
event_fields = EventTranslationOptions.fields
for field in event_fields:
# check all languages for each field
qset |= _text_qset_by_translated_field(field, val)
val_q |= _text_qset_by_translated_field(field, val)

# Free string search from all translated place fields
place_fields = PlaceTranslationOptions.fields
for field in place_fields:
location_field = "location__" + field
# check all languages for each field
qset |= _text_qset_by_translated_field(location_field, val)
val_q |= _text_qset_by_translated_field(location_field, val)

langs = (
["fi", "sv"]
if re.search("[\u00C0-\u00FF]", val)
else ["fi", "sv", "en"]
)
tri = [TrigramSimilarity(f"name_{i}", val) for i in langs]
keywords = (
keywords = list(
Keyword.objects.annotate(simile=Greatest(*tri))
.filter(simile__gt=0.2)
.order_by("-simile")[:3]
)
if keywords:
qset |= Q(keywords__in=keywords)
qsets.append(qset)
qset = Q()
queryset = queryset.filter(*qsets)

val_q |= Q(keywords__in=keywords)

combined_q &= val_q

# FYI: Biggest slowdown in this filter is the ordering of keywords simile
queryset = queryset.filter(
Exists(Event.objects.filter(combined_q, id=OuterRef("pk")).only("id"))
)

val = params.get("text", None)
if val:
Expand Down Expand Up @@ -2866,75 +2914,46 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
val = val.split(",")
queryset = queryset.filter(location_id__in=val)

# Filter by keyword id, multiple ids separated by comma
val = params.get("keyword", None)
if val:
val = val.split(",")
try:
# replaced keywords are looked up for backwards compatibility
val = [
getattr(Keyword.objects.get(id=kid).replaced_by, "id", None) or kid
for kid in val
]
except Keyword.DoesNotExist:
# the user asked for an unknown keyword
queryset = queryset.none()
queryset = queryset.filter(
Q(keywords__pk__in=val) | Q(audience__pk__in=val)
).distinct()
if val := params.get("keyword", None):
queryset = _filter_events_keyword_or(queryset, val.split(","))

# 'keyword_OR' behaves the same way as 'keyword'
val = params.get("keyword_OR", None)
if val:
val = val.split(",")
try:
# replaced keywords are looked up for backwards compatibility
val = [
getattr(Keyword.objects.get(id=kid).replaced_by, "id", None) or kid
for kid in val
]
except Keyword.DoesNotExist:
# the user asked for an unknown keyword
queryset = queryset.none()
queryset = queryset.filter(
Q(keywords__pk__in=val) | Q(audience__pk__in=val)
).distinct()
if val := params.get("keyword_OR", None):
queryset = _filter_events_keyword_or(queryset, val.split(","))

# Filter by keyword ids requiring all keywords to be present in event
val = params.get("keyword_AND", None)
if val:
val = val.split(",")
for keyword_id in val:
try:
# replaced keywords are looked up for backwards compatibility
val = (
getattr(Keyword.objects.get(id=keyword_id).replaced_by, "id", None)
or keyword_id
)
except Keyword.DoesNotExist:
# the user asked for an unknown keyword
queryset = queryset.none()
queryset = queryset.filter(
Q(keywords__pk=keyword_id) | Q(audience__pk=keyword_id)
keywords, found_all_keywords = _find_keyword_replacements(val.split(","))

# If some keywords were not found, AND can not match
if not found_all_keywords:
return queryset.none()

q = Q()
for keyword in keywords:
q &= Q(keywords__pk=keyword.pk) | Q(audience__pk=keyword.pk)

for keyword in keywords:
kw_qs = Event.keywords.through.objects.filter(
event=OuterRef("pk"), keyword=keyword
)
queryset = queryset.distinct()
audience_qs = Event.audience.through.objects.filter(
event=OuterRef("pk"), keyword=keyword
)
queryset = queryset.filter(Exists(kw_qs) | Exists(audience_qs))

# Negative filter for keyword ids
val = params.get("keyword!", None)
if val:
val = val.split(",")
try:
# replaced keywords are looked up for backwards compatibility
val = [
getattr(Keyword.objects.get(id=kid).replaced_by, "id", None) or kid
for kid in val
]
except Keyword.DoesNotExist:
# the user asked for an unknown keyword
pass
keywords, __ = _find_keyword_replacements(val.split(","))
keyword_ids = [keyword.pk for keyword in keywords]

# This yields an AND NOT ((EXISTS.. keywords )) clause in SQL
# No distinct needed!
queryset = queryset.exclude(
Q(keywords__pk__in=val) | Q(audience__pk__in=val)
).distinct()
Q(keywords__pk__in=keyword_ids) | Q(audience__pk__in=keyword_ids)
)

# filter only super or non-super events. to be deprecated?
val = params.get("recurring", None)
Expand Down Expand Up @@ -3018,22 +3037,36 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
short_desc_arg = {"short_description_" + lang + "__isnull": False}
q = (
q
| Q(in_language__id=lang)
| Exists(
Event.in_language.through.objects.filter(
event=OuterRef("pk"), language=lang
)
)
| Q(**name_arg)
| Q(**desc_arg)
| Q(**short_desc_arg)
)
else:
q = q | Q(in_language__id=lang)
queryset = queryset.filter(q).distinct()
q = q | Exists(
Event.in_language.through.objects.filter(
event=OuterRef("pk"), language=lang
)
)

queryset = queryset.filter(q)

# Filter by in_language field only
val = params.get("in_language", None)
if val:
val = val.split(",")
q = Q()
for lang in val:
q = q | Q(in_language__id=lang)
q = q | Exists(
Event.in_language.through.objects.filter(
event=OuterRef("pk"), language=lang
)
)

queryset = queryset.filter(q)

val = params.get("starts_after", None)
Expand Down Expand Up @@ -3119,8 +3152,12 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
# Filter by free offer
val = params.get("is_free", None)
if val and val.lower() in ["true", "false"]:
# Include events that have at least one free offer
if val.lower() == "true":
queryset = queryset.filter(offers__is_free=True)
queryset = queryset.filter(
Exists(Offer.objects.filter(event=OuterRef("pk"), is_free=True))
)
# Include events that have no free offers
elif val.lower() == "false":
queryset = queryset.exclude(offers__is_free=True)

Expand All @@ -3146,7 +3183,7 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
| Q(audience_max_age__lt=upper_boundary)
| Q(Q(audience_min_age=None) & Q(audience_max_age=None))
)
return queryset.distinct()
return queryset


class EventExtensionFilterBackend(BaseFilterBackend):
Expand Down
7 changes: 7 additions & 0 deletions events/tests/test_event_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,21 +496,28 @@ def test_language_filter(api_client, event, event2, event3):

# Finnish should be the default language
get_list_and_assert_events("language=fi", [event, event2, event3])
# in_language is explicit filter on the in_language field, so no results for fi
get_list_and_assert_events("in_language=fi", [])

# Swedish should have two events (matches in_language and name_sv)
get_list_and_assert_events("language=sv", [event, event2])
get_list_and_assert_events("in_language=sv", [event2])

# English should have one event (matches in_language)
get_list_and_assert_events("language=en", [event2])
get_list_and_assert_events("in_language=en", [event2])

# Russian should have one event (matches name_ru)
get_list_and_assert_events("language=ru", [event3])
get_list_and_assert_events("in_language=ru", [])

# Chinese should have no events
get_list_and_assert_events("language=zh_hans", [])
get_list_and_assert_events("in_language=zh_hans", [])

# Estonian should have one event (matches in_language), even without translations available
get_list_and_assert_events("language=et", [event3])
get_list_and_assert_events("in_language=et", [event3])


@pytest.mark.django_db
Expand Down
20 changes: 20 additions & 0 deletions events/tests/test_keyword.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
from rest_framework.exceptions import ValidationError

from events.api import _find_keyword_replacements
from events.tests.factories import KeywordFactory


Expand Down Expand Up @@ -86,3 +87,22 @@ def test_keyword_get_replacement_multi_level():
old_keyword_2 = KeywordFactory(deprecated=True, replaced_by=old_keyword_1)

assert old_keyword_2.get_replacement().pk == new_keyword.pk


@pytest.mark.django_db
def test_find_keyword_replacements():
new_keyword = KeywordFactory()
replaced_keyword = KeywordFactory(deprecated=True, replaced_by=new_keyword)
other_keyword = KeywordFactory()
unknown_keyword_id = "keyword:doesnotexist"

assert _find_keyword_replacements([replaced_keyword.pk]) == ([new_keyword], True)
assert _find_keyword_replacements([new_keyword.pk]) == ([new_keyword], True)
assert _find_keyword_replacements([new_keyword.pk, replaced_keyword.pk]) == (
[new_keyword],
True,
)
assert _find_keyword_replacements([other_keyword.pk, unknown_keyword_id]) == (
[other_keyword],
False,
)

0 comments on commit c1d128a

Please sign in to comment.