fix: respect small page[size] when streaming search results
aaxelb committed Jan 27, 2025
1 parent 4e52c47 commit b512a77
Showing 4 changed files with 70 additions and 49 deletions.
2 changes: 1 addition & 1 deletion trove/render/_simple_trovesearch.py
@@ -92,7 +92,7 @@ def _get_card_content(
_card_content = (
next(self.response_gathering.ask(TROVE.resourceMetadata, focus=card))
if graph is None
else next(graph.q(card, TROVE.resourceMetadata))
else next(graph.q(card, TROVE.resourceMetadata), None)
)
elif isinstance(card, frozenset):
_card_content = next(
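
The change above adds a default to next(), so a card whose graph holds no trove:resourceMetadata yields None instead of raising StopIteration mid-render. A standalone illustration of that difference in plain Python (not the trove graph API):

    # without a default, an exhausted iterator raises StopIteration
    _empty = iter(())
    try:
        next(_empty)
    except StopIteration:
        pass
    # with a default, "nothing found" becomes None and the caller decides what to do
    assert next(iter(()), None) is None
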
15 changes: 14 additions & 1 deletion trove/trovesearch/page_cursor.py
@@ -49,6 +49,10 @@ def bounded_page_size(self) -> int:
else int(self.page_size)
)

@property
def is_complete_page(self) -> bool:
return self.bounded_page_size == self.page_size

def as_queryparam_value(self) -> str:
_cls_key = _PageCursorTypes(type(self)).name
_as_json = json.dumps([_cls_key, *dataclasses.astuple(self)])
@@ -82,10 +86,19 @@ class OffsetCursor(PageCursor):
# total_count: int | float (from PageCursor)
start_offset: int = 0

@property
def bounded_page_size(self) -> int:
# overrides PageCursor
_bounded_page_size = super().bounded_page_size
if (_bounded_page_size < self.page_size < MAX_OFFSET):
_remaining = self.page_size - self.start_offset
_bounded_page_size = int(min(_bounded_page_size, _remaining))
return _bounded_page_size

def is_valid(self) -> bool:
_end_offset = (
self.total_count
if self.bounded_page_size == self.page_size
if self.is_complete_page
else min(self.total_count, self.page_size)
)
return (
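
Together, the new is_complete_page property and the OffsetCursor.bounded_page_size override are what let a small page[size] end a streamed response. A minimal, self-contained sketch of that interplay (the class shapes follow this diff, but MAX_PAGE_SIZE, MAX_OFFSET, and all concrete values here are illustrative stand-ins, not the real trove constants):

    import dataclasses

    MAX_PAGE_SIZE = 13   # assumed cap on results fetched by one backend query
    MAX_OFFSET = 9997    # assumed cap on offset-based paging

    @dataclasses.dataclass
    class PageCursor:
        page_size: float       # may be math.inf for "stream everything"
        total_count: float = 0

        @property
        def bounded_page_size(self) -> int:
            return (
                MAX_PAGE_SIZE
                if self.page_size > MAX_PAGE_SIZE
                else int(self.page_size)
            )

        @property
        def is_complete_page(self) -> bool:
            # one bounded query already covers everything that was asked for
            return self.bounded_page_size == self.page_size

    @dataclasses.dataclass
    class OffsetCursor(PageCursor):
        start_offset: int = 0

        @property
        def bounded_page_size(self) -> int:
            # overrides PageCursor: for a finite page[size] larger than one
            # bounded query, never fetch past what the client requested
            _bounded = super().bounded_page_size
            if _bounded < self.page_size < MAX_OFFSET:
                _remaining = self.page_size - self.start_offset
                _bounded = int(min(_bounded, _remaining))
            return _bounded

    # a small request fits in one bounded query, so the page is already complete:
    assert OffsetCursor(page_size=7, total_count=1000).is_complete_page
    # a larger (but finite) request streams in bounded batches and tapers off at
    # the end: page[size]=30 with a cap of 13 fetches 13, 13, then 4 (not 13 again)
    assert OffsetCursor(page_size=30, total_count=1000, start_offset=26).bounded_page_size == 4
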
2 changes: 2 additions & 0 deletions trove/trovesearch/search_handle.py
@@ -63,6 +63,8 @@ def __post_init__(self):
return _page

def get_next_streaming_handle(self) -> typing.Self | None:
if self.cursor.is_complete_page:
return None
_next_cursor = self.cursor.next_cursor()
if (_next_cursor is not None) and (self.handler is not None):
assert isinstance(self.search_params, CardsearchParams)
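
With the early return above, a streaming consumer stops as soon as the cursor already describes a complete page. A rough sketch of how such a loop consumes handles (mirroring gather_cardsearch_page in trovesearch_gathering.py below; everything except get_next_streaming_handle, is_complete_page, and search_result_page is illustrative):

    def stream_pages(handle):
        # keep yielding result pages until there is no next streaming handle;
        # with this commit, a handle whose cursor is already a complete page
        # (small page[size]) produces no next handle, so the stream ends after one batch
        while handle is not None:
            yield list(handle.search_result_page or ())
            handle = handle.get_next_streaming_handle()
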
100 changes: 53 additions & 47 deletions trove/trovesearch/trovesearch_gathering.py
@@ -116,7 +116,7 @@ class IndexcardFocus(_TypedFocus):

# additional dataclass fields
indexcard: trove_db.Indexcard = dataclasses.field(compare=False)
resourceMetadata: Any = dataclasses.field(compare=False, default=None)
resourceMetadata: Any = dataclasses.field(compare=False, default=None, repr=False)


# TODO: per-field text search in rdf
@@ -157,7 +157,7 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs):
while _current_handle is not None:
_result_page = []
_card_foci = _load_cards_and_contents(
(_result.card_iri for _result in _current_handle.search_result_page),
card_iris=(_result.card_iri for _result in _current_handle.search_result_page),
deriver_iri=deriver_iri,
)
for _result in _current_handle.search_result_page or ():
@@ -175,10 +175,11 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs):
(TROVE.indexCard, _result.card_iri),
*_text_evidence_twoples,
)))
# hack around (current) limitations of primitive_metadata.gather:
# yield a redundant triple to make this IndexcardFocus gatherable
# hack around (current) limitations of primitive_metadata.gather
# (what with all these intermediate blank nodes and sequences):
# yield trove:resourceMetadata here (instead of another gatherer)
_card_focus = _card_foci[_result.card_iri]
yield (_card_focus, RDF.type, IndexcardFocus.TYPE_IRI)
yield (_result.card_iri, TROVE.resourceMetadata, _card_focus.resourceMetadata)
yield (TROVE.searchResultPage, sequence(_result_page))
_current_handle = _current_handle.get_next_streaming_handle()

@@ -226,39 +227,26 @@ def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs):
if _result.value_iri
}
if _value_iris:
_value_indexcards = (
trove_db.Indexcard.objects
.filter(
focus_identifier_set__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iris(_value_iris)
),
derived_indexcard_set__deriver_identifier__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iri(deriver_iri)
),
)
.prefetch_related('focus_identifier_set')
)
_card_foci = _load_cards_and_contents(value_iris=_value_iris, deriver_iri=deriver_iri)
else:
_value_indexcards = []
_cards_by_suffuniq_iri = {
_identifier.sufficiently_unique_iri: _indexcard
for _indexcard in _value_indexcards
for _identifier in _indexcard.focus_identifier_set.all()
_card_foci = {}
_card_foci_by_suffuniq_iri: dict[str, IndexcardFocus] = {
_identifier.sufficiently_unique_iri: _focus
for _focus in _card_foci.values()
for _identifier in _focus.indexcard.focus_identifier_set.all()
}
for _result in focus.search_handle.search_result_page or ():
_indexcard_obj = None
if _result.value_iri is not None:
_indexcard = _cards_by_suffuniq_iri.get(
_card_focus = _card_foci_by_suffuniq_iri.get(
get_sufficiently_unique_iri(_result.value_iri),
)
if _indexcard is not None:
_indexcard_obj = _indexcard.get_iri()
# hack around (current) limitations of primitive_metadata.gather:
# yield a redundant triple to make this IndexcardFocus gatherable
_card_focus = IndexcardFocus.new(_indexcard_obj, indexcard=_indexcard)
yield (_card_focus, RDF.type, IndexcardFocus.TYPE_IRI)
if _card_focus is not None:
_indexcard_obj = _card_focus.indexcard.get_iri()
# hack around (current) limitations of primitive_metadata.gather
# (what with all these intermediate blank nodes and sequences):
# yield trove:resourceMetadata here (instead of another gatherer)
yield (_indexcard_obj, TROVE.resourceMetadata, _card_focus.resourceMetadata)
if _indexcard_obj is None:
# no actual indexcard; put what we know in a blanknode-indexcard
_indexcard_obj = _valuesearch_result_as_indexcard_blanknode(_result)
@@ -325,31 +313,39 @@ def gather_card_contents(focus: IndexcardFocus, *, deriver_iri, **kwargs):
yield (TROVE.resourceMetadata, focus.resourceMetadata)
else:
_iri = focus.single_iri()
_loaded_foci = _load_cards_and_contents([_iri], deriver_iri)
_loaded_foci = _load_cards_and_contents(card_iris=[_iri], deriver_iri=deriver_iri)
_loaded_metadata = _loaded_foci[_iri].resourceMetadata
yield (TROVE.resourceMetadata, _loaded_metadata)


def _load_cards_and_contents(card_iris, deriver_iri) -> dict[str, IndexcardFocus]:
def _load_cards_and_contents(*, card_iris=None, value_iris=None, deriver_iri) -> dict[str, IndexcardFocus]:
return (
_load_cards_and_extracted_rdf_contents(card_iris)
_load_cards_and_extracted_rdf_contents(card_iris, value_iris)
if deriver_iri is None
else _load_cards_and_derived_contents(card_iris, deriver_iri)
else _load_cards_and_derived_contents(card_iris, value_iris, deriver_iri)
)


def _load_cards_and_extracted_rdf_contents(card_iris) -> dict[str, IndexcardFocus]:
def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> dict[str, IndexcardFocus]:
_card_namespace = trove_indexcard_namespace()
_indexcard_uuids = {
iri_minus_namespace(_card_iri, namespace=_card_namespace)
for _card_iri in card_iris
}
_indexcard_rdf_qs = (
trove_db.LatestIndexcardRdf.objects
.filter(indexcard__uuid__in=_indexcard_uuids)
.select_related('indexcard')
.prefetch_related('indexcard__focus_identifier_set')
)
if card_iris is not None:
_indexcard_uuids = {
iri_minus_namespace(_card_iri, namespace=_card_namespace)
for _card_iri in card_iris
}
_indexcard_rdf_qs = _indexcard_rdf_qs.filter(indexcard__uuid__in=_indexcard_uuids)
if value_iris is not None:
_indexcard_rdf_qs = _indexcard_rdf_qs.filter(
indexcard__focus_identifier_set__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iris(value_iris)
),
)
_card_foci: dict[str, IndexcardFocus] = {}
for _indexcard_rdf in _indexcard_rdf_qs:
_card = _indexcard_rdf.indexcard
@@ -366,17 +362,12 @@ def _load_cards_and_extracted_rdf_contents(card_iris) -> dict[str, IndexcardFocus]:
return _card_foci


def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> dict[str, IndexcardFocus]:
def _load_cards_and_derived_contents(card_iris, value_iris, deriver_iri: str) -> dict[str, IndexcardFocus]:
_card_namespace = trove_indexcard_namespace()
_indexcard_uuids = {
iri_minus_namespace(_card_iri, namespace=_card_namespace)
for _card_iri in card_iris
}
# include pre-formatted data from a DerivedIndexcard
_derived_indexcard_qs = (
trove_db.DerivedIndexcard.objects
.filter(
upriver_indexcard__uuid__in=_indexcard_uuids,
deriver_identifier__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iri(deriver_iri)
@@ -385,6 +376,21 @@ def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> dict[str, IndexcardFocus]:
.select_related('upriver_indexcard')
.prefetch_related('upriver_indexcard__focus_identifier_set')
)
if card_iris is not None:
_indexcard_uuids = {
iri_minus_namespace(_card_iri, namespace=_card_namespace)
for _card_iri in card_iris
}
_derived_indexcard_qs = _derived_indexcard_qs.filter(
upriver_indexcard__uuid__in=_indexcard_uuids,
)
if value_iris is not None:
_derived_indexcard_qs = _derived_indexcard_qs.filter(
upriver_indexcard__focus_identifier_set__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iris(value_iris)
),
)
_card_foci: dict[str, IndexcardFocus] = {}
for _derived in _derived_indexcard_qs:
_card_iri = _derived.upriver_indexcard.get_iri()
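
The reworked loaders above start from one queryset and narrow it by card_iris and/or value_iris, whichever were given. The same conditional-filter pattern, reduced to plain Python dictionaries in place of the Django models (all names and data below are illustrative):

    def load_cards(all_cards, *, card_iris=None, value_iris=None):
        # start from everything, then narrow by whichever filters were given
        _matched = list(all_cards)
        if card_iris is not None:
            _wanted = set(card_iris)
            _matched = [_card for _card in _matched if _card['iri'] in _wanted]
        if value_iris is not None:
            _wanted = set(value_iris)
            _matched = [_card for _card in _matched if _wanted & set(_card['focus_iris'])]
        return {_card['iri']: _card for _card in _matched}

    _cards = [
        {'iri': 'card:1', 'focus_iris': ['https://example.com/a']},
        {'iri': 'card:2', 'focus_iris': ['https://example.com/b']},
    ]
    assert list(load_cards(_cards, value_iris=['https://example.com/b'])) == ['card:2']
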
