From a16afeb666764b9e6fb422a905d71d6745c72fc8 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 6 Jan 2025 12:57:51 -0500 Subject: [PATCH 01/35] wip --- share/search/index_strategy/_base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index bafec1fa4..dec5aa46f 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -65,12 +65,13 @@ def indexname_wildcard(self): return f'{self.indexname_prefix}*' @functools.cached_property - def current_indexname(self): + def all_current_indexnames(self) -> tuple[str, ...]: self.assert_strategy_is_current() - return ''.join(( + _single_indexname = ''.join(( self.indexname_prefix, self.CURRENT_STRATEGY_CHECKSUM.hexdigest, )) + return (_single_indexname,) def assert_message_type(self, message_type: messages.MessageType): if message_type not in self.supported_message_types: From 0a7dc4beed5d7dc4dba041a3b93edc5d9efb40fe Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 7 Jan 2025 09:44:57 -0500 Subject: [PATCH 02/35] s/each_specific_index/each_existing_index --- share/admin/search.py | 2 +- share/search/index_strategy/_base.py | 5 +++-- share/search/index_strategy/elastic8.py | 2 +- share/search/index_strategy/sharev2_elastic5.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/share/admin/search.py b/share/admin/search.py index fbf2446b0..4f22304c6 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -73,7 +73,7 @@ def _index_status_by_strategy(): }, 'prior': sorted(( specific_index.pls_get_status() - for specific_index in _index_strategy.each_specific_index() + for specific_index in _index_strategy.each_existing_index() if not specific_index.is_current ), reverse=True), 'queues': [ diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index dec5aa46f..74db2f03a 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -1,3 +1,4 @@ +from __future__ import annotations import abc import functools import logging @@ -128,7 +129,7 @@ def compute_strategy_checksum(self) -> ChecksumIri: raise NotImplementedError @abc.abstractmethod - def each_specific_index(self) -> 'typing.Iterable[SpecificIndex]': + def each_existing_index(self) -> typing.Iterator[SpecificIndex]: raise NotImplementedError @abc.abstractmethod @@ -177,7 +178,7 @@ def pls_setup(self, *, skip_backfill=False): assert self.is_current, 'cannot setup a non-current index' _preexisting_index_count = sum( _index.pls_check_exists() - for _index in self.index_strategy.each_specific_index() + for _index in self.index_strategy.each_existing_index() ) self.pls_create() self.pls_start_keeping_live() diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 546889e9f..8361e1222 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -119,7 +119,7 @@ def compute_strategy_checksum(self): ) # abstract method from IndexStrategy - def each_specific_index(self): + def each_existing_index(self): indexname_set = set( self.es8_client.indices .get(index=self.indexname_wildcard, features=',') diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 13edb4881..29eff4b58 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -92,7 +92,7 @@ def 
pls_get_default_for_searching(self): return self.for_specific_index(self.STATIC_INDEXNAME) # abstract method from IndexStrategy - def each_specific_index(self): + def each_existing_index(self): yield self.for_specific_index(self.STATIC_INDEXNAME) # abstract method from IndexStrategy From e373888643d61cbe6fe738c679c3bc272b08866e Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 7 Jan 2025 09:49:18 -0500 Subject: [PATCH 03/35] s/for_specific_index/get_index_by_name --- share/search/index_strategy/__init__.py | 2 +- share/search/index_strategy/_base.py | 4 ++-- share/search/index_strategy/elastic8.py | 4 ++-- share/search/index_strategy/sharev2_elastic5.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 297702475..d8093e7a1 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -59,7 +59,7 @@ def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> I except IndexStrategyError: for _index_strategy in all_index_strategies().values(): try: - return _index_strategy.for_specific_index(indexname_or_strategyname) + return _index_strategy.get_index_by_name(indexname_or_strategyname) except IndexStrategyError: pass raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"') diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 74db2f03a..77b252553 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -93,11 +93,11 @@ def assert_strategy_is_current(self): ) ```''') - def for_specific_index(self, specific_indexname) -> 'IndexStrategy.SpecificIndex': + def get_index_by_name(self, specific_indexname) -> 'IndexStrategy.SpecificIndex': return self.SpecificIndex(self, specific_indexname) # type: ignore[abstract] def for_current_index(self) -> 'IndexStrategy.SpecificIndex': - return self.for_specific_index(self.current_indexname) + return self.get_index_by_name(self.current_indexname) def get_or_create_backfill(self): (index_backfill, _) = IndexBackfill.objects.get_or_create(index_strategy_name=self.name) diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 8361e1222..9eef40287 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -127,7 +127,7 @@ def each_existing_index(self): ) indexname_set.add(self.current_indexname) for indexname in indexname_set: - yield self.for_specific_index(indexname) + yield self.get_index_by_name(indexname) # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): @@ -191,7 +191,7 @@ def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificI def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex: # a SpecificIndex for an alias will work fine for searching, but # will error if you try to invoke lifecycle hooks - return self.for_specific_index(self._alias_for_searching) + return self.get_index_by_name(self._alias_for_searching) # override from IndexStrategy def pls_mark_backfill_complete(self): diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 29eff4b58..981b87e41 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -89,11 +89,11 @@ def pls_make_default_for_searching(self, specific_index): # abstract method 
from IndexStrategy def pls_get_default_for_searching(self): - return self.for_specific_index(self.STATIC_INDEXNAME) + return self.get_index_by_name(self.STATIC_INDEXNAME) # abstract method from IndexStrategy def each_existing_index(self): - yield self.for_specific_index(self.STATIC_INDEXNAME) + yield self.get_index_by_name(self.STATIC_INDEXNAME) # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): From 99eae9f6ad5d7281b343e7193977ea7b5c4a4fc4 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 7 Jan 2025 13:09:53 -0500 Subject: [PATCH 04/35] plan (add _TODO_multindex.txt) --- _TODO_multindex.txt | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 _TODO_multindex.txt diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt new file mode 100644 index 000000000..97ed79d49 --- /dev/null +++ b/_TODO_multindex.txt @@ -0,0 +1,49 @@ + +IndexStrategy revamp plan/log: + +- update existing classes to dataclasses + - IndexStrategy (each instance represents a current or past version of the strategy, identified by subname) + - name (existing attr) + - subname (new; default from CURRENT_STRATEGY_CHECKSUM) + - IndexStrategy.SpecificIndex (each strategy has a set of indexes with unique short-names) + - index_strategy (existing) + - short_indexname (new; unique within a strategy) + - (base SpecificIndex now mainly inward, focused on constructing index names and checking index status) + +- move search methods from IndexStrategy.SpecificIndex to IndexStrategy + +- consolidate parsing names from `indexStrategy` queryparam and from elastic state + +- remove uniindex methods from IndexStrategy (and friends) + - each_specific_index + - for_specific_index + - for_current_index + - SpecificIndex.pls_setup + - SpecificIndex.pls_handle_cardsearch + - SpecificIndex.pls_handle_valuesearch + - SpecificIndex.pls_refresh + - SpecificIndex.pls_delete + - SpecificIndex.pls_start_keeping_live + - Elastic8IndexStrategy.index_settings + - Elastic8IndexStrategy.index_mappings + +- add replacement multiindex methods to IndexStrategy (and friends) + - (classmethod) each_existing_index (based on index names from elastic; may be any hex) + - each_named_index (includes non-existent; ) + - index_shortname_set + - get_index_by_shortname + - is_current + - pls_setup + - pls_start_keeping_live + - pls_teardown + - pls_handle_cardsearch + - pls_handle_valuesearch + - pls_ensure_fresh + - Elastic8IndexStrategy.index_definitions (abstractmethod) + - Elastic8IndexStrategy.each_named_index (based on index_definitions) + +- update existing base methods + - add subname to indexname_prefix + - pls_get_default_for_searching (classmethod, return IndexStrategy) + - pls_make_default_for_searching (by self.subname, not SpecificIndex) + From 86ef59f9eed43d03884ad72dbe2577fa3d8fd449 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 7 Jan 2025 13:10:42 -0500 Subject: [PATCH 05/35] wip... 
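
Note on the index-naming scheme this refactor works toward (a rough sketch, not part of this commit -- the checksum hex and the 'card' subname below are only illustrative; the real helpers land in `_indexnames.py` in a later commit):

```
INDEXNAME_DELIM = '__'  # separates the meaningful parts of a full index name

def combine_indexname_parts(*parts: str) -> str:
    return INDEXNAME_DELIM.join(parts)

def parse_indexname_parts(name: str) -> list[str]:
    return name.split(INDEXNAME_DELIM)

# a full index name is expected to be: strategy name + strategy checksum + index subname
_full_name = combine_indexname_parts('trovesearch_denorm', 'abcd1234', 'card')
assert _full_name == 'trovesearch_denorm__abcd1234__card'
assert parse_indexname_parts(_full_name) == ['trovesearch_denorm', 'abcd1234', 'card']
```
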
--- share/search/index_strategy/__init__.py | 31 +++++-- share/search/index_strategy/_base.py | 81 ++++++++++--------- share/search/index_strategy/elastic8.py | 25 +++++- .../search/index_strategy/sharev2_elastic5.py | 4 +- .../index_strategy/trovesearch_denorm.py | 28 +++++++ 5 files changed, 117 insertions(+), 52 deletions(-) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index d8093e7a1..c55220a5c 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -26,10 +26,12 @@ @functools.cache def all_index_strategies() -> MappingProxyType[str, IndexStrategy]: - return MappingProxyType({ - _strategy.name: _strategy - for _strategy in _iter_all_index_strategies() - }) + _all_strategies = {} + for _strategy in _iter_all_index_strategies(): + if _strategy.name in _all_strategies: + raise IndexStrategyError(f'strategy names must be unique! (duplicate "{_strategy.name}")') + _all_strategies[_strategy.name] = _strategy + return MappingProxyType(_all_strategies) # a single cached readonly proxy -- set of strategy names immutable def _iter_all_index_strategies(): @@ -41,11 +43,24 @@ def _iter_all_index_strategies(): yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm') -def get_index_strategy(strategyname: str) -> IndexStrategy: +def parse_strategy_request(self, requested_strategy_name: str) -> IndexStrategy: + (_strategyname, *_etc) = requested_strategy_name.split(_INDEXNAME_DELIM) try: - return all_index_strategies()[strategyname] + _strategy = get_index_strategy( + _strategyname, + subname=(_etc[0] if _etc else ''), + ) + except IndexStrategyError: + raise IndexStrategyError(f'unrecognized strategy name "{requested_strategy_name}"') + else: + return _strategy + + +def get_index_strategy(strategy_name: str, subname: str = '') -> IndexStrategy: + try: + return all_index_strategies()[strategy_name] except KeyError: - raise IndexStrategyError(f'unknown index strategy "{strategyname}"') + raise IndexStrategyError(f'unknown index strategy "{strategy_name}"') def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> IndexStrategy.SpecificIndex: @@ -59,7 +74,7 @@ def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> I except IndexStrategyError: for _index_strategy in all_index_strategies().values(): try: - return _index_strategy.get_index_by_name(indexname_or_strategyname) + return _index_strategy.get_index_by_shortname(indexname_or_strategyname) except IndexStrategyError: pass raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"') diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 77b252553..6eb1e7ae9 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -1,5 +1,6 @@ from __future__ import annotations import abc +import dataclasses import functools import logging import typing @@ -22,6 +23,10 @@ logger = logging.getLogger(__name__) +_INDEXNAME_DELIM = '__' # used to separate indexnames into a list of meaningful values + + +@dataclasses.dataclass class IndexStrategy(abc.ABC): '''an abstraction for indexes in different places and ways. 
@@ -38,33 +43,38 @@ class IndexStrategy(abc.ABC): * may know of version- or cluster-specific features (should include identifiers like version numbers in subclass name) ''' - CURRENT_STRATEGY_CHECKSUM: ChecksumIri # set on subclasses to protect against accidents + CURRENT_STRATEGY_CHECKSUM: typing.ClassVar[ChecksumIri] # set on subclasses to protect against accidents - def __init__(self, name): - self.name = name + name: str + subname: str = '' # if unspecified, uses current - def __repr__(self): - return ''.join(( - self.__class__.__qualname__, - f'(name="{self.name}")' - )) + def __post_init__(self): + if _INDEXNAME_DELIM in self.name: + raise IndexStrategyError(f'strategy name may not contain "{_INDEXNAME_DELIM}" (got "{self.name}")') + if not self.subname: + self.subname = self.CURRENT_STRATEGY_CHECKSUM.hexdigest @property - def nonurgent_messagequeue_name(self): + def nonurgent_messagequeue_name(self) -> str: return f'{self.name}.nonurgent' @property - def urgent_messagequeue_name(self): + def urgent_messagequeue_name(self) -> str: return f'{self.name}.urgent' @property - def indexname_prefix(self): - return f'{self.name}__' + def indexname_prefix(self) -> str: + # note: ends with _INDEXNAME_DELIM + return _INDEXNAME_DELIM.join((self.name, self.subname, '')) @property - def indexname_wildcard(self): + def indexname_wildcard(self) -> str: return f'{self.indexname_prefix}*' + @property + def is_current(self) -> bool: + return self.subname == self.CURRENT_STRATEGY_CHECKSUM.hexdigest + @functools.cached_property def all_current_indexnames(self) -> tuple[str, ...]: self.assert_strategy_is_current() @@ -93,11 +103,14 @@ def assert_strategy_is_current(self): ) ```''') - def get_index_by_name(self, specific_indexname) -> 'IndexStrategy.SpecificIndex': - return self.SpecificIndex(self, specific_indexname) # type: ignore[abstract] + def with_hex(self, subname: str): + return dataclasses.replace(self, subname=subname) - def for_current_index(self) -> 'IndexStrategy.SpecificIndex': - return self.get_index_by_name(self.current_indexname) + def get_index_by_shortname(self, shortname: str) -> typing.Self.SpecificIndex: + return self.SpecificIndex(self, shortname) # type: ignore[abstract] + + def for_current_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index_by_shortname(self.current_indexname) def get_or_create_backfill(self): (index_backfill, _) = IndexBackfill.objects.get_or_create(index_strategy_name=self.name) @@ -146,33 +159,25 @@ def pls_get_default_for_searching(self) -> 'SpecificIndex': # IndexStrategy.SpecificIndex must be implemented by subclasses # in their own `class SpecificIndex(IndexStrategy.SpecificIndex)` + @dataclasses.dataclass class SpecificIndex(abc.ABC): - def __init__(self, index_strategy, indexname): - if not indexname.startswith(index_strategy.indexname_prefix): + index_strategy: IndexStrategy + short_indexname: str # unique per index_strategy + + def __post_init__(self): + if self.short_indexname not in self.index_strategy.short_indexname_set: raise IndexStrategyError( - f'invalid indexname "{indexname}"!' - f' (expected to start with "{index_strategy.indexname_prefix}")' + f'invalid short_indexname "{self.short_indexname}"!' 
+ f' (expected to start with "{self.index_strategy.short_indexname_set}")' ) - self.index_strategy = index_strategy - self.indexname = indexname - - def __eq__(self, other): - return ( - other.__class__ is self.__class__ - and other.index_strategy is self.index_strategy - and other.indexname == self.indexname - ) - def __repr__(self): - return ''.join(( - self.__class__.__qualname__, - f'(index_strategy={self.index_strategy}, ' - f'indexname={self.indexname})' - )) + @property + def is_current(self) -> bool: + return self.index_strategy.is_current @property - def is_current(self): - return self.indexname == self.index_strategy.current_indexname + def indexname(self) -> str: + return f'{self.index_strategy.indexname_prefix}{self.short_indexname}' def pls_setup(self, *, skip_backfill=False): assert self.is_current, 'cannot setup a non-current index' diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 9eef40287..40a7101dd 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -20,12 +20,14 @@ logger = logging.getLogger(__name__) +@dataclasses.dataclass class Elastic8IndexStrategy(IndexStrategy): '''abstract base class for index strategies using elasticsearch 8 ''' + es8_client: elasticsearch8.Elasticsearch = dataclasses.field(init=False) - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __post_init__(self): + super().__post_init__() should_sniff = settings.ELASTICSEARCH['SNIFF'] timeout = settings.ELASTICSEARCH['TIMEOUT'] self.es8_client = elasticsearch8.Elasticsearch( @@ -48,8 +50,23 @@ def __init__(self, *args, **kwargs): min_delay_between_sniffing=timeout, ) + ### + # for use when defining indexes + @dataclasses.dataclass + class IndexDefinition: + settings: dict + mappings: dict + ### # abstract methods for subclasses to implement + @abc.abstractmethod + @classmethod + def index_definitions(cls) -> typing.Iterator[IndexDefinition]: + ... 
+ + def each_named_index(self): + for _index_def in self.each_index_definition(): + yield self.get_index_by_shortname('iris') @abc.abstractmethod def index_settings(self): @@ -127,7 +144,7 @@ def each_existing_index(self): ) indexname_set.add(self.current_indexname) for indexname in indexname_set: - yield self.get_index_by_name(indexname) + yield self.get_index_by_shortname(indexname) # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): @@ -191,7 +208,7 @@ def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificI def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex: # a SpecificIndex for an alias will work fine for searching, but # will error if you try to invoke lifecycle hooks - return self.get_index_by_name(self._alias_for_searching) + return self.get_index_by_shortname(self._alias_for_searching) # override from IndexStrategy def pls_mark_backfill_complete(self): diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 981b87e41..20de93537 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -89,11 +89,11 @@ def pls_make_default_for_searching(self, specific_index): # abstract method from IndexStrategy def pls_get_default_for_searching(self): - return self.get_index_by_name(self.STATIC_INDEXNAME) + return self.parse_index_name(self.STATIC_INDEXNAME) # abstract method from IndexStrategy def each_existing_index(self): - yield self.get_index_by_name(self.STATIC_INDEXNAME) + yield self.parse_index_name(self.STATIC_INDEXNAME) # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 2dbeb4614..2bbf6b4d0 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -59,6 +59,29 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', ) + @classmethod + def each_index_definition(cls) -> typing.Iterator[Elastic8IndexStrategy.IndexDefiniton]: + yield Elastic8IndexStrategy.IndexDefinition( + settings=cls._index_settings(), + mappings=cls._card_index_mappings(), + ) + yield strategy.SpecificIndex( + index_strategy=strategy, + shortname='card', + elastic8_index_settings=cls._index_settings(), + elastic8_index_mappings=cls._card_index_mappings(), + ) + yield self.SpecificIndex( + index_strategy=strategy, + shortname='iris', + elastic8_index_settings=cls._index_settings(), + elastic8_index_mappings=cls._iris_index_mappings(), + ) + + yield Elastic8IndexDefinition( + settings=cls._index_settings(), + mappings=cls._card_index_mappings(), + ) # abstract method from IndexStrategy @property def supported_message_types(self): @@ -72,6 +95,11 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_INDEXCARD + def each_index(self) -> typing.Iterator[TrovesearchDenormIndexStrategy.SpecificIndex]: + yield self.SpecificIndex( + index_strategy=self, + ) + # abstract method from Elastic8IndexStrategy def index_settings(self): return { From 5e65e5627239ecc14c9fe20dc7d22f68e5c1660d Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 7 Jan 2025 13:59:54 -0500 Subject: [PATCH 06/35] wip..... 
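
How callers are expected to resolve a requested strategy under the new scheme (intended usage only -- `parse_strategy_request` is still wip here, and the hex below is a made-up strategy check):

```
from share.search import index_strategy

# just the strategy name: resolves to the strategy at its current checksum
_current = index_strategy.parse_strategy_request('trovesearch_denorm')

# name plus a specific check (delimited by '__'): meant to resolve a prior version
_prior = index_strategy.parse_strategy_request('trovesearch_denorm__1234abcd')

# a specific index within a strategy, by subname (e.g. 'card')
_card_index = _current.get_index_by_subname('card')
```
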
--- _TODO_multindex.txt | 16 +++++----- share/admin/search.py | 23 +++++++++------ share/models/index_backfill.py | 8 ++--- share/search/daemon.py | 7 +++-- share/search/index_strategy/__init__.py | 4 +-- share/search/index_strategy/_base.py | 29 ++++++++++--------- share/search/index_strategy/_indexnames.py | 21 ++++++++++++++ share/search/index_strategy/elastic8.py | 20 ++++++------- .../index_strategy/trovesearch_denorm.py | 24 +++++---------- 9 files changed, 86 insertions(+), 66 deletions(-) create mode 100644 share/search/index_strategy/_indexnames.py diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt index 97ed79d49..6c160ac63 100644 --- a/_TODO_multindex.txt +++ b/_TODO_multindex.txt @@ -3,11 +3,11 @@ IndexStrategy revamp plan/log: - update existing classes to dataclasses - IndexStrategy (each instance represents a current or past version of the strategy, identified by subname) - - name (existing attr) - - subname (new; default from CURRENT_STRATEGY_CHECKSUM) - - IndexStrategy.SpecificIndex (each strategy has a set of indexes with unique short-names) + - strategy_name (rename existing `name` attr for disambig) + - strategy_check (new; default CURRENT_STRATEGY_CHECKSUM but may be parsed from or index name or `indexStrategy` query param) + - IndexStrategy.SpecificIndex - index_strategy (existing) - - short_indexname (new; unique within a strategy) + - subname (new; unique within a strategy) - (base SpecificIndex now mainly inward, focused on constructing index names and checking index status) - move search methods from IndexStrategy.SpecificIndex to IndexStrategy @@ -19,6 +19,7 @@ IndexStrategy revamp plan/log: - for_specific_index - for_current_index - SpecificIndex.pls_setup + - SpecificIndex.pls_check_exists - SpecificIndex.pls_handle_cardsearch - SpecificIndex.pls_handle_valuesearch - SpecificIndex.pls_refresh @@ -30,16 +31,17 @@ IndexStrategy revamp plan/log: - add replacement multiindex methods to IndexStrategy (and friends) - (classmethod) each_existing_index (based on index names from elastic; may be any hex) - each_named_index (includes non-existent; ) - - index_shortname_set - - get_index_by_shortname + - get_index_by_subname + - subnames - is_current - pls_setup + - pls_check_exists - pls_start_keeping_live - pls_teardown - pls_handle_cardsearch - pls_handle_valuesearch - pls_ensure_fresh - - Elastic8IndexStrategy.index_definitions (abstractmethod) + - Elastic8IndexStrategy.each_index_definition (abstractmethod) - Elastic8IndexStrategy.each_named_index (based on index_definitions) - update existing base methods diff --git a/share/admin/search.py b/share/admin/search.py index 4f22304c6..b635fb525 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -52,9 +52,9 @@ def _mappings_url_prefix(): def _index_status_by_strategy(): - backfill_by_indexname: dict[str, IndexBackfill] = { - backfill.specific_indexname: backfill - for backfill in ( + _backfill_by_checksum: dict[str, IndexBackfill] = { + _backfill.strategy_checksum: _backfill + for _backfill in ( IndexBackfill.objects .filter(index_strategy_name__in=index_strategy.all_index_strategies().keys()) ) @@ -62,13 +62,18 @@ def _index_status_by_strategy(): status_by_strategy = {} _messenger = IndexMessenger() for _index_strategy in index_strategy.all_index_strategies().values(): - current_index = _index_strategy.for_current_index() + _current_backfill = _backfill_by_checksum.get( + str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), + ) status_by_strategy[_index_strategy.name] = { 'current': { - 'status': 
current_index.pls_get_status(), + 'status': [ + _index.pls_get_status() + for _index in _index_strategy.each_current_index() + ], 'backfill': _serialize_backfill( current_index, - backfill_by_indexname.get(current_index.indexname), + _backfill_by_checksum.get(current_index.indexname), ), }, 'prior': sorted(( @@ -91,14 +96,14 @@ def _index_status_by_strategy(): def _serialize_backfill( - specific_index: index_strategy.IndexStrategy.SpecificIndex, + strategy: index_strategy.IndexStrategy, backfill: IndexBackfill | None, ): - if not specific_index.is_current: + if not strategy.is_current: return {} if not backfill: return { - 'can_start_backfill': specific_index.pls_check_exists(), + 'can_start_backfill': strategy.pls_check_exists(), } return { 'backfill_status': backfill.backfill_status, diff --git a/share/models/index_backfill.py b/share/models/index_backfill.py index c8e92ffed..6b3f6fdba 100644 --- a/share/models/index_backfill.py +++ b/share/models/index_backfill.py @@ -47,7 +47,7 @@ class IndexBackfill(models.Model): ) backfill_status = models.TextField(choices=BACKFILL_STATUS_CHOICES, default=INITIAL) index_strategy_name = models.TextField(unique=True) - specific_indexname = models.TextField() + strategy_checksum = models.TextField() error_type = models.TextField(blank=True) error_message = models.TextField(blank=True) error_context = models.TextField(blank=True) @@ -77,13 +77,13 @@ def mutex(self): def pls_start(self, index_strategy): with self.mutex() as locked_self: assert locked_self.index_strategy_name == index_strategy.name - current_index = index_strategy.for_current_index() - if locked_self.specific_indexname == current_index.indexname: + _current_checksum = str(index_strategy.CURRENT_STRATEGY_CHECKSUM) + if locked_self.strategy_checksum == _current_checksum: # what is "current" has not changed -- should be INITIAL assert locked_self.backfill_status == IndexBackfill.INITIAL else: # what is "current" has changed! 
disregard backfill_status - locked_self.specific_indexname = current_index.indexname + locked_self.strategy_checksum = _current_checksum locked_self.backfill_status = IndexBackfill.INITIAL locked_self.__update_error(None) try: diff --git a/share/search/daemon.py b/share/search/daemon.py index 1fa7cce23..35f4b83d8 100644 --- a/share/search/daemon.py +++ b/share/search/daemon.py @@ -232,11 +232,12 @@ def _the_loop_itself(self): def _raise_if_backfill_noncurrent(self): if self.message_type.is_backfill: index_backfill = self.index_strategy.get_or_create_backfill() - if index_backfill.specific_indexname != self.index_strategy.current_indexname: + _current_checksum = str(self.index_strategy.CURRENT_STRATEGY_CHECKSUM) + if index_backfill.strategy_checksum != _current_checksum: raise exceptions.DaemonSetupError( 'IndexerDaemon observes conflicting currence:' - f'\n\tIndexBackfill (from database) says current is "{index_backfill.specific_indexname}"' - f'\n\tIndexStrategy (from static code) says current is "{self.index_strategy.current_indexname}"' + f'\n\tIndexBackfill (from database) says current is "{index_backfill.strategy_checksum}"' + f'\n\tIndexStrategy (from static code) says current is "{_current_checksum}"' '\n\t(may be the daemon is running old code -- will die and retry,' ' but if this keeps happening you may need to reset backfill_status' ' to INITIAL and restart the backfill)' diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index c55220a5c..12299877d 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -43,7 +43,7 @@ def _iter_all_index_strategies(): yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm') -def parse_strategy_request(self, requested_strategy_name: str) -> IndexStrategy: +def parse_strategy_request(requested_strategy_name: str) -> IndexStrategy: (_strategyname, *_etc) = requested_strategy_name.split(_INDEXNAME_DELIM) try: _strategy = get_index_strategy( @@ -74,7 +74,7 @@ def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> I except IndexStrategyError: for _index_strategy in all_index_strategies().values(): try: - return _index_strategy.get_index_by_shortname(indexname_or_strategyname) + return _index_strategy.get_index_by_subname(indexname_or_strategyname) except IndexStrategyError: pass raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"') diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 6eb1e7ae9..117f0bff7 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -18,14 +18,12 @@ CardsearchHandle, ValuesearchHandle, ) +from . import _indexnames as indexnames logger = logging.getLogger(__name__) -_INDEXNAME_DELIM = '__' # used to separate indexnames into a list of meaningful values - - @dataclasses.dataclass class IndexStrategy(abc.ABC): '''an abstraction for indexes in different places and ways. 
@@ -46,13 +44,13 @@ class IndexStrategy(abc.ABC): CURRENT_STRATEGY_CHECKSUM: typing.ClassVar[ChecksumIri] # set on subclasses to protect against accidents name: str - subname: str = '' # if unspecified, uses current + subname: str = '' # if unspecified, uses current checksum def __post_init__(self): - if _INDEXNAME_DELIM in self.name: - raise IndexStrategyError(f'strategy name may not contain "{_INDEXNAME_DELIM}" (got "{self.name}")') + indexnames.raise_if_invalid_indexname_part(self.name) if not self.subname: self.subname = self.CURRENT_STRATEGY_CHECKSUM.hexdigest + indexnames.raise_if_invalid_indexname_part(self.subname) @property def nonurgent_messagequeue_name(self) -> str: @@ -64,8 +62,7 @@ def urgent_messagequeue_name(self) -> str: @property def indexname_prefix(self) -> str: - # note: ends with _INDEXNAME_DELIM - return _INDEXNAME_DELIM.join((self.name, self.subname, '')) + return indexnames.combine_indexname_parts(self.name, self.subname) @property def indexname_wildcard(self) -> str: @@ -78,10 +75,10 @@ def is_current(self) -> bool: @functools.cached_property def all_current_indexnames(self) -> tuple[str, ...]: self.assert_strategy_is_current() - _single_indexname = ''.join(( + _single_indexname = indexnames.combine_indexname_parts( self.indexname_prefix, self.CURRENT_STRATEGY_CHECKSUM.hexdigest, - )) + ) return (_single_indexname,) def assert_message_type(self, message_type: messages.MessageType): @@ -106,11 +103,12 @@ def assert_strategy_is_current(self): def with_hex(self, subname: str): return dataclasses.replace(self, subname=subname) - def get_index_by_shortname(self, shortname: str) -> typing.Self.SpecificIndex: + def get_index_by_subname(self, shortname: str) -> IndexStrategy.SpecificIndex: return self.SpecificIndex(self, shortname) # type: ignore[abstract] - def for_current_index(self) -> IndexStrategy.SpecificIndex: - return self.get_index_by_shortname(self.current_indexname) + def each_current_index(self) -> typing.Iterator[IndexStrategy.SpecificIndex]: + for _subname in self.: + yield self.get_index_by_subname(_subname) def get_or_create_backfill(self): (index_backfill, _) = IndexBackfill.objects.get_or_create(index_strategy_name=self.name) @@ -177,7 +175,10 @@ def is_current(self) -> bool: @property def indexname(self) -> str: - return f'{self.index_strategy.indexname_prefix}{self.short_indexname}' + return indexnames.combine_indexname_parts( + self.index_strategy.indexname_prefix, + self.short_indexname, + ) def pls_setup(self, *, skip_backfill=False): assert self.is_current, 'cannot setup a non-current index' diff --git a/share/search/index_strategy/_indexnames.py b/share/search/index_strategy/_indexnames.py new file mode 100644 index 000000000..3517a6d01 --- /dev/null +++ b/share/search/index_strategy/_indexnames.py @@ -0,0 +1,21 @@ +from share.search.exceptions import IndexStrategyError + + +INDEXNAME_DELIM = '__' # used to separate indexnames into a list of meaningful values + + +def is_valid_indexname_part(indexname_part: str) -> bool: + return bool(INDEXNAME_DELIM not in indexname_part) + + +def raise_if_invalid_indexname_part(indexname_part: str) -> None: + if INDEXNAME_DELIM in indexname_part: + raise IndexStrategyError(f'name may not contain "{INDEXNAME_DELIM}" (got "{indexname_part}")') + + +def combine_indexname_parts(*indexname_parts: str) -> str: + return INDEXNAME_DELIM.join(indexname_parts) + + +def parse_indexname_parts(name: str) -> list[str]: + return name.split(INDEXNAME_DELIM) diff --git a/share/search/index_strategy/elastic8.py 
b/share/search/index_strategy/elastic8.py index 40a7101dd..20443b8fd 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -61,12 +61,8 @@ class IndexDefinition: # abstract methods for subclasses to implement @abc.abstractmethod @classmethod - def index_definitions(cls) -> typing.Iterator[IndexDefinition]: - ... - - def each_named_index(self): - for _index_def in self.each_index_definition(): - yield self.get_index_by_shortname('iris') + def index_definitions(cls) -> dict[str, IndexDefinition]: + raise NotImplementedError @abc.abstractmethod def index_settings(self): @@ -135,6 +131,10 @@ def compute_strategy_checksum(self): }, ) + def each_named_index(self): + for _subname, _index_def in self.index_definitions().items(): + yield self.get_index_by_subname('iris') + # abstract method from IndexStrategy def each_existing_index(self): indexname_set = set( @@ -144,7 +144,7 @@ def each_existing_index(self): ) indexname_set.add(self.current_indexname) for indexname in indexname_set: - yield self.get_index_by_shortname(indexname) + yield self.get_index_by_subname(indexname) # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): @@ -208,7 +208,7 @@ def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificI def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex: # a SpecificIndex for an alias will work fine for searching, but # will error if you try to invoke lifecycle hooks - return self.get_index_by_shortname(self._alias_for_searching) + return self.get_index_by_subname(self._alias_for_searching) # override from IndexStrategy def pls_mark_backfill_complete(self): @@ -218,11 +218,11 @@ def pls_mark_backfill_complete(self): @property def _alias_for_searching(self): - return f'{self.indexname_prefix}search' + return f'{self.indexname_prefix}__search' @property def _alias_for_keeping_live(self): - return f'{self.indexname_prefix}live' + return f'{self.indexname_prefix}__live' def _elastic_actions_with_index(self, messages_chunk, indexnames, action_tracker: _ActionTracker): if not indexnames: diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 2bbf6b4d0..6e2dcff07 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -60,28 +60,18 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): ) @classmethod - def each_index_definition(cls) -> typing.Iterator[Elastic8IndexStrategy.IndexDefiniton]: + def each_index_definition(cls) -> dict[str, Elastic8IndexStrategy.IndexDefiniton]: yield Elastic8IndexStrategy.IndexDefinition( + subname='card', settings=cls._index_settings(), mappings=cls._card_index_mappings(), ) - yield strategy.SpecificIndex( - index_strategy=strategy, - shortname='card', - elastic8_index_settings=cls._index_settings(), - elastic8_index_mappings=cls._card_index_mappings(), - ) - yield self.SpecificIndex( - index_strategy=strategy, - shortname='iris', - elastic8_index_settings=cls._index_settings(), - elastic8_index_mappings=cls._iris_index_mappings(), - ) - - yield Elastic8IndexDefinition( + yield Elastic8IndexStrategy.IndexDefinition( + subname='value', settings=cls._index_settings(), - mappings=cls._card_index_mappings(), + mappings=cls._value_index_mappings(), ) + # abstract method from IndexStrategy @property def supported_message_types(self): @@ -95,7 +85,7 @@ def supported_message_types(self): def backfill_message_type(self): 
return messages.MessageType.BACKFILL_INDEXCARD - def each_index(self) -> typing.Iterator[TrovesearchDenormIndexStrategy.SpecificIndex]: + def each_index(self) -> Iterator[TrovesearchDenormIndexStrategy.SpecificIndex]: yield self.SpecificIndex( index_strategy=self, ) From ac8f49b6287d2c9b0a251ec1ca369c5f2c0b6a67 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 9 Jan 2025 09:30:06 -0500 Subject: [PATCH 07/35] wip.... --- _TODO_multindex.txt | 6 +-- share/admin/search.py | 57 +++++++++++-------------- share/search/index_strategy/__init__.py | 12 +++++- share/search/index_strategy/_base.py | 47 +++++++++----------- share/search/index_strategy/elastic8.py | 15 ++----- 5 files changed, 65 insertions(+), 72 deletions(-) diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt index 6c160ac63..66f9124db 100644 --- a/_TODO_multindex.txt +++ b/_TODO_multindex.txt @@ -42,10 +42,10 @@ IndexStrategy revamp plan/log: - pls_handle_valuesearch - pls_ensure_fresh - Elastic8IndexStrategy.each_index_definition (abstractmethod) - - Elastic8IndexStrategy.each_named_index (based on index_definitions) + - Elastic8IndexStrategy.each_named_index (based on current_index_definitions) - update existing base methods - - add subname to indexname_prefix + - add strategy_check to indexname_prefix - pls_get_default_for_searching (classmethod, return IndexStrategy) - - pls_make_default_for_searching (by self.subname, not SpecificIndex) + - pls_make_default_for_searching (by strategy instance (or strategy_check), not SpecificIndex) diff --git a/share/admin/search.py b/share/admin/search.py index b635fb525..4cd0486b5 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -25,19 +25,17 @@ def search_indexes_view(request): }, ) if request.method == 'POST': - _specific_index = index_strategy.get_specific_index(request.POST['specific_indexname']) - _pls_doer = PLS_DOERS[request.POST['pls_do']] - _pls_doer(_specific_index) - _redirect_id = ( - _specific_index.index_strategy.name - if _pls_doer is _pls_delete - else _specific_index.indexname + _index_strategy = index_strategy.parse_strategy_request( + request.POST['specific_indexname'], # TODO: rename in form ) + _pls_doer = PLS_DOERS[request.POST['pls_do']] + _pls_doer(_index_strategy) + _redirect_id = _index_strategy.strategy_name return HttpResponseRedirect('#'.join((request.path, _redirect_id))) def search_index_mappings_view(request, index_name): - _specific_index = index_strategy.get_specific_index(index_name) + _specific_index = index_strategy.parse_index_name(index_name) _mappings = _specific_index.pls_get_mappings() return JsonResponse(_mappings) @@ -65,16 +63,13 @@ def _index_status_by_strategy(): _current_backfill = _backfill_by_checksum.get( str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), ) - status_by_strategy[_index_strategy.name] = { + status_by_strategy[_index_strategy.strategy_name] = { 'current': { 'status': [ _index.pls_get_status() - for _index in _index_strategy.each_current_index() + for _index in _index_strategy.each_named_index() ], - 'backfill': _serialize_backfill( - current_index, - _backfill_by_checksum.get(current_index.indexname), - ), + 'backfill': _serialize_backfill(_index_strategy, _current_backfill), }, 'prior': sorted(( specific_index.pls_get_status() @@ -114,35 +109,35 @@ def _serialize_backfill( } -def _pls_setup(specific_index): - assert specific_index.is_current - specific_index.pls_setup() +def _pls_setup(index_strategy): + assert index_strategy.is_current + index_strategy.pls_setup() -def 
_pls_start_keeping_live(specific_index): - specific_index.pls_start_keeping_live() +def _pls_start_keeping_live(index_strategy): + index_strategy.pls_start_keeping_live() -def _pls_stop_keeping_live(specific_index): - specific_index.pls_stop_keeping_live() +def _pls_stop_keeping_live(index_strategy): + index_strategy.pls_stop_keeping_live() -def _pls_start_backfill(specific_index): - assert specific_index.is_current - specific_index.index_strategy.pls_start_backfill() +def _pls_start_backfill(index_strategy): + assert index_strategy.is_current + index_strategy.pls_start_backfill() -def _pls_mark_backfill_complete(specific_index): - specific_index.index_strategy.pls_mark_backfill_complete() +def _pls_mark_backfill_complete(index_strategy): + index_strategy.pls_mark_backfill_complete() -def _pls_make_default_for_searching(specific_index): - specific_index.index_strategy.pls_make_default_for_searching(specific_index) +def _pls_make_default_for_searching(index_strategy): + index_strategy.pls_make_default_for_searching() -def _pls_delete(specific_index): - assert not specific_index.is_current - specific_index.pls_delete() +def _pls_delete(index_strategy): + assert not index_strategy.is_current + index_strategy.pls_delete() PLS_DOERS = { diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 12299877d..425e476a7 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -12,6 +12,7 @@ from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy from .trovesearch_denorm import TrovesearchDenormIndexStrategy from ._base import IndexStrategy +from ._indexnames import parse_indexname_parts __all__ = ( @@ -44,7 +45,7 @@ def _iter_all_index_strategies(): def parse_strategy_request(requested_strategy_name: str) -> IndexStrategy: - (_strategyname, *_etc) = requested_strategy_name.split(_INDEXNAME_DELIM) + (_strategyname, *_etc) = parse_indexname_parts(requested_strategy_name) try: _strategy = get_index_strategy( _strategyname, @@ -56,6 +57,15 @@ def parse_strategy_request(requested_strategy_name: str) -> IndexStrategy: return _strategy +def parse_index_name(index_name: str) -> IndexStrategy.SpecificIndex: + try: + (_strategy_name, _strategy_check, *_etc) = parse_indexname_parts(index_name) + _strategy = get_index_strategy(_strategy_name, _strategy_check) + return _strategy.get_index_by_subname(*_etc) + except IndexStrategyError: + raise IndexStrategyError(f'invalid index_name "{index_name}"') + + def get_index_strategy(strategy_name: str, subname: str = '') -> IndexStrategy: try: return all_index_strategies()[strategy_name] diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 117f0bff7..227dad675 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -43,26 +43,30 @@ class IndexStrategy(abc.ABC): ''' CURRENT_STRATEGY_CHECKSUM: typing.ClassVar[ChecksumIri] # set on subclasses to protect against accidents - name: str - subname: str = '' # if unspecified, uses current checksum + strategy_name: str + strategy_check: str = '' # if unspecified, uses current checksum def __post_init__(self): - indexnames.raise_if_invalid_indexname_part(self.name) - if not self.subname: - self.subname = self.CURRENT_STRATEGY_CHECKSUM.hexdigest - indexnames.raise_if_invalid_indexname_part(self.subname) + indexnames.raise_if_invalid_indexname_part(self.strategy_name) + if not self.strategy_check: + self.strategy_check = 
self.CURRENT_STRATEGY_CHECKSUM.hexdigest + indexnames.raise_if_invalid_indexname_part(self.strategy_check) @property def nonurgent_messagequeue_name(self) -> str: - return f'{self.name}.nonurgent' + return f'{self.strategy_name}.nonurgent' @property def urgent_messagequeue_name(self) -> str: - return f'{self.name}.urgent' + return f'{self.strategy_name}.urgent' + + @property + def indexname_prefix_parts(self) -> list[str]: + return [self.strategy_name, self.strategy_check] @property def indexname_prefix(self) -> str: - return indexnames.combine_indexname_parts(self.name, self.subname) + return indexnames.combine_indexname_parts(self.indexname_prefix_parts) @property def indexname_wildcard(self) -> str: @@ -70,16 +74,12 @@ def indexname_wildcard(self) -> str: @property def is_current(self) -> bool: - return self.subname == self.CURRENT_STRATEGY_CHECKSUM.hexdigest + return self.strategy_check == self.CURRENT_STRATEGY_CHECKSUM.hexdigest @functools.cached_property - def all_current_indexnames(self) -> tuple[str, ...]: + def all_current_indexnames(self) -> frozenset[str]: self.assert_strategy_is_current() - _single_indexname = indexnames.combine_indexname_parts( - self.indexname_prefix, - self.CURRENT_STRATEGY_CHECKSUM.hexdigest, - ) - return (_single_indexname,) + return frozenset((...)) # TODO def assert_message_type(self, message_type: messages.MessageType): if message_type not in self.supported_message_types: @@ -100,18 +100,13 @@ def assert_strategy_is_current(self): ) ```''') - def with_hex(self, subname: str): - return dataclasses.replace(self, subname=subname) - - def get_index_by_subname(self, shortname: str) -> IndexStrategy.SpecificIndex: - return self.SpecificIndex(self, shortname) # type: ignore[abstract] - - def each_current_index(self) -> typing.Iterator[IndexStrategy.SpecificIndex]: - for _subname in self.: - yield self.get_index_by_subname(_subname) + def get_index_by_subname(self, subname: str) -> IndexStrategy.SpecificIndex: + return self.SpecificIndex(self, subname) # type: ignore[abstract] def get_or_create_backfill(self): - (index_backfill, _) = IndexBackfill.objects.get_or_create(index_strategy_name=self.name) + (index_backfill, _) = IndexBackfill.objects.get_or_create( + index_strategy_name=self.strategy_name, + ) return index_backfill def pls_start_backfill(self): diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 20443b8fd..859c0954e 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -61,15 +61,7 @@ class IndexDefinition: # abstract methods for subclasses to implement @abc.abstractmethod @classmethod - def index_definitions(cls) -> dict[str, IndexDefinition]: - raise NotImplementedError - - @abc.abstractmethod - def index_settings(self): - raise NotImplementedError - - @abc.abstractmethod - def index_mappings(self): + def current_index_definitions(cls) -> dict[str, IndexDefinition]: raise NotImplementedError @abc.abstractmethod @@ -131,9 +123,10 @@ def compute_strategy_checksum(self): }, ) + # abstract method from IndexStrategy def each_named_index(self): - for _subname, _index_def in self.index_definitions().items(): - yield self.get_index_by_subname('iris') + for _subname, _index_def in self.current_index_definitions().items(): + yield self.get_index_by_subname(_subname) # abstract method from IndexStrategy def each_existing_index(self): From 0cce300a8519ae0baf07749c89108deb9c3d643b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 9 Jan 2025 09:30:19 -0500 
Subject: [PATCH 08/35] wip....... --- share/search/index_strategy/_base.py | 87 +++++++----- share/search/index_strategy/elastic8.py | 131 +++++++++++------- .../search/index_strategy/sharev2_elastic8.py | 8 +- .../index_strategy/trove_indexcard_flats.py | 4 +- .../index_strategy/trovesearch_denorm.py | 41 +++--- 5 files changed, 159 insertions(+), 112 deletions(-) diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 227dad675..415bb79fc 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -52,6 +52,11 @@ def __post_init__(self): self.strategy_check = self.CURRENT_STRATEGY_CHECKSUM.hexdigest indexnames.raise_if_invalid_indexname_part(self.strategy_check) + @functools.cache + @classmethod + def index_subname_set(cls) -> frozenset[str]: + return frozenset(cls.each_index_subname()) + @property def nonurgent_messagequeue_name(self) -> str: return f'{self.strategy_name}.nonurgent' @@ -66,7 +71,7 @@ def indexname_prefix_parts(self) -> list[str]: @property def indexname_prefix(self) -> str: - return indexnames.combine_indexname_parts(self.indexname_prefix_parts) + return indexnames.combine_indexname_parts(*self.indexname_prefix_parts) @property def indexname_wildcard(self) -> str: @@ -76,11 +81,6 @@ def indexname_wildcard(self) -> str: def is_current(self) -> bool: return self.strategy_check == self.CURRENT_STRATEGY_CHECKSUM.hexdigest - @functools.cached_property - def all_current_indexnames(self) -> frozenset[str]: - self.assert_strategy_is_current() - return frozenset((...)) # TODO - def assert_message_type(self, message_type: messages.MessageType): if message_type not in self.supported_message_types: raise IndexStrategyError(f'Invalid message_type "{message_type}" (expected {self.supported_message_types})') @@ -100,9 +100,24 @@ def assert_strategy_is_current(self): ) ```''') - def get_index_by_subname(self, subname: str) -> IndexStrategy.SpecificIndex: + def get_index_by_subname(self, *subnames: str) -> IndexStrategy.SpecificIndex: return self.SpecificIndex(self, subname) # type: ignore[abstract] + def pls_setup(self, *, skip_backfill=False): + assert self.is_current, 'cannot setup a non-current strategy' + _preexisting_index_count = sum( + _index.pls_check_exists() + for _index in self.each_existing_index() + ) + self.pls_create() + self.pls_start_keeping_live() + if skip_backfill: + _backfill = self.index_strategy.get_or_create_backfill() + _backfill.backfill_status = _backfill.COMPLETE + _backfill.save() + if not _preexisting_index_count: # first index for a strategy is automatic default + self.index_strategy.pls_make_default_for_searching(self) + def get_or_create_backfill(self): (index_backfill, _) = IndexBackfill.objects.get_or_create( index_strategy_name=self.strategy_name, @@ -115,23 +130,36 @@ def pls_start_backfill(self): def pls_mark_backfill_complete(self): self.get_or_create_backfill().pls_mark_complete() - @property + ### + # abstract methods (required for concrete subclasses) + + @classmethod @abc.abstractmethod - def supported_message_types(self) -> typing.Iterable[messages.MessageType]: + def compute_strategy_checksum(self) -> ChecksumIri: + '''get a dict (json-serializable and thereby checksummable) of all + configuration held still by this IndexStrategy subclass -- changes + in the checksum may result in new indices being created and filled + ''' + raise NotImplementedError + + @classmethod + @abc.abstractmethod + def each_index_subname(self) -> typing.Iterable[str]: + raise 
NotImplementedError + + @classmethod + @abc.abstractmethod + def each_setup_strategy(cls) -> typing.Iterator[typing.Self]: raise NotImplementedError @property @abc.abstractmethod - def backfill_message_type(self) -> messages.MessageType: + def supported_message_types(self) -> typing.Iterable[messages.MessageType]: raise NotImplementedError + @property @abc.abstractmethod - def compute_strategy_checksum(self) -> ChecksumIri: - '''get a dict (json-serializable and thereby checksummable) of all - configuration held still by this IndexStrategy instance -- changes - in this value's checksum may invoke changes in index lifecycle, as - may be defined by IndexStrategy subclasses - ''' + def backfill_message_type(self) -> messages.MessageType: raise NotImplementedError @abc.abstractmethod @@ -155,13 +183,13 @@ def pls_get_default_for_searching(self) -> 'SpecificIndex': @dataclasses.dataclass class SpecificIndex(abc.ABC): index_strategy: IndexStrategy - short_indexname: str # unique per index_strategy + subname: str # unique per index_strategy def __post_init__(self): - if self.short_indexname not in self.index_strategy.short_indexname_set: + if self.subname not in self.index_strategy.index_subname_set(): raise IndexStrategyError( - f'invalid short_indexname "{self.short_indexname}"!' - f' (expected to start with "{self.index_strategy.short_indexname_set}")' + f'invalid subname "{self.subname}"!' + f' (expected one of {self.index_strategy.index_subname_set}")' ) @property @@ -169,26 +197,11 @@ def is_current(self) -> bool: return self.index_strategy.is_current @property - def indexname(self) -> str: + def full_index_name(self) -> str: return indexnames.combine_indexname_parts( self.index_strategy.indexname_prefix, - self.short_indexname, - ) - - def pls_setup(self, *, skip_backfill=False): - assert self.is_current, 'cannot setup a non-current index' - _preexisting_index_count = sum( - _index.pls_check_exists() - for _index in self.index_strategy.each_existing_index() + self.subname, ) - self.pls_create() - self.pls_start_keeping_live() - if skip_backfill: - _backfill = self.index_strategy.get_or_create_backfill() - _backfill.backfill_status = _backfill.COMPLETE - _backfill.save() - if not _preexisting_index_count: # first index for a strategy is automatic default - self.index_strategy.pls_make_default_for_searching(self) @abc.abstractmethod def pls_get_status(self) -> IndexStatus: diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 859c0954e..acbda9101 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -2,8 +2,10 @@ import abc import collections import dataclasses +import functools from http import HTTPStatus import logging +import types import typing from django.conf import settings @@ -15,6 +17,7 @@ from share.search import messages from share.search.index_strategy._util import timestamp_to_readable_datetime from share.util.checksum_iri import ChecksumIri +from ._indexnames import parse_index_name logger = logging.getLogger(__name__) @@ -51,25 +54,32 @@ def __post_init__(self): ) ### - # for use when defining indexes + # for use when defining abstract methods in subclasses + @dataclasses.dataclass class IndexDefinition: + subname: str settings: dict mappings: dict + @dataclasses.dataclass + class MessageActionSet: + message_target_id: int + actions_by_subname: dict[str, typing.Iterable[dict]] + ### # abstract methods for subclasses to implement + @abc.abstractmethod @classmethod - def 
current_index_definitions(cls) -> dict[str, IndexDefinition]: + def each_index_definition(cls) -> typing.Iterable[IndexDefinition]: raise NotImplementedError @abc.abstractmethod def build_elastic_actions( self, messages_chunk: messages.MessagesChunk, - ) -> typing.Iterable[tuple[int, dict | typing.Iterable[dict]]]: - # yield (message_target_id, [elastic_action, ...]) pairs + ) -> typing.Iterable[MessageActionSet]: raise NotImplementedError def before_chunk( @@ -113,16 +123,25 @@ def build_update_action(self, doc_id, doc_source): # implementation for subclasses to ignore # abstract method from IndexStrategy - def compute_strategy_checksum(self): + @classmethod + def compute_strategy_checksum(cls): return ChecksumIri.digest_json( checksumalgorithm_name='sha-256', - salt=self.__class__.__name__, + salt=cls.__name__, raw_json={ - 'settings': self.index_settings(), - 'mappings': self.index_mappings(), - }, + _subname: dataclasses.asdict(_def) + for _subname, _def in cls.current_index_definitions().items() + } ) + @classmethod + @functools.cache + def current_index_definitions(cls): + return types.MappingProxyType({ + _def.subname: _def + for _def in cls.each_index_definition() + }) + # abstract method from IndexStrategy def each_named_index(self): for _subname, _index_def in self.current_index_definitions().items(): @@ -135,22 +154,19 @@ def each_existing_index(self): .get(index=self.indexname_wildcard, features=',') .keys() ) - indexname_set.add(self.current_indexname) for indexname in indexname_set: - yield self.get_index_by_subname(indexname) + _index = parse_index_name(indexname) + assert _index.index_strategy == self + yield _index # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): self.assert_message_type(messages_chunk.message_type) - if messages_chunk.message_type.is_backfill: - _indexnames = {self.current_indexname} - else: - _indexnames = self._get_indexnames_for_alias(self._alias_for_keeping_live) self.before_chunk(messages_chunk, _indexnames) _action_tracker = _ActionTracker() _bulk_stream = streaming_bulk( self.es8_client, - self._elastic_actions_with_index(messages_chunk, _indexnames, _action_tracker), + self._elastic_actions_with_index(messages_chunk, _action_tracker), raise_on_error=False, max_retries=settings.ELASTICSEARCH['MAX_RETRIES'], ) @@ -194,7 +210,7 @@ def pls_handle_messages_chunk(self, messages_chunk): def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex): self._set_indexnames_for_alias( self._alias_for_searching, - {specific_index.indexname}, + {specific_index.full_index_name}, ) # abstract method from IndexStrategy @@ -217,21 +233,36 @@ def _alias_for_searching(self): def _alias_for_keeping_live(self): return f'{self.indexname_prefix}__live' - def _elastic_actions_with_index(self, messages_chunk, indexnames, action_tracker: _ActionTracker): - if not indexnames: - raise ValueError('cannot index to no indexes') - for _message_target_id, _elastic_actions in self.build_elastic_actions(messages_chunk): - if isinstance(_elastic_actions, dict): # allow a single action - _elastic_actions = [_elastic_actions] - for _elastic_action in _elastic_actions: - _docid = _elastic_action['_id'] - for _indexname in indexnames: - action_tracker.add_action(_message_target_id, _indexname, _docid) - yield { - **_elastic_action, - '_index': _indexname, - } - action_tracker.done_scheduling(_message_target_id) + def _elastic_actions_with_index( + self, + messages_chunk: messages.MessagesChunk, + action_tracker: 
_ActionTracker, + ): + for _actionset in self.build_elastic_actions(messages_chunk): + for _index_subname, _elastic_actions in _actionset.actions_by_subname.items(): + _indexnames = self._get_indexnames_for_action( + index_subname=_index_subname, + is_backfill_action=messages_chunk.message_type.is_backfill, + ) + for _elastic_action in _elastic_actions: + _docid = _elastic_action['_id'] + for _indexname in _indexnames: + action_tracker.add_action(_actionset.message_target_id, _indexname, _docid) + yield { + **_elastic_action, + '_index': _indexname, + } + action_tracker.done_scheduling(_actionset.message_target_id) + + def _get_indexnames_for_action( + self, + index_subname: str, + *, + is_backfill_action: bool = False, + ) -> set[str]: + if is_backfill_action: + return {self.get_index_by_subname(index_subname).full_index_name} + _indexes_kept_live = self._get_indexnames_for_alias(self._alias_for_keeping_live) def _get_indexnames_for_alias(self, alias_name) -> set[str]: try: @@ -270,14 +301,16 @@ def _set_indexnames_for_alias(self, alias_name, indexnames): ), ]) + @dataclasses.dataclass class SpecificIndex(IndexStrategy.SpecificIndex): + index_strategy: Elastic8IndexStrategy # abstract method from IndexStrategy.SpecificIndex def pls_get_status(self) -> IndexStatus: if not self.pls_check_exists(): return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_strategy_name=self.index_strategy.strategy_name, + specific_indexname=self.full_index_name, is_kept_live=False, is_default_for_searching=False, doc_count=0, @@ -285,8 +318,8 @@ def pls_get_status(self) -> IndexStatus: ) index_info = ( self.index_strategy.es8_client.indices - .get(index=self.indexname, features='aliases,settings') - [self.indexname] + .get(index=self.full_index_name, features='aliases,settings') + [self.full_index_name] ) index_aliases = set(index_info['aliases'].keys()) creation_date = timestamp_to_readable_datetime( @@ -294,12 +327,12 @@ def pls_get_status(self) -> IndexStatus: ) doc_count = ( self.index_strategy.es8_client.indices - .stats(index=self.indexname, metric='docs') - ['indices'][self.indexname]['primaries']['docs']['count'] + .stats(index=self.full_index_name, metric='docs') + ['indices'][self.full_index_name]['primaries']['docs']['count'] ) return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_strategy_name=self.index_strategy.strategy_name, + specific_indexname=self.full_index_name, is_kept_live=( self.index_strategy._alias_for_keeping_live in index_aliases @@ -314,11 +347,11 @@ def pls_get_status(self) -> IndexStatus: # abstract method from IndexStrategy.SpecificIndex def pls_check_exists(self): - indexname = self.indexname - logger.info(f'{self.__class__.__name__}: checking for index {indexname}') + full_index_name = self.full_index_name + logger.info(f'{self.__class__.__name__}: checking for index {full_index_name}') return bool( self.index_strategy.es8_client.indices - .exists(index=indexname) + .exists(index=full_index_name) ) # abstract method from IndexStrategy.SpecificIndex @@ -327,7 +360,7 @@ def pls_create(self): 'cannot create a non-current version of an index!' ' maybe try `index_strategy.for_current_index()`?' 
) - index_to_create = self.indexname + index_to_create = self.full_index_name logger.debug('Ensuring index %s', index_to_create) index_exists = ( self.index_strategy.es8_client.indices @@ -349,7 +382,7 @@ def pls_create(self): def pls_refresh(self): ( self.index_strategy.es8_client.indices - .refresh(index=self.indexname) + .refresh(index=self.full_index_name) ) logger.debug('%r: Waiting for yellow status', self) ( @@ -362,14 +395,14 @@ def pls_refresh(self): def pls_delete(self): ( self.index_strategy.es8_client.indices - .delete(index=self.indexname, ignore=[400, 404]) + .delete(index=self.full_index_name, ignore=[400, 404]) ) logger.warning('%r: deleted', self) # abstract method from IndexStrategy.SpecificIndex def pls_start_keeping_live(self): self.index_strategy._add_indexname_to_alias( - indexname=self.indexname, + indexname=self.full_index_name, alias_name=self.index_strategy._alias_for_keeping_live, ) logger.info('%r: now kept live', self) @@ -377,13 +410,13 @@ def pls_start_keeping_live(self): # abstract method from IndexStrategy.SpecificIndex def pls_stop_keeping_live(self): self.index_strategy._remove_indexname_from_alias( - indexname=self.indexname, + indexname=self.full_index_name, alias_name=self.index_strategy._alias_for_keeping_live, ) logger.warning('%r: no longer kept live', self) def pls_get_mappings(self): - return self.index_strategy.es8_client.indices.get_mapping(index=self.indexname).body + return self.index_strategy.es8_client.indices.get_mapping(index=self.full_index_name).body @dataclasses.dataclass diff --git a/share/search/index_strategy/sharev2_elastic8.py b/share/search/index_strategy/sharev2_elastic8.py index 016503d96..76e221118 100644 --- a/share/search/index_strategy/sharev2_elastic8.py +++ b/share/search/index_strategy/sharev2_elastic8.py @@ -135,9 +135,9 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _doc_id = _source_doc['id'] _suid_ids.discard(_suid_id) if _source_doc.pop('is_deleted', False): - yield _suid_id, self.build_delete_action(_doc_id) + yield _suid_id, '', self.build_delete_action(_doc_id) else: - yield _suid_id, self.build_index_action(_doc_id, _source_doc) + yield _suid_id, '', self.build_index_action(_doc_id, _source_doc) # delete any leftovers for _leftover_suid in SourceUniqueIdentifier.objects.filter(id__in=_suid_ids): _suid_ids.discard(_leftover_suid.id) @@ -145,10 +145,10 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _leftover_suid_id = _leftover_suid.get_backcompat_sharev2_suid().id except SourceUniqueIdentifier.DoesNotExist: _leftover_suid_id = _leftover_suid.id - yield _leftover_suid.id, self.build_delete_action(self._get_doc_id(_leftover_suid_id)) + yield _leftover_suid.id, '', self.build_delete_action(self._get_doc_id(_leftover_suid_id)) # these ones don't even exist! 
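# Sketch (not from this diff) of the yield shape these wip changes adopt: each
# message target id is paired with an index subname ('' while this strategy has
# only one index) and a single bulk-action dict. The '_op_type'/'_id' keys below
# follow the elasticsearch streaming_bulk convention and are an assumption here,
# standing in for build_delete_action's actual return value.
def example_build_elastic_actions(deleted_doc_ids):
    for _doc_id in deleted_doc_ids:
        yield _doc_id, '', {'_op_type': 'delete', '_id': str(_doc_id)}

assert list(example_build_elastic_actions([3])) == [(3, '', {'_op_type': 'delete', '_id': '3'})]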
for _leftover_suid_id in _suid_ids: - yield _leftover_suid_id, self.build_delete_action(self._get_doc_id(_leftover_suid_id)) + yield _leftover_suid_id, '', self.build_delete_action(self._get_doc_id(_leftover_suid_id)) def _get_doc_id(self, suid_id: int): return IDObfuscator.encode_id(suid_id, SourceUniqueIdentifier) diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index dceb272df..6d258a1a6 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -267,11 +267,11 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): doc_source=_sourcedoc, ) _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _indexcard_rdf.indexcard_id, _index_action + yield _indexcard_rdf.indexcard_id, '', _index_action # delete any that don't have "latest" rdf and derived osfmap_json _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) for _indexcard in _leftovers: - yield _indexcard.id, self.build_delete_action(_indexcard.get_iri()) + yield _indexcard.id, '', self.build_delete_action(_indexcard.get_iri()) class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 6e2dcff07..e5921db9e 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -59,8 +59,9 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', ) + # abstract method from Elastic8IndexStrategy @classmethod - def each_index_definition(cls) -> dict[str, Elastic8IndexStrategy.IndexDefiniton]: + def each_index_definition(cls) -> Iterator[Elastic8IndexStrategy.IndexDefiniton]: yield Elastic8IndexStrategy.IndexDefinition( subname='card', settings=cls._index_settings(), @@ -85,31 +86,27 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_INDEXCARD - def each_index(self) -> Iterator[TrovesearchDenormIndexStrategy.SpecificIndex]: - yield self.SpecificIndex( - index_strategy=self, - ) - - # abstract method from Elastic8IndexStrategy - def index_settings(self): + @classmethod + def _index_settings(cls): return { 'number_of_shards': 5, 'number_of_replicas': 2, } - # abstract method from Elastic8IndexStrategy - def index_mappings(self): + @classmethod + def _card_index_mappings(cls): return { 'dynamic': 'false', - 'dynamic_templates': self._dynamic_templates(), + 'dynamic_templates': cls._dynamic_templates(), 'properties': { - 'card': {'properties': self._card_mappings()}, - 'iri_value': {'properties': self._iri_value_mappings()}, + 'card': {'properties': cls._card_mappings()}, + 'iri_value': {'properties': cls._iri_value_mappings()}, 'chunk_timestamp': {'type': 'unsigned_long'}, }, } - def _dynamic_templates(self): + @classmethod + def _dynamic_templates(cls): return [ {'dynamic_text_by_propertypath': { 'path_match': '*.text_by_propertypath.*', @@ -132,7 +129,8 @@ def _dynamic_templates(self): }}, ] - def _card_mappings(self): + @classmethod + def _card_mappings(cls): return { # simple keyword properties 'card_iri': ts.KEYWORD_MAPPING, @@ -144,19 +142,21 @@ def _card_mappings(self): 'source_record_identifier': ts.KEYWORD_MAPPING, }, }, - 
**self._paths_and_values_mappings(), + **cls._paths_and_values_mappings(), } - def _iri_value_mappings(self): + @classmethod + def _iri_value_mappings(cls): return { 'value_name': ts.KEYWORD_MAPPING, 'value_title': ts.KEYWORD_MAPPING, 'value_label': ts.KEYWORD_MAPPING, 'at_card_propertypaths': ts.KEYWORD_MAPPING, - **self._paths_and_values_mappings(), + **cls._paths_and_values_mappings(), } - def _paths_and_values_mappings(self): + @classmethod + def _paths_and_values_mappings(cls): return { 'single_focus_iri': ts.KEYWORD_MAPPING, 'focus_iri_synonyms': ts.KEYWORD_MAPPING, @@ -175,7 +175,7 @@ def _paths_and_values_mappings(self): def after_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Iterable[str]): task__delete_iri_value_scraps.apply_async( kwargs={ - 'index_strategy_name': self.name, + 'index_strategy_name': self.strategy_name, 'indexnames': list(indexnames), 'card_pks': messages_chunk.target_ids_chunk, 'timestamp': messages_chunk.timestamp, @@ -207,6 +207,7 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): # implement abstract IndexStrategy.SpecificIndex class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): + index_strategy: TrovesearchDenormIndexStrategy # abstract method from IndexStrategy.SpecificIndex def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: From 8f4fa1cf942b9f4fd244f3350e328623c90f3f71 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 10 Jan 2025 11:28:47 -0500 Subject: [PATCH 09/35] wip (remove unused before_chunk; make after_chunk multiindex-friendly) --- share/search/index_strategy/elastic8.py | 15 +++++---------- share/search/index_strategy/trovesearch_denorm.py | 4 ++-- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index acbda9101..795b10c7b 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -82,17 +82,10 @@ def build_elastic_actions( ) -> typing.Iterable[MessageActionSet]: raise NotImplementedError - def before_chunk( - self, - messages_chunk: messages.MessagesChunk, - indexnames: typing.Iterable[str], - ) -> None: - ... # implement when needed - def after_chunk( self, messages_chunk: messages.MessagesChunk, - indexnames: typing.Iterable[str], + affected_indexnames: typing.Iterable[str], ) -> None: ... 
# implement when needed @@ -137,6 +130,7 @@ def compute_strategy_checksum(cls): @classmethod @functools.cache def current_index_definitions(cls): + # readonly and cached per class return types.MappingProxyType({ _def.subname: _def for _def in cls.each_index_definition() @@ -162,7 +156,6 @@ def each_existing_index(self): # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): self.assert_message_type(messages_chunk.message_type) - self.before_chunk(messages_chunk, _indexnames) _action_tracker = _ActionTracker() _bulk_stream = streaming_bulk( self.es8_client, @@ -170,11 +163,13 @@ def pls_handle_messages_chunk(self, messages_chunk): raise_on_error=False, max_retries=settings.ELASTICSEARCH['MAX_RETRIES'], ) + _affected_indexnames: set[str] = set() for (_ok, _response) in _bulk_stream: (_op_type, _response_body) = next(iter(_response.items())) _status = _response_body.get('status') _docid = _response_body['_id'] _indexname = _response_body['_index'] + _affected_indexnames.add(_indexname) _is_done = _ok or (_op_type == 'delete' and _status == 404) if _is_done: _finished_message_id = _action_tracker.action_done(_indexname, _docid) @@ -204,7 +199,7 @@ def pls_handle_messages_chunk(self, messages_chunk): status_code=HTTPStatus.OK.value, error_text=None, ) - self.after_chunk(messages_chunk, _indexnames) + self.after_chunk(messages_chunk, _affected_indexnames) # abstract method from IndexStrategy def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex): diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index e5921db9e..9c1257ee4 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -172,11 +172,11 @@ def _paths_and_values_mappings(cls): } # override method from Elastic8IndexStrategy - def after_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: Iterable[str]): + def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexnames: Iterable[str]): task__delete_iri_value_scraps.apply_async( kwargs={ 'index_strategy_name': self.strategy_name, - 'indexnames': list(indexnames), + 'indexnames': list(affected_indexnames), 'card_pks': messages_chunk.target_ids_chunk, 'timestamp': messages_chunk.timestamp, }, From b125f19468b80ead4668740ec0d90b993b0959bf Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 10 Jan 2025 15:35:09 -0500 Subject: [PATCH 10/35] wip (simplify share.search.index_strategy) --- share/admin/search.py | 6 +-- share/bin/search.py | 21 +++------- share/checks.py | 2 +- share/search/daemon.py | 2 +- share/search/index_messenger.py | 2 +- share/search/index_strategy/__init__.py | 28 ++++++------- share/search/index_strategy/_base.py | 24 ++++++----- tests/share/bin/test_sharectl.py | 8 ++-- tests/share/search/__init__.py | 9 ++-- .../index_strategy/test_strategy_selection.py | 42 +++++++------------ tests/share/search/test_admin_workflow.py | 4 +- 11 files changed, 64 insertions(+), 84 deletions(-) diff --git a/share/admin/search.py b/share/admin/search.py index 4cd0486b5..78c63790e 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -25,7 +25,7 @@ def search_indexes_view(request): }, ) if request.method == 'POST': - _index_strategy = index_strategy.parse_strategy_request( + _index_strategy = index_strategy.get_strategy( request.POST['specific_indexname'], # TODO: rename in form ) _pls_doer = PLS_DOERS[request.POST['pls_do']] @@ -54,12 +54,12 @@ def 
_index_status_by_strategy(): _backfill.strategy_checksum: _backfill for _backfill in ( IndexBackfill.objects - .filter(index_strategy_name__in=index_strategy.all_index_strategies().keys()) + .filter(index_strategy_name__in=index_strategy.all_strategy_names()) ) } status_by_strategy = {} _messenger = IndexMessenger() - for _index_strategy in index_strategy.all_index_strategies().values(): + for _index_strategy in index_strategy.each_strategy(): _current_backfill = _backfill_by_checksum.get( str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), ) diff --git a/share/bin/search.py b/share/bin/search.py index 69f5c0eff..a677d0445 100644 --- a/share/bin/search.py +++ b/share/bin/search.py @@ -41,25 +41,16 @@ def setup(args, argv): """ _is_initial = args.get('--initial') if _is_initial: - _specific_indexes = [ - _index_strategy.for_current_index() - for _index_strategy in index_strategy.all_index_strategies().values() - ] + for _index_strategy in index_strategy.each_strategy(): + _index_strategy.pls_setup() else: _index_or_strategy_name = args[''] try: - _specific_indexes = [index_strategy.get_specific_index(_index_or_strategy_name)] + _strategy = index_strategy.get_strategy(_index_or_strategy_name) except IndexStrategyError: - try: - _specific_indexes = [ - index_strategy.get_specific_index(_index_or_strategy_name), - ] - except IndexStrategyError: - raise IndexStrategyError(f'unrecognized index or strategy name "{_index_or_strategy_name}"') - for _specific_index in _specific_indexes: - _specific_index.pls_setup( - skip_backfill=_is_initial, # for initial setup, there's nothing back to fill - ) + raise IndexStrategyError(f'unrecognized index or strategy name "{_index_or_strategy_name}"') + else: + _strategy.pls_setup() @search.subcommand('Start the search indexing daemon') diff --git a/share/checks.py b/share/checks.py index a53d2a228..1dda809d6 100644 --- a/share/checks.py +++ b/share/checks.py @@ -5,7 +5,7 @@ def check_all_index_strategies_current(app_configs, **kwargs): from share.search import index_strategy from share.search.exceptions import IndexStrategyError errors = [] - for _index_strategy in index_strategy.all_index_strategies().values(): + for _index_strategy in index_strategy.each_strategy(): try: _index_strategy.assert_strategy_is_current() except IndexStrategyError as exception: diff --git a/share/search/daemon.py b/share/search/daemon.py index 35f4b83d8..9ce2dbf34 100644 --- a/share/search/daemon.py +++ b/share/search/daemon.py @@ -68,7 +68,7 @@ def start_daemonthreads_for_strategy(self, index_strategy): return _daemon def start_all_daemonthreads(self): - for _index_strategy in index_strategy.all_index_strategies().values(): + for _index_strategy in index_strategy.each_strategy(): self.start_daemonthreads_for_strategy(_index_strategy) def stop_daemonthreads(self, *, wait=False): diff --git a/share/search/index_messenger.py b/share/search/index_messenger.py index 0cd51293b..67a7b154b 100644 --- a/share/search/index_messenger.py +++ b/share/search/index_messenger.py @@ -32,7 +32,7 @@ def __init__(self, *, celery_app=None, index_strategys=None): if celery_app is None else celery_app ) - self.index_strategys = index_strategys or tuple(index_strategy.all_index_strategies().values()) + self.index_strategys = index_strategys or tuple(index_strategy.each_strategy()) def notify_indexcard_update(self, indexcards, *, urgent=False): self.send_messages_chunk( diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 425e476a7..c5077ffc0 100644 --- 
a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations import functools from types import MappingProxyType +from typing import Iterator from django.conf import settings @@ -17,25 +18,24 @@ __all__ = ( 'IndexStrategy', - 'all_index_strategies', - 'get_index_for_sharev2_search', - 'get_index_for_trovesearch', - 'get_index_strategy', - 'get_specific_index', + 'each_strategy', + 'all_strategy_names', + 'get_strategy', + # TODO: cleanup + # 'all_index_strategies', + # 'get_index_for_sharev2_search', + # 'get_index_for_trovesearch', + # 'get_index_strategy', + # 'get_specific_index', ) @functools.cache -def all_index_strategies() -> MappingProxyType[str, IndexStrategy]: - _all_strategies = {} - for _strategy in _iter_all_index_strategies(): - if _strategy.name in _all_strategies: - raise IndexStrategyError(f'strategy names must be unique! (duplicate "{_strategy.name}")') - _all_strategies[_strategy.name] = _strategy - return MappingProxyType(_all_strategies) # a single cached readonly proxy -- set of strategy names immutable +def all_strategy_names() -> frozenset[str]: + return frozenset(_strategy.name for _strategy in each_strategy()) -def _iter_all_index_strategies(): +def each_strategy() -> Iterator[IndexStrategy]: if settings.ELASTICSEARCH5_URL: yield Sharev2Elastic5IndexStrategy(name='sharev2_elastic5') if settings.ELASTICSEARCH8_URL: @@ -44,7 +44,7 @@ def _iter_all_index_strategies(): yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm') -def parse_strategy_request(requested_strategy_name: str) -> IndexStrategy: +def get_strategy(requested_strategy_name: str) -> IndexStrategy: (_strategyname, *_etc) = parse_indexname_parts(requested_strategy_name) try: _strategy = get_index_strategy( diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 415bb79fc..56515f9e0 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -100,7 +100,7 @@ def assert_strategy_is_current(self): ) ```''') - def get_index_by_subname(self, *subnames: str) -> IndexStrategy.SpecificIndex: + def get_index_by_subname(self, subname: str = '') -> IndexStrategy.SpecificIndex: return self.SpecificIndex(self, subname) # type: ignore[abstract] def pls_setup(self, *, skip_backfill=False): @@ -178,6 +178,18 @@ def pls_make_default_for_searching(self, specific_index: 'SpecificIndex'): def pls_get_default_for_searching(self) -> 'SpecificIndex': raise NotImplementedError + ### + # optional implementations + + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: + raise NotImplementedError + + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + raise NotImplementedError + + def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__sharev2_backcompat (either implement it or don\'t use this strategy for backcompat)') + # IndexStrategy.SpecificIndex must be implemented by subclasses # in their own `class SpecificIndex(IndexStrategy.SpecificIndex)` @dataclasses.dataclass @@ -231,16 +243,6 @@ def pls_start_keeping_live(self): def pls_stop_keeping_live(self): raise NotImplementedError - # optional for subclasses - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - raise 
NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__sharev2_backcompat (either implement it or don\'t use this strategy for backcompat)') - - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - raise NotImplementedError - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - raise NotImplementedError - def pls_get_mappings(self) -> dict: raise NotImplementedError diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index e39c6140c..a1c47d0a2 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -52,13 +52,13 @@ def _get_specific_index(indexname): def test_setup_initial(self, settings): _expected_indexes = ['baz', 'bar', 'foo'] - _mock_index_strategys = { - _name: mock.Mock() + _mock_index_strategys = [ + mock.Mock(name=_name) for _name in _expected_indexes - } + ] with patch_index_strategies(_mock_index_strategys): run_sharectl('search', 'setup', '--initial') - for mock_index_strategy in _mock_index_strategys.values(): + for mock_index_strategy in _mock_index_strategys: mock_specific_index = mock_index_strategy.for_current_index.return_value assert mock_specific_index.pls_setup.mock_calls == [mock.call(skip_backfill=True)] diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py index a7a49aaf9..fb12f9081 100644 --- a/tests/share/search/__init__.py +++ b/tests/share/search/__init__.py @@ -1,16 +1,17 @@ import contextlib +from typing import Iterable from unittest import mock from share.search import index_strategy @contextlib.contextmanager -def patch_index_strategies(strategies: dict[str, index_strategy.IndexStrategy]): - index_strategy.all_index_strategies.cache_clear() +def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): + index_strategy.all_strategy_names.cache_clear() with mock.patch.object( index_strategy, - 'all_index_strategies', + 'each_strategy', return_value=strategies, ): yield - index_strategy.all_index_strategies.cache_clear() + index_strategy.all_strategy_names.cache_clear() diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index e24fb0a1a..b21204e75 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -3,11 +3,10 @@ from share.search.exceptions import IndexStrategyError from share.search.index_strategy import ( - all_index_strategies, - get_index_strategy, - get_specific_index, - get_index_for_sharev2_search, IndexStrategy, + each_strategy, + all_strategy_names, + get_strategy, sharev2_elastic5, sharev2_elastic8, trove_indexcard_flats, @@ -28,43 +27,30 @@ def expected_strategy_classes(): class TestBaseIndexStrategy: def test_get_index_strategy(self, mock_elastic_clients, expected_strategy_classes): for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - index_strategy = get_index_strategy(strategy_name) + index_strategy = get_strategy(strategy_name) assert isinstance(index_strategy, expected_strategy_class) def test_all_index_strategies(self, mock_elastic_clients, expected_strategy_classes): - all_strategys = tuple(all_index_strategies().values()) + all_strategys = tuple(each_strategy()) assert len(all_strategys) == len(expected_strategy_classes) - strategy_names = {index_strategy.name for index_strategy in all_strategys} + strategy_names = 
{index_strategy.strategy_name for index_strategy in all_strategys} assert strategy_names == set(expected_strategy_classes.keys()) for index_strategy in all_strategys: - strategy_class = expected_strategy_classes[index_strategy.name] + strategy_class = expected_strategy_classes[index_strategy.strategy_name] assert isinstance(index_strategy, strategy_class) assert issubclass(index_strategy.SpecificIndex, IndexStrategy.SpecificIndex) assert index_strategy.SpecificIndex is not IndexStrategy.SpecificIndex - def test_get_by_specific_indexname(self, mock_elastic_clients, expected_strategy_classes): - for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - indexname_prefix = get_index_strategy(strategy_name).indexname_prefix - specific_indexname = ''.join((indexname_prefix, 'foo')) - specific_index = get_specific_index(specific_indexname) - assert isinstance(specific_index.index_strategy, expected_strategy_class) - assert isinstance(specific_index, expected_strategy_class.SpecificIndex) - assert specific_index.indexname == specific_indexname - bad_indexname = 'foo_foo' # assumed to not start with index prefix - with pytest.raises(IndexStrategyError): - get_specific_index(bad_indexname) - @pytest.mark.django_db def test_get_by_request(self, mock_elastic_clients): - for strategy_name, index_strategy in all_index_strategies().items(): + for _strategy in each_strategy(): good_requests = [ - strategy_name, - index_strategy.current_indexname, - ''.join((index_strategy.indexname_prefix, 'foo')), + _strategy.strategy_name, + ''.join((_strategy.indexname_prefix, 'foo')), ] for good_request in good_requests: - specific_index = get_index_for_sharev2_search(good_request) - assert isinstance(specific_index, index_strategy.SpecificIndex) - assert specific_index.index_strategy is index_strategy + _got_strategy = get_strategy(good_request) + assert isinstance(_got_strategy, IndexStrategy) + assert _got_strategy == _strategy with pytest.raises(IndexStrategyError): - get_index_for_sharev2_search('bad-request') + get_strategy('bad-request') diff --git a/tests/share/search/test_admin_workflow.py b/tests/share/search/test_admin_workflow.py index 6a1ee9a03..640a9e617 100644 --- a/tests/share/search/test_admin_workflow.py +++ b/tests/share/search/test_admin_workflow.py @@ -15,7 +15,7 @@ def test_admin_search_indexes_view(mock_elastic_clients): client.login(**credentials) with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): resp = client.get('/admin/search-indexes') - for strategy_name in index_strategy.all_index_strategies(): - _index_strategy = index_strategy.get_index_strategy(strategy_name) + for strategy_name in index_strategy.all_strategy_names(): + _index_strategy = index_strategy.get_strategy(strategy_name) expected_header = f'
current index: {_index_strategy.current_indexname}
' assert expected_header.encode() in resp.content From 042a9448ebf22cb20132e79c071d2469c193a4ee Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Jan 2025 09:24:44 -0500 Subject: [PATCH 11/35] wip... --- _TODO_multindex.txt | 4 +- share/admin/search.py | 6 +- share/search/index_strategy/__init__.py | 65 +++++++++++-------- share/search/index_strategy/_base.py | 6 +- share/search/index_strategy/elastic8.py | 32 +++++---- .../search/index_strategy/sharev2_elastic5.py | 4 +- .../index_strategy/trovesearch_denorm.py | 36 ++++++---- share/tasks/__init__.py | 2 +- tests/share/bin/test_sharectl.py | 2 +- 9 files changed, 89 insertions(+), 68 deletions(-) diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt index 66f9124db..b9d5f0ab7 100644 --- a/_TODO_multindex.txt +++ b/_TODO_multindex.txt @@ -31,7 +31,7 @@ IndexStrategy revamp plan/log: - add replacement multiindex methods to IndexStrategy (and friends) - (classmethod) each_existing_index (based on index names from elastic; may be any hex) - each_named_index (includes non-existent; ) - - get_index_by_subname + - get_index_by_subnames - subnames - is_current - pls_setup @@ -41,7 +41,7 @@ IndexStrategy revamp plan/log: - pls_handle_cardsearch - pls_handle_valuesearch - pls_ensure_fresh - - Elastic8IndexStrategy.each_index_definition (abstractmethod) + - Elastic8IndexStrategy.define_current_indexes (abstractmethod) - Elastic8IndexStrategy.each_named_index (based on current_index_definitions) - update existing base methods diff --git a/share/admin/search.py b/share/admin/search.py index 78c63790e..dd7e188a4 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -25,7 +25,7 @@ def search_indexes_view(request): }, ) if request.method == 'POST': - _index_strategy = index_strategy.get_strategy( + _index_strategy = index_strategy.parse_strategy_name( request.POST['specific_indexname'], # TODO: rename in form ) _pls_doer = PLS_DOERS[request.POST['pls_do']] @@ -35,7 +35,7 @@ def search_indexes_view(request): def search_index_mappings_view(request, index_name): - _specific_index = index_strategy.parse_index_name(index_name) + _specific_index = index_strategy.get_specific_index(index_name) _mappings = _specific_index.pls_get_mappings() return JsonResponse(_mappings) @@ -59,7 +59,7 @@ def _index_status_by_strategy(): } status_by_strategy = {} _messenger = IndexMessenger() - for _index_strategy in index_strategy.each_strategy(): + for _index_strategy in index_strategy.all_index_strategies().values(): _current_backfill = _backfill_by_checksum.get( str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), ) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index c5077ffc0..0f1dec3eb 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -1,4 +1,5 @@ from __future__ import annotations +import enum import functools from types import MappingProxyType from typing import Iterator @@ -21,51 +22,63 @@ 'each_strategy', 'all_strategy_names', 'get_strategy', + 'parse_strategy_name', + 'get_specific_index', # TODO: cleanup # 'all_index_strategies', # 'get_index_for_sharev2_search', # 'get_index_for_trovesearch', # 'get_index_strategy', - # 'get_specific_index', ) -@functools.cache -def all_strategy_names() -> frozenset[str]: - return frozenset(_strategy.name for _strategy in each_strategy()) +class StrategyTypes(enum.Enum): + if settings.ELASTICSEARCH5_URL: + sharev2_elastic5 = Sharev2Elastic5IndexStrategy + if settings.ELASTICSEARCH8_URL: + sharev2_elastic8 = 
Sharev2Elastic8IndexStrategy + trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy + trovesearch_denorm = TrovesearchDenormIndexStrategy + + def instantiate_strategy(self, strategy_check: str = ''): + _strategy_type = self.value + return _strategy_type(strategy_name=self.name, strategy_check=strategy_check) def each_strategy() -> Iterator[IndexStrategy]: - if settings.ELASTICSEARCH5_URL: - yield Sharev2Elastic5IndexStrategy(name='sharev2_elastic5') - if settings.ELASTICSEARCH8_URL: - yield Sharev2Elastic8IndexStrategy(name='sharev2_elastic8') - yield TroveIndexcardFlatsIndexStrategy(name='trove_indexcard_flats') - yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm') + for _strat_enum in StrategyTypes: + yield _strat_enum.instantiate_strategy() + +@functools.cache +def all_strategy_names() -> frozenset[str]: + return frozenset(StrategyTypes.__members__.keys()) -def get_strategy(requested_strategy_name: str) -> IndexStrategy: + +def parse_strategy_name(requested_strategy_name: str) -> IndexStrategy: (_strategyname, *_etc) = parse_indexname_parts(requested_strategy_name) - try: - _strategy = get_index_strategy( - _strategyname, - subname=(_etc[0] if _etc else ''), - ) - except IndexStrategyError: - raise IndexStrategyError(f'unrecognized strategy name "{requested_strategy_name}"') - else: - return _strategy + return get_strategy( + strategy_name=_strategyname, + strategy_check=(_etc[0] if _etc else ''), + ) -def parse_index_name(index_name: str) -> IndexStrategy.SpecificIndex: +def parse_specific_index_name(index_name: str) -> IndexStrategy.SpecificIndex: try: - (_strategy_name, _strategy_check, *_etc) = parse_indexname_parts(index_name) - _strategy = get_index_strategy(_strategy_name, _strategy_check) - return _strategy.get_index_by_subname(*_etc) + _strategy = parse_strategy_name(index_name) + return _strategy.parse_full_index_name(index_name) except IndexStrategyError: raise IndexStrategyError(f'invalid index_name "{index_name}"') +def get_strategy(strategy_name: str, strategy_check: str = '') -> IndexStrategy: + try: + _strat_enum = StrategyTypes[strategy_name] + except KeyError: + raise IndexStrategyError(f'unrecognized strategy name "{strategy_name}"') + return _strat_enum.instantiate_strategy(strategy_check=strategy_check) + + def get_index_strategy(strategy_name: str, subname: str = '') -> IndexStrategy: try: return all_index_strategies()[strategy_name] @@ -73,7 +86,7 @@ def get_index_strategy(strategy_name: str, subname: str = '') -> IndexStrategy: raise IndexStrategyError(f'unknown index strategy "{strategy_name}"') -def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> IndexStrategy.SpecificIndex: +def get_strategy_for_search(strategy_name_request: str = '') -> IndexStrategy.SpecificIndex: try: _strategy = get_index_strategy(indexname_or_strategyname) return ( @@ -84,7 +97,7 @@ def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> I except IndexStrategyError: for _index_strategy in all_index_strategies().values(): try: - return _index_strategy.get_index_by_subname(indexname_or_strategyname) + return _index_strategy.get_index_by_subnames(indexname_or_strategyname) except IndexStrategyError: pass raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"') diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 56515f9e0..2a6ec9ef7 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -100,8 +100,10 @@ def 
assert_strategy_is_current(self): ) ```''') - def get_index_by_subname(self, subname: str = '') -> IndexStrategy.SpecificIndex: - return self.SpecificIndex(self, subname) # type: ignore[abstract] + def get_index_by_subnames(self, *subnames: str) -> IndexStrategy.SpecificIndex: + if len(subnames == 1): + return self.SpecificIndex(self, subnames[0]) # type: ignore[abstract] + raise NotImplementedError(f'how subnames {subnames}') def pls_setup(self, *, skip_backfill=False): assert self.is_current, 'cannot setup a non-current strategy' diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 795b10c7b..dbb26df7b 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -17,7 +17,7 @@ from share.search import messages from share.search.index_strategy._util import timestamp_to_readable_datetime from share.util.checksum_iri import ChecksumIri -from ._indexnames import parse_index_name +from ._indexnames import parse_indexname_parts logger = logging.getLogger(__name__) @@ -29,6 +29,8 @@ class Elastic8IndexStrategy(IndexStrategy): ''' es8_client: elasticsearch8.Elasticsearch = dataclasses.field(init=False) + index_definitions: typing.ClassVar[IndexDefinitionDict] + def __post_init__(self): super().__post_init__() should_sniff = settings.ELASTICSEARCH['SNIFF'] @@ -56,11 +58,10 @@ def __post_init__(self): ### # for use when defining abstract methods in subclasses - @dataclasses.dataclass + @dataclasses.dataclass(frozen=True) class IndexDefinition: - subname: str - settings: dict mappings: dict + settings: dict @dataclasses.dataclass class MessageActionSet: @@ -70,9 +71,9 @@ class MessageActionSet: ### # abstract methods for subclasses to implement - @abc.abstractmethod @classmethod - def each_index_definition(cls) -> typing.Iterable[IndexDefinition]: + @abc.abstractmethod + def define_current_indexes(cls) -> dict[str, IndexDefinition]: raise NotImplementedError @abc.abstractmethod @@ -123,23 +124,20 @@ def compute_strategy_checksum(cls): salt=cls.__name__, raw_json={ _subname: dataclasses.asdict(_def) - for _subname, _def in cls.current_index_definitions().items() + for _subname, _def in cls.current_index_defs().items() } ) @classmethod @functools.cache - def current_index_definitions(cls): + def current_index_defs(cls): # readonly and cached per class - return types.MappingProxyType({ - _def.subname: _def - for _def in cls.each_index_definition() - }) + return types.MappingProxyType(cls.define_current_indexes()) # abstract method from IndexStrategy def each_named_index(self): - for _subname, _index_def in self.current_index_definitions().items(): - yield self.get_index_by_subname(_subname) + for _subname in self.current_index_defs().keys(): + yield self.get_index_by_subnames(_subname) # abstract method from IndexStrategy def each_existing_index(self): @@ -149,7 +147,7 @@ def each_existing_index(self): .keys() ) for indexname in indexname_set: - _index = parse_index_name(indexname) + _index = self.parse_full_index_name(indexname) assert _index.index_strategy == self yield _index @@ -212,7 +210,7 @@ def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificI def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex: # a SpecificIndex for an alias will work fine for searching, but # will error if you try to invoke lifecycle hooks - return self.get_index_by_subname(self._alias_for_searching) + return self.get_index_by_subnames(self._alias_for_searching) # override from IndexStrategy 
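# A standalone sketch (not part of this diff) of the shared-client pattern used
# above: functools.cache on a zero-argument classmethod builds the client once
# per strategy class, so every strategy instance reuses the same connection
# pool. _FakeClient stands in for elasticsearch8.Elasticsearch here.
import functools

class _FakeClient:
    pass

class _ClientCachingSketch:
    @classmethod
    @functools.cache
    def _make_client(cls) -> _FakeClient:
        return _FakeClient()

    @property
    def client(self) -> _FakeClient:
        return self._make_client()  # same object for every instance

assert _ClientCachingSketch().client is _ClientCachingSketch().client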
def pls_mark_backfill_complete(self): @@ -256,7 +254,7 @@ def _get_indexnames_for_action( is_backfill_action: bool = False, ) -> set[str]: if is_backfill_action: - return {self.get_index_by_subname(index_subname).full_index_name} + return {self.get_index_by_subnames(index_subname).full_index_name} _indexes_kept_live = self._get_indexnames_for_alias(self._alias_for_keeping_live) def _get_indexnames_for_alias(self, alias_name) -> set[str]: diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 20de93537..fdf8d0f95 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -89,11 +89,11 @@ def pls_make_default_for_searching(self, specific_index): # abstract method from IndexStrategy def pls_get_default_for_searching(self): - return self.parse_index_name(self.STATIC_INDEXNAME) + return self.parse_full_index_name(self.STATIC_INDEXNAME) # abstract method from IndexStrategy def each_existing_index(self): - yield self.parse_index_name(self.STATIC_INDEXNAME) + yield self.parse_full_index_name(self.STATIC_INDEXNAME) # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 9c1257ee4..275d4036f 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -61,17 +61,17 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): # abstract method from Elastic8IndexStrategy @classmethod - def each_index_definition(cls) -> Iterator[Elastic8IndexStrategy.IndexDefiniton]: - yield Elastic8IndexStrategy.IndexDefinition( - subname='card', - settings=cls._index_settings(), - mappings=cls._card_index_mappings(), - ) - yield Elastic8IndexStrategy.IndexDefinition( - subname='value', - settings=cls._index_settings(), - mappings=cls._value_index_mappings(), - ) + def define_current_indexes(cls) -> dict[str, Elastic8IndexStrategy.IndexDefiniton]: + return { + 'cardsearch': cls.IndexDefinition( + settings=cls._index_settings(), + mappings=cls._cardsearch_index_mappings(), + ), + 'valuesearch': cls.IndexDefinition( + settings=cls._index_settings(), + mappings=cls._valuesearch_index_mappings(), + ), + } # abstract method from IndexStrategy @property @@ -94,7 +94,7 @@ def _index_settings(cls): } @classmethod - def _card_index_mappings(cls): + def _cardsearch_index_mappings(cls): return { 'dynamic': 'false', 'dynamic_templates': cls._dynamic_templates(), @@ -105,6 +105,14 @@ def _card_index_mappings(cls): }, } + @classmethod + def _valuesearch_index_mappings(cls): + _card_mappings = cls._cardsearch_index_mappings() + _card_mappings['properties']['iri_value'] = { + 'properties': cls._iri_value_mappings(), + } + return _card_mappings + @classmethod def _dynamic_templates(cls): return [ @@ -948,8 +956,8 @@ def task__delete_iri_value_scraps( this task deletes those untouched value-docs after the index has refreshed at its own pace (allowing a slightly longer delay for items to _stop_ matching queries for removed values) ''' - from share.search.index_strategy import get_index_strategy - _index_strategy = get_index_strategy(index_strategy_name) + from share.search.index_strategy import get_strategy + _index_strategy = get_strategy(index_strategy_name) assert isinstance(_index_strategy, Elastic8IndexStrategy) # delete any docs that belong to cards in this chunk but weren't touched by indexing 
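# Standalone sketch (not part of this diff) of the "scrap" rule this deletion
# applies: value-docs keep the chunk_timestamp of the chunk that last wrote
# them, so docs belonging to these cards with an older timestamp are stale once
# the index has refreshed. chunk_timestamp is a mapped field above; 'card_pk'
# is a hypothetical field name used only for illustration.
def _is_value_scrap(doc: dict, card_pks: set[int], chunk_timestamp: int) -> bool:
    return (
        doc.get('card_pk') in card_pks
        and doc.get('chunk_timestamp', 0) < chunk_timestamp
    )

assert _is_value_scrap({'card_pk': 7, 'chunk_timestamp': 5}, {7}, chunk_timestamp=9)
assert not _is_value_scrap({'card_pk': 7, 'chunk_timestamp': 9}, {7}, chunk_timestamp=9)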
_delete_resp = _index_strategy.es8_client.delete_by_query( diff --git a/share/tasks/__init__.py b/share/tasks/__init__.py index fa93ccf76..37da10801 100644 --- a/share/tasks/__init__.py +++ b/share/tasks/__init__.py @@ -61,7 +61,7 @@ def schedule_index_backfill(self, index_backfill_pk): _index_backfill = db.IndexBackfill.objects.get(pk=index_backfill_pk) _index_backfill.pls_note_scheduling_has_begun() try: - _index_strategy = index_strategy.get_index_strategy(_index_backfill.index_strategy_name) + _index_strategy = index_strategy.get_strategy(_index_backfill.index_strategy_name) _messenger = IndexMessenger(celery_app=self.app, index_strategys=[_index_strategy]) _messagetype = _index_strategy.backfill_message_type assert _messagetype in _index_strategy.supported_message_types diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index a1c47d0a2..ca31edb48 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -64,7 +64,7 @@ def test_setup_initial(self, settings): def test_setup_index(self): mock_index_strategy = mock.Mock() - with mock.patch('share.bin.search.index_strategy.get_index_strategy', return_value=mock_index_strategy): + with mock.patch('share.bin.search.index_strategy.get_strategy', return_value=mock_index_strategy): run_sharectl('search', 'setup', 'foo') mock_current_index = mock_index_strategy.for_current_index.return_value assert mock_current_index.pls_setup.mock_calls == [mock.call(skip_backfill=False)] From ff96a19f1dfa36a2b573157c5948d2f2bc94fcbc Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Jan 2025 12:38:25 -0500 Subject: [PATCH 12/35] wip.. --- _TODO_multindex.txt | 6 +- share/admin/search.py | 29 +++++---- share/search/index_strategy/__init__.py | 84 +++++++++++-------------- share/search/index_strategy/_base.py | 15 ++--- share/search/index_strategy/elastic8.py | 66 ++++++++++--------- trove/views/search.py | 1 + 6 files changed, 97 insertions(+), 104 deletions(-) diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt index b9d5f0ab7..fd013930a 100644 --- a/_TODO_multindex.txt +++ b/_TODO_multindex.txt @@ -31,7 +31,7 @@ IndexStrategy revamp plan/log: - add replacement multiindex methods to IndexStrategy (and friends) - (classmethod) each_existing_index (based on index names from elastic; may be any hex) - each_named_index (includes non-existent; ) - - get_index_by_subnames + - get_index_by_subname - subnames - is_current - pls_setup @@ -44,8 +44,10 @@ IndexStrategy revamp plan/log: - Elastic8IndexStrategy.define_current_indexes (abstractmethod) - Elastic8IndexStrategy.each_named_index (based on current_index_definitions) +- update `share.search.index_strategy` public interface (see __all__) + - update existing base methods - add strategy_check to indexname_prefix - - pls_get_default_for_searching (classmethod, return IndexStrategy) + - pls_get_default_for_searching (get strategy_check from es alias (or current)) - pls_make_default_for_searching (by strategy instance (or strategy_check), not SpecificIndex) diff --git a/share/admin/search.py b/share/admin/search.py index dd7e188a4..11a5a1509 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -7,7 +7,10 @@ from share.admin.util import admin_url from share.models.index_backfill import IndexBackfill from share.search.index_messenger import IndexMessenger -from share.search import index_strategy +from share.search.index_strategy import ( + IndexStrategy, + parse_strategy_name, +) logger = logging.getLogger(__name__) @@ -25,7 +28,7 @@ 
def search_indexes_view(request): }, ) if request.method == 'POST': - _index_strategy = index_strategy.parse_strategy_name( + _index_strategy = parse_strategy_name( request.POST['specific_indexname'], # TODO: rename in form ) _pls_doer = PLS_DOERS[request.POST['pls_do']] @@ -35,7 +38,7 @@ def search_indexes_view(request): def search_index_mappings_view(request, index_name): - _specific_index = index_strategy.get_specific_index(index_name) + _specific_index = get_specific_index(index_name) _mappings = _specific_index.pls_get_mappings() return JsonResponse(_mappings) @@ -54,12 +57,12 @@ def _index_status_by_strategy(): _backfill.strategy_checksum: _backfill for _backfill in ( IndexBackfill.objects - .filter(index_strategy_name__in=index_strategy.all_strategy_names()) + .filter(index_strategy_name__in=all_strategy_names()) ) } status_by_strategy = {} _messenger = IndexMessenger() - for _index_strategy in index_strategy.all_index_strategies().values(): + for _index_strategy in all_index_strategies().values(): _current_backfill = _backfill_by_checksum.get( str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), ) @@ -91,7 +94,7 @@ def _index_status_by_strategy(): def _serialize_backfill( - strategy: index_strategy.IndexStrategy, + strategy: IndexStrategy, backfill: IndexBackfill | None, ): if not strategy.is_current: @@ -109,33 +112,33 @@ def _serialize_backfill( } -def _pls_setup(index_strategy): +def _pls_setup(index_strategy: IndexStrategy): assert index_strategy.is_current index_strategy.pls_setup() -def _pls_start_keeping_live(index_strategy): +def _pls_start_keeping_live(index_strategy: IndexStrategy): index_strategy.pls_start_keeping_live() -def _pls_stop_keeping_live(index_strategy): +def _pls_stop_keeping_live(index_strategy: IndexStrategy): index_strategy.pls_stop_keeping_live() -def _pls_start_backfill(index_strategy): +def _pls_start_backfill(index_strategy: IndexStrategy): assert index_strategy.is_current index_strategy.pls_start_backfill() -def _pls_mark_backfill_complete(index_strategy): +def _pls_mark_backfill_complete(index_strategy: IndexStrategy): index_strategy.pls_mark_backfill_complete() -def _pls_make_default_for_searching(index_strategy): +def _pls_make_default_for_searching(index_strategy: IndexStrategy): index_strategy.pls_make_default_for_searching() -def _pls_delete(index_strategy): +def _pls_delete(index_strategy: IndexStrategy): assert not index_strategy.is_current index_strategy.pls_delete() diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 0f1dec3eb..f017b10c5 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -19,20 +19,17 @@ __all__ = ( 'IndexStrategy', - 'each_strategy', 'all_strategy_names', + 'each_strategy', 'get_strategy', + 'get_strategy_for_sharev2_search', + 'get_strategy_for_trovesearch', + 'parse_specific_index_name', 'parse_strategy_name', - 'get_specific_index', - # TODO: cleanup - # 'all_index_strategies', - # 'get_index_for_sharev2_search', - # 'get_index_for_trovesearch', - # 'get_index_strategy', ) -class StrategyTypes(enum.Enum): +class _StrategyTypes(enum.Enum): if settings.ELASTICSEARCH5_URL: sharev2_elastic5 = Sharev2Elastic5IndexStrategy if settings.ELASTICSEARCH8_URL: @@ -40,26 +37,31 @@ class StrategyTypes(enum.Enum): trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy trovesearch_denorm = TrovesearchDenormIndexStrategy - def instantiate_strategy(self, strategy_check: str = ''): + def new_strategy_instance(self, strategy_check: str = '', *, 
for_search=False) -> IndexStrategy: _strategy_type = self.value - return _strategy_type(strategy_name=self.name, strategy_check=strategy_check) + _strategy = _strategy_type(strategy_name=self.name, strategy_check=strategy_check) + return ( + _strategy.get_default_search_strategy() + if (for_search and not strategy_check) + else _strategy + ) def each_strategy() -> Iterator[IndexStrategy]: - for _strat_enum in StrategyTypes: - yield _strat_enum.instantiate_strategy() + for _strat_enum in _StrategyTypes: + yield _strat_enum.new_strategy_instance() -@functools.cache def all_strategy_names() -> frozenset[str]: - return frozenset(StrategyTypes.__members__.keys()) + return frozenset(_StrategyTypes.__members__.keys()) -def parse_strategy_name(requested_strategy_name: str) -> IndexStrategy: +def parse_strategy_name(requested_strategy_name: str, *, for_search=False) -> IndexStrategy: (_strategyname, *_etc) = parse_indexname_parts(requested_strategy_name) return get_strategy( strategy_name=_strategyname, strategy_check=(_etc[0] if _etc else ''), + for_search=for_search ) @@ -71,39 +73,21 @@ def parse_specific_index_name(index_name: str) -> IndexStrategy.SpecificIndex: raise IndexStrategyError(f'invalid index_name "{index_name}"') -def get_strategy(strategy_name: str, strategy_check: str = '') -> IndexStrategy: +def get_strategy( + strategy_name: str, + strategy_check: str = '', + *, + for_search: bool = False, +) -> IndexStrategy: try: - _strat_enum = StrategyTypes[strategy_name] + _strat_enum = _StrategyTypes[strategy_name] except KeyError: raise IndexStrategyError(f'unrecognized strategy name "{strategy_name}"') - return _strat_enum.instantiate_strategy(strategy_check=strategy_check) - - -def get_index_strategy(strategy_name: str, subname: str = '') -> IndexStrategy: - try: - return all_index_strategies()[strategy_name] - except KeyError: - raise IndexStrategyError(f'unknown index strategy "{strategy_name}"') + return _strat_enum.new_strategy_instance(strategy_check=strategy_check) -def get_strategy_for_search(strategy_name_request: str = '') -> IndexStrategy.SpecificIndex: - try: - _strategy = get_index_strategy(indexname_or_strategyname) - return ( - _strategy.pls_get_default_for_searching() - if for_search - else _strategy.for_current_index() - ) - except IndexStrategyError: - for _index_strategy in all_index_strategies().values(): - try: - return _index_strategy.get_index_by_subnames(indexname_or_strategyname) - except IndexStrategyError: - pass - raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"') - -def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificIndex: +def get_strategy_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificIndex: if requested_name: _name = requested_name elif ( @@ -115,14 +99,16 @@ def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificI _name = 'sharev2_elastic8' else: raise IndexStrategyError('no available index for sharev2 search') - return get_specific_index(_name, for_search=True) + return parse_strategy_name(_name) -def get_index_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex: +def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex: if params.index_strategy_name: # specific strategy requested - _name = params.index_strategy_name - elif FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY): - _name = 'trovesearch_denorm' + _strategy = 
parse_strategy_name(params.index_strategy_name, for_search=True) else: - _name = 'trove_indexcard_flats' - return get_specific_index(_name, for_search=True) + _default_strategy_enum = ( + _StrategyTypes.trovesearch_denorm + if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) + else _StrategyTypes.trove_indexcard_flats + ) + _strategy = _default_strategy_enum.new_strategy_instance() diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 2a6ec9ef7..1ea7edd9a 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -100,10 +100,8 @@ def assert_strategy_is_current(self): ) ```''') - def get_index_by_subnames(self, *subnames: str) -> IndexStrategy.SpecificIndex: - if len(subnames == 1): - return self.SpecificIndex(self, subnames[0]) # type: ignore[abstract] - raise NotImplementedError(f'how subnames {subnames}') + def get_index_by_subname(self, subname: str) -> IndexStrategy.SpecificIndex: + return self.SpecificIndex(self, subname) # type: ignore[abstract] def pls_setup(self, *, skip_backfill=False): assert self.is_current, 'cannot setup a non-current strategy' @@ -149,11 +147,6 @@ def compute_strategy_checksum(self) -> ChecksumIri: def each_index_subname(self) -> typing.Iterable[str]: raise NotImplementedError - @classmethod - @abc.abstractmethod - def each_setup_strategy(cls) -> typing.Iterator[typing.Self]: - raise NotImplementedError - @property @abc.abstractmethod def supported_message_types(self) -> typing.Iterable[messages.MessageType]: @@ -173,11 +166,11 @@ def pls_handle_messages_chunk(self, messages_chunk: messages.MessagesChunk) -> t raise NotImplementedError @abc.abstractmethod - def pls_make_default_for_searching(self, specific_index: 'SpecificIndex'): + def pls_make_default_for_searching(self) -> None: raise NotImplementedError @abc.abstractmethod - def pls_get_default_for_searching(self) -> 'SpecificIndex': + def pls_get_default_for_searching(self) -> IndexStrategy: raise NotImplementedError ### diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index dbb26df7b..5dd64f5b2 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -27,34 +27,8 @@ class Elastic8IndexStrategy(IndexStrategy): '''abstract base class for index strategies using elasticsearch 8 ''' - es8_client: elasticsearch8.Elasticsearch = dataclasses.field(init=False) - index_definitions: typing.ClassVar[IndexDefinitionDict] - def __post_init__(self): - super().__post_init__() - should_sniff = settings.ELASTICSEARCH['SNIFF'] - timeout = settings.ELASTICSEARCH['TIMEOUT'] - self.es8_client = elasticsearch8.Elasticsearch( - settings.ELASTICSEARCH8_URL, - # security: - ca_certs=settings.ELASTICSEARCH8_CERT_PATH, - basic_auth=( - (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET) - if settings.ELASTICSEARCH8_SECRET is not None - else None - ), - # retry: - retry_on_timeout=True, - request_timeout=timeout, - # sniffing: - sniff_on_start=should_sniff, - sniff_before_requests=should_sniff, - sniff_on_node_failure=should_sniff, - sniff_timeout=timeout, - min_delay_between_sniffing=timeout, - ) - ### # for use when defining abstract methods in subclasses @@ -134,6 +108,35 @@ def current_index_defs(cls): # readonly and cached per class return types.MappingProxyType(cls.define_current_indexes()) + @classmethod + @functools.cache + def _make_elastic8_client(cls) -> elasticsearch8.Elasticsearch: + should_sniff = 
settings.ELASTICSEARCH['SNIFF'] + timeout = settings.ELASTICSEARCH['TIMEOUT'] + return elasticsearch8.Elasticsearch( + settings.ELASTICSEARCH8_URL, + # security: + ca_certs=settings.ELASTICSEARCH8_CERT_PATH, + basic_auth=( + (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET) + if settings.ELASTICSEARCH8_SECRET is not None + else None + ), + # retry: + retry_on_timeout=True, + request_timeout=timeout, + # sniffing: + sniff_on_start=should_sniff, + sniff_before_requests=should_sniff, + sniff_on_node_failure=should_sniff, + sniff_timeout=timeout, + min_delay_between_sniffing=timeout, + ) + + @property + def es8_client(self): + return self._make_elastic8_client() # cached shared client + # abstract method from IndexStrategy def each_named_index(self): for _subname in self.current_index_defs().keys(): @@ -200,14 +203,19 @@ def pls_handle_messages_chunk(self, messages_chunk): self.after_chunk(messages_chunk, _affected_indexnames) # abstract method from IndexStrategy - def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex): + def pls_make_default_for_searching(self): self._set_indexnames_for_alias( self._alias_for_searching, - {specific_index.full_index_name}, + {self.indexname_wildcard}, ) # abstract method from IndexStrategy - def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex: + def pls_get_default_for_searching(self) -> IndexStrategy: + _indexnames = self._get_indexnames_for_alias(self._alias_for_searching) + try: + _indexname = _indexnames.pop() + except KeyError: + return # a SpecificIndex for an alias will work fine for searching, but # will error if you try to invoke lifecycle hooks return self.get_index_by_subnames(self._alias_for_searching) diff --git a/trove/views/search.py b/trove/views/search.py index fd4043259..858707d08 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -52,6 +52,7 @@ def get(self, request): _url = request.build_absolute_uri() _search_gathering = self._start_gathering(renderer_type=_renderer_type) _search_params = self._parse_search_params(request) + _strategy = index_strategy.get_strategy_for_trovesearch(_search_params) _specific_index = index_strategy.get_index_for_trovesearch(_search_params) _focus = self.focus_type.new( iris=_url, From 1860f35fb0d58e9a6eebb51326b3484df4fa4b14 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Jan 2025 16:05:14 -0500 Subject: [PATCH 13/35] wip.... 
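
Rough sketch of the module-level interface this commit moves toward
(illustrative only; 'abcd1234' below is a placeholder strategy check,
not a value this commit pins down):

    from share.search import index_strategy

    # iterate every configured strategy (replaces all_index_strategies())
    for _strat in index_strategy.each_strategy():
        print(_strat.strategy_name)

    # look up a strategy by name; with for_search=True and no explicit
    # strategy_check, this defers to pls_get_default_for_searching()
    _strategy = index_strategy.get_strategy('trovesearch_denorm', for_search=True)

    # get a copy of the same strategy pinned to a prior checksum
    _pinned = _strategy.with_strategy_check('abcd1234')  # placeholder hexdigest

Strategies are now plain instances held on the enum, so the module
functions hand out (or copy, via dataclasses.replace) those shared
instances instead of constructing a new strategy object per call.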
--- _TODO_multindex.txt | 3 +- api/search/views.py | 2 +- api/views/feeds.py | 6 +- share/admin/search.py | 9 +- share/models/feature_flag.py | 1 + share/search/index_strategy/__init__.py | 98 ++++++++++--------- share/search/index_strategy/_base.py | 40 +++++--- share/search/index_strategy/_indexnames.py | 2 +- share/search/index_strategy/elastic8.py | 54 ++++++---- .../search/index_strategy/sharev2_elastic5.py | 89 ++++++++++------- .../search/index_strategy/sharev2_elastic8.py | 16 ++- .../index_strategy/trove_indexcard_flats.py | 21 +++- tests/api/test_elasticsearch.py | 2 +- tests/api/test_feeds.py | 2 +- trove/views/search.py | 21 ++-- 15 files changed, 222 insertions(+), 144 deletions(-) diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt index fd013930a..7df6fadae 100644 --- a/_TODO_multindex.txt +++ b/_TODO_multindex.txt @@ -31,7 +31,7 @@ IndexStrategy revamp plan/log: - add replacement multiindex methods to IndexStrategy (and friends) - (classmethod) each_existing_index (based on index names from elastic; may be any hex) - each_named_index (includes non-existent; ) - - get_index_by_subname + - get_index - subnames - is_current - pls_setup @@ -41,6 +41,7 @@ IndexStrategy revamp plan/log: - pls_handle_cardsearch - pls_handle_valuesearch - pls_ensure_fresh + - with_strategy_check (return copy of the strategy with another check) - Elastic8IndexStrategy.define_current_indexes (abstractmethod) - Elastic8IndexStrategy.each_named_index (based on current_index_definitions) diff --git a/api/search/views.py b/api/search/views.py index 12075a82d..ddb606018 100644 --- a/api/search/views.py +++ b/api/search/views.py @@ -32,7 +32,7 @@ def _handle_request(self, request): if 'scroll' in queryparams: return http.HttpResponseForbidden(reason='Scroll is not supported.') try: - specific_index = index_strategy.get_index_for_sharev2_search(requested_index_strategy) + specific_index = index_strategy.get_strategy_for_sharev2_search(requested_index_strategy) except exceptions.IndexStrategyError as error: raise http.Http404(str(error)) try: diff --git a/api/views/feeds.py b/api/views/feeds.py index 417d479fa..02c0eb955 100644 --- a/api/views/feeds.py +++ b/api/views/feeds.py @@ -34,7 +34,7 @@ class MetadataRecordsRSS(Feed): description = 'Updates to the SHARE open dataset' author_name = 'SHARE' - _search_index: index_strategy.IndexStrategy.SpecificIndex + _search_strategy: index_strategy.IndexStrategy def title(self, obj): query = json.dumps(obj.get('query', 'All')) @@ -43,7 +43,7 @@ def title(self, obj): def get_object(self, request): self._order = request.GET.get('order') elastic_query = request.GET.get('elasticQuery') - self._search_index = index_strategy.get_index_for_sharev2_search(request.GET.get('indexStrategy')) + self._search_strategy = index_strategy.get_strategy_for_sharev2_search(request.GET.get('indexStrategy')) if self._order not in {'date_modified', 'date_updated', 'date_created', 'date_published'}: self._order = 'date_modified' @@ -64,7 +64,7 @@ def get_object(self, request): def items(self, obj): try: - json_response = self._search_index.pls_handle_search__sharev2_backcompat( + json_response = self._search_strategy.pls_handle_search__sharev2_backcompat( request_body=obj, ) except IndexStrategyError: diff --git a/share/admin/search.py b/share/admin/search.py index 11a5a1509..4ccd17a94 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -9,7 +9,10 @@ from share.search.index_messenger import IndexMessenger from share.search.index_strategy import ( IndexStrategy, + 
all_strategy_names, + each_strategy, parse_strategy_name, + parse_specific_index_name, ) @@ -38,7 +41,7 @@ def search_indexes_view(request): def search_index_mappings_view(request, index_name): - _specific_index = get_specific_index(index_name) + _specific_index = parse_specific_index_name(index_name) _mappings = _specific_index.pls_get_mappings() return JsonResponse(_mappings) @@ -62,7 +65,7 @@ def _index_status_by_strategy(): } status_by_strategy = {} _messenger = IndexMessenger() - for _index_strategy in all_index_strategies().values(): + for _index_strategy in each_strategy(): _current_backfill = _backfill_by_checksum.get( str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), ) @@ -70,7 +73,7 @@ def _index_status_by_strategy(): 'current': { 'status': [ _index.pls_get_status() - for _index in _index_strategy.each_named_index() + for _index in _index_strategy.each_subnamed_index() ], 'backfill': _serialize_backfill(_index_strategy, _current_backfill), }, diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py index efd9edc76..6c6fd8afa 100644 --- a/share/models/feature_flag.py +++ b/share/models/feature_flag.py @@ -33,6 +33,7 @@ class FeatureFlag(models.Model): FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed' TROVESEARCH_DENORMILY = 'trovesearch_denormily' PREPRINT_AFFILIATIONS = 'preprint_affiliations' + TROVESEARCH_SPLINTDEX = 'trovesearch_splintdex' # name _should_ be one of the constants above, but that is not enforced by `choices` name = models.TextField(unique=True) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index f017b10c5..abf57e8d0 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -1,7 +1,5 @@ from __future__ import annotations import enum -import functools -from types import MappingProxyType from typing import Iterator from django.conf import settings @@ -29,48 +27,31 @@ ) -class _StrategyTypes(enum.Enum): - if settings.ELASTICSEARCH5_URL: - sharev2_elastic5 = Sharev2Elastic5IndexStrategy - if settings.ELASTICSEARCH8_URL: - sharev2_elastic8 = Sharev2Elastic8IndexStrategy - trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy - trovesearch_denorm = TrovesearchDenormIndexStrategy - - def new_strategy_instance(self, strategy_check: str = '', *, for_search=False) -> IndexStrategy: - _strategy_type = self.value - _strategy = _strategy_type(strategy_name=self.name, strategy_check=strategy_check) - return ( - _strategy.get_default_search_strategy() - if (for_search and not strategy_check) - else _strategy - ) +class _AvailableStrategies(enum.Enum): + '''static source of truth for available index strategies + (don't import this enum directly -- access via the other functions in this module) + ''' -def each_strategy() -> Iterator[IndexStrategy]: - for _strat_enum in _StrategyTypes: - yield _strat_enum.new_strategy_instance() + if settings.ELASTICSEARCH5_URL: + sharev2_elastic5 = Sharev2Elastic5IndexStrategy('sharev2_elastic5') + if settings.ELASTICSEARCH8_URL: + sharev2_elastic8 = Sharev2Elastic8IndexStrategy('sharev2_elastic8') + trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats') + trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') -def all_strategy_names() -> frozenset[str]: - return frozenset(_StrategyTypes.__members__.keys()) +### +# module public interface -def parse_strategy_name(requested_strategy_name: str, *, for_search=False) -> IndexStrategy: - (_strategyname, *_etc) = 
parse_indexname_parts(requested_strategy_name) - return get_strategy( - strategy_name=_strategyname, - strategy_check=(_etc[0] if _etc else ''), - for_search=for_search - ) +def all_strategy_names() -> frozenset[str]: + return frozenset(_AvailableStrategies.__members__.keys()) -def parse_specific_index_name(index_name: str) -> IndexStrategy.SpecificIndex: - try: - _strategy = parse_strategy_name(index_name) - return _strategy.parse_full_index_name(index_name) - except IndexStrategyError: - raise IndexStrategyError(f'invalid index_name "{index_name}"') +def each_strategy() -> Iterator[IndexStrategy]: + for _strat_enum in _AvailableStrategies: + yield _strat_enum.value def get_strategy( @@ -80,14 +61,19 @@ def get_strategy( for_search: bool = False, ) -> IndexStrategy: try: - _strat_enum = _StrategyTypes[strategy_name] + _strategy: IndexStrategy = _AvailableStrategies[strategy_name].value except KeyError: raise IndexStrategyError(f'unrecognized strategy name "{strategy_name}"') - return _strat_enum.new_strategy_instance(strategy_check=strategy_check) - + if strategy_check: + _strategy = _strategy.with_strategy_check(strategy_check) + return ( + _strategy.pls_get_default_for_searching() + if (for_search and not strategy_check) + else _strategy + ) -def get_strategy_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificIndex: +def get_strategy_for_sharev2_search(requested_name: str | None = None) -> IndexStrategy: if requested_name: _name = requested_name elif ( @@ -102,13 +88,33 @@ def get_strategy_for_sharev2_search(requested_name=None) -> IndexStrategy.Specif return parse_strategy_name(_name) -def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex: +def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy: if params.index_strategy_name: # specific strategy requested _strategy = parse_strategy_name(params.index_strategy_name, for_search=True) else: - _default_strategy_enum = ( - _StrategyTypes.trovesearch_denorm - if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) - else _StrategyTypes.trove_indexcard_flats + _strategy = get_strategy( + strategy_name=( + _AvailableStrategies.trovesearch_denorm.name + if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) + else _AvailableStrategies.trove_indexcard_flats.name + ), + for_search=True, ) - _strategy = _default_strategy_enum.new_strategy_instance() + return _strategy + + +def parse_specific_index_name(index_name: str) -> IndexStrategy.SpecificIndex: + try: + _strategy = parse_strategy_name(index_name) + return _strategy.parse_full_index_name(index_name) + except IndexStrategyError: + raise IndexStrategyError(f'invalid index_name "{index_name}"') + + +def parse_strategy_name(requested_strategy_name: str, *, for_search=False) -> IndexStrategy: + (_strategyname, *_etc) = parse_indexname_parts(requested_strategy_name) + return get_strategy( + strategy_name=_strategyname, + strategy_check=(_etc[0] if _etc else ''), + for_search=for_search + ) diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 1ea7edd9a..951313b45 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -52,11 +52,15 @@ def __post_init__(self): self.strategy_check = self.CURRENT_STRATEGY_CHECKSUM.hexdigest indexnames.raise_if_invalid_indexname_part(self.strategy_check) - @functools.cache @classmethod + @functools.cache def index_subname_set(cls) -> frozenset[str]: return 
frozenset(cls.each_index_subname()) + def each_subnamed_index(self) -> typing.Iterator[SpecificIndex]: + for _subname in self.index_subname_set(): + yield self.get_index(_subname) + @property def nonurgent_messagequeue_name(self) -> str: return f'{self.strategy_name}.nonurgent' @@ -100,23 +104,29 @@ def assert_strategy_is_current(self): ) ```''') - def get_index_by_subname(self, subname: str) -> IndexStrategy.SpecificIndex: + def get_index(self, subname: str) -> SpecificIndex: return self.SpecificIndex(self, subname) # type: ignore[abstract] - def pls_setup(self, *, skip_backfill=False): - assert self.is_current, 'cannot setup a non-current strategy' - _preexisting_index_count = sum( - _index.pls_check_exists() - for _index in self.each_existing_index() - ) - self.pls_create() - self.pls_start_keeping_live() + def parse_full_index_name(self, index_name: str) -> SpecificIndex: + (_strategy_name, _strategy_check, *_etc) = indexnames.parse_indexname_parts(index_name) + if _strategy_name != self.strategy_name: + raise IndexStrategyError(f'this index belongs to another strategy (expected strategy name "{self.strategy_name}"; got "{_strategy_name}" from index name {index_name})') + _strategy = self.with_strategy_check(_strategy_check) + return _strategy.get_index(_etc[0] if _etc else '') + + def with_strategy_check(self, strategy_check: str) -> IndexStrategy: + return dataclasses.replace(self, strategy_check=strategy_check) + + def pls_setup(self, *, skip_backfill=False) -> None: + if not self.is_current: + raise IndexStrategyError('cannot setup a non-current strategy') + for _index in self.each_subnamed_index(): + _index.pls_create() + _index.pls_start_keeping_live() if skip_backfill: - _backfill = self.index_strategy.get_or_create_backfill() + _backfill = self.get_or_create_backfill() _backfill.backfill_status = _backfill.COMPLETE _backfill.save() - if not _preexisting_index_count: # first index for a strategy is automatic default - self.index_strategy.pls_make_default_for_searching(self) def get_or_create_backfill(self): (index_backfill, _) = IndexBackfill.objects.get_or_create( @@ -196,7 +206,7 @@ def __post_init__(self): if self.subname not in self.index_strategy.index_subname_set(): raise IndexStrategyError( f'invalid subname "{self.subname}"!' 
- f' (expected one of {self.index_strategy.index_subname_set}")' + f' (expected one of {self.index_strategy.index_subname_set()}")' ) @property @@ -206,7 +216,7 @@ def is_current(self) -> bool: @property def full_index_name(self) -> str: return indexnames.combine_indexname_parts( - self.index_strategy.indexname_prefix, + *self.index_strategy.indexname_prefix_parts, self.subname, ) diff --git a/share/search/index_strategy/_indexnames.py b/share/search/index_strategy/_indexnames.py index 3517a6d01..bc9f1e149 100644 --- a/share/search/index_strategy/_indexnames.py +++ b/share/search/index_strategy/_indexnames.py @@ -14,7 +14,7 @@ def raise_if_invalid_indexname_part(indexname_part: str) -> None: def combine_indexname_parts(*indexname_parts: str) -> str: - return INDEXNAME_DELIM.join(indexname_parts) + return INDEXNAME_DELIM.join(filter(bool, indexname_parts)) def parse_indexname_parts(name: str) -> list[str]: diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 5dd64f5b2..f1bbe5453 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -1,6 +1,7 @@ from __future__ import annotations import abc import collections +from collections.abc import Mapping import dataclasses import functools from http import HTTPStatus @@ -27,7 +28,7 @@ class Elastic8IndexStrategy(IndexStrategy): '''abstract base class for index strategies using elasticsearch 8 ''' - index_definitions: typing.ClassVar[IndexDefinitionDict] + index_definitions: typing.ClassVar[dict[str, IndexDefinition]] ### # for use when defining abstract methods in subclasses @@ -102,15 +103,20 @@ def compute_strategy_checksum(cls): } ) + # abstract method from IndexStrategy + @classmethod + def each_index_subname(self) -> typing.Iterable[str]: + yield from self.current_index_defs().keys() + @classmethod @functools.cache - def current_index_defs(cls): + def current_index_defs(cls) -> Mapping[str, IndexDefinition]: # readonly and cached per class return types.MappingProxyType(cls.define_current_indexes()) @classmethod @functools.cache - def _make_elastic8_client(cls) -> elasticsearch8.Elasticsearch: + def _get_elastic8_client(cls) -> elasticsearch8.Elasticsearch: should_sniff = settings.ELASTICSEARCH['SNIFF'] timeout = settings.ELASTICSEARCH['TIMEOUT'] return elasticsearch8.Elasticsearch( @@ -135,12 +141,12 @@ def _make_elastic8_client(cls) -> elasticsearch8.Elasticsearch: @property def es8_client(self): - return self._make_elastic8_client() # cached shared client + return self._get_elastic8_client() # cached classmethod for shared client # abstract method from IndexStrategy - def each_named_index(self): + def each_subnamed_index(self): for _subname in self.current_index_defs().keys(): - yield self.get_index_by_subnames(_subname) + yield self.get_index(_subname) # abstract method from IndexStrategy def each_existing_index(self): @@ -211,20 +217,21 @@ def pls_make_default_for_searching(self): # abstract method from IndexStrategy def pls_get_default_for_searching(self) -> IndexStrategy: - _indexnames = self._get_indexnames_for_alias(self._alias_for_searching) + _searchnames = self._get_indexnames_for_alias(self._alias_for_searching) try: - _indexname = _indexnames.pop() - except KeyError: - return - # a SpecificIndex for an alias will work fine for searching, but - # will error if you try to invoke lifecycle hooks - return self.get_index_by_subnames(self._alias_for_searching) + (_indexname, *_) = _searchnames + except ValueError: + return self # no default 
set, this one's fine + (_strategyname, _strategycheck, *_) = parse_indexname_parts(_indexname) + assert _strategyname == self.strategy_name + return self.with_strategy_check(_strategycheck) # override from IndexStrategy def pls_mark_backfill_complete(self): super().pls_mark_backfill_complete() # explicit refresh after bulk operation - self.for_current_index().pls_refresh() + for _index in self.each_subnamed_index(): + _index.pls_refresh() @property def _alias_for_searching(self): @@ -262,8 +269,10 @@ def _get_indexnames_for_action( is_backfill_action: bool = False, ) -> set[str]: if is_backfill_action: - return {self.get_index_by_subnames(index_subname).full_index_name} - _indexes_kept_live = self._get_indexnames_for_alias(self._alias_for_keeping_live) + return {self.get_index(index_subname).full_index_name} + # note: using alias directly to reduce bulk-action clutter + # -- shortcut around `self._get_indexnames_for_alias(self._alias_for_keeping_live)` + return {self._alias_for_keeping_live} def _get_indexnames_for_alias(self, alias_name) -> set[str]: try: @@ -304,7 +313,11 @@ def _set_indexnames_for_alias(self, alias_name, indexnames): @dataclasses.dataclass class SpecificIndex(IndexStrategy.SpecificIndex): - index_strategy: Elastic8IndexStrategy + index_strategy: Elastic8IndexStrategy # note: narrower type + + @property + def index_def(self) -> Elastic8IndexStrategy.IndexDefinition: + return self.index_strategy.current_index_defs()[self.subname] # abstract method from IndexStrategy.SpecificIndex def pls_get_status(self) -> IndexStatus: @@ -368,13 +381,14 @@ def pls_create(self): .exists(index=index_to_create) ) if not index_exists: - logger.warning('Creating index %s', index_to_create) + logger.info('Creating index %s', index_to_create) + _index_def = self.index_def ( self.index_strategy.es8_client.indices .create( index=index_to_create, - settings=self.index_strategy.index_settings(), - mappings=self.index_strategy.index_mappings(), + settings=_index_def.settings, + mappings=_index_def.mappings, ) ) self.pls_refresh() diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index fdf8d0f95..34ae4d80a 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -1,3 +1,5 @@ +from __future__ import annotations +import functools import json import logging @@ -35,10 +37,11 @@ class Sharev2Elastic5IndexStrategy(IndexStrategy): # perpetuated optimizations from times long past MAX_CHUNK_BYTES = 10 * 1024 ** 2 # 10 megs - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + @classmethod + @functools.cache + def _get_elastic5_client(cls) -> elasticsearch5.Elasticsearch: should_sniff = settings.ELASTICSEARCH['SNIFF'] - self.es5_client = elasticsearch5.Elasticsearch( + return elasticsearch5.Elasticsearch( settings.ELASTICSEARCH5_URL, retry_on_timeout=True, timeout=settings.ELASTICSEARCH['TIMEOUT'], @@ -50,6 +53,19 @@ def __init__(self, *args, **kwargs): sniffer_timeout=60 if should_sniff else None, ) + @property + def es5_client(self): + return self._get_elastic5_client() # cached classmethod for shared client + + @property + def single_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index(self.STATIC_INDEXNAME) + + # abstract method from IndexStrategy + @classmethod + def each_index_subname(self): + yield self.STATIC_INDEXNAME + # override IndexStrategy @property def nonurgent_messagequeue_name(self): @@ -65,11 +81,6 @@ def 
urgent_messagequeue_name(self): def indexname_prefix(self): return self.STATIC_INDEXNAME - # override IndexStrategy - @property - def current_indexname(self): - return self.STATIC_INDEXNAME - # abstract method from IndexStrategy def compute_strategy_checksum(self): return ChecksumIri.digest_json( @@ -83,17 +94,22 @@ def compute_strategy_checksum(self): ) # abstract method from IndexStrategy - def pls_make_default_for_searching(self, specific_index): - assert specific_index.index_strategy is self - assert specific_index.indexname == self.STATIC_INDEXNAME + def pls_make_default_for_searching(self): + pass # the one index is the only one # abstract method from IndexStrategy def pls_get_default_for_searching(self): - return self.parse_full_index_name(self.STATIC_INDEXNAME) + return self # abstract method from IndexStrategy def each_existing_index(self): - yield self.parse_full_index_name(self.STATIC_INDEXNAME) + _index = self.single_index + if _index.pls_check_exists(): + yield _index + + # abstract method from IndexStrategy + def each_subnamed_index(self): + yield self.single_index # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): @@ -315,14 +331,21 @@ def _build_elastic_actions(self, messages_chunk): yield action class SpecificIndex(IndexStrategy.SpecificIndex): + index_strategy: Sharev2Elastic5IndexStrategy # narrow type + + # override IndexStrategy.SpecificIndex + @property + def full_index_name(self): + return self.index_strategy.STATIC_INDEXNAME + # abstract method from IndexStrategy.SpecificIndex def pls_create(self): # check index exists (if not, create) - logger.debug('Ensuring index %s', self.indexname) + logger.debug('Ensuring index %s', self.full_index_name) indices_api = self.index_strategy.es5_client.indices - if not indices_api.exists(index=self.indexname): + if not indices_api.exists(index=self.full_index_name): indices_api.create( - self.indexname, + self.full_index_name, body={ 'settings': self.index_strategy._index_settings(), 'mappings': self.index_strategy._index_mappings(), @@ -334,7 +357,7 @@ def pls_create(self): self.index_strategy.es5_client.cluster .health(wait_for_status='yellow') ) - logger.info('Finished setting up Elasticsearch index %s', self.indexname) + logger.info('Finished setting up Elasticsearch index %s', self.full_index_name) # abstract method from IndexStrategy.SpecificIndex def pls_start_keeping_live(self): @@ -344,7 +367,7 @@ def pls_start_keeping_live(self): def pls_stop_keeping_live(self): raise exceptions.IndexStrategyError( f'{self.__class__.__qualname__} is implemented for only one index, ' - f'"{self.indexname}", which is always kept live (until elasticsearch5 ' + f'"{self.full_index_name}", which is always kept live (until elasticsearch5 ' 'support is dropped)' ) @@ -352,23 +375,23 @@ def pls_stop_keeping_live(self): def pls_refresh(self): ( self.index_strategy.es5_client.indices - .refresh(index=self.indexname) + .refresh(index=self.full_index_name) ) - logger.info('Refreshed index %s', self.indexname) + logger.info('Refreshed index %s', self.full_index_name) # abstract method from IndexStrategy.SpecificIndex def pls_delete(self): - logger.warning(f'{self.__class__.__name__}: deleting index {self.indexname}') + logger.warning(f'{self.__class__.__name__}: deleting index {self.full_index_name}') ( self.index_strategy.es5_client.indices - .delete(index=self.indexname, ignore=[400, 404]) + .delete(index=self.full_index_name, ignore=[400, 404]) ) # abstract method from IndexStrategy.SpecificIndex def 
pls_check_exists(self): return bool( self.index_strategy.es5_client.indices - .exists(index=self.indexname) + .exists(index=self.full_index_name) ) # abstract method from IndexStrategy.SpecificIndex @@ -376,27 +399,27 @@ def pls_get_status(self) -> IndexStatus: try: stats = ( self.index_strategy.es5_client.indices - .stats(index=self.indexname, metric='docs') + .stats(index=self.full_index_name, metric='docs') ) existing_indexes = ( self.index_strategy.es5_client.indices - .get_settings(index=self.indexname, name='index.creation_date') + .get_settings(index=self.full_index_name, name='index.creation_date') ) - index_settings = existing_indexes[self.indexname] - index_stats = stats['indices'][self.indexname] + index_settings = existing_indexes[self.full_index_name] + index_stats = stats['indices'][self.full_index_name] except (KeyError, elasticsearch5.exceptions.NotFoundError): # not yet created return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_strategy_name=self.index_strategy.strategy_name, + specific_indexname=self.full_index_name, is_kept_live=False, is_default_for_searching=False, - creation_date=None, + creation_date='', doc_count=0, ) return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_strategy_name=self.index_strategy.strategy_name, + specific_indexname=self.full_index_name, is_kept_live=True, is_default_for_searching=True, creation_date=timestamp_to_readable_datetime( @@ -411,7 +434,7 @@ def pls_handle_search__sharev2_backcompat(self, request_body=None, request_query ''' try: return self.index_strategy.es5_client.search( - index=self.indexname, + index=self.full_index_name, body=request_body or {}, params=request_queryparams or {}, ) diff --git a/share/search/index_strategy/sharev2_elastic8.py b/share/search/index_strategy/sharev2_elastic8.py index 76e221118..51bfa72b9 100644 --- a/share/search/index_strategy/sharev2_elastic8.py +++ b/share/search/index_strategy/sharev2_elastic8.py @@ -43,7 +43,17 @@ def backfill_message_type(self): return messages.MessageType.BACKFILL_SUID # abstract method from Elastic8IndexStrategy - def index_settings(self): + @classmethod + def define_current_indexes(cls): + return { # empty index subname, for backcompat + '': cls.IndexDefinition( + mappings=cls.index_mappings(), + settings=cls.index_settings(), + ), + } + + @classmethod + def index_settings(cls): return { 'analysis': { 'analyzer': { @@ -78,8 +88,8 @@ def index_settings(self): } } - # abstract method from Elastic8IndexStrategy - def index_mappings(self): + @classmethod + def index_mappings(cls): exact_field = { 'exact': { 'type': 'keyword', diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 6d258a1a6..4b5d163c0 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -72,10 +72,21 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_INDEXCARD - def index_settings(self): + @classmethod + def define_current_indexes(cls): + return { # empty index subname, for backcompat + '': cls.IndexDefinition( + mappings=cls.index_mappings(), + settings=cls.index_settings(), + ), + } + + @classmethod + def index_settings(cls): return {} - def index_mappings(self): + @classmethod + def index_mappings(cls): _capped_keyword = { 'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX, @@ -276,7 
+287,7 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: return self.index_strategy.es8_client.search( - index=self.indexname, + index=self.full_index_name, body={ **(request_body or {}), 'track_total_hits': True, @@ -309,7 +320,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear logger.info(json.dumps(_search_kwargs, indent=2)) try: _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, + index=self.full_index_name, **_search_kwargs, ) except elasticsearch8.TransportError as error: @@ -338,7 +349,7 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value logger.info(json.dumps(_search_kwargs, indent=2)) try: _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, + index=self.full_index_name, **_search_kwargs, ) except elasticsearch8.TransportError as error: diff --git a/tests/api/test_elasticsearch.py b/tests/api/test_elasticsearch.py index 13e6688f5..9a0bacff5 100644 --- a/tests/api/test_elasticsearch.py +++ b/tests/api/test_elasticsearch.py @@ -55,7 +55,7 @@ def test_search(self): with mock.patch('api.search.views.index_strategy') as _mock_index_strategy_module: mock_handle_search = ( _mock_index_strategy_module - .get_index_for_sharev2_search + .get_strategy_for_sharev2_search .return_value .pls_handle_search__sharev2_backcompat ) diff --git a/tests/api/test_feeds.py b/tests/api/test_feeds.py index 49a016664..0e56eac2c 100644 --- a/tests/api/test_feeds.py +++ b/tests/api/test_feeds.py @@ -52,7 +52,7 @@ def fake_items(self, Graph): json.loads(formatted_item) for formatted_item in formatted_items ] - with mock.patch('api.views.feeds.index_strategy.get_index_for_sharev2_search') as mock_get_for_searching: + with mock.patch('api.views.feeds.index_strategy.get_strategy_for_sharev2_search') as mock_get_for_searching: mock_strategy = mock_get_for_searching.return_value mock_strategy.pls_handle_search__sharev2_backcompat.return_value = { 'hits': { diff --git a/trove/views/search.py b/trove/views/search.py index 858707d08..d620b668a 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -53,11 +53,10 @@ def get(self, request): _search_gathering = self._start_gathering(renderer_type=_renderer_type) _search_params = self._parse_search_params(request) _strategy = index_strategy.get_strategy_for_trovesearch(_search_params) - _specific_index = index_strategy.get_index_for_trovesearch(_search_params) _focus = self.focus_type.new( iris=_url, search_params=_search_params, - search_handle=self.get_search_handle(_specific_index, _search_params), + search_handle=self.get_search_handle(_strategy, _search_params), ) if _renderer_type.PASSIVE_RENDER: self._fill_gathering(_search_gathering, _search_params, _focus) @@ -97,17 +96,17 @@ def _fill_gathering(self, search_gathering, search_params, start_focus): else: search_gathering.ask(_attrpaths, focus=_focus) - def get_search_handle(self, specific_index, search_params) -> BasicSearchHandle: - return self._get_wrapped_handler(specific_index)(search_params) + def get_search_handle(self, strategy, search_params) -> BasicSearchHandle: + return self._get_wrapped_handler(strategy)(search_params) def get_search_handler( self, - specific_index: index_strategy.IndexStrategy.SpecificIndex, + strategy: index_strategy.IndexStrategy, ) -> _TrovesearchHandler: raise 
NotImplementedError - def _get_wrapped_handler(self, specific_index): - _raw_handler = self.get_search_handler(specific_index) + def _get_wrapped_handler(self, strategy: index_strategy.IndexStrategy): + _raw_handler = self.get_search_handler(strategy) def _wrapped_handler(search_params): _handle = _raw_handler(search_params) @@ -120,13 +119,13 @@ class CardsearchView(_BaseTrovesearchView): focus_type = CardsearchFocus params_dataclass = CardsearchParams - def get_search_handler(self, specific_index): - return specific_index.pls_handle_cardsearch + def get_search_handler(self, strategy): + return strategy.pls_handle_cardsearch class ValuesearchView(_BaseTrovesearchView): focus_type = ValuesearchFocus params_dataclass = ValuesearchParams - def get_search_handler(self, specific_index): - return specific_index.pls_handle_valuesearch + def get_search_handler(self, strategy): + return strategy.pls_handle_valuesearch From 0a7a5c482df4f18b7ebb52efe23075653a626974 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Jan 2025 17:03:40 -0500 Subject: [PATCH 14/35] wip..... --- share/admin/__init__.py | 6 +- ...dexname_indexbackfill_strategy_checksum.py | 18 ++ share/search/index_status.py | 1 + share/search/index_strategy/__init__.py | 16 +- share/search/index_strategy/_base.py | 14 ++ share/search/index_strategy/elastic8.py | 24 ++- .../search/index_strategy/sharev2_elastic5.py | 2 + .../index_strategy/trovesearch_denorm.py | 179 ++++++++++-------- templates/admin/search-indexes.html | 32 ++-- .../index_strategy/_with_real_services.py | 72 +++---- .../index_strategy/test_sharev2_elastic5.py | 19 +- 11 files changed, 229 insertions(+), 154 deletions(-) create mode 100644 share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py diff --git a/share/admin/__init__.py b/share/admin/__init__.py index 9e68fe2e9..7174cc418 100644 --- a/share/admin/__init__.py +++ b/share/admin/__init__.py @@ -318,15 +318,15 @@ class FormattedMetadataRecordAdmin(admin.ModelAdmin): class IndexBackfillAdmin(admin.ModelAdmin): readonly_fields = ( 'index_strategy_name', - 'specific_indexname', + 'strategy_checksum', 'error_type', 'error_message', 'error_context', ) paginator = TimeLimitedPaginator - list_display = ('index_strategy_name', 'backfill_status', 'created', 'modified', 'specific_indexname') + list_display = ('index_strategy_name', 'backfill_status', 'created', 'modified', 'strategy_checksum') show_full_result_count = False - search_fields = ('index_strategy_name', 'specific_indexname',) + search_fields = ('index_strategy_name', 'strategy_checksum',) actions = ('reset_to_initial',) def reset_to_initial(self, request, queryset): diff --git a/share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py b/share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py new file mode 100644 index 000000000..37867fe2b --- /dev/null +++ b/share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.25 on 2025-01-16 20:32 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('share', '0076_rawdatum_share_rawdatum_expiration_idx'), + ] + + operations = [ + migrations.RenameField( + model_name='indexbackfill', + old_name='specific_indexname', + new_name='strategy_checksum', + ), + ] diff --git a/share/search/index_status.py b/share/search/index_status.py index c413503a3..2c379c8a1 100644 --- a/share/search/index_status.py +++ 
b/share/search/index_status.py @@ -5,6 +5,7 @@ class IndexStatus: creation_date: str index_strategy_name: str + index_subname: str specific_indexname: str is_kept_live: bool = False is_default_for_searching: bool = False diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index abf57e8d0..b7d77c56c 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -40,6 +40,7 @@ class _AvailableStrategies(enum.Enum): sharev2_elastic8 = Sharev2Elastic8IndexStrategy('sharev2_elastic8') trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats') trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') + trovesearch_splint = TrovesearchDenormIndexStrategy('trovesearch_splint') ### @@ -92,14 +93,13 @@ def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> Inde if params.index_strategy_name: # specific strategy requested _strategy = parse_strategy_name(params.index_strategy_name, for_search=True) else: - _strategy = get_strategy( - strategy_name=( - _AvailableStrategies.trovesearch_denorm.name - if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) - else _AvailableStrategies.trove_indexcard_flats.name - ), - for_search=True, - ) + if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_SPLINTDEX): + _strategy_name = _AvailableStrategies.trovesearch_splint.name + elif FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY): + _strategy_name = _AvailableStrategies.trovesearch_denorm.name + else: # default (until it's removed) + _strategy_name = _AvailableStrategies.trove_indexcard_flats.name + _strategy = get_strategy(_strategy_name, for_search=True) return _strategy diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 951313b45..ccdc09f06 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -128,6 +128,10 @@ def pls_setup(self, *, skip_backfill=False) -> None: _backfill.backfill_status = _backfill.COMPLETE _backfill.save() + def pls_teardown(self) -> None: + for _index in self.each_subnamed_index(): + _index.pls_delete() + def get_or_create_backfill(self): (index_backfill, _) = IndexBackfill.objects.get_or_create( index_strategy_name=self.strategy_name, @@ -140,6 +144,16 @@ def pls_start_backfill(self): def pls_mark_backfill_complete(self): self.get_or_create_backfill().pls_mark_complete() + def pls_check_exists(self) -> bool: + return all( + _index.pls_check_exists() + for _index in self.each_subnamed_index() + ) + + def pls_refresh(self) -> None: + for _index in self.each_subnamed_index(): + _index.pls_refresh() + ### # abstract methods (required for concrete subclasses) diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index f1bbe5453..bfa4f21f0 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -94,13 +94,16 @@ def build_update_action(self, doc_id, doc_source): # abstract method from IndexStrategy @classmethod def compute_strategy_checksum(cls): + _current_json = { + _subname: dataclasses.asdict(_def) + for _subname, _def in cls.current_index_defs().items() + } + if set(_current_json.keys()) == {''}: + _current_json = _current_json[''] return ChecksumIri.digest_json( checksumalgorithm_name='sha-256', salt=cls.__name__, - raw_json={ - _subname: dataclasses.asdict(_def) - for _subname, _def in cls.current_index_defs().items() - } + 
raw_json=_current_json, ) # abstract method from IndexStrategy @@ -226,6 +229,17 @@ def pls_get_default_for_searching(self) -> IndexStrategy: assert _strategyname == self.strategy_name return self.with_strategy_check(_strategycheck) + # abstract method from IndexStrategy + def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + return self.es8_client.search( + index=self.indexname_wildcard, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) + # override from IndexStrategy def pls_mark_backfill_complete(self): super().pls_mark_backfill_complete() @@ -324,6 +338,7 @@ def pls_get_status(self) -> IndexStatus: if not self.pls_check_exists(): return IndexStatus( index_strategy_name=self.index_strategy.strategy_name, + index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=False, is_default_for_searching=False, @@ -346,6 +361,7 @@ def pls_get_status(self) -> IndexStatus: ) return IndexStatus( index_strategy_name=self.index_strategy.strategy_name, + index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=( self.index_strategy._alias_for_keeping_live diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 34ae4d80a..b857b3676 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -411,6 +411,7 @@ def pls_get_status(self) -> IndexStatus: # not yet created return IndexStatus( index_strategy_name=self.index_strategy.strategy_name, + index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=False, is_default_for_searching=False, @@ -419,6 +420,7 @@ def pls_get_status(self) -> IndexStatus: ) return IndexStatus( index_strategy_name=self.index_strategy.strategy_name, + index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=True, is_default_for_searching=True, diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 275d4036f..5cd2f6fd0 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -2,6 +2,7 @@ from collections import abc, defaultdict import dataclasses import functools +import itertools import json import logging import re @@ -18,6 +19,7 @@ from share.search import exceptions from share.search import messages +from share.search.index_strategy._base import IndexStrategy from share.search.index_strategy.elastic8 import Elastic8IndexStrategy from share.util.checksum_iri import ChecksumIri from trove import models as trove_db @@ -52,16 +54,27 @@ logger = logging.getLogger(__name__) +_PRIOR_UNSPLIT_STRATEGY_CHECKSUM = ChecksumIri( + checksumalgorithm_name='sha-256', + salt='TrovesearchDenormIndexStrategy', + hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', +) + + +def _is_unsplit_strat(strategy: TrovesearchDenormIndexStrategy) -> bool: + return (strategy.strategy_check == _PRIOR_UNSPLIT_STRATEGY_CHECKSUM.hexdigest) + + class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): CURRENT_STRATEGY_CHECKSUM = ChecksumIri( checksumalgorithm_name='sha-256', salt='TrovesearchDenormIndexStrategy', - hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', + hexdigest='e8fce41147e8436bbfacebf669567a3e941a152261130e7331b36845d5a20952', ) # abstract method from Elastic8IndexStrategy @classmethod - def 
define_current_indexes(cls) -> dict[str, Elastic8IndexStrategy.IndexDefiniton]: + def define_current_indexes(cls) -> dict[str, Elastic8IndexStrategy.IndexDefinition]: return { 'cardsearch': cls.IndexDefinition( settings=cls._index_settings(), @@ -179,6 +192,9 @@ def _paths_and_values_mappings(cls): 'int_by_propertypath': {'type': 'object', 'dynamic': True}, } + ### + # receiving and acting on chunks of messages + # override method from Elastic8IndexStrategy def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexnames: Iterable[str]): task__delete_iri_value_scraps.apply_async( @@ -199,92 +215,96 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _docbuilder = self._SourcedocBuilder(_indexcard_rdf, messages_chunk.timestamp) if not _docbuilder.should_skip(): # if skipped, will be deleted _indexcard_pk = _indexcard_rdf.indexcard_id - yield _indexcard_pk, ( - self.build_index_action( - doc_id=_doc_id, - doc_source=_doc, - ) - for _doc_id, _doc in _docbuilder.build_docs() + _cardsearch_actions = ( + self.build_index_action(_doc_id, _doc) + for _doc_id, _doc in _docbuilder.build_cardsearch_docs() + ) + _valuesearch_actions = ( + self.build_index_action(_doc_id, _doc) + for _doc_id, _doc in _docbuilder.build_valuesearch_docs() ) + if _is_unsplit_strat(self): + _actions_by_index: dict[str, Iterable[dict]] = { + # single combined index + '': itertools.chain(_cardsearch_actions, _valuesearch_actions), + } + else: + _actions_by_index = { + 'cardsearch': _cardsearch_actions, + 'valuesearch': _valuesearch_actions, + } + yield self.MessageActionSet(_indexcard_pk, _actions_by_index) _remaining_indexcard_pks.discard(_indexcard_pk) # delete any that were skipped for any reason for _indexcard_pk in _remaining_indexcard_pks: yield _indexcard_pk, self.build_delete_action(_indexcard_pk) ### - # implement abstract IndexStrategy.SpecificIndex - - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): - index_strategy: TrovesearchDenormIndexStrategy - - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - return self.index_strategy.es8_client.search( - index=self.indexname, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) + # handling searches - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - # cases to handle: - # - sort by field value (streamable) - # - sort by relevance to text (non-streamable) - # - random sort (...non-streamable?) 
- # - first page (full random) - # - subsequent page (reproducibly randomm) - # (for streaming pages, skip aggs and such on subsequents) - # maybe start with a "header" request (no hits, minimal aggs) - _querybuilder = _CardsearchQueryBuilder(cardsearch_params) - _search_kwargs = _querybuilder.build() - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, - source=False, # no need to get _source, identifiers are enough - docvalue_fields=['card.card_iri'], - highlight={ - 'require_field_match': False, - 'fields': {'card.text_by_propertypath.*': {}}, - }, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self.index_strategy._cardsearch_handle( - cardsearch_params, - _es8_response, - _querybuilder.response_cursor, - ) + def cardsearch_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index('' if _is_unsplit_strat(self) else 'cardsearch') - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _path = valuesearch_params.valuesearch_propertypath - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(_path[-1]) - _query = ( - _build_date_valuesearch(valuesearch_params) - if _is_date_search - else _build_iri_valuesearch(valuesearch_params, _cursor) + def valuesearch_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index('' if _is_unsplit_strat(self) else 'valuesearch') + + # abstract method from IndexStrategy + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: + # cases to handle: + # - sort by field value (streamable) + # - sort by relevance to text (non-streamable) + # - random sort (...non-streamable?) 
+ # - first page (full random) + # - subsequent page (reproducibly randomm) + # (for streaming pages, skip aggs and such on subsequents) + # maybe start with a "header" request (no hits, minimal aggs) + _querybuilder = _CardsearchQueryBuilder(cardsearch_params) + _search_kwargs = _querybuilder.build() + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.cardsearch_index().full_index_name, + source=False, # no need to get _source, identifiers are enough + docvalue_fields=['card.card_iri'], + highlight={ + 'require_field_match': False, + 'fields': {'card.text_by_propertypath.*': {}}, + }, + **_search_kwargs, ) - if settings.DEBUG: - logger.info(json.dumps(_query, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - **_query, - index=self.indexname, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return ( - self.index_strategy._valuesearch_dates_response(valuesearch_params, _es8_response) - if _is_date_search - else self.index_strategy._valuesearch_iris_response(valuesearch_params, _es8_response, _cursor) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._cardsearch_handle( + cardsearch_params, + _es8_response, + _querybuilder.response_cursor, + ) + + # abstract method from IndexStrategy + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + _path = valuesearch_params.valuesearch_propertypath + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = osfmap.is_date_property(_path[-1]) + _query = ( + _build_date_valuesearch(valuesearch_params) + if _is_date_search + else _build_iri_valuesearch(valuesearch_params, _cursor) + ) + if settings.DEBUG: + logger.info(json.dumps(_query, indent=2)) + try: + _es8_response = self.es8_client.search( + **_query, + index=self.valuesearch_index().full_index_name, ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return ( + self._valuesearch_dates_response(valuesearch_params, _es8_response) + if _is_date_search + else self._valuesearch_iris_response(valuesearch_params, _es8_response, _cursor) + ) ### # building sourcedocs @@ -313,12 +333,13 @@ def should_skip(self) -> bool: or not any(self.rdfdoc.q(self.focus_iri, osfmap.NAMELIKE_PROPERTIES)) ) - def build_docs(self) -> Iterator[tuple[str, dict]]: - # index once without `iri_value` + def build_cardsearch_docs(self) -> Iterator[tuple[str, dict]]: yield self._doc_id(), { 'card': self._card_subdoc, 'chunk_timestamp': self.chunk_timestamp, } + + def build_valuesearch_docs(self) -> Iterator[tuple[str, dict]]: for _iri in self._fullwalk.paths_by_iri: yield self._doc_id(_iri), { 'card': self._card_subdoc, diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 30d5e11a1..418eba911 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -32,7 +32,8 @@

queues

-

current index: {{indexes.current.status.specific_indexname}}

+ {% for current_index_status in indexes.current.status %} +

current index: {{current_index_status.specific_indexname}}

@@ -43,44 +44,44 @@

current index: {{index

- - - - + + + +
{% trans "created" %}{% trans "links" %}
{{ indexes.current.status.creation_date|default:"--" }}{% if indexes.current.status.is_kept_live %}✓{% endif %}{% if indexes.current.status.is_default_for_searching %}✓{% endif %}{{ indexes.current.status.doc_count }}{{ current_index_status.creation_date|default:"--" }}{% if current_index_status.is_kept_live %}✓{% endif %}{% if current_index_status.is_default_for_searching %}✓{% endif %}{{ current_index_status.doc_count }} - {% if not indexes.current.status.creation_date %} + {% if not current_index_status.creation_date %}
{% csrf_token %} - +
- {% elif not indexes.current.status.is_kept_live %} + {% elif not current_index_status.is_kept_live %}
{% csrf_token %} - +
{% elif indexes.current.backfill.can_start_backfill %}
{% csrf_token %} - +
{% elif indexes.current.backfill.can_mark_backfill_complete %}
{% csrf_token %} - +
{% endif %} - {% if indexes.current.status.creation_date and not indexes.current.status.is_default_for_searching %} + {% if current_index_status.creation_date and not current_index_status.is_default_for_searching %}
{% csrf_token %} - +
@@ -92,13 +93,14 @@

current index: {{index {% trans "backfill" %}:{{ indexes.current.backfill.backfill_status }}

{% endif %} - {% if indexes.current.status.creation_date %} -

{% trans "mappings" %}

+ {% if current_index_status.creation_date %} +

{% trans "mappings" %}

{% endif %}

+ {% endfor %} {% if indexes.prior %}

prior indexes

diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 11c24594d..d85df8ab8 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -26,7 +26,7 @@ def setUp(self): self.index_strategy = self.get_index_strategy() def _fake_get_index_strategy(name): - if self.index_strategy.name == name: + if self.index_strategy.strategy_name == name: return self.index_strategy raise ValueError(f'unknown index strategy in test: {name}') @@ -38,13 +38,12 @@ def _fake_get_index_strategy(name): celery_app=celery_app, index_strategys=[self.index_strategy], ) - self.current_index = self.index_strategy.for_current_index() - self.current_index.pls_delete() # in case it already exists + self.index_strategy.pls_teardown() # in case it already exists self._assert_setup_happypath() def tearDown(self): super().tearDown() - self.current_index.pls_delete() + self.index_strategy.pls_teardown() # HACK: copied from TransactionTestCase._fixture_setup; restores db # to the state from before TransactionTestCase clobbered it (relies # on how django 3.2 implements `serialized_rollback = True`, above) @@ -74,8 +73,8 @@ def _assert_happypath_without_daemon(self, messages_chunk, expected_doc_count): assert all(_response.is_done for _response in _responses) _ids = {_response.index_message.target_id for _response in _responses} assert _ids == set(messages_chunk.target_ids_chunk) - self.current_index.pls_refresh() - _search_response = self.current_index.pls_handle_search__sharev2_backcompat() + self.index_strategy.pls_refresh() + _search_response = self.index_strategy.pls_handle_search__sharev2_backcompat() _hits = _search_response['hits']['hits'] assert len(_hits) == expected_doc_count @@ -85,8 +84,8 @@ def _assert_happypath_with_daemon(self, messages_chunk, expected_doc_count): self.index_messenger.send_messages_chunk(messages_chunk) for _ in range(23): _daemon_control.stop_event.wait(timeout=0.2) - self.current_index.pls_refresh() - _search_response = self.current_index.pls_handle_search__sharev2_backcompat() + self.index_strategy.pls_refresh() + _search_response = self.index_strategy.pls_handle_search__sharev2_backcompat() _hits = _search_response['hits']['hits'] if len(_hits) == expected_doc_count: break # all good @@ -95,31 +94,32 @@ def _assert_happypath_with_daemon(self, messages_chunk, expected_doc_count): def _assert_setup_happypath(self): # initial - assert not self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() - assert not index_status.creation_date - assert not index_status.is_kept_live - assert not index_status.is_default_for_searching - assert not index_status.doc_count - # create index - self.current_index.pls_create() - assert self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() - assert index_status.creation_date - assert not index_status.is_kept_live - assert not index_status.is_default_for_searching - assert not index_status.doc_count - # keep index live (with ingested updates) - self.current_index.pls_start_keeping_live() - index_status = self.current_index.pls_get_status() - assert index_status.creation_date - assert index_status.is_kept_live - assert not index_status.is_default_for_searching - assert not index_status.doc_count - # default index for searching - self.index_strategy.pls_make_default_for_searching(self.current_index) - index_status = 
self.current_index.pls_get_status() - assert index_status.creation_date - assert index_status.is_kept_live - assert index_status.is_default_for_searching - assert not index_status.doc_count + for _index in self.index_strategy.each_subnamed_index(): + assert not _index.pls_check_exists() + index_status = _index.pls_get_status() + assert not index_status.creation_date + assert not index_status.is_kept_live + assert not index_status.is_default_for_searching + assert not index_status.doc_count + # create index + _index.pls_create() + assert _index.pls_check_exists() + index_status = _index.pls_get_status() + assert index_status.creation_date + assert not index_status.is_kept_live + assert not index_status.is_default_for_searching + assert not index_status.doc_count + # keep index live (with ingested updates) + _index.pls_start_keeping_live() + index_status = _index.pls_get_status() + assert index_status.creation_date + assert index_status.is_kept_live + assert not index_status.is_default_for_searching + assert not index_status.doc_count + # default index for searching + self.index_strategy.pls_make_default_for_searching() + index_status = _index.pls_get_status() + assert index_status.creation_date + assert index_status.is_kept_live + assert index_status.is_default_for_searching + assert not index_status.doc_count diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 6b1618301..0a953a542 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -56,30 +56,31 @@ def _get_formatted_record(self): # (single index that will not be updated again before being deleted) def _assert_happypath_until_ingest(self): # initial - assert not self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() + _index = next(self.index_strategy.each_subnamed_index()) + assert not _index.pls_check_exists() + index_status = _index.pls_get_status() assert not index_status.creation_date assert not index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count # create index - self.current_index.pls_create() - assert self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() + _index.pls_create() + assert _index.pls_check_exists() + index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live # change from base class assert index_status.is_default_for_searching # change from base class assert not index_status.doc_count # keep index live (with ingested updates) - self.current_index.pls_start_keeping_live() # now a no-op - index_status = self.current_index.pls_get_status() + _index.pls_start_keeping_live() # now a no-op + index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live assert index_status.is_default_for_searching # change from base class assert not index_status.doc_count # default index for searching - self.index_strategy.pls_make_default_for_searching(self.current_index) # now a no-op - index_status = self.current_index.pls_get_status() + self.index_strategy.pls_make_default_for_searching() # now a no-op + index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live assert index_status.is_default_for_searching From e59629f776f8f694f0f4bb35b8d2cf7c500d7a0e Mon Sep 17 00:00:00 2001 From: abram axel booth Date: 
Thu, 16 Jan 2025 17:03:56 -0500 Subject: [PATCH 15/35] wip...... --- share/models/feature_flag.py | 1 - share/search/daemon.py | 4 +- share/search/index_strategy/__init__.py | 11 ++-- share/search/index_strategy/elastic8.py | 14 ++--- .../search/index_strategy/sharev2_elastic8.py | 52 +++++++++++-------- .../index_strategy/trove_indexcard_flats.py | 6 ++- .../index_strategy/trovesearch_denorm.py | 11 +++- templates/admin/search-indexes.html | 26 ++++++---- .../index_strategy/_with_real_services.py | 3 +- .../search/index_strategy/test_elastic8.py | 17 +++--- .../index_strategy/test_strategy_selection.py | 8 +-- tests/share/search/test_admin_workflow.py | 5 +- tests/share/search/test_index_backfill.py | 8 +-- 13 files changed, 93 insertions(+), 73 deletions(-) diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py index 6c6fd8afa..efd9edc76 100644 --- a/share/models/feature_flag.py +++ b/share/models/feature_flag.py @@ -33,7 +33,6 @@ class FeatureFlag(models.Model): FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed' TROVESEARCH_DENORMILY = 'trovesearch_denormily' PREPRINT_AFFILIATIONS = 'preprint_affiliations' - TROVESEARCH_SPLINTDEX = 'trovesearch_splintdex' # name _should_ be one of the constants above, but that is not enforced by `choices` name = models.TextField(unique=True) diff --git a/share/search/daemon.py b/share/search/daemon.py index 9ce2dbf34..4d33a5b50 100644 --- a/share/search/daemon.py +++ b/share/search/daemon.py @@ -119,7 +119,7 @@ def get_consumers(self, Consumer, channel): ] def __repr__(self): - return '<{}({})>'.format(self.__class__.__name__, self.__index_strategy.name) + return '<{}({})>'.format(self.__class__.__name__, self.__index_strategy.strategy_name) def consume(self, *args, **kwargs): # wrap `consume` in `kombu.Connection.ensure`, following guidance from @@ -191,7 +191,7 @@ def on_message(self, body, message): continue def __repr__(self): - return '<{}({})>'.format(self.__class__.__name__, self.index_strategy.name) + return '<{}({})>'.format(self.__class__.__name__, self.index_strategy.strategy_name) @dataclasses.dataclass diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index b7d77c56c..6c7b28b81 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -93,12 +93,11 @@ def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> Inde if params.index_strategy_name: # specific strategy requested _strategy = parse_strategy_name(params.index_strategy_name, for_search=True) else: - if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_SPLINTDEX): - _strategy_name = _AvailableStrategies.trovesearch_splint.name - elif FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY): - _strategy_name = _AvailableStrategies.trovesearch_denorm.name - else: # default (until it's removed) - _strategy_name = _AvailableStrategies.trove_indexcard_flats.name + _strategy_name = ( + _AvailableStrategies.trovesearch_denorm.name + if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) + else _AvailableStrategies.trove_indexcard_flats.name + ) _strategy = get_strategy(_strategy_name, for_search=True) return _strategy diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index bfa4f21f0..8f2bfba31 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -146,11 +146,6 @@ def _get_elastic8_client(cls) -> elasticsearch8.Elasticsearch: def es8_client(self): 
return self._get_elastic8_client() # cached classmethod for shared client - # abstract method from IndexStrategy - def each_subnamed_index(self): - for _subname in self.current_index_defs().keys(): - yield self.get_index(_subname) - # abstract method from IndexStrategy def each_existing_index(self): indexname_set = set( @@ -284,9 +279,7 @@ def _get_indexnames_for_action( ) -> set[str]: if is_backfill_action: return {self.get_index(index_subname).full_index_name} - # note: using alias directly to reduce bulk-action clutter - # -- shortcut around `self._get_indexnames_for_alias(self._alias_for_keeping_live)` - return {self._alias_for_keeping_live} + return self._get_indexnames_for_alias(self._alias_for_keeping_live) def _get_indexnames_for_alias(self, alias_name) -> set[str]: try: @@ -377,11 +370,10 @@ def pls_get_status(self) -> IndexStatus: # abstract method from IndexStrategy.SpecificIndex def pls_check_exists(self): - full_index_name = self.full_index_name - logger.info(f'{self.__class__.__name__}: checking for index {full_index_name}') + logger.info(f'{self.__class__.__name__}: checking for index {self}') return bool( self.index_strategy.es8_client.indices - .exists(index=full_index_name) + .exists(index=self.full_index_name) ) # abstract method from IndexStrategy.SpecificIndex diff --git a/share/search/index_strategy/sharev2_elastic8.py b/share/search/index_strategy/sharev2_elastic8.py index 51bfa72b9..17992719a 100644 --- a/share/search/index_strategy/sharev2_elastic8.py +++ b/share/search/index_strategy/sharev2_elastic8.py @@ -139,15 +139,18 @@ def index_mappings(cls): # abstract method from Elastic8IndexStrategy def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + def _single_action(suid_id, *actions): + return self.MessageActionSet(suid_id, {'': actions}) + _suid_ids = set(messages_chunk.target_ids_chunk) for _suid_id, _serialized_doc in self._load_docs(_suid_ids): _source_doc = json.loads(_serialized_doc) _doc_id = _source_doc['id'] _suid_ids.discard(_suid_id) if _source_doc.pop('is_deleted', False): - yield _suid_id, '', self.build_delete_action(_doc_id) + yield _single_action(_suid_id, self.build_delete_action(_doc_id)) else: - yield _suid_id, '', self.build_index_action(_doc_id, _source_doc) + yield _single_action(_suid_id, self.build_index_action(_doc_id, _source_doc)) # delete any leftovers for _leftover_suid in SourceUniqueIdentifier.objects.filter(id__in=_suid_ids): _suid_ids.discard(_leftover_suid.id) @@ -155,10 +158,14 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _leftover_suid_id = _leftover_suid.get_backcompat_sharev2_suid().id except SourceUniqueIdentifier.DoesNotExist: _leftover_suid_id = _leftover_suid.id - yield _leftover_suid.id, '', self.build_delete_action(self._get_doc_id(_leftover_suid_id)) + yield _single_action(_leftover_suid_id, self.build_delete_action( + self._get_doc_id(_leftover_suid_id), + )) # these ones don't even exist! 
for _leftover_suid_id in _suid_ids: - yield _leftover_suid_id, '', self.build_delete_action(self._get_doc_id(_leftover_suid_id)) + yield _single_action(_leftover_suid_id, self.build_delete_action( + self._get_doc_id(_leftover_suid_id), + )) def _get_doc_id(self, suid_id: int): return IDObfuscator.encode_id(suid_id, SourceUniqueIdentifier) @@ -181,22 +188,21 @@ def _load_docs(self, suid_ids) -> typing.Iterable[tuple[int, str]]: for _record in _record_qs: yield (_record.suid_id, _record.formatted_metadata) - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): - # optional method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - try: - json_response = self.index_strategy.es8_client.search( - index=self.indexname, - body=(request_body or {}), - params=(request_queryparams or {}), - track_total_hits=True, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - try: # mangle response for some limited backcompat with elasticsearch5 - es8_total = json_response['hits']['total'] - json_response['hits']['total'] = es8_total['value'] - json_response['hits']['_total'] = es8_total - except KeyError: - pass - return json_response + # optional method from IndexStrategy + def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + try: + json_response = self.es8_client.search( + index=self.get_index('').full_index_name, + body=(request_body or {}), + params=(request_queryparams or {}), + track_total_hits=True, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + try: # mangle response for some limited backcompat with elasticsearch5 + es8_total = json_response['hits']['total'] + json_response['hits']['total'] = es8_total['value'] + json_response['hits']['_total'] = es8_total + except KeyError: + pass + return json_response diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 4b5d163c0..4c93ca146 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -265,6 +265,8 @@ def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]) } def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + def _single_action(indexcard_id, *actions): + return self.MessageActionSet(indexcard_id, {'': actions}) _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) for _indexcard_rdf in _indexcard_rdf_qs: @@ -278,11 +280,11 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): doc_source=_sourcedoc, ) _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _indexcard_rdf.indexcard_id, '', _index_action + yield _single_action(_indexcard_rdf.indexcard_id, _index_action) # delete any that don't have "latest" rdf and derived osfmap_json _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) for _indexcard in _leftovers: - yield _indexcard.id, '', self.build_delete_action(_indexcard.get_iri()) + yield _single_action(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): def pls_handle_search__sharev2_backcompat(self, request_body=None, 
request_queryparams=None) -> dict: diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 5cd2f6fd0..15e2d17b2 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -200,7 +200,11 @@ def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexname task__delete_iri_value_scraps.apply_async( kwargs={ 'index_strategy_name': self.strategy_name, - 'indexnames': list(affected_indexnames), + 'indexnames': [ + _indexname + for _indexname in affected_indexnames + if self.parse_full_index_name(_indexname).subname == 'valuesearch' + ], 'card_pks': messages_chunk.target_ids_chunk, 'timestamp': messages_chunk.timestamp, }, @@ -237,7 +241,10 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _remaining_indexcard_pks.discard(_indexcard_pk) # delete any that were skipped for any reason for _indexcard_pk in _remaining_indexcard_pks: - yield _indexcard_pk, self.build_delete_action(_indexcard_pk) + _subname = ('' if _is_unsplit_strat(self) else 'cardsearch') + yield self.MessageActionSet(_indexcard_pk, { + _subname: [self.build_delete_action(_indexcard_pk)], + }) ### # handling searches diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 418eba911..1573f80d7 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -32,18 +32,26 @@

queues

- {% for current_index_status in indexes.current.status %} -

current index: {{current_index_status.specific_indexname}}

+

current indexes

+ {% if indexes.current.backfill.backfill_admin_url %} +

+ {% trans "backfill" %}:{{ indexes.current.backfill.backfill_status }} +

+ {% endif %} + + - + {% for current_index_status in indexes.current.status %} + +
@@ -88,34 +96,32 @@

current index: {{current {% endif %}

+ + {% endfor %}
{% trans "subname" %} {% trans "created" %} {% trans "is kept live" %} {% trans "is default for searching" %} {% trans "doc count" %} {% trans "actions" %} {% trans "links" %}{% trans "full name" %}
{{ current_index_status.index_subname }} {{ current_index_status.creation_date|default:"--" }} {% if current_index_status.is_kept_live %}✓{% endif %} {% if current_index_status.is_default_for_searching %}✓{% endif %} - {% if indexes.current.backfill.backfill_admin_url %} -

- {% trans "backfill" %}:{{ indexes.current.backfill.backfill_status }} -

- {% endif %} {% if current_index_status.creation_date %}

{% trans "mappings" %}

{% endif %}
{{ index_status.specific_indexname }}
- {% endfor %} {% if indexes.prior %}

prior indexes

+ - + {% for index_status in indexes.prior %} + diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index d85df8ab8..e8509787d 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -31,7 +31,7 @@ def _fake_get_index_strategy(name): raise ValueError(f'unknown index strategy in test: {name}') self.enterContext(mock.patch( - 'share.search.index_strategy.get_index_strategy', + 'share.search.index_strategy.get_strategy', new=_fake_get_index_strategy, )) self.index_messenger = IndexMessenger( @@ -39,6 +39,7 @@ def _fake_get_index_strategy(name): index_strategys=[self.index_strategy], ) self.index_strategy.pls_teardown() # in case it already exists + self.index_strategy.pls_refresh() self._assert_setup_happypath() def tearDown(self): diff --git a/tests/share/search/index_strategy/test_elastic8.py b/tests/share/search/index_strategy/test_elastic8.py index 5de732690..6edcb30f9 100644 --- a/tests/share/search/index_strategy/test_elastic8.py +++ b/tests/share/search/index_strategy/test_elastic8.py @@ -17,6 +17,15 @@ class FakeElastic8IndexStrategy(Elastic8IndexStrategy): hexdigest='5371df2d0e3daaa9f1c344d14384cdbe65000f2b449b1c2f30ae322b0321eb12', ) + @classmethod + def define_current_indexes(cls): + return { + '': cls.IndexDefinition( + mappings={'my-mappings': 'lol'}, + settings={'my-settings': 'lol'}, + ), + } + @property def supported_message_types(self): return { @@ -28,12 +37,6 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_SUID - def index_settings(self): - return {'my-settings': 'lol'} - - def index_mappings(self): - return {'my-mappings': 'lol'} - def build_elastic_actions(self, messages_chunk): return FAKE_ACTION_ITERATOR @@ -48,7 +51,7 @@ def mock_es_client(self): @pytest.fixture def fake_strategy(self, mock_es_client, settings): settings.ELASTICSEARCH8_URL = 'http://nowhere.example:12345/' - strat = FakeElastic8IndexStrategy(name='fake_es8') + strat = FakeElastic8IndexStrategy('fake_es8') strat.assert_strategy_is_current() return strat diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index b21204e75..afc458814 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -5,13 +5,14 @@ from share.search.index_strategy import ( IndexStrategy, each_strategy, - all_strategy_names, get_strategy, sharev2_elastic5, sharev2_elastic8, trove_indexcard_flats, trovesearch_denorm, + parse_strategy_name, ) +from share.search.index_strategy._indexnames import combine_indexname_parts @pytest.fixture @@ -46,10 +47,11 @@ def test_get_by_request(self, mock_elastic_clients): for _strategy in each_strategy(): good_requests = [ _strategy.strategy_name, - ''.join((_strategy.indexname_prefix, 'foo')), + combine_indexname_parts(_strategy.strategy_name, _strategy.strategy_check), + combine_indexname_parts(_strategy.strategy_name, _strategy.strategy_check, 'foo'), ] for good_request in good_requests: - _got_strategy = get_strategy(good_request) + _got_strategy = parse_strategy_name(good_request) assert isinstance(_got_strategy, IndexStrategy) assert _got_strategy == _strategy with pytest.raises(IndexStrategyError): diff --git a/tests/share/search/test_admin_workflow.py b/tests/share/search/test_admin_workflow.py index 
640a9e617..fc0ede074 100644 --- a/tests/share/search/test_admin_workflow.py +++ b/tests/share/search/test_admin_workflow.py @@ -17,5 +17,8 @@ def test_admin_search_indexes_view(mock_elastic_clients): resp = client.get('/admin/search-indexes') for strategy_name in index_strategy.all_strategy_names(): _index_strategy = index_strategy.get_strategy(strategy_name) - expected_header = f'

current index: {_index_strategy.current_indexname}

' + expected_header = f'

' assert expected_header.encode() in resp.content + for _index in _index_strategy.each_subnamed_index(): + expected_row = f'

' + assert expected_row.encode() in resp.content diff --git a/tests/share/search/test_index_backfill.py b/tests/share/search/test_index_backfill.py index 2ee24dd41..b18e93a61 100644 --- a/tests/share/search/test_index_backfill.py +++ b/tests/share/search/test_index_backfill.py @@ -11,7 +11,7 @@ class TestIndexBackfillMethods: def fake_strategy(self): fake_strategy = mock.Mock() fake_strategy.name = 'foo' - fake_strategy.for_current_index.return_value.indexname = 'foo_bar' + fake_strategy.CURRENT_STRATEGY_CHECKSUM = 'foo_bar' return fake_strategy @pytest.fixture @@ -20,14 +20,14 @@ def index_backfill(self, fake_strategy): index_strategy_name=fake_strategy.name, ) - def test_happypath(self, index_backfill, fake_strategy): + def test_happypath(self, index_backfill: IndexBackfill, fake_strategy): assert index_backfill.backfill_status == IndexBackfill.INITIAL - assert index_backfill.specific_indexname == '' + assert index_backfill.strategy_checksum == '' with mock.patch('share.tasks.schedule_index_backfill') as mock_task: index_backfill.pls_start(fake_strategy) mock_task.apply_async.assert_called_once_with((index_backfill.pk,)) assert index_backfill.backfill_status == IndexBackfill.WAITING - assert index_backfill.specific_indexname == 'foo_bar' + assert index_backfill.strategy_checksum == 'foo_bar' index_backfill.pls_note_scheduling_has_begun() assert index_backfill.backfill_status == IndexBackfill.SCHEDULING index_backfill.pls_note_scheduling_has_finished() From 8c3350becd0500a06704c2934fc38cccf7070c98 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Jan 2025 17:15:15 -0500 Subject: [PATCH 16/35] wip....... --- share/search/index_strategy/elastic8.py | 33 ++++++++++++------- .../index_strategy/_with_real_services.py | 1 - 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 8f2bfba31..c87441a8c 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -242,6 +242,12 @@ def pls_mark_backfill_complete(self): for _index in self.each_subnamed_index(): _index.pls_refresh() + # override from IndexStrategy + def pls_refresh(self): + super().pls_refresh() # refreshes each index + logger.debug('%s: Waiting for yellow status', self.strategy_name) + self.es8_client.cluster.health(wait_for_status='yellow') + @property def _alias_for_searching(self): return f'{self.indexname_prefix}__search' @@ -370,11 +376,17 @@ def pls_get_status(self) -> IndexStatus: # abstract method from IndexStrategy.SpecificIndex def pls_check_exists(self): - logger.info(f'{self.__class__.__name__}: checking for index {self}') - return bool( + _indexname = self.full_index_name + _result = bool( self.index_strategy.es8_client.indices - .exists(index=self.full_index_name) + .exists(index=_indexname) + ) + logger.info( + f'{_indexname}: exists' + if _result + else f'{_indexname}: does not exist' ) + return _result # abstract method from IndexStrategy.SpecificIndex def pls_create(self): @@ -403,24 +415,21 @@ def pls_create(self): # abstract method from IndexStrategy.SpecificIndex def pls_refresh(self): + _indexname = self.full_index_name ( self.index_strategy.es8_client.indices - .refresh(index=self.full_index_name) - ) - logger.debug('%r: Waiting for yellow status', self) - ( - self.index_strategy.es8_client.cluster - .health(wait_for_status='yellow') + .refresh(index=_indexname) ) - logger.info('%r: Refreshed', self) + logger.info('%s: Refreshed', _indexname) # abstract method from 
IndexStrategy.SpecificIndex def pls_delete(self): + _indexname = self.full_index_name ( self.index_strategy.es8_client.indices - .delete(index=self.full_index_name, ignore=[400, 404]) + .delete(index=_indexname, ignore=[400, 404]) ) - logger.warning('%r: deleted', self) + logger.warning('%s: deleted', _indexname) # abstract method from IndexStrategy.SpecificIndex def pls_start_keeping_live(self): diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index e8509787d..be24eca9e 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -39,7 +39,6 @@ def _fake_get_index_strategy(name): index_strategys=[self.index_strategy], ) self.index_strategy.pls_teardown() # in case it already exists - self.index_strategy.pls_refresh() self._assert_setup_happypath() def tearDown(self): From 0d84aef01a4f3e08145b15a1abdcb86d619a15cf Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 10:58:44 -0500 Subject: [PATCH 17/35] wip..... --- api/search/views.py | 4 +- api/views/feeds.py | 2 +- share/admin/search.py | 2 +- share/models/index_backfill.py | 2 +- share/search/index_strategy/__init__.py | 5 +- share/search/index_strategy/_base.py | 23 +- share/search/index_strategy/elastic8.py | 31 +- .../search/index_strategy/sharev2_elastic5.py | 4 +- .../search/index_strategy/sharev2_elastic8.py | 17 +- .../index_strategy/trove_indexcard_flats.py | 1033 +++++++++-------- .../index_strategy/trovesearch_denorm.py | 15 +- tests/api/test_elasticsearch.py | 2 +- tests/api/test_feeds.py | 2 +- .../_common_trovesearch_tests.py | 10 +- .../index_strategy/_with_real_services.py | 4 +- 15 files changed, 587 insertions(+), 569 deletions(-) diff --git a/api/search/views.py b/api/search/views.py index ddb606018..7cf781947 100644 --- a/api/search/views.py +++ b/api/search/views.py @@ -28,7 +28,7 @@ def post(self, request): def _handle_request(self, request): queryparams = request.query_params.dict() - requested_index_strategy = queryparams.pop('indexStrategy', None) + requested_index_strategy = queryparams.get('indexStrategy', None) if 'scroll' in queryparams: return http.HttpResponseForbidden(reason='Scroll is not supported.') try: @@ -36,7 +36,7 @@ def _handle_request(self, request): except exceptions.IndexStrategyError as error: raise http.Http404(str(error)) try: - response_json = specific_index.pls_handle_search__sharev2_backcompat( + response_json = specific_index.pls_handle_search__passthru( request_body=request.data, request_queryparams=queryparams, ) diff --git a/api/views/feeds.py b/api/views/feeds.py index 02c0eb955..f2a74ecd6 100644 --- a/api/views/feeds.py +++ b/api/views/feeds.py @@ -64,7 +64,7 @@ def get_object(self, request): def items(self, obj): try: - json_response = self._search_strategy.pls_handle_search__sharev2_backcompat( + json_response = self._search_strategy.pls_handle_search__passthru( request_body=obj, ) except IndexStrategyError: diff --git a/share/admin/search.py b/share/admin/search.py index 4ccd17a94..9e1cdd4b7 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -79,7 +79,7 @@ def _index_status_by_strategy(): }, 'prior': sorted(( specific_index.pls_get_status() - for specific_index in _index_strategy.each_existing_index() + for specific_index in _index_strategy.each_existing_index(any_strategy_check=True) if not specific_index.is_current ), reverse=True), 'queues': [ diff --git a/share/models/index_backfill.py 
b/share/models/index_backfill.py index 6b3f6fdba..bcbc63b08 100644 --- a/share/models/index_backfill.py +++ b/share/models/index_backfill.py @@ -76,7 +76,7 @@ def mutex(self): def pls_start(self, index_strategy): with self.mutex() as locked_self: - assert locked_self.index_strategy_name == index_strategy.name + assert locked_self.index_strategy_name == index_strategy.strategy_name _current_checksum = str(index_strategy.CURRENT_STRATEGY_CHECKSUM) if locked_self.strategy_checksum == _current_checksum: # what is "current" has not changed -- should be INITIAL diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 6c7b28b81..4c020b654 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -40,7 +40,6 @@ class _AvailableStrategies(enum.Enum): sharev2_elastic8 = Sharev2Elastic8IndexStrategy('sharev2_elastic8') trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats') trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') - trovesearch_splint = TrovesearchDenormIndexStrategy('trovesearch_splint') ### @@ -81,9 +80,9 @@ def get_strategy_for_sharev2_search(requested_name: str | None = None) -> IndexS settings.ELASTICSEARCH5_URL and not FeatureFlag.objects.flag_is_up(FeatureFlag.ELASTIC_EIGHT_DEFAULT) ): - _name = 'sharev2_elastic5' + _name = _AvailableStrategies.sharev2_elastic5.name elif settings.ELASTICSEARCH8_URL: - _name = 'sharev2_elastic8' + _name = _AvailableStrategies.sharev2_elastic8.name else: raise IndexStrategyError('no available index for sharev2 search') return parse_strategy_name(_name) diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index ccdc09f06..b1ea73ddc 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -108,7 +108,11 @@ def get_index(self, subname: str) -> SpecificIndex: return self.SpecificIndex(self, subname) # type: ignore[abstract] def parse_full_index_name(self, index_name: str) -> SpecificIndex: - (_strategy_name, _strategy_check, *_etc) = indexnames.parse_indexname_parts(index_name) + _parts = indexnames.parse_indexname_parts(index_name) + try: + (_strategy_name, _strategy_check, *_etc) = _parts + except ValueError: + raise IndexStrategyError(f'expected "strategyname__strategycheck", at least (got "{index_name}")') if _strategy_name != self.strategy_name: raise IndexStrategyError(f'this index belongs to another strategy (expected strategy name "{self.strategy_name}"; got "{_strategy_name}" from index name {index_name})') _strategy = self.with_strategy_check(_strategy_check) @@ -182,7 +186,7 @@ def backfill_message_type(self) -> messages.MessageType: raise NotImplementedError @abc.abstractmethod - def each_existing_index(self) -> typing.Iterator[SpecificIndex]: + def each_existing_index(self, *, any_strategy_check: bool = False) -> typing.Iterator[SpecificIndex]: raise NotImplementedError @abc.abstractmethod @@ -206,8 +210,8 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: raise NotImplementedError - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__sharev2_backcompat (either implement it or don\'t use this strategy for backcompat)') + def pls_handle_search__passthru(self, 
request_body=None, request_queryparams=None) -> dict: + raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__passthru (either implement it or don\'t use this strategy for that)') # IndexStrategy.SpecificIndex must be implemented by subclasses # in their own `class SpecificIndex(IndexStrategy.SpecificIndex)` @@ -216,17 +220,14 @@ class SpecificIndex(abc.ABC): index_strategy: IndexStrategy subname: str # unique per index_strategy - def __post_init__(self): - if self.subname not in self.index_strategy.index_subname_set(): - raise IndexStrategyError( - f'invalid subname "{self.subname}"!' - f' (expected one of {self.index_strategy.index_subname_set()}")' - ) - @property def is_current(self) -> bool: return self.index_strategy.is_current + @property + def has_valid_subname(self) -> bool: + return self.subname in self.index_strategy.index_subname_set() + @property def full_index_name(self) -> str: return indexnames.combine_indexname_parts( diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index c87441a8c..18a401b93 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -18,7 +18,10 @@ from share.search import messages from share.search.index_strategy._util import timestamp_to_readable_datetime from share.util.checksum_iri import ChecksumIri -from ._indexnames import parse_indexname_parts +from ._indexnames import ( + parse_indexname_parts, + combine_indexname_parts, +) logger = logging.getLogger(__name__) @@ -147,15 +150,20 @@ def es8_client(self): return self._get_elastic8_client() # cached classmethod for shared client # abstract method from IndexStrategy - def each_existing_index(self): + def each_existing_index(self, *, any_strategy_check: bool = False): + _index_wildcard = ( + combine_indexname_parts(self.strategy_name, '*') + if any_strategy_check + else self.indexname_wildcard + ) indexname_set = set( self.es8_client.indices - .get(index=self.indexname_wildcard, features=',') + .get(index=_index_wildcard, features=',') .keys() ) for indexname in indexname_set: _index = self.parse_full_index_name(indexname) - assert _index.index_strategy == self + assert _index.index_strategy.strategy_name == self.strategy_name yield _index # abstract method from IndexStrategy @@ -225,9 +233,16 @@ def pls_get_default_for_searching(self) -> IndexStrategy: return self.with_strategy_check(_strategycheck) # abstract method from IndexStrategy - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + _queryparams = request_queryparams or {} + _requested_strategy = _queryparams.pop('indexStrategy', '') + _indexname = self.indexname_wildcard + if _requested_strategy and _requested_strategy.startswith(self.indexname_prefix): + _index = self.parse_full_index_name(_requested_strategy) + if _index.has_valid_subname: + _indexname = _index.full_index_name return self.es8_client.search( - index=self.indexname_wildcard, + index=_indexname, body={ **(request_body or {}), 'track_total_hits': True, @@ -250,11 +265,11 @@ def pls_refresh(self): @property def _alias_for_searching(self): - return f'{self.indexname_prefix}__search' + return combine_indexname_parts(self.strategy_name, 'search') @property def _alias_for_keeping_live(self): - return f'{self.indexname_prefix}__live' + return combine_indexname_parts(self.strategy_name, 'live') def 
_elastic_actions_with_index( self, diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index b857b3676..27c4d6936 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -102,7 +102,7 @@ def pls_get_default_for_searching(self): return self # abstract method from IndexStrategy - def each_existing_index(self): + def each_existing_index(self, *args, **kwargs): _index = self.single_index if _index.pls_check_exists(): yield _index @@ -431,7 +431,7 @@ def pls_get_status(self) -> IndexStatus: ) # optional method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: '''the definitive sharev2-search api: passthru to elasticsearch version 5 ''' try: diff --git a/share/search/index_strategy/sharev2_elastic8.py b/share/search/index_strategy/sharev2_elastic8.py index 17992719a..6de96a668 100644 --- a/share/search/index_strategy/sharev2_elastic8.py +++ b/share/search/index_strategy/sharev2_elastic8.py @@ -139,7 +139,7 @@ def index_mappings(cls): # abstract method from Elastic8IndexStrategy def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - def _single_action(suid_id, *actions): + def _make_actionset(suid_id, *actions): return self.MessageActionSet(suid_id, {'': actions}) _suid_ids = set(messages_chunk.target_ids_chunk) @@ -147,10 +147,11 @@ def _single_action(suid_id, *actions): _source_doc = json.loads(_serialized_doc) _doc_id = _source_doc['id'] _suid_ids.discard(_suid_id) - if _source_doc.pop('is_deleted', False): - yield _single_action(_suid_id, self.build_delete_action(_doc_id)) - else: - yield _single_action(_suid_id, self.build_index_action(_doc_id, _source_doc)) + yield _make_actionset(_suid_id, ( + self.build_delete_action(_doc_id) + if _source_doc.pop('is_deleted', False) + else self.build_index_action(_doc_id, _source_doc) + )) # delete any leftovers for _leftover_suid in SourceUniqueIdentifier.objects.filter(id__in=_suid_ids): _suid_ids.discard(_leftover_suid.id) @@ -158,12 +159,12 @@ def _single_action(suid_id, *actions): _leftover_suid_id = _leftover_suid.get_backcompat_sharev2_suid().id except SourceUniqueIdentifier.DoesNotExist: _leftover_suid_id = _leftover_suid.id - yield _single_action(_leftover_suid_id, self.build_delete_action( + yield _make_actionset(_leftover_suid_id, self.build_delete_action( self._get_doc_id(_leftover_suid_id), )) # these ones don't even exist! 
for _leftover_suid_id in _suid_ids: - yield _single_action(_leftover_suid_id, self.build_delete_action( + yield _make_actionset(_leftover_suid_id, self.build_delete_action( self._get_doc_id(_leftover_suid_id), )) @@ -189,7 +190,7 @@ def _load_docs(self, suid_ids) -> typing.Iterable[tuple[int, str]]: yield (_record.suid_id, _record.formatted_metadata) # optional method from IndexStrategy - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: try: json_response = self.es8_client.search( index=self.get_index('').full_index_name, diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 4c93ca146..e5a6488d1 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -14,6 +14,7 @@ from share.search import exceptions from share.search import messages +from share.search.index_strategy._base import IndexStrategy from share.search.index_strategy.elastic8 import Elastic8IndexStrategy from share.util.checksum_iri import ChecksumIri from trove import models as trove_db @@ -166,6 +167,11 @@ def index_mappings(cls): }, } + @property + def __index(self) -> IndexStrategy.SpecificIndex: + # this is a single-index strategy -- for back-compat, that index has empty subname + return self.get_index('') + def _build_sourcedoc(self, indexcard_rdf): _rdfdoc = indexcard_rdf.as_rdfdoc_with_supplements() if _should_skip_card(indexcard_rdf, _rdfdoc): @@ -265,7 +271,7 @@ def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]) } def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - def _single_action(indexcard_id, *actions): + def _make_actionset(indexcard_id, *actions): return self.MessageActionSet(indexcard_id, {'': actions}) _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) @@ -280,563 +286,562 @@ def _single_action(indexcard_id, *actions): doc_source=_sourcedoc, ) _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _single_action(_indexcard_rdf.indexcard_id, _index_action) + yield _make_actionset(_indexcard_rdf.indexcard_id, _index_action) # delete any that don't have "latest" rdf and derived osfmap_json _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) for _indexcard in _leftovers: - yield _single_action(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) - - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - return self.index_strategy.es8_client.search( - index=self.full_index_name, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) + yield _make_actionset(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) + + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + return self.es8_client.search( + index=self.get_index('').full_index_name, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - _cursor = self._cardsearch_cursor(cardsearch_params) - _sort = 
self._cardsearch_sort(cardsearch_params.sort_list) - _query = self._cardsearch_query( - cardsearch_params.cardsearch_filter_set, - cardsearch_params.cardsearch_textsegment_set, - cardsearch_cursor=_cursor, - ) - _from_offset = ( - _cursor.start_offset - if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) - else _cursor.start_offset - len(_cursor.first_page_ids) - ) - _search_kwargs = dict( - query=_query, - aggs=self._cardsearch_aggs(cardsearch_params), - sort=_sort, - from_=_from_offset, - size=_cursor.bounded_page_size, - source=False, # no need to get _source; _id is enough - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) - _search_kwargs = dict( - query=self._cardsearch_query( - valuesearch_params.cardsearch_filter_set, - valuesearch_params.cardsearch_textsegment_set, - additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - )}}], - ), - size=0, # ignore cardsearch hits; just want the aggs - aggs=( - self._valuesearch_date_aggs(valuesearch_params) - if _is_date_search - else self._valuesearch_iri_aggs(valuesearch_params, _cursor) - ), + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: + _cursor = self._cardsearch_cursor(cardsearch_params) + _sort = self._cardsearch_sort(cardsearch_params.sort_list) + _query = self._cardsearch_query( + cardsearch_params.cardsearch_filter_set, + cardsearch_params.cardsearch_textsegment_set, + cardsearch_cursor=_cursor, + ) + _from_offset = ( + _cursor.start_offset + if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) + else _cursor.start_offset - len(_cursor.first_page_ids) + ) + _search_kwargs = dict( + query=_query, + aggs=self._cardsearch_aggs(cardsearch_params), + sort=_sort, + from_=_from_offset, + size=_cursor.bounded_page_size, + source=False, # no need to get _source; _id is enough + ) + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.__index.full_index_name, + **_search_kwargs, ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) - - ### - # query implementation - - def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: - _request_cursor = cardsearch_params.page_cursor - if ( - _request_cursor.is_basic() - and not cardsearch_params.sort_list - and not cardsearch_params.cardsearch_textsegment_set - ): - return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) - return OffsetCursor.from_cursor(_request_cursor) - - def 
_cardsearch_query( - self, - filter_set, textsegment_set, *, - additional_filters=None, - cardsearch_cursor: PageCursor | None = None, - ) -> dict: - _bool_query = { - 'filter': additional_filters or [], - 'must': [], - 'must_not': [], - 'should': [], - } - for _searchfilter in filter_set: - if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: - _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: - _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: - _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: - _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator.is_date_operator(): - _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) - else: - raise ValueError(f'unknown filter operator {_searchfilter.operator}') - _textq_builder = self._NestedTextQueryBuilder( - relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) + + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) + _search_kwargs = dict( + query=self._cardsearch_query( + valuesearch_params.cardsearch_filter_set, + valuesearch_params.cardsearch_textsegment_set, + additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + )}}], + ), + size=0, # ignore cardsearch hits; just want the aggs + aggs=( + self._valuesearch_date_aggs(valuesearch_params) + if _is_date_search + else self._valuesearch_iri_aggs(valuesearch_params, _cursor) + ), + ) + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.get_index('').full_index_name, + **_search_kwargs, ) - for _textsegment in textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _bool_query[_boolkey].extend(_textqueries) - if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): - # no need for randomness - return {'bool': _bool_query} - if not cardsearch_cursor.first_page_ids: - # independent random sample - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': {}, # default random_score is fast and unpredictable - }, - } - _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} - if cardsearch_cursor.is_first_page(): - # returning to a first page previously visited - _bool_query['filter'].append(_firstpage_uuid_query) - return {'bool': _bool_query} - # get a subsequent page using reproducible randomness - _bool_query['must_not'].append(_firstpage_uuid_query) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) + + ### + # query implementation + + def 
_cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: + _request_cursor = cardsearch_params.page_cursor + if ( + _request_cursor.is_basic() + and not cardsearch_params.sort_list + and not cardsearch_params.cardsearch_textsegment_set + ): + return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) + return OffsetCursor.from_cursor(_request_cursor) + + def _cardsearch_query( + self, + filter_set, textsegment_set, *, + additional_filters=None, + cardsearch_cursor: PageCursor | None = None, + ) -> dict: + _bool_query = { + 'filter': additional_filters or [], + 'must': [], + 'must_not': [], + 'should': [], + } + for _searchfilter in filter_set: + if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: + _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: + _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: + _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: + _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator.is_date_operator(): + _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) + else: + raise ValueError(f'unknown filter operator {_searchfilter.operator}') + _textq_builder = self._NestedTextQueryBuilder( + relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), + ) + for _textsegment in textsegment_set: + for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): + _bool_query[_boolkey].extend(_textqueries) + if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): + # no need for randomness + return {'bool': _bool_query} + if not cardsearch_cursor.first_page_ids: + # independent random sample return { 'function_score': { 'query': {'bool': _bool_query}, 'boost_mode': 'replace', - 'random_score': { - 'seed': ''.join(cardsearch_cursor.first_page_ids), - 'field': 'indexcard_uuid', - }, + 'random_score': {}, # default random_score is fast and unpredictable }, } + _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} + if cardsearch_cursor.is_first_page(): + # returning to a first page previously visited + _bool_query['filter'].append(_firstpage_uuid_query) + return {'bool': _bool_query} + # get a subsequent page using reproducible randomness + _bool_query['must_not'].append(_firstpage_uuid_query) + return { + 'function_score': { + 'query': {'bool': _bool_query}, + 'boost_mode': 'replace', + 'random_score': { + 'seed': ''.join(cardsearch_cursor.first_page_ids), + 'field': 'indexcard_uuid', + }, + }, + } - def _cardsearch_aggs(self, cardsearch_params): - _aggs = {} - if cardsearch_params.related_property_paths: - _aggs['related_propertypath_usage'] = {'terms': { - 'field': 'iri_paths_present', - 'include': [ - iri_path_as_keyword(_path) - for _path in cardsearch_params.related_property_paths - ], - 'size': len(cardsearch_params.related_property_paths), - }} - return _aggs - - def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): - _nested_iri_bool: dict[str, Any] = { - 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - )}}], - 'must': [], - 'must_not': [], 
- 'should': [], - } - _nested_terms_agg = { - 'field': 'nested_iri.iri_value', - # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_offset + cursor.bounded_page_size + 1, - } - _iris = list(valuesearch_params.valuesearch_iris()) - if _iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.iri_value': _iris, - }}) - _nested_terms_agg['size'] = len(_iris) - _nested_terms_agg['include'] = _iris - _type_iris = list(valuesearch_params.valuesearch_type_iris()) - if _type_iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.value_type_iri': _type_iris, - }}) - _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') - for _textsegment in valuesearch_params.valuesearch_textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _nested_iri_bool[_boolkey].extend(_textqueries) - return { - 'in_nested_iri': { - 'nested': {'path': 'nested_iri'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'bool': _nested_iri_bool}, - 'aggs': { - 'iri_values': { - 'terms': _nested_terms_agg, - 'aggs': { - 'type_iri': {'terms': { - 'field': 'nested_iri.value_type_iri', - }}, - 'name_text': {'terms': { - 'field': 'nested_iri.value_name_text.raw', - }}, - 'title_text': {'terms': { - 'field': 'nested_iri.value_title_text.raw', - }}, - 'label_text': {'terms': { - 'field': 'nested_iri.value_label_text.raw', - }}, - }, + def _cardsearch_aggs(self, cardsearch_params): + _aggs = {} + if cardsearch_params.related_property_paths: + _aggs['related_propertypath_usage'] = {'terms': { + 'field': 'iri_paths_present', + 'include': [ + iri_path_as_keyword(_path) + for _path in cardsearch_params.related_property_paths + ], + 'size': len(cardsearch_params.related_property_paths), + }} + return _aggs + + def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): + _nested_iri_bool: dict[str, Any] = { + 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + suffuniq=True, + )}}], + 'must': [], + 'must_not': [], + 'should': [], + } + _nested_terms_agg = { + 'field': 'nested_iri.iri_value', + # WARNING: terribly inefficient pagination (part one) + 'size': cursor.start_offset + cursor.bounded_page_size + 1, + } + _iris = list(valuesearch_params.valuesearch_iris()) + if _iris: + _nested_iri_bool['filter'].append({'terms': { + 'nested_iri.iri_value': _iris, + }}) + _nested_terms_agg['size'] = len(_iris) + _nested_terms_agg['include'] = _iris + _type_iris = list(valuesearch_params.valuesearch_type_iris()) + if _type_iris: + _nested_iri_bool['filter'].append({'terms': { + 'nested_iri.value_type_iri': _type_iris, + }}) + _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') + for _textsegment in valuesearch_params.valuesearch_textsegment_set: + for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): + _nested_iri_bool[_boolkey].extend(_textqueries) + return { + 'in_nested_iri': { + 'nested': {'path': 'nested_iri'}, + 'aggs': { + 'value_at_propertypath': { + 'filter': {'bool': _nested_iri_bool}, + 'aggs': { + 'iri_values': { + 'terms': _nested_terms_agg, + 'aggs': { + 'type_iri': {'terms': { + 'field': 'nested_iri.value_type_iri', + }}, + 'name_text': {'terms': { + 'field': 'nested_iri.value_name_text.raw', + }}, + 'title_text': {'terms': { + 'field': 'nested_iri.value_title_text.raw', + }}, + 'label_text': {'terms': { + 'field': 
'nested_iri.value_label_text.raw', + }}, }, }, }, }, }, - } + }, + } - def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): - _aggs = { - 'in_nested_date': { - 'nested': {'path': 'nested_date'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - ), - }}, - 'aggs': { - 'count_by_year': { - 'date_histogram': { - 'field': 'nested_date.date_value', - 'calendar_interval': 'year', - 'format': 'yyyy', - 'order': {'_key': 'desc'}, - 'min_doc_count': 1, - }, + def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): + _aggs = { + 'in_nested_date': { + 'nested': {'path': 'nested_date'}, + 'aggs': { + 'value_at_propertypath': { + 'filter': {'term': { + 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + suffuniq=True, + ), + }}, + 'aggs': { + 'count_by_year': { + 'date_histogram': { + 'field': 'nested_date.date_value', + 'calendar_interval': 'year', + 'format': 'yyyy', + 'order': {'_key': 'desc'}, + 'min_doc_count': 1, }, }, }, }, }, - } - return _aggs - - def _valuesearch_handle( - self, - valuesearch_params: ValuesearchParams, - es8_response: dict, - cursor: OffsetCursor, - ): - _iri_aggs = es8_response['aggregations'].get('in_nested_iri') - if _iri_aggs: - _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_offset + cursor.bounded_page_size - _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.total_count = ( - MANY_MORE - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchHandle( - cursor=cursor, - search_result_page=[ - self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - search_params=valuesearch_params, - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations']['in_nested_date'] - ['value_at_propertypath']['count_by_year']['buckets'] - ) - return ValuesearchHandle( - cursor=PageCursor(len(_year_buckets)), - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - search_params=valuesearch_params, - ) - - def _valuesearch_iri_result(self, iri_bucket): - return ValuesearchResult( - value_iri=iri_bucket['key'], - value_type=_bucketlist(iri_bucket['type_iri']), - name_text=_bucketlist(iri_bucket['name_text']), - title_text=_bucketlist(iri_bucket['title_text']), - label_text=_bucketlist(iri_bucket['label_text']), - match_count=iri_bucket['doc_count'], + }, + } + return _aggs + + def _valuesearch_handle( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + cursor: OffsetCursor, + ): + _iri_aggs = es8_response['aggregations'].get('in_nested_iri') + if _iri_aggs: + _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly inefficient pagination (part two) + _page_end_index = cursor.start_offset + cursor.bounded_page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages + cursor.total_count = ( + MANY_MORE + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count ) - - def _valuesearch_date_result(self, date_bucket): - return ValuesearchResult( - value_iri=None, - 
value_value=date_bucket['key_as_string'], - label_text=(date_bucket['key_as_string'],), - match_count=date_bucket['doc_count'], + return ValuesearchHandle( + cursor=cursor, + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + search_params=valuesearch_params, + ) + else: # assume date + _year_buckets = ( + es8_response['aggregations']['in_nested_date'] + ['value_at_propertypath']['count_by_year']['buckets'] + ) + return ValuesearchHandle( + cursor=PageCursor(len(_year_buckets)), + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + search_params=valuesearch_params, ) - def _cardsearch_presence_query(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_presence_query(_path) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} + def _valuesearch_iri_result(self, iri_bucket): + return ValuesearchResult( + value_iri=iri_bucket['key'], + value_type=_bucketlist(iri_bucket['type_iri']), + name_text=_bucketlist(iri_bucket['name_text']), + title_text=_bucketlist(iri_bucket['title_text']), + label_text=_bucketlist(iri_bucket['label_text']), + match_count=iri_bucket['doc_count'], + ) - def _cardsearch_path_presence_query(self, path: tuple[str, ...]): - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, - }} - return {'term': { - 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), - }} + def _valuesearch_date_result(self, date_bucket): + return ValuesearchResult( + value_iri=None, + value_value=date_bucket['key_as_string'], + label_text=(date_bucket['key_as_string'],), + match_count=date_bucket['doc_count'], + ) - def _cardsearch_iri_filter(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_iri_query(_path, search_filter.value_set) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, + def _cardsearch_presence_query(self, search_filter) -> dict: + _filters = [ + self._cardsearch_path_presence_query(_path) + for _path in search_filter.propertypath_set + ] + if len(_filters) == 1: + return _filters[0] + return {'bool': { + 'minimum_should_match': 1, + 'should': _filters, + }} + + def _cardsearch_path_presence_query(self, path: tuple[str, ...]): + if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): + return {'nested': { + 'path': 'nested_iri', + 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, }} + return {'term': { + 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), + }} - def _cardsearch_path_iri_query(self, path, value_set): - _suffuniq_values = [ - get_sufficiently_unique_iri(_iri) - for _iri in value_set - ] - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'bool': { - 'must': [ # both - {'term': {'nested_iri.distance_from_focus': len(path)}}, - {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, - ], - }}, - }} - # without a glob-path, can use the flattened keyword field - return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} + def _cardsearch_iri_filter(self, search_filter) -> dict: + _filters = [ + self._cardsearch_path_iri_query(_path, 
search_filter.value_set) + for _path in search_filter.propertypath_set + ] + if len(_filters) == 1: + return _filters[0] + return {'bool': { + 'minimum_should_match': 1, + 'should': _filters, + }} - def _cardsearch_date_filter(self, search_filter): + def _cardsearch_path_iri_query(self, path, value_set): + _suffuniq_values = [ + get_sufficiently_unique_iri(_iri) + for _iri in value_set + ] + if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): return {'nested': { - 'path': 'nested_date', - 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, + 'path': 'nested_iri', + 'query': {'bool': { + 'must': [ # both + {'term': {'nested_iri.distance_from_focus': len(path)}}, + {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, + ], + }}, }} + # without a glob-path, can use the flattened keyword field + return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} - def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: - # filter by requested paths - yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') - # filter by requested value/operator - if search_filter.operator == SearchFilter.FilterOperator.BEFORE: - _value = min(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'lt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AFTER: - _value = max(search_filter.value_set) # rely on string-comparable isoformat + def _cardsearch_date_filter(self, search_filter): + return {'nested': { + 'path': 'nested_date', + 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, + }} + + def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: + # filter by requested paths + yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') + # filter by requested value/operator + if search_filter.operator == SearchFilter.FilterOperator.BEFORE: + _value = min(search_filter.value_set) # rely on string-comparable isoformat + yield {'range': {'nested_date.date_value': { + 'lt': _daterange_value_and_format(_value) + }}} + elif search_filter.operator == SearchFilter.FilterOperator.AFTER: + _value = max(search_filter.value_set) # rely on string-comparable isoformat + yield {'range': {'nested_date.date_value': { + 'gt': _daterange_value_and_format(_value) + }}} + elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: + for _value in search_filter.value_set: + _filtervalue = _daterange_value_and_format(_value) yield {'range': {'nested_date.date_value': { - 'gt': _daterange_value_and_format(_value) + 'gte': _filtervalue, + 'lte': _filtervalue, }}} - elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: - for _value in search_filter.value_set: - _filtervalue = _daterange_value_and_format(_value) - yield {'range': {'nested_date.date_value': { - 'gte': _filtervalue, - 'lte': _filtervalue, - }}} - else: - raise ValueError(f'invalid date filter operator (got {search_filter.operator})') - - def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): - if not sort_list: - return None - return [ - {'nested_date.date_value': { - 'order': ('desc' if _sortparam.descending else 'asc'), - 'nested': { - 'path': 'nested_date', - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - _sortparam.propertypath, - suffuniq=True, - ), - }}, - }, - }} - for _sortparam in sort_list - ] + else: + raise 
ValueError(f'invalid date filter operator (got {search_filter.operator})') - def _cardsearch_handle( - self, - cardsearch_params: CardsearchParams, - es8_response: dict, - cursor: OffsetCursor, - ) -> CardsearchHandle: - _es8_total = es8_response['hits']['total'] - if _es8_total['relation'] != 'eq': - cursor.total_count = MANY_MORE - elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): - # account for the filtered-out first page - cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) - else: # exact (and small) count - cursor.total_count = _es8_total['value'] - _results = [] - for _es8_hit in es8_response['hits']['hits']: - _card_iri = _es8_hit['_id'] - _results.append(CardsearchResult( - card_iri=_card_iri, - text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), - )) - _relatedproperty_list: list[PropertypathUsage] = [] - if cardsearch_params.related_property_paths: - _relatedproperty_list.extend( - PropertypathUsage(property_path=_path, usage_count=0) - for _path in cardsearch_params.related_property_paths - ) - _relatedproperty_by_path = { - _result.property_path: _result - for _result in _relatedproperty_list - } - for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: - _path = tuple(json.loads(_bucket['key'])) - _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] - return CardsearchHandle( - cursor=cursor, - search_result_page=_results, - related_propertypath_results=_relatedproperty_list, - search_params=cardsearch_params, + def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): + if not sort_list: + return None + return [ + {'nested_date.date_value': { + 'order': ('desc' if _sortparam.descending else 'asc'), + 'nested': { + 'path': 'nested_date', + 'filter': {'term': { + 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( + _sortparam.propertypath, + suffuniq=True, + ), + }}, + }, + }} + for _sortparam in sort_list + ] + + def _cardsearch_handle( + self, + cardsearch_params: CardsearchParams, + es8_response: dict, + cursor: OffsetCursor, + ) -> CardsearchHandle: + _es8_total = es8_response['hits']['total'] + if _es8_total['relation'] != 'eq': + cursor.total_count = MANY_MORE + elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): + # account for the filtered-out first page + cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) + else: # exact (and small) count + cursor.total_count = _es8_total['value'] + _results = [] + for _es8_hit in es8_response['hits']['hits']: + _card_iri = _es8_hit['_id'] + _results.append(CardsearchResult( + card_iri=_card_iri, + text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), + )) + _relatedproperty_list: list[PropertypathUsage] = [] + if cardsearch_params.related_property_paths: + _relatedproperty_list.extend( + PropertypathUsage(property_path=_path, usage_count=0) + for _path in cardsearch_params.related_property_paths ) + _relatedproperty_by_path = { + _result.property_path: _result + for _result in _relatedproperty_list + } + for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: + _path = tuple(json.loads(_bucket['key'])) + _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] + return CardsearchHandle( + cursor=cursor, + search_result_page=_results, + related_propertypath_results=_relatedproperty_list, + search_params=cardsearch_params, + ) - def _gather_textmatch_evidence(self, es8_hit) -> 
Iterable[TextMatchEvidence]: - for _innerhit_group in es8_hit.get('inner_hits', {}).values(): - for _innerhit in _innerhit_group['hits']['hits']: - _property_path = tuple( - json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), + def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: + for _innerhit_group in es8_hit.get('inner_hits', {}).values(): + for _innerhit in _innerhit_group['hits']['hits']: + _property_path = tuple( + json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), + ) + try: + _language_iris = _innerhit['fields']['nested_text.language_iri'] + except KeyError: + _language_iris = () + for _highlight in _innerhit['highlight']['nested_text.text_value']: + yield TextMatchEvidence( + property_path=_property_path, + matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), + card_iri=_innerhit['_id'], ) - try: - _language_iris = _innerhit['fields']['nested_text.language_iri'] - except KeyError: - _language_iris = () - for _highlight in _innerhit['highlight']['nested_text.text_value']: - yield TextMatchEvidence( - property_path=_property_path, - matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), - card_iri=_innerhit['_id'], - ) - - class _SimpleTextQueryBuilder: - def __init__( - self, text_field, *, - relevance_matters=False, - ): - self._text_field = text_field - self._relevance_matters = relevance_matters - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - if textsegment.is_negated: - return {'must_not': [self.exact_text_query(textsegment.text)]} - if not textsegment.is_fuzzy: - return {'must': [self.exact_text_query(textsegment.text)]} - if not self._relevance_matters: - return {'must': [self.fuzzy_text_must_query(textsegment.text)]} - return { - 'must': [self.fuzzy_text_must_query(textsegment.text)], - 'should': [self.fuzzy_text_should_query(textsegment.text)], - } - def exact_text_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match_phrase': { - self._text_field: {'query': text}, - }} - - def fuzzy_text_must_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match': { - self._text_field: { - 'query': text, - 'fuzziness': 'AUTO', - # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} + class _SimpleTextQueryBuilder: + def __init__( + self, text_field, *, + relevance_matters=False, + ): + self._text_field = text_field + self._relevance_matters = relevance_matters + + def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: + if textsegment.is_negated: + return {'must_not': [self.exact_text_query(textsegment.text)]} + if not textsegment.is_fuzzy: + return {'must': [self.exact_text_query(textsegment.text)]} + if not self._relevance_matters: + return {'must': [self.fuzzy_text_must_query(textsegment.text)]} + return { + 'must': [self.fuzzy_text_must_query(textsegment.text)], + 'should': [self.fuzzy_text_should_query(textsegment.text)], + } - def fuzzy_text_should_query(self, text: str): - return {'match_phrase': { - self._text_field: { - 'query': text, - 'slop': len(text.split()), - }, - }} - - class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): - def __init__(self, **kwargs): - super().__init__('nested_text.text_value', **kwargs) - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - return { - _boolkey: [ - self._make_nested_query(textsegment, _query) - 
for _query in _queries - ] - for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() - } + def exact_text_query(self, text: str) -> dict: + # TODO: textsegment.is_openended (prefix query) + return {'match_phrase': { + self._text_field: {'query': text}, + }} - def _make_nested_query(self, textsegment, query): - _nested_q = {'nested': { - 'path': 'nested_text', - 'query': {'bool': { - 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), - 'must': query, - }}, - }} - if self._relevance_matters: - _nested_q['nested']['inner_hits'] = self._inner_hits() - return _nested_q - - def _inner_hits(self, *, highlight_query=None) -> dict: - _highlight = { - 'type': 'unified', - 'fields': {'nested_text.text_value': {}}, - } - if highlight_query is not None: - _highlight['highlight_query'] = highlight_query - return { - 'name': str(uuid.uuid4()), # avoid inner-hit name collisions - 'highlight': _highlight, - '_source': False, # _source is expensive for nested docs - 'docvalue_fields': [ - 'nested_text.path_from_focus', - 'nested_text.language_iri', - ], - } + def fuzzy_text_must_query(self, text: str) -> dict: + # TODO: textsegment.is_openended (prefix query) + return {'match': { + self._text_field: { + 'query': text, + 'fuzziness': 'AUTO', + # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) + }, + }} + + def fuzzy_text_should_query(self, text: str): + return {'match_phrase': { + self._text_field: { + 'query': text, + 'slop': len(text.split()), + }, + }} + + class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): + def __init__(self, **kwargs): + super().__init__('nested_text.text_value', **kwargs) + + def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: + return { + _boolkey: [ + self._make_nested_query(textsegment, _query) + for _query in _queries + ] + for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() + } + + def _make_nested_query(self, textsegment, query): + _nested_q = {'nested': { + 'path': 'nested_text', + 'query': {'bool': { + 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), + 'must': query, + }}, + }} + if self._relevance_matters: + _nested_q['nested']['inner_hits'] = self._inner_hits() + return _nested_q + + def _inner_hits(self, *, highlight_query=None) -> dict: + _highlight = { + 'type': 'unified', + 'fields': {'nested_text.text_value': {}}, + } + if highlight_query is not None: + _highlight['highlight_query'] = highlight_query + return { + 'name': str(uuid.uuid4()), # avoid inner-hit name collisions + 'highlight': _highlight, + '_source': False, # _source is expensive for nested docs + 'docvalue_fields': [ + 'nested_text.path_from_focus', + 'nested_text.language_iri', + ], + } ### diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 15e2d17b2..e776b0ff9 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -59,6 +59,7 @@ salt='TrovesearchDenormIndexStrategy', hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', ) +_UNSPLIT_INDEX_SUBNAME = '' def _is_unsplit_strat(strategy: TrovesearchDenormIndexStrategy) -> bool: @@ -200,11 +201,7 @@ def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexname task__delete_iri_value_scraps.apply_async( kwargs={ 'index_strategy_name': self.strategy_name, - 'indexnames': [ - _indexname - for 
_indexname in affected_indexnames - if self.parse_full_index_name(_indexname).subname == 'valuesearch' - ], + 'indexnames': list(affected_indexnames), 'card_pks': messages_chunk.target_ids_chunk, 'timestamp': messages_chunk.timestamp, }, @@ -230,7 +227,7 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): if _is_unsplit_strat(self): _actions_by_index: dict[str, Iterable[dict]] = { # single combined index - '': itertools.chain(_cardsearch_actions, _valuesearch_actions), + _UNSPLIT_INDEX_SUBNAME: itertools.chain(_cardsearch_actions, _valuesearch_actions), } else: _actions_by_index = { @@ -241,7 +238,7 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _remaining_indexcard_pks.discard(_indexcard_pk) # delete any that were skipped for any reason for _indexcard_pk in _remaining_indexcard_pks: - _subname = ('' if _is_unsplit_strat(self) else 'cardsearch') + _subname = (_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cardsearch') yield self.MessageActionSet(_indexcard_pk, { _subname: [self.build_delete_action(_indexcard_pk)], }) @@ -250,10 +247,10 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): # handling searches def cardsearch_index(self) -> IndexStrategy.SpecificIndex: - return self.get_index('' if _is_unsplit_strat(self) else 'cardsearch') + return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cardsearch') def valuesearch_index(self) -> IndexStrategy.SpecificIndex: - return self.get_index('' if _is_unsplit_strat(self) else 'valuesearch') + return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'valuesearch') # abstract method from IndexStrategy def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: diff --git a/tests/api/test_elasticsearch.py b/tests/api/test_elasticsearch.py index 9a0bacff5..e37ad8141 100644 --- a/tests/api/test_elasticsearch.py +++ b/tests/api/test_elasticsearch.py @@ -57,7 +57,7 @@ def test_search(self): _mock_index_strategy_module .get_strategy_for_sharev2_search .return_value - .pls_handle_search__sharev2_backcompat + .pls_handle_search__passthru ) mock_handle_search.return_value = {'clop': 'clip'} for url in urls: diff --git a/tests/api/test_feeds.py b/tests/api/test_feeds.py index 0e56eac2c..218128baa 100644 --- a/tests/api/test_feeds.py +++ b/tests/api/test_feeds.py @@ -54,7 +54,7 @@ def fake_items(self, Graph): ] with mock.patch('api.views.feeds.index_strategy.get_strategy_for_sharev2_search') as mock_get_for_searching: mock_strategy = mock_get_for_searching.return_value - mock_strategy.pls_handle_search__sharev2_backcompat.return_value = { + mock_strategy.pls_handle_search__passthru.return_value = { 'hits': { 'hits': [ {'_source': item, '_id': item['id']} diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 81461a34e..b237d150c 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -122,7 +122,7 @@ def test_cardsearch_pagination(self): _result_iris: set[str] = set() _page_count = 0 while True: - _cardsearch_handle = self.current_index.pls_handle_cardsearch( + _cardsearch_handle = self.index_strategy.pls_handle_cardsearch( CardsearchParams.from_querystring(_querystring), ) _page_iris = { @@ -151,7 +151,7 @@ def test_cardsearch_related_properties(self): ), ): _cardsearch_params = CardsearchParams.from_querystring('') - 
_cardsearch_handle = self.current_index.pls_handle_cardsearch(_cardsearch_params) + _cardsearch_handle = self.index_strategy.pls_handle_cardsearch(_cardsearch_params) self.assertEqual(_cardsearch_handle.related_propertypath_results, [ PropertypathUsage((DCTERMS.creator,), 3), PropertypathUsage((DCTERMS.references,), 2), @@ -212,7 +212,7 @@ def _assert_cardsearch_iris(self, queryparams: dict, expected_focus_iris: Iterab _querystring = urlencode(queryparams) _cardsearch_params = CardsearchParams.from_querystring(_querystring) assert isinstance(_cardsearch_params, CardsearchParams) - _cardsearch_handle = self.current_index.pls_handle_cardsearch(_cardsearch_params) + _cardsearch_handle = self.index_strategy.pls_handle_cardsearch(_cardsearch_params) # assumes all results fit on one page _actual_result_iris: set[str] | list[str] = [ self._indexcard_focus_by_uuid[_result.card_uuid] @@ -227,7 +227,7 @@ def _assert_valuesearch_values(self, queryparams, expected_values): _querystring = urlencode(queryparams) _valuesearch_params = ValuesearchParams.from_querystring(_querystring) assert isinstance(_valuesearch_params, ValuesearchParams) - _valuesearch_handle = self.current_index.pls_handle_valuesearch(_valuesearch_params) + _valuesearch_handle = self.index_strategy.pls_handle_valuesearch(_valuesearch_params) # assumes all results fit on one page _actual_values = { _result.value_iri or _result.value_value @@ -615,7 +615,7 @@ def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): _response.is_done for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) )) - self.current_index.pls_refresh() + self.index_strategy.pls_refresh() def _delete_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): for _indexcard in indexcards: diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index be24eca9e..582a377a5 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -74,7 +74,7 @@ def _assert_happypath_without_daemon(self, messages_chunk, expected_doc_count): _ids = {_response.index_message.target_id for _response in _responses} assert _ids == set(messages_chunk.target_ids_chunk) self.index_strategy.pls_refresh() - _search_response = self.index_strategy.pls_handle_search__sharev2_backcompat() + _search_response = self.index_strategy.pls_handle_search__passthru() _hits = _search_response['hits']['hits'] assert len(_hits) == expected_doc_count @@ -85,7 +85,7 @@ def _assert_happypath_with_daemon(self, messages_chunk, expected_doc_count): for _ in range(23): _daemon_control.stop_event.wait(timeout=0.2) self.index_strategy.pls_refresh() - _search_response = self.index_strategy.pls_handle_search__sharev2_backcompat() + _search_response = self.index_strategy.pls_handle_search__passthru() _hits = _search_response['hits']['hits'] if len(_hits) == expected_doc_count: break # all good From ae980b724cf567d78c2ed9d12af2da69f8697952 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 15:39:27 -0500 Subject: [PATCH 18/35] wip... 
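A note on scope, summarized from the diff below: this work-in-progress patch lifts status reporting and keep-live toggles from individual indexes up to the strategy level (a new StrategyStatus dataclass, IndexStrategy.pls_get_strategy_status(), each_live_index(), and strategy-level pls_start_keeping_live()/pls_stop_keeping_live()), splits the trovesearch-denorm strategy into 'cards' and 'iri_values' subnamed indexes, and reworks the admin search-indexes template around the new shape. A rough usage sketch of the new status API, illustrative only and assuming the each_strategy() helper exported by share.search.index_strategy:

    from share.search import index_strategy

    for _strategy in index_strategy.each_strategy():
        _status = _strategy.pls_get_strategy_status()  # StrategyStatus, added below
        print(_status.strategy_id, 'set up:', _status.is_set_up, 'kept live:', _status.is_kept_live)
        for _index_status in _status.index_statuses:
            print('  index:', _index_status.index_subname, _index_status.doc_count)
        for _prior_status in _status.existing_prior_strategies:
            print('  prior strategy:', _prior_status.strategy_id)

The admin view change in share/admin/search.py below consumes the same call when rendering the search-indexes page.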
--- share/admin/search.py | 25 +-- share/search/index_status.py | 22 ++- share/search/index_strategy/_base.py | 58 +++++- share/search/index_strategy/elastic8.py | 23 ++- .../search/index_strategy/sharev2_elastic5.py | 10 +- .../index_strategy/trovesearch_denorm.py | 75 +++++--- templates/admin/search-indexes.html | 181 +++++++++--------- .../index_strategy/_with_real_services.py | 10 +- .../index_strategy/test_sharev2_elastic5.py | 2 +- .../index_strategy/test_trovesearch_denorm.py | 1 + trove/render/turtle.py | 2 + trove/trovesearch/trovesearch_gathering.py | 146 +++++++------- 12 files changed, 334 insertions(+), 221 deletions(-) diff --git a/share/admin/search.py b/share/admin/search.py index 9e1cdd4b7..95614a0fc 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -31,9 +31,7 @@ def search_indexes_view(request): }, ) if request.method == 'POST': - _index_strategy = parse_strategy_name( - request.POST['specific_indexname'], # TODO: rename in form - ) + _index_strategy = parse_strategy_name(request.POST['strategy_name']) _pls_doer = PLS_DOERS[request.POST['pls_do']] _pls_doer(_index_strategy) _redirect_id = _index_strategy.strategy_name @@ -66,22 +64,13 @@ def _index_status_by_strategy(): status_by_strategy = {} _messenger = IndexMessenger() for _index_strategy in each_strategy(): - _current_backfill = _backfill_by_checksum.get( - str(_index_strategy.CURRENT_STRATEGY_CHECKSUM), + _current_backfill = ( + _backfill_by_checksum.get(str(_index_strategy.CURRENT_STRATEGY_CHECKSUM)) + or _backfill_by_checksum.get(_index_strategy.indexname_prefix) # backcompat ) status_by_strategy[_index_strategy.strategy_name] = { - 'current': { - 'status': [ - _index.pls_get_status() - for _index in _index_strategy.each_subnamed_index() - ], - 'backfill': _serialize_backfill(_index_strategy, _current_backfill), - }, - 'prior': sorted(( - specific_index.pls_get_status() - for specific_index in _index_strategy.each_existing_index(any_strategy_check=True) - if not specific_index.is_current - ), reverse=True), + 'status': _index_strategy.pls_get_strategy_status(), + 'backfill': _serialize_backfill(_index_strategy, _current_backfill), 'queues': [ { 'name': _queue_name, @@ -143,7 +132,7 @@ def _pls_make_default_for_searching(index_strategy: IndexStrategy): def _pls_delete(index_strategy: IndexStrategy): assert not index_strategy.is_current - index_strategy.pls_delete() + index_strategy.pls_teardown() PLS_DOERS = { diff --git a/share/search/index_status.py b/share/search/index_status.py index 2c379c8a1..1ed16f9b7 100644 --- a/share/search/index_status.py +++ b/share/search/index_status.py @@ -1,12 +1,30 @@ +from __future__ import annotations import dataclasses @dataclasses.dataclass(order=True) class IndexStatus: creation_date: str - index_strategy_name: str index_subname: str specific_indexname: str + doc_count: int = 0 is_kept_live: bool = False is_default_for_searching: bool = False - doc_count: int = 0 + + +@dataclasses.dataclass +class StrategyStatus: + strategy_name: str + strategy_check: str + is_set_up: bool + is_default_for_searching: bool + index_statuses: list[IndexStatus] + existing_prior_strategies: list[StrategyStatus] + + @property + def strategy_id(self): + return f'{self.strategy_name}__{self.strategy_check}' + + @property + def is_kept_live(self) -> bool: + return all(_indexstatus.is_kept_live for _indexstatus in self.index_statuses) diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index b1ea73ddc..858f535ab 100644 --- 
a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -8,7 +8,10 @@ from share.search import messages from share.models.index_backfill import IndexBackfill from share.search.exceptions import IndexStrategyError -from share.search.index_status import IndexStatus +from share.search.index_status import ( + IndexStatus, + StrategyStatus, +) from share.util.checksum_iri import ChecksumIri from trove.trovesearch.search_params import ( CardsearchParams, @@ -24,7 +27,7 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass +@dataclasses.dataclass(frozen=True) class IndexStrategy(abc.ABC): '''an abstraction for indexes in different places and ways. @@ -49,7 +52,7 @@ class IndexStrategy(abc.ABC): def __post_init__(self): indexnames.raise_if_invalid_indexname_part(self.strategy_name) if not self.strategy_check: - self.strategy_check = self.CURRENT_STRATEGY_CHECKSUM.hexdigest + object.__setattr__(self, 'strategy_check', self.CURRENT_STRATEGY_CHECKSUM.hexdigest) indexnames.raise_if_invalid_indexname_part(self.strategy_check) @classmethod @@ -133,7 +136,7 @@ def pls_setup(self, *, skip_backfill=False) -> None: _backfill.save() def pls_teardown(self) -> None: - for _index in self.each_subnamed_index(): + for _index in self.each_existing_index(): _index.pls_delete() def get_or_create_backfill(self): @@ -158,6 +161,45 @@ def pls_refresh(self) -> None: for _index in self.each_subnamed_index(): _index.pls_refresh() + def pls_start_keeping_live(self): + for _index in self.each_subnamed_index(): + _index.pls_start_keeping_live() + + def pls_stop_keeping_live(self): + for _index in self.each_live_index(): + _index.pls_stop_keeping_live() + + def pls_get_strategy_status(self) -> StrategyStatus: + _index_statuses: list[IndexStatus] = [] + _prior_strategy_statuses: list[StrategyStatus] = [] + if self.is_current: + _index_statuses = [ + _index.pls_get_status() + for _index in self.each_subnamed_index() + ] + _prior_strategies = { + _index.index_strategy + for _index in self.each_existing_index(any_strategy_check=True) + if not _index.index_strategy.is_current + } + _prior_strategy_statuses = [ + _strategy.pls_get_strategy_status() + for _strategy in _prior_strategies + ] + else: + _index_statuses = [ + _index.pls_get_status() + for _index in self.each_existing_index() + ] + return StrategyStatus( + strategy_name=self.strategy_name, + strategy_check=self.strategy_check, + is_set_up=self.pls_check_exists(), + is_default_for_searching=(self == self.pls_get_default_for_searching()), + index_statuses=_index_statuses, + existing_prior_strategies=_prior_strategy_statuses, + ) + ### # abstract methods (required for concrete subclasses) @@ -189,6 +231,10 @@ def backfill_message_type(self) -> messages.MessageType: def each_existing_index(self, *, any_strategy_check: bool = False) -> typing.Iterator[SpecificIndex]: raise NotImplementedError + @abc.abstractmethod + def each_live_index(self, *, any_strategy_check: bool = False) -> typing.Iterator[SpecificIndex]: + raise NotImplementedError + @abc.abstractmethod def pls_handle_messages_chunk(self, messages_chunk: messages.MessagesChunk) -> typing.Iterable[messages.IndexMessageResponse]: raise NotImplementedError @@ -263,6 +309,10 @@ def pls_start_keeping_live(self): def pls_stop_keeping_live(self): raise NotImplementedError + @abc.abstractmethod + def is_kept_live(self) -> bool: + raise NotImplementedError + def pls_get_mappings(self) -> dict: raise NotImplementedError diff --git a/share/search/index_strategy/elastic8.py 
b/share/search/index_strategy/elastic8.py index 18a401b93..6d8d89e0d 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -27,7 +27,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass class Elastic8IndexStrategy(IndexStrategy): '''abstract base class for index strategies using elasticsearch 8 ''' @@ -166,6 +165,12 @@ def each_existing_index(self, *, any_strategy_check: bool = False): assert _index.index_strategy.strategy_name == self.strategy_name yield _index + def each_live_index(self, *, any_strategy_check: bool = False): + for _indexname in self._get_indexnames_for_alias(self._alias_for_keeping_live): + _index = self.parse_full_index_name(_indexname) + if any_strategy_check or (_index.index_strategy == self): + yield _index + # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): self.assert_message_type(messages_chunk.message_type) @@ -230,6 +235,7 @@ def pls_get_default_for_searching(self) -> IndexStrategy: return self # no default set, this one's fine (_strategyname, _strategycheck, *_) = parse_indexname_parts(_indexname) assert _strategyname == self.strategy_name + _strategycheck = _strategycheck.rstrip('*') # may be a wildcard alias return self.with_strategy_check(_strategycheck) # abstract method from IndexStrategy @@ -300,7 +306,11 @@ def _get_indexnames_for_action( ) -> set[str]: if is_backfill_action: return {self.get_index(index_subname).full_index_name} - return self._get_indexnames_for_alias(self._alias_for_keeping_live) + return { + _index.full_index_name + for _index in self.each_live_index() + if _index.subname == index_subname + } def _get_indexnames_for_alias(self, alias_name) -> set[str]: try: @@ -351,7 +361,6 @@ def index_def(self) -> Elastic8IndexStrategy.IndexDefinition: def pls_get_status(self) -> IndexStatus: if not self.pls_check_exists(): return IndexStatus( - index_strategy_name=self.index_strategy.strategy_name, index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=False, @@ -374,7 +383,6 @@ def pls_get_status(self) -> IndexStatus: ['indices'][self.full_index_name]['primaries']['docs']['count'] ) return IndexStatus( - index_strategy_name=self.index_strategy.strategy_name, index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=( @@ -462,6 +470,13 @@ def pls_stop_keeping_live(self): ) logger.warning('%r: no longer kept live', self) + # abstract method from IndexStrategy.SpecificIndex + def is_kept_live(self) -> bool: + _kept_live = self.index_strategy._get_indexnames_for_alias( + self.index_strategy._alias_for_keeping_live, + ) + return (self.full_index_name in _kept_live) + def pls_get_mappings(self): return self.index_strategy.es8_client.indices.get_mapping(index=self.full_index_name).body diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 27c4d6936..d2d1557e2 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -107,6 +107,10 @@ def each_existing_index(self, *args, **kwargs): if _index.pls_check_exists(): yield _index + # abstract method from IndexStrategy + def each_live_index(self, *args, **kwargs): + yield self.single_index + # abstract method from IndexStrategy def each_subnamed_index(self): yield self.single_index @@ -363,6 +367,10 @@ def pls_create(self): def pls_start_keeping_live(self): pass # there is just the one index, always kept live + # abstract method from 
IndexStrategy.SpecificIndex + def is_kept_live(self) -> bool: + return True # there is just the one index, always kept live + # abstract method from IndexStrategy.SpecificIndex def pls_stop_keeping_live(self): raise exceptions.IndexStrategyError( @@ -410,7 +418,6 @@ def pls_get_status(self) -> IndexStatus: except (KeyError, elasticsearch5.exceptions.NotFoundError): # not yet created return IndexStatus( - index_strategy_name=self.index_strategy.strategy_name, index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=False, @@ -419,7 +426,6 @@ def pls_get_status(self) -> IndexStatus: doc_count=0, ) return IndexStatus( - index_strategy_name=self.index_strategy.strategy_name, index_subname=self.subname, specific_indexname=self.full_index_name, is_kept_live=True, diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index e776b0ff9..d6f655c68 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -70,23 +70,30 @@ class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): CURRENT_STRATEGY_CHECKSUM = ChecksumIri( checksumalgorithm_name='sha-256', salt='TrovesearchDenormIndexStrategy', - hexdigest='e8fce41147e8436bbfacebf669567a3e941a152261130e7331b36845d5a20952', + hexdigest='4c8784ddd08914ec779b33b8f1945b0b2ff026eea355392ab3c4fe2fe10d71fe', ) # abstract method from Elastic8IndexStrategy @classmethod def define_current_indexes(cls) -> dict[str, Elastic8IndexStrategy.IndexDefinition]: return { - 'cardsearch': cls.IndexDefinition( + 'cards': cls.IndexDefinition( settings=cls._index_settings(), - mappings=cls._cardsearch_index_mappings(), + mappings=cls._cards_index_mappings(), ), - 'valuesearch': cls.IndexDefinition( + 'iri_values': cls.IndexDefinition( settings=cls._index_settings(), - mappings=cls._valuesearch_index_mappings(), + mappings=cls._iri_values_index_mappings(), ), } + # override from IndexStrategy + def each_subnamed_index(self): + if _is_unsplit_strat(self): + yield self.get_index(_UNSPLIT_INDEX_SUBNAME) + else: + yield from super().each_subnamed_index() + # abstract method from IndexStrategy @property def supported_message_types(self): @@ -108,24 +115,27 @@ def _index_settings(cls): } @classmethod - def _cardsearch_index_mappings(cls): + def _cards_index_mappings(cls): return { 'dynamic': 'false', 'dynamic_templates': cls._dynamic_templates(), 'properties': { 'card': {'properties': cls._card_mappings()}, - 'iri_value': {'properties': cls._iri_value_mappings()}, 'chunk_timestamp': {'type': 'unsigned_long'}, }, } @classmethod - def _valuesearch_index_mappings(cls): - _card_mappings = cls._cardsearch_index_mappings() - _card_mappings['properties']['iri_value'] = { - 'properties': cls._iri_value_mappings(), + def _iri_values_index_mappings(cls): + return { + 'dynamic': 'false', + 'dynamic_templates': cls._dynamic_templates(), + 'properties': { + 'card': {'properties': cls._card_mappings()}, + 'iri_value': {'properties': cls._iri_value_mappings()}, + 'chunk_timestamp': {'type': 'unsigned_long'}, + }, } - return _card_mappings @classmethod def _dynamic_templates(cls): @@ -224,21 +234,22 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): self.build_index_action(_doc_id, _doc) for _doc_id, _doc in _docbuilder.build_valuesearch_docs() ) + _actions_by_index: dict[str, Iterable[dict]] if _is_unsplit_strat(self): - _actions_by_index: dict[str, Iterable[dict]] = { - # single combined index + _actions_by_index = { + # 
back-compat: single combined index _UNSPLIT_INDEX_SUBNAME: itertools.chain(_cardsearch_actions, _valuesearch_actions), } else: _actions_by_index = { - 'cardsearch': _cardsearch_actions, - 'valuesearch': _valuesearch_actions, + 'cards': _cardsearch_actions, + 'iri_values': _valuesearch_actions, } yield self.MessageActionSet(_indexcard_pk, _actions_by_index) _remaining_indexcard_pks.discard(_indexcard_pk) # delete any that were skipped for any reason for _indexcard_pk in _remaining_indexcard_pks: - _subname = (_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cardsearch') + _subname = (_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cards') yield self.MessageActionSet(_indexcard_pk, { _subname: [self.build_delete_action(_indexcard_pk)], }) @@ -247,10 +258,10 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): # handling searches def cardsearch_index(self) -> IndexStrategy.SpecificIndex: - return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cardsearch') + return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cards') - def valuesearch_index(self) -> IndexStrategy.SpecificIndex: - return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'valuesearch') + def irivaluesearch_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'iri_values') # abstract method from IndexStrategy def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: @@ -290,17 +301,18 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value _path = valuesearch_params.valuesearch_propertypath _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) _is_date_search = osfmap.is_date_property(_path[-1]) - _query = ( - _build_date_valuesearch(valuesearch_params) - if _is_date_search - else _build_iri_valuesearch(valuesearch_params, _cursor) - ) + if _is_date_search: + _index = self.cardsearch_index() + _query = _build_date_valuesearch(valuesearch_params) + else: + _index = self.irivaluesearch_index() + _query = _build_iri_valuesearch(valuesearch_params, _cursor) if settings.DEBUG: logger.info(json.dumps(_query, indent=2)) try: _es8_response = self.es8_client.search( **_query, - index=self.valuesearch_index().full_index_name, + index=_index.full_index_name, ) except elasticsearch8.TransportError as error: raise exceptions.IndexStrategyError() from error # TODO: error messaging @@ -897,8 +909,6 @@ def _build_date_valuesearch(params: ValuesearchParams) -> dict: relevance_matters=False, ).boolparts(), ) - # exclude iri_value docs (possible optimization: separate indexes) - _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}}) _field = f'card.date_by_propertypath.{_path_field_name(params.valuesearch_propertypath)}' return { 'query': _bool.as_query(), @@ -983,10 +993,15 @@ def task__delete_iri_value_scraps( ''' from share.search.index_strategy import get_strategy _index_strategy = get_strategy(index_strategy_name) - assert isinstance(_index_strategy, Elastic8IndexStrategy) + assert isinstance(_index_strategy, TrovesearchDenormIndexStrategy) + _irivalue_indexnames = { + _index.full_index_name + for _index in _index_strategy.each_live_index(any_strategy_check=True) + if _index.subname == 'iri_values' + } # delete any docs that belong to cards in this chunk but weren't touched by indexing _delete_resp = _index_strategy.es8_client.delete_by_query( - index=indexnames, + 
index=list(_irivalue_indexnames), query={'bool': {'must': [ {'terms': {'card.card_pk': card_pks}}, {'range': {'chunk_timestamp': {'lt': timestamp}}}, diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 1573f80d7..c005233cc 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -6,12 +6,17 @@ section, table { padding-left: 2em; } +nav { + display: flex; + flex-direction: horizontal; + gap: 1em; +} {% endblock %} {% block content %} -

{% trans "elasticsearch indexes" %}

-{% for index_strategy_name, indexes in index_status_by_strategy.items %} +

{% trans "trovesearch status by strategy" %}

+{% for index_strategy_name, strategy_info in index_status_by_strategy.items %}

{{ index_strategy_name }} index strategy

@@ -22,7 +27,7 @@

queues

- {% for queue_info in indexes.queues %} + {% for queue_info in strategy_info.queues %} @@ -32,12 +37,51 @@

queues

{% trans "shortname" %} {% trans "created" %} {% trans "is kept live" %} {% trans "is default for searching" %} {% trans "doc count" %} {% trans "actions" %} {% trans "links" %}{% trans "index name" %}{% trans "full index name" %}
{{ index_status.index_subname }} {{ index_status.creation_date }} {% if index_status.is_kept_live %}✓{% endif %} {% if index_status.is_default_for_searching %}✓{% endif %}
{% trans "depth" %} {% trans "rate (past 30s)" %}
{{ queue_info.name }} {{ queue_info.queue_depth }}
-

current indexes

- {% if indexes.current.backfill.backfill_admin_url %} -

- {% trans "backfill" %}:{{ indexes.current.backfill.backfill_status }} -

- {% endif %} +

current: {{ strategy_info.status.strategy_id }}

+ @@ -45,69 +89,61 @@

current indexes

- - {% for current_index_status in indexes.current.status %} + {% for current_index_status in strategy_info.status.index_statuses %} - - + {% endfor %}
{% trans "subname" %} {% trans "is kept live" %} {% trans "is default for searching" %} {% trans "doc count" %}{% trans "actions" %} {% trans "links" %} {% trans "full name" %}
{{ current_index_status.index_subname }} {{ current_index_status.creation_date|default:"--" }} {% if current_index_status.is_kept_live %}✓{% endif %} {% if current_index_status.is_default_for_searching %}✓{% endif %} {{ current_index_status.doc_count }} - {% if not current_index_status.creation_date %} -
- {% csrf_token %} - - - -
- {% elif not current_index_status.is_kept_live %} -
- {% csrf_token %} - - - -
- {% elif indexes.current.backfill.can_start_backfill %} -
- {% csrf_token %} - - - -
- {% elif indexes.current.backfill.can_mark_backfill_complete %} -
- {% csrf_token %} - - - -
- {% endif %} - {% if current_index_status.creation_date and not current_index_status.is_default_for_searching %} -
- {% csrf_token %} - - - -
- {% endif %} -
{% if current_index_status.creation_date %} -

{% trans "mappings" %}

+ {% trans "mappings" %} {% endif %}
{{ index_status.specific_indexname }}{{ current_index_status.specific_indexname }}
- {% if indexes.prior %} + {% for prior_strategy_status in strategy_info.status.existing_prior_strategies %}
-

prior indexes

+

prior: {{ prior_strategy_status.strategy_id }}

+ @@ -115,49 +151,16 @@

prior indexes

- - {% for index_status in indexes.prior %} + {% for index_status in prior_strategy_status.index_statuses %} -
{% trans "shortname" %} {% trans "is kept live" %} {% trans "is default for searching" %} {% trans "doc count" %}{% trans "actions" %} {% trans "links" %} {% trans "full index name" %}
{{ index_status.index_subname }} {{ index_status.creation_date }} {% if index_status.is_kept_live %}✓{% endif %} {% if index_status.is_default_for_searching %}✓{% endif %} {{ index_status.doc_count }} - {% if not index_status.is_default_for_searching %} -
- {% csrf_token %} - - - -
- {% endif %} - {% if index_status.is_kept_live %} -
- {% csrf_token %} - - - -
- {% else %} -
- {% csrf_token %} - - - -
-
- {% csrf_token %} - - - - -
- {% endif %} -
{% if index_status.creation_date %}

{% trans "mappings" %}

{% endif %} @@ -167,7 +170,7 @@

prior indexes

{% endfor %}
- {% endif %} + {% endfor %} {% endfor %} {% endblock %} diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 582a377a5..4c2c3f0c1 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -109,15 +109,17 @@ def _assert_setup_happypath(self): assert not index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count - # keep index live (with ingested updates) - _index.pls_start_keeping_live() + # keep index live (with ingested updates) + self.index_strategy.pls_start_keeping_live() + for _index in self.index_strategy.each_subnamed_index(): index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count - # default index for searching - self.index_strategy.pls_make_default_for_searching() + # default index for searching + self.index_strategy.pls_make_default_for_searching() + for _index in self.index_strategy.each_subnamed_index(): index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 0a953a542..88e1d6b13 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -72,7 +72,7 @@ def _assert_happypath_until_ingest(self): assert index_status.is_default_for_searching # change from base class assert not index_status.doc_count # keep index live (with ingested updates) - _index.pls_start_keeping_live() # now a no-op + self.index_strategy.pls_start_keeping_live() # now a no-op index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live diff --git a/tests/share/search/index_strategy/test_trovesearch_denorm.py b/tests/share/search/index_strategy/test_trovesearch_denorm.py index 9a94928d3..2e71065a0 100644 --- a/tests/share/search/index_strategy/test_trovesearch_denorm.py +++ b/tests/share/search/index_strategy/test_trovesearch_denorm.py @@ -14,6 +14,7 @@ def setUp(self): # make the followup delete task eager def _fake_apply_async(*args, **kwargs): + self.index_strategy.pls_refresh() kwargs['countdown'] = 0 # don't wait task__delete_iri_value_scraps.apply(*args, **kwargs) self.enterContext( diff --git a/trove/render/turtle.py b/trove/render/turtle.py index fb2d6e352..2b682178c 100644 --- a/trove/render/turtle.py +++ b/trove/render/turtle.py @@ -1,6 +1,7 @@ from primitive_metadata import primitive_rdf as rdf from trove.vocab.namespaces import TROVE +from trove.vocab.trove import trove_shorthand from ._base import BaseRenderer @@ -13,4 +14,5 @@ def simple_render_document(self) -> str: return rdf.turtle_from_tripledict( self.response_data.tripledict, focus=self.response_focus.single_iri(), + shorthand=trove_shorthand, ) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index ca4870e82..c2a4159f9 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -1,7 +1,7 @@ import dataclasses import logging import urllib.parse -from typing import ClassVar +from typing import ClassVar, Any from primitive_metadata.primitive_rdf import ( Literal, @@ -107,10 +107,15 @@ class 
ValuesearchFocus(_TypedFocus): search_handle: ValuesearchHandle = dataclasses.field(compare=False) +@dataclasses.dataclass(frozen=True) class IndexcardFocus(_TypedFocus): TYPE_IRI = TROVE.Indexcard ADDITIONAL_TYPE_IRIS = (DCAT.CatalogRecord,) + # additional dataclass fields + indexcard: trove_db.Indexcard = dataclasses.field(compare=False) + resourceMetadata: Any = dataclasses.field(compare=False, default=None) + # TODO: per-field text search in rdf # @trovesearch_by_indexstrategy.gatherer(TROVE.cardSearchText) @@ -149,7 +154,10 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): _current_handle: CardsearchHandle | None = focus.search_handle while _current_handle is not None: _result_page = [] - _card_descriptions_by_iri = _load_card_descriptions(_current_handle.search_result_page, deriver_iri) + _cards_by_iri, _card_contents_by_iri = _load_cards_and_contents( + (_result.card_iri for _result in _current_handle.search_result_page), + deriver_iri=deriver_iri, + ) for _result in _current_handle.search_result_page or (): _text_evidence_twoples = ( (TROVE.matchEvidence, frozenset(( @@ -162,15 +170,13 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): ) _result_page.append(frozenset(( (RDF.type, TROVE.SearchResult), - (TROVE.indexCard, _result.card_iri), + (TROVE.indexCard, IndexcardFocus.new( + iris=_result.card_iri, + indexcard=_cards_by_iri[_result.card_iri], + resourceMetadata=_card_contents_by_iri.get(_result.card_iri), + )), *_text_evidence_twoples, ))) - try: - _card_description = _card_descriptions_by_iri[_result.card_iri] - except KeyError: - pass - else: - yield from rdf.iter_tripleset(_card_description.tripledict) yield (TROVE.searchResultPage, sequence(_result_page)) _current_handle = _current_handle.get_next_streaming_handle() @@ -236,16 +242,17 @@ def gather_valuesearch_page(focus: ValuesearchFocus, **kwargs): else: _value_indexcards = [] for _result in focus.search_handle.search_result_page or (): - _indexcard_obj = None + _indexcard_obj: Any = None if _result.value_iri in _value_iris: for _indexcard in _value_indexcards: if any( _identifier.equivalent_to_iri(_result.value_iri) for _identifier in _indexcard.focus_identifier_set.all() ): - _indexcard_obj = _indexcard.get_iri() - yield (_indexcard_obj, RDF.type, TROVE.Indexcard) # so gather_card runs - # TODO: batch-load cards instead + _indexcard_obj = IndexcardFocus.new( + iris=_indexcard.get_iri(), + indexcard=_indexcard, + ) break # found the indexcard if _indexcard_obj is None: # no actual indexcard; put what we know in a blanknode-indexcard @@ -282,16 +289,53 @@ def gather_valuesearch_count(focus, **kwargs): # raise trove_exceptions.IriMismatch(f'could not find indexcard iri in {focus.iris} (looking for {_indexcard_namespace})') -def _load_card_descriptions(search_result_page, deriver_iri) -> dict[str, rdf.RdfGraph]: - _card_iris = {_result.card_iri for _result in search_result_page} +@trovesearch_by_indexstrategy.gatherer(DCTERMS.issued, focustype_iris={TROVE.Indexcard}) +def gather_card_issued(focus: IndexcardFocus, **kwargs): + yield (DCTERMS.issued, focus.indexcard.created.date()) + + +@trovesearch_by_indexstrategy.gatherer(DCTERMS.modified, focustype_iris={TROVE.Indexcard}) +def gather_card_modified(focus: IndexcardFocus, **kwargs): + yield (DCTERMS.modified, focus.indexcard.modified.date()) + + +@trovesearch_by_indexstrategy.gatherer( + (FOAF.primaryTopic, TROVE.focusIdentifier), + focustype_iris={TROVE.Indexcard}, +) +def gather_primary_topic(focus: 
IndexcardFocus, **kwargs): + for _identifier in focus.indexcard.focus_identifier_set.all(): + _iri = _identifier.as_iri() + yield (FOAF.primaryTopic, _iri) + yield (TROVE.focusIdentifier, literal(_iri)) + + +@trovesearch_by_indexstrategy.gatherer( + TROVE.resourceMetadata, + focustype_iris={TROVE.Indexcard}, +) +def gather_card_contents(focus: IndexcardFocus, *, deriver_iri, **kwargs): + if focus.resourceMetadata is not None: + yield (TROVE.resourceMetadata, focus.resourceMetadata) + else: + ... + + +def _load_cards_and_contents(card_iris, deriver_iri) -> tuple[ + dict[str, trove_db.Indexcard], # cards by iri + dict[str, Any], # card contents by iri +]: return ( - _load_card_descriptions_nonderived(_card_iris) + _load_cards_and_extracted_rdf_contents(card_iris) if deriver_iri is None - else _load_card_descriptions_derived(_card_iris, deriver_iri) + else _load_cards_and_derived_contents(card_iris, deriver_iri) ) -def _load_card_descriptions_nonderived(card_iris) -> dict[str, rdf.RdfGraph]: +def _load_cards_and_extracted_rdf_contents(card_iris) -> tuple[ + dict[str, trove_db.Indexcard], + dict[str, rdf.QuotedGraph], +]: _card_namespace = trove_indexcard_namespace() _indexcard_uuids = { iri_minus_namespace(_card_iri, namespace=_card_namespace) @@ -303,20 +347,24 @@ def _load_card_descriptions_nonderived(card_iris) -> dict[str, rdf.RdfGraph]: .select_related('indexcard') .prefetch_related('indexcard__focus_identifier_set') ) - _by_card_iri = {} + _cards_by_iri: dict[str, trove_db.Indexcard] = {} + _card_contents_by_iri: dict[str, rdf.QuotedGraph] = {} for _indexcard_rdf in _indexcard_rdf_qs: - _indexcard_iri = _indexcard_rdf.indexcard.get_iri() + _card = _indexcard_rdf.indexcard + _card_iri = _card.get_iri() + _cards_by_iri[_card_iri] = _card _quoted_graph = _indexcard_rdf.as_quoted_graph() _quoted_graph.add( - (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _indexcard_iri), - ) - _by_card_iri[_indexcard_iri] = _describe_indexcard_nonderived( - _indexcard_iri, _indexcard_rdf + (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _card_iri), ) - return _by_card_iri + _card_contents_by_iri[_card_iri] = _quoted_graph + return _cards_by_iri, _card_contents_by_iri -def _load_card_descriptions_derived(card_iris, deriver_iri: str) -> dict[str, rdf.RdfGraph]: +def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> tuple[ + dict[str, trove_db.Indexcard], + dict[str, rdf.Literal], +]: _card_namespace = trove_indexcard_namespace() _indexcard_uuids = { iri_minus_namespace(_card_iri, namespace=_card_namespace) @@ -335,49 +383,13 @@ def _load_card_descriptions_derived(card_iris, deriver_iri: str) -> dict[str, rd .select_related('upriver_indexcard') .prefetch_related('upriver_indexcard__focus_identifier_set') ) - _by_card_iri = {} + _cards_by_iri: dict[str, trove_db.Indexcard] = {} + _card_contents_by_iri: dict[str, rdf.Literal] = {} for _derived in _derived_indexcard_qs: _indexcard_iri = _derived.upriver_indexcard.get_iri() - _by_card_iri[_indexcard_iri] = _describe_indexcard_derived(_indexcard_iri, _derived) - return _by_card_iri - - -def _describe_indexcard_nonderived( - indexcard_iri: str, - indexcard_rdf: trove_db.IndexcardRdf, -) -> rdf.RdfGraph: - _card_description = rdf.RdfGraph({ - indexcard_iri: { - RDF.type: {TROVE.Indexcard, DCAT.CatalogRecord}, - TROVE.resourceMetadata: {indexcard_rdf.as_quoted_graph()}, - DCTERMS.issued: {indexcard_rdf.indexcard.created.date()}, - DCTERMS.modified: {indexcard_rdf.modified.date()}, - }, - }) - for _identifier in 
indexcard_rdf.indexcard.focus_identifier_set.all(): - _iri = _identifier.as_iri() - _card_description.add((indexcard_iri, FOAF.primaryTopic, _iri)) - _card_description.add((indexcard_iri, TROVE.focusIdentifier, literal(_iri))) - return _card_description - - -def _describe_indexcard_derived( - indexcard_iri: str, - derived_indexcard: trove_db.DerivedIndexcard, -) -> rdf.RdfGraph: - _card_description = rdf.RdfGraph({ - indexcard_iri: { - RDF.type: {TROVE.Indexcard, DCAT.CatalogRecord}, - TROVE.resourceMetadata: {derived_indexcard.as_rdf_literal()}, - DCTERMS.issued: {derived_indexcard.upriver_indexcard.created.date()}, - DCTERMS.modified: {derived_indexcard.modified.date()}, - }, - }) - for _identifier in derived_indexcard.upriver_indexcard.focus_identifier_set.all(): - _iri = _identifier.as_iri() - _card_description.add((indexcard_iri, FOAF.primaryTopic, _iri)) - _card_description.add((indexcard_iri, TROVE.focusIdentifier, literal(_iri))) - return _card_description + _cards_by_iri[_indexcard_iri] = _derived.upriver_indexcard + _card_contents_by_iri[_indexcard_iri] = _derived.as_rdf_literal() + return _cards_by_iri, _card_contents_by_iri ### From 4b1fea92fe11474c1dae928d19ba3b010d934776 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 15:59:24 -0500 Subject: [PATCH 19/35] wip.. --- share/bin/search.py | 8 ++--- share/search/index_strategy/elastic8.py | 1 - templates/admin/search-indexes.html | 6 ++-- tests/share/bin/test_sharectl.py | 6 ++-- tests/share/search/__init__.py | 14 ++++----- .../search/index_strategy/test_elastic8.py | 31 ++++++++++--------- tests/share/search/test_index_backfill.py | 4 +-- 7 files changed, 35 insertions(+), 35 deletions(-) diff --git a/share/bin/search.py b/share/bin/search.py index a677d0445..8ccb65c4d 100644 --- a/share/bin/search.py +++ b/share/bin/search.py @@ -26,11 +26,11 @@ def search(args, argv): @search.subcommand('Drop the Elasticsearch index') def purge(args, argv): """ - Usage: {0} search purge ... + Usage: {0} search purge ... """ - for index_name in args['']: - specific_index = index_strategy.get_specific_index(index_name) - specific_index.pls_delete() + for _strategy_name in args['']: + _strategy = index_strategy.parse_strategy_name(_strategy_name) + _strategy.pls_teardown() @search.subcommand('Create indicies and apply mappings') diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 6d8d89e0d..88082bf10 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -415,7 +415,6 @@ def pls_check_exists(self): def pls_create(self): assert self.is_current, ( 'cannot create a non-current version of an index!' - ' maybe try `index_strategy.for_current_index()`?' ) index_to_create = self.full_index_name logger.debug('Ensuring index %s', index_to_create) diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index c005233cc..40e1a7347 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -84,13 +84,13 @@

current: {{ strategy_info.status.strategy_id }}

- + - + {% for current_index_status in strategy_info.status.index_statuses %} @@ -146,7 +146,7 @@

prior: {{ prior_strategy_status.strategy_id }}

{% trans "subname" %}{% trans "index" %} {% trans "created" %} {% trans "is kept live" %} {% trans "is default for searching" %} {% trans "doc count" %} {% trans "links" %}{% trans "full name" %}{% trans "full index name" %}
- + diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index ca31edb48..b6ed5b656 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -59,15 +59,13 @@ def test_setup_initial(self, settings): with patch_index_strategies(_mock_index_strategys): run_sharectl('search', 'setup', '--initial') for mock_index_strategy in _mock_index_strategys: - mock_specific_index = mock_index_strategy.for_current_index.return_value - assert mock_specific_index.pls_setup.mock_calls == [mock.call(skip_backfill=True)] + assert mock_index_strategy.pls_setup.mock_calls == [mock.call(skip_backfill=True)] def test_setup_index(self): mock_index_strategy = mock.Mock() with mock.patch('share.bin.search.index_strategy.get_strategy', return_value=mock_index_strategy): run_sharectl('search', 'setup', 'foo') - mock_current_index = mock_index_strategy.for_current_index.return_value - assert mock_current_index.pls_setup.mock_calls == [mock.call(skip_backfill=False)] + assert mock_index_strategy.pls_setup.mock_calls == [mock.call(skip_backfill=False)] def test_daemon(self, settings): with mock.patch('share.bin.search.IndexerDaemonControl') as mock_daemon_control: diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py index fb12f9081..871256d44 100644 --- a/tests/share/search/__init__.py +++ b/tests/share/search/__init__.py @@ -1,4 +1,5 @@ import contextlib +import enum from typing import Iterable from unittest import mock @@ -7,11 +8,10 @@ @contextlib.contextmanager def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): - index_strategy.all_strategy_names.cache_clear() - with mock.patch.object( - index_strategy, - 'each_strategy', - return_value=strategies, - ): + with mock.patch.object(index_strategy, '_AvailableStrategies', new=enum.Enum( + '_AvailableStrategies', [ + (_strategy.strategy_name, _strategy) + for _strategy in strategies + ], + )): yield - index_strategy.all_strategy_names.cache_clear() diff --git a/tests/share/search/index_strategy/test_elastic8.py b/tests/share/search/index_strategy/test_elastic8.py index 6edcb30f9..20c68e67e 100644 --- a/tests/share/search/index_strategy/test_elastic8.py +++ b/tests/share/search/index_strategy/test_elastic8.py @@ -1,3 +1,4 @@ +import functools from unittest import mock import pytest @@ -26,6 +27,10 @@ def define_current_indexes(cls): ), } + @functools.cached_property + def es8_client(self): + return mock.Mock() + @property def supported_message_types(self): return { @@ -43,13 +48,7 @@ def build_elastic_actions(self, messages_chunk): class TestIndexStrategy: @pytest.fixture - def mock_es_client(self): - with mock.patch('share.search.index_strategy.elastic8.elasticsearch8') as es8_mockpackage: - es8_mockclient = es8_mockpackage.Elasticsearch.return_value - yield es8_mockclient - - @pytest.fixture - def fake_strategy(self, mock_es_client, settings): + def fake_strategy(self, settings): settings.ELASTICSEARCH8_URL = 'http://nowhere.example:12345/' strat = FakeElastic8IndexStrategy('fake_es8') strat.assert_strategy_is_current() @@ -57,31 +56,35 @@ def fake_strategy(self, mock_es_client, settings): @pytest.fixture def fake_specific_index(self, fake_strategy): - return fake_strategy.for_current_index() + return fake_strategy.get_index('') + + @pytest.fixture + def mock_es_client(self, fake_strategy): + return fake_strategy.es8_client def test_pls_create(self, fake_specific_index, mock_es_client): mock_es_client.indices.exists.return_value = False 
fake_specific_index.pls_create() mock_es_client.indices.exists.assert_called_once_with( - index=fake_specific_index.indexname, + index=fake_specific_index.full_index_name, ) mock_es_client.indices.create.assert_called_once_with( - index=fake_specific_index.indexname, - settings=fake_specific_index.index_strategy.index_settings(), - mappings=fake_specific_index.index_strategy.index_mappings(), + index=fake_specific_index.full_index_name, + mappings={'my-mappings': 'lol'}, + settings={'my-settings': 'lol'}, ) # already exists: mock_es_client.reset_mock() mock_es_client.indices.exists.return_value = True, fake_specific_index.pls_create() mock_es_client.indices.exists.assert_called_once_with( - index=fake_specific_index.indexname, + index=fake_specific_index.full_index_name, ) mock_es_client.indices.create.assert_not_called() def test_delete_index(self, fake_specific_index, mock_es_client): fake_specific_index.pls_delete() mock_es_client.indices.delete.assert_called_once_with( - index=fake_specific_index.indexname, + index=fake_specific_index.full_index_name, ignore=[400, 404], ) diff --git a/tests/share/search/test_index_backfill.py b/tests/share/search/test_index_backfill.py index b18e93a61..e3934de25 100644 --- a/tests/share/search/test_index_backfill.py +++ b/tests/share/search/test_index_backfill.py @@ -10,14 +10,14 @@ class TestIndexBackfillMethods: @pytest.fixture def fake_strategy(self): fake_strategy = mock.Mock() - fake_strategy.name = 'foo' + fake_strategy.strategy_name = 'foo' fake_strategy.CURRENT_STRATEGY_CHECKSUM = 'foo_bar' return fake_strategy @pytest.fixture def index_backfill(self, fake_strategy): return IndexBackfill.objects.create( - index_strategy_name=fake_strategy.name, + index_strategy_name=fake_strategy.strategy_name, ) def test_happypath(self, index_backfill: IndexBackfill, fake_strategy): From 7eaac3f7451ebfbfd0dad31e68189217e9a4b596 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 16:18:08 -0500 Subject: [PATCH 20/35] wip... 
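These tests now exercise real strategy instances (not just expected classes) and patch them in with the `patch_index_strategies` helper from tests/share/search/__init__.py, which swaps `index_strategy._AvailableStrategies` for a throwaway enum built from the given instances, so name-based lookup keeps working. A minimal, stdlib-only sketch of that enum-building pattern (`FakeStrategy` is an illustrative stand-in, not a class in this repo):

```python
import enum
from dataclasses import dataclass


@dataclass(frozen=True)
class FakeStrategy:
    strategy_name: str


def build_available_strategies(strategies):
    # mirrors the enum.Enum(...) call in tests/share/search/__init__.py:
    # one enum member per strategy, keyed by its strategy_name
    return enum.Enum('_AvailableStrategies', [
        (_strategy.strategy_name, _strategy)
        for _strategy in strategies
    ])


_strategies = [FakeStrategy('sharev2_elastic5'), FakeStrategy('trovesearch_denorm')]
_AvailableStrategies = build_available_strategies(_strategies)
assert _AvailableStrategies['trovesearch_denorm'].value is _strategies[1]
```

The final assert mirrors what the updated test_get_index_strategy expects: `get_strategy(name)` hands back the same strategy that was patched in.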
--- .../index_strategy/test_strategy_selection.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index afc458814..b4d8a1045 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -13,38 +13,39 @@ parse_strategy_name, ) from share.search.index_strategy._indexnames import combine_indexname_parts +from tests.share.search import patch_index_strategies @pytest.fixture -def expected_strategy_classes(): - return { - 'sharev2_elastic5': sharev2_elastic5.Sharev2Elastic5IndexStrategy, - 'sharev2_elastic8': sharev2_elastic8.Sharev2Elastic8IndexStrategy, - 'trove_indexcard_flats': trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy, - 'trovesearch_denorm': trovesearch_denorm.TrovesearchDenormIndexStrategy, - } +def patched_strategies(mock_elastic_clients): + _strategies = [ + sharev2_elastic5.Sharev2Elastic5IndexStrategy('sharev2_elastic5'), + sharev2_elastic8.Sharev2Elastic8IndexStrategy('sharev2_elastic8'), + trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats'), + trovesearch_denorm.TrovesearchDenormIndexStrategy('trovesearch_denorm'), + ] + with patch_index_strategies(_strategies): + yield _strategies class TestBaseIndexStrategy: - def test_get_index_strategy(self, mock_elastic_clients, expected_strategy_classes): - for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - index_strategy = get_strategy(strategy_name) - assert isinstance(index_strategy, expected_strategy_class) + def test_get_index_strategy(self, patched_strategies): + for expected_strategy in patched_strategies: + gotten_strategy = get_strategy(expected_strategy.strategy_name) + assert gotten_strategy == expected_strategy - def test_all_index_strategies(self, mock_elastic_clients, expected_strategy_classes): + def test_all_index_strategies(self, patched_strategies): all_strategys = tuple(each_strategy()) - assert len(all_strategys) == len(expected_strategy_classes) - strategy_names = {index_strategy.strategy_name for index_strategy in all_strategys} - assert strategy_names == set(expected_strategy_classes.keys()) + assert len(all_strategys) == len(patched_strategies) + gotten_names = {index_strategy.strategy_name for index_strategy in all_strategys} + assert gotten_names == {strategy.strategy_name for strategy in patched_strategies} for index_strategy in all_strategys: - strategy_class = expected_strategy_classes[index_strategy.strategy_name] - assert isinstance(index_strategy, strategy_class) assert issubclass(index_strategy.SpecificIndex, IndexStrategy.SpecificIndex) assert index_strategy.SpecificIndex is not IndexStrategy.SpecificIndex @pytest.mark.django_db - def test_get_by_request(self, mock_elastic_clients): - for _strategy in each_strategy(): + def test_get_by_request(self, patched_strategies): + for _strategy in patched_strategies: good_requests = [ _strategy.strategy_name, combine_indexname_parts(_strategy.strategy_name, _strategy.strategy_check), From d194be806341ef127ccd36a67196217b302b59bb Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 16:29:00 -0500 Subject: [PATCH 21/35] wip..... 
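The functional change here is ordering: the strategy is torn down right after it is constructed in setUp (before the messenger/daemon wiring), so leftovers from an earlier run cannot leak into `_assert_setup_happypath`, and the happy-path check creates every subnamed index before asserting on it. A rough, self-contained toy of that shape (`ToyStrategy`, `ToyIndex`, and the subnames are made up for illustration):

```python
class ToyIndex:
    def __init__(self, subname):
        self.subname = subname
        self.exists = False

    def pls_create(self):
        self.exists = True

    def pls_check_exists(self):
        return self.exists


class ToyStrategy:
    def __init__(self):
        # made-up subnames; a real strategy defines its own
        self._indexes = {_subname: ToyIndex(_subname) for _subname in ('cards', 'values')}

    def each_subnamed_index(self):
        yield from self._indexes.values()

    def pls_teardown(self):
        for _index in self._indexes.values():
            _index.exists = False


_strategy = ToyStrategy()
_strategy.pls_teardown()                      # first: clear anything left over
for _index in _strategy.each_subnamed_index():
    _index.pls_create()                       # then: create each subnamed index
assert all(_ix.pls_check_exists() for _ix in _strategy.each_subnamed_index())
```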
--- tests/share/search/index_strategy/_with_real_services.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 4c2c3f0c1..e49e7d1ca 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -24,6 +24,7 @@ def setUp(self): super().setUp() self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) self.index_strategy = self.get_index_strategy() + self.index_strategy.pls_teardown() # in case it already exists def _fake_get_index_strategy(name): if self.index_strategy.strategy_name == name: @@ -38,7 +39,6 @@ def _fake_get_index_strategy(name): celery_app=celery_app, index_strategys=[self.index_strategy], ) - self.index_strategy.pls_teardown() # in case it already exists self._assert_setup_happypath() def tearDown(self): @@ -101,8 +101,9 @@ def _assert_setup_happypath(self): assert not index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count - # create index + for _index in self.index_strategy.each_subnamed_index(): _index.pls_create() + # create index assert _index.pls_check_exists() index_status = _index.pls_get_status() assert index_status.creation_date From 81581e312236ad8af880271fe122f19a683eebaa Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 16:41:22 -0500 Subject: [PATCH 22/35] wip...... --- .../search/index_strategy/_with_real_services.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index e49e7d1ca..d8e9fc866 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -8,6 +8,7 @@ from share.search.daemon import IndexerDaemonControl from share.search.index_messenger import IndexMessenger from share.search import index_strategy +from tests.share.search import patch_index_strategies # base class for testing IndexStrategy subclasses with actual elasticsearch. @@ -25,16 +26,7 @@ def setUp(self): self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) self.index_strategy = self.get_index_strategy() self.index_strategy.pls_teardown() # in case it already exists - - def _fake_get_index_strategy(name): - if self.index_strategy.strategy_name == name: - return self.index_strategy - raise ValueError(f'unknown index strategy in test: {name}') - - self.enterContext(mock.patch( - 'share.search.index_strategy.get_strategy', - new=_fake_get_index_strategy, - )) + self.enterContext(patch_index_strategies([self.index_strategy])) self.index_messenger = IndexMessenger( celery_app=celery_app, index_strategys=[self.index_strategy], From 63d7c59f73a779ddbaf6b30f61fc8ae54ab2f86b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 16:47:42 -0500 Subject: [PATCH 23/35] wip....... 
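The conftest change is about cached clients: the elasticsearch packages are mocked while strategies get constructed, and since each strategy caches its client getter, a mock-backed client would presumably outlive the fixture unless those caches are cleared on the way out (hence the `cache_clear()` calls added after the `yield`). With that in place, test_admin_workflow no longer needs its own `mock.patch` of elasticsearch8. A small sketch of the failure mode, assuming a functools-cached client factory (the names below are illustrative, not the real getters):

```python
import functools
from unittest import mock


class RealClient:
    def __init__(self, url):
        self.url = url


@functools.cache
def get_client(url):            # stand-in for a cached _get_elastic*_client
    return RealClient(url)


with mock.patch(f'{__name__}.RealClient'):
    get_client('http://nowhere.example:9200')   # caches a mock-backed client

get_client.cache_clear()        # without this, later callers get the mock back
assert isinstance(get_client('http://nowhere.example:9200'), RealClient)
```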
--- tests/share/search/conftest.py | 4 ++++ tests/share/search/test_admin_workflow.py | 19 ++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/share/search/conftest.py b/tests/share/search/conftest.py index b87757372..3cba6ba08 100644 --- a/tests/share/search/conftest.py +++ b/tests/share/search/conftest.py @@ -11,3 +11,7 @@ def mock_elastic_clients(settings): with mock.patch('share.search.index_strategy.sharev2_elastic5.elasticsearch5'): with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): yield + from share.search.index_strategy.elastic8 import Elastic8IndexStrategy + Elastic8IndexStrategy._get_elastic8_client.cache_clear() + from share.search.index_strategy.sharev2_elastic5 import Sharev2Elastic5IndexStrategy + Sharev2Elastic5IndexStrategy._get_elastic5_client.cache_clear() diff --git a/tests/share/search/test_admin_workflow.py b/tests/share/search/test_admin_workflow.py index fc0ede074..3b9a3ab26 100644 --- a/tests/share/search/test_admin_workflow.py +++ b/tests/share/search/test_admin_workflow.py @@ -1,5 +1,3 @@ -from unittest import mock - from django.test.client import Client import pytest @@ -13,12 +11,11 @@ def test_admin_search_indexes_view(mock_elastic_clients): ShareUser.objects.create_superuser(**credentials) client = Client() client.login(**credentials) - with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): - resp = client.get('/admin/search-indexes') - for strategy_name in index_strategy.all_strategy_names(): - _index_strategy = index_strategy.get_strategy(strategy_name) - expected_header = f'

' - assert expected_header.encode() in resp.content - for _index in _index_strategy.each_subnamed_index(): - expected_row = f'

' - assert expected_row.encode() in resp.content + resp = client.get('/admin/search-indexes') + for strategy_name in index_strategy.all_strategy_names(): + _index_strategy = index_strategy.get_strategy(strategy_name) + expected_header = f'

' + assert expected_header.encode() in resp.content + for _index in _index_strategy.each_subnamed_index(): + expected_row = f'

' + assert expected_row.encode() in resp.content From 4e6ecd388feeb110daa466f13274da95a76e5c17 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 16:51:27 -0500 Subject: [PATCH 24/35] wip......... --- tests/share/search/test_daemon.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/share/search/test_daemon.py b/tests/share/search/test_daemon.py index 126016010..6bca002af 100644 --- a/tests/share/search/test_daemon.py +++ b/tests/share/search/test_daemon.py @@ -36,7 +36,7 @@ def wait_for(event: threading.Event): class FakeIndexStrategyForSetupOnly: # for tests that don't need any message-handling - name = 'fakefake' + strategy_name = 'fakefake' supported_message_types = { messages.MessageType.INDEX_SUID, } @@ -45,7 +45,7 @@ class FakeIndexStrategyForSetupOnly: class FakeIndexStrategyWithBlockingEvents: - name = 'fakefake-with-events' + strategy_name = 'fakefake-with-events' supported_message_types = { messages.MessageType.INDEX_SUID, } @@ -118,7 +118,7 @@ class UnexpectedError(Exception): pass class FakeIndexStrategyWithUnexpectedError: - name = 'fakefake_with_error' + strategy_name = 'fakefake_with_error' supported_message_types = {messages.MessageType.INDEX_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' @@ -142,15 +142,15 @@ def pls_handle_messages_chunk(self, messages_chunk): def test_noncurrent_backfill(self): class FakeIndexStrategyWithNoncurrentBackfill: - name = 'fakefake-with-backfill' - current_indexname = 'not-what-you-expected' + CURRENT_STRATEGY_CHECKSUM = 'not-what-you-expected' + strategy_name = 'fakefake-with-backfill' supported_message_types = {messages.MessageType.BACKFILL_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' def get_or_create_backfill(self): class FakeIndexBackfill: - specific_indexname = 'what-you-expected' + strategy_checksum = 'what-you-expected' return FakeIndexBackfill() with _daemon_running( @@ -165,7 +165,7 @@ class FakeIndexBackfill: def test_message_error(self): class FakeIndexStrategyWithMessageError: - name = 'fakefake_with_msg_error' + strategy_name = 'fakefake_with_msg_error' supported_message_types = {messages.MessageType.INDEX_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' @@ -197,7 +197,7 @@ def pls_handle_messages_chunk(self, messages_chunk): @mock.patch('share.search.daemon._backoff_wait', wraps=_backoff_wait) def test_backoff(self, mock_backoff_wait): class FakeIndexStrategyWith429: - name = 'fakefake_with_429' + strategy_name = 'fakefake_with_429' supported_message_types = {messages.MessageType.INDEX_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' From 722300707eaa3fb2379efaf50d42ffb6b90d0bb2 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Jan 2025 17:03:52 -0500 Subject: [PATCH 25/35] wip.... 
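`sharectl search purge` now takes whole-strategy names: each argument is resolved with `index_strategy.parse_strategy_name(...)` and the resulting strategy is asked to tear itself down, rather than looking up a single specific index and deleting it. Similarly, `search setup` is expected to call `pls_setup()` with no arguments (the skip_backfill flag drops out of these expectations). Roughly the behaviour the updated test pins down, as a sketch rather than the real share/bin/search.py code:

```python
from unittest import mock


def purge(parse_strategy_name, *strategynames):
    # resolve each requested name, then tear down that whole strategy
    for _strategyname in strategynames:
        parse_strategy_name(_strategyname).pls_teardown()


_strategies = {'one': mock.Mock(), 'another': mock.Mock()}
purge(_strategies.__getitem__, 'one', 'another')
for _mock_strategy in _strategies.values():
    _mock_strategy.pls_teardown.assert_called_once_with()
```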
--- tests/share/bin/test_sharectl.py | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index b6ed5b656..a8e5c6325 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -28,44 +28,44 @@ def test_sharectl_version(): class TestSharectlSearch: - @pytest.mark.parametrize('indexnames', [ + @pytest.mark.parametrize('strategynames', [ ['one'], ['another', 'makes', 'two'], ]) - def test_purge(self, indexnames): - mock_specific_indexes = { - indexname: mock.Mock() - for indexname in indexnames + def test_purge(self, strategynames): + mock_strategies = { + strategyname: mock.Mock() + for strategyname in strategynames } - def _get_specific_index(indexname): - return mock_specific_indexes[indexname] + def _fake_parse_strategy_name(strategyname): + return mock_strategies[strategyname] - with mock.patch('share.bin.search.index_strategy.get_specific_index', wraps=_get_specific_index) as mock_get_specific: - run_sharectl('search', 'purge', *indexnames) - assert mock_get_specific.mock_calls == [ - mock.call(indexname) - for indexname in mock_specific_indexes.keys() + with mock.patch('share.bin.search.index_strategy.parse_strategy_name', wraps=_fake_parse_strategy_name) as mock_get_strategy: + run_sharectl('search', 'purge', *strategynames) + assert mock_get_strategy.mock_calls == [ + mock.call(strategyname) + for strategyname in mock_strategies.keys() ] - for mock_specific_index in mock_specific_indexes.values(): - mock_specific_index.pls_delete.assert_called_once_with() + for mock_strategy in mock_strategies.values(): + mock_strategy.pls_teardown.assert_called_once_with() def test_setup_initial(self, settings): _expected_indexes = ['baz', 'bar', 'foo'] _mock_index_strategys = [ - mock.Mock(name=_name) + mock.Mock(strategy_name=_name) for _name in _expected_indexes ] with patch_index_strategies(_mock_index_strategys): run_sharectl('search', 'setup', '--initial') for mock_index_strategy in _mock_index_strategys: - assert mock_index_strategy.pls_setup.mock_calls == [mock.call(skip_backfill=True)] + assert mock_index_strategy.pls_setup.mock_calls == [mock.call()] def test_setup_index(self): mock_index_strategy = mock.Mock() with mock.patch('share.bin.search.index_strategy.get_strategy', return_value=mock_index_strategy): run_sharectl('search', 'setup', 'foo') - assert mock_index_strategy.pls_setup.mock_calls == [mock.call(skip_backfill=False)] + assert mock_index_strategy.pls_setup.mock_calls == [mock.call()] def test_daemon(self, settings): with mock.patch('share.bin.search.IndexerDaemonControl') as mock_daemon_control: From b8866530db1e17e1e37a099901302a789a2fedf6 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Jan 2025 10:30:01 -0500 Subject: [PATCH 26/35] wip... (fixes...) 
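The fixes here center on giving the indexcard view the same parameter handling as the search views: a new `IndexcardParams`, a `TROVE.Indexcard` entry (empty by default) in `DEFAULT_INCLUDES_BY_TYPE` with a `.get(..., frozenset())` fallback, and a shared `ask_gathering_from_params` helper in trove/views/_gather_ask.py that does the two-phase ask: included relations first, then per-type attribute paths for every focus the first phase pulled into the cache. A self-contained toy of that two-phase shape (`ToyFocus` and `ToyGathering` stand in for the primitive_metadata.gather types; the real helper iterates `gathering.cache.focus_set`):

```python
class ToyFocus:
    def __init__(self, *type_iris):
        self.type_iris = frozenset(type_iris)


class ToyGathering:
    def __init__(self):
        self.focus_cache = set()
        self.asked = []

    def ask(self, paths, focus):
        self.asked.append((tuple(paths), focus))
        self.focus_cache.add(focus)   # pretend related foci land in the cache


def ask_from_params(gathering, included_relations, attrpaths_by_type, start_focus):
    # phase 1: fill the cache with included related resources
    gathering.ask(included_relations, focus=start_focus)
    # phase 2: requested attributes on the focus and anything related
    for _focus in list(gathering.focus_cache):
        for _focustype in _focus.type_iris:
            _attrpaths = attrpaths_by_type.get(_focustype)
            if _attrpaths:
                gathering.ask(_attrpaths, focus=_focus)


_card = ToyFocus('trove:Indexcard')
_gathering = ToyGathering()
ask_from_params(
    _gathering,
    included_relations=[],                                        # e.g. from ?include=
    attrpaths_by_type={'trove:Indexcard': [('dcterms:title',)]},  # e.g. from ?fields[Indexcard]=
    start_focus=_card,
)
assert len(_gathering.asked) == 2   # one ask per phase in this tiny example
```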
--- trove/trovesearch/search_params.py | 8 ++- trove/trovesearch/trovesearch_gathering.py | 58 +++++++++++----------- trove/views/_gather_ask.py | 21 ++++++++ trove/views/indexcard.py | 29 ++++++----- trove/views/search.py | 16 +----- trove/views/vocab.py | 4 +- 6 files changed, 76 insertions(+), 60 deletions(-) create mode 100644 trove/views/_gather_ask.py diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index efcb2b0d5..473629cc6 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -62,6 +62,7 @@ DEFAULT_PROPERTYPATH_SET: PropertypathSet = frozenset([ONE_GLOB_PROPERTYPATH]) DEFAULT_INCLUDES_BY_TYPE: collections.abc.Mapping[str, frozenset[Propertypath]] = freeze({ + TROVE.Indexcard: set(), TROVE.Cardsearch: { (TROVE.searchResultPage,), (TROVE.relatedPropertyList,), @@ -179,7 +180,7 @@ def _gather_include(cls, queryparams: QueryparamDict) -> PropertypathSet: _parse_propertypath_set(_include_value) for _, _include_value in _include_params )) - return DEFAULT_INCLUDES_BY_TYPE[cls.static_focus_type] + return DEFAULT_INCLUDES_BY_TYPE.get(cls.static_focus_type, frozenset()) @classmethod def _gather_attrpaths(cls, queryparams: QueryparamDict) -> collections.abc.Mapping[ @@ -536,6 +537,11 @@ def as_queryparam(self) -> tuple[str, str]: return (_name, _value) +@dataclasses.dataclass(frozen=True) +class IndexcardParams(BaseTroveParams): + static_focus_type = TROVE.Indexcard + + @dataclasses.dataclass(frozen=True) class CardsearchParams(BaseTroveParams): cardsearch_textsegment_set: frozenset[Textsegment] diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index c2a4159f9..b69057f99 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -154,7 +154,7 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): _current_handle: CardsearchHandle | None = focus.search_handle while _current_handle is not None: _result_page = [] - _cards_by_iri, _card_contents_by_iri = _load_cards_and_contents( + _card_foci = _load_cards_and_contents( (_result.card_iri for _result in _current_handle.search_result_page), deriver_iri=deriver_iri, ) @@ -170,13 +170,13 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): ) _result_page.append(frozenset(( (RDF.type, TROVE.SearchResult), - (TROVE.indexCard, IndexcardFocus.new( - iris=_result.card_iri, - indexcard=_cards_by_iri[_result.card_iri], - resourceMetadata=_card_contents_by_iri.get(_result.card_iri), - )), + (TROVE.indexCard, _result.card_iri), *_text_evidence_twoples, ))) + # hack around (current) limitations of primitive_metadata.gather: + # yield a redundant triple to make this IndexcardFocus gatherable + _card_focus = _card_foci[_result.card_iri] + yield (_card_focus, RDF.type, IndexcardFocus.TYPE_IRI) yield (TROVE.searchResultPage, sequence(_result_page)) _current_handle = _current_handle.get_next_streaming_handle() @@ -318,13 +318,13 @@ def gather_card_contents(focus: IndexcardFocus, *, deriver_iri, **kwargs): if focus.resourceMetadata is not None: yield (TROVE.resourceMetadata, focus.resourceMetadata) else: - ... 
+ _iri = focus.single_iri() + _loaded_foci = _load_cards_and_contents([_iri], deriver_iri) + _loaded_metadata = _loaded_foci[_iri].resourceMetadata + yield (TROVE.resourceMetadata, _loaded_metadata) -def _load_cards_and_contents(card_iris, deriver_iri) -> tuple[ - dict[str, trove_db.Indexcard], # cards by iri - dict[str, Any], # card contents by iri -]: +def _load_cards_and_contents(card_iris, deriver_iri) -> dict[str, IndexcardFocus]: return ( _load_cards_and_extracted_rdf_contents(card_iris) if deriver_iri is None @@ -332,10 +332,7 @@ def _load_cards_and_contents(card_iris, deriver_iri) -> tuple[ ) -def _load_cards_and_extracted_rdf_contents(card_iris) -> tuple[ - dict[str, trove_db.Indexcard], - dict[str, rdf.QuotedGraph], -]: +def _load_cards_and_extracted_rdf_contents(card_iris) -> dict[str, IndexcardFocus]: _card_namespace = trove_indexcard_namespace() _indexcard_uuids = { iri_minus_namespace(_card_iri, namespace=_card_namespace) @@ -347,24 +344,23 @@ def _load_cards_and_extracted_rdf_contents(card_iris) -> tuple[ .select_related('indexcard') .prefetch_related('indexcard__focus_identifier_set') ) - _cards_by_iri: dict[str, trove_db.Indexcard] = {} - _card_contents_by_iri: dict[str, rdf.QuotedGraph] = {} + _card_foci: dict[str, IndexcardFocus] = {} for _indexcard_rdf in _indexcard_rdf_qs: _card = _indexcard_rdf.indexcard _card_iri = _card.get_iri() - _cards_by_iri[_card_iri] = _card _quoted_graph = _indexcard_rdf.as_quoted_graph() _quoted_graph.add( (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _card_iri), ) - _card_contents_by_iri[_card_iri] = _quoted_graph - return _cards_by_iri, _card_contents_by_iri + _card_foci[_card_iri] = IndexcardFocus.new( + iris=_card_iri, + indexcard=_card, + resourceMetadata=_quoted_graph, + ) + return _card_foci -def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> tuple[ - dict[str, trove_db.Indexcard], - dict[str, rdf.Literal], -]: +def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> dict[str, IndexcardFocus]: _card_namespace = trove_indexcard_namespace() _indexcard_uuids = { iri_minus_namespace(_card_iri, namespace=_card_namespace) @@ -383,13 +379,15 @@ def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> tuple[ .select_related('upriver_indexcard') .prefetch_related('upriver_indexcard__focus_identifier_set') ) - _cards_by_iri: dict[str, trove_db.Indexcard] = {} - _card_contents_by_iri: dict[str, rdf.Literal] = {} + _card_foci: dict[str, IndexcardFocus] = {} for _derived in _derived_indexcard_qs: - _indexcard_iri = _derived.upriver_indexcard.get_iri() - _cards_by_iri[_indexcard_iri] = _derived.upriver_indexcard - _card_contents_by_iri[_indexcard_iri] = _derived.as_rdf_literal() - return _cards_by_iri, _card_contents_by_iri + _card_iri = _derived.upriver_indexcard.get_iri() + _card_foci[_card_iri] = IndexcardFocus.new( + iris=_card_iri, + indexcard=_derived.upriver_indexcard, + resourceMetadata=_derived.as_rdf_literal(), + ) + return _card_foci ### diff --git a/trove/views/_gather_ask.py b/trove/views/_gather_ask.py new file mode 100644 index 000000000..63bae1098 --- /dev/null +++ b/trove/views/_gather_ask.py @@ -0,0 +1,21 @@ +from primitive_metadata import gather + +from trove.trovesearch.search_params import BaseTroveParams + + +def ask_gathering_from_params( + gathering: gather.Gathering, + params: BaseTroveParams, + start_focus: gather.Focus, +): + # fill the gathering's cache with included related resources... 
+ gathering.ask(params.included_relations, focus=start_focus) + # ...and add requested attributes on the focus and related resources + for _focus in gathering.cache.focus_set: + for _focustype in _focus.type_iris: + try: + _attrpaths = params.attrpaths_by_type[_focustype] + except KeyError: + pass # no attribute fields for this type + else: + gathering.ask(_attrpaths, focus=_focus) diff --git a/trove/views/indexcard.py b/trove/views/indexcard.py index a685428d8..208a15f85 100644 --- a/trove/views/indexcard.py +++ b/trove/views/indexcard.py @@ -1,14 +1,18 @@ from django.views import View -from primitive_metadata import gather from trove import exceptions as trove_exceptions +from trove import models as trove_db from trove.render import ( DEFAULT_RENDERER_TYPE, get_renderer_type, ) -from trove.trovesearch.trovesearch_gathering import trovesearch_by_indexstrategy -from trove.vocab.namespaces import TROVE +from trove.trovesearch.search_params import IndexcardParams +from trove.trovesearch.trovesearch_gathering import ( + trovesearch_by_indexstrategy, + IndexcardFocus, +) from trove.vocab.trove import trove_indexcard_iri +from ._gather_ask import ask_gathering_from_params from ._responder import ( make_http_error_response, make_http_response, @@ -19,18 +23,17 @@ class IndexcardView(View): def get(self, request, indexcard_uuid): try: _renderer_type = get_renderer_type(request) - _search_gathering = trovesearch_by_indexstrategy.new_gathering({ - # TODO (gather): allow omitting kwargs that go unused - 'search_params': None, - 'specific_index': None, + _gathering = trovesearch_by_indexstrategy.new_gathering({ 'deriver_iri': _renderer_type.INDEXCARD_DERIVER_IRI, }) _indexcard_iri = trove_indexcard_iri(indexcard_uuid) - _search_gathering.ask( - {}, # TODO: build from `include`/`fields` - focus=gather.Focus.new(_indexcard_iri, TROVE.Indexcard), + _params = IndexcardParams.from_querystring(request.META['QUERY_STRING']) + _focus = IndexcardFocus.new( + iris=_indexcard_iri, + indexcard=trove_db.Indexcard.objects.get_for_iri(_indexcard_iri), ) - _renderer = _renderer_type(_indexcard_iri, _search_gathering.leaf_a_record()) + ask_gathering_from_params(_gathering, _params, _focus) + _renderer = _renderer_type(_focus, _gathering) return make_http_response( content_rendering=_renderer.render_document(), http_request=request, @@ -38,10 +41,10 @@ def get(self, request, indexcard_uuid): except trove_exceptions.CannotRenderMediatype as _error: return make_http_error_response( error=_error, - renderer=DEFAULT_RENDERER_TYPE(_indexcard_iri), + renderer_type=DEFAULT_RENDERER_TYPE, ) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, - renderer=_renderer_type(_indexcard_iri), + renderer_type=_renderer_type, ) diff --git a/trove/views/search.py b/trove/views/search.py index d620b668a..d164b36e4 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -23,6 +23,7 @@ DEFAULT_RENDERER_TYPE, get_renderer_type, ) +from ._gather_ask import ask_gathering_from_params from ._responder import ( make_http_error_response, make_http_response, @@ -59,7 +60,7 @@ def get(self, request): search_handle=self.get_search_handle(_strategy, _search_params), ) if _renderer_type.PASSIVE_RENDER: - self._fill_gathering(_search_gathering, _search_params, _focus) + ask_gathering_from_params(_search_gathering, _search_params, _focus) # take gathered data into a response _renderer = _renderer_type(_focus, _search_gathering) return make_http_response( @@ -83,19 +84,6 @@ def _start_gathering(self, 
renderer_type) -> gather.Gathering: 'deriver_iri': renderer_type.INDEXCARD_DERIVER_IRI, }) - def _fill_gathering(self, search_gathering, search_params, start_focus): - # fill the gathering's cache with included related resources... - search_gathering.ask(search_params.included_relations, focus=start_focus) - # ...and add requested attributes on the focus and related resources - for _focus in search_gathering.cache.focus_set: - for _focustype in _focus.type_iris: - try: - _attrpaths = search_params.attrpaths_by_type[_focustype] - except KeyError: - pass # no attribute fields for this type - else: - search_gathering.ask(_attrpaths, focus=_focus) - def get_search_handle(self, strategy, search_params) -> BasicSearchHandle: return self._get_wrapped_handler(strategy)(search_params) diff --git a/trove/views/vocab.py b/trove/views/vocab.py index dcab1c373..62982f34e 100644 --- a/trove/views/vocab.py +++ b/trove/views/vocab.py @@ -31,10 +31,10 @@ def get(self, request, vocab_term): except trove_exceptions.CannotRenderMediatype as _error: return make_http_error_response( error=_error, - renderer=DEFAULT_RENDERER_TYPE(_iri), + renderer_type=DEFAULT_RENDERER_TYPE, ) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, - renderer=_renderer_type(_iri), + renderer_type=_renderer_type, ) From 4bbcfbf6e6f9d802c578caa082d6859df238fff5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Jan 2025 10:44:31 -0500 Subject: [PATCH 27/35] remove unnecessary migration --- ...ndexname_indexbackfill_strategy_checksum.py | 18 ------------------ share/models/index_backfill.py | 12 +++++++++++- 2 files changed, 11 insertions(+), 19 deletions(-) delete mode 100644 share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py diff --git a/share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py b/share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py deleted file mode 100644 index 37867fe2b..000000000 --- a/share/migrations/0077_rename_specific_indexname_indexbackfill_strategy_checksum.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 3.2.25 on 2025-01-16 20:32 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('share', '0076_rawdatum_share_rawdatum_expiration_idx'), - ] - - operations = [ - migrations.RenameField( - model_name='indexbackfill', - old_name='specific_indexname', - new_name='strategy_checksum', - ), - ] diff --git a/share/models/index_backfill.py b/share/models/index_backfill.py index bcbc63b08..47dff03c7 100644 --- a/share/models/index_backfill.py +++ b/share/models/index_backfill.py @@ -47,7 +47,7 @@ class IndexBackfill(models.Model): ) backfill_status = models.TextField(choices=BACKFILL_STATUS_CHOICES, default=INITIAL) index_strategy_name = models.TextField(unique=True) - strategy_checksum = models.TextField() + specific_indexname = models.TextField() error_type = models.TextField(blank=True) error_message = models.TextField(blank=True) error_context = models.TextField(blank=True) @@ -68,6 +68,16 @@ def __repr__(self): def __str__(self): return repr(self) + @property + def strategy_checksum(self): + # back-compat alias for specific_indexname (may be removed if that's renamed via migration) + return self.specific_indexname # for backcompat + + @strategy_checksum.setter + def strategy_checksum(self, value): + # back-compat alias for specific_indexname (may be removed if that's renamed via migration) + self.specific_indexname 
= value + @contextlib.contextmanager def mutex(self): with IndexBackfill.objects.get_with_mutex(pk=self.pk) as index_backfill: From a9a9b7000b09cf3d9a9fb85dcb7092b5605c0ca2 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Jan 2025 11:55:38 -0500 Subject: [PATCH 28/35] wip... (fix valuesearch) --- trove/trovesearch/trovesearch_gathering.py | 32 +++++++++++++--------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index b69057f99..560e3b375 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -27,6 +27,7 @@ ValuesearchHandle, ValuesearchResult, ) +from trove.util.iris import get_sufficiently_unique_iri from trove.vocab.namespaces import RDF, FOAF, DCTERMS, RDFS, DCAT, TROVE from trove.vocab.jsonapi import ( JSONAPI_LINK_OBJECT, @@ -77,8 +78,9 @@ class _TypedFocus(gather.Focus): ADDITIONAL_TYPE_IRIS: ClassVar[tuple[str, ...]] = () # (optional on subclasses) @classmethod - def new(cls, *, type_iris=(), **kwargs): + def new(cls, *args, type_iris=(), **kwargs): return super().new( + *args, # add type_iri to new Focus instance type_iris={ cls.TYPE_IRI, @@ -241,19 +243,23 @@ def gather_valuesearch_page(focus: ValuesearchFocus, **kwargs): ) else: _value_indexcards = [] + _cards_by_suffuniq_iri = { + _identifier.sufficiently_unique_iri: _indexcard + for _indexcard in _value_indexcards + for _identifier in _indexcard.focus_identifier_set.all() + } for _result in focus.search_handle.search_result_page or (): - _indexcard_obj: Any = None - if _result.value_iri in _value_iris: - for _indexcard in _value_indexcards: - if any( - _identifier.equivalent_to_iri(_result.value_iri) - for _identifier in _indexcard.focus_identifier_set.all() - ): - _indexcard_obj = IndexcardFocus.new( - iris=_indexcard.get_iri(), - indexcard=_indexcard, - ) - break # found the indexcard + _indexcard_obj = None + if _result.value_iri is not None: + _indexcard = _cards_by_suffuniq_iri.get( + get_sufficiently_unique_iri(_result.value_iri), + ) + if _indexcard is not None: + _indexcard_obj = _indexcard.get_iri() + # hack around (current) limitations of primitive_metadata.gather: + # yield a redundant triple to make this IndexcardFocus gatherable + _card_focus = IndexcardFocus.new(_indexcard_obj, indexcard=_indexcard) + yield (_card_focus, RDF.type, IndexcardFocus.TYPE_IRI) if _indexcard_obj is None: # no actual indexcard; put what we know in a blanknode-indexcard _indexcard_obj = _valuesearch_result_as_indexcard_blanknode(_result) From cd36203c4a6ac64e9d0c35758f4f0ffd7daeb360 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Jan 2025 12:45:12 -0500 Subject: [PATCH 29/35] wip..... 
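Two small things in the gathering module: the valuesearch page gatherer now receives `deriver_iri` and uses it to filter derived indexcards (instead of the hard-coded `TROVE['derive/osfmap_json']`), and the primary-topic gatherer registers `FOAF.primaryTopic` and `TROVE.focusIdentifier` as two separate predicate arguments rather than one tuple. Assuming the gatherer decorator collects predicate IRIs as positional `*args` (which is what this change suggests), a toy illustration of the difference (the decorator below is made up, not primitive_metadata's):

```python
def gatherer(*predicate_iris, focustype_iris=()):
    # toy decorator: just records what got registered
    def _wrap(fn):
        fn.registered_predicates = predicate_iris
        return fn
    return _wrap


@gatherer(('foaf:primaryTopic', 'trove:focusIdentifier'))   # old: one tuple argument
def _old_gatherer(focus):
    ...


@gatherer('foaf:primaryTopic', 'trove:focusIdentifier')     # new: two predicates
def _new_gatherer(focus):
    ...


assert _old_gatherer.registered_predicates == (('foaf:primaryTopic', 'trove:focusIdentifier'),)
assert _new_gatherer.registered_predicates == ('foaf:primaryTopic', 'trove:focusIdentifier')
```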
--- trove/trovesearch/trovesearch_gathering.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 560e3b375..8d0364bdb 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -218,7 +218,7 @@ def gather_cardsearch_filter(focus, **kwargs): TROVE.searchResultPage, focustype_iris={TROVE.Valuesearch}, ) -def gather_valuesearch_page(focus: ValuesearchFocus, **kwargs): +def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): _result_page = [] _value_iris = { _result.value_iri @@ -235,8 +235,7 @@ def gather_valuesearch_page(focus: ValuesearchFocus, **kwargs): ), derived_indexcard_set__deriver_identifier__in=( trove_db.ResourceIdentifier.objects - .queryset_for_iri(TROVE['derive/osfmap_json']) - # TODO: choose deriver by queryparam/gatherer-kwarg + .queryset_for_iri(deriver_iri) ), ) .prefetch_related('focus_identifier_set') @@ -306,7 +305,8 @@ def gather_card_modified(focus: IndexcardFocus, **kwargs): @trovesearch_by_indexstrategy.gatherer( - (FOAF.primaryTopic, TROVE.focusIdentifier), + FOAF.primaryTopic, + TROVE.focusIdentifier, focustype_iris={TROVE.Indexcard}, ) def gather_primary_topic(focus: IndexcardFocus, **kwargs): From 332434a00c9499e1f7b8de5020e95d2c7f9e0f8f Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 23 Jan 2025 09:58:44 -0500 Subject: [PATCH 30/35] remove temp TODO --- _TODO_multindex.txt | 54 --------------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 _TODO_multindex.txt diff --git a/_TODO_multindex.txt b/_TODO_multindex.txt deleted file mode 100644 index 7df6fadae..000000000 --- a/_TODO_multindex.txt +++ /dev/null @@ -1,54 +0,0 @@ - -IndexStrategy revamp plan/log: - -- update existing classes to dataclasses - - IndexStrategy (each instance represents a current or past version of the strategy, identified by subname) - - strategy_name (rename existing `name` attr for disambig) - - strategy_check (new; default CURRENT_STRATEGY_CHECKSUM but may be parsed from or index name or `indexStrategy` query param) - - IndexStrategy.SpecificIndex - - index_strategy (existing) - - subname (new; unique within a strategy) - - (base SpecificIndex now mainly inward, focused on constructing index names and checking index status) - -- move search methods from IndexStrategy.SpecificIndex to IndexStrategy - -- consolidate parsing names from `indexStrategy` queryparam and from elastic state - -- remove uniindex methods from IndexStrategy (and friends) - - each_specific_index - - for_specific_index - - for_current_index - - SpecificIndex.pls_setup - - SpecificIndex.pls_check_exists - - SpecificIndex.pls_handle_cardsearch - - SpecificIndex.pls_handle_valuesearch - - SpecificIndex.pls_refresh - - SpecificIndex.pls_delete - - SpecificIndex.pls_start_keeping_live - - Elastic8IndexStrategy.index_settings - - Elastic8IndexStrategy.index_mappings - -- add replacement multiindex methods to IndexStrategy (and friends) - - (classmethod) each_existing_index (based on index names from elastic; may be any hex) - - each_named_index (includes non-existent; ) - - get_index - - subnames - - is_current - - pls_setup - - pls_check_exists - - pls_start_keeping_live - - pls_teardown - - pls_handle_cardsearch - - pls_handle_valuesearch - - pls_ensure_fresh - - with_strategy_check (return copy of the strategy with another check) - - Elastic8IndexStrategy.define_current_indexes 
(abstractmethod) - - Elastic8IndexStrategy.each_named_index (based on current_index_definitions) - -- update `share.search.index_strategy` public interface (see __all__) - -- update existing base methods - - add strategy_check to indexname_prefix - - pls_get_default_for_searching (get strategy_check from es alias (or current)) - - pls_make_default_for_searching (by strategy instance (or strategy_check), not SpecificIndex) - From 9a6c904a690b2f2a0aef896b498789499fed6f09 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 23 Jan 2025 12:41:41 -0500 Subject: [PATCH 31/35] tidy names --- api/search/views.py | 4 ++-- share/search/index_strategy/trove_indexcard_flats.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/api/search/views.py b/api/search/views.py index 7cf781947..f8d6b4dcd 100644 --- a/api/search/views.py +++ b/api/search/views.py @@ -32,11 +32,11 @@ def _handle_request(self, request): if 'scroll' in queryparams: return http.HttpResponseForbidden(reason='Scroll is not supported.') try: - specific_index = index_strategy.get_strategy_for_sharev2_search(requested_index_strategy) + _index_strategy = index_strategy.get_strategy_for_sharev2_search(requested_index_strategy) except exceptions.IndexStrategyError as error: raise http.Http404(str(error)) try: - response_json = specific_index.pls_handle_search__passthru( + response_json = _index_strategy.pls_handle_search__passthru( request_body=request.data, request_queryparams=queryparams, ) diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index e5a6488d1..49874d189 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -294,7 +294,7 @@ def _make_actionset(indexcard_id, *actions): def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: return self.es8_client.search( - index=self.get_index('').full_index_name, + index=self.__index.full_index_name, body={ **(request_body or {}), 'track_total_hits': True, @@ -356,7 +356,7 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value logger.info(json.dumps(_search_kwargs, indent=2)) try: _es8_response = self.es8_client.search( - index=self.get_index('').full_index_name, + index=self.__index.full_index_name, **_search_kwargs, ) except elasticsearch8.TransportError as error: From 4e52c477b9d95e496278be313b9611be2ff84677 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 24 Jan 2025 12:34:47 -0500 Subject: [PATCH 32/35] respond to group review --- ARCHITECTURE.md | 5 +++++ share/search/index_strategy/__init__.py | 5 +++++ share/search/index_strategy/_base.py | 5 +---- share/search/index_strategy/elastic8.py | 14 -------------- .../search/index_strategy/sharev2_elastic5.py | 4 ---- .../index_strategy/_with_real_services.py | 18 +++++++++--------- 6 files changed, 20 insertions(+), 31 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 00f313da6..d2dd034d1 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -103,6 +103,11 @@ Multiple records which describe the same item/object are grouped by a the source repository. In most outward-facing views, default to showing only the most recent record for each suid. +### Conventions +(an incomplete list) + +- functions prefixed `pls_` ("please") are a request for something to happen + ## Why this? 
inspired by [this writeup](https://matklad.github.io/2021/02/06/ARCHITECTURE.md.html) and [this example architecture document](https://github.com/rust-analyzer/rust-analyzer/blob/d7c99931d05e3723d878bea5dc26766791fa4e69/docs/dev/architecture.md) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 4c020b654..c00d2fbf1 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -42,6 +42,11 @@ class _AvailableStrategies(enum.Enum): trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') +if __debug__: + for _strategy_enum in _AvailableStrategies: + assert _strategy_enum.name == _strategy_enum.value.strategy_name, 'expected _AvailableStrategies enum name to match strategy name' + + ### # module public interface diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index 858f535ab..a61e5532d 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -150,6 +150,7 @@ def pls_start_backfill(self): def pls_mark_backfill_complete(self): self.get_or_create_backfill().pls_mark_complete() + self.pls_refresh() # explicit refresh after backfill def pls_check_exists(self) -> bool: return all( @@ -309,10 +310,6 @@ def pls_start_keeping_live(self): def pls_stop_keeping_live(self): raise NotImplementedError - @abc.abstractmethod - def is_kept_live(self) -> bool: - raise NotImplementedError - def pls_get_mappings(self) -> dict: raise NotImplementedError diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 88082bf10..751af06c0 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -256,13 +256,6 @@ def pls_handle_search__passthru(self, request_body=None, request_queryparams=Non params=(request_queryparams or {}), ) - # override from IndexStrategy - def pls_mark_backfill_complete(self): - super().pls_mark_backfill_complete() - # explicit refresh after bulk operation - for _index in self.each_subnamed_index(): - _index.pls_refresh() - # override from IndexStrategy def pls_refresh(self): super().pls_refresh() # refreshes each index @@ -469,13 +462,6 @@ def pls_stop_keeping_live(self): ) logger.warning('%r: no longer kept live', self) - # abstract method from IndexStrategy.SpecificIndex - def is_kept_live(self) -> bool: - _kept_live = self.index_strategy._get_indexnames_for_alias( - self.index_strategy._alias_for_keeping_live, - ) - return (self.full_index_name in _kept_live) - def pls_get_mappings(self): return self.index_strategy.es8_client.indices.get_mapping(index=self.full_index_name).body diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index d2d1557e2..8e775569c 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -367,10 +367,6 @@ def pls_create(self): def pls_start_keeping_live(self): pass # there is just the one index, always kept live - # abstract method from IndexStrategy.SpecificIndex - def is_kept_live(self) -> bool: - return True # there is just the one index, always kept live - # abstract method from IndexStrategy.SpecificIndex def pls_stop_keeping_live(self): raise exceptions.IndexStrategyError( diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index d8e9fc866..8ad685026 100644 --- 
a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -85,7 +85,7 @@ def _assert_happypath_with_daemon(self, messages_chunk, expected_doc_count): assert False, 'checked and waited but the daemon did not do the thing' def _assert_setup_happypath(self): - # initial + # initial (no indexes exist) for _index in self.index_strategy.each_subnamed_index(): assert not _index.pls_check_exists() index_status = _index.pls_get_status() @@ -93,28 +93,28 @@ def _assert_setup_happypath(self): assert not index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count + # create each index for _index in self.index_strategy.each_subnamed_index(): _index.pls_create() - # create index - assert _index.pls_check_exists() + assert _index.pls_check_exists() # new! index_status = _index.pls_get_status() - assert index_status.creation_date + assert index_status.creation_date # new! assert not index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count - # keep index live (with ingested updates) + # start keeping each index live (with ingested updates) self.index_strategy.pls_start_keeping_live() for _index in self.index_strategy.each_subnamed_index(): index_status = _index.pls_get_status() assert index_status.creation_date - assert index_status.is_kept_live + assert index_status.is_kept_live # new! assert not index_status.is_default_for_searching assert not index_status.doc_count - # default index for searching + # make this version of the strategy the default for searching self.index_strategy.pls_make_default_for_searching() for _index in self.index_strategy.each_subnamed_index(): index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live - assert index_status.is_default_for_searching - assert not index_status.doc_count + assert index_status.is_default_for_searching # new! 
+ assert not index_status.doc_count # (still empty) From 7f17b1e5003edad3a0c880e5205aa130b12dcaa6 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 28 Jan 2025 10:11:08 -0500 Subject: [PATCH 33/35] fix: respect small page[size] while streaming --- trove/trovesearch/page_cursor.py | 15 ++++++++++++++- trove/trovesearch/search_handle.py | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py index 33aa7f8f6..e5e5ee3ff 100644 --- a/trove/trovesearch/page_cursor.py +++ b/trove/trovesearch/page_cursor.py @@ -49,6 +49,10 @@ def bounded_page_size(self) -> int: else int(self.page_size) ) + @property + def is_complete_page(self) -> bool: + return self.bounded_page_size == self.page_size + def as_queryparam_value(self) -> str: _cls_key = _PageCursorTypes(type(self)).name _as_json = json.dumps([_cls_key, *dataclasses.astuple(self)]) @@ -82,10 +86,19 @@ class OffsetCursor(PageCursor): # total_count: int | float (from PageCursor) start_offset: int = 0 + @property + def bounded_page_size(self) -> int: + # overrides PageCursor + _bounded_page_size = super().bounded_page_size + if (_bounded_page_size < self.page_size < MAX_OFFSET): + _remaining = self.page_size - self.start_offset + _bounded_page_size = int(min(_bounded_page_size, _remaining)) + return _bounded_page_size + def is_valid(self) -> bool: _end_offset = ( self.total_count - if self.bounded_page_size == self.page_size + if self.is_complete_page else min(self.total_count, self.page_size) ) return ( diff --git a/trove/trovesearch/search_handle.py b/trove/trovesearch/search_handle.py index 3278cf8c6..90f44265d 100644 --- a/trove/trovesearch/search_handle.py +++ b/trove/trovesearch/search_handle.py @@ -63,6 +63,8 @@ def __post_init__(self): return _page def get_next_streaming_handle(self) -> typing.Self | None: + if self.cursor.is_complete_page: + return None _next_cursor = self.cursor.next_cursor() if (_next_cursor is not None) and (self.handler is not None): assert isinstance(self.search_params, CardsearchParams) From 7c8396bad0874b8ac7c6d588a46447cc2d27c53e Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 28 Jan 2025 10:12:10 -0500 Subject: [PATCH 34/35] improve hacks around gathering --- trove/render/_simple_trovesearch.py | 2 +- trove/trovesearch/trovesearch_gathering.py | 141 +++++++++++++-------- 2 files changed, 89 insertions(+), 54 deletions(-) diff --git a/trove/render/_simple_trovesearch.py b/trove/render/_simple_trovesearch.py index 6827c7918..6e6ba6eb1 100644 --- a/trove/render/_simple_trovesearch.py +++ b/trove/render/_simple_trovesearch.py @@ -92,7 +92,7 @@ def _get_card_content( _card_content = ( next(self.response_gathering.ask(TROVE.resourceMetadata, focus=card)) if graph is None - else next(graph.q(card, TROVE.resourceMetadata)) + else next(graph.q(card, TROVE.resourceMetadata), None) ) elif isinstance(card, frozenset): _card_content = next( diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 8d0364bdb..0ceed3ccb 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -1,7 +1,7 @@ import dataclasses import logging import urllib.parse -from typing import ClassVar, Any +from typing import ClassVar, Any, Iterator, Iterable from primitive_metadata.primitive_rdf import ( Literal, @@ -116,7 +116,7 @@ class IndexcardFocus(_TypedFocus): # additional dataclass fields indexcard: trove_db.Indexcard = dataclasses.field(compare=False) - 
resourceMetadata: Any = dataclasses.field(compare=False, default=None) + resourceMetadata: Any = dataclasses.field(compare=False, default=None, repr=False) # TODO: per-field text search in rdf @@ -157,7 +157,7 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): while _current_handle is not None: _result_page = [] _card_foci = _load_cards_and_contents( - (_result.card_iri for _result in _current_handle.search_result_page), + card_iris=(_result.card_iri for _result in _current_handle.search_result_page), deriver_iri=deriver_iri, ) for _result in _current_handle.search_result_page or (): @@ -175,10 +175,19 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): (TROVE.indexCard, _result.card_iri), *_text_evidence_twoples, ))) - # hack around (current) limitations of primitive_metadata.gather: - # yield a redundant triple to make this IndexcardFocus gatherable + # hack around (current) limitations of primitive_metadata.gather + # (what with all these intermediate blank nodes and sequences): + # yield trove:resourceMetadata here (instead of another gatherer) _card_focus = _card_foci[_result.card_iri] - yield (_card_focus, RDF.type, IndexcardFocus.TYPE_IRI) + _card_twoples = _minimal_indexcard_twoples( + focus_identifiers=[ + _identifier.as_iri() + for _identifier in _card_focus.indexcard.focus_identifier_set.all() + ], + resource_metadata=_card_focus.resourceMetadata, + ) + for _pred, _obj in _card_twoples: + yield (_result.card_iri, _pred, _obj) yield (TROVE.searchResultPage, sequence(_result_page)) _current_handle = _current_handle.get_next_streaming_handle() @@ -226,39 +235,34 @@ def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): if _result.value_iri } if _value_iris: - _value_indexcards = ( - trove_db.Indexcard.objects - .filter( - focus_identifier_set__in=( - trove_db.ResourceIdentifier.objects - .queryset_for_iris(_value_iris) - ), - derived_indexcard_set__deriver_identifier__in=( - trove_db.ResourceIdentifier.objects - .queryset_for_iri(deriver_iri) - ), - ) - .prefetch_related('focus_identifier_set') - ) + _card_foci = _load_cards_and_contents(value_iris=_value_iris, deriver_iri=deriver_iri) else: - _value_indexcards = [] - _cards_by_suffuniq_iri = { - _identifier.sufficiently_unique_iri: _indexcard - for _indexcard in _value_indexcards - for _identifier in _indexcard.focus_identifier_set.all() + _card_foci = {} + _card_foci_by_suffuniq_iri: dict[str, IndexcardFocus] = { + _identifier.sufficiently_unique_iri: _focus + for _focus in _card_foci.values() + for _identifier in _focus.indexcard.focus_identifier_set.all() } for _result in focus.search_handle.search_result_page or (): _indexcard_obj = None if _result.value_iri is not None: - _indexcard = _cards_by_suffuniq_iri.get( + _card_focus = _card_foci_by_suffuniq_iri.get( get_sufficiently_unique_iri(_result.value_iri), ) - if _indexcard is not None: - _indexcard_obj = _indexcard.get_iri() - # hack around (current) limitations of primitive_metadata.gather: - # yield a redundant triple to make this IndexcardFocus gatherable - _card_focus = IndexcardFocus.new(_indexcard_obj, indexcard=_indexcard) - yield (_card_focus, RDF.type, IndexcardFocus.TYPE_IRI) + if _card_focus is not None: + _indexcard_obj = _card_focus.indexcard.get_iri() + # hack around (current) limitations of primitive_metadata.gather + # (what with all these intermediate blank nodes and sequences): + # yield trove:resourceMetadata here (instead of another gatherer) + _card_twoples = 
_minimal_indexcard_twoples( + focus_identifiers=[ + _identifier.as_iri() + for _identifier in _card_focus.indexcard.focus_identifier_set.all() + ], + resource_metadata=_card_focus.resourceMetadata, + ) + for _pred, _obj in _card_twoples: + yield (_indexcard_obj, _pred, _obj) if _indexcard_obj is None: # no actual indexcard; put what we know in a blanknode-indexcard _indexcard_obj = _valuesearch_result_as_indexcard_blanknode(_result) @@ -325,31 +329,39 @@ def gather_card_contents(focus: IndexcardFocus, *, deriver_iri, **kwargs): yield (TROVE.resourceMetadata, focus.resourceMetadata) else: _iri = focus.single_iri() - _loaded_foci = _load_cards_and_contents([_iri], deriver_iri) + _loaded_foci = _load_cards_and_contents(card_iris=[_iri], deriver_iri=deriver_iri) _loaded_metadata = _loaded_foci[_iri].resourceMetadata yield (TROVE.resourceMetadata, _loaded_metadata) -def _load_cards_and_contents(card_iris, deriver_iri) -> dict[str, IndexcardFocus]: +def _load_cards_and_contents(*, card_iris=None, value_iris=None, deriver_iri) -> dict[str, IndexcardFocus]: return ( - _load_cards_and_extracted_rdf_contents(card_iris) + _load_cards_and_extracted_rdf_contents(card_iris, value_iris) if deriver_iri is None - else _load_cards_and_derived_contents(card_iris, deriver_iri) + else _load_cards_and_derived_contents(card_iris, value_iris, deriver_iri) ) -def _load_cards_and_extracted_rdf_contents(card_iris) -> dict[str, IndexcardFocus]: +def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> dict[str, IndexcardFocus]: _card_namespace = trove_indexcard_namespace() - _indexcard_uuids = { - iri_minus_namespace(_card_iri, namespace=_card_namespace) - for _card_iri in card_iris - } _indexcard_rdf_qs = ( trove_db.LatestIndexcardRdf.objects - .filter(indexcard__uuid__in=_indexcard_uuids) .select_related('indexcard') .prefetch_related('indexcard__focus_identifier_set') ) + if card_iris is not None: + _indexcard_uuids = { + iri_minus_namespace(_card_iri, namespace=_card_namespace) + for _card_iri in card_iris + } + _indexcard_rdf_qs = _indexcard_rdf_qs.filter(indexcard__uuid__in=_indexcard_uuids) + if value_iris is not None: + _indexcard_rdf_qs = _indexcard_rdf_qs.filter( + indexcard__focus_identifier_set__in=( + trove_db.ResourceIdentifier.objects + .queryset_for_iris(value_iris) + ), + ) _card_foci: dict[str, IndexcardFocus] = {} for _indexcard_rdf in _indexcard_rdf_qs: _card = _indexcard_rdf.indexcard @@ -366,17 +378,12 @@ def _load_cards_and_extracted_rdf_contents(card_iris) -> dict[str, IndexcardFocu return _card_foci -def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> dict[str, IndexcardFocus]: +def _load_cards_and_derived_contents(card_iris, value_iris, deriver_iri: str) -> dict[str, IndexcardFocus]: _card_namespace = trove_indexcard_namespace() - _indexcard_uuids = { - iri_minus_namespace(_card_iri, namespace=_card_namespace) - for _card_iri in card_iris - } # include pre-formatted data from a DerivedIndexcard _derived_indexcard_qs = ( trove_db.DerivedIndexcard.objects .filter( - upriver_indexcard__uuid__in=_indexcard_uuids, deriver_identifier__in=( trove_db.ResourceIdentifier.objects .queryset_for_iri(deriver_iri) @@ -385,6 +392,21 @@ def _load_cards_and_derived_contents(card_iris, deriver_iri: str) -> dict[str, I .select_related('upriver_indexcard') .prefetch_related('upriver_indexcard__focus_identifier_set') ) + if card_iris is not None: + _indexcard_uuids = { + iri_minus_namespace(_card_iri, namespace=_card_namespace) + for _card_iri in card_iris + } + 
_derived_indexcard_qs = _derived_indexcard_qs.filter( + upriver_indexcard__uuid__in=_indexcard_uuids, + ) + if value_iris is not None: + _derived_indexcard_qs = _derived_indexcard_qs.filter( + upriver_indexcard__focus_identifier_set__in=( + trove_db.ResourceIdentifier.objects + .queryset_for_iris(value_iris) + ), + ) _card_foci: dict[str, IndexcardFocus] = {} for _derived in _derived_indexcard_qs: _card_iri = _derived.upriver_indexcard.get_iri() @@ -437,12 +459,25 @@ def _valuesearch_result_as_json(result: ValuesearchResult) -> Literal: ) +def _minimal_indexcard_twoples( + focus_identifiers: Iterable[str], + resource_metadata: rdf.Literal, +) -> Iterator[rdf.RdfTwople]: + yield (RDF.type, TROVE.Indexcard) + for _identifier in focus_identifiers: + yield (TROVE.focusIdentifier, ( + _identifier + if isinstance(_identifier, rdf.Literal) + else literal(_identifier) + )) + yield (TROVE.resourceMetadata, resource_metadata) + + def _valuesearch_result_as_indexcard_blanknode(result: ValuesearchResult) -> frozenset: - return blanknode({ - RDF.type: {TROVE.Indexcard}, - TROVE.focusIdentifier: {literal(result.value_iri or result.value_value)}, - TROVE.resourceMetadata: {_valuesearch_result_as_json(result)}, - }) + return frozenset(_minimal_indexcard_twoples( + focus_identifiers=[literal(result.value_iri or result.value_value)], + resource_metadata=_valuesearch_result_as_json(result), + )) def _osfmap_json(tripledict, focus_iri): From 2d267eb000cd89a17c63493bde89e4c31f93a921 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 28 Jan 2025 10:12:29 -0500 Subject: [PATCH 35/35] more legible 'simple' json --- trove/render/simple_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trove/render/simple_json.py b/trove/render/simple_json.py index a962d8aae..10f896fff 100644 --- a/trove/render/simple_json.py +++ b/trove/render/simple_json.py @@ -41,7 +41,7 @@ def _stream_json(self, card_pages: typing.Iterator[dict[str, dict]]): for _card_iri, _osfmap_json in _page.items(): if _datum_prefix is not None: yield _datum_prefix - yield json.dumps(self._render_card_content(_card_iri, _osfmap_json)) + yield json.dumps(self._render_card_content(_card_iri, _osfmap_json), indent=2) _datum_prefix = ',' _nondata = json.dumps({ 'meta': self._render_meta(),
{% trans "shortname" %}{% trans "index" %} {% trans "created" %} {% trans "is kept live" %} {% trans "is default for searching" %}