diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 00f313da6..d2dd034d1 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -103,6 +103,11 @@ Multiple records which describe the same item/object are grouped by a the source repository. In most outward-facing views, default to showing only the most recent record for each suid. +### Conventions +(an incomplete list) + +- functions prefixed `pls_` ("please") are a request for something to happen + ## Why this? inspired by [this writeup](https://matklad.github.io/2021/02/06/ARCHITECTURE.md.html) and [this example architecture document](https://github.com/rust-analyzer/rust-analyzer/blob/d7c99931d05e3723d878bea5dc26766791fa4e69/docs/dev/architecture.md) diff --git a/api/search/views.py b/api/search/views.py index 12075a82d..f8d6b4dcd 100644 --- a/api/search/views.py +++ b/api/search/views.py @@ -28,15 +28,15 @@ def post(self, request): def _handle_request(self, request): queryparams = request.query_params.dict() - requested_index_strategy = queryparams.pop('indexStrategy', None) + requested_index_strategy = queryparams.get('indexStrategy', None) if 'scroll' in queryparams: return http.HttpResponseForbidden(reason='Scroll is not supported.') try: - specific_index = index_strategy.get_index_for_sharev2_search(requested_index_strategy) + _index_strategy = index_strategy.get_strategy_for_sharev2_search(requested_index_strategy) except exceptions.IndexStrategyError as error: raise http.Http404(str(error)) try: - response_json = specific_index.pls_handle_search__sharev2_backcompat( + response_json = _index_strategy.pls_handle_search__passthru( request_body=request.data, request_queryparams=queryparams, ) diff --git a/api/views/feeds.py b/api/views/feeds.py index 417d479fa..f2a74ecd6 100644 --- a/api/views/feeds.py +++ b/api/views/feeds.py @@ -34,7 +34,7 @@ class MetadataRecordsRSS(Feed): description = 'Updates to the SHARE open dataset' author_name = 'SHARE' - _search_index: index_strategy.IndexStrategy.SpecificIndex + _search_strategy: index_strategy.IndexStrategy def title(self, obj): query = json.dumps(obj.get('query', 'All')) @@ -43,7 +43,7 @@ def title(self, obj): def get_object(self, request): self._order = request.GET.get('order') elastic_query = request.GET.get('elasticQuery') - self._search_index = index_strategy.get_index_for_sharev2_search(request.GET.get('indexStrategy')) + self._search_strategy = index_strategy.get_strategy_for_sharev2_search(request.GET.get('indexStrategy')) if self._order not in {'date_modified', 'date_updated', 'date_created', 'date_published'}: self._order = 'date_modified' @@ -64,7 +64,7 @@ def get_object(self, request): def items(self, obj): try: - json_response = self._search_index.pls_handle_search__sharev2_backcompat( + json_response = self._search_strategy.pls_handle_search__passthru( request_body=obj, ) except IndexStrategyError: diff --git a/share/admin/__init__.py b/share/admin/__init__.py index 9e68fe2e9..7174cc418 100644 --- a/share/admin/__init__.py +++ b/share/admin/__init__.py @@ -318,15 +318,15 @@ class FormattedMetadataRecordAdmin(admin.ModelAdmin): class IndexBackfillAdmin(admin.ModelAdmin): readonly_fields = ( 'index_strategy_name', - 'specific_indexname', + 'strategy_checksum', 'error_type', 'error_message', 'error_context', ) paginator = TimeLimitedPaginator - list_display = ('index_strategy_name', 'backfill_status', 'created', 'modified', 'specific_indexname') + list_display = ('index_strategy_name', 'backfill_status', 'created', 'modified', 'strategy_checksum') show_full_result_count = False 
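# (aside: the admin can list and edit `strategy_checksum` without a schema
# migration because IndexBackfill -- in share/models/index_backfill.py, later
# in this diff -- aliases the old `specific_indexname` column via a property.
# a minimal runnable sketch of that alias pattern; plain Python stands in for
# the Django model, and the checksum value below is made up:
class _BackfillAliasSketch:
    specific_indexname = ''  # stands in for the real CharField

    @property
    def strategy_checksum(self):
        return self.specific_indexname  # reads forward to the old column

    @strategy_checksum.setter
    def strategy_checksum(self, value):
        self.specific_indexname = value  # writes forward too

_backfill = _BackfillAliasSketch()
_backfill.strategy_checksum = 'abcd1234'  # hypothetical checksum hexdigest
assert _backfill.specific_indexname == 'abcd1234'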
- search_fields = ('index_strategy_name', 'specific_indexname',) + search_fields = ('index_strategy_name', 'strategy_checksum',) actions = ('reset_to_initial',) def reset_to_initial(self, request, queryset): diff --git a/share/admin/search.py b/share/admin/search.py index fbf2446b0..95614a0fc 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -7,7 +7,13 @@ from share.admin.util import admin_url from share.models.index_backfill import IndexBackfill from share.search.index_messenger import IndexMessenger -from share.search import index_strategy +from share.search.index_strategy import ( + IndexStrategy, + all_strategy_names, + each_strategy, + parse_strategy_name, + parse_specific_index_name, +) logger = logging.getLogger(__name__) @@ -25,19 +31,15 @@ def search_indexes_view(request): }, ) if request.method == 'POST': - _specific_index = index_strategy.get_specific_index(request.POST['specific_indexname']) + _index_strategy = parse_strategy_name(request.POST['strategy_name']) _pls_doer = PLS_DOERS[request.POST['pls_do']] - _pls_doer(_specific_index) - _redirect_id = ( - _specific_index.index_strategy.name - if _pls_doer is _pls_delete - else _specific_index.indexname - ) + _pls_doer(_index_strategy) + _redirect_id = _index_strategy.strategy_name return HttpResponseRedirect('#'.join((request.path, _redirect_id))) def search_index_mappings_view(request, index_name): - _specific_index = index_strategy.get_specific_index(index_name) + _specific_index = parse_specific_index_name(index_name) _mappings = _specific_index.pls_get_mappings() return JsonResponse(_mappings) @@ -52,30 +54,23 @@ def _mappings_url_prefix(): def _index_status_by_strategy(): - backfill_by_indexname: dict[str, IndexBackfill] = { - backfill.specific_indexname: backfill - for backfill in ( + _backfill_by_checksum: dict[str, IndexBackfill] = { + _backfill.strategy_checksum: _backfill + for _backfill in ( IndexBackfill.objects - .filter(index_strategy_name__in=index_strategy.all_index_strategies().keys()) + .filter(index_strategy_name__in=all_strategy_names()) ) } status_by_strategy = {} _messenger = IndexMessenger() - for _index_strategy in index_strategy.all_index_strategies().values(): - current_index = _index_strategy.for_current_index() - status_by_strategy[_index_strategy.name] = { - 'current': { - 'status': current_index.pls_get_status(), - 'backfill': _serialize_backfill( - current_index, - backfill_by_indexname.get(current_index.indexname), - ), - }, - 'prior': sorted(( - specific_index.pls_get_status() - for specific_index in _index_strategy.each_specific_index() - if not specific_index.is_current - ), reverse=True), + for _index_strategy in each_strategy(): + _current_backfill = ( + _backfill_by_checksum.get(str(_index_strategy.CURRENT_STRATEGY_CHECKSUM)) + or _backfill_by_checksum.get(_index_strategy.indexname_prefix) # backcompat + ) + status_by_strategy[_index_strategy.strategy_name] = { + 'status': _index_strategy.pls_get_strategy_status(), + 'backfill': _serialize_backfill(_index_strategy, _current_backfill), 'queues': [ { 'name': _queue_name, @@ -91,14 +86,14 @@ def _index_status_by_strategy(): def _serialize_backfill( - specific_index: index_strategy.IndexStrategy.SpecificIndex, + strategy: IndexStrategy, backfill: IndexBackfill | None, ): - if not specific_index.is_current: + if not strategy.is_current: return {} if not backfill: return { - 'can_start_backfill': specific_index.pls_check_exists(), + 'can_start_backfill': strategy.pls_check_exists(), } return { 'backfill_status': 
backfill.backfill_status, @@ -109,35 +104,35 @@ def _serialize_backfill( } -def _pls_setup(specific_index): - assert specific_index.is_current - specific_index.pls_setup() +def _pls_setup(index_strategy: IndexStrategy): + assert index_strategy.is_current + index_strategy.pls_setup() -def _pls_start_keeping_live(specific_index): - specific_index.pls_start_keeping_live() +def _pls_start_keeping_live(index_strategy: IndexStrategy): + index_strategy.pls_start_keeping_live() -def _pls_stop_keeping_live(specific_index): - specific_index.pls_stop_keeping_live() +def _pls_stop_keeping_live(index_strategy: IndexStrategy): + index_strategy.pls_stop_keeping_live() -def _pls_start_backfill(specific_index): - assert specific_index.is_current - specific_index.index_strategy.pls_start_backfill() +def _pls_start_backfill(index_strategy: IndexStrategy): + assert index_strategy.is_current + index_strategy.pls_start_backfill() -def _pls_mark_backfill_complete(specific_index): - specific_index.index_strategy.pls_mark_backfill_complete() +def _pls_mark_backfill_complete(index_strategy: IndexStrategy): + index_strategy.pls_mark_backfill_complete() -def _pls_make_default_for_searching(specific_index): - specific_index.index_strategy.pls_make_default_for_searching(specific_index) +def _pls_make_default_for_searching(index_strategy: IndexStrategy): + index_strategy.pls_make_default_for_searching() -def _pls_delete(specific_index): - assert not specific_index.is_current - specific_index.pls_delete() +def _pls_delete(index_strategy: IndexStrategy): + assert not index_strategy.is_current + index_strategy.pls_teardown() PLS_DOERS = { diff --git a/share/bin/search.py b/share/bin/search.py index 69f5c0eff..8ccb65c4d 100644 --- a/share/bin/search.py +++ b/share/bin/search.py @@ -26,11 +26,11 @@ def search(args, argv): @search.subcommand('Drop the Elasticsearch index') def purge(args, argv): """ - Usage: {0} search purge <index_names>... + Usage: {0} search purge <strategy_names>... """ - for index_name in args['<index_names>']: - specific_index = index_strategy.get_specific_index(index_name) - specific_index.pls_delete() + for _strategy_name in args['<strategy_names>']: + _strategy = index_strategy.parse_strategy_name(_strategy_name) + _strategy.pls_teardown()
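# (aside: a quick sketch of the name format `search purge` now accepts -- a
# strategy name optionally followed by a strategy check (checksum hexdigest),
# joined by the `__` delimiter defined in _indexnames.py further below; the
# example values here are made up, mirroring parse_strategy_name:
INDEXNAME_DELIM = '__'

def _parse_indexname_parts(name):
    return name.split(INDEXNAME_DELIM)

(_strategy_name, *_etc) = _parse_indexname_parts('trovesearch_denorm__abcd1234')
assert _strategy_name == 'trovesearch_denorm'
assert (_etc[0] if _etc else '') == 'abcd1234'  # becomes strategy_check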
""" - for index_name in args['']: - specific_index = index_strategy.get_specific_index(index_name) - specific_index.pls_delete() + for _strategy_name in args['']: + _strategy = index_strategy.parse_strategy_name(_strategy_name) + _strategy.pls_teardown() @search.subcommand('Create indicies and apply mappings') @@ -41,25 +41,16 @@ def setup(args, argv): """ _is_initial = args.get('--initial') if _is_initial: - _specific_indexes = [ - _index_strategy.for_current_index() - for _index_strategy in index_strategy.all_index_strategies().values() - ] + for _index_strategy in index_strategy.each_strategy(): + _index_strategy.pls_setup() else: _index_or_strategy_name = args[''] try: - _specific_indexes = [index_strategy.get_specific_index(_index_or_strategy_name)] + _strategy = index_strategy.get_strategy(_index_or_strategy_name) except IndexStrategyError: - try: - _specific_indexes = [ - index_strategy.get_specific_index(_index_or_strategy_name), - ] - except IndexStrategyError: - raise IndexStrategyError(f'unrecognized index or strategy name "{_index_or_strategy_name}"') - for _specific_index in _specific_indexes: - _specific_index.pls_setup( - skip_backfill=_is_initial, # for initial setup, there's nothing back to fill - ) + raise IndexStrategyError(f'unrecognized index or strategy name "{_index_or_strategy_name}"') + else: + _strategy.pls_setup() @search.subcommand('Start the search indexing daemon') diff --git a/share/checks.py b/share/checks.py index a53d2a228..1dda809d6 100644 --- a/share/checks.py +++ b/share/checks.py @@ -5,7 +5,7 @@ def check_all_index_strategies_current(app_configs, **kwargs): from share.search import index_strategy from share.search.exceptions import IndexStrategyError errors = [] - for _index_strategy in index_strategy.all_index_strategies().values(): + for _index_strategy in index_strategy.each_strategy(): try: _index_strategy.assert_strategy_is_current() except IndexStrategyError as exception: diff --git a/share/models/index_backfill.py b/share/models/index_backfill.py index c8e92ffed..47dff03c7 100644 --- a/share/models/index_backfill.py +++ b/share/models/index_backfill.py @@ -68,6 +68,16 @@ def __repr__(self): def __str__(self): return repr(self) + @property + def strategy_checksum(self): + # back-compat alias for specific_indexname (may be removed if that's renamed via migration) + return self.specific_indexname # for backcompat + + @strategy_checksum.setter + def strategy_checksum(self, value): + # back-compat alias for specific_indexname (may be removed if that's renamed via migration) + self.specific_indexname = value + @contextlib.contextmanager def mutex(self): with IndexBackfill.objects.get_with_mutex(pk=self.pk) as index_backfill: @@ -76,14 +86,14 @@ def mutex(self): def pls_start(self, index_strategy): with self.mutex() as locked_self: - assert locked_self.index_strategy_name == index_strategy.name - current_index = index_strategy.for_current_index() - if locked_self.specific_indexname == current_index.indexname: + assert locked_self.index_strategy_name == index_strategy.strategy_name + _current_checksum = str(index_strategy.CURRENT_STRATEGY_CHECKSUM) + if locked_self.strategy_checksum == _current_checksum: # what is "current" has not changed -- should be INITIAL assert locked_self.backfill_status == IndexBackfill.INITIAL else: # what is "current" has changed! 
disregard backfill_status - locked_self.specific_indexname = current_index.indexname + locked_self.strategy_checksum = _current_checksum locked_self.backfill_status = IndexBackfill.INITIAL locked_self.__update_error(None) try: diff --git a/share/search/daemon.py b/share/search/daemon.py index 1fa7cce23..4d33a5b50 100644 --- a/share/search/daemon.py +++ b/share/search/daemon.py @@ -68,7 +68,7 @@ def start_daemonthreads_for_strategy(self, index_strategy): return _daemon def start_all_daemonthreads(self): - for _index_strategy in index_strategy.all_index_strategies().values(): + for _index_strategy in index_strategy.each_strategy(): self.start_daemonthreads_for_strategy(_index_strategy) def stop_daemonthreads(self, *, wait=False): @@ -119,7 +119,7 @@ def get_consumers(self, Consumer, channel): ] def __repr__(self): - return '<{}({})>'.format(self.__class__.__name__, self.__index_strategy.name) + return '<{}({})>'.format(self.__class__.__name__, self.__index_strategy.strategy_name) def consume(self, *args, **kwargs): # wrap `consume` in `kombu.Connection.ensure`, following guidance from @@ -191,7 +191,7 @@ def on_message(self, body, message): continue def __repr__(self): - return '<{}({})>'.format(self.__class__.__name__, self.index_strategy.name) + return '<{}({})>'.format(self.__class__.__name__, self.index_strategy.strategy_name) @dataclasses.dataclass @@ -232,11 +232,12 @@ def _the_loop_itself(self): def _raise_if_backfill_noncurrent(self): if self.message_type.is_backfill: index_backfill = self.index_strategy.get_or_create_backfill() - if index_backfill.specific_indexname != self.index_strategy.current_indexname: + _current_checksum = str(self.index_strategy.CURRENT_STRATEGY_CHECKSUM) + if index_backfill.strategy_checksum != _current_checksum: raise exceptions.DaemonSetupError( 'IndexerDaemon observes conflicting currence:' - f'\n\tIndexBackfill (from database) says current is "{index_backfill.specific_indexname}"' - f'\n\tIndexStrategy (from static code) says current is "{self.index_strategy.current_indexname}"' + f'\n\tIndexBackfill (from database) says current is "{index_backfill.strategy_checksum}"' + f'\n\tIndexStrategy (from static code) says current is "{_current_checksum}"' '\n\t(may be the daemon is running old code -- will die and retry,' ' but if this keeps happening you may need to reset backfill_status' ' to INITIAL and restart the backfill)' diff --git a/share/search/index_messenger.py b/share/search/index_messenger.py index 0cd51293b..67a7b154b 100644 --- a/share/search/index_messenger.py +++ b/share/search/index_messenger.py @@ -32,7 +32,7 @@ def __init__(self, *, celery_app=None, index_strategys=None): if celery_app is None else celery_app ) - self.index_strategys = index_strategys or tuple(index_strategy.all_index_strategies().values()) + self.index_strategys = index_strategys or tuple(index_strategy.each_strategy()) def notify_indexcard_update(self, indexcards, *, urgent=False): self.send_messages_chunk( diff --git a/share/search/index_status.py b/share/search/index_status.py index c413503a3..1ed16f9b7 100644 --- a/share/search/index_status.py +++ b/share/search/index_status.py @@ -1,11 +1,30 @@ +from __future__ import annotations import dataclasses @dataclasses.dataclass(order=True) class IndexStatus: creation_date: str - index_strategy_name: str + index_subname: str specific_indexname: str + doc_count: int = 0 is_kept_live: bool = False is_default_for_searching: bool = False - doc_count: int = 0 + + +@dataclasses.dataclass +class StrategyStatus: + strategy_name: str + 
strategy_check: str + is_set_up: bool + is_default_for_searching: bool + index_statuses: list[IndexStatus] + existing_prior_strategies: list[StrategyStatus] + + @property + def strategy_id(self): + return f'{self.strategy_name}__{self.strategy_check}' + + @property + def is_kept_live(self) -> bool: + return all(_indexstatus.is_kept_live for _indexstatus in self.index_statuses) diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index 297702475..c00d2fbf1 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -import functools -from types import MappingProxyType +import enum +from typing import Iterator from django.conf import settings @@ -12,79 +12,112 @@ from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy from .trovesearch_denorm import TrovesearchDenormIndexStrategy from ._base import IndexStrategy +from ._indexnames import parse_indexname_parts __all__ = ( 'IndexStrategy', - 'all_index_strategies', - 'get_index_for_sharev2_search', - 'get_index_for_trovesearch', - 'get_index_strategy', - 'get_specific_index', + 'all_strategy_names', + 'each_strategy', + 'get_strategy', + 'get_strategy_for_sharev2_search', + 'get_strategy_for_trovesearch', + 'parse_specific_index_name', + 'parse_strategy_name', ) -@functools.cache -def all_index_strategies() -> MappingProxyType[str, IndexStrategy]: - return MappingProxyType({ - _strategy.name: _strategy - for _strategy in _iter_all_index_strategies() - }) +class _AvailableStrategies(enum.Enum): + '''static source of truth for available index strategies + (don't import this enum directly -- access via the other functions in this module) + ''' -def _iter_all_index_strategies(): if settings.ELASTICSEARCH5_URL: - yield Sharev2Elastic5IndexStrategy(name='sharev2_elastic5') + sharev2_elastic5 = Sharev2Elastic5IndexStrategy('sharev2_elastic5') + if settings.ELASTICSEARCH8_URL: - yield Sharev2Elastic8IndexStrategy(name='sharev2_elastic8') - yield TroveIndexcardFlatsIndexStrategy(name='trove_indexcard_flats') - yield TrovesearchDenormIndexStrategy(name='trovesearch_denorm') + sharev2_elastic8 = Sharev2Elastic8IndexStrategy('sharev2_elastic8') + trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats') + trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') -def get_index_strategy(strategyname: str) -> IndexStrategy: - try: - return all_index_strategies()[strategyname] - except KeyError: - raise IndexStrategyError(f'unknown index strategy "{strategyname}"') +if __debug__: + for _strategy_enum in _AvailableStrategies: + assert _strategy_enum.name == _strategy_enum.value.strategy_name, 'expected _AvailableStrategies enum name to match strategy name' + + +### +# module public interface +def all_strategy_names() -> frozenset[str]: + return frozenset(_AvailableStrategies.__members__.keys()) -def get_specific_index(indexname_or_strategyname: str, *, for_search=False) -> IndexStrategy.SpecificIndex: + +def each_strategy() -> Iterator[IndexStrategy]: + for _strat_enum in _AvailableStrategies: + yield _strat_enum.value + + +def get_strategy( + strategy_name: str, + strategy_check: str = '', + *, + for_search: bool = False, +) -> IndexStrategy: try: - _strategy = get_index_strategy(indexname_or_strategyname) - return ( - _strategy.pls_get_default_for_searching() - if for_search - else _strategy.for_current_index() - ) - except IndexStrategyError: - for _index_strategy in 
all_index_strategies().values(): - try: - return _index_strategy.for_specific_index(indexname_or_strategyname) - except IndexStrategyError: - pass - raise IndexStrategyError(f'unrecognized name "{indexname_or_strategyname}"') + _strategy: IndexStrategy = _AvailableStrategies[strategy_name].value + except KeyError: + raise IndexStrategyError(f'unrecognized strategy name "{strategy_name}"') + if strategy_check: + _strategy = _strategy.with_strategy_check(strategy_check) + return ( + _strategy.pls_get_default_for_searching() + if (for_search and not strategy_check) + else _strategy + ) -def get_index_for_sharev2_search(requested_name=None) -> IndexStrategy.SpecificIndex: +def get_strategy_for_sharev2_search(requested_name: str | None = None) -> IndexStrategy: if requested_name: _name = requested_name elif ( settings.ELASTICSEARCH5_URL and not FeatureFlag.objects.flag_is_up(FeatureFlag.ELASTIC_EIGHT_DEFAULT) ): - _name = 'sharev2_elastic5' + _name = _AvailableStrategies.sharev2_elastic5.name elif settings.ELASTICSEARCH8_URL: - _name = 'sharev2_elastic8' + _name = _AvailableStrategies.sharev2_elastic8.name else: raise IndexStrategyError('no available index for sharev2 search') - return get_specific_index(_name, for_search=True) + return parse_strategy_name(_name) -def get_index_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy.SpecificIndex: +def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy: if params.index_strategy_name: # specific strategy requested - _name = params.index_strategy_name - elif FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY): - _name = 'trovesearch_denorm' + _strategy = parse_strategy_name(params.index_strategy_name, for_search=True) else: - _name = 'trove_indexcard_flats' - return get_specific_index(_name, for_search=True) + _strategy_name = ( + _AvailableStrategies.trovesearch_denorm.name + if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) + else _AvailableStrategies.trove_indexcard_flats.name + ) + _strategy = get_strategy(_strategy_name, for_search=True) + return _strategy + + +def parse_specific_index_name(index_name: str) -> IndexStrategy.SpecificIndex: + try: + _strategy = parse_strategy_name(index_name) + return _strategy.parse_full_index_name(index_name) + except IndexStrategyError: + raise IndexStrategyError(f'invalid index_name "{index_name}"') + + +def parse_strategy_name(requested_strategy_name: str, *, for_search=False) -> IndexStrategy: + (_strategyname, *_etc) = parse_indexname_parts(requested_strategy_name) + return get_strategy( + strategy_name=_strategyname, + strategy_check=(_etc[0] if _etc else ''), + for_search=for_search + ) diff --git a/share/search/index_strategy/_base.py b/share/search/index_strategy/_base.py index bafec1fa4..a61e5532d 100644 --- a/share/search/index_strategy/_base.py +++ b/share/search/index_strategy/_base.py @@ -1,4 +1,6 @@ +from __future__ import annotations import abc +import dataclasses import functools import logging import typing @@ -6,7 +8,10 @@ from share.search import messages from share.models.index_backfill import IndexBackfill from share.search.exceptions import IndexStrategyError -from share.search.index_status import IndexStatus +from share.search.index_status import ( + IndexStatus, + StrategyStatus, +) from share.util.checksum_iri import ChecksumIri from trove.trovesearch.search_params import ( CardsearchParams, @@ -16,11 +21,13 @@ CardsearchHandle, ValuesearchHandle, ) +from . 
import _indexnames as indexnames logger = logging.getLogger(__name__) +@dataclasses.dataclass(frozen=True) class IndexStrategy(abc.ABC): '''an abstraction for indexes in different places and ways. @@ -37,40 +44,49 @@ class IndexStrategy(abc.ABC): * may know of version- or cluster-specific features (should include identifiers like version numbers in subclass name) ''' - CURRENT_STRATEGY_CHECKSUM: ChecksumIri # set on subclasses to protect against accidents + CURRENT_STRATEGY_CHECKSUM: typing.ClassVar[ChecksumIri] # set on subclasses to protect against accidents + + strategy_name: str + strategy_check: str = '' # if unspecified, uses current checksum + + def __post_init__(self): + indexnames.raise_if_invalid_indexname_part(self.strategy_name) + if not self.strategy_check: + object.__setattr__(self, 'strategy_check', self.CURRENT_STRATEGY_CHECKSUM.hexdigest) + indexnames.raise_if_invalid_indexname_part(self.strategy_check) - def __init__(self, name): - self.name = name + @classmethod + @functools.cache + def index_subname_set(cls) -> frozenset[str]: + return frozenset(cls.each_index_subname()) - def __repr__(self): - return ''.join(( - self.__class__.__qualname__, - f'(name="{self.name}")' - )) + def each_subnamed_index(self) -> typing.Iterator[SpecificIndex]: + for _subname in self.index_subname_set(): + yield self.get_index(_subname) + + @property + def nonurgent_messagequeue_name(self) -> str: + return f'{self.strategy_name}.nonurgent' @property - def nonurgent_messagequeue_name(self): - return f'{self.name}.nonurgent' + def urgent_messagequeue_name(self) -> str: + return f'{self.strategy_name}.urgent' @property - def urgent_messagequeue_name(self): - return f'{self.name}.urgent' + def indexname_prefix_parts(self) -> list[str]: + return [self.strategy_name, self.strategy_check] @property - def indexname_prefix(self): - return f'{self.name}__' + def indexname_prefix(self) -> str: + return indexnames.combine_indexname_parts(*self.indexname_prefix_parts) @property - def indexname_wildcard(self): + def indexname_wildcard(self) -> str: return f'{self.indexname_prefix}*' - @functools.cached_property - def current_indexname(self): - self.assert_strategy_is_current() - return ''.join(( - self.indexname_prefix, - self.CURRENT_STRATEGY_CHECKSUM.hexdigest, - )) + @property + def is_current(self) -> bool: + return self.strategy_check == self.CURRENT_STRATEGY_CHECKSUM.hexdigest def assert_message_type(self, message_type: messages.MessageType): if message_type not in self.supported_message_types: @@ -91,14 +107,42 @@ def assert_strategy_is_current(self): ) ```''') - def for_specific_index(self, specific_indexname) -> 'IndexStrategy.SpecificIndex': - return self.SpecificIndex(self, specific_indexname) # type: ignore[abstract] - - def for_current_index(self) -> 'IndexStrategy.SpecificIndex': - return self.for_specific_index(self.current_indexname) + def get_index(self, subname: str) -> SpecificIndex: + return self.SpecificIndex(self, subname) # type: ignore[abstract] + + def parse_full_index_name(self, index_name: str) -> SpecificIndex: + _parts = indexnames.parse_indexname_parts(index_name) + try: + (_strategy_name, _strategy_check, *_etc) = _parts + except ValueError: + raise IndexStrategyError(f'expected "strategyname__strategycheck", at least (got "{index_name}")') + if _strategy_name != self.strategy_name: + raise IndexStrategyError(f'this index belongs to another strategy (expected strategy name "{self.strategy_name}"; got "{_strategy_name}" from index name {index_name})') + _strategy = 
self.with_strategy_check(_strategy_check) + return _strategy.get_index(_etc[0] if _etc else '') + + def with_strategy_check(self, strategy_check: str) -> IndexStrategy: + return dataclasses.replace(self, strategy_check=strategy_check) + + def pls_setup(self, *, skip_backfill=False) -> None: + if not self.is_current: + raise IndexStrategyError('cannot setup a non-current strategy') + for _index in self.each_subnamed_index(): + _index.pls_create() + _index.pls_start_keeping_live() + if skip_backfill: + _backfill = self.get_or_create_backfill() + _backfill.backfill_status = _backfill.COMPLETE + _backfill.save() + + def pls_teardown(self) -> None: + for _index in self.each_existing_index(): + _index.pls_delete() def get_or_create_backfill(self): - (index_backfill, _) = IndexBackfill.objects.get_or_create(index_strategy_name=self.name) + (index_backfill, _) = IndexBackfill.objects.get_or_create( + index_strategy_name=self.strategy_name, + ) return index_backfill def pls_start_backfill(self): @@ -106,6 +150,73 @@ def pls_start_backfill(self): def pls_mark_backfill_complete(self): self.get_or_create_backfill().pls_mark_complete() + self.pls_refresh() # explicit refresh after backfill + + def pls_check_exists(self) -> bool: + return all( + _index.pls_check_exists() + for _index in self.each_subnamed_index() + ) + + def pls_refresh(self) -> None: + for _index in self.each_subnamed_index(): + _index.pls_refresh() + + def pls_start_keeping_live(self): + for _index in self.each_subnamed_index(): + _index.pls_start_keeping_live() + + def pls_stop_keeping_live(self): + for _index in self.each_live_index(): + _index.pls_stop_keeping_live() + + def pls_get_strategy_status(self) -> StrategyStatus: + _index_statuses: list[IndexStatus] = [] + _prior_strategy_statuses: list[StrategyStatus] = [] + if self.is_current: + _index_statuses = [ + _index.pls_get_status() + for _index in self.each_subnamed_index() + ] + _prior_strategies = { + _index.index_strategy + for _index in self.each_existing_index(any_strategy_check=True) + if not _index.index_strategy.is_current + } + _prior_strategy_statuses = [ + _strategy.pls_get_strategy_status() + for _strategy in _prior_strategies + ] + else: + _index_statuses = [ + _index.pls_get_status() + for _index in self.each_existing_index() + ] + return StrategyStatus( + strategy_name=self.strategy_name, + strategy_check=self.strategy_check, + is_set_up=self.pls_check_exists(), + is_default_for_searching=(self == self.pls_get_default_for_searching()), + index_statuses=_index_statuses, + existing_prior_strategies=_prior_strategy_statuses, + ) + + ### + # abstract methods (required for concrete subclasses) + + @classmethod + @abc.abstractmethod + def compute_strategy_checksum(self) -> ChecksumIri: + '''get a dict (json-serializable and thereby checksummable) of all + configuration held still by this IndexStrategy subclass -- changes + in the checksum may result in new indices being created and filled + ''' + raise NotImplementedError + + @classmethod + @abc.abstractmethod + def each_index_subname(self) -> typing.Iterable[str]: + raise NotImplementedError @property @abc.abstractmethod @@ -118,16 +229,11 @@ def backfill_message_type(self) -> messages.MessageType: raise NotImplementedError @abc.abstractmethod - def compute_strategy_checksum(self) -> ChecksumIri: - '''get a dict (json-serializable and thereby checksummable) of all - configuration held still by this IndexStrategy instance -- changes - in this value's checksum may invoke changes in index lifecycle, as - may be defined 
by IndexStrategy subclasses - ''' + def each_existing_index(self, *, any_strategy_check: bool = False) -> typing.Iterator[SpecificIndex]: raise NotImplementedError @abc.abstractmethod - def each_specific_index(self) -> 'typing.Iterable[SpecificIndex]': + def each_live_index(self, *, any_strategy_check: bool = False) -> typing.Iterator[SpecificIndex]: raise NotImplementedError @abc.abstractmethod @@ -135,57 +241,46 @@ def pls_handle_messages_chunk(self, messages_chunk: messages.MessagesChunk) -> t raise NotImplementedError @abc.abstractmethod - def pls_make_default_for_searching(self, specific_index: 'SpecificIndex'): + def pls_make_default_for_searching(self) -> None: raise NotImplementedError @abc.abstractmethod - def pls_get_default_for_searching(self) -> 'SpecificIndex': + def pls_get_default_for_searching(self) -> IndexStrategy: + raise NotImplementedError + + ### + # optional implementations + + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: raise NotImplementedError + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + raise NotImplementedError + + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__passthru (either implement it or don\'t use this strategy for that)') + # IndexStrategy.SpecificIndex must be implemented by subclasses # in their own `class SpecificIndex(IndexStrategy.SpecificIndex)` + @dataclasses.dataclass class SpecificIndex(abc.ABC): - def __init__(self, index_strategy, indexname): - if not indexname.startswith(index_strategy.indexname_prefix): - raise IndexStrategyError( - f'invalid indexname "{indexname}"!' 
- f' (expected to start with "{index_strategy.indexname_prefix}")' - ) - self.index_strategy = index_strategy - self.indexname = indexname - - def __eq__(self, other): - return ( - other.__class__ is self.__class__ - and other.index_strategy is self.index_strategy - and other.indexname == self.indexname - ) + index_strategy: IndexStrategy + subname: str # unique per index_strategy - def __repr__(self): - return ''.join(( - self.__class__.__qualname__, - f'(index_strategy={self.index_strategy}, ' - f'indexname={self.indexname})' - )) + @property + def is_current(self) -> bool: + return self.index_strategy.is_current + + @property + def has_valid_subname(self) -> bool: + return self.subname in self.index_strategy.index_subname_set() @property - def is_current(self): - return self.indexname == self.index_strategy.current_indexname - - def pls_setup(self, *, skip_backfill=False): - assert self.is_current, 'cannot setup a non-current index' - _preexisting_index_count = sum( - _index.pls_check_exists() - for _index in self.index_strategy.each_specific_index() + def full_index_name(self) -> str: + return indexnames.combine_indexname_parts( + *self.index_strategy.indexname_prefix_parts, + self.subname, ) - self.pls_create() - self.pls_start_keeping_live() - if skip_backfill: - _backfill = self.index_strategy.get_or_create_backfill() - _backfill.backfill_status = _backfill.COMPLETE - _backfill.save() - if not _preexisting_index_count: # first index for a strategy is automatic default - self.index_strategy.pls_make_default_for_searching(self) @abc.abstractmethod def pls_get_status(self) -> IndexStatus: @@ -215,16 +310,6 @@ def pls_start_keeping_live(self): def pls_stop_keeping_live(self): raise NotImplementedError - # optional for subclasses - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - raise NotImplementedError(f'{self.__class__.__name__} does not implement pls_handle_search__sharev2_backcompat (either implement it or don\'t use this strategy for backcompat)') - - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - raise NotImplementedError - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - raise NotImplementedError - def pls_get_mappings(self) -> dict: raise NotImplementedError diff --git a/share/search/index_strategy/_indexnames.py b/share/search/index_strategy/_indexnames.py new file mode 100644 index 000000000..bc9f1e149 --- /dev/null +++ b/share/search/index_strategy/_indexnames.py @@ -0,0 +1,21 @@ +from share.search.exceptions import IndexStrategyError + + +INDEXNAME_DELIM = '__' # used to separate indexnames into a list of meaningful values + + +def is_valid_indexname_part(indexname_part: str) -> bool: + return bool(INDEXNAME_DELIM not in indexname_part) + + +def raise_if_invalid_indexname_part(indexname_part: str) -> None: + if INDEXNAME_DELIM in indexname_part: + raise IndexStrategyError(f'name may not contain "{INDEXNAME_DELIM}" (got "{indexname_part}")') + + +def combine_indexname_parts(*indexname_parts: str) -> str: + return INDEXNAME_DELIM.join(filter(bool, indexname_parts)) + + +def parse_indexname_parts(name: str) -> list[str]: + return name.split(INDEXNAME_DELIM) diff --git a/share/search/index_strategy/elastic8.py b/share/search/index_strategy/elastic8.py index 546889e9f..751af06c0 100644 --- a/share/search/index_strategy/elastic8.py +++ b/share/search/index_strategy/elastic8.py @@ -1,9 +1,12 @@ from __future__ import annotations 
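# (aside: a hedged sketch of how full index names compose from the parts
# defined in _indexnames.py above -- combine_indexname_parts joins nonempty
# parts with '__', so the backcompat empty subname simply disappears; the
# 'cards' subname and checksum value below are hypothetical:
def _combine_indexname_parts(*parts):
    return '__'.join(filter(bool, parts))

assert _combine_indexname_parts('sharev2_elastic8', 'abcd1234', '') == 'sharev2_elastic8__abcd1234'
assert _combine_indexname_parts('trovesearch_denorm', 'abcd1234', 'cards') == 'trovesearch_denorm__abcd1234__cards'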
import abc import collections +from collections.abc import Mapping import dataclasses +import functools from http import HTTPStatus import logging +import types import typing from django.conf import settings @@ -15,6 +18,10 @@ from share.search import messages from share.search.index_strategy._util import timestamp_to_readable_datetime from share.util.checksum_iri import ChecksumIri +from ._indexnames import ( + parse_indexname_parts, + combine_indexname_parts, +) logger = logging.getLogger(__name__) @@ -23,61 +30,40 @@ class Elastic8IndexStrategy(IndexStrategy): '''abstract base class for index strategies using elasticsearch 8 ''' + index_definitions: typing.ClassVar[dict[str, IndexDefinition]] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - should_sniff = settings.ELASTICSEARCH['SNIFF'] - timeout = settings.ELASTICSEARCH['TIMEOUT'] - self.es8_client = elasticsearch8.Elasticsearch( - settings.ELASTICSEARCH8_URL, - # security: - ca_certs=settings.ELASTICSEARCH8_CERT_PATH, - basic_auth=( - (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET) - if settings.ELASTICSEARCH8_SECRET is not None - else None - ), - # retry: - retry_on_timeout=True, - request_timeout=timeout, - # sniffing: - sniff_on_start=should_sniff, - sniff_before_requests=should_sniff, - sniff_on_node_failure=should_sniff, - sniff_timeout=timeout, - min_delay_between_sniffing=timeout, - ) + ### + # for use when defining abstract methods in subclasses + + @dataclasses.dataclass(frozen=True) + class IndexDefinition: + mappings: dict + settings: dict + + @dataclasses.dataclass + class MessageActionSet: + message_target_id: int + actions_by_subname: dict[str, typing.Iterable[dict]] ### # abstract methods for subclasses to implement + @classmethod @abc.abstractmethod - def index_settings(self): - raise NotImplementedError - - @abc.abstractmethod - def index_mappings(self): + def define_current_indexes(cls) -> dict[str, IndexDefinition]: raise NotImplementedError @abc.abstractmethod def build_elastic_actions( self, messages_chunk: messages.MessagesChunk, - ) -> typing.Iterable[tuple[int, dict | typing.Iterable[dict]]]: - # yield (message_target_id, [elastic_action, ...]) pairs + ) -> typing.Iterable[MessageActionSet]: raise NotImplementedError - def before_chunk( - self, - messages_chunk: messages.MessagesChunk, - indexnames: typing.Iterable[str], - ) -> None: - ... # implement when needed - def after_chunk( self, messages_chunk: messages.MessagesChunk, - indexnames: typing.Iterable[str], + affected_indexnames: typing.Iterable[str], ) -> None: ... 
# implement when needed @@ -108,47 +94,100 @@ def build_update_action(self, doc_id, doc_source): # implementation for subclasses to ignore # abstract method from IndexStrategy - def compute_strategy_checksum(self): + @classmethod + def compute_strategy_checksum(cls): + _current_json = { + _subname: dataclasses.asdict(_def) + for _subname, _def in cls.current_index_defs().items() + } + if set(_current_json.keys()) == {''}: + _current_json = _current_json[''] return ChecksumIri.digest_json( checksumalgorithm_name='sha-256', - salt=self.__class__.__name__, - raw_json={ - 'settings': self.index_settings(), - 'mappings': self.index_mappings(), - }, + salt=cls.__name__, + raw_json=_current_json, + ) + + # abstract method from IndexStrategy + @classmethod + def each_index_subname(self) -> typing.Iterable[str]: + yield from self.current_index_defs().keys() + + @classmethod + @functools.cache + def current_index_defs(cls) -> Mapping[str, IndexDefinition]: + # readonly and cached per class + return types.MappingProxyType(cls.define_current_indexes()) + + @classmethod + @functools.cache + def _get_elastic8_client(cls) -> elasticsearch8.Elasticsearch: + should_sniff = settings.ELASTICSEARCH['SNIFF'] + timeout = settings.ELASTICSEARCH['TIMEOUT'] + return elasticsearch8.Elasticsearch( + settings.ELASTICSEARCH8_URL, + # security: + ca_certs=settings.ELASTICSEARCH8_CERT_PATH, + basic_auth=( + (settings.ELASTICSEARCH8_USERNAME, settings.ELASTICSEARCH8_SECRET) + if settings.ELASTICSEARCH8_SECRET is not None + else None + ), + # retry: + retry_on_timeout=True, + request_timeout=timeout, + # sniffing: + sniff_on_start=should_sniff, + sniff_before_requests=should_sniff, + sniff_on_node_failure=should_sniff, + sniff_timeout=timeout, + min_delay_between_sniffing=timeout, ) + @property + def es8_client(self): + return self._get_elastic8_client() # cached classmethod for shared client + # abstract method from IndexStrategy - def each_specific_index(self): + def each_existing_index(self, *, any_strategy_check: bool = False): + _index_wildcard = ( + combine_indexname_parts(self.strategy_name, '*') + if any_strategy_check + else self.indexname_wildcard + ) indexname_set = set( self.es8_client.indices - .get(index=self.indexname_wildcard, features=',') + .get(index=_index_wildcard, features=',') .keys() ) - indexname_set.add(self.current_indexname) for indexname in indexname_set: - yield self.for_specific_index(indexname) + _index = self.parse_full_index_name(indexname) + assert _index.index_strategy.strategy_name == self.strategy_name + yield _index + + def each_live_index(self, *, any_strategy_check: bool = False): + for _indexname in self._get_indexnames_for_alias(self._alias_for_keeping_live): + _index = self.parse_full_index_name(_indexname) + if any_strategy_check or (_index.index_strategy == self): + yield _index # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): self.assert_message_type(messages_chunk.message_type) - if messages_chunk.message_type.is_backfill: - _indexnames = {self.current_indexname} - else: - _indexnames = self._get_indexnames_for_alias(self._alias_for_keeping_live) - self.before_chunk(messages_chunk, _indexnames) _action_tracker = _ActionTracker() _bulk_stream = streaming_bulk( self.es8_client, - self._elastic_actions_with_index(messages_chunk, _indexnames, _action_tracker), + self._elastic_actions_with_index(messages_chunk, _action_tracker), raise_on_error=False, max_retries=settings.ELASTICSEARCH['MAX_RETRIES'], ) + _affected_indexnames: set[str] = 
set() for (_ok, _response) in _bulk_stream: (_op_type, _response_body) = next(iter(_response.items())) _status = _response_body.get('status') _docid = _response_body['_id'] _indexname = _response_body['_index'] + _affected_indexnames.add(_indexname) _is_done = _ok or (_op_type == 'delete' and _status == 404) if _is_done: _finished_message_id = _action_tracker.action_done(_indexname, _docid) @@ -178,50 +217,93 @@ def pls_handle_messages_chunk(self, messages_chunk): status_code=HTTPStatus.OK.value, error_text=None, ) - self.after_chunk(messages_chunk, _indexnames) + self.after_chunk(messages_chunk, _affected_indexnames) # abstract method from IndexStrategy - def pls_make_default_for_searching(self, specific_index: IndexStrategy.SpecificIndex): + def pls_make_default_for_searching(self): self._set_indexnames_for_alias( self._alias_for_searching, - {specific_index.indexname}, + {self.indexname_wildcard}, ) # abstract method from IndexStrategy - def pls_get_default_for_searching(self) -> IndexStrategy.SpecificIndex: - # a SpecificIndex for an alias will work fine for searching, but - # will error if you try to invoke lifecycle hooks - return self.for_specific_index(self._alias_for_searching) + def pls_get_default_for_searching(self) -> IndexStrategy: + _searchnames = self._get_indexnames_for_alias(self._alias_for_searching) + try: + (_indexname, *_) = _searchnames + except ValueError: + return self # no default set, this one's fine + (_strategyname, _strategycheck, *_) = parse_indexname_parts(_indexname) + assert _strategyname == self.strategy_name + _strategycheck = _strategycheck.rstrip('*') # may be a wildcard alias + return self.with_strategy_check(_strategycheck) + + # abstract method from IndexStrategy + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + _queryparams = request_queryparams or {} + _requested_strategy = _queryparams.pop('indexStrategy', '') + _indexname = self.indexname_wildcard + if _requested_strategy and _requested_strategy.startswith(self.indexname_prefix): + _index = self.parse_full_index_name(_requested_strategy) + if _index.has_valid_subname: + _indexname = _index.full_index_name + return self.es8_client.search( + index=_indexname, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) # override from IndexStrategy - def pls_mark_backfill_complete(self): - super().pls_mark_backfill_complete() - # explicit refresh after bulk operation - self.for_current_index().pls_refresh() + def pls_refresh(self): + super().pls_refresh() # refreshes each index + logger.debug('%s: Waiting for yellow status', self.strategy_name) + self.es8_client.cluster.health(wait_for_status='yellow') @property def _alias_for_searching(self): - return f'{self.indexname_prefix}search' + return combine_indexname_parts(self.strategy_name, 'search') @property def _alias_for_keeping_live(self): - return f'{self.indexname_prefix}live' - - def _elastic_actions_with_index(self, messages_chunk, indexnames, action_tracker: _ActionTracker): - if not indexnames: - raise ValueError('cannot index to no indexes') - for _message_target_id, _elastic_actions in self.build_elastic_actions(messages_chunk): - if isinstance(_elastic_actions, dict): # allow a single action - _elastic_actions = [_elastic_actions] - for _elastic_action in _elastic_actions: - _docid = _elastic_action['_id'] - for _indexname in indexnames: - action_tracker.add_action(_message_target_id, _indexname, _docid) - yield { - **_elastic_action, - 
'_index': _indexname, - } - action_tracker.done_scheduling(_message_target_id) + return combine_indexname_parts(self.strategy_name, 'live') + + def _elastic_actions_with_index( + self, + messages_chunk: messages.MessagesChunk, + action_tracker: _ActionTracker, + ): + for _actionset in self.build_elastic_actions(messages_chunk): + for _index_subname, _elastic_actions in _actionset.actions_by_subname.items(): + _indexnames = self._get_indexnames_for_action( + index_subname=_index_subname, + is_backfill_action=messages_chunk.message_type.is_backfill, + ) + for _elastic_action in _elastic_actions: + _docid = _elastic_action['_id'] + for _indexname in _indexnames: + action_tracker.add_action(_actionset.message_target_id, _indexname, _docid) + yield { + **_elastic_action, + '_index': _indexname, + } + action_tracker.done_scheduling(_actionset.message_target_id) + + def _get_indexnames_for_action( + self, + index_subname: str, + *, + is_backfill_action: bool = False, + ) -> set[str]: + if is_backfill_action: + return {self.get_index(index_subname).full_index_name} + return { + _index.full_index_name + for _index in self.each_live_index() + if _index.subname == index_subname + } def _get_indexnames_for_alias(self, alias_name) -> set[str]: try: @@ -260,14 +342,20 @@ def _set_indexnames_for_alias(self, alias_name, indexnames): ), ]) + @dataclasses.dataclass class SpecificIndex(IndexStrategy.SpecificIndex): + index_strategy: Elastic8IndexStrategy # note: narrower type + + @property + def index_def(self) -> Elastic8IndexStrategy.IndexDefinition: + return self.index_strategy.current_index_defs()[self.subname] # abstract method from IndexStrategy.SpecificIndex def pls_get_status(self) -> IndexStatus: if not self.pls_check_exists(): return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_subname=self.subname, + specific_indexname=self.full_index_name, is_kept_live=False, is_default_for_searching=False, doc_count=0, @@ -275,8 +363,8 @@ def pls_get_status(self) -> IndexStatus: ) index_info = ( self.index_strategy.es8_client.indices - .get(index=self.indexname, features='aliases,settings') - [self.indexname] + .get(index=self.full_index_name, features='aliases,settings') + [self.full_index_name] ) index_aliases = set(index_info['aliases'].keys()) creation_date = timestamp_to_readable_datetime( @@ -284,12 +372,12 @@ def pls_get_status(self) -> IndexStatus: ) doc_count = ( self.index_strategy.es8_client.indices - .stats(index=self.indexname, metric='docs') - ['indices'][self.indexname]['primaries']['docs']['count'] + .stats(index=self.full_index_name, metric='docs') + ['indices'][self.full_index_name]['primaries']['docs']['count'] ) return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_subname=self.subname, + specific_indexname=self.full_index_name, is_kept_live=( self.index_strategy._alias_for_keeping_live in index_aliases @@ -304,62 +392,64 @@ def pls_get_status(self) -> IndexStatus: # abstract method from IndexStrategy.SpecificIndex def pls_check_exists(self): - indexname = self.indexname - logger.info(f'{self.__class__.__name__}: checking for index {indexname}') - return bool( + _indexname = self.full_index_name + _result = bool( self.index_strategy.es8_client.indices - .exists(index=indexname) + .exists(index=_indexname) + ) + logger.info( + f'{_indexname}: exists' + if _result + else f'{_indexname}: does not exist' ) + return _result # abstract method from IndexStrategy.SpecificIndex def 
pls_create(self): assert self.is_current, ( 'cannot create a non-current version of an index!' - ' maybe try `index_strategy.for_current_index()`?' ) - index_to_create = self.indexname + index_to_create = self.full_index_name logger.debug('Ensuring index %s', index_to_create) index_exists = ( self.index_strategy.es8_client.indices .exists(index=index_to_create) ) if not index_exists: - logger.warning('Creating index %s', index_to_create) + logger.info('Creating index %s', index_to_create) + _index_def = self.index_def ( self.index_strategy.es8_client.indices .create( index=index_to_create, - settings=self.index_strategy.index_settings(), - mappings=self.index_strategy.index_mappings(), + settings=_index_def.settings, + mappings=_index_def.mappings, ) ) self.pls_refresh() # abstract method from IndexStrategy.SpecificIndex def pls_refresh(self): + _indexname = self.full_index_name ( self.index_strategy.es8_client.indices - .refresh(index=self.indexname) - ) - logger.debug('%r: Waiting for yellow status', self) - ( - self.index_strategy.es8_client.cluster - .health(wait_for_status='yellow') + .refresh(index=_indexname) ) - logger.info('%r: Refreshed', self) + logger.info('%s: Refreshed', _indexname) # abstract method from IndexStrategy.SpecificIndex def pls_delete(self): + _indexname = self.full_index_name ( self.index_strategy.es8_client.indices - .delete(index=self.indexname, ignore=[400, 404]) + .delete(index=_indexname, ignore=[400, 404]) ) - logger.warning('%r: deleted', self) + logger.warning('%s: deleted', _indexname) # abstract method from IndexStrategy.SpecificIndex def pls_start_keeping_live(self): self.index_strategy._add_indexname_to_alias( - indexname=self.indexname, + indexname=self.full_index_name, alias_name=self.index_strategy._alias_for_keeping_live, ) logger.info('%r: now kept live', self) @@ -367,13 +457,13 @@ def pls_start_keeping_live(self): # abstract method from IndexStrategy.SpecificIndex def pls_stop_keeping_live(self): self.index_strategy._remove_indexname_from_alias( - indexname=self.indexname, + indexname=self.full_index_name, alias_name=self.index_strategy._alias_for_keeping_live, ) logger.warning('%r: no longer kept live', self) def pls_get_mappings(self): - return self.index_strategy.es8_client.indices.get_mapping(index=self.indexname).body + return self.index_strategy.es8_client.indices.get_mapping(index=self.full_index_name).body @dataclasses.dataclass diff --git a/share/search/index_strategy/sharev2_elastic5.py b/share/search/index_strategy/sharev2_elastic5.py index 13edb4881..8e775569c 100644 --- a/share/search/index_strategy/sharev2_elastic5.py +++ b/share/search/index_strategy/sharev2_elastic5.py @@ -1,3 +1,5 @@ +from __future__ import annotations +import functools import json import logging @@ -35,10 +37,11 @@ class Sharev2Elastic5IndexStrategy(IndexStrategy): # perpetuated optimizations from times long past MAX_CHUNK_BYTES = 10 * 1024 ** 2 # 10 megs - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + @classmethod + @functools.cache + def _get_elastic5_client(cls) -> elasticsearch5.Elasticsearch: should_sniff = settings.ELASTICSEARCH['SNIFF'] - self.es5_client = elasticsearch5.Elasticsearch( + return elasticsearch5.Elasticsearch( settings.ELASTICSEARCH5_URL, retry_on_timeout=True, timeout=settings.ELASTICSEARCH['TIMEOUT'], @@ -50,6 +53,19 @@ def __init__(self, *args, **kwargs): sniffer_timeout=60 if should_sniff else None, ) + @property + def es5_client(self): + return self._get_elastic5_client() # cached classmethod for shared 
client + + @property + def single_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index(self.STATIC_INDEXNAME) + + # abstract method from IndexStrategy + @classmethod + def each_index_subname(self): + yield self.STATIC_INDEXNAME + # override IndexStrategy @property def nonurgent_messagequeue_name(self): @@ -65,11 +81,6 @@ def urgent_messagequeue_name(self): def indexname_prefix(self): return self.STATIC_INDEXNAME - # override IndexStrategy - @property - def current_indexname(self): - return self.STATIC_INDEXNAME - # abstract method from IndexStrategy def compute_strategy_checksum(self): return ChecksumIri.digest_json( @@ -83,17 +94,26 @@ def compute_strategy_checksum(self): ) # abstract method from IndexStrategy - def pls_make_default_for_searching(self, specific_index): - assert specific_index.index_strategy is self - assert specific_index.indexname == self.STATIC_INDEXNAME + def pls_make_default_for_searching(self): + pass # the one index is the only one # abstract method from IndexStrategy def pls_get_default_for_searching(self): - return self.for_specific_index(self.STATIC_INDEXNAME) + return self + + # abstract method from IndexStrategy + def each_existing_index(self, *args, **kwargs): + _index = self.single_index + if _index.pls_check_exists(): + yield _index # abstract method from IndexStrategy - def each_specific_index(self): - yield self.for_specific_index(self.STATIC_INDEXNAME) + def each_live_index(self, *args, **kwargs): + yield self.single_index + + # abstract method from IndexStrategy + def each_subnamed_index(self): + yield self.single_index # abstract method from IndexStrategy def pls_handle_messages_chunk(self, messages_chunk): @@ -315,14 +335,21 @@ def _build_elastic_actions(self, messages_chunk): yield action class SpecificIndex(IndexStrategy.SpecificIndex): + index_strategy: Sharev2Elastic5IndexStrategy # narrow type + + # override IndexStrategy.SpecificIndex + @property + def full_index_name(self): + return self.index_strategy.STATIC_INDEXNAME + # abstract method from IndexStrategy.SpecificIndex def pls_create(self): # check index exists (if not, create) - logger.debug('Ensuring index %s', self.indexname) + logger.debug('Ensuring index %s', self.full_index_name) indices_api = self.index_strategy.es5_client.indices - if not indices_api.exists(index=self.indexname): + if not indices_api.exists(index=self.full_index_name): indices_api.create( - self.indexname, + self.full_index_name, body={ 'settings': self.index_strategy._index_settings(), 'mappings': self.index_strategy._index_mappings(), @@ -334,7 +361,7 @@ def pls_create(self): self.index_strategy.es5_client.cluster .health(wait_for_status='yellow') ) - logger.info('Finished setting up Elasticsearch index %s', self.indexname) + logger.info('Finished setting up Elasticsearch index %s', self.full_index_name) # abstract method from IndexStrategy.SpecificIndex def pls_start_keeping_live(self): @@ -344,7 +371,7 @@ def pls_start_keeping_live(self): def pls_stop_keeping_live(self): raise exceptions.IndexStrategyError( f'{self.__class__.__qualname__} is implemented for only one index, ' - f'"{self.indexname}", which is always kept live (until elasticsearch5 ' + f'"{self.full_index_name}", which is always kept live (until elasticsearch5 ' 'support is dropped)' ) @@ -352,23 +379,23 @@ def pls_stop_keeping_live(self): def pls_refresh(self): ( self.index_strategy.es5_client.indices - .refresh(index=self.indexname) + .refresh(index=self.full_index_name) ) - logger.info('Refreshed index %s', self.indexname) + 
logger.info('Refreshed index %s', self.full_index_name) # abstract method from IndexStrategy.SpecificIndex def pls_delete(self): - logger.warning(f'{self.__class__.__name__}: deleting index {self.indexname}') + logger.warning(f'{self.__class__.__name__}: deleting index {self.full_index_name}') ( self.index_strategy.es5_client.indices - .delete(index=self.indexname, ignore=[400, 404]) + .delete(index=self.full_index_name, ignore=[400, 404]) ) # abstract method from IndexStrategy.SpecificIndex def pls_check_exists(self): return bool( self.index_strategy.es5_client.indices - .exists(index=self.indexname) + .exists(index=self.full_index_name) ) # abstract method from IndexStrategy.SpecificIndex @@ -376,27 +403,27 @@ def pls_get_status(self) -> IndexStatus: try: stats = ( self.index_strategy.es5_client.indices - .stats(index=self.indexname, metric='docs') + .stats(index=self.full_index_name, metric='docs') ) existing_indexes = ( self.index_strategy.es5_client.indices - .get_settings(index=self.indexname, name='index.creation_date') + .get_settings(index=self.full_index_name, name='index.creation_date') ) - index_settings = existing_indexes[self.indexname] - index_stats = stats['indices'][self.indexname] + index_settings = existing_indexes[self.full_index_name] + index_stats = stats['indices'][self.full_index_name] except (KeyError, elasticsearch5.exceptions.NotFoundError): # not yet created return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_subname=self.subname, + specific_indexname=self.full_index_name, is_kept_live=False, is_default_for_searching=False, - creation_date=None, + creation_date='', doc_count=0, ) return IndexStatus( - index_strategy_name=self.index_strategy.name, - specific_indexname=self.indexname, + index_subname=self.subname, + specific_indexname=self.full_index_name, is_kept_live=True, is_default_for_searching=True, creation_date=timestamp_to_readable_datetime( @@ -406,12 +433,12 @@ def pls_get_status(self) -> IndexStatus: ) # optional method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: '''the definitive sharev2-search api: passthru to elasticsearch version 5 ''' try: return self.index_strategy.es5_client.search( - index=self.indexname, + index=self.full_index_name, body=request_body or {}, params=request_queryparams or {}, ) diff --git a/share/search/index_strategy/sharev2_elastic8.py b/share/search/index_strategy/sharev2_elastic8.py index 016503d96..6de96a668 100644 --- a/share/search/index_strategy/sharev2_elastic8.py +++ b/share/search/index_strategy/sharev2_elastic8.py @@ -43,7 +43,17 @@ def backfill_message_type(self): return messages.MessageType.BACKFILL_SUID # abstract method from Elastic8IndexStrategy - def index_settings(self): + @classmethod + def define_current_indexes(cls): + return { # empty index subname, for backcompat + '': cls.IndexDefinition( + mappings=cls.index_mappings(), + settings=cls.index_settings(), + ), + } + + @classmethod + def index_settings(cls): return { 'analysis': { 'analyzer': { @@ -78,8 +88,8 @@ def index_settings(self): } } - # abstract method from Elastic8IndexStrategy - def index_mappings(self): + @classmethod + def index_mappings(cls): exact_field = { 'exact': { 'type': 'keyword', @@ -129,15 +139,19 @@ def index_mappings(self): # abstract method from Elastic8IndexStrategy def 
build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + def _make_actionset(suid_id, *actions): + return self.MessageActionSet(suid_id, {'': actions}) + _suid_ids = set(messages_chunk.target_ids_chunk) for _suid_id, _serialized_doc in self._load_docs(_suid_ids): _source_doc = json.loads(_serialized_doc) _doc_id = _source_doc['id'] _suid_ids.discard(_suid_id) - if _source_doc.pop('is_deleted', False): - yield _suid_id, self.build_delete_action(_doc_id) - else: - yield _suid_id, self.build_index_action(_doc_id, _source_doc) + yield _make_actionset(_suid_id, ( + self.build_delete_action(_doc_id) + if _source_doc.pop('is_deleted', False) + else self.build_index_action(_doc_id, _source_doc) + )) # delete any leftovers for _leftover_suid in SourceUniqueIdentifier.objects.filter(id__in=_suid_ids): _suid_ids.discard(_leftover_suid.id) @@ -145,10 +159,14 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _leftover_suid_id = _leftover_suid.get_backcompat_sharev2_suid().id except SourceUniqueIdentifier.DoesNotExist: _leftover_suid_id = _leftover_suid.id - yield _leftover_suid.id, self.build_delete_action(self._get_doc_id(_leftover_suid_id)) + yield _make_actionset(_leftover_suid_id, self.build_delete_action( + self._get_doc_id(_leftover_suid_id), + )) # these ones don't even exist! for _leftover_suid_id in _suid_ids: - yield _leftover_suid_id, self.build_delete_action(self._get_doc_id(_leftover_suid_id)) + yield _make_actionset(_leftover_suid_id, self.build_delete_action( + self._get_doc_id(_leftover_suid_id), + )) def _get_doc_id(self, suid_id: int): return IDObfuscator.encode_id(suid_id, SourceUniqueIdentifier) @@ -171,22 +189,21 @@ def _load_docs(self, suid_ids) -> typing.Iterable[tuple[int, str]]: for _record in _record_qs: yield (_record.suid_id, _record.formatted_metadata) - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): - # optional method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - try: - json_response = self.index_strategy.es8_client.search( - index=self.indexname, - body=(request_body or {}), - params=(request_queryparams or {}), - track_total_hits=True, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - try: # mangle response for some limited backcompat with elasticsearch5 - es8_total = json_response['hits']['total'] - json_response['hits']['total'] = es8_total['value'] - json_response['hits']['_total'] = es8_total - except KeyError: - pass - return json_response + # optional method from IndexStrategy + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + try: + json_response = self.es8_client.search( + index=self.get_index('').full_index_name, + body=(request_body or {}), + params=(request_queryparams or {}), + track_total_hits=True, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + try: # mangle response for some limited backcompat with elasticsearch5 + es8_total = json_response['hits']['total'] + json_response['hits']['total'] = es8_total['value'] + json_response['hits']['_total'] = es8_total + except KeyError: + pass + return json_response diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index dceb272df..49874d189 100644 --- 
a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -14,6 +14,7 @@ from share.search import exceptions from share.search import messages +from share.search.index_strategy._base import IndexStrategy from share.search.index_strategy.elastic8 import Elastic8IndexStrategy from share.util.checksum_iri import ChecksumIri from trove import models as trove_db @@ -72,10 +73,21 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_INDEXCARD - def index_settings(self): + @classmethod + def define_current_indexes(cls): + return { # empty index subname, for backcompat + '': cls.IndexDefinition( + mappings=cls.index_mappings(), + settings=cls.index_settings(), + ), + } + + @classmethod + def index_settings(cls): return {} - def index_mappings(self): + @classmethod + def index_mappings(cls): _capped_keyword = { 'type': 'keyword', 'ignore_above': KEYWORD_LENGTH_MAX, @@ -155,6 +167,11 @@ def index_mappings(self): }, } + @property + def __index(self) -> IndexStrategy.SpecificIndex: + # this is a single-index strategy -- for back-compat, that index has empty subname + return self.get_index('') + def _build_sourcedoc(self, indexcard_rdf): _rdfdoc = indexcard_rdf.as_rdfdoc_with_supplements() if _should_skip_card(indexcard_rdf, _rdfdoc): @@ -254,6 +271,8 @@ def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]) } def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + def _make_actionset(indexcard_id, *actions): + return self.MessageActionSet(indexcard_id, {'': actions}) _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) for _indexcard_rdf in _indexcard_rdf_qs: @@ -267,563 +286,562 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): doc_source=_sourcedoc, ) _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _indexcard_rdf.indexcard_id, _index_action + yield _make_actionset(_indexcard_rdf.indexcard_id, _index_action) # delete any that don't have "latest" rdf and derived osfmap_json _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) for _indexcard in _leftovers: - yield _indexcard.id, self.build_delete_action(_indexcard.get_iri()) - - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - return self.index_strategy.es8_client.search( - index=self.indexname, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) + yield _make_actionset(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) + + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + return self.es8_client.search( + index=self.__index.full_index_name, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - _cursor = self._cardsearch_cursor(cardsearch_params) - _sort = self._cardsearch_sort(cardsearch_params.sort_list) - _query = self._cardsearch_query( - cardsearch_params.cardsearch_filter_set, - cardsearch_params.cardsearch_textsegment_set, - cardsearch_cursor=_cursor, - ) - _from_offset = ( - _cursor.start_offset - if _cursor.is_first_page() or not isinstance(_cursor, 
ReproduciblyRandomSampleCursor) - else _cursor.start_offset - len(_cursor.first_page_ids) - ) - _search_kwargs = dict( - query=_query, - aggs=self._cardsearch_aggs(cardsearch_params), - sort=_sort, - from_=_from_offset, - size=_cursor.bounded_page_size, - source=False, # no need to get _source; _id is enough - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) - _search_kwargs = dict( - query=self._cardsearch_query( - valuesearch_params.cardsearch_filter_set, - valuesearch_params.cardsearch_textsegment_set, - additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - )}}], - ), - size=0, # ignore cardsearch hits; just want the aggs - aggs=( - self._valuesearch_date_aggs(valuesearch_params) - if _is_date_search - else self._valuesearch_iri_aggs(valuesearch_params, _cursor) - ), + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: + _cursor = self._cardsearch_cursor(cardsearch_params) + _sort = self._cardsearch_sort(cardsearch_params.sort_list) + _query = self._cardsearch_query( + cardsearch_params.cardsearch_filter_set, + cardsearch_params.cardsearch_textsegment_set, + cardsearch_cursor=_cursor, + ) + _from_offset = ( + _cursor.start_offset + if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) + else _cursor.start_offset - len(_cursor.first_page_ids) + ) + _search_kwargs = dict( + query=_query, + aggs=self._cardsearch_aggs(cardsearch_params), + sort=_sort, + from_=_from_offset, + size=_cursor.bounded_page_size, + source=False, # no need to get _source; _id is enough + ) + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.__index.full_index_name, + **_search_kwargs, ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) - - ### - # query implementation - - def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: - _request_cursor = cardsearch_params.page_cursor - if ( - _request_cursor.is_basic() - and not cardsearch_params.sort_list - and not cardsearch_params.cardsearch_textsegment_set - ): - return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) - return OffsetCursor.from_cursor(_request_cursor) - - def _cardsearch_query( - self, - filter_set, textsegment_set, *, - additional_filters=None, - cardsearch_cursor: PageCursor | None = None, - ) -> dict: - _bool_query = { - 'filter': additional_filters or [], - 'must': [], - 'must_not': [], - 'should': [], - } - for _searchfilter in filter_set: - if _searchfilter.operator == 
SearchFilter.FilterOperator.NONE_OF: - _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: - _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: - _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: - _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator.is_date_operator(): - _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) - else: - raise ValueError(f'unknown filter operator {_searchfilter.operator}') - _textq_builder = self._NestedTextQueryBuilder( - relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) + + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) + _search_kwargs = dict( + query=self._cardsearch_query( + valuesearch_params.cardsearch_filter_set, + valuesearch_params.cardsearch_textsegment_set, + additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + )}}], + ), + size=0, # ignore cardsearch hits; just want the aggs + aggs=( + self._valuesearch_date_aggs(valuesearch_params) + if _is_date_search + else self._valuesearch_iri_aggs(valuesearch_params, _cursor) + ), + ) + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.__index.full_index_name, + **_search_kwargs, ) - for _textsegment in textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _bool_query[_boolkey].extend(_textqueries) - if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): - # no need for randomness - return {'bool': _bool_query} - if not cardsearch_cursor.first_page_ids: - # independent random sample - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': {}, # default random_score is fast and unpredictable - }, - } - _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} - if cardsearch_cursor.is_first_page(): - # returning to a first page previously visited - _bool_query['filter'].append(_firstpage_uuid_query) - return {'bool': _bool_query} - # get a subsequent page using reproducible randomness - _bool_query['must_not'].append(_firstpage_uuid_query) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) + + ### + # query implementation + + def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: + _request_cursor = cardsearch_params.page_cursor + if ( + _request_cursor.is_basic() + and not cardsearch_params.sort_list + and not cardsearch_params.cardsearch_textsegment_set + ): + return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) + return 
OffsetCursor.from_cursor(_request_cursor) + + def _cardsearch_query( + self, + filter_set, textsegment_set, *, + additional_filters=None, + cardsearch_cursor: PageCursor | None = None, + ) -> dict: + _bool_query = { + 'filter': additional_filters or [], + 'must': [], + 'must_not': [], + 'should': [], + } + for _searchfilter in filter_set: + if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: + _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: + _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: + _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: + _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator.is_date_operator(): + _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) + else: + raise ValueError(f'unknown filter operator {_searchfilter.operator}') + _textq_builder = self._NestedTextQueryBuilder( + relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), + ) + for _textsegment in textsegment_set: + for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): + _bool_query[_boolkey].extend(_textqueries) + if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): + # no need for randomness + return {'bool': _bool_query} + if not cardsearch_cursor.first_page_ids: + # independent random sample return { 'function_score': { 'query': {'bool': _bool_query}, 'boost_mode': 'replace', - 'random_score': { - 'seed': ''.join(cardsearch_cursor.first_page_ids), - 'field': 'indexcard_uuid', - }, + 'random_score': {}, # default random_score is fast and unpredictable }, } + _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} + if cardsearch_cursor.is_first_page(): + # returning to a first page previously visited + _bool_query['filter'].append(_firstpage_uuid_query) + return {'bool': _bool_query} + # get a subsequent page using reproducible randomness + _bool_query['must_not'].append(_firstpage_uuid_query) + return { + 'function_score': { + 'query': {'bool': _bool_query}, + 'boost_mode': 'replace', + 'random_score': { + 'seed': ''.join(cardsearch_cursor.first_page_ids), + 'field': 'indexcard_uuid', + }, + }, + } - def _cardsearch_aggs(self, cardsearch_params): - _aggs = {} - if cardsearch_params.related_property_paths: - _aggs['related_propertypath_usage'] = {'terms': { - 'field': 'iri_paths_present', - 'include': [ - iri_path_as_keyword(_path) - for _path in cardsearch_params.related_property_paths - ], - 'size': len(cardsearch_params.related_property_paths), - }} - return _aggs - - def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): - _nested_iri_bool: dict[str, Any] = { - 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - )}}], - 'must': [], - 'must_not': [], - 'should': [], - } - _nested_terms_agg = { - 'field': 'nested_iri.iri_value', - # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_offset + cursor.bounded_page_size + 1, - } - _iris = list(valuesearch_params.valuesearch_iris()) - if _iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.iri_value': 
_iris, - }}) - _nested_terms_agg['size'] = len(_iris) - _nested_terms_agg['include'] = _iris - _type_iris = list(valuesearch_params.valuesearch_type_iris()) - if _type_iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.value_type_iri': _type_iris, - }}) - _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') - for _textsegment in valuesearch_params.valuesearch_textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _nested_iri_bool[_boolkey].extend(_textqueries) - return { - 'in_nested_iri': { - 'nested': {'path': 'nested_iri'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'bool': _nested_iri_bool}, - 'aggs': { - 'iri_values': { - 'terms': _nested_terms_agg, - 'aggs': { - 'type_iri': {'terms': { - 'field': 'nested_iri.value_type_iri', - }}, - 'name_text': {'terms': { - 'field': 'nested_iri.value_name_text.raw', - }}, - 'title_text': {'terms': { - 'field': 'nested_iri.value_title_text.raw', - }}, - 'label_text': {'terms': { - 'field': 'nested_iri.value_label_text.raw', - }}, - }, + def _cardsearch_aggs(self, cardsearch_params): + _aggs = {} + if cardsearch_params.related_property_paths: + _aggs['related_propertypath_usage'] = {'terms': { + 'field': 'iri_paths_present', + 'include': [ + iri_path_as_keyword(_path) + for _path in cardsearch_params.related_property_paths + ], + 'size': len(cardsearch_params.related_property_paths), + }} + return _aggs + + def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): + _nested_iri_bool: dict[str, Any] = { + 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + suffuniq=True, + )}}], + 'must': [], + 'must_not': [], + 'should': [], + } + _nested_terms_agg = { + 'field': 'nested_iri.iri_value', + # WARNING: terribly inefficient pagination (part one) + 'size': cursor.start_offset + cursor.bounded_page_size + 1, + } + _iris = list(valuesearch_params.valuesearch_iris()) + if _iris: + _nested_iri_bool['filter'].append({'terms': { + 'nested_iri.iri_value': _iris, + }}) + _nested_terms_agg['size'] = len(_iris) + _nested_terms_agg['include'] = _iris + _type_iris = list(valuesearch_params.valuesearch_type_iris()) + if _type_iris: + _nested_iri_bool['filter'].append({'terms': { + 'nested_iri.value_type_iri': _type_iris, + }}) + _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') + for _textsegment in valuesearch_params.valuesearch_textsegment_set: + for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): + _nested_iri_bool[_boolkey].extend(_textqueries) + return { + 'in_nested_iri': { + 'nested': {'path': 'nested_iri'}, + 'aggs': { + 'value_at_propertypath': { + 'filter': {'bool': _nested_iri_bool}, + 'aggs': { + 'iri_values': { + 'terms': _nested_terms_agg, + 'aggs': { + 'type_iri': {'terms': { + 'field': 'nested_iri.value_type_iri', + }}, + 'name_text': {'terms': { + 'field': 'nested_iri.value_name_text.raw', + }}, + 'title_text': {'terms': { + 'field': 'nested_iri.value_title_text.raw', + }}, + 'label_text': {'terms': { + 'field': 'nested_iri.value_label_text.raw', + }}, }, }, }, }, }, - } + }, + } - def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): - _aggs = { - 'in_nested_date': { - 'nested': {'path': 'nested_date'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - 
valuesearch_params.valuesearch_propertypath, - suffuniq=True, - ), - }}, - 'aggs': { - 'count_by_year': { - 'date_histogram': { - 'field': 'nested_date.date_value', - 'calendar_interval': 'year', - 'format': 'yyyy', - 'order': {'_key': 'desc'}, - 'min_doc_count': 1, - }, + def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): + _aggs = { + 'in_nested_date': { + 'nested': {'path': 'nested_date'}, + 'aggs': { + 'value_at_propertypath': { + 'filter': {'term': { + 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + suffuniq=True, + ), + }}, + 'aggs': { + 'count_by_year': { + 'date_histogram': { + 'field': 'nested_date.date_value', + 'calendar_interval': 'year', + 'format': 'yyyy', + 'order': {'_key': 'desc'}, + 'min_doc_count': 1, }, }, }, }, }, - } - return _aggs - - def _valuesearch_handle( - self, - valuesearch_params: ValuesearchParams, - es8_response: dict, - cursor: OffsetCursor, - ): - _iri_aggs = es8_response['aggregations'].get('in_nested_iri') - if _iri_aggs: - _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_offset + cursor.bounded_page_size - _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.total_count = ( - MANY_MORE - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchHandle( - cursor=cursor, - search_result_page=[ - self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - search_params=valuesearch_params, - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations']['in_nested_date'] - ['value_at_propertypath']['count_by_year']['buckets'] - ) - return ValuesearchHandle( - cursor=PageCursor(len(_year_buckets)), - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - search_params=valuesearch_params, - ) - - def _valuesearch_iri_result(self, iri_bucket): - return ValuesearchResult( - value_iri=iri_bucket['key'], - value_type=_bucketlist(iri_bucket['type_iri']), - name_text=_bucketlist(iri_bucket['name_text']), - title_text=_bucketlist(iri_bucket['title_text']), - label_text=_bucketlist(iri_bucket['label_text']), - match_count=iri_bucket['doc_count'], + }, + } + return _aggs + + def _valuesearch_handle( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + cursor: OffsetCursor, + ): + _iri_aggs = es8_response['aggregations'].get('in_nested_iri') + if _iri_aggs: + _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly inefficient pagination (part two) + _page_end_index = cursor.start_offset + cursor.bounded_page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages + cursor.total_count = ( + MANY_MORE + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count ) - - def _valuesearch_date_result(self, date_bucket): - return ValuesearchResult( - value_iri=None, - value_value=date_bucket['key_as_string'], - label_text=(date_bucket['key_as_string'],), - match_count=date_bucket['doc_count'], + return ValuesearchHandle( + cursor=cursor, + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + search_params=valuesearch_params, + ) + else: # assume date + 
_year_buckets = ( + es8_response['aggregations']['in_nested_date'] + ['value_at_propertypath']['count_by_year']['buckets'] + ) + return ValuesearchHandle( + cursor=PageCursor(len(_year_buckets)), + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + search_params=valuesearch_params, ) - def _cardsearch_presence_query(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_presence_query(_path) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} + def _valuesearch_iri_result(self, iri_bucket): + return ValuesearchResult( + value_iri=iri_bucket['key'], + value_type=_bucketlist(iri_bucket['type_iri']), + name_text=_bucketlist(iri_bucket['name_text']), + title_text=_bucketlist(iri_bucket['title_text']), + label_text=_bucketlist(iri_bucket['label_text']), + match_count=iri_bucket['doc_count'], + ) - def _cardsearch_path_presence_query(self, path: tuple[str, ...]): - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, - }} - return {'term': { - 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), - }} + def _valuesearch_date_result(self, date_bucket): + return ValuesearchResult( + value_iri=None, + value_value=date_bucket['key_as_string'], + label_text=(date_bucket['key_as_string'],), + match_count=date_bucket['doc_count'], + ) - def _cardsearch_iri_filter(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_iri_query(_path, search_filter.value_set) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, + def _cardsearch_presence_query(self, search_filter) -> dict: + _filters = [ + self._cardsearch_path_presence_query(_path) + for _path in search_filter.propertypath_set + ] + if len(_filters) == 1: + return _filters[0] + return {'bool': { + 'minimum_should_match': 1, + 'should': _filters, + }} + + def _cardsearch_path_presence_query(self, path: tuple[str, ...]): + if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): + return {'nested': { + 'path': 'nested_iri', + 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, }} + return {'term': { + 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), + }} - def _cardsearch_path_iri_query(self, path, value_set): - _suffuniq_values = [ - get_sufficiently_unique_iri(_iri) - for _iri in value_set - ] - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'bool': { - 'must': [ # both - {'term': {'nested_iri.distance_from_focus': len(path)}}, - {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, - ], - }}, - }} - # without a glob-path, can use the flattened keyword field - return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} + def _cardsearch_iri_filter(self, search_filter) -> dict: + _filters = [ + self._cardsearch_path_iri_query(_path, search_filter.value_set) + for _path in search_filter.propertypath_set + ] + if len(_filters) == 1: + return _filters[0] + return {'bool': { + 'minimum_should_match': 1, + 'should': _filters, + }} - def _cardsearch_date_filter(self, search_filter): + def _cardsearch_path_iri_query(self, path, value_set): + _suffuniq_values = [ + 
get_sufficiently_unique_iri(_iri) + for _iri in value_set + ] + if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): return {'nested': { - 'path': 'nested_date', - 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, + 'path': 'nested_iri', + 'query': {'bool': { + 'must': [ # both + {'term': {'nested_iri.distance_from_focus': len(path)}}, + {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, + ], + }}, }} + # without a glob-path, can use the flattened keyword field + return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} - def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: - # filter by requested paths - yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') - # filter by requested value/operator - if search_filter.operator == SearchFilter.FilterOperator.BEFORE: - _value = min(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'lt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AFTER: - _value = max(search_filter.value_set) # rely on string-comparable isoformat + def _cardsearch_date_filter(self, search_filter): + return {'nested': { + 'path': 'nested_date', + 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, + }} + + def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: + # filter by requested paths + yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') + # filter by requested value/operator + if search_filter.operator == SearchFilter.FilterOperator.BEFORE: + _value = min(search_filter.value_set) # rely on string-comparable isoformat + yield {'range': {'nested_date.date_value': { + 'lt': _daterange_value_and_format(_value) + }}} + elif search_filter.operator == SearchFilter.FilterOperator.AFTER: + _value = max(search_filter.value_set) # rely on string-comparable isoformat + yield {'range': {'nested_date.date_value': { + 'gt': _daterange_value_and_format(_value) + }}} + elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: + for _value in search_filter.value_set: + _filtervalue = _daterange_value_and_format(_value) yield {'range': {'nested_date.date_value': { - 'gt': _daterange_value_and_format(_value) + 'gte': _filtervalue, + 'lte': _filtervalue, }}} - elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: - for _value in search_filter.value_set: - _filtervalue = _daterange_value_and_format(_value) - yield {'range': {'nested_date.date_value': { - 'gte': _filtervalue, - 'lte': _filtervalue, - }}} - else: - raise ValueError(f'invalid date filter operator (got {search_filter.operator})') - - def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): - if not sort_list: - return None - return [ - {'nested_date.date_value': { - 'order': ('desc' if _sortparam.descending else 'asc'), - 'nested': { - 'path': 'nested_date', - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - _sortparam.propertypath, - suffuniq=True, - ), - }}, - }, - }} - for _sortparam in sort_list - ] + else: + raise ValueError(f'invalid date filter operator (got {search_filter.operator})') - def _cardsearch_handle( - self, - cardsearch_params: CardsearchParams, - es8_response: dict, - cursor: OffsetCursor, - ) -> CardsearchHandle: - _es8_total = es8_response['hits']['total'] - if _es8_total['relation'] != 'eq': - cursor.total_count = MANY_MORE - elif 
isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): - # account for the filtered-out first page - cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) - else: # exact (and small) count - cursor.total_count = _es8_total['value'] - _results = [] - for _es8_hit in es8_response['hits']['hits']: - _card_iri = _es8_hit['_id'] - _results.append(CardsearchResult( - card_iri=_card_iri, - text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), - )) - _relatedproperty_list: list[PropertypathUsage] = [] - if cardsearch_params.related_property_paths: - _relatedproperty_list.extend( - PropertypathUsage(property_path=_path, usage_count=0) - for _path in cardsearch_params.related_property_paths - ) - _relatedproperty_by_path = { - _result.property_path: _result - for _result in _relatedproperty_list - } - for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: - _path = tuple(json.loads(_bucket['key'])) - _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] - return CardsearchHandle( - cursor=cursor, - search_result_page=_results, - related_propertypath_results=_relatedproperty_list, - search_params=cardsearch_params, + def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): + if not sort_list: + return None + return [ + {'nested_date.date_value': { + 'order': ('desc' if _sortparam.descending else 'asc'), + 'nested': { + 'path': 'nested_date', + 'filter': {'term': { + 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( + _sortparam.propertypath, + suffuniq=True, + ), + }}, + }, + }} + for _sortparam in sort_list + ] + + def _cardsearch_handle( + self, + cardsearch_params: CardsearchParams, + es8_response: dict, + cursor: OffsetCursor, + ) -> CardsearchHandle: + _es8_total = es8_response['hits']['total'] + if _es8_total['relation'] != 'eq': + cursor.total_count = MANY_MORE + elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): + # account for the filtered-out first page + cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) + else: # exact (and small) count + cursor.total_count = _es8_total['value'] + _results = [] + for _es8_hit in es8_response['hits']['hits']: + _card_iri = _es8_hit['_id'] + _results.append(CardsearchResult( + card_iri=_card_iri, + text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), + )) + _relatedproperty_list: list[PropertypathUsage] = [] + if cardsearch_params.related_property_paths: + _relatedproperty_list.extend( + PropertypathUsage(property_path=_path, usage_count=0) + for _path in cardsearch_params.related_property_paths ) + _relatedproperty_by_path = { + _result.property_path: _result + for _result in _relatedproperty_list + } + for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: + _path = tuple(json.loads(_bucket['key'])) + _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] + return CardsearchHandle( + cursor=cursor, + search_result_page=_results, + related_propertypath_results=_relatedproperty_list, + search_params=cardsearch_params, + ) - def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: - for _innerhit_group in es8_hit.get('inner_hits', {}).values(): - for _innerhit in _innerhit_group['hits']['hits']: - _property_path = tuple( - json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), + def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: + for _innerhit_group in 
es8_hit.get('inner_hits', {}).values(): + for _innerhit in _innerhit_group['hits']['hits']: + _property_path = tuple( + json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), + ) + try: + _language_iris = _innerhit['fields']['nested_text.language_iri'] + except KeyError: + _language_iris = () + for _highlight in _innerhit['highlight']['nested_text.text_value']: + yield TextMatchEvidence( + property_path=_property_path, + matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), + card_iri=_innerhit['_id'], ) - try: - _language_iris = _innerhit['fields']['nested_text.language_iri'] - except KeyError: - _language_iris = () - for _highlight in _innerhit['highlight']['nested_text.text_value']: - yield TextMatchEvidence( - property_path=_property_path, - matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), - card_iri=_innerhit['_id'], - ) - - class _SimpleTextQueryBuilder: - def __init__( - self, text_field, *, - relevance_matters=False, - ): - self._text_field = text_field - self._relevance_matters = relevance_matters - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - if textsegment.is_negated: - return {'must_not': [self.exact_text_query(textsegment.text)]} - if not textsegment.is_fuzzy: - return {'must': [self.exact_text_query(textsegment.text)]} - if not self._relevance_matters: - return {'must': [self.fuzzy_text_must_query(textsegment.text)]} - return { - 'must': [self.fuzzy_text_must_query(textsegment.text)], - 'should': [self.fuzzy_text_should_query(textsegment.text)], - } - def exact_text_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match_phrase': { - self._text_field: {'query': text}, - }} - - def fuzzy_text_must_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match': { - self._text_field: { - 'query': text, - 'fuzziness': 'AUTO', - # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} + class _SimpleTextQueryBuilder: + def __init__( + self, text_field, *, + relevance_matters=False, + ): + self._text_field = text_field + self._relevance_matters = relevance_matters + + def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: + if textsegment.is_negated: + return {'must_not': [self.exact_text_query(textsegment.text)]} + if not textsegment.is_fuzzy: + return {'must': [self.exact_text_query(textsegment.text)]} + if not self._relevance_matters: + return {'must': [self.fuzzy_text_must_query(textsegment.text)]} + return { + 'must': [self.fuzzy_text_must_query(textsegment.text)], + 'should': [self.fuzzy_text_should_query(textsegment.text)], + } - def fuzzy_text_should_query(self, text: str): - return {'match_phrase': { - self._text_field: { - 'query': text, - 'slop': len(text.split()), - }, - }} - - class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): - def __init__(self, **kwargs): - super().__init__('nested_text.text_value', **kwargs) - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - return { - _boolkey: [ - self._make_nested_query(textsegment, _query) - for _query in _queries - ] - for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() - } + def exact_text_query(self, text: str) -> dict: + # TODO: textsegment.is_openended (prefix query) + return {'match_phrase': { + self._text_field: {'query': text}, + }} - def _make_nested_query(self, textsegment, query): - _nested_q = 
{'nested': { - 'path': 'nested_text', - 'query': {'bool': { - 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), - 'must': query, - }}, - }} - if self._relevance_matters: - _nested_q['nested']['inner_hits'] = self._inner_hits() - return _nested_q - - def _inner_hits(self, *, highlight_query=None) -> dict: - _highlight = { - 'type': 'unified', - 'fields': {'nested_text.text_value': {}}, - } - if highlight_query is not None: - _highlight['highlight_query'] = highlight_query - return { - 'name': str(uuid.uuid4()), # avoid inner-hit name collisions - 'highlight': _highlight, - '_source': False, # _source is expensive for nested docs - 'docvalue_fields': [ - 'nested_text.path_from_focus', - 'nested_text.language_iri', - ], - } + def fuzzy_text_must_query(self, text: str) -> dict: + # TODO: textsegment.is_openended (prefix query) + return {'match': { + self._text_field: { + 'query': text, + 'fuzziness': 'AUTO', + # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) + }, + }} + + def fuzzy_text_should_query(self, text: str): + return {'match_phrase': { + self._text_field: { + 'query': text, + 'slop': len(text.split()), + }, + }} + + class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): + def __init__(self, **kwargs): + super().__init__('nested_text.text_value', **kwargs) + + def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: + return { + _boolkey: [ + self._make_nested_query(textsegment, _query) + for _query in _queries + ] + for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() + } + + def _make_nested_query(self, textsegment, query): + _nested_q = {'nested': { + 'path': 'nested_text', + 'query': {'bool': { + 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), + 'must': query, + }}, + }} + if self._relevance_matters: + _nested_q['nested']['inner_hits'] = self._inner_hits() + return _nested_q + + def _inner_hits(self, *, highlight_query=None) -> dict: + _highlight = { + 'type': 'unified', + 'fields': {'nested_text.text_value': {}}, + } + if highlight_query is not None: + _highlight['highlight_query'] = highlight_query + return { + 'name': str(uuid.uuid4()), # avoid inner-hit name collisions + 'highlight': _highlight, + '_source': False, # _source is expensive for nested docs + 'docvalue_fields': [ + 'nested_text.path_from_focus', + 'nested_text.language_iri', + ], + } ### diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 2dbeb4614..d6f655c68 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -2,6 +2,7 @@ from collections import abc, defaultdict import dataclasses import functools +import itertools import json import logging import re @@ -18,6 +19,7 @@ from share.search import exceptions from share.search import messages +from share.search.index_strategy._base import IndexStrategy from share.search.index_strategy.elastic8 import Elastic8IndexStrategy from share.util.checksum_iri import ChecksumIri from trove import models as trove_db @@ -52,13 +54,46 @@ logger = logging.getLogger(__name__) +_PRIOR_UNSPLIT_STRATEGY_CHECKSUM = ChecksumIri( + checksumalgorithm_name='sha-256', + salt='TrovesearchDenormIndexStrategy', + hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', +) +_UNSPLIT_INDEX_SUBNAME = '' + + +def _is_unsplit_strat(strategy: TrovesearchDenormIndexStrategy) -> 
bool: + return (strategy.strategy_check == _PRIOR_UNSPLIT_STRATEGY_CHECKSUM.hexdigest) + + class TrovesearchDenormIndexStrategy(Elastic8IndexStrategy): CURRENT_STRATEGY_CHECKSUM = ChecksumIri( checksumalgorithm_name='sha-256', salt='TrovesearchDenormIndexStrategy', - hexdigest='8a87bb51d46af9794496e798f033e8ba1ea0251fa7a8ffa5d037e90fb0c602c8', + hexdigest='4c8784ddd08914ec779b33b8f1945b0b2ff026eea355392ab3c4fe2fe10d71fe', ) + # abstract method from Elastic8IndexStrategy + @classmethod + def define_current_indexes(cls) -> dict[str, Elastic8IndexStrategy.IndexDefinition]: + return { + 'cards': cls.IndexDefinition( + settings=cls._index_settings(), + mappings=cls._cards_index_mappings(), + ), + 'iri_values': cls.IndexDefinition( + settings=cls._index_settings(), + mappings=cls._iri_values_index_mappings(), + ), + } + + # override from IndexStrategy + def each_subnamed_index(self): + if _is_unsplit_strat(self): + yield self.get_index(_UNSPLIT_INDEX_SUBNAME) + else: + yield from super().each_subnamed_index() + # abstract method from IndexStrategy @property def supported_message_types(self): @@ -72,26 +107,38 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_INDEXCARD - # abstract method from Elastic8IndexStrategy - def index_settings(self): + @classmethod + def _index_settings(cls): return { 'number_of_shards': 5, 'number_of_replicas': 2, } - # abstract method from Elastic8IndexStrategy - def index_mappings(self): + @classmethod + def _cards_index_mappings(cls): + return { + 'dynamic': 'false', + 'dynamic_templates': cls._dynamic_templates(), + 'properties': { + 'card': {'properties': cls._card_mappings()}, + 'chunk_timestamp': {'type': 'unsigned_long'}, + }, + } + + @classmethod + def _iri_values_index_mappings(cls): return { 'dynamic': 'false', - 'dynamic_templates': self._dynamic_templates(), + 'dynamic_templates': cls._dynamic_templates(), 'properties': { - 'card': {'properties': self._card_mappings()}, - 'iri_value': {'properties': self._iri_value_mappings()}, + 'card': {'properties': cls._card_mappings()}, + 'iri_value': {'properties': cls._iri_value_mappings()}, 'chunk_timestamp': {'type': 'unsigned_long'}, }, } - def _dynamic_templates(self): + @classmethod + def _dynamic_templates(cls): return [ {'dynamic_text_by_propertypath': { 'path_match': '*.text_by_propertypath.*', @@ -114,7 +161,8 @@ def _dynamic_templates(self): }}, ] - def _card_mappings(self): + @classmethod + def _card_mappings(cls): return { # simple keyword properties 'card_iri': ts.KEYWORD_MAPPING, @@ -126,19 +174,21 @@ def _card_mappings(self): 'source_record_identifier': ts.KEYWORD_MAPPING, }, }, - **self._paths_and_values_mappings(), + **cls._paths_and_values_mappings(), } - def _iri_value_mappings(self): + @classmethod + def _iri_value_mappings(cls): return { 'value_name': ts.KEYWORD_MAPPING, 'value_title': ts.KEYWORD_MAPPING, 'value_label': ts.KEYWORD_MAPPING, 'at_card_propertypaths': ts.KEYWORD_MAPPING, - **self._paths_and_values_mappings(), + **cls._paths_and_values_mappings(), } - def _paths_and_values_mappings(self): + @classmethod + def _paths_and_values_mappings(cls): return { 'single_focus_iri': ts.KEYWORD_MAPPING, 'focus_iri_synonyms': ts.KEYWORD_MAPPING, @@ -153,12 +203,15 @@ def _paths_and_values_mappings(self): 'int_by_propertypath': {'type': 'object', 'dynamic': True}, } + ### + # receiving and acting on chunks of messages + # override method from Elastic8IndexStrategy - def after_chunk(self, messages_chunk: messages.MessagesChunk, indexnames: 
Iterable[str]): + def after_chunk(self, messages_chunk: messages.MessagesChunk, affected_indexnames: Iterable[str]): task__delete_iri_value_scraps.apply_async( kwargs={ - 'index_strategy_name': self.name, - 'indexnames': list(indexnames), + 'index_strategy_name': self.strategy_name, + 'indexnames': list(affected_indexnames), 'card_pks': messages_chunk.target_ids_chunk, 'timestamp': messages_chunk.timestamp, }, @@ -173,91 +226,101 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): _docbuilder = self._SourcedocBuilder(_indexcard_rdf, messages_chunk.timestamp) if not _docbuilder.should_skip(): # if skipped, will be deleted _indexcard_pk = _indexcard_rdf.indexcard_id - yield _indexcard_pk, ( - self.build_index_action( - doc_id=_doc_id, - doc_source=_doc, - ) - for _doc_id, _doc in _docbuilder.build_docs() + _cardsearch_actions = ( + self.build_index_action(_doc_id, _doc) + for _doc_id, _doc in _docbuilder.build_cardsearch_docs() ) + _valuesearch_actions = ( + self.build_index_action(_doc_id, _doc) + for _doc_id, _doc in _docbuilder.build_valuesearch_docs() + ) + _actions_by_index: dict[str, Iterable[dict]] + if _is_unsplit_strat(self): + _actions_by_index = { + # back-compat: single combined index + _UNSPLIT_INDEX_SUBNAME: itertools.chain(_cardsearch_actions, _valuesearch_actions), + } + else: + _actions_by_index = { + 'cards': _cardsearch_actions, + 'iri_values': _valuesearch_actions, + } + yield self.MessageActionSet(_indexcard_pk, _actions_by_index) _remaining_indexcard_pks.discard(_indexcard_pk) # delete any that were skipped for any reason for _indexcard_pk in _remaining_indexcard_pks: - yield _indexcard_pk, self.build_delete_action(_indexcard_pk) + _subname = (_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cards') + yield self.MessageActionSet(_indexcard_pk, { + _subname: [self.build_delete_action(_indexcard_pk)], + }) ### - # implement abstract IndexStrategy.SpecificIndex + # handling searches - class SpecificIndex(Elastic8IndexStrategy.SpecificIndex): + def cardsearch_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'cards') - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_search__sharev2_backcompat(self, request_body=None, request_queryparams=None) -> dict: - return self.index_strategy.es8_client.search( - index=self.indexname, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) + def irivaluesearch_index(self) -> IndexStrategy.SpecificIndex: + return self.get_index(_UNSPLIT_INDEX_SUBNAME if _is_unsplit_strat(self) else 'iri_values') - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - # cases to handle: - # - sort by field value (streamable) - # - sort by relevance to text (non-streamable) - # - random sort (...non-streamable?) 
- # - first page (full random) - # - subsequent page (reproducibly randomm) - # (for streaming pages, skip aggs and such on subsequents) - # maybe start with a "header" request (no hits, minimal aggs) - _querybuilder = _CardsearchQueryBuilder(cardsearch_params) - _search_kwargs = _querybuilder.build() - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - index=self.indexname, - source=False, # no need to get _source, identifiers are enough - docvalue_fields=['card.card_iri'], - highlight={ - 'require_field_match': False, - 'fields': {'card.text_by_propertypath.*': {}}, - }, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self.index_strategy._cardsearch_handle( - cardsearch_params, - _es8_response, - _querybuilder.response_cursor, + # abstract method from IndexStrategy + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: + # cases to handle: + # - sort by field value (streamable) + # - sort by relevance to text (non-streamable) + # - random sort (...non-streamable?) + # - first page (full random) + # - subsequent page (reproducibly randomm) + # (for streaming pages, skip aggs and such on subsequents) + # maybe start with a "header" request (no hits, minimal aggs) + _querybuilder = _CardsearchQueryBuilder(cardsearch_params) + _search_kwargs = _querybuilder.build() + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.cardsearch_index().full_index_name, + source=False, # no need to get _source, identifiers are enough + docvalue_fields=['card.card_iri'], + highlight={ + 'require_field_match': False, + 'fields': {'card.text_by_propertypath.*': {}}, + }, + **_search_kwargs, ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._cardsearch_handle( + cardsearch_params, + _es8_response, + _querybuilder.response_cursor, + ) - # abstract method from IndexStrategy.SpecificIndex - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _path = valuesearch_params.valuesearch_propertypath - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(_path[-1]) - _query = ( - _build_date_valuesearch(valuesearch_params) - if _is_date_search - else _build_iri_valuesearch(valuesearch_params, _cursor) - ) - if settings.DEBUG: - logger.info(json.dumps(_query, indent=2)) - try: - _es8_response = self.index_strategy.es8_client.search( - **_query, - index=self.indexname, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return ( - self.index_strategy._valuesearch_dates_response(valuesearch_params, _es8_response) - if _is_date_search - else self.index_strategy._valuesearch_iris_response(valuesearch_params, _es8_response, _cursor) + # abstract method from IndexStrategy + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + _path = valuesearch_params.valuesearch_propertypath + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = osfmap.is_date_property(_path[-1]) + if _is_date_search: + _index = self.cardsearch_index() + _query = _build_date_valuesearch(valuesearch_params) + else: 
+ _index = self.irivaluesearch_index() + _query = _build_iri_valuesearch(valuesearch_params, _cursor) + if settings.DEBUG: + logger.info(json.dumps(_query, indent=2)) + try: + _es8_response = self.es8_client.search( + **_query, + index=_index.full_index_name, ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return ( + self._valuesearch_dates_response(valuesearch_params, _es8_response) + if _is_date_search + else self._valuesearch_iris_response(valuesearch_params, _es8_response, _cursor) + ) ### # building sourcedocs @@ -286,12 +349,13 @@ def should_skip(self) -> bool: or not any(self.rdfdoc.q(self.focus_iri, osfmap.NAMELIKE_PROPERTIES)) ) - def build_docs(self) -> Iterator[tuple[str, dict]]: - # index once without `iri_value` + def build_cardsearch_docs(self) -> Iterator[tuple[str, dict]]: yield self._doc_id(), { 'card': self._card_subdoc, 'chunk_timestamp': self.chunk_timestamp, } + + def build_valuesearch_docs(self) -> Iterator[tuple[str, dict]]: for _iri in self._fullwalk.paths_by_iri: yield self._doc_id(_iri), { 'card': self._card_subdoc, @@ -845,8 +909,6 @@ def _build_date_valuesearch(params: ValuesearchParams) -> dict: relevance_matters=False, ).boolparts(), ) - # exclude iri_value docs (possible optimization: separate indexes) - _bool.add_boolpart('must_not', {'exists': {'field': 'iri_value'}}) _field = f'card.date_by_propertypath.{_path_field_name(params.valuesearch_propertypath)}' return { 'query': _bool.as_query(), @@ -929,12 +991,17 @@ def task__delete_iri_value_scraps( this task deletes those untouched value-docs after the index has refreshed at its own pace (allowing a slightly longer delay for items to _stop_ matching queries for removed values) ''' - from share.search.index_strategy import get_index_strategy - _index_strategy = get_index_strategy(index_strategy_name) - assert isinstance(_index_strategy, Elastic8IndexStrategy) + from share.search.index_strategy import get_strategy + _index_strategy = get_strategy(index_strategy_name) + assert isinstance(_index_strategy, TrovesearchDenormIndexStrategy) + _irivalue_indexnames = { + _index.full_index_name + for _index in _index_strategy.each_live_index(any_strategy_check=True) + if _index.subname == 'iri_values' + } # delete any docs that belong to cards in this chunk but weren't touched by indexing _delete_resp = _index_strategy.es8_client.delete_by_query( - index=indexnames, + index=list(_irivalue_indexnames), query={'bool': {'must': [ {'terms': {'card.card_pk': card_pks}}, {'range': {'chunk_timestamp': {'lt': timestamp}}}, diff --git a/share/tasks/__init__.py b/share/tasks/__init__.py index fa93ccf76..37da10801 100644 --- a/share/tasks/__init__.py +++ b/share/tasks/__init__.py @@ -61,7 +61,7 @@ def schedule_index_backfill(self, index_backfill_pk): _index_backfill = db.IndexBackfill.objects.get(pk=index_backfill_pk) _index_backfill.pls_note_scheduling_has_begun() try: - _index_strategy = index_strategy.get_index_strategy(_index_backfill.index_strategy_name) + _index_strategy = index_strategy.get_strategy(_index_backfill.index_strategy_name) _messenger = IndexMessenger(celery_app=self.app, index_strategys=[_index_strategy]) _messagetype = _index_strategy.backfill_message_type assert _messagetype in _index_strategy.supported_message_types diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 30d5e11a1..40e1a7347 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -6,12 
diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html
index 30d5e11a1..40e1a7347 100644
--- a/templates/admin/search-indexes.html
+++ b/templates/admin/search-indexes.html
@@ -6,12 +6,17 @@ section, table {
   padding-left: 2em;
 }
+nav {
+  display: flex;
+  flex-direction: horizontal;
+  gap: 1em;
+}
 {% endblock %}

 {% block content %}
-<h1>{% trans "elasticsearch indexes" %}</h1>
-{% for index_strategy_name, indexes in index_status_by_strategy.items %}
+<h1>{% trans "trovesearch status by strategy" %}</h1>
+{% for index_strategy_name, strategy_info in index_status_by_strategy.items %}
 <section>
   <h2>{{ index_strategy_name }} index strategy</h2>
@@ -22,7 +27,7 @@ queues
       <th>{% trans "depth" %}</th>
       <th>{% trans "rate (past 30s)" %}</th>
     </tr>
-    {% for queue_info in indexes.queues %}
+    {% for queue_info in strategy_info.queues %}
     <tr>
       <td>{{ queue_info.name }}</td>
       <td>{{ queue_info.queue_depth }}</td>
@@ -32,124 +37,130 @@ queues
   </table>
-  <h3>current index: {{indexes.current.status.specific_indexname}}</h3>
+  <h3>current: {{ strategy_info.status.strategy_id }}</h3>
   <table>
     <tr>
+      <th>{% trans "index" %}</th>
       <th>{% trans "created" %}</th>
       <th>{% trans "is kept live" %}</th>
       <th>{% trans "is default for searching" %}</th>
       <th>{% trans "doc count" %}</th>
-      <th>{% trans "actions" %}</th>
       <th>{% trans "links" %}</th>
+      <th>{% trans "full index name" %}</th>
     </tr>
+    {% for current_index_status in strategy_info.status.index_statuses %}
     <tr>
-      <td>{{ indexes.current.status.creation_date|default:"--" }}</td>
-      <td>{% if indexes.current.status.is_kept_live %}✓{% endif %}</td>
-      <td>{% if indexes.current.status.is_default_for_searching %}✓{% endif %}</td>
-      <td>{{ indexes.current.status.doc_count }}</td>
+      <td>{{ current_index_status.index_subname }}</td>
+      <td>{{ current_index_status.creation_date|default:"--" }}</td>
+      <td>{% if current_index_status.is_kept_live %}✓{% endif %}</td>
+      <td>{% if current_index_status.is_default_for_searching %}✓{% endif %}</td>
+      <td>{{ current_index_status.doc_count }}</td>
-      <td>
-        {% if not indexes.current.status.creation_date %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% elif not indexes.current.status.is_kept_live %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% elif indexes.current.backfill.can_start_backfill %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% elif indexes.current.backfill.can_mark_backfill_complete %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% endif %}
-        {% if indexes.current.status.creation_date and not indexes.current.status.is_default_for_searching %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% endif %}
-      </td>
-      <td>
-        {% if indexes.current.backfill.backfill_admin_url %}
-        <a href="{{ indexes.current.backfill.backfill_admin_url }}">{% trans "backfill" %}:{{ indexes.current.backfill.backfill_status }}</a>
-        {% endif %}
-        {% if indexes.current.status.creation_date %}
-        <a href="...">{% trans "mappings" %}</a>
-        {% endif %}
-      </td>
-    </tr>
+      <td>
+        {% if current_index_status.creation_date %}
+        <a href="...">{% trans "mappings" %}</a>
+        {% endif %}
+      </td>
+      <td>{{ current_index_status.specific_indexname }}</td>
+    </tr>
+    {% endfor %}
   </table>
-  {% if indexes.prior %}
-  <h3>prior indexes</h3>
+  {% for prior_strategy_status in strategy_info.status.existing_prior_strategies %}
+  <h3>prior: {{ prior_strategy_status.strategy_id }}</h3>
   <table>
     <tr>
+      <th>{% trans "index" %}</th>
       <th>{% trans "created" %}</th>
       <th>{% trans "is kept live" %}</th>
       <th>{% trans "is default for searching" %}</th>
       <th>{% trans "doc count" %}</th>
-      <th>{% trans "actions" %}</th>
       <th>{% trans "links" %}</th>
-      <th>{% trans "index name" %}</th>
+      <th>{% trans "full index name" %}</th>
     </tr>
-    {% for index_status in indexes.prior %}
+    {% for index_status in prior_strategy_status.index_statuses %}
     <tr>
+      <td>{{ index_status.index_subname }}</td>
       <td>{{ index_status.creation_date }}</td>
       <td>{% if index_status.is_kept_live %}✓{% endif %}</td>
       <td>{% if index_status.is_default_for_searching %}✓{% endif %}</td>
       <td>{{ index_status.doc_count }}</td>
-      <td>
-        {% if not index_status.is_default_for_searching %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% endif %}
-        {% if index_status.is_kept_live %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% else %}
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        <form method="post">
-          {% csrf_token %}
-          ...
-        </form>
-        {% endif %}
-      </td>
       <td>
         {% if index_status.creation_date %}
         <a href="...">{% trans "mappings" %}</a>
         {% endif %}
@@ -159,7 +170,7 @@ prior indexes
     {% endfor %}
   </table>
-  {% endif %}
+  {% endfor %}
 </section>
{% endfor %} {% endblock %} diff --git a/tests/api/test_elasticsearch.py b/tests/api/test_elasticsearch.py index 13e6688f5..e37ad8141 100644 --- a/tests/api/test_elasticsearch.py +++ b/tests/api/test_elasticsearch.py @@ -55,9 +55,9 @@ def test_search(self): with mock.patch('api.search.views.index_strategy') as _mock_index_strategy_module: mock_handle_search = ( _mock_index_strategy_module - .get_index_for_sharev2_search + .get_strategy_for_sharev2_search .return_value - .pls_handle_search__sharev2_backcompat + .pls_handle_search__passthru ) mock_handle_search.return_value = {'clop': 'clip'} for url in urls: diff --git a/tests/api/test_feeds.py b/tests/api/test_feeds.py index 49a016664..218128baa 100644 --- a/tests/api/test_feeds.py +++ b/tests/api/test_feeds.py @@ -52,9 +52,9 @@ def fake_items(self, Graph): json.loads(formatted_item) for formatted_item in formatted_items ] - with mock.patch('api.views.feeds.index_strategy.get_index_for_sharev2_search') as mock_get_for_searching: + with mock.patch('api.views.feeds.index_strategy.get_strategy_for_sharev2_search') as mock_get_for_searching: mock_strategy = mock_get_for_searching.return_value - mock_strategy.pls_handle_search__sharev2_backcompat.return_value = { + mock_strategy.pls_handle_search__passthru.return_value = { 'hits': { 'hits': [ {'_source': item, '_id': item['id']} diff --git a/tests/share/bin/test_sharectl.py b/tests/share/bin/test_sharectl.py index e39c6140c..a8e5c6325 100644 --- a/tests/share/bin/test_sharectl.py +++ b/tests/share/bin/test_sharectl.py @@ -28,46 +28,44 @@ def test_sharectl_version(): class TestSharectlSearch: - @pytest.mark.parametrize('indexnames', [ + @pytest.mark.parametrize('strategynames', [ ['one'], ['another', 'makes', 'two'], ]) - def test_purge(self, indexnames): - mock_specific_indexes = { - indexname: mock.Mock() - for indexname in indexnames + def test_purge(self, strategynames): + mock_strategies = { + strategyname: mock.Mock() + for strategyname in strategynames } - def _get_specific_index(indexname): - return mock_specific_indexes[indexname] + def _fake_parse_strategy_name(strategyname): + return mock_strategies[strategyname] - with mock.patch('share.bin.search.index_strategy.get_specific_index', wraps=_get_specific_index) as mock_get_specific: - run_sharectl('search', 'purge', *indexnames) - assert mock_get_specific.mock_calls == [ - mock.call(indexname) - for indexname in mock_specific_indexes.keys() + with mock.patch('share.bin.search.index_strategy.parse_strategy_name', wraps=_fake_parse_strategy_name) as mock_get_strategy: + run_sharectl('search', 'purge', *strategynames) + assert mock_get_strategy.mock_calls == [ + mock.call(strategyname) + for strategyname in mock_strategies.keys() ] - for mock_specific_index in mock_specific_indexes.values(): - mock_specific_index.pls_delete.assert_called_once_with() + for mock_strategy in mock_strategies.values(): + mock_strategy.pls_teardown.assert_called_once_with() def test_setup_initial(self, settings): _expected_indexes = ['baz', 'bar', 'foo'] - _mock_index_strategys = { - _name: mock.Mock() + _mock_index_strategys = [ + mock.Mock(strategy_name=_name) for _name in _expected_indexes - } + ] with patch_index_strategies(_mock_index_strategys): run_sharectl('search', 'setup', '--initial') - for mock_index_strategy in _mock_index_strategys.values(): - mock_specific_index = mock_index_strategy.for_current_index.return_value - assert mock_specific_index.pls_setup.mock_calls == [mock.call(skip_backfill=True)] + for mock_index_strategy in 
_mock_index_strategys: + assert mock_index_strategy.pls_setup.mock_calls == [mock.call()] def test_setup_index(self): mock_index_strategy = mock.Mock() - with mock.patch('share.bin.search.index_strategy.get_index_strategy', return_value=mock_index_strategy): + with mock.patch('share.bin.search.index_strategy.get_strategy', return_value=mock_index_strategy): run_sharectl('search', 'setup', 'foo') - mock_current_index = mock_index_strategy.for_current_index.return_value - assert mock_current_index.pls_setup.mock_calls == [mock.call(skip_backfill=False)] + assert mock_index_strategy.pls_setup.mock_calls == [mock.call()] def test_daemon(self, settings): with mock.patch('share.bin.search.IndexerDaemonControl') as mock_daemon_control: diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py index a7a49aaf9..871256d44 100644 --- a/tests/share/search/__init__.py +++ b/tests/share/search/__init__.py @@ -1,16 +1,17 @@ import contextlib +import enum +from typing import Iterable from unittest import mock from share.search import index_strategy @contextlib.contextmanager -def patch_index_strategies(strategies: dict[str, index_strategy.IndexStrategy]): - index_strategy.all_index_strategies.cache_clear() - with mock.patch.object( - index_strategy, - 'all_index_strategies', - return_value=strategies, - ): +def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): + with mock.patch.object(index_strategy, '_AvailableStrategies', new=enum.Enum( + '_AvailableStrategies', [ + (_strategy.strategy_name, _strategy) + for _strategy in strategies + ], + )): yield - index_strategy.all_index_strategies.cache_clear() diff --git a/tests/share/search/conftest.py b/tests/share/search/conftest.py index b87757372..3cba6ba08 100644 --- a/tests/share/search/conftest.py +++ b/tests/share/search/conftest.py @@ -11,3 +11,7 @@ def mock_elastic_clients(settings): with mock.patch('share.search.index_strategy.sharev2_elastic5.elasticsearch5'): with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): yield + from share.search.index_strategy.elastic8 import Elastic8IndexStrategy + Elastic8IndexStrategy._get_elastic8_client.cache_clear() + from share.search.index_strategy.sharev2_elastic5 import Sharev2Elastic5IndexStrategy + Sharev2Elastic5IndexStrategy._get_elastic5_client.cache_clear() diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 81461a34e..b237d150c 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -122,7 +122,7 @@ def test_cardsearch_pagination(self): _result_iris: set[str] = set() _page_count = 0 while True: - _cardsearch_handle = self.current_index.pls_handle_cardsearch( + _cardsearch_handle = self.index_strategy.pls_handle_cardsearch( CardsearchParams.from_querystring(_querystring), ) _page_iris = { @@ -151,7 +151,7 @@ def test_cardsearch_related_properties(self): ), ): _cardsearch_params = CardsearchParams.from_querystring('') - _cardsearch_handle = self.current_index.pls_handle_cardsearch(_cardsearch_params) + _cardsearch_handle = self.index_strategy.pls_handle_cardsearch(_cardsearch_params) self.assertEqual(_cardsearch_handle.related_propertypath_results, [ PropertypathUsage((DCTERMS.creator,), 3), PropertypathUsage((DCTERMS.references,), 2), @@ -212,7 +212,7 @@ def _assert_cardsearch_iris(self, queryparams: dict, expected_focus_iris: Iterab _querystring = 
urlencode(queryparams) _cardsearch_params = CardsearchParams.from_querystring(_querystring) assert isinstance(_cardsearch_params, CardsearchParams) - _cardsearch_handle = self.current_index.pls_handle_cardsearch(_cardsearch_params) + _cardsearch_handle = self.index_strategy.pls_handle_cardsearch(_cardsearch_params) # assumes all results fit on one page _actual_result_iris: set[str] | list[str] = [ self._indexcard_focus_by_uuid[_result.card_uuid] @@ -227,7 +227,7 @@ def _assert_valuesearch_values(self, queryparams, expected_values): _querystring = urlencode(queryparams) _valuesearch_params = ValuesearchParams.from_querystring(_querystring) assert isinstance(_valuesearch_params, ValuesearchParams) - _valuesearch_handle = self.current_index.pls_handle_valuesearch(_valuesearch_params) + _valuesearch_handle = self.index_strategy.pls_handle_valuesearch(_valuesearch_params) # assumes all results fit on one page _actual_values = { _result.value_iri or _result.value_value @@ -615,7 +615,7 @@ def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): _response.is_done for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) )) - self.current_index.pls_refresh() + self.index_strategy.pls_refresh() def _delete_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): for _indexcard in indexcards: diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 11c24594d..8ad685026 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -8,6 +8,7 @@ from share.search.daemon import IndexerDaemonControl from share.search.index_messenger import IndexMessenger from share.search import index_strategy +from tests.share.search import patch_index_strategies # base class for testing IndexStrategy subclasses with actual elasticsearch. 
@@ -24,27 +25,17 @@ def setUp(self): super().setUp() self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) self.index_strategy = self.get_index_strategy() - - def _fake_get_index_strategy(name): - if self.index_strategy.name == name: - return self.index_strategy - raise ValueError(f'unknown index strategy in test: {name}') - - self.enterContext(mock.patch( - 'share.search.index_strategy.get_index_strategy', - new=_fake_get_index_strategy, - )) + self.index_strategy.pls_teardown() # in case it already exists + self.enterContext(patch_index_strategies([self.index_strategy])) self.index_messenger = IndexMessenger( celery_app=celery_app, index_strategys=[self.index_strategy], ) - self.current_index = self.index_strategy.for_current_index() - self.current_index.pls_delete() # in case it already exists self._assert_setup_happypath() def tearDown(self): super().tearDown() - self.current_index.pls_delete() + self.index_strategy.pls_teardown() # HACK: copied from TransactionTestCase._fixture_setup; restores db # to the state from before TransactionTestCase clobbered it (relies # on how django 3.2 implements `serialized_rollback = True`, above) @@ -74,8 +65,8 @@ def _assert_happypath_without_daemon(self, messages_chunk, expected_doc_count): assert all(_response.is_done for _response in _responses) _ids = {_response.index_message.target_id for _response in _responses} assert _ids == set(messages_chunk.target_ids_chunk) - self.current_index.pls_refresh() - _search_response = self.current_index.pls_handle_search__sharev2_backcompat() + self.index_strategy.pls_refresh() + _search_response = self.index_strategy.pls_handle_search__passthru() _hits = _search_response['hits']['hits'] assert len(_hits) == expected_doc_count @@ -85,8 +76,8 @@ def _assert_happypath_with_daemon(self, messages_chunk, expected_doc_count): self.index_messenger.send_messages_chunk(messages_chunk) for _ in range(23): _daemon_control.stop_event.wait(timeout=0.2) - self.current_index.pls_refresh() - _search_response = self.current_index.pls_handle_search__sharev2_backcompat() + self.index_strategy.pls_refresh() + _search_response = self.index_strategy.pls_handle_search__passthru() _hits = _search_response['hits']['hits'] if len(_hits) == expected_doc_count: break # all good @@ -94,32 +85,36 @@ def _assert_happypath_with_daemon(self, messages_chunk, expected_doc_count): assert False, 'checked and waited but the daemon did not do the thing' def _assert_setup_happypath(self): - # initial - assert not self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() - assert not index_status.creation_date - assert not index_status.is_kept_live - assert not index_status.is_default_for_searching - assert not index_status.doc_count - # create index - self.current_index.pls_create() - assert self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() - assert index_status.creation_date - assert not index_status.is_kept_live - assert not index_status.is_default_for_searching - assert not index_status.doc_count - # keep index live (with ingested updates) - self.current_index.pls_start_keeping_live() - index_status = self.current_index.pls_get_status() - assert index_status.creation_date - assert index_status.is_kept_live - assert not index_status.is_default_for_searching - assert not index_status.doc_count - # default index for searching - self.index_strategy.pls_make_default_for_searching(self.current_index) - index_status = self.current_index.pls_get_status() - 
assert index_status.creation_date - assert index_status.is_kept_live - assert index_status.is_default_for_searching - assert not index_status.doc_count + # initial (no indexes exist) + for _index in self.index_strategy.each_subnamed_index(): + assert not _index.pls_check_exists() + index_status = _index.pls_get_status() + assert not index_status.creation_date + assert not index_status.is_kept_live + assert not index_status.is_default_for_searching + assert not index_status.doc_count + # create each index + for _index in self.index_strategy.each_subnamed_index(): + _index.pls_create() + assert _index.pls_check_exists() # new! + index_status = _index.pls_get_status() + assert index_status.creation_date # new! + assert not index_status.is_kept_live + assert not index_status.is_default_for_searching + assert not index_status.doc_count + # start keeping each index live (with ingested updates) + self.index_strategy.pls_start_keeping_live() + for _index in self.index_strategy.each_subnamed_index(): + index_status = _index.pls_get_status() + assert index_status.creation_date + assert index_status.is_kept_live # new! + assert not index_status.is_default_for_searching + assert not index_status.doc_count + # make this version of the strategy the default for searching + self.index_strategy.pls_make_default_for_searching() + for _index in self.index_strategy.each_subnamed_index(): + index_status = _index.pls_get_status() + assert index_status.creation_date + assert index_status.is_kept_live + assert index_status.is_default_for_searching # new! + assert not index_status.doc_count # (still empty) diff --git a/tests/share/search/index_strategy/test_elastic8.py b/tests/share/search/index_strategy/test_elastic8.py index 5de732690..20c68e67e 100644 --- a/tests/share/search/index_strategy/test_elastic8.py +++ b/tests/share/search/index_strategy/test_elastic8.py @@ -1,3 +1,4 @@ +import functools from unittest import mock import pytest @@ -17,6 +18,19 @@ class FakeElastic8IndexStrategy(Elastic8IndexStrategy): hexdigest='5371df2d0e3daaa9f1c344d14384cdbe65000f2b449b1c2f30ae322b0321eb12', ) + @classmethod + def define_current_indexes(cls): + return { + '': cls.IndexDefinition( + mappings={'my-mappings': 'lol'}, + settings={'my-settings': 'lol'}, + ), + } + + @functools.cached_property + def es8_client(self): + return mock.Mock() + @property def supported_message_types(self): return { @@ -28,57 +42,49 @@ def supported_message_types(self): def backfill_message_type(self): return messages.MessageType.BACKFILL_SUID - def index_settings(self): - return {'my-settings': 'lol'} - - def index_mappings(self): - return {'my-mappings': 'lol'} - def build_elastic_actions(self, messages_chunk): return FAKE_ACTION_ITERATOR class TestIndexStrategy: @pytest.fixture - def mock_es_client(self): - with mock.patch('share.search.index_strategy.elastic8.elasticsearch8') as es8_mockpackage: - es8_mockclient = es8_mockpackage.Elasticsearch.return_value - yield es8_mockclient - - @pytest.fixture - def fake_strategy(self, mock_es_client, settings): + def fake_strategy(self, settings): settings.ELASTICSEARCH8_URL = 'http://nowhere.example:12345/' - strat = FakeElastic8IndexStrategy(name='fake_es8') + strat = FakeElastic8IndexStrategy('fake_es8') strat.assert_strategy_is_current() return strat @pytest.fixture def fake_specific_index(self, fake_strategy): - return fake_strategy.for_current_index() + return fake_strategy.get_index('') + + @pytest.fixture + def mock_es_client(self, fake_strategy): + return fake_strategy.es8_client def 
test_pls_create(self, fake_specific_index, mock_es_client): mock_es_client.indices.exists.return_value = False fake_specific_index.pls_create() mock_es_client.indices.exists.assert_called_once_with( - index=fake_specific_index.indexname, + index=fake_specific_index.full_index_name, ) mock_es_client.indices.create.assert_called_once_with( - index=fake_specific_index.indexname, - settings=fake_specific_index.index_strategy.index_settings(), - mappings=fake_specific_index.index_strategy.index_mappings(), + index=fake_specific_index.full_index_name, + mappings={'my-mappings': 'lol'}, + settings={'my-settings': 'lol'}, ) # already exists: mock_es_client.reset_mock() mock_es_client.indices.exists.return_value = True, fake_specific_index.pls_create() mock_es_client.indices.exists.assert_called_once_with( - index=fake_specific_index.indexname, + index=fake_specific_index.full_index_name, ) mock_es_client.indices.create.assert_not_called() def test_delete_index(self, fake_specific_index, mock_es_client): fake_specific_index.pls_delete() mock_es_client.indices.delete.assert_called_once_with( - index=fake_specific_index.indexname, + index=fake_specific_index.full_index_name, ignore=[400, 404], ) diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 6b1618301..88e1d6b13 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -56,30 +56,31 @@ def _get_formatted_record(self): # (single index that will not be updated again before being deleted) def _assert_happypath_until_ingest(self): # initial - assert not self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() + _index = next(self.index_strategy.each_subnamed_index()) + assert not _index.pls_check_exists() + index_status = _index.pls_get_status() assert not index_status.creation_date assert not index_status.is_kept_live assert not index_status.is_default_for_searching assert not index_status.doc_count # create index - self.current_index.pls_create() - assert self.current_index.pls_check_exists() - index_status = self.current_index.pls_get_status() + _index.pls_create() + assert _index.pls_check_exists() + index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live # change from base class assert index_status.is_default_for_searching # change from base class assert not index_status.doc_count # keep index live (with ingested updates) - self.current_index.pls_start_keeping_live() # now a no-op - index_status = self.current_index.pls_get_status() + self.index_strategy.pls_start_keeping_live() # now a no-op + index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live assert index_status.is_default_for_searching # change from base class assert not index_status.doc_count # default index for searching - self.index_strategy.pls_make_default_for_searching(self.current_index) # now a no-op - index_status = self.current_index.pls_get_status() + self.index_strategy.pls_make_default_for_searching() # now a no-op + index_status = _index.pls_get_status() assert index_status.creation_date assert index_status.is_kept_live assert index_status.is_default_for_searching diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index e24fb0a1a..b4d8a1045 100644 --- 
a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -3,68 +3,57 @@ from share.search.exceptions import IndexStrategyError from share.search.index_strategy import ( - all_index_strategies, - get_index_strategy, - get_specific_index, - get_index_for_sharev2_search, IndexStrategy, + each_strategy, + get_strategy, sharev2_elastic5, sharev2_elastic8, trove_indexcard_flats, trovesearch_denorm, + parse_strategy_name, ) +from share.search.index_strategy._indexnames import combine_indexname_parts +from tests.share.search import patch_index_strategies @pytest.fixture -def expected_strategy_classes(): - return { - 'sharev2_elastic5': sharev2_elastic5.Sharev2Elastic5IndexStrategy, - 'sharev2_elastic8': sharev2_elastic8.Sharev2Elastic8IndexStrategy, - 'trove_indexcard_flats': trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy, - 'trovesearch_denorm': trovesearch_denorm.TrovesearchDenormIndexStrategy, - } +def patched_strategies(mock_elastic_clients): + _strategies = [ + sharev2_elastic5.Sharev2Elastic5IndexStrategy('sharev2_elastic5'), + sharev2_elastic8.Sharev2Elastic8IndexStrategy('sharev2_elastic8'), + trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats'), + trovesearch_denorm.TrovesearchDenormIndexStrategy('trovesearch_denorm'), + ] + with patch_index_strategies(_strategies): + yield _strategies class TestBaseIndexStrategy: - def test_get_index_strategy(self, mock_elastic_clients, expected_strategy_classes): - for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - index_strategy = get_index_strategy(strategy_name) - assert isinstance(index_strategy, expected_strategy_class) - - def test_all_index_strategies(self, mock_elastic_clients, expected_strategy_classes): - all_strategys = tuple(all_index_strategies().values()) - assert len(all_strategys) == len(expected_strategy_classes) - strategy_names = {index_strategy.name for index_strategy in all_strategys} - assert strategy_names == set(expected_strategy_classes.keys()) + def test_get_index_strategy(self, patched_strategies): + for expected_strategy in patched_strategies: + gotten_strategy = get_strategy(expected_strategy.strategy_name) + assert gotten_strategy == expected_strategy + + def test_all_index_strategies(self, patched_strategies): + all_strategys = tuple(each_strategy()) + assert len(all_strategys) == len(patched_strategies) + gotten_names = {index_strategy.strategy_name for index_strategy in all_strategys} + assert gotten_names == {strategy.strategy_name for strategy in patched_strategies} for index_strategy in all_strategys: - strategy_class = expected_strategy_classes[index_strategy.name] - assert isinstance(index_strategy, strategy_class) assert issubclass(index_strategy.SpecificIndex, IndexStrategy.SpecificIndex) assert index_strategy.SpecificIndex is not IndexStrategy.SpecificIndex - def test_get_by_specific_indexname(self, mock_elastic_clients, expected_strategy_classes): - for strategy_name, expected_strategy_class in expected_strategy_classes.items(): - indexname_prefix = get_index_strategy(strategy_name).indexname_prefix - specific_indexname = ''.join((indexname_prefix, 'foo')) - specific_index = get_specific_index(specific_indexname) - assert isinstance(specific_index.index_strategy, expected_strategy_class) - assert isinstance(specific_index, expected_strategy_class.SpecificIndex) - assert specific_index.indexname == specific_indexname - bad_indexname = 'foo_foo' # assumed to not start with 
index prefix - with pytest.raises(IndexStrategyError): - get_specific_index(bad_indexname) - @pytest.mark.django_db - def test_get_by_request(self, mock_elastic_clients): - for strategy_name, index_strategy in all_index_strategies().items(): + def test_get_by_request(self, patched_strategies): + for _strategy in patched_strategies: good_requests = [ - strategy_name, - index_strategy.current_indexname, - ''.join((index_strategy.indexname_prefix, 'foo')), + _strategy.strategy_name, + combine_indexname_parts(_strategy.strategy_name, _strategy.strategy_check), + combine_indexname_parts(_strategy.strategy_name, _strategy.strategy_check, 'foo'), ] for good_request in good_requests: - specific_index = get_index_for_sharev2_search(good_request) - assert isinstance(specific_index, index_strategy.SpecificIndex) - assert specific_index.index_strategy is index_strategy + _got_strategy = parse_strategy_name(good_request) + assert isinstance(_got_strategy, IndexStrategy) + assert _got_strategy == _strategy with pytest.raises(IndexStrategyError): - get_index_for_sharev2_search('bad-request') + get_strategy('bad-request') diff --git a/tests/share/search/index_strategy/test_trovesearch_denorm.py b/tests/share/search/index_strategy/test_trovesearch_denorm.py index 9a94928d3..2e71065a0 100644 --- a/tests/share/search/index_strategy/test_trovesearch_denorm.py +++ b/tests/share/search/index_strategy/test_trovesearch_denorm.py @@ -14,6 +14,7 @@ def setUp(self): # make the followup delete task eager def _fake_apply_async(*args, **kwargs): + self.index_strategy.pls_refresh() kwargs['countdown'] = 0 # don't wait task__delete_iri_value_scraps.apply(*args, **kwargs) self.enterContext( diff --git a/tests/share/search/test_admin_workflow.py b/tests/share/search/test_admin_workflow.py index 6a1ee9a03..3b9a3ab26 100644 --- a/tests/share/search/test_admin_workflow.py +++ b/tests/share/search/test_admin_workflow.py @@ -1,5 +1,3 @@ -from unittest import mock - from django.test.client import Client import pytest @@ -13,9 +11,11 @@ def test_admin_search_indexes_view(mock_elastic_clients): ShareUser.objects.create_superuser(**credentials) client = Client() client.login(**credentials) - with mock.patch('share.search.index_strategy.elastic8.elasticsearch8'): - resp = client.get('/admin/search-indexes') - for strategy_name in index_strategy.all_index_strategies(): - _index_strategy = index_strategy.get_index_strategy(strategy_name) - expected_header = f'
current index: {_index_strategy.current_indexname}
' - assert expected_header.encode() in resp.content + resp = client.get('/admin/search-indexes') + for strategy_name in index_strategy.all_strategy_names(): + _index_strategy = index_strategy.get_strategy(strategy_name) + expected_header = f'
' + assert expected_header.encode() in resp.content + for _index in _index_strategy.each_subnamed_index(): + expected_row = f'' + assert expected_row.encode() in resp.content diff --git a/tests/share/search/test_daemon.py b/tests/share/search/test_daemon.py index 126016010..6bca002af 100644 --- a/tests/share/search/test_daemon.py +++ b/tests/share/search/test_daemon.py @@ -36,7 +36,7 @@ def wait_for(event: threading.Event): class FakeIndexStrategyForSetupOnly: # for tests that don't need any message-handling - name = 'fakefake' + strategy_name = 'fakefake' supported_message_types = { messages.MessageType.INDEX_SUID, } @@ -45,7 +45,7 @@ class FakeIndexStrategyForSetupOnly: class FakeIndexStrategyWithBlockingEvents: - name = 'fakefake-with-events' + strategy_name = 'fakefake-with-events' supported_message_types = { messages.MessageType.INDEX_SUID, } @@ -118,7 +118,7 @@ class UnexpectedError(Exception): pass class FakeIndexStrategyWithUnexpectedError: - name = 'fakefake_with_error' + strategy_name = 'fakefake_with_error' supported_message_types = {messages.MessageType.INDEX_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' @@ -142,15 +142,15 @@ def pls_handle_messages_chunk(self, messages_chunk): def test_noncurrent_backfill(self): class FakeIndexStrategyWithNoncurrentBackfill: - name = 'fakefake-with-backfill' - current_indexname = 'not-what-you-expected' + CURRENT_STRATEGY_CHECKSUM = 'not-what-you-expected' + strategy_name = 'fakefake-with-backfill' supported_message_types = {messages.MessageType.BACKFILL_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' def get_or_create_backfill(self): class FakeIndexBackfill: - specific_indexname = 'what-you-expected' + strategy_checksum = 'what-you-expected' return FakeIndexBackfill() with _daemon_running( @@ -165,7 +165,7 @@ class FakeIndexBackfill: def test_message_error(self): class FakeIndexStrategyWithMessageError: - name = 'fakefake_with_msg_error' + strategy_name = 'fakefake_with_msg_error' supported_message_types = {messages.MessageType.INDEX_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' @@ -197,7 +197,7 @@ def pls_handle_messages_chunk(self, messages_chunk): @mock.patch('share.search.daemon._backoff_wait', wraps=_backoff_wait) def test_backoff(self, mock_backoff_wait): class FakeIndexStrategyWith429: - name = 'fakefake_with_429' + strategy_name = 'fakefake_with_429' supported_message_types = {messages.MessageType.INDEX_SUID} nonurgent_messagequeue_name = 'fake.nonurgent' urgent_messagequeue_name = 'fake.urgent' diff --git a/tests/share/search/test_index_backfill.py b/tests/share/search/test_index_backfill.py index 2ee24dd41..e3934de25 100644 --- a/tests/share/search/test_index_backfill.py +++ b/tests/share/search/test_index_backfill.py @@ -10,24 +10,24 @@ class TestIndexBackfillMethods: @pytest.fixture def fake_strategy(self): fake_strategy = mock.Mock() - fake_strategy.name = 'foo' - fake_strategy.for_current_index.return_value.indexname = 'foo_bar' + fake_strategy.strategy_name = 'foo' + fake_strategy.CURRENT_STRATEGY_CHECKSUM = 'foo_bar' return fake_strategy @pytest.fixture def index_backfill(self, fake_strategy): return IndexBackfill.objects.create( - index_strategy_name=fake_strategy.name, + index_strategy_name=fake_strategy.strategy_name, ) - def test_happypath(self, index_backfill, fake_strategy): + def test_happypath(self, index_backfill: IndexBackfill, fake_strategy): assert 
index_backfill.backfill_status == IndexBackfill.INITIAL - assert index_backfill.specific_indexname == '' + assert index_backfill.strategy_checksum == '' with mock.patch('share.tasks.schedule_index_backfill') as mock_task: index_backfill.pls_start(fake_strategy) mock_task.apply_async.assert_called_once_with((index_backfill.pk,)) assert index_backfill.backfill_status == IndexBackfill.WAITING - assert index_backfill.specific_indexname == 'foo_bar' + assert index_backfill.strategy_checksum == 'foo_bar' index_backfill.pls_note_scheduling_has_begun() assert index_backfill.backfill_status == IndexBackfill.SCHEDULING index_backfill.pls_note_scheduling_has_finished() diff --git a/trove/render/_simple_trovesearch.py b/trove/render/_simple_trovesearch.py index 6827c7918..6e6ba6eb1 100644 --- a/trove/render/_simple_trovesearch.py +++ b/trove/render/_simple_trovesearch.py @@ -92,7 +92,7 @@ def _get_card_content( _card_content = ( next(self.response_gathering.ask(TROVE.resourceMetadata, focus=card)) if graph is None - else next(graph.q(card, TROVE.resourceMetadata)) + else next(graph.q(card, TROVE.resourceMetadata), None) ) elif isinstance(card, frozenset): _card_content = next( diff --git a/trove/render/simple_json.py b/trove/render/simple_json.py index a962d8aae..10f896fff 100644 --- a/trove/render/simple_json.py +++ b/trove/render/simple_json.py @@ -41,7 +41,7 @@ def _stream_json(self, card_pages: typing.Iterator[dict[str, dict]]): for _card_iri, _osfmap_json in _page.items(): if _datum_prefix is not None: yield _datum_prefix - yield json.dumps(self._render_card_content(_card_iri, _osfmap_json)) + yield json.dumps(self._render_card_content(_card_iri, _osfmap_json), indent=2) _datum_prefix = ',' _nondata = json.dumps({ 'meta': self._render_meta(), diff --git a/trove/render/turtle.py b/trove/render/turtle.py index fb2d6e352..2b682178c 100644 --- a/trove/render/turtle.py +++ b/trove/render/turtle.py @@ -1,6 +1,7 @@ from primitive_metadata import primitive_rdf as rdf from trove.vocab.namespaces import TROVE +from trove.vocab.trove import trove_shorthand from ._base import BaseRenderer @@ -13,4 +14,5 @@ def simple_render_document(self) -> str: return rdf.turtle_from_tripledict( self.response_data.tripledict, focus=self.response_focus.single_iri(), + shorthand=trove_shorthand, ) diff --git a/trove/trovesearch/page_cursor.py b/trove/trovesearch/page_cursor.py index 33aa7f8f6..e5e5ee3ff 100644 --- a/trove/trovesearch/page_cursor.py +++ b/trove/trovesearch/page_cursor.py @@ -49,6 +49,10 @@ def bounded_page_size(self) -> int: else int(self.page_size) ) + @property + def is_complete_page(self) -> bool: + return self.bounded_page_size == self.page_size + def as_queryparam_value(self) -> str: _cls_key = _PageCursorTypes(type(self)).name _as_json = json.dumps([_cls_key, *dataclasses.astuple(self)]) @@ -82,10 +86,19 @@ class OffsetCursor(PageCursor): # total_count: int | float (from PageCursor) start_offset: int = 0 + @property + def bounded_page_size(self) -> int: + # overrides PageCursor + _bounded_page_size = super().bounded_page_size + if (_bounded_page_size < self.page_size < MAX_OFFSET): + _remaining = self.page_size - self.start_offset + _bounded_page_size = int(min(_bounded_page_size, _remaining)) + return _bounded_page_size + def is_valid(self) -> bool: _end_offset = ( self.total_count - if self.bounded_page_size == self.page_size + if self.is_complete_page else min(self.total_count, self.page_size) ) return ( diff --git a/trove/trovesearch/search_handle.py b/trove/trovesearch/search_handle.py index 
3278cf8c6..90f44265d 100644 --- a/trove/trovesearch/search_handle.py +++ b/trove/trovesearch/search_handle.py @@ -63,6 +63,8 @@ def __post_init__(self): return _page def get_next_streaming_handle(self) -> typing.Self | None: + if self.cursor.is_complete_page: + return None _next_cursor = self.cursor.next_cursor() if (_next_cursor is not None) and (self.handler is not None): assert isinstance(self.search_params, CardsearchParams) diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index efcb2b0d5..473629cc6 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -62,6 +62,7 @@ DEFAULT_PROPERTYPATH_SET: PropertypathSet = frozenset([ONE_GLOB_PROPERTYPATH]) DEFAULT_INCLUDES_BY_TYPE: collections.abc.Mapping[str, frozenset[Propertypath]] = freeze({ + TROVE.Indexcard: set(), TROVE.Cardsearch: { (TROVE.searchResultPage,), (TROVE.relatedPropertyList,), @@ -179,7 +180,7 @@ def _gather_include(cls, queryparams: QueryparamDict) -> PropertypathSet: _parse_propertypath_set(_include_value) for _, _include_value in _include_params )) - return DEFAULT_INCLUDES_BY_TYPE[cls.static_focus_type] + return DEFAULT_INCLUDES_BY_TYPE.get(cls.static_focus_type, frozenset()) @classmethod def _gather_attrpaths(cls, queryparams: QueryparamDict) -> collections.abc.Mapping[ @@ -536,6 +537,11 @@ def as_queryparam(self) -> tuple[str, str]: return (_name, _value) +@dataclasses.dataclass(frozen=True) +class IndexcardParams(BaseTroveParams): + static_focus_type = TROVE.Indexcard + + @dataclasses.dataclass(frozen=True) class CardsearchParams(BaseTroveParams): cardsearch_textsegment_set: frozenset[Textsegment] diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index ca4870e82..0ceed3ccb 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -1,7 +1,7 @@ import dataclasses import logging import urllib.parse -from typing import ClassVar +from typing import ClassVar, Any, Iterator, Iterable from primitive_metadata.primitive_rdf import ( Literal, @@ -27,6 +27,7 @@ ValuesearchHandle, ValuesearchResult, ) +from trove.util.iris import get_sufficiently_unique_iri from trove.vocab.namespaces import RDF, FOAF, DCTERMS, RDFS, DCAT, TROVE from trove.vocab.jsonapi import ( JSONAPI_LINK_OBJECT, @@ -77,8 +78,9 @@ class _TypedFocus(gather.Focus): ADDITIONAL_TYPE_IRIS: ClassVar[tuple[str, ...]] = () # (optional on subclasses) @classmethod - def new(cls, *, type_iris=(), **kwargs): + def new(cls, *args, type_iris=(), **kwargs): return super().new( + *args, # add type_iri to new Focus instance type_iris={ cls.TYPE_IRI, @@ -107,10 +109,15 @@ class ValuesearchFocus(_TypedFocus): search_handle: ValuesearchHandle = dataclasses.field(compare=False) +@dataclasses.dataclass(frozen=True) class IndexcardFocus(_TypedFocus): TYPE_IRI = TROVE.Indexcard ADDITIONAL_TYPE_IRIS = (DCAT.CatalogRecord,) + # additional dataclass fields + indexcard: trove_db.Indexcard = dataclasses.field(compare=False) + resourceMetadata: Any = dataclasses.field(compare=False, default=None, repr=False) + # TODO: per-field text search in rdf # @trovesearch_by_indexstrategy.gatherer(TROVE.cardSearchText) @@ -149,7 +156,10 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): _current_handle: CardsearchHandle | None = focus.search_handle while _current_handle is not None: _result_page = [] - _card_descriptions_by_iri = _load_card_descriptions(_current_handle.search_result_page, deriver_iri) + 
_card_foci = _load_cards_and_contents( + card_iris=(_result.card_iri for _result in _current_handle.search_result_page), + deriver_iri=deriver_iri, + ) for _result in _current_handle.search_result_page or (): _text_evidence_twoples = ( (TROVE.matchEvidence, frozenset(( @@ -165,12 +175,19 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): (TROVE.indexCard, _result.card_iri), *_text_evidence_twoples, ))) - try: - _card_description = _card_descriptions_by_iri[_result.card_iri] - except KeyError: - pass - else: - yield from rdf.iter_tripleset(_card_description.tripledict) + # hack around (current) limitations of primitive_metadata.gather + # (what with all these intermediate blank nodes and sequences): + # yield trove:resourceMetadata here (instead of another gatherer) + _card_focus = _card_foci[_result.card_iri] + _card_twoples = _minimal_indexcard_twoples( + focus_identifiers=[ + _identifier.as_iri() + for _identifier in _card_focus.indexcard.focus_identifier_set.all() + ], + resource_metadata=_card_focus.resourceMetadata, + ) + for _pred, _obj in _card_twoples: + yield (_result.card_iri, _pred, _obj) yield (TROVE.searchResultPage, sequence(_result_page)) _current_handle = _current_handle.get_next_streaming_handle() @@ -210,7 +227,7 @@ def gather_cardsearch_filter(focus, **kwargs): TROVE.searchResultPage, focustype_iris={TROVE.Valuesearch}, ) -def gather_valuesearch_page(focus: ValuesearchFocus, **kwargs): +def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): _result_page = [] _value_iris = { _result.value_iri @@ -218,35 +235,34 @@ def gather_valuesearch_page(focus: ValuesearchFocus, **kwargs): if _result.value_iri } if _value_iris: - _value_indexcards = ( - trove_db.Indexcard.objects - .filter( - focus_identifier_set__in=( - trove_db.ResourceIdentifier.objects - .queryset_for_iris(_value_iris) - ), - derived_indexcard_set__deriver_identifier__in=( - trove_db.ResourceIdentifier.objects - .queryset_for_iri(TROVE['derive/osfmap_json']) - # TODO: choose deriver by queryparam/gatherer-kwarg - ), - ) - .prefetch_related('focus_identifier_set') - ) + _card_foci = _load_cards_and_contents(value_iris=_value_iris, deriver_iri=deriver_iri) else: - _value_indexcards = [] + _card_foci = {} + _card_foci_by_suffuniq_iri: dict[str, IndexcardFocus] = { + _identifier.sufficiently_unique_iri: _focus + for _focus in _card_foci.values() + for _identifier in _focus.indexcard.focus_identifier_set.all() + } for _result in focus.search_handle.search_result_page or (): _indexcard_obj = None - if _result.value_iri in _value_iris: - for _indexcard in _value_indexcards: - if any( - _identifier.equivalent_to_iri(_result.value_iri) - for _identifier in _indexcard.focus_identifier_set.all() - ): - _indexcard_obj = _indexcard.get_iri() - yield (_indexcard_obj, RDF.type, TROVE.Indexcard) # so gather_card runs - # TODO: batch-load cards instead - break # found the indexcard + if _result.value_iri is not None: + _card_focus = _card_foci_by_suffuniq_iri.get( + get_sufficiently_unique_iri(_result.value_iri), + ) + if _card_focus is not None: + _indexcard_obj = _card_focus.indexcard.get_iri() + # hack around (current) limitations of primitive_metadata.gather + # (what with all these intermediate blank nodes and sequences): + # yield trove:resourceMetadata here (instead of another gatherer) + _card_twoples = _minimal_indexcard_twoples( + focus_identifiers=[ + _identifier.as_iri() + for _identifier in _card_focus.indexcard.focus_identifier_set.all() + ], + 
resource_metadata=_card_focus.resourceMetadata, + ) + for _pred, _obj in _card_twoples: + yield (_indexcard_obj, _pred, _obj) if _indexcard_obj is None: # no actual indexcard; put what we know in a blanknode-indexcard _indexcard_obj = _valuesearch_result_as_indexcard_blanknode(_result) @@ -282,51 +298,92 @@ def gather_valuesearch_count(focus, **kwargs): # raise trove_exceptions.IriMismatch(f'could not find indexcard iri in {focus.iris} (looking for {_indexcard_namespace})') -def _load_card_descriptions(search_result_page, deriver_iri) -> dict[str, rdf.RdfGraph]: - _card_iris = {_result.card_iri for _result in search_result_page} +@trovesearch_by_indexstrategy.gatherer(DCTERMS.issued, focustype_iris={TROVE.Indexcard}) +def gather_card_issued(focus: IndexcardFocus, **kwargs): + yield (DCTERMS.issued, focus.indexcard.created.date()) + + +@trovesearch_by_indexstrategy.gatherer(DCTERMS.modified, focustype_iris={TROVE.Indexcard}) +def gather_card_modified(focus: IndexcardFocus, **kwargs): + yield (DCTERMS.modified, focus.indexcard.modified.date()) + + +@trovesearch_by_indexstrategy.gatherer( + FOAF.primaryTopic, + TROVE.focusIdentifier, + focustype_iris={TROVE.Indexcard}, +) +def gather_primary_topic(focus: IndexcardFocus, **kwargs): + for _identifier in focus.indexcard.focus_identifier_set.all(): + _iri = _identifier.as_iri() + yield (FOAF.primaryTopic, _iri) + yield (TROVE.focusIdentifier, literal(_iri)) + + +@trovesearch_by_indexstrategy.gatherer( + TROVE.resourceMetadata, + focustype_iris={TROVE.Indexcard}, +) +def gather_card_contents(focus: IndexcardFocus, *, deriver_iri, **kwargs): + if focus.resourceMetadata is not None: + yield (TROVE.resourceMetadata, focus.resourceMetadata) + else: + _iri = focus.single_iri() + _loaded_foci = _load_cards_and_contents(card_iris=[_iri], deriver_iri=deriver_iri) + _loaded_metadata = _loaded_foci[_iri].resourceMetadata + yield (TROVE.resourceMetadata, _loaded_metadata) + + +def _load_cards_and_contents(*, card_iris=None, value_iris=None, deriver_iri) -> dict[str, IndexcardFocus]: return ( - _load_card_descriptions_nonderived(_card_iris) + _load_cards_and_extracted_rdf_contents(card_iris, value_iris) if deriver_iri is None - else _load_card_descriptions_derived(_card_iris, deriver_iri) + else _load_cards_and_derived_contents(card_iris, value_iris, deriver_iri) ) -def _load_card_descriptions_nonderived(card_iris) -> dict[str, rdf.RdfGraph]: +def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> dict[str, IndexcardFocus]: _card_namespace = trove_indexcard_namespace() - _indexcard_uuids = { - iri_minus_namespace(_card_iri, namespace=_card_namespace) - for _card_iri in card_iris - } _indexcard_rdf_qs = ( trove_db.LatestIndexcardRdf.objects - .filter(indexcard__uuid__in=_indexcard_uuids) .select_related('indexcard') .prefetch_related('indexcard__focus_identifier_set') ) - _by_card_iri = {} + if card_iris is not None: + _indexcard_uuids = { + iri_minus_namespace(_card_iri, namespace=_card_namespace) + for _card_iri in card_iris + } + _indexcard_rdf_qs = _indexcard_rdf_qs.filter(indexcard__uuid__in=_indexcard_uuids) + if value_iris is not None: + _indexcard_rdf_qs = _indexcard_rdf_qs.filter( + indexcard__focus_identifier_set__in=( + trove_db.ResourceIdentifier.objects + .queryset_for_iris(value_iris) + ), + ) + _card_foci: dict[str, IndexcardFocus] = {} for _indexcard_rdf in _indexcard_rdf_qs: - _indexcard_iri = _indexcard_rdf.indexcard.get_iri() + _card = _indexcard_rdf.indexcard + _card_iri = _card.get_iri() _quoted_graph = 
_indexcard_rdf.as_quoted_graph() _quoted_graph.add( - (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _indexcard_iri), + (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _card_iri), ) - _by_card_iri[_indexcard_iri] = _describe_indexcard_nonderived( - _indexcard_iri, _indexcard_rdf + _card_foci[_card_iri] = IndexcardFocus.new( + iris=_card_iri, + indexcard=_card, + resourceMetadata=_quoted_graph, ) - return _by_card_iri + return _card_foci -def _load_card_descriptions_derived(card_iris, deriver_iri: str) -> dict[str, rdf.RdfGraph]: +def _load_cards_and_derived_contents(card_iris, value_iris, deriver_iri: str) -> dict[str, IndexcardFocus]: _card_namespace = trove_indexcard_namespace() - _indexcard_uuids = { - iri_minus_namespace(_card_iri, namespace=_card_namespace) - for _card_iri in card_iris - } # include pre-formatted data from a DerivedIndexcard _derived_indexcard_qs = ( trove_db.DerivedIndexcard.objects .filter( - upriver_indexcard__uuid__in=_indexcard_uuids, deriver_identifier__in=( trove_db.ResourceIdentifier.objects .queryset_for_iri(deriver_iri) @@ -335,49 +392,30 @@ def _load_card_descriptions_derived(card_iris, deriver_iri: str) -> dict[str, rd .select_related('upriver_indexcard') .prefetch_related('upriver_indexcard__focus_identifier_set') ) - _by_card_iri = {} + if card_iris is not None: + _indexcard_uuids = { + iri_minus_namespace(_card_iri, namespace=_card_namespace) + for _card_iri in card_iris + } + _derived_indexcard_qs = _derived_indexcard_qs.filter( + upriver_indexcard__uuid__in=_indexcard_uuids, + ) + if value_iris is not None: + _derived_indexcard_qs = _derived_indexcard_qs.filter( + upriver_indexcard__focus_identifier_set__in=( + trove_db.ResourceIdentifier.objects + .queryset_for_iris(value_iris) + ), + ) + _card_foci: dict[str, IndexcardFocus] = {} for _derived in _derived_indexcard_qs: - _indexcard_iri = _derived.upriver_indexcard.get_iri() - _by_card_iri[_indexcard_iri] = _describe_indexcard_derived(_indexcard_iri, _derived) - return _by_card_iri - - -def _describe_indexcard_nonderived( - indexcard_iri: str, - indexcard_rdf: trove_db.IndexcardRdf, -) -> rdf.RdfGraph: - _card_description = rdf.RdfGraph({ - indexcard_iri: { - RDF.type: {TROVE.Indexcard, DCAT.CatalogRecord}, - TROVE.resourceMetadata: {indexcard_rdf.as_quoted_graph()}, - DCTERMS.issued: {indexcard_rdf.indexcard.created.date()}, - DCTERMS.modified: {indexcard_rdf.modified.date()}, - }, - }) - for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all(): - _iri = _identifier.as_iri() - _card_description.add((indexcard_iri, FOAF.primaryTopic, _iri)) - _card_description.add((indexcard_iri, TROVE.focusIdentifier, literal(_iri))) - return _card_description - - -def _describe_indexcard_derived( - indexcard_iri: str, - derived_indexcard: trove_db.DerivedIndexcard, -) -> rdf.RdfGraph: - _card_description = rdf.RdfGraph({ - indexcard_iri: { - RDF.type: {TROVE.Indexcard, DCAT.CatalogRecord}, - TROVE.resourceMetadata: {derived_indexcard.as_rdf_literal()}, - DCTERMS.issued: {derived_indexcard.upriver_indexcard.created.date()}, - DCTERMS.modified: {derived_indexcard.modified.date()}, - }, - }) - for _identifier in derived_indexcard.upriver_indexcard.focus_identifier_set.all(): - _iri = _identifier.as_iri() - _card_description.add((indexcard_iri, FOAF.primaryTopic, _iri)) - _card_description.add((indexcard_iri, TROVE.focusIdentifier, literal(_iri))) - return _card_description + _card_iri = _derived.upriver_indexcard.get_iri() + _card_foci[_card_iri] = IndexcardFocus.new( + iris=_card_iri, + 
indexcard=_derived.upriver_indexcard, + resourceMetadata=_derived.as_rdf_literal(), + ) + return _card_foci ### @@ -421,12 +459,25 @@ def _valuesearch_result_as_json(result: ValuesearchResult) -> Literal: ) +def _minimal_indexcard_twoples( + focus_identifiers: Iterable[str], + resource_metadata: rdf.Literal, +) -> Iterator[rdf.RdfTwople]: + yield (RDF.type, TROVE.Indexcard) + for _identifier in focus_identifiers: + yield (TROVE.focusIdentifier, ( + _identifier + if isinstance(_identifier, rdf.Literal) + else literal(_identifier) + )) + yield (TROVE.resourceMetadata, resource_metadata) + + def _valuesearch_result_as_indexcard_blanknode(result: ValuesearchResult) -> frozenset: - return blanknode({ - RDF.type: {TROVE.Indexcard}, - TROVE.focusIdentifier: {literal(result.value_iri or result.value_value)}, - TROVE.resourceMetadata: {_valuesearch_result_as_json(result)}, - }) + return frozenset(_minimal_indexcard_twoples( + focus_identifiers=[literal(result.value_iri or result.value_value)], + resource_metadata=_valuesearch_result_as_json(result), + )) def _osfmap_json(tripledict, focus_iri): diff --git a/trove/views/_gather_ask.py b/trove/views/_gather_ask.py new file mode 100644 index 000000000..63bae1098 --- /dev/null +++ b/trove/views/_gather_ask.py @@ -0,0 +1,21 @@ +from primitive_metadata import gather + +from trove.trovesearch.search_params import BaseTroveParams + + +def ask_gathering_from_params( + gathering: gather.Gathering, + params: BaseTroveParams, + start_focus: gather.Focus, +): + # fill the gathering's cache with included related resources... + gathering.ask(params.included_relations, focus=start_focus) + # ...and add requested attributes on the focus and related resources + for _focus in gathering.cache.focus_set: + for _focustype in _focus.type_iris: + try: + _attrpaths = params.attrpaths_by_type[_focustype] + except KeyError: + pass # no attribute fields for this type + else: + gathering.ask(_attrpaths, focus=_focus) diff --git a/trove/views/indexcard.py b/trove/views/indexcard.py index a685428d8..208a15f85 100644 --- a/trove/views/indexcard.py +++ b/trove/views/indexcard.py @@ -1,14 +1,18 @@ from django.views import View -from primitive_metadata import gather from trove import exceptions as trove_exceptions +from trove import models as trove_db from trove.render import ( DEFAULT_RENDERER_TYPE, get_renderer_type, ) -from trove.trovesearch.trovesearch_gathering import trovesearch_by_indexstrategy -from trove.vocab.namespaces import TROVE +from trove.trovesearch.search_params import IndexcardParams +from trove.trovesearch.trovesearch_gathering import ( + trovesearch_by_indexstrategy, + IndexcardFocus, +) from trove.vocab.trove import trove_indexcard_iri +from ._gather_ask import ask_gathering_from_params from ._responder import ( make_http_error_response, make_http_response, @@ -19,18 +23,17 @@ class IndexcardView(View): def get(self, request, indexcard_uuid): try: _renderer_type = get_renderer_type(request) - _search_gathering = trovesearch_by_indexstrategy.new_gathering({ - # TODO (gather): allow omitting kwargs that go unused - 'search_params': None, - 'specific_index': None, + _gathering = trovesearch_by_indexstrategy.new_gathering({ 'deriver_iri': _renderer_type.INDEXCARD_DERIVER_IRI, }) _indexcard_iri = trove_indexcard_iri(indexcard_uuid) - _search_gathering.ask( - {}, # TODO: build from `include`/`fields` - focus=gather.Focus.new(_indexcard_iri, TROVE.Indexcard), + _params = IndexcardParams.from_querystring(request.META['QUERY_STRING']) + _focus = IndexcardFocus.new( + 
iris=_indexcard_iri, + indexcard=trove_db.Indexcard.objects.get_for_iri(_indexcard_iri), ) - _renderer = _renderer_type(_indexcard_iri, _search_gathering.leaf_a_record()) + ask_gathering_from_params(_gathering, _params, _focus) + _renderer = _renderer_type(_focus, _gathering) return make_http_response( content_rendering=_renderer.render_document(), http_request=request, @@ -38,10 +41,10 @@ def get(self, request, indexcard_uuid): except trove_exceptions.CannotRenderMediatype as _error: return make_http_error_response( error=_error, - renderer=DEFAULT_RENDERER_TYPE(_indexcard_iri), + renderer_type=DEFAULT_RENDERER_TYPE, ) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, - renderer=_renderer_type(_indexcard_iri), + renderer_type=_renderer_type, ) diff --git a/trove/views/search.py b/trove/views/search.py index fd4043259..d164b36e4 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -23,6 +23,7 @@ DEFAULT_RENDERER_TYPE, get_renderer_type, ) +from ._gather_ask import ask_gathering_from_params from ._responder import ( make_http_error_response, make_http_response, @@ -52,14 +53,14 @@ def get(self, request): _url = request.build_absolute_uri() _search_gathering = self._start_gathering(renderer_type=_renderer_type) _search_params = self._parse_search_params(request) - _specific_index = index_strategy.get_index_for_trovesearch(_search_params) + _strategy = index_strategy.get_strategy_for_trovesearch(_search_params) _focus = self.focus_type.new( iris=_url, search_params=_search_params, - search_handle=self.get_search_handle(_specific_index, _search_params), + search_handle=self.get_search_handle(_strategy, _search_params), ) if _renderer_type.PASSIVE_RENDER: - self._fill_gathering(_search_gathering, _search_params, _focus) + ask_gathering_from_params(_search_gathering, _search_params, _focus) # take gathered data into a response _renderer = _renderer_type(_focus, _search_gathering) return make_http_response( @@ -83,30 +84,17 @@ def _start_gathering(self, renderer_type) -> gather.Gathering: 'deriver_iri': renderer_type.INDEXCARD_DERIVER_IRI, }) - def _fill_gathering(self, search_gathering, search_params, start_focus): - # fill the gathering's cache with included related resources... 
- search_gathering.ask(search_params.included_relations, focus=start_focus) - # ...and add requested attributes on the focus and related resources - for _focus in search_gathering.cache.focus_set: - for _focustype in _focus.type_iris: - try: - _attrpaths = search_params.attrpaths_by_type[_focustype] - except KeyError: - pass # no attribute fields for this type - else: - search_gathering.ask(_attrpaths, focus=_focus) - - def get_search_handle(self, specific_index, search_params) -> BasicSearchHandle: - return self._get_wrapped_handler(specific_index)(search_params) + def get_search_handle(self, strategy, search_params) -> BasicSearchHandle: + return self._get_wrapped_handler(strategy)(search_params) def get_search_handler( self, - specific_index: index_strategy.IndexStrategy.SpecificIndex, + strategy: index_strategy.IndexStrategy, ) -> _TrovesearchHandler: raise NotImplementedError - def _get_wrapped_handler(self, specific_index): - _raw_handler = self.get_search_handler(specific_index) + def _get_wrapped_handler(self, strategy: index_strategy.IndexStrategy): + _raw_handler = self.get_search_handler(strategy) def _wrapped_handler(search_params): _handle = _raw_handler(search_params) @@ -119,13 +107,13 @@ class CardsearchView(_BaseTrovesearchView): focus_type = CardsearchFocus params_dataclass = CardsearchParams - def get_search_handler(self, specific_index): - return specific_index.pls_handle_cardsearch + def get_search_handler(self, strategy): + return strategy.pls_handle_cardsearch class ValuesearchView(_BaseTrovesearchView): focus_type = ValuesearchFocus params_dataclass = ValuesearchParams - def get_search_handler(self, specific_index): - return specific_index.pls_handle_valuesearch + def get_search_handler(self, strategy): + return strategy.pls_handle_valuesearch diff --git a/trove/views/vocab.py b/trove/views/vocab.py index dcab1c373..62982f34e 100644 --- a/trove/views/vocab.py +++ b/trove/views/vocab.py @@ -31,10 +31,10 @@ def get(self, request, vocab_term): except trove_exceptions.CannotRenderMediatype as _error: return make_http_error_response( error=_error, - renderer=DEFAULT_RENDERER_TYPE(_iri), + renderer_type=DEFAULT_RENDERER_TYPE, ) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, - renderer=_renderer_type(_iri), + renderer_type=_renderer_type, )
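One paging subtlety worth spelling out from the trove/trovesearch/page_cursor.py hunk above: `OffsetCursor.bounded_page_size` now shrinks the final backend query to just the remainder past `start_offset` when the requested `page_size` exceeds the per-query bound, and the new `is_complete_page` property lets `get_next_streaming_handle` stop as soon as one bounded page covers the whole request. A runnable sketch under assumed constants (`MAX_PAGE_SIZE` stands in for however the base property bounds the page size; the module's actual `MAX_OFFSET` value is not shown in the diff):

```python
import dataclasses

MAX_PAGE_SIZE = 13   # assumed bound, for illustration only
MAX_OFFSET = 9997    # assumed value of the module's MAX_OFFSET


@dataclasses.dataclass
class OffsetCursorSketch:
    page_size: int
    start_offset: int = 0

    @property
    def bounded_page_size(self) -> int:
        _bounded = min(self.page_size, MAX_PAGE_SIZE)  # the base-class bound
        # when the caller asked for more than one bounded page, the next
        # query only needs the remainder beyond the current offset
        if _bounded < self.page_size < MAX_OFFSET:
            _bounded = int(min(_bounded, self.page_size - self.start_offset))
        return _bounded


# a small request is served whole...
assert OffsetCursorSketch(page_size=7).bounded_page_size == 7
# ...a large one is served in bounded pages, with a short final page
assert OffsetCursorSketch(page_size=100, start_offset=0).bounded_page_size == 13
assert OffsetCursorSketch(page_size=100, start_offset=91).bounded_page_size == 9
```

With that in place, `is_complete_page` (bounded size equals requested size) is exactly the "nothing left to stream" condition used by `SearchHandle.get_next_streaming_handle`.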