Skip to content

Commit 1f59691

Browse files
committed
fixup! [A] Workaround: Manifest content hash computation times out (#6123)
1 parent b2b3369 commit 1f59691

File tree

3 files changed

+50
-54
lines changed

3 files changed

+50
-54
lines changed

scripts/reindex.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -45,21 +45,15 @@
4545
parser.add_argument('--local',
4646
default=False,
4747
action='store_true',
48-
help=(
49-
'' if config.enable_bundle_notifications else '**DISABLED** '
50-
) + (
51-
'Do not offload the listing of subgraphs to the indexer Lambda function. When '
52-
'this option is used, this script queries the repository without partitioning, '
53-
'and the indexer notification endpoint is invoked for each subgraph '
54-
'individually and concurrently using worker threads. This is magnitudes slower '
55-
'than remote (i.e. partitioned) indexing. If this option is not used (the '
56-
'default), the set of subgraphs matching the query is partitioned using the '
57-
'partition prefix length configured for each of the catalog sources being '
58-
'reindexed. Each query partition is processed independently and remotely by '
59-
'the indexer lambda. The index Lambda function queries the repository for each '
60-
'partition and queues a notification for each matching subgraph in the '
61-
'partition.'
62-
)
48+
help='Do not offload the listing of subgraphs to the indexer Lambda function. When this option is '
49+
'used, this script queries the repository without partitioning, and the indexer notification '
50+
'endpoint is invoked for each subgraph individually and concurrently using worker threads. '
51+
'This is magnitudes slower than remote (i.e. partitioned) indexing. If this option is not '
52+
'used (the default), the set of subgraphs matching the query is partitioned using the '
53+
'partition prefix length configured for each of the catalog sources being reindexed. Each '
54+
'query partition is processed independently and remotely by the indexer lambda. The index '
55+
'Lambda function queries the repository for each partition and queues a notification for each '
56+
'matching subgraph in the partition.')
6357
)
6458
parser.add_argument('--catalogs',
6559
nargs='+',
@@ -140,10 +134,6 @@ def main(argv: list[str]):
140134
deindex = args.deindex or (args.delete and not every_source)
141135
delete = args.delete and every_source
142136

143-
if args.local and not config.enable_bundle_notifications:
144-
parser.error('Local reindexing is not available while bundle '
145-
'notifications are disabled.')
146-
147137
if every_source:
148138
if deindex:
149139
parser.error('--deindex is incompatible with source `*`. Use --delete instead.')

src/azul/service/manifest_service.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,9 @@ def manifest_key(self) -> ManifestKey:
985985
filter_string = repr(sort_frozen(freeze(self.filters.explicit)))
986986
# If incremental index changes are disabled, we don't need to worry
987987
# about individual bundles, only sources.
988-
content_hash = str(self.manifest_hash(config.enable_bundle_notifications))
988+
content_hash = str(
989+
self.manifest_hash(by_bundle=config.enable_bundle_notifications)
990+
)
989991
catalog = self.catalog
990992
format = self.format()
991993
manifest_hash_input = [
@@ -1150,29 +1152,38 @@ def _azul_file_url(self,
11501152
**args))
11511153

11521154
@cache
1153-
def manifest_hash(self, by_bundle: bool) -> int:
1155+
def manifest_hash(self, *, by_bundle: bool) -> int:
11541156
"""
1155-
Return a content hash for the manifest.
1156-
1157-
If `by_bundle` is True, the hash is computed from the fully-qualified
1158-
identifiers of all bundles containing files that match the current
1159-
filter. The return value approximates a hash of the content of the
1160-
manifest because a change of the file data requires a change to the file
1161-
metadata which requires a new bundle or bundle version.
1162-
1163-
If `by_bundle` is False, the hash is computed from the identifiers of
1164-
the sources from which projects/datasets containing files matching the
1165-
current filters were indexed. It's worth noting that a filter may match
1166-
a project/dataset but none of the project's files. For example, if a
1167-
project contains only files derived from either mouse brains or lion
1168-
hearts, the project will match the filter `species=lion and
1169-
organ=brain`, but none of its files will. If such a project/dataset is
1170-
added/removed to/from the index, the manifest hash returned for a given
1171-
filter will be different even though the contents of the manifest hasn't
1172-
changed, as no matching files were added or removed.
1173-
1174-
So while the hash computed from the sources is less sensitive than the
1175-
one computed from the bundles, it can be computed much more quickly.
1157+
Return a hash of the input this generator builds the manifest from. The
1158+
input is the set of ES documents from the files index. For two generator
1159+
instances g1 and g2 created at two different points in time, and any
1160+
boolean value b, if
1161+
1162+
g1.manifest_hash(by_bundle=b) == g2.manifest_hash(by_bundle=b)
1163+
1164+
then there is a high probability that the manifests generated by g1 and
1165+
g2 contain the same set of entries. This test can be used in deciding
1166+
whether g2 can reuse g1's manifest, thereby avoiding an expensive
1167+
operation. A false positive occurs when the hashes are equal but the
1168+
inputs differ. A false negative occurs when the hashes differ, but the
1169+
inputs are equal. False negatives are less problematic because they only
1170+
lead to redundant computations: the manifest is regenerated when it
1171+
could have been reused. False positives are problematic because they
1172+
lead to a manifest being reused erroneously, yielding an incorrect
1173+
manifest that is inconsistent with the input.
1174+
1175+
If ``by_bundle`` is True, the hash is computed from the fully-qualified
1176+
identifiers (FQID) of all bundles (subgraphs) containing files that
1177+
match the current filter. The rate of false negatives is low because a
1178+
change to any file entity requires a new bundle or a new bundle version,
1179+
both of which have different FQIDs, leading to a different hash. This
1180+
mode is slower and should be used if the index is changing or is likely
1181+
to change due to the incremental incorporation of bundles.
1182+
1183+
If ``by_bundle`` is False, the hash is instead computed from the set of
1184+
identifiers of the sources that contributed files matching the current
1185+
filters. This mode should *not* be used if the index is changing or is
1186+
likely to change due to the incremental incorporation of bundles.
11761187
"""
11771188
log.debug('Computing content hash for manifest from %s using %r ...',
11781189
'bundles' if by_bundle else 'sources', self.filters)

test/service/test_manifest.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@
126126
MutableJSON,
127127
MutableJSONs,
128128
)
129+
from azul_test_case import (
130+
patch_config,
131+
)
129132
from indexer import (
130133
AnvilCannedBundleTestCase,
131134
CannedFileTestCase,
@@ -902,15 +905,11 @@ def test_manifest_filter_validation(self):
902905
response = requests.put(str(url))
903906
self.assertEqual(400, response.status_code, response.content)
904907

905-
@patch.object(type(config),
906-
'enable_bundle_notifications',
907-
new=PropertyMock(return_value=True))
908+
@patch_config('enable_bundle_notifications', True)
908909
def test_content_disposition_header_with_notifications_enabled(self) -> None:
909910
self._test_content_disposition_header()
910911

911-
@patch.object(type(config),
912-
'enable_bundle_notifications',
913-
new=PropertyMock(return_value=False))
912+
@patch_config('enable_bundle_notifications', False)
914913
def test_content_disposition_header_with_notifications_disabled(self) -> None:
915914
self._test_content_disposition_header()
916915

@@ -1077,15 +1076,11 @@ def test_compact_metadata_cache(self, _time_until_object_expires: MagicMock):
10771076
self.assertEqual([2, 2], list(map(len, file_names.values())))
10781077
self.assertEqual([1, 1], list(map(len, map(set, file_names.values()))))
10791078

1080-
@patch.object(type(config),
1081-
'enable_bundle_notifications',
1082-
new=PropertyMock(return_value=True))
1079+
@patch_config('enable_bundle_notifications', True)
10831080
def test_hash_validity_with_notifications_enabled(self) -> None:
10841081
self._test_hash_validity()
10851082

1086-
@patch.object(type(config),
1087-
'enable_bundle_notifications',
1088-
new=PropertyMock(return_value=False))
1083+
@patch_config('enable_bundle_notifications', False)
10891084
def test_hash_validity_with_notifications_disabled(self) -> None:
10901085
self._test_hash_validity()
10911086

0 commit comments

Comments
 (0)