Skip to content

Commit c961d06

Browse files
Improve monitoring
1 parent a7cdade commit c961d06

File tree

1 file changed

+15
-6
lines changed
  • archive_query_log/monitoring

1 file changed

+15
-6
lines changed

archive_query_log/monitoring/home.py

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ class Progress(NamedTuple):
5454
DocumentType = Type[BaseDocument]
5555

5656
_statistics_cache: dict[
57-
tuple[DocumentType, str, str],
57+
tuple[DocumentType, str, str | None, str | None, str, str],
5858
Statistics,
5959
] = ExpiringDict(
6060
max_len=100,
@@ -78,16 +78,19 @@ def _get_statistics(
7878
description: str,
7979
index: str,
8080
document: DocumentType,
81+
filter_field: str | None = None,
8182
status_field: str | None = None,
8283
last_modified_field: str = "last_modified",
8384
) -> Statistics:
84-
key = (document, index, last_modified_field)
85+
key = (document, index, filter_field, status_field, last_modified_field, name)
8586
if key in _statistics_cache:
8687
return _statistics_cache[key]
8788
print(f"Get statistics: {name}")
8889

8990
search = document.search(using=config.es.client, index=index)
9091
search = search.filter(Exists(field=last_modified_field))
92+
if filter_field is not None:
93+
search = search.filter(Exists(field=filter_field))
9194
if status_field is not None:
9295
search = search.filter(Term(**{status_field: False}))
9396
total = search.count()
@@ -290,23 +293,23 @@ def home(config: Config) -> str | Response:
290293
_get_statistics(
291294
config=config,
292295
name="Sources",
293-
description="The cross product of all archives and "
294-
"the provider's domains and URL prefixes.",
296+
description="Cross product of archives and "
297+
"provider domains and URL prefixes.",
295298
document=Source,
296299
index=config.es.index_sources,
297300
),
298301
_get_statistics(
299302
config=config,
300303
name="Captures",
301-
description="Captures matching from the archives "
304+
description="Captures from the archives "
302305
"that match domain and URL prefixes.",
303306
document=Capture,
304307
index=config.es.index_captures,
305308
),
306309
_get_statistics(
307310
config=config,
308311
name="SERPs",
309-
description="Search engine result pages that have been "
312+
description="Search engine result pages "
310313
"identified among the captures.",
311314
document=Serp,
312315
index=config.es.index_serps,
@@ -324,6 +327,7 @@ def home(config: Config) -> str | Response:
324327
description="SERPs for which the page has been parsed from the URL.",
325328
document=Serp,
326329
index=config.es.index_serps,
330+
filter_field="url_page",
327331
status_field="url_page_parser.should_parse",
328332
last_modified_field="url_page_parser.last_parsed",
329333
),
@@ -333,6 +337,7 @@ def home(config: Config) -> str | Response:
333337
description="SERPs for which the offset has been parsed from the URL.",
334338
document=Serp,
335339
index=config.es.index_serps,
340+
filter_field="url_offset",
336341
status_field="url_offset_parser.should_parse",
337342
last_modified_field="url_offset_parser.last_parsed",
338343
),
@@ -342,6 +347,7 @@ def home(config: Config) -> str | Response:
342347
description="SERPs for which the WARC has been downloaded.",
343348
document=Serp,
344349
index=config.es.index_serps,
350+
filter_field="warc_location",
345351
status_field="warc_downloader.should_download",
346352
last_modified_field="warc_downloader.last_downloaded",
347353
),
@@ -351,6 +357,7 @@ def home(config: Config) -> str | Response:
351357
description="SERPs for which the query has been parsed from the WARC.",
352358
document=Serp,
353359
index=config.es.index_serps,
360+
filter_field="warc_query",
354361
status_field="warc_query_parser.should_parse",
355362
last_modified_field="warc_query_parser.last_parsed",
356363
),
@@ -360,6 +367,7 @@ def home(config: Config) -> str | Response:
360367
description="SERPs for which the snippets have been parsed from the WARC.",
361368
document=Serp,
362369
index=config.es.index_serps,
370+
filter_field="warc_snippets",
363371
status_field="warc_snippets_parser.should_parse",
364372
last_modified_field="warc_snippets_parser.last_parsed",
365373
),
@@ -394,6 +402,7 @@ def home(config: Config) -> str | Response:
394402
# description="Search results for which the WARC has been downloaded.",
395403
# document=Result,
396404
# index=config.es.index_results,
405+
# filter_field="warc_location",
397406
# status_field="warc_downloader.should_download",
398407
# last_modified_field="warc_downloader.last_downloaded",
399408
# ),

0 commit comments

Comments
 (0)