@@ -54,7 +54,7 @@ class Progress(NamedTuple):
54
54
DocumentType = Type [BaseDocument ]
55
55
56
56
_statistics_cache : dict [
57
- tuple [DocumentType , str , str ],
57
+ tuple [DocumentType , str , str | None , str | None , str , str ],
58
58
Statistics ,
59
59
] = ExpiringDict (
60
60
max_len = 100 ,
@@ -78,16 +78,19 @@ def _get_statistics(
78
78
description : str ,
79
79
index : str ,
80
80
document : DocumentType ,
81
+ filter_field : str | None = None ,
81
82
status_field : str | None = None ,
82
83
last_modified_field : str = "last_modified" ,
83
84
) -> Statistics :
84
- key = (document , index , last_modified_field )
85
+ key = (document , index , filter_field , status_field , last_modified_field , name )
85
86
if key in _statistics_cache :
86
87
return _statistics_cache [key ]
87
88
print (f"Get statistics: { name } " )
88
89
89
90
search = document .search (using = config .es .client , index = index )
90
91
search = search .filter (Exists (field = last_modified_field ))
92
+ if filter_field is not None :
93
+ search = search .filter (Exists (field = filter_field ))
91
94
if status_field is not None :
92
95
search = search .filter (Term (** {status_field : False }))
93
96
total = search .count ()
@@ -290,23 +293,23 @@ def home(config: Config) -> str | Response:
290
293
_get_statistics (
291
294
config = config ,
292
295
name = "Sources" ,
293
- description = "The cross product of all archives and "
294
- "the provider's domains and URL prefixes." ,
296
+ description = "Cross product of archives and "
297
+ "provider domains and URL prefixes." ,
295
298
document = Source ,
296
299
index = config .es .index_sources ,
297
300
),
298
301
_get_statistics (
299
302
config = config ,
300
303
name = "Captures" ,
301
- description = "Captures matching from the archives "
304
+ description = "Captures from the archives "
302
305
"that match domain and URL prefixes." ,
303
306
document = Capture ,
304
307
index = config .es .index_captures ,
305
308
),
306
309
_get_statistics (
307
310
config = config ,
308
311
name = "SERPs" ,
309
- description = "Search engine result pages that have been "
312
+ description = "Search engine result pages "
310
313
"identified among the captures." ,
311
314
document = Serp ,
312
315
index = config .es .index_serps ,
@@ -324,6 +327,7 @@ def home(config: Config) -> str | Response:
324
327
description = "SERPs for which the page has been parsed from the URL." ,
325
328
document = Serp ,
326
329
index = config .es .index_serps ,
330
+ filter_field = "url_page" ,
327
331
status_field = "url_page_parser.should_parse" ,
328
332
last_modified_field = "url_page_parser.last_parsed" ,
329
333
),
@@ -333,6 +337,7 @@ def home(config: Config) -> str | Response:
333
337
description = "SERPs for which the offset has been parsed from the URL." ,
334
338
document = Serp ,
335
339
index = config .es .index_serps ,
340
+ filter_field = "url_offset" ,
336
341
status_field = "url_offset_parser.should_parse" ,
337
342
last_modified_field = "url_offset_parser.last_parsed" ,
338
343
),
@@ -342,6 +347,7 @@ def home(config: Config) -> str | Response:
342
347
description = "SERPs for which the WARC has been downloaded." ,
343
348
document = Serp ,
344
349
index = config .es .index_serps ,
350
+ filter_field = "warc_location" ,
345
351
status_field = "warc_downloader.should_download" ,
346
352
last_modified_field = "warc_downloader.last_downloaded" ,
347
353
),
@@ -351,6 +357,7 @@ def home(config: Config) -> str | Response:
351
357
description = "SERPs for which the query has been parsed from the WARC." ,
352
358
document = Serp ,
353
359
index = config .es .index_serps ,
360
+ filter_field = "warc_query" ,
354
361
status_field = "warc_query_parser.should_parse" ,
355
362
last_modified_field = "warc_query_parser.last_parsed" ,
356
363
),
@@ -360,6 +367,7 @@ def home(config: Config) -> str | Response:
360
367
description = "SERPs for which the snippets have been parsed from the WARC." ,
361
368
document = Serp ,
362
369
index = config .es .index_serps ,
370
+ filter_field = "warc_snippets" ,
363
371
status_field = "warc_snippets_parser.should_parse" ,
364
372
last_modified_field = "warc_snippets_parser.last_parsed" ,
365
373
),
@@ -394,6 +402,7 @@ def home(config: Config) -> str | Response:
394
402
# description="Search results for which the WARC has been downloaded.",
395
403
# document=Result,
396
404
# index=config.es.index_results,
405
+ # filter_field="warc_location",
397
406
# status_field="warc_downloader.should_download",
398
407
# last_modified_field="warc_downloader.last_downloaded",
399
408
# ),
0 commit comments