Skip to content

Commit e42efd7

Browse files
Add some logging to the monitoring
1 parent 7b0b2b6 commit e42efd7

File tree

1 file changed

+20
-10
lines changed
  • archive_query_log/monitoring

1 file changed

+20
-10
lines changed

archive_query_log/monitoring/home.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from datetime import datetime
2-
from gzip import open as gzip_open
2+
from gzip import open as gzip_open, BadGzipFile
33
from typing import NamedTuple, Type
44
from pathlib import Path
5+
from warnings import warn
56

67
from elasticsearch_dsl.query import Exists, Query, Term
78
from expiringdict import ExpiringDict
89
from flask import render_template, Response, make_response
10+
from tqdm import tqdm
911
from warcio import ArchiveIterator
1012

1113
from archive_query_log.config import Config
@@ -78,6 +80,7 @@ def _get_statistics(
7880
key = (document, index, last_modified_field)
7981
if key in _statistics_cache:
8082
return _statistics_cache[key]
83+
print(f"Get statistics: {name}")
8184

8285
config.es.client.indices.refresh(index=index)
8386
stats = config.es.client.indices.stats(index=index)
@@ -129,6 +132,7 @@ def _get_warc_cache_statistics(
129132
key = (cache_path.resolve(), temporary)
130133
if key in _warc_cache_statistics_cache:
131134
return _warc_cache_statistics_cache[key]
135+
print(f"Get statistics: {name}")
132136

133137
file_paths: list[Path]
134138
if temporary:
@@ -144,15 +148,20 @@ def _get_warc_cache_statistics(
144148
if len(file_paths) > 0:
145149
disk_size_bytes = sum(file_path.stat().st_size for file_path in file_paths)
146150
last_modified = max(file_path.stat().st_mtime for file_path in file_paths)
147-
for file_path in file_paths:
148-
with gzip_open(file_path, mode="rb") as gzip_file:
149-
iterator = ArchiveIterator(
150-
fileobj=gzip_file,
151-
no_record_parse=True,
152-
)
153-
warc_count += sum(
154-
1 for record in iterator if record.rec_type == "request"
155-
)
151+
for file_path in tqdm(file_paths, desc="Counting WARC records", unit="file"):
152+
try:
153+
with gzip_open(file_path, mode="rb") as gzip_file:
154+
iterator = ArchiveIterator(
155+
fileobj=gzip_file,
156+
no_record_parse=True,
157+
)
158+
warc_count += sum(
159+
1 for record in iterator if record.rec_type == "request"
160+
)
161+
except BadGzipFile:
162+
warn(f"Invalid gzip file: {file_path}")
163+
# Ignore invalid gzip files.
164+
pass
156165

157166
statistics = Statistics(
158167
name=name,
@@ -187,6 +196,7 @@ def _get_processed_progress(
187196
key = (document, index, repr(filter_query), status_field)
188197
if key in _progress_cache:
189198
return _progress_cache[key]
199+
print(f"Get progress: {input_name}{output_name}")
190200

191201
config.es.client.indices.refresh(index=index)
192202
search = document.search(using=config.es.client, index=index)

0 commit comments

Comments (0)