11from datetime import datetime
2- from gzip import open as gzip_open
2+ from gzip import open as gzip_open , BadGzipFile
33from typing import NamedTuple , Type
44from pathlib import Path
5+ from warnings import warn
56
67from elasticsearch_dsl .query import Exists , Query , Term
78from expiringdict import ExpiringDict
89from flask import render_template , Response , make_response
10+ from tqdm import tqdm
911from warcio import ArchiveIterator
1012
1113from archive_query_log .config import Config
@@ -78,6 +80,7 @@ def _get_statistics(
7880 key = (document , index , last_modified_field )
7981 if key in _statistics_cache :
8082 return _statistics_cache [key ]
83+ print (f"Get statistics: { name } " )
8184
8285 config .es .client .indices .refresh (index = index )
8386 stats = config .es .client .indices .stats (index = index )
@@ -129,6 +132,7 @@ def _get_warc_cache_statistics(
129132 key = (cache_path .resolve (), temporary )
130133 if key in _warc_cache_statistics_cache :
131134 return _warc_cache_statistics_cache [key ]
135+ print (f"Get statistics: { name } " )
132136
133137 file_paths : list [Path ]
134138 if temporary :
@@ -144,15 +148,20 @@ def _get_warc_cache_statistics(
144148 if len (file_paths ) > 0 :
145149 disk_size_bytes = sum (file_path .stat ().st_size for file_path in file_paths )
146150 last_modified = max (file_path .stat ().st_mtime for file_path in file_paths )
147- for file_path in file_paths :
148- with gzip_open (file_path , mode = "rb" ) as gzip_file :
149- iterator = ArchiveIterator (
150- fileobj = gzip_file ,
151- no_record_parse = True ,
152- )
153- warc_count += sum (
154- 1 for record in iterator if record .rec_type == "request"
155- )
151+ for file_path in tqdm (file_paths , desc = "Counting WARC records" , unit = "file" ):
152+ try :
153+ with gzip_open (file_path , mode = "rb" ) as gzip_file :
154+ iterator = ArchiveIterator (
155+ fileobj = gzip_file ,
156+ no_record_parse = True ,
157+ )
158+ warc_count += sum (
159+ 1 for record in iterator if record .rec_type == "request"
160+ )
161+ except BadGzipFile :
162+ warn (f"Invalid gzip file: { file_path } " )
163+ # Ignore invalid gzip files.
164+ pass
156165
157166 statistics = Statistics (
158167 name = name ,
@@ -187,6 +196,7 @@ def _get_processed_progress(
187196 key = (document , index , repr (filter_query ), status_field )
188197 if key in _progress_cache :
189198 return _progress_cache [key ]
199+ print (f"Get progress: { input_name } → { output_name } " )
190200
191201 config .es .client .indices .refresh (index = index )
192202 search = document .search (using = config .es .client , index = index )
0 commit comments