You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
# FIXME: Counting WARC records takes too long at the moment due to the large number of files. Replace this again with record counting once the number of files is reduced.
155
-
# try:
156
-
# with gzip_open(file_path, mode="rb") as gzip_file:
157
-
# iterator = ArchiveIterator(
158
-
# fileobj=gzip_file,
159
-
# no_record_parse=True,
160
-
# )
161
-
# warc_count += sum(
162
-
# 1 for record in iterator if record.rec_type == "request"
163
-
# )
164
-
# except BadGzipFile:
165
-
# warn(f"Invalid gzip file: {file_path}")
166
-
# # Ignore invalid gzip files.
167
-
# pass
168
-
warc_count+=1
149
+
try:
150
+
disk_size_bytes+=file_path.stat().st_size
151
+
last_modified=max(
152
+
last_modified,
153
+
file_path.stat().st_mtime,
154
+
)
155
+
# FIXME: Counting WARC records takes too long at the moment due to the large number of files. Replace this again with record counting once the number of files is reduced.
156
+
# try:
157
+
# with gzip_open(file_path, mode="rb") as gzip_file:
158
+
# iterator = ArchiveIterator(
159
+
# fileobj=gzip_file,
160
+
# no_record_parse=True,
161
+
# )
162
+
# warc_count += sum(
163
+
# 1 for record in iterator if record.rec_type == "request"
164
+
# )
165
+
# except BadGzipFile:
166
+
# warn(f"Invalid gzip file: {file_path}")
167
+
# # Ignore invalid gzip files.
168
+
# pass
169
+
warc_count+=1
170
+
except FileNotFoundError:
171
+
# Ignore files that have been deleted while processing.
0 commit comments