Skip to content

Commit 1098bbb

Browse files
edsu and Mr0grog authored
Correct CDX API timestamps with month of 00 (#89)
This commit extends the existing logic for handling invalid days of `00` to months that are `00`. It also adds a warning to be logged in both situations. For example, a timestamp of `20200001120000` will get rewritten to `20200112000000` prior to conversion to a datetime. Fixes #88 Co-authored-by: Rob Brackett <[email protected]>
1 parent 7e7d140 commit 1098bbb

File tree

4 files changed

+41
-7
lines changed

4 files changed

+41
-7
lines changed

docs/source/release-history.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Release History
55
In Development
66
--------------
77

8-
TBD
8+
Fixes a timestamp issue that is extremely similar to the one from v0.3.2: some Wayback CDX records have invalid timestamps with ``"00"`` for the month portion. :meth:`wayback.WaybackClient.search` previously raised an exception when parsing CDX records with this issue, but now handles them safely. Thanks to @edsu for discovering this issue and addressing it. (:issue:`88`)
99

1010

1111
v0.3.2 (2021-11-16)

wayback/_utils.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections import defaultdict
22
from contextlib import contextmanager
33
from datetime import date, datetime, timezone
4+
import logging
45
import re
56
import requests
67
import requests.adapters
@@ -9,6 +10,7 @@
910
import urllib.parse
1011
from .exceptions import SessionClosedError
1112

13+
logger = logging.getLogger(__name__)
1214

1315
URL_DATE_FORMAT = '%Y%m%d%H%M%S'
1416
MEMENTO_URL_PATTERN = re.compile(
@@ -47,12 +49,39 @@ def parse_timestamp(time_string):
4749
"""
4850
Given a Wayback-style timestamp string, return an equivalent ``datetime``.
4951
"""
50-
# Fix bad timestamps
52+
# Before parsing, try to fix invalid timestamps.
53+
# We've seen a handful of timestamps where "00" was inserted before the
54+
# month or day part of the timestamp, e.g.:
55+
#
56+
# 20000008241731
57+
# ^^ Month is "00"
58+
#
59+
# The Wayback team looked into some of these, and the "00" was always
60+
# an insertion, pushing the month or day and the following components of
61+
# the timestamp out by two characters. Then the seconds get truncated
62+
# (there's only room for 14 characters in the timestamp in the CDX index).
63+
# For example:
64+
#
65+
# In raw data: 2000000824173151 (16 characters)
66+
# In CDX index: 20000008241731 (Truncated to 14 characters)
67+
# Correct value: 20000824173151 (Aug. 24, 2000 at 17:31:51 UTC)
68+
#
69+
# The best we can do for these cases is pull out the incorrect "00" and add
70+
# "00" for the seconds that got truncated. This isn't exact, but we can't
71+
# see the raw data so this is as close as we can get.
72+
#
73+
# The issue seems to be limited to some crawls in the year 2000.
5174
timestamp_chars = list(time_string)
52-
# If the timestamp has a day of "00"
53-
if timestamp_chars[6:8] == ['0', '0']:
75+
if timestamp_chars[4:6] == ['0', '0']:
76+
logger.warning("found invalid timestamp with month 00: %s", time_string)
77+
del timestamp_chars[4:6]
78+
timestamp_chars.extend(['0', '0'])
79+
elif timestamp_chars[6:8] == ['0', '0']:
80+
logger.warning("found invalid timestamp with day 00: %s", time_string)
5481
del timestamp_chars[6:8]
5582
timestamp_chars.extend(['0', '0'])
83+
84+
# Parse the cleaned-up result.
5685
return (datetime
5786
.strptime(''.join(timestamp_chars), URL_DATE_FORMAT)
5887
.replace(tzinfo=timezone.utc))

wayback/tests/test_client.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,12 @@ def test_search_handles_bad_timestamp_cdx_records(requests_mock):
241241

242242
record_list = list(records)
243243
assert 5 == len(record_list)
244-
assert record_list[-1].timestamp.day == 24
244+
245+
# 00 month in 20000012170449 gets rewritten to 20001217044900
246+
assert record_list[3].timestamp.month == 12
247+
248+
# 00 day in 20000800241623 gets rewritten to 20000824162300
249+
assert record_list[4].timestamp.day == 24
245250

246251

247252
@ia_vcr.use_cassette()
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
com,usatoday)/2000/century/tech/003d.htm 20011120210446 http://www.usatoday.com:80/2000/century/tech/003d.htm text/html 200 EJTUZEVOPPFGLXXQK2KV4DPFRSOULYVN 3823
22
com,usatoday)/2000/century/tech/004.htm 20000125210430 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 EBWZW6DNCJK2PU2DYX2JX2SWD6NQMUXK 6822
33
com,usatoday)/2000/century/tech/004.htm 20000311052312 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 BTVE5SD57GD4HZHWISTWPLXRH7XONXW6 6214
4-
com,usatoday)/2000/century/tech/004.htm 20000613174049 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
5-
com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177
4+
com,usatoday)/2000/century/tech/004.htm 20000012170449 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
5+
com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177

0 commit comments

Comments
 (0)