Correct CDX API timestamps with month of 00 (#89)

edsu · Mr0grog · web-flow · commit 1098bbb3bd2e · 2022-09-30T10:59:29.000-07:00
This commit extends the existing logic for handling invalid days of `00` to months that are `00`. It also adds a warning to be logged in both situations. For example, a timestamp of `20200001120000` will get rewritten to `20200112000000` prior to conversion to a datetime. Fixes #88 Co-authored-by: Rob Brackett <rob@robbrackett.com>
diff --git a/docs/source/release-history.rst b/docs/source/release-history.rst
@@ -5,7 +5,7 @@ Release History
 In Development
 --------------
 
-TBD
+Fixes a timestamp issue that is extremely similar to the one from v0.3.2: some Wayback CDX records have invalid timestamps with ``"00"`` for the month portion. :meth:`wayback.WaybackClient.search` previously raised an exception when parsing CDX records with this issue, but now handles them safely. Thanks to @edsu for discovering this issue and addressing it. (:issue:`88`)
 
 
 v0.3.2 (2021-11-16)
diff --git a/wayback/_utils.py b/wayback/_utils.py
@@ -1,6 +1,7 @@
 from collections import defaultdict
 from contextlib import contextmanager
 from datetime import date, datetime, timezone
+import logging
 import re
 import requests
 import requests.adapters
@@ -9,6 +10,7 @@
 import urllib.parse
 from .exceptions import SessionClosedError
 
+logger = logging.getLogger(__name__)
 
 URL_DATE_FORMAT = '%Y%m%d%H%M%S'
 MEMENTO_URL_PATTERN = re.compile(
@@ -47,12 +49,39 @@ def parse_timestamp(time_string):
     """
     Given a Wayback-style timestamp string, return an equivalent ``datetime``.
     """
-    # Fix bad timestamps
+    # Before parsing, try to fix invalid timestamps.
+    # We've seen a handful of timestamps where "00" was inserted before the
+    # month or day part of the timestamp, e.g.:
+    #
+    #   20000008241731
+    #       ^^ Month is "00"
+    #
+    # The Wayback team looked into some of these, and the "00" was always
+    # an insertion, pushing the month or day and the following components of
+    # the timestamp out by two characters. Then the seconds get truncated
+    # (there's only room for 14 characters in the timestamp in the CDX index).
+    # For example:
+    #
+    #   In raw data:   2000000824173151 (16 characters)
+    #   In CDX index:  20000008241731   (Truncated to 14 characters)
+    #   Correct value: 20000824173151   (Aug. 24, 2000 at 17:31:51 UTC)
+    #
+    # The best we can do for these cases is pull out the incorrect "00" and add
+    # "00" for the seconds that got truncated. This isn't exact, but we can't
+    # see the raw data so this is as close as we can get.
+    #
+    # The issue seems to be limited to some crawls in the year 2000.
     timestamp_chars = list(time_string)
-    # If the timestamp has a day of "00"
-    if timestamp_chars[6:8] == ['0', '0']:
+    if timestamp_chars[4:6] == ['0', '0']:
+        logger.warning("found invalid timestamp with month 00: %s", time_string)
+        del timestamp_chars[4:6]
+        timestamp_chars.extend(['0', '0'])
+    elif timestamp_chars[6:8] == ['0', '0']:
+        logger.warning("found invalid timestamp with day 00: %s", time_string)
         del timestamp_chars[6:8]
         timestamp_chars.extend(['0', '0'])
+
+    # Parse the cleaned-up result.
     return (datetime
             .strptime(''.join(timestamp_chars), URL_DATE_FORMAT)
             .replace(tzinfo=timezone.utc))
diff --git a/wayback/tests/test_client.py b/wayback/tests/test_client.py
@@ -241,7 +241,12 @@ def test_search_handles_bad_timestamp_cdx_records(requests_mock):
 
         record_list = list(records)
         assert 5 == len(record_list)
-        assert record_list[-1].timestamp.day == 24
+
+        # 00 month in 20000012170449 gets rewritten to 20001217044900
+        assert record_list[3].timestamp.month == 12
+
+        # 00 day in 20000800241623 gets rewritten to 20000824162300
+        assert record_list[4].timestamp.day == 24
 
 
 @ia_vcr.use_cassette()
diff --git a/wayback/tests/test_files/bad_timestamp_cdx.txt b/wayback/tests/test_files/bad_timestamp_cdx.txt
@@ -1,5 +1,5 @@
 com,usatoday)/2000/century/tech/003d.htm 20011120210446 http://www.usatoday.com:80/2000/century/tech/003d.htm text/html 200 EJTUZEVOPPFGLXXQK2KV4DPFRSOULYVN 3823
 com,usatoday)/2000/century/tech/004.htm 20000125210430 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 EBWZW6DNCJK2PU2DYX2JX2SWD6NQMUXK 6822
 com,usatoday)/2000/century/tech/004.htm 20000311052312 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 BTVE5SD57GD4HZHWISTWPLXRH7XONXW6 6214
-com,usatoday)/2000/century/tech/004.htm 20000613174049 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
-com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177
+com,usatoday)/2000/century/tech/004.htm 20000012170449 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
+com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177