|
1 | 1 | from collections import defaultdict
|
2 | 2 | from contextlib import contextmanager
|
3 | 3 | from datetime import date, datetime, timezone
|
| 4 | +import logging |
4 | 5 | import re
|
5 | 6 | import requests
|
6 | 7 | import requests.adapters
|
|
9 | 10 | import urllib.parse
|
10 | 11 | from .exceptions import SessionClosedError
|
11 | 12 |
|
| 13 | +logger = logging.getLogger(__name__) |
12 | 14 |
|
13 | 15 | URL_DATE_FORMAT = '%Y%m%d%H%M%S'
|
14 | 16 | MEMENTO_URL_PATTERN = re.compile(
|
@@ -47,12 +49,39 @@ def parse_timestamp(time_string):
|
47 | 49 | """
|
48 | 50 | Given a Wayback-style timestamp string, return an equivalent ``datetime``.
|
49 | 51 | """
|
50 |
| - # Fix bad timestamps |
| 52 | + # Before parsing, try to fix invalid timestamps. |
| 53 | + # We've seen a handful of timestamps where "00" was inserted before the |
| 54 | + # month or day part of the timestamp, e.g.: |
| 55 | + # |
| 56 | + # 20000008241731 |
| 57 | + # ^^ Month is "00" |
| 58 | + # |
| 59 | + # The Wayback team looked into some of these, and the "00" was always |
| 60 | + # an insertion, pushing the month or day and the following components of |
| 61 | + # the timestamp out by two characters. Then the seconds get truncated |
| 62 | + # (there's only room for 14 characters in the timestamp in the CDX index). |
| 63 | + # For example: |
| 64 | + # |
| 65 | + # In raw data: 2000000824173151 (16 characters) |
| 66 | + # In CDX index: 20000008241731 (Truncated to 14 characters) |
| 67 | + # Correct value: 20000824173151 (Aug. 24, 2000 at 17:31:51 UTC) |
| 68 | + # |
| 69 | + # The best we can do for these cases is pull out the incorrect "00" and add |
| 70 | + # "00" for the seconds that got truncated. This isn't exact, but we can't |
| 71 | + # see the raw data so this is as close as we can get. |
| 72 | + # |
| 73 | + # The issue seems to be limited to some crawls in the year 2000. |
51 | 74 | timestamp_chars = list(time_string)
|
52 |
| - # If the timestamp has a day of "00" |
53 |
| - if timestamp_chars[6:8] == ['0', '0']: |
| 75 | + if timestamp_chars[4:6] == ['0', '0']: |
| 76 | + logger.warning("found invalid timestamp with month 00: %s", time_string) |
| 77 | + del timestamp_chars[4:6] |
| 78 | + timestamp_chars.extend(['0', '0']) |
| 79 | + elif timestamp_chars[6:8] == ['0', '0']: |
| 80 | + logger.warning("found invalid timestamp with day 00: %s", time_string) |
54 | 81 | del timestamp_chars[6:8]
|
55 | 82 | timestamp_chars.extend(['0', '0'])
|
| 83 | + |
| 84 | + # Parse the cleaned-up result. |
56 | 85 | return (datetime
|
57 | 86 | .strptime(''.join(timestamp_chars), URL_DATE_FORMAT)
|
58 | 87 | .replace(tzinfo=timezone.utc))
|
|
0 commit comments