diff --git a/README.rst b/README.rst index 9b400d3c6..08d090ac5 100644 --- a/README.rst +++ b/README.rst @@ -111,9 +111,3 @@ Limitations * Only Python 2 support for now (Python 3 support **will be** added in future versions) -* `dateparser` at this point does not support generic parsing of dates with fixed UTC offsets. This restricts its ability to reliably parse time zone aware dates since the use of abbreviated time zones as a sole designator of time zones is not recommended. - - Read `Wikipedia Time Zone article`_ for more information. - -.. _Wikipedia Time Zone Article: https://en.wikipedia.org/wiki/Time_zone#Abbreviations - diff --git a/data/languages.yaml b/data/languages.yaml index 4ed646c61..e24ffbbfb 100644 --- a/data/languages.yaml +++ b/data/languages.yaml @@ -5,7 +5,7 @@ base: en: name: English - skip: ["at", "on", "and", "ad", "m", "t", "of", "st", "nd", "rd", "th", "about", "the", "just"] + skip: ["at", "on", "and", "ad", "m", "of", "st", "nd", "rd", "th", "about", "the", "just"] pertain: ["of"] monday: diff --git a/dateparser/conf.py b/dateparser/conf.py index f82941c32..e758bdfa0 100644 --- a/dateparser/conf.py +++ b/dateparser/conf.py @@ -25,6 +25,14 @@ >>> settings.update('PREFER_DATES_FROM', 'future') >>> parse(u'March') datetime.datetime(2016, 3, 16, 0, 0) + +*``SKIP_TOKENS``* is a ``list`` of tokens to discard while detecting language. Defaults to ``['t']``, which skips the ``T`` separator in ISO-format datetime strings, e.g. ``2015-05-02T10:20:19+0000``. 
+This only works with :class:`DateDataParser`, as shown below: + + >>> settings.update('SKIP_TOKENS', ['de']) # Turkish word for 'at' + >>> from dateparser.date import DateDataParser + >>> DateDataParser().get_date_data(u'27 Haziran 1981 de') # Turkish (at 27 June 1981) + {'date_obj': datetime.datetime(1981, 6, 27, 0, 0), 'period': 'day'} """ @@ -32,6 +40,7 @@ class Settings(object): PREFER_DATES_FROM = 'current_period' # past, future, current_period SUPPORT_BEFORE_COMMON_ERA = False PREFER_DAY_OF_MONTH = 'current' # current, first, last + SKIP_TOKENS = ['t'] def __init__(self, **kwargs): for key in kwargs: diff --git a/dateparser/date.py b/dateparser/date.py index 954291821..0bdaf399c 100644 --- a/dateparser/date.py +++ b/dateparser/date.py @@ -10,7 +10,7 @@ from dateparser.date_parser import date_parser from dateparser.freshness_date_parser import freshness_date_parser -from dateparser.languages import default_language_loader +from dateparser.languages.loader import LanguageDataLoader from dateparser.languages.detection import AutoDetectLanguage, ExactLanguages @@ -244,10 +244,12 @@ class DateDataParser(object): :raises: ValueError - Unknown Language, TypeError - Languages argument must be a list """ + language_loader = None def __init__(self, languages=None, allow_redetect_language=False): + available_language_map = self._get_language_loader().get_language_map() + if isinstance(languages, (list, tuple, collections.Set)): - available_language_map = default_language_loader.get_language_map() if all([language in available_language_map for language in languages]): languages = [available_language_map[language] for language in languages] @@ -258,12 +260,14 @@ def __init__(self, languages=None, allow_redetect_language=False): raise TypeError("languages argument must be a list (%r given)" % type(languages)) if allow_redetect_language: - self.language_detector = AutoDetectLanguage(languages=languages if languages else None, - allow_redetection=True) + self.language_detector = 
AutoDetectLanguage( + languages if languages else available_language_map.values(), + allow_redetection=True) elif languages: self.language_detector = ExactLanguages(languages=languages) else: - self.language_detector = AutoDetectLanguage(languages=None, allow_redetection=False) + self.language_detector = AutoDetectLanguage( + available_language_map.values(), allow_redetection=False) def get_date_data(self, date_string, date_formats=None): """ @@ -314,3 +318,9 @@ def get_date_data(self, date_string, date_formats=None): return parsed_date else: return {'date_obj': None, 'period': 'day'} + + @classmethod + def _get_language_loader(cls): + if not cls.language_loader: + cls.language_loader = LanguageDataLoader() + return cls.language_loader diff --git a/dateparser/languages/detection.py b/dateparser/languages/detection.py index 8f133ba88..7d986a58b 100644 --- a/dateparser/languages/detection.py +++ b/dateparser/languages/detection.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from functools import wraps -from dateparser.languages import default_language_loader - def _restore_languages_on_generator_exit(method): @wraps(method) @@ -39,9 +37,7 @@ def _filter_languages(date_string, languages): class AutoDetectLanguage(BaseLanguageDetector): - def __init__(self, languages=None, allow_redetection=False): - if languages is None: - languages = default_language_loader.get_languages() + def __init__(self, languages, allow_redetection=False): super(AutoDetectLanguage, self).__init__(languages=languages[:]) self.language_pool = languages[:] self.allow_redetection = allow_redetection diff --git a/dateparser/languages/loader.py b/dateparser/languages/loader.py index bf005e3ca..262f9102b 100644 --- a/dateparser/languages/loader.py +++ b/dateparser/languages/loader.py @@ -4,6 +4,7 @@ from yaml import load as load_yaml from .language import Language +from ..conf import settings class LanguageDataLoader(object): @@ -35,7 +36,8 @@ def _load_data(self): else: data = self.file.read() data = 
load_yaml(data) - base_data = data.pop('base', {}) + base_data = data.pop('base', {'skip': []}) + base_data['skip'] += settings.SKIP_TOKENS known_languages = {} for shortname, language_info in data.iteritems(): self._update_language_info_with_base_info(language_info, base_data) diff --git a/dateparser/timezone_parser.py b/dateparser/timezone_parser.py index 5d0b3547e..1849947ea 100644 --- a/dateparser/timezone_parser.py +++ b/dateparser/timezone_parser.py @@ -4,14 +4,12 @@ from dateparser.timezones import timezone_info_list -TIMEZONE_REGEX_PATTERN = r'(\b|\d)%s$' - def pop_tz_offset_from_string(date_string, as_offset=True): - for name, info in _tz_offsets.iteritems(): + for name, info in _tz_offsets: timezone_re = info['regex'] if timezone_re.search(date_string): - date_string = timezone_re.sub(r'\1', date_string) # \1 = (\b|\d) in TIMEZONE_REGEX_PATTERN + date_string = timezone_re.sub(r'\1', date_string) return date_string, info['offset'] if as_offset else name else: return date_string, None @@ -22,18 +20,32 @@ def convert_to_local_tz(datetime_obj, datetime_tz_offset): def get_tz_offsets(): - return { - tz_info[0]: { - 'regex': re.compile(TIMEZONE_REGEX_PATTERN % tz_info[0], re.IGNORECASE), - 'offset': timedelta(seconds=tz_info[1]), - } - for tz_info in timezone_info_list - } + + def get_offset(tz_obj, regex, repl='', replw=''): + return ( + tz_obj[0], + { + 'regex': re.compile(re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE), + 'offset': timedelta(seconds=tz_obj[1]) + } + ) + + for tz_info in timezone_info_list: + for regex in tz_info['regex_patterns']: + for tz_obj in tz_info['timezones']: + yield get_offset(tz_obj, regex) + + # alternate patterns + for replace, replacewith in tz_info.get('replace', []): + for tz_obj in tz_info['timezones']: + yield get_offset(tz_obj, regex, repl=replace, replw=replacewith) def get_local_tz_offset(): - return datetime.now() - datetime.utcnow() + offset = datetime.now() - datetime.utcnow() + offset = 
timedelta(days=offset.days, seconds=round(offset.seconds, -1)) + return offset -_tz_offsets = get_tz_offsets() +_tz_offsets = list(get_tz_offsets()) local_tz_offset = get_local_tz_offset() diff --git a/dateparser/timezones.py b/dateparser/timezones.py index a7778801b..dac7f8bd9 100644 --- a/dateparser/timezones.py +++ b/dateparser/timezones.py @@ -1,393 +1,448 @@ # Based on http://stackoverflow.com/q/1703546 # As well as http://en.wikipedia.org/wiki/List_of_time_zone_abbreviations # As well as https://github.com/scrapinghub/dateparser/pull/4 +# As well as http://en.wikipedia.org/wiki/List_of_UTC_time_offsets timezone_info_list = [ - ('ACDT', 37800), - ('ACST', 34200), - ('ACT', -18000), - ('ACWDT', 35100), - ('ACWST', 31500), - ('ADDT', -7200), - ('ADMT', 9300), - ('ADT', -10800), - ('AEDT', 39600), - ('AEST', 36000), - ('AFT', 16200), - ('AHDT', -32400), - ('AHST', -36000), - ('AKDT', -28800), - ('AKST', -32400), - ('AKTST', 21600), - ('AKTT', 18000), - ('ALMST', 25200), - ('ALMT', 21600), - ('AMST', 18000), - ('AMT', 14400), - ('ANAST', 43200), - ('ANAT', 43200), - ('ANT', -16200), - ('APT', -10800), - ('AQTST', 21600), - ('AQTT', 18000), - ('ARST', -10800), - ('ART', -10800), - ('ASHST', 21600), - ('ASHT', 18000), - ('AST', -14400), - ('AWDT', 32400), - ('AWST', 28800), - ('AWT', -10800), - ('AZOMT', 0), - ('AZOST', -3600), - ('AZOT', -3600), - ('AZST', 18000), - ('AZT', 14400), - ('BAKST', 14400), - ('BAKT', 10800), - ('BDST', 7200), - ('BDT', 28800), - ('BEAT', 9000), - ('BEAUT', 9900), - ('BIOT', 21600), - ('BMT', 1800), - ('BNT', 28800), - ('BORT', 28800), - ('BOST', -12780), - ('BOT', -14400), - ('BRST', -7200), - ('BRT', -10800), - ('BST', 39600), - ('BTT', 21600), - ('BURT', 23400), - ('CANT', -3600), - ('CAPT', -32400), - ('CAST', 10800), - ('CAT', 7200), - ('CAWT', -32400), - ('CCT', 23400), - ('CDDT', -14400), - ('CDT', -18000), - ('CEDT', 7200), - ('CEMT', 10800), - ('CEST', 7200), - ('CET', 3600), - ('CGST', -3600), - ('CGT', -7200), - ('CHADT', 
49500), - ('CHAST', 45900), - ('CHDT', -19800), - ('CHOST', 36000), - ('CHOT', 28800), - ('CIST', -28800), - ('CKHST', -34200), - ('CKT', -36000), - ('CLST', -10800), - ('CLT', -14400), - ('CMT', -16080), - ('COST', -14400), - ('COT', -18000), - ('CPT', -18000), - ('CST', -21600), - ('CUT', 8400), - ('CVST', -3600), - ('CVT', -3600), - ('CWT', -18000), - ('CXT', 25200), - ('ChST', 36000), - ('DACT', 21600), - ('DAVT', 25200), - ('DDUT', 36000), - ('DFT', 3600), - ('DMT', -1500), - ('DUSST', 21600), - ('DUST', 21600), - ('EASST', -18000), - ('EAST', -21600), - ('EAT', 10800), - ('ECT', -18000), - ('EDDT', -10800), - ('EDT', -14400), - ('EEDT', 10800), - ('EEST', 10800), - ('EET', 7200), - ('EGST', 0), - ('EGT', -3600), - ('EHDT', -16200), - ('EMT', -26220), - ('EPT', -14400), - ('EST', -18000), - ('ET', -18000), - ('EWT', -14400), - ('FET', 10800), - ('FFMT', -14640), - ('FJST', 46800), - ('FJT', 43200), - ('FKST', -10800), - ('FKT', -14400), - ('FMT', -4080), - ('FNST', -3600), - ('FNT', -7200), - ('FORT', 14400), - ('FRUST', 25200), - ('FRUT', 18000), - ('GALT', -21600), - ('GAMT', -32400), - ('GBGT', -13500), - ('GEST', 14400), - ('GET', 14400), - ('GFT', -10800), - ('GHST', 1200), - ('GILT', 43200), - ('GIT', -32400), - ('GMT', 0), - ('GST', 14400), - ('GYT', -14400), - ('HAA', -10800), - ('HAC', -18000), - ('HADT', -32400), - ('HAE', -14400), - ('HAP', -25200), - ('HAR', -21600), - ('HAST', -36000), - ('HAT', -9000), - ('HAY', -28800), - ('HDT', -34200), - ('HKST', 32400), - ('HKT', 28800), - ('HLV', -16200), - ('HMT', 18000), - ('HNA', -14400), - ('HNC', -21600), - ('HNE', -18000), - ('HNP', -28800), - ('HNR', -25200), - ('HNT', -12600), - ('HNY', -32400), - ('HOVST', 28800), - ('HOVT', 25200), - ('HST', -36000), - ('ICT', 25200), - ('IDDT', 14400), - ('IDT', 10800), - ('IHST', 21600), - ('IMT', 7020), - ('IOT', 21600), - ('IRDT', 16200), - ('IRKST', 32400), - ('IRKT', 28800), - ('IRST', 12600), - ('ISST', 0), - ('IST', 7200), - ('JAVT', 26400), - ('JCST', 
32400), - ('JDT', 36000), - ('JMT', 8460), - ('JST', 32400), - ('JWST', 28800), - ('KART', 18000), - ('KDT', 32400), - ('KGST', 21600), - ('KGT', 21600), - ('KIZST', 21600), - ('KIZT', 18000), - ('KMT', 5760), - ('KOST', 39600), - ('KRAST', 28800), - ('KRAT', 25200), - ('KST', 32400), - ('KUYST', 18000), - ('KUYT', 14400), - ('KWAT', -43200), - ('LHDT', 39600), - ('LHST', 37800), - ('LINT', 50400), - ('LKT', 23400), - ('LMT', -20160), - ('LMT', -17640), - ('LMT', -20580), - ('LMT', -14400), - ('LRT', -2640), - ('LST', 9420), - ('MADMT', 3600), - ('MADST', 0), - ('MADT', -3600), - ('MAGST', 43200), - ('MAGT', 39600), - ('MALST', 26400), - ('MALT', 27000), - ('MART', -34200), - ('MAWT', 18000), - ('MDDT', -18000), - ('MDST', 16260), - ('MDT', -21600), - ('MEST', 7200), - ('MET', 3600), - ('MHT', 43200), - ('MIST', 39600), - ('MIT', -34200), - ('MMT', 23400), - ('MOST', 32400), - ('MOT', 28800), - ('MPT', -21600), - ('MSD', 14400), - ('MSK', 10800), - ('MSM', 18000), - ('MST', -25200), - ('MUST', 18000), - ('MUT', 14400), - ('MVT', 18000), - ('MWT', -21600), - ('MYT', 28800), - ('NCST', 43200), - ('NCT', 39600), - ('NDDT', -5400), - ('NDT', -9000), - ('NEGT', -12600), - ('NEST', 4800), - ('NET', 1200), - ('NFT', 41400), - ('NMT', 40320), - ('NOVST', 25200), - ('NOVT', 21600), - ('NPT', 20700), - ('NRT', 41400), - ('NST', -12600), - ('NT', -12600), - ('NUT', -39600), - ('NWT', -36000), - ('NZDT', 46800), - ('NZMT', 41400), - ('NZST', 43200), - ('OMSST', 25200), - ('OMST', 21600), - ('ORAST', 18000), - ('ORAT', 18000), - ('PDDT', -21600), - ('PDT', -25200), - ('PEST', -14400), - ('PET', -18000), - ('PETST', 43200), - ('PETT', 43200), - ('PGT', 36000), - ('PHOT', 46800), - ('PHST', 32400), - ('PHT', 28800), - ('PKST', 21600), - ('PKT', 18000), - ('PLMT', 25620), - ('PMDT', -7200), - ('PMMT', 35340), - ('PMST', -10800), - ('PMT', 540), - ('PNT', -30600), - ('PONT', 39600), - ('PPMT', -17340), - ('PPT', -25200), - ('PST', -28800), - ('PT', -28800), - ('PWT', -25200), - 
('PYST', -10800), - ('PYT', -14400), - ('QMT', -18840), - ('QYZST', 25200), - ('QYZT', 21600), - ('RET', 14400), - ('RMT', 3000), - ('ROTT', -10800), - ('SAKST', 43200), - ('SAKT', 39600), - ('SAMT', 14400), - ('SAST', 7200), - ('SBT', 39600), - ('SCT', 14400), - ('SDMT', -16800), - ('SDT', -36000), - ('SET', 3600), - ('SGT', 28800), - ('SHEST', 21600), - ('SHET', 18000), - ('SJMT', -20160), - ('SLT', 19800), - ('SMT', -13860), - ('SRET', 39600), - ('SRT', -10800), - ('SST', -39600), - ('STAT', 10800), - ('SVEST', 21600), - ('SVET', 14400), - ('SWAT', 5400), - ('SYOT', 10800), - ('TAHT', -36000), - ('TASST', 25200), - ('TAST', 21600), - ('TBIST', 18000), - ('TBIT', 10800), - ('TBMT', 10740), - ('TFT', 18000), - ('THA', 25200), - ('TJT', 18000), - ('TKT', -39600), - ('TLT', 32400), - ('TMT', 18000), - ('TOST', 50400), - ('TOT', 46800), - ('TRST', 14400), - ('TRT', 10800), - ('TSAT', 10800), - ('TVT', 43200), - ('ULAST', 32400), - ('ULAT', 28800), - ('URAST', 18000), - ('URAT', 18000), - ('UTC', 0), - ('UYHST', -9000), - ('UYST', -7200), - ('UYT', -10800), - ('UZST', 21600), - ('UZT', 18000), - ('VET', -16200), - ('VLAST', 39600), - ('VLAT', 36000), - ('VOLST', 14400), - ('VOLT', 14400), - ('VOST', 21600), - ('VUST', 43200), - ('VUT', 39600), - ('WARST', -10800), - ('WART', -14400), - ('WAST', 7200), - ('WAT', 3600), - ('WDT', 32400), - ('WEDT', 3600), - ('WEMT', 7200), - ('WEST', 3600), - ('WET', 0), - ('WFT', 43200), - ('WGST', -7200), - ('WGT', -10800), - ('WIB', 25200), - ('WIT', 32400), - ('WITA', 28800), - ('WMT', 5040), - ('WSDT', 50400), - ('WSST', 46800), - ('WST', 28800), - ('WT', 0), - ('XJT', 21600), - ('YAKST', 36000), - ('YAKT', 32400), - ('YAPT', 36000), - ('YDDT', -25200), - ('YDT', -28800), - ('YEKST', 21600), - ('YEKST', 21600), - ('YEKT', 18000), - ('YEKT', 18000), - ('YERST', 14400), - ('YERT', 10800), - ('YPT', -28800), - ('YST', -32400), - ('YWT', -28800), - ('zzz', 0) + { + 'regex_patterns': + [r'(\W|\d|_)\(?%s\)?$'], + 'timezones': + [('ACDT', 
37800), + ('ACST', 34200), + ('ACT', -18000), + ('ACWDT', 35100), + ('ACWST', 31500), + ('ADDT', -7200), + ('ADMT', 9300), + ('ADT', -10800), + ('AEDT', 39600), + ('AEST', 36000), + ('AFT', 16200), + ('AHDT', -32400), + ('AHST', -36000), + ('AKDT', -28800), + ('AKST', -32400), + ('AKTST', 21600), + ('AKTT', 18000), + ('ALMST', 25200), + ('ALMT', 21600), + ('AMST', 18000), + ('AMT', 14400), + ('ANAST', 43200), + ('ANAT', 43200), + ('ANT', -16200), + ('APT', -10800), + ('AQTST', 21600), + ('AQTT', 18000), + ('ARST', -10800), + ('ART', -10800), + ('ASHST', 21600), + ('ASHT', 18000), + ('AST', -14400), + ('AWDT', 32400), + ('AWST', 28800), + ('AWT', -10800), + ('AZOMT', 0), + ('AZOST', -3600), + ('AZOT', -3600), + ('AZST', 18000), + ('AZT', 14400), + ('BAKST', 14400), + ('BAKT', 10800), + ('BDST', 7200), + ('BDT', 28800), + ('BEAT', 9000), + ('BEAUT', 9900), + ('BIOT', 21600), + ('BMT', 1800), + ('BNT', 28800), + ('BORT', 28800), + ('BOST', -12780), + ('BOT', -14400), + ('BRST', -7200), + ('BRT', -10800), + ('BST', 39600), + ('BTT', 21600), + ('BURT', 23400), + ('CANT', -3600), + ('CAPT', -32400), + ('CAST', 10800), + ('CAT', 7200), + ('CAWT', -32400), + ('CCT', 23400), + ('CDDT', -14400), + ('CDT', -18000), + ('CEDT', 7200), + ('CEMT', 10800), + ('CEST', 7200), + ('CET', 3600), + ('CGST', -3600), + ('CGT', -7200), + ('CHADT', 49500), + ('CHAST', 45900), + ('CHDT', -19800), + ('CHOST', 36000), + ('CHOT', 28800), + ('CIST', -28800), + ('CKHST', -34200), + ('CKT', -36000), + ('CLST', -10800), + ('CLT', -14400), + ('CMT', -16080), + ('COST', -14400), + ('COT', -18000), + ('CPT', -18000), + ('CST', -21600), + ('CUT', 8400), + ('CVST', -3600), + ('CVT', -3600), + ('CWT', -18000), + ('CXT', 25200), + ('ChST', 36000), + ('DACT', 21600), + ('DAVT', 25200), + ('DDUT', 36000), + ('DFT', 3600), + ('DMT', -1500), + ('DUSST', 21600), + ('DUST', 21600), + ('EASST', -18000), + ('EAST', -21600), + ('EAT', 10800), + ('ECT', -18000), + ('EDDT', -10800), + ('EDT', -14400), + ('EEDT', 
10800), + ('EEST', 10800), + ('EET', 7200), + ('EGST', 0), + ('EGT', -3600), + ('EHDT', -16200), + ('EMT', -26220), + ('EPT', -14400), + ('EST', -18000), + ('ET', -18000), + ('EWT', -14400), + ('FET', 10800), + ('FFMT', -14640), + ('FJST', 46800), + ('FJT', 43200), + ('FKST', -10800), + ('FKT', -14400), + ('FMT', -4080), + ('FNST', -3600), + ('FNT', -7200), + ('FORT', 14400), + ('FRUST', 25200), + ('FRUT', 18000), + ('GALT', -21600), + ('GAMT', -32400), + ('GBGT', -13500), + ('GEST', 14400), + ('GET', 14400), + ('GFT', -10800), + ('GHST', 1200), + ('GILT', 43200), + ('GIT', -32400), + ('GMT', 0), + ('GST', 14400), + ('GYT', -14400), + ('HAA', -10800), + ('HAC', -18000), + ('HADT', -32400), + ('HAE', -14400), + ('HAP', -25200), + ('HAR', -21600), + ('HAST', -36000), + ('HAT', -9000), + ('HAY', -28800), + ('HDT', -34200), + ('HKST', 32400), + ('HKT', 28800), + ('HLV', -16200), + ('HMT', 18000), + ('HNA', -14400), + ('HNC', -21600), + ('HNE', -18000), + ('HNP', -28800), + ('HNR', -25200), + ('HNT', -12600), + ('HNY', -32400), + ('HOVST', 28800), + ('HOVT', 25200), + ('HST', -36000), + ('ICT', 25200), + ('IDDT', 14400), + ('IDT', 10800), + ('IHST', 21600), + ('IMT', 7020), + ('IOT', 21600), + ('IRDT', 16200), + ('IRKST', 32400), + ('IRKT', 28800), + ('IRST', 12600), + ('ISST', 0), + ('IST', 7200), + ('JAVT', 26400), + ('JCST', 32400), + ('JDT', 36000), + ('JMT', 8460), + ('JST', 32400), + ('JWST', 28800), + ('KART', 18000), + ('KDT', 32400), + ('KGST', 21600), + ('KGT', 21600), + ('KIZST', 21600), + ('KIZT', 18000), + ('KMT', 5760), + ('KOST', 39600), + ('KRAST', 28800), + ('KRAT', 25200), + ('KST', 32400), + ('KUYST', 18000), + ('KUYT', 14400), + ('KWAT', -43200), + ('LHDT', 39600), + ('LHST', 37800), + ('LINT', 50400), + ('LKT', 23400), + ('LMT', -20160), + ('LMT', -17640), + ('LMT', -20580), + ('LMT', -14400), + ('LRT', -2640), + ('LST', 9420), + ('MADMT', 3600), + ('MADST', 0), + ('MADT', -3600), + ('MAGST', 43200), + ('MAGT', 39600), + ('MALST', 26400), + ('MALT', 
27000), + ('MART', -34200), + ('MAWT', 18000), + ('MDDT', -18000), + ('MDST', 16260), + ('MDT', -21600), + ('MEST', 7200), + ('MET', 3600), + ('MHT', 43200), + ('MIST', 39600), + ('MIT', -34200), + ('MMT', 23400), + ('MOST', 32400), + ('MOT', 28800), + ('MPT', -21600), + ('MSD', 14400), + ('MSK', 10800), + ('MSM', 18000), + ('MST', -25200), + ('MUST', 18000), + ('MUT', 14400), + ('MVT', 18000), + ('MWT', -21600), + ('MYT', 28800), + ('NCST', 43200), + ('NCT', 39600), + ('NDDT', -5400), + ('NDT', -9000), + ('NEGT', -12600), + ('NEST', 4800), + ('NET', 1200), + ('NFT', 41400), + ('NMT', 40320), + ('NOVST', 25200), + ('NOVT', 21600), + ('NPT', 20700), + ('NRT', 41400), + ('NST', -12600), + ('NT', -12600), + ('NUT', -39600), + ('NWT', -36000), + ('NZDT', 46800), + ('NZMT', 41400), + ('NZST', 43200), + ('OMSST', 25200), + ('OMST', 21600), + ('ORAST', 18000), + ('ORAT', 18000), + ('PDDT', -21600), + ('PDT', -25200), + ('PEST', -14400), + ('PET', -18000), + ('PETST', 43200), + ('PETT', 43200), + ('PGT', 36000), + ('PHOT', 46800), + ('PHST', 32400), + ('PHT', 28800), + ('PKST', 21600), + ('PKT', 18000), + ('PLMT', 25620), + ('PMDT', -7200), + ('PMMT', 35340), + ('PMST', -10800), + ('PMT', 540), + ('PNT', -30600), + ('PONT', 39600), + ('PPMT', -17340), + ('PPT', -25200), + ('PST', -28800), + ('PT', -28800), + ('PWT', -25200), + ('PYST', -10800), + ('PYT', -14400), + ('QMT', -18840), + ('QYZST', 25200), + ('QYZT', 21600), + ('RET', 14400), + ('RMT', 3000), + ('ROTT', -10800), + ('SAKST', 43200), + ('SAKT', 39600), + ('SAMT', 14400), + ('SAST', 7200), + ('SBT', 39600), + ('SCT', 14400), + ('SDMT', -16800), + ('SDT', -36000), + ('SET', 3600), + ('SGT', 28800), + ('SHEST', 21600), + ('SHET', 18000), + ('SJMT', -20160), + ('SLT', 19800), + ('SMT', -13860), + ('SRET', 39600), + ('SRT', -10800), + ('SST', -39600), + ('STAT', 10800), + ('SVEST', 21600), + ('SVET', 14400), + ('SWAT', 5400), + ('SYOT', 10800), + ('TAHT', -36000), + ('TASST', 25200), + ('TAST', 21600), + ('TBIST', 
18000), + ('TBIT', 10800), + ('TBMT', 10740), + ('TFT', 18000), + ('THA', 25200), + ('TJT', 18000), + ('TKT', -39600), + ('TLT', 32400), + ('TMT', 18000), + ('TOST', 50400), + ('TOT', 46800), + ('TRST', 14400), + ('TRT', 10800), + ('TSAT', 10800), + ('TVT', 43200), + ('ULAST', 32400), + ('ULAT', 28800), + ('URAST', 18000), + ('URAT', 18000), + ('UTC', 0), + ('UYHST', -9000), + ('UYST', -7200), + ('UYT', -10800), + ('UZST', 21600), + ('UZT', 18000), + ('VET', -16200), + ('VLAST', 39600), + ('VLAT', 36000), + ('VOLST', 14400), + ('VOLT', 14400), + ('VOST', 21600), + ('VUST', 43200), + ('VUT', 39600), + ('WARST', -10800), + ('WART', -14400), + ('WAST', 7200), + ('WAT', 3600), + ('WDT', 32400), + ('WEDT', 3600), + ('WEMT', 7200), + ('WEST', 3600), + ('WET', 0), + ('WFT', 43200), + ('WGST', -7200), + ('WGT', -10800), + ('WIB', 25200), + ('WIT', 32400), + ('WITA', 28800), + ('WMT', 5040), + ('WSDT', 50400), + ('WSST', 46800), + ('WST', 28800), + ('WT', 0), + ('XJT', 21600), + ('YAKST', 36000), + ('YAKT', 32400), + ('YAPT', 36000), + ('YDDT', -25200), + ('YDT', -28800), + ('YEKST', 21600), + ('YEKST', 21600), + ('YEKT', 18000), + ('YEKT', 18000), + ('YERST', 14400), + ('YERT', 10800), + ('YPT', -28800), + ('YST', -32400), + ('YWT', -28800), + ('zzz', 0)] + }, + { + 'regex_patterns': + [r'(.)%s$',], + 'replace': + [(r'UTC', r''), + (r':', r''), + (r':|UTC', r''),], + 'timezones': + [('UTC\-12:00', -43200), + ('UTC\-11:00', -39600), + ('UTC\-10:00', -36000), + ('UTC\-09:30', -34200), + ('UTC\-09:00', -32400), + ('UTC\-08:00', -28800), + ('UTC\-07:00', -25200), + ('UTC\-06:00', -21600), + ('UTC\-05:00', -18000), + ('UTC\-04:30', -16200), + ('UTC\-04:00', -14400), + ('UTC\-03:30', -12600), + ('UTC\-03:00', -10800), + ('UTC\-02:00', -7200), + ('UTC\-01:00', -3600), + ('UTC\+00:00', 0), + ('UTC\+01:00', 3600), + ('UTC\+02:00', 7200), + ('UTC\+03:00', 10800), + ('UTC\+03:30', 12600), + ('UTC\+04:00', 14400), + ('UTC\+04:30', 16200), + ('UTC\+05:00', 18000), + ('UTC\+05:30', 
19800), + ('UTC\+05:45', 20700), + ('UTC\+06:00', 21600), + ('UTC\+06:30', 23400), + ('UTC\+07:00', 25200), + ('UTC\+08:00', 28800), + ('UTC\+08:45', 31500), + ('UTC\+09:00', 32400), + ('UTC\+09:30', 34200), + ('UTC\+10:00', 36000), + ('UTC\+10:30', 37800), + ('UTC\+11:00', 39600), + ('UTC\+11:30', 41400), + ('UTC\+12:00', 43200), + ('UTC\+12:45', 45900), + ('UTC\+13:00', 46800), + ('UTC\+14:00', 50400)] + }, ] diff --git a/tests/test_date.py b/tests/test_date.py index 7fd6c75ad..0c45266e1 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -14,6 +14,9 @@ from dateparser import date from dateparser.date import get_last_day_of_month from dateparser.languages.loader import LanguageDataLoader +from dateparser.languages.loader import default_language_loader +from dateparser.conf import settings + from tests import BaseTestCase @@ -417,14 +420,14 @@ def given_parser(self, restrict_to_languages=None, **params): if restrict_to_languages is not None: language_loader = LanguageDataLoader() - language_map = date.default_language_loader.get_language_map() + language_map = default_language_loader.get_language_map() ordered_languages = OrderedDict([ (shortname, language_map[shortname]) for shortname in restrict_to_languages ]) language_loader._data = ordered_languages - self.add_patch(patch('dateparser.date.default_language_loader', new=language_loader)) + self.add_patch(patch('dateparser.date.DateDataParser.language_loader', new=language_loader)) def given_local_tz_offset(self, offset): self.add_patch( diff --git a/tests/test_date_parser.py b/tests/test_date_parser.py index 47dce9be6..000c5448f 100644 --- a/tests/test_date_parser.py +++ b/tests/test_date_parser.py @@ -106,7 +106,7 @@ def given_parser(self, languages=None, allow_redetection=False): language_map = default_language_loader.get_language_map() languages = [language_map[language] for language in languages] - self.parser = AutoDetectLanguage(languages=languages, allow_redetection=allow_redetection) + 
self.parser = AutoDetectLanguage(languages, allow_redetection=allow_redetection) def given_parser_languages_are(self, languages): language_map = default_language_loader.get_language_map() @@ -293,6 +293,7 @@ def test_dates_parsing(self, date_string, expected): param('17th October, 2034 @ 01:08 am PDT', datetime(2034, 10, 17, 9, 8)), param('15 May 2004 23:24 EDT', datetime(2004, 5, 16, 4, 24)), param('15 May 2004', datetime(2004, 5, 15, 0, 0)), + param('08/17/14 17:00 (PDT)', datetime(2014, 8, 18, 1, 0)), ]) def test_parsing_with_time_zones(self, date_string, expected): self.given_local_tz_offset(+1) @@ -302,6 +303,21 @@ def test_parsing_with_time_zones(self, date_string, expected): self.then_period_is('day') self.then_date_obj_exactly_is(expected) + @parameterized.expand([ + param('15 May 2004 16:10 -0400', datetime(2004, 5, 15, 20, 10)), + param('1999-12-31 19:00:00 -0500', datetime(2000, 1, 1, 0, 0)), + param('1999-12-31 19:00:00 +0500', datetime(1999, 12, 31, 14, 0)), + param('Fri, 09 Sep 2005 13:51:39 -0700', datetime(2005, 9, 9, 20, 51, 39)), + param('Fri, 09 Sep 2005 13:51:39 +0000', datetime(2005, 9, 9, 13, 51, 39)), + ]) + def test_parsing_with_utc_offsets(self, date_string, expected): + self.given_local_tz_offset(0) + self.given_parser() + self.when_date_is_parsed(date_string) + self.then_date_was_parsed_by_date_parser() + self.then_period_is('day') + self.then_date_obj_exactly_is(expected) + def test_empty_dates_string_is_not_parsed(self): self.when_date_is_parsed_by_date_parser('') self.then_error_was_raised(ValueError, "Empty string") @@ -351,6 +367,7 @@ def test_preferably_future_dates(self, date_string, expected): self.then_date_was_parsed_by_date_parser() self.then_date_obj_exactly_is(expected) + @parameterized.expand([ param('10 December', datetime(2015, 12, 10)), param('March', datetime(2015, 3, 15)), @@ -440,6 +457,18 @@ def test_error_should_be_raised_for_invalid_dates_with_too_large_day_number(self 
self.when_date_is_parsed_by_date_parser(date_string) self.then_error_was_raised(ValueError, 'Day not in range for month') + @parameterized.expand([ + param('2015-05-02T10:20:19+0000', languages=['fr'], expected=datetime(2015, 5, 2, 10, 20, 19)), + param('2015-05-02T10:20:19+0000', languages=['en'], expected=datetime(2015, 5, 2, 10, 20, 19)), + param('2015-05-02T10:20:19+0000', languages=[], expected=datetime(2015, 5, 2, 10, 20, 19)), + ]) + def test_iso_datestamp_format_should_always_parse(self, date_string, languages, expected): + self.given_local_tz_offset(0) + self.given_parser(languages=languages) + self.when_date_is_parsed(date_string) + self.then_date_was_parsed_by_date_parser() + self.then_date_obj_exactly_is(expected) + @parameterized.expand([ param('10 December', expected=datetime(2015, 12, 10), period='day'), param('March', expected=datetime(2015, 3, 15), period='month'), @@ -473,20 +502,21 @@ def given_local_tz_offset(self, offset): new=timedelta(seconds=3600 * offset)) ) - def given_parser(self): + def given_parser(self, *args, **kwds): def collecting_get_date_data(parse): @wraps(parse) def wrapped(date_string): self.date_result = parse(date_string) return self.date_result return wrapped + self.add_patch(patch.object(date_parser, 'parse', collecting_get_date_data(date_parser.parse))) self.date_parser = Mock(wraps=date_parser) self.add_patch(patch('dateparser.date.date_parser', new=self.date_parser)) - self.parser = DateDataParser() + self.parser = DateDataParser(*args, **kwds) def given_configuration(self, key, value): self.add_patch(patch.object(settings, key, new=value)) diff --git a/tests/test_timezone_parser.py b/tests/test_timezone_parser.py index 46c9453c3..032ab0b03 100644 --- a/tests/test_timezone_parser.py +++ b/tests/test_timezone_parser.py @@ -17,13 +17,19 @@ def setUp(self): @parameterized.expand([ param('Sep 03 2014 | 4:32 pm EDT', -4), param('17th October, 2034 @ 01:08 am PDT', -7), + param('17th October, 2034 @ 01:08 am (PDT)', -7), 
param('October 17, 2014 at 7:30 am PST', -8), param('20 Oct 2014 13:08 CET', +1), param('20 Oct 2014 13:08cet', +1), param('Nov 25 2014 | 10:17 pm EST', -5), + param('Nov 25 2014 | 10:17 pm +0600', +6), + param('Nov 25 2014 | 10:17 pm -0930', -9.5), + param('20 Oct 2014 | 05:17 am -1200', -12), + param('20 Oct 2014 | 05:17 am +0000', 0), param('15 May 2004', None), ]) def test_extracting_valid_offset(self, initial_string, expected_offset): + print self.datetime_string, self.timezone_offset self.given_string(initial_string) self.when_offset_popped_from_string() self.then_offset_is(expected_offset) @@ -35,6 +41,8 @@ def test_extracting_valid_offset(self, initial_string, expected_offset): param('20 Oct 2014 13:08 CET', '20 Oct 2014 13:08 '), param('20 Oct 2014 13:08cet', '20 Oct 2014 13:08'), param('Nov 25 2014 | 10:17 pm EST', 'Nov 25 2014 | 10:17 pm '), + param('17th October, 2034 @ 01:08 am +0700', '17th October, 2034 @ 01:08 am '), + param('Sep 03 2014 4:32 pm +0630', 'Sep 03 2014 4:32 pm '), ]) def test_timezone_deleted_from_string(self, initial_string, result_string): self.given_string(initial_string)