Skip to content

Commit

Permalink
Merge pull request #86 from scrapinghub/feature-iso-date-format-with-…
Browse files Browse the repository at this point in the history
…non-english-language

Feature iso date format with non english language
  • Loading branch information
waqasshabbir committed Jul 13, 2015
2 parents de675dc + e4d4ff8 commit 1718d73
Show file tree
Hide file tree
Showing 11 changed files with 542 additions and 423 deletions.
6 changes: 0 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,3 @@ Limitations

* Only Python 2 support for now (Python 3 support **will be** added in future versions)

* `dateparser` at this point does not support generic parsing of dates with fixed UTC offsets. This restricts its ability to reliably parse time zone aware dates since the use of abbreviated time zones as a sole designator of time zones is not recommended.

Read `Wikipedia Time Zone article`_ for more information.

.. _Wikipedia Time Zone Article: https://en.wikipedia.org/wiki/Time_zone#Abbreviations

2 changes: 1 addition & 1 deletion data/languages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ base:
en:
name: English

skip: ["at", "on", "and", "ad", "m", "t", "of", "st", "nd", "rd", "th", "about", "the", "just"]
skip: ["at", "on", "and", "ad", "m", "of", "st", "nd", "rd", "th", "about", "the", "just"]
pertain: ["of"]

monday:
Expand Down
9 changes: 9 additions & 0 deletions dateparser/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,22 @@
>>> settings.update('PREFER_DATES_FROM', 'future')
>>> parse(u'March')
datetime.datetime(2016, 3, 16, 0, 0)
*``SKIP_TOKENS``* is a ``list`` of tokens to discard while detecting language. Defaults to ``['t']``, which skips the ``T`` separator in ISO format datetime strings, e.g. ``2015-05-02T10:20:19+0000``.
This only works with :class:`DateDataParser`, as shown below:
>>> settings.update('SKIP_TOKENS', ['de']) # Turkish word for 'at'
>>> from dateparser.date import DateDataParser
>>> DateDataParser().get_date_data(u'27 Haziran 1981 de') # Turkish (at 27 June 1981)
{'date_obj': datetime.datetime(1981, 6, 27, 0, 0), 'period': 'day'}
"""


class Settings(object):
PREFER_DATES_FROM = 'current_period' # past, future, current_period
SUPPORT_BEFORE_COMMON_ERA = False
PREFER_DAY_OF_MONTH = 'current' # current, first, last
SKIP_TOKENS = ['t']

def __init__(self, **kwargs):
for key in kwargs:
Expand Down
20 changes: 15 additions & 5 deletions dateparser/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from dateparser.date_parser import date_parser
from dateparser.freshness_date_parser import freshness_date_parser
from dateparser.languages import default_language_loader
from dateparser.languages.loader import LanguageDataLoader
from dateparser.languages.detection import AutoDetectLanguage, ExactLanguages


Expand Down Expand Up @@ -244,10 +244,12 @@ class DateDataParser(object):
:raises:
ValueError - Unknown Language, TypeError - Languages argument must be a list
"""
language_loader = None

def __init__(self, languages=None, allow_redetect_language=False):
available_language_map = self._get_language_loader().get_language_map()

if isinstance(languages, (list, tuple, collections.Set)):
available_language_map = default_language_loader.get_language_map()

if all([language in available_language_map for language in languages]):
languages = [available_language_map[language] for language in languages]
Expand All @@ -258,12 +260,14 @@ def __init__(self, languages=None, allow_redetect_language=False):
raise TypeError("languages argument must be a list (%r given)" % type(languages))

if allow_redetect_language:
self.language_detector = AutoDetectLanguage(languages=languages if languages else None,
allow_redetection=True)
self.language_detector = AutoDetectLanguage(
languages if languages else available_language_map.values(),
allow_redetection=True)
elif languages:
self.language_detector = ExactLanguages(languages=languages)
else:
self.language_detector = AutoDetectLanguage(languages=None, allow_redetection=False)
self.language_detector = AutoDetectLanguage(
available_language_map.values(), allow_redetection=False)

def get_date_data(self, date_string, date_formats=None):
"""
Expand Down Expand Up @@ -314,3 +318,9 @@ def get_date_data(self, date_string, date_formats=None):
return parsed_date
else:
return {'date_obj': None, 'period': 'day'}

@classmethod
def _get_language_loader(cls):
    """Return the class-level language loader, building it on first use.

    The loader is stored on ``cls.language_loader`` so that later calls
    reuse it instead of constructing a new ``LanguageDataLoader``.
    """
    cached_loader = cls.language_loader
    if cached_loader:
        return cached_loader
    cls.language_loader = LanguageDataLoader()
    return cls.language_loader
6 changes: 1 addition & 5 deletions dateparser/languages/detection.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
from functools import wraps

from dateparser.languages import default_language_loader


def _restore_languages_on_generator_exit(method):
@wraps(method)
Expand Down Expand Up @@ -39,9 +37,7 @@ def _filter_languages(date_string, languages):


class AutoDetectLanguage(BaseLanguageDetector):
def __init__(self, languages=None, allow_redetection=False):
if languages is None:
languages = default_language_loader.get_languages()
def __init__(self, languages, allow_redetection=False):
super(AutoDetectLanguage, self).__init__(languages=languages[:])
self.language_pool = languages[:]
self.allow_redetection = allow_redetection
Expand Down
4 changes: 3 additions & 1 deletion dateparser/languages/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from yaml import load as load_yaml

from .language import Language
from ..conf import settings


class LanguageDataLoader(object):
Expand Down Expand Up @@ -35,7 +36,8 @@ def _load_data(self):
else:
data = self.file.read()
data = load_yaml(data)
base_data = data.pop('base', {})
base_data = data.pop('base', {'skip': []})
base_data['skip'] += settings.SKIP_TOKENS
known_languages = {}
for shortname, language_info in data.iteritems():
self._update_language_info_with_base_info(language_info, base_data)
Expand Down
38 changes: 25 additions & 13 deletions dateparser/timezone_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,12 @@

from dateparser.timezones import timezone_info_list

TIMEZONE_REGEX_PATTERN = r'(\b|\d)%s$'


def pop_tz_offset_from_string(date_string, as_offset=True):
    """Remove the first recognised timezone designator from ``date_string``.

    The ``(name, info)`` pairs of the module-level ``_tz_offsets`` list are
    tried in order; the first one whose compiled ``info['regex']`` matches
    the string wins.

    :param date_string: the date string to inspect.
    :param as_offset: when true, report the match as ``info['offset']``;
        otherwise report the timezone name.
    :returns: a ``(date_string, timezone)`` tuple. When no pattern matches,
        the string is returned unchanged and ``timezone`` is ``None``.
    """
    # ``_tz_offsets`` is a list of (name, info) pairs, so it is iterated
    # directly rather than through ``.iteritems()``.
    for name, info in _tz_offsets:
        timezone_re = info['regex']
        if timezone_re.search(date_string):
            # Replace the match with its first group so that whatever the
            # pattern captured in group 1 is kept in the string.
            stripped = timezone_re.sub(r'\1', date_string)
            return stripped, (info['offset'] if as_offset else name)
    # Only reached after every pattern has been tried without a match.
    return date_string, None
Expand All @@ -22,18 +20,32 @@ def convert_to_local_tz(datetime_obj, datetime_tz_offset):


def get_tz_offsets():
    """Yield ``(name, info)`` pairs for every known timezone pattern.

    Each entry of ``timezone_info_list`` is read as a mapping with:

    * ``'regex_patterns'`` - pattern templates containing one ``%s`` slot
      that receives the timezone name;
    * ``'timezones'`` - sequences whose item 0 is the timezone name and
      item 1 is its offset in seconds;
    * ``'replace'`` (optional) - ``(pattern, replacement)`` pairs used to
      derive alternate forms of each template.

    ``info`` is a dict holding the compiled, case-insensitive ``'regex'``
    and the ``'offset'`` as a ``timedelta``.

    This is a generator and nothing else: it must not also ``return`` a
    value, since that is a syntax error on Python 2 and ends the generator
    before the first ``yield`` on Python 3.
    """

    def get_offset(tz_obj, regex, repl='', replw=''):
        # With the default empty ``repl`` the re.sub() call leaves the
        # expanded template unchanged.
        return (
            tz_obj[0],
            {
                'regex': re.compile(re.sub(repl, replw, regex % tz_obj[0]), re.IGNORECASE),
                'offset': timedelta(seconds=tz_obj[1])
            }
        )

    for tz_info in timezone_info_list:
        for regex in tz_info['regex_patterns']:
            for tz_obj in tz_info['timezones']:
                yield get_offset(tz_obj, regex)

            # alternate patterns
            for replace, replacewith in tz_info.get('replace', []):
                for tz_obj in tz_info['timezones']:
                    yield get_offset(tz_obj, regex, repl=replace, replw=replacewith)


def get_local_tz_offset():
    """Return the local timezone's current offset from UTC as a ``timedelta``.

    The offset is positive when local time is ahead of UTC. It is derived
    from a single timestamp, so the result is a whole number of seconds;
    subtracting two separate clock reads (``now()`` and ``utcnow()``)
    leaves a small jitter that then has to be rounded away.

    :returns: ``timedelta`` equal to local wall-clock time minus UTC.
    """
    # Function-scope imports keep this block self-contained; the
    # module-level import list is not visible from here.
    import calendar
    import time

    timestamp = int(time.time())
    # timegm() reads the local broken-down time as if it were UTC, so the
    # difference from the real timestamp is exactly the local UTC offset.
    local_as_utc = calendar.timegm(time.localtime(timestamp))
    return timedelta(seconds=local_as_utc - timestamp)


# Build the table once and keep it as a list: pop_tz_offset_from_string()
# iterates over it on every call, which a one-shot generator could not
# support.
_tz_offsets = list(get_tz_offsets())
local_tz_offset = get_local_tz_offset()
Loading

0 comments on commit 1718d73

Please sign in to comment.