Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve concurrency issues #834

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 28 additions & 19 deletions dateparser/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,21 @@ class Settings:
_pyfile_data = None
_mod_settings = dict()

def __init__(self, settings=None):
if settings:
self._updateall(settings.items())
else:
def __init__(self, **kwargs):
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Collaborator

@noviluni noviluni Nov 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gallaecio @alex-triphonov
would it be possible to keep the settings=None and, in case it's used, raise a deprecation warning and preserve the old behavior?

Something like:

def __init__(self, settings=None, **kwargs):
    if settings:
        # raise warning (deprecation warning, not concurrent) + old behavior
    else:
        # new behavior using kwargs

I know the code won't be nice, but I think in that way we could keep the backward compatibility until the next major version.

(You should do the same for get_key() and maybe something similar in the other two methods you touched).

Maybe I'm saying something stupid, sorry guys, I didn't have too much time to play with the code.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No this is not possible- settings are still being used, but in kwargs for more convenience, the only way to separate behaviours is to add some new kwarg like new_settings but is old behaviour with fewer Settings instances that much essential?

Copy link
Member

@Gallaecio Gallaecio Nov 23, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To me it’s mostly a matter of API. The current change would require to postpone this to dateparser 2.x, as it is backward incompatible. If we can figure a backward compatible implementation, this can go on the next dateparser version.

if not kwargs.get('settings'):
self._updateall(self._get_settings_from_pyfile().items())

elif len(self.__dict__) == 1:
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
self._updateall(kwargs['settings'].items())

@classmethod
def get_key(cls, settings=None):
if not settings:
return 'default'
def get_key(cls, **kwargs):
if kwargs:
keys = sorted('{}-{}'.format(key, val) for key, val in kwargs.pop('settings').items())
keys.extend(sorted('{}-{}'.format(key, val) for key, val in kwargs.items() if val))
return hashlib.md5(''.join(sorted(keys)).encode('utf-8')).hexdigest()

keys = sorted(['%s-%s' % (key, str(settings[key])) for key in settings])
return hashlib.md5(''.join(keys).encode('utf-8')).hexdigest()
return 'default'

@classmethod
def _get_settings_from_pyfile(cls):
Expand All @@ -57,18 +59,21 @@ def _updateall(self, iterable):
setattr(self, key, value)

def replace(self, mod_settings=None, **kwds):
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
for k, v in kwds.items():
_settings = kwds.get('settings', {}).copy()
for k, v in _settings.items():
if v is None:
raise TypeError('Invalid {{"{}": {}}}'.format(k, v))

for x in self._get_settings_from_pyfile().keys():
kwds.setdefault(x, getattr(self, x))
z = self._get_settings_from_pyfile().keys()
for x in z:
_settings.setdefault(x, getattr(self, x))

kwds['_default'] = False
_settings['_default'] = False
if mod_settings:
kwds['_mod_settings'] = mod_settings
_settings['_mod_settings'] = mod_settings

return self.__class__(settings=kwds)
kwds['settings'] = _settings
return self.__class__(**kwds)


settings = Settings()
Expand All @@ -77,11 +82,15 @@ def replace(self, mod_settings=None, **kwds):
def apply_settings(f):
@wraps(f)
def wrapper(*args, **kwargs):
mod_settings = kwargs.get('settings')
kwargs['settings'] = mod_settings or settings
mod_settings = kwargs.get('settings', {})
if mod_settings is None:
kwargs['settings'] = mod_settings = {}
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

if isinstance(kwargs['settings'], dict):
kwargs['settings'] = settings.replace(mod_settings=mod_settings, **kwargs['settings'])
if kwargs:
if isinstance(mod_settings, dict):
kwargs['settings'] = settings.replace(mod_settings=mod_settings.copy(), **kwargs)
else:
kwargs['settings'] = settings

if not isinstance(kwargs['settings'], Settings):
raise TypeError("settings can only be either dict or instance of Settings class")
Expand Down
37 changes: 17 additions & 20 deletions dateparser/freshness_date_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

class FreshnessDateDataParser:
""" Parses date string like "1 year, 2 months ago" and "3 hours, 50 minutes ago" """
def __init__(self):
self.now = None

def _are_all_words_units(self, date_string):
skip = [_UNITS,
Expand Down Expand Up @@ -59,42 +57,42 @@ def apply_time(dateobj, timeobj):
)

if settings.RELATIVE_BASE:
self.now = settings.RELATIVE_BASE
now = settings.RELATIVE_BASE

if 'local' not in _settings_tz:
self.now = localize_timezone(self.now, settings.TIMEZONE)
now = localize_timezone(now, settings.TIMEZONE)

if ptz:
if self.now.tzinfo:
self.now = self.now.astimezone(ptz)
if now.tzinfo:
now = now.astimezone(ptz)
else:
if hasattr(ptz, 'localize'):
self.now = ptz.localize(self.now)
now = ptz.localize(now)
else:
self.now = self.now.replace(tzinfo=ptz)
now = now.replace(tzinfo=ptz)

if not self.now.tzinfo:
if not now.tzinfo:
if hasattr(self.get_local_tz(), 'localize'):
self.now = self.get_local_tz().localize(self.now)
now = self.get_local_tz().localize(now)
else:
self.now = self.now.replace(tzinfo=self.get_local_tz())
now = now.replace(tzinfo=self.get_local_tz())

elif ptz:
_now = datetime.now(ptz)

if 'local' in _settings_tz:
self.now = _now
now = _now
else:
self.now = apply_timezone(_now, settings.TIMEZONE)
now = apply_timezone(_now, settings.TIMEZONE)

else:
if 'local' not in _settings_tz:
utc_dt = datetime.utcnow()
self.now = apply_timezone(utc_dt, settings.TIMEZONE)
now = apply_timezone(utc_dt, settings.TIMEZONE)
else:
self.now = datetime.now(self.get_local_tz())
now = datetime.now(self.get_local_tz())

date, period = self._parse_date(date_string, settings.PREFER_DATES_FROM)
date, period = self._parse_date(date_string, now, settings.PREFER_DATES_FROM)

if date:
old_date = date
Expand All @@ -112,10 +110,9 @@ def apply_time(dateobj, timeobj):
):
date = date.replace(tzinfo=None)

self.now = None
return date, period

def _parse_date(self, date_string, prefer_dates_from):
def _parse_date(self, date_string, now, prefer_dates_from):
if not self._are_all_words_units(date_string):
return None, None

Expand All @@ -135,9 +132,9 @@ def _parse_date(self, date_string, prefer_dates_from):
or re.search(r'\bfuture\b', prefer_dates_from)
and not re.search(r'\bago\b', date_string)
):
date = self.now + td
date = now + td
else:
date = self.now - td
date = now - td
return date, period

def get_kwargs(self, date_string):
Expand Down
2 changes: 1 addition & 1 deletion dateparser/search/text_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def __init__(self, languages):
self.language_chars = []

def get_unique_characters(self, settings):
settings = settings.replace(NORMALIZE=False)
settings = settings.replace(settings={'NORMALIZE': False})

for language in self.languages:
chars = language.get_wordchars_for_detection(settings=settings)
Expand Down
58 changes: 58 additions & 0 deletions tests/test_concurrency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import concurrent.futures
import random
from datetime import datetime

import dateparser
from tests import BaseTestCase

RELATIVE = {'RELATIVE_BASE': datetime(2014, 9, 15, 10, 30)}

TEST_DATA = [
{'ds': 'Tue May 07, 2018 10:55 PM', 'expected': datetime(2018, 5, 7, 22, 55), 'loc': 'en'},
{'ds': '2018-10-07T22:55:01', 'expected': datetime(2018, 10, 7, 22, 55, 1), 'loc': 'en'},
{'ds': '2018-Oct-11', 'expected': datetime(2018, 10, 11, 0, 0), 'loc': 'en'},
{'ds': '12.04.2018', 'expected': datetime(2018, 12, 4, 0, 0), 'loc': 'en'},
{'ds': '12-10-2018 20:13', 'expected': datetime(2018, 12, 10, 20, 13), 'loc': 'en'},
{'ds': '03.04.2019', 'expected': datetime(2019, 4, 3, 0, 0), 'loc': 'en-150'},
{'ds': 'on Tue October 7, 2019 04:55 PM', 'expected': datetime(2019, 10, 7, 16, 55), 'loc': 'en-150'},
{'ds': '2019Oct8', 'expected': datetime(2019, 10, 8, 0, 0), 'loc': 'en-150'},
{'ds': '07.03.2020 - 11:13', 'expected': datetime(2020, 3, 7, 11, 13), 'loc': 'ru'},
{'ds': '9 Авг. 2020 17:11:01', 'expected': datetime(2020, 8, 9, 17, 11, 1), 'loc': 'ru'},
{'ds': '07.01.2020', 'expected': datetime(2020, 1, 7, 0, 0), 'loc': 'ru'},
{'ds': 'yesterday 11:00', 'expected': datetime(2014, 9, 14, 11), 'loc': 'en', 'extra': RELATIVE},
{'ds': '13 days ago', 'expected': datetime(2014, 9, 2, 10, 30), 'loc': 'en', 'extra': RELATIVE},
] * 180

random.shuffle(TEST_DATA)


class TestConcurrency(BaseTestCase):

def test_concurrency(self):
with concurrent.futures.ThreadPoolExecutor() as executor:

results = list(executor.map(self.concurrency_test, TEST_DATA))
results_with_error = [(r['ds'], r['error']) for r in results if r['error']]
msg = '{}Threads failed with errors:\n{}'
self.assertEqual([], results_with_error,
msg.format(len(results_with_error), set(results_with_error)))

wrong_results = [str(r) for r in results if (r['expected'] != r['date'])]
msg = '{} Threads returned wrong date time:\n{}'
self.assertEqual([], wrong_results,
msg.format(len(wrong_results), '\n'.join(wrong_results)))

@staticmethod
def concurrency_test(data_for_test):
try:
date_string = data_for_test['ds']
date = dateparser.parse(date_string, locales=[data_for_test['loc']],
settings=data_for_test.get('extra'))
if date:
data_for_test['date'] = date
data_for_test['error'] = None
except Exception as error:
data_for_test['error'] = str(error)
data_for_test['date'] = None
finally:
return data_for_test
3 changes: 1 addition & 2 deletions tests/test_freshness_date_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1546,7 +1546,7 @@ def test_freshness_date_with_timezone_conversion(self, date_string, timezone, to
self.then_time_is(time)

def test_freshness_date_with_to_timezone_setting(self):
_settings = settings.replace(**{
_settings = settings.replace(settings={
'TIMEZONE': 'local',
'TO_TIMEZONE': 'UTC',
'RELATIVE_BASE': datetime(2014, 9, 1, 10, 30)
Expand Down Expand Up @@ -1666,7 +1666,6 @@ def wrapped(*args, **kwargs):
collecting_get_date_data(freshness_date_parser.get_date_data)))

self.freshness_parser = Mock(wraps=freshness_date_parser)
self.add_patch(patch.object(self.freshness_parser, 'now', self.now))

dt_mock = Mock(wraps=dateparser.freshness_date_parser.datetime)
dt_mock.utcnow = Mock(return_value=self.now)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def given_configurations(self, confs):
if 'TIMEZONE' not in confs:
confs.update({'TIMEZONE': 'local'})

self.confs = settings.replace(**confs)
self.confs = settings.replace(settings=confs)

def when_date_is_parsed(self):
self.result = parse(self.given_ds, settings=(self.confs or {}))
Expand Down
6 changes: 4 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ def test_apply_timezone_function(self, date, timezone, expected):
param(datetime(2015, 12, 12, 10, 12), timezone='-0500', expected=datetime(2015, 12, 12, 5, 12)),
])
def test_apply_timezone_from_settings_function(self, date, timezone, expected):
result = apply_timezone_from_settings(date, settings.replace(**{'TO_TIMEZONE': timezone, 'TIMEZONE': 'UTC'}))
result = apply_timezone_from_settings(date,
settings.replace(settings={'TO_TIMEZONE': timezone, 'TIMEZONE': 'UTC'})
)
self.assertEqual(expected, result)

@parameterized.expand([
Expand All @@ -101,7 +103,7 @@ def test_apply_timezone_from_settings_function_none_settings(self, date, expecte
param(datetime(2015, 12, 12, 10, 12),),
])
def test_apply_timezone_from_settings_function_should_return_tz(self, date):
result = apply_timezone_from_settings(date, settings.replace(**{'RETURN_AS_TIMEZONE_AWARE': True}))
result = apply_timezone_from_settings(date, settings.replace(settings={'RETURN_AS_TIMEZONE_AWARE': True}))
self.assertTrue(bool(result.tzinfo))

def test_registry_when_get_keys_not_implemented(self):
Expand Down