Skip to content

Commit

Permalink
Merge pull request #932 from gavishpoddar/language
Browse files Browse the repository at this point in the history
Optional Language Detect
  • Loading branch information
lopuhin authored Sep 6, 2021
2 parents 44e8624 + b8dcf7b commit 544ea39
Show file tree
Hide file tree
Showing 25 changed files with 1,218 additions and 25 deletions.
13 changes: 10 additions & 3 deletions dateparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@


@apply_settings
def parse(date_string, date_formats=None, languages=None, locales=None, region=None, settings=None):
def parse(date_string, date_formats=None, languages=None, locales=None,
region=None, settings=None, detect_languages_function=None):
"""Parse date and time from given date string.
:param date_string:
Expand Down Expand Up @@ -39,6 +40,12 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N
Configure customized behavior using settings defined in :mod:`dateparser.conf.Settings`.
:type settings: dict
:param detect_languages_function:
A function for language detection that takes as input a string (the `date_string`) and
a `confidence_threshold`, and returns a list of detected language codes.
Note: this function is only used if ``languages`` and ``locales`` are not provided.
:type detect_languages_function: function
:return: Returns :class:`datetime <datetime.datetime>` representing parsed date if successful, else returns None
:rtype: :class:`datetime <datetime.datetime>`.
:raises:
Expand All @@ -47,9 +54,9 @@ def parse(date_string, date_formats=None, languages=None, locales=None, region=N
"""
parser = _default_parser

if languages or locales or region or not settings._default:
if languages or locales or region or detect_languages_function or not settings._default:
parser = DateDataParser(languages=languages, locales=locales,
region=region, settings=settings)
region=region, settings=settings, detect_languages_function=detect_languages_function)

data = parser.get_date_data(date_string, date_formats)

Expand Down
33 changes: 33 additions & 0 deletions dateparser/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from datetime import datetime
from functools import wraps

from dateparser.data.languages_info import language_order
from .parser import date_order_chart
from .utils import registry

Expand All @@ -25,6 +26,8 @@ class Settings:
* `NORMALIZE`
* `RETURN_TIME_AS_PERIOD`
* `PARSERS`
* `DEFAULT_LANGUAGES`
* `LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD`
"""

_default = True
Expand Down Expand Up @@ -129,6 +132,28 @@ def _check_parsers(setting_name, setting_value):
_check_repeated_values(setting_name, setting_value)


def _check_default_languages(setting_name, setting_value):
    """Validate the list of language codes given for a setting.

    :param setting_name: Name of the setting, used in the error message.
    :param setting_value: Iterable of language codes to validate.
    :raises SettingValidationError:
        If any code in ``setting_value`` is not present in ``language_order``.
    """
    unknown_codes = set(setting_value) - set(language_order)
    if not unknown_codes:
        # Every code is supported; hand over to the repeated-values check
        # (presumably it rejects duplicates -- defined elsewhere in this module).
        _check_repeated_values(setting_name, setting_value)
        return

    listing = ', '.join(repr(code) for code in unknown_codes)
    raise SettingValidationError(
        "Found invalid languages in the '{}' setting: {}".format(setting_name, listing)
    )


def _check_between_0_and_1(setting_name, setting_value):
is_valid = 0 <= setting_value <= 1
if not is_valid:
raise SettingValidationError(
'{} is not a valid value for {}. It can take values between 0 and '
'1.'.format(
setting_value, setting_name,
)
)


def check_settings(settings):
"""
Check if provided settings are valid, if not it raises `SettingValidationError`.
Expand Down Expand Up @@ -193,6 +218,14 @@ def check_settings(settings):
'PREFER_LOCALE_DATE_ORDER': {
'type': bool
},
'DEFAULT_LANGUAGES': {
'type': list,
'extra_check': _check_default_languages
},
'LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD': {
'type': float,
'extra_check': _check_between_0_and_1
},
}

modified_settings = settings._mod_settings # check only modified settings
Expand Down
Empty file.
45 changes: 45 additions & 0 deletions dateparser/custom_language_detection/fasttext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os

import fasttext

from dateparser_cli.fasttext_manager import fasttext_downloader
from dateparser_cli.utils import dateparser_model_home, create_data_model_home
from dateparser_cli.exceptions import FastTextModelNotFoundException


# File names that are recognised as usable models inside `dateparser_model_home`.
_supported_models = ["large.bin", "small.bin"]
# Model name passed to `fasttext_downloader` when no supported model file is found.
_DEFAULT_MODEL = "small"


class _FastTextCache:
    """Module-level holder for the loaded fastText model."""

    # Set by _load_fasttext_model() after the first successful load; None until then.
    model = None


def _load_fasttext_model():
    """Return the fastText model, loading it from disk on first use.

    The loaded model is cached on ``_FastTextCache.model`` so later calls
    return it directly. When no supported model file is present in
    ``dateparser_model_home``, ``fasttext_downloader`` is called once with
    the default model name and the directory is scanned a second time.

    :return: The object returned by ``fasttext.load_model``.
    :raises FastTextModelNotFoundException:
        If no supported model file exists even after the download attempt,
        or if the selected directory entry is not a regular file.
    """
    if _FastTextCache.model is not None:
        return _FastTextCache.model

    create_data_model_home()

    downloaded_models = []
    # At most two scans: one before and one after the single download attempt.
    # This bounds the retry instead of recursing until RecursionError when the
    # downloader returns without producing a supported file.
    for attempt in range(2):
        downloaded_models = [
            file_name for file_name in os.listdir(dateparser_model_home)
            if file_name in _supported_models
        ]
        if downloaded_models:
            break
        if attempt == 0:
            fasttext_downloader(_DEFAULT_MODEL)

    if not downloaded_models:
        raise FastTextModelNotFoundException('Fasttext model file not found')

    model_path = os.path.join(dateparser_model_home, downloaded_models[0])
    # A directory entry with a supported name could still be a folder.
    if not os.path.isfile(model_path):
        raise FastTextModelNotFoundException('Fasttext model file not found')

    _FastTextCache.model = fasttext.load_model(model_path)
    return _FastTextCache.model


def detect_languages(text, confidence_threshold):
    """Detect the languages of ``text`` with the fastText model.

    Newlines are replaced by spaces and carriage returns are removed before
    the text is handed to the model's ``predict`` method.

    :param text: The string to analyse.
    :param confidence_threshold:
        Only predictions whose probability is strictly greater than this
        value are kept.
    :return: List of predicted labels with the ``__label__`` prefix removed.
    """
    model = _load_fasttext_model()
    single_line = text.replace('\n', ' ').replace('\r', '')
    prediction = model.predict(single_line)
    # prediction[0] holds the labels and prediction[1] the matching probabilities.
    return [
        prediction[0][position].replace("__label__", "")
        for position, probability in enumerate(prediction[1])
        if probability > confidence_threshold
    ]
37 changes: 37 additions & 0 deletions dateparser/custom_language_detection/langdetect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import langdetect


# A private DetectorFactory instance (with a fixed seed) is kept in _Factory
# so that results are consistent without changing the library's global state.
# Reference: https://github.com/Mimino666/langdetect

class _Factory:
    """Module-level holder for the lazily created langdetect factory."""

    # Populated by _init_factory() on first use; None until then.
    data = None


def _init_factory():
    """Create and configure the shared langdetect factory if it does not exist yet.

    The factory is built in a local variable and stored on ``_Factory.data``
    only after its profiles are loaded and its seed is set. If loading the
    profiles raises, ``_Factory.data`` stays ``None`` and the next call tries
    again, rather than reusing a factory with no profiles.
    """
    if _Factory.data is not None:
        return

    factory = langdetect.detector_factory.DetectorFactory()
    factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY)
    # Fixed seed on this instance only, so the library-wide factory is untouched.
    factory.seed = 0
    _Factory.data = factory


def _get_language_probablities(text):
    """Return langdetect's language probabilities for ``text``.

    A fresh detector is created from the shared factory for every call.

    :param text: The string to analyse.
    :return: The value returned by the detector's ``get_probabilities()``.
    """
    _init_factory()
    language_detector = _Factory.data.create()
    language_detector.append(text)
    probabilities = language_detector.get_probabilities()
    return probabilities


def detect_languages(text, confidence_threshold):
    """Detect the languages of ``text`` with langdetect.

    :param text: The string to analyse.
    :param confidence_threshold:
        Only candidates whose probability is strictly greater than this
        value are kept.
    :return:
        List of language codes of the retained candidates; an empty list
        when langdetect raises ``LangDetectException``.
    """
    try:
        candidates = _get_language_probablities(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # This exception can be produced with empty strings or inputs without
        # letters like `10-10-2021`. As this could be really common, it is ignored.
        return []

    return [
        candidate.lang
        for candidate in candidates
        if candidate.prob > confidence_threshold
    ]
18 changes: 18 additions & 0 deletions dateparser/custom_language_detection/language_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from dateparser.data.languages_info import language_map


def map_languages(language_codes):
    """
    Translate detected language codes into the supported language codes.

    Codes that have no entry in ``language_map`` are dropped; every other
    code contributes all of the entries mapped to it, in input order.

    :param language_codes:
        A list of language codes, e.g. ['en', 'es'] in ISO 639 Standard.
    :type language_codes: list

    :return: Returns list[str] representing supported languages
    :rtype: list[str]
    """
    supported_codes = []
    for detected_code in language_codes:
        if detected_code in language_map:
            supported_codes.extend(language_map[detected_code])
    return supported_codes
Loading

0 comments on commit 544ea39

Please sign in to comment.