-
Notifications
You must be signed in to change notification settings - Fork 466
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #932 from gavishpoddar/language
Optional Language Detect
- Loading branch information
Showing
25 changed files
with
1,218 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import os | ||
|
||
import fasttext | ||
|
||
from dateparser_cli.fasttext_manager import fasttext_downloader | ||
from dateparser_cli.utils import dateparser_model_home, create_data_model_home | ||
from dateparser_cli.exceptions import FastTextModelNotFoundException | ||
|
||
|
||
# File names of the fastText models that _load_fasttext_model() looks for
# inside the dateparser model directory.
_supported_models = ["large.bin", "small.bin"]
# Model name handed to fasttext_downloader() when no supported model file
# is found locally.
_DEFAULT_MODEL = "small"
|
||
|
||
class _FastTextCache:
    """Module-level holder for the loaded fastText model."""

    # Assigned by _load_fasttext_model() after a successful load; None until then.
    model = None
|
||
|
||
def _find_downloaded_models():
    """Return the supported model file names present in the model directory.

    Names are returned in the order they appear in ``_supported_models`` so
    the selection does not depend on the arbitrary ordering of ``os.listdir``.
    """
    present = set(os.listdir(dateparser_model_home))
    return [name for name in _supported_models if name in present]


def _load_fasttext_model():
    """Return the fastText model, loading (and if needed downloading) it once.

    The loaded model is stored on ``_FastTextCache.model`` and reused by
    later calls. When no supported model file is present in
    ``dateparser_model_home``, ``fasttext_downloader(_DEFAULT_MODEL)`` is
    invoked once and the directory is checked again.

    :return: the object returned by ``fasttext.load_model``
    :raises FastTextModelNotFoundException: if no supported model file is
        available, even after the download attempt, or if the selected
        entry is not a regular file.
    """
    if _FastTextCache.model is not None:
        return _FastTextCache.model

    create_data_model_home()
    downloaded_models = _find_downloaded_models()
    if not downloaded_models:
        # Download once and look again. Re-entering this function here would
        # recurse without bound if the download does not produce a supported
        # model file.
        fasttext_downloader(_DEFAULT_MODEL)
        downloaded_models = _find_downloaded_models()
        if not downloaded_models:
            raise FastTextModelNotFoundException('Fasttext model file not found')

    model_path = os.path.join(dateparser_model_home, downloaded_models[0])
    if not os.path.isfile(model_path):
        raise FastTextModelNotFoundException('Fasttext model file not found')
    _FastTextCache.model = fasttext.load_model(model_path)
    return _FastTextCache.model
|
||
|
||
def detect_languages(text, confidence_threshold):
    """Return the language codes fastText predicts for ``text``.

    :param text: the string whose language should be detected
    :type text: str
    :param confidence_threshold: a prediction is kept only when its
        probability is strictly greater than this value
    :return: language codes with the ``__label__`` marker removed
    :rtype: list[str]
    """
    _language_parser = _load_fasttext_model()
    # Flatten to a single line before predicting: newlines become spaces and
    # carriage returns are dropped.
    single_line = text.replace('\n', ' ').replace('\r', '')
    labels, probabilities = _language_parser.predict(single_line)
    return [
        label.replace("__label__", "")
        for label, probability in zip(labels, probabilities)
        if probability > confidence_threshold
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import langdetect | ||
|
||
|
||
# A private DetectorFactory instance with a fixed seed is used below so that
# detection results are consistent without changing langdetect's global state.
# See: https://github.com/Mimino666/langdetect
|
||
class _Factory:
    """Module-level holder for the langdetect detector factory."""

    # Assigned by _init_factory() on first use; None until then.
    data = None
|
||
|
||
def _init_factory():
    """Create and configure the shared detector factory on first call.

    Later calls are no-ops once ``_Factory.data`` has been set.
    """
    if _Factory.data is not None:
        return
    # Configure the factory completely before publishing it, so that a failure
    # while loading profiles cannot leave a half-initialised factory cached.
    factory = langdetect.detector_factory.DetectorFactory()
    factory.load_profile(langdetect.detector_factory.PROFILES_DIRECTORY)
    factory.seed = 0
    _Factory.data = factory
|
||
|
||
def _get_language_probablities(text):
    """Return langdetect's language candidates for ``text``.

    A new detector is created from the shared factory for every call.

    :param text: the string to analyse
    :return: the result of ``detector.get_probabilities()``; callers in this
        module read ``.lang`` and ``.prob`` from each item
    """
    _init_factory()
    detector = _Factory.data.create()
    detector.append(text)
    return detector.get_probabilities()
|
||
|
||
def detect_languages(text, confidence_threshold):
    """Return the language codes langdetect assigns to ``text``.

    :param text: the string whose language should be detected
    :type text: str
    :param confidence_threshold: a candidate is kept only when its
        probability is strictly greater than this value
    :return: codes of the candidates above the threshold; an empty list when
        langdetect cannot analyse the input
    :rtype: list[str]
    """
    try:
        candidates = _get_language_probablities(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # This exception can be produced with empty strings or inputs without
        # letters like `10-10-2021`. As this could be really common, it is
        # ignored and no language is reported.
        return []
    return [
        candidate.lang
        for candidate in candidates
        if candidate.prob > confidence_threshold
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from dateparser.data.languages_info import language_map | ||
|
||
|
||
def map_languages(language_codes):
    """
    Return the supported language candidates for the given language codes.

    Codes that are not keys of ``language_map`` are skipped. For every other
    code, each entry of ``language_map[code]`` is added to the result, in the
    order the codes were given.

    :param language_codes:
        A list of language codes, e.g. ['en', 'es'], in the ISO 639 standard.
    :type language_codes: list

    :return: the supported languages matching the given codes
    :rtype: list[str]
    """
    supported_languages = []
    for code in language_codes:
        if code in language_map:
            supported_languages.extend(language_map[code])
    return supported_languages
Oops, something went wrong.