diff --git a/dist/lexifuzz_ner-0.0.2-py3-none-any.whl b/dist/lexifuzz_ner-0.0.2-py3-none-any.whl deleted file mode 100644 index 706116f..0000000 Binary files a/dist/lexifuzz_ner-0.0.2-py3-none-any.whl and /dev/null differ diff --git a/dist/lexifuzz_ner-0.0.2.tar.gz b/dist/lexifuzz_ner-0.0.2.tar.gz deleted file mode 100644 index 853e380..0000000 Binary files a/dist/lexifuzz_ner-0.0.2.tar.gz and /dev/null differ diff --git a/dist/lexifuzz_ner-0.0.3-py3-none-any.whl b/dist/lexifuzz_ner-0.0.3-py3-none-any.whl new file mode 100644 index 0000000..a9c74a1 Binary files /dev/null and b/dist/lexifuzz_ner-0.0.3-py3-none-any.whl differ diff --git a/dist/lexifuzz_ner-0.0.3.tar.gz b/dist/lexifuzz_ner-0.0.3.tar.gz new file mode 100644 index 0000000..1f91da0 Binary files /dev/null and b/dist/lexifuzz_ner-0.0.3.tar.gz differ diff --git a/pyproject.toml b/pyproject.toml index 4be0515..f315d2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "lexifuzz-ner" -version = "0.0.2" +version = "0.0.3" authors = ["Hanif Yuli Abdillah P "] description = "Python package for detecting entities in text based on a dictionary and fuzzy similarity" readme = "README.md" diff --git a/src/lexifuzz_ner/ner.py b/src/lexifuzz_ner/ner.py index b9337f9..60296c2 100644 --- a/src/lexifuzz_ner/ner.py +++ b/src/lexifuzz_ner/ner.py @@ -19,7 +19,7 @@ def getFuzzySimilarity(token=None, dictionary=None, min_ratio=None): assert isinstance(token, str), "Tokens can be str() type only" assert isinstance(dictionary, dict), "Dictionary format should be provided in the dictionary parameter." assert isinstance(min_ratio, int), "Integer format should be provided in the minimum-ratio parameter." - + for key, values in dictionary.items(): # Using the process option of FuzzyWuzzy, we can search through the entire dictionary for the best match match = process.extractOne(token, values, scorer = fuzz.ratio) @@ -122,12 +122,15 @@ def find_entity(text=None, dictionary=None, min_ratio=None): if not similarity_score == None: # Find the start and end indices correctly using current_index start_index = text.find(compared_text, current_index) - if start_index == -1: - start_index = 0 + # if start_index == -1: + # start_index = 0 end_index = start_index + len(compared_text) - 1 - # Update current_index to start searching for the next occurrence after the current one - current_index = end_index + 1 + # Update current_index to start searching for the next occurrence after the current one. For ngram > 1, current_index will be back to the second index of the ngram value + if n == 1: + current_index = end_index + 1 + else: + current_index = end_index - (len(compared_text.split(' ', 1)[1]) + 1) result_detection['entities'].append( {