bug fix: entity indices when ngram > 1

hanifabd · Sep 14, 2023 · 5c0d892 · 5c0d892
1 parent 76dfa93
commit 5c0d892
Show file tree

Hide file tree

Showing 6 changed files with 9 additions and 6 deletions.
diff --git a/dist/lexifuzz_ner-0.0.2-py3-none-any.whl b/dist/lexifuzz_ner-0.0.2-py3-none-any.whl
diff --git a/dist/lexifuzz_ner-0.0.2.tar.gz b/dist/lexifuzz_ner-0.0.2.tar.gz
diff --git a/dist/lexifuzz_ner-0.0.3-py3-none-any.whl b/dist/lexifuzz_ner-0.0.3-py3-none-any.whl
diff --git a/dist/lexifuzz_ner-0.0.3.tar.gz b/dist/lexifuzz_ner-0.0.3.tar.gz
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "lexifuzz-ner"
-version = "0.0.2"
+version = "0.0.3"
 authors = ["Hanif Yuli Abdillah P <[email protected]>"]
 description = "Python package for detecting entities in text based on a dictionary and fuzzy similarity"
 readme = "README.md"

diff --git a/src/lexifuzz_ner/ner.py b/src/lexifuzz_ner/ner.py
@@ -19,7 +19,7 @@ def getFuzzySimilarity(token=None, dictionary=None, min_ratio=None):
     assert isinstance(token, str), "Tokens can be str() type only"
     assert isinstance(dictionary, dict), "Dictionary format should be provided in the dictionary parameter."
     assert isinstance(min_ratio, int), "Integer format should be provided in the minimum-ratio parameter."
-    
+
     for key, values in dictionary.items():
         # Using the process option of FuzzyWuzzy, we can search through the entire dictionary for the best match
         match = process.extractOne(token, values, scorer = fuzz.ratio)
@@ -122,12 +122,15 @@ def find_entity(text=None, dictionary=None, min_ratio=None):
       if not similarity_score == None:
         # Find the start and end indices correctly using current_index
         start_index = text.find(compared_text, current_index)
-        if start_index == -1:
-          start_index = 0
+        # if start_index == -1:
+        #   start_index = 0
         end_index = start_index + len(compared_text) - 1
 
-        # Update current_index to start searching for the next occurrence after the current one
-        current_index = end_index + 1
+        # Advance current_index past this match so the next search starts after it. For ngram > 1, rewind instead to the end of the n-gram's first word so the next search resumes just before its second word.
+        if n == 1:
+          current_index = end_index + 1
+        else:
+          current_index = end_index - (len(compared_text.split(' ', 1)[1]) + 1)
 
         result_detection['entities'].append(
             {