Skip to content

Commit

Permalink
bug fix: entity indices when ngram > 1
Browse files Browse the repository at this point in the history
  • Loading branch information
Hanif Yuli Abdillah P committed Sep 14, 2023
1 parent 76dfa93 commit 5c0d892
Show file tree
Hide file tree
Showing 6 changed files with 9 additions and 6 deletions.
Binary file removed dist/lexifuzz_ner-0.0.2-py3-none-any.whl
Binary file not shown.
Binary file removed dist/lexifuzz_ner-0.0.2.tar.gz
Binary file not shown.
Binary file added dist/lexifuzz_ner-0.0.3-py3-none-any.whl
Binary file not shown.
Binary file added dist/lexifuzz_ner-0.0.3.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "lexifuzz-ner"
version = "0.0.2"
version = "0.0.3"
authors = ["Hanif Yuli Abdillah P <[email protected]>"]
description = "Python package for detecting entities in text based on a dictionary and fuzzy similarity"
readme = "README.md"
Expand Down
13 changes: 8 additions & 5 deletions src/lexifuzz_ner/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def getFuzzySimilarity(token=None, dictionary=None, min_ratio=None):
assert isinstance(token, str), "Tokens can be str() type only"
assert isinstance(dictionary, dict), "Dictionary format should be provided in the dictionary parameter."
assert isinstance(min_ratio, int), "Integer format should be provided in the minimum-ratio parameter."

for key, values in dictionary.items():
# Using the process option of FuzzyWuzzy, we can search through the entire dictionary for the best match
match = process.extractOne(token, values, scorer = fuzz.ratio)
Expand Down Expand Up @@ -122,12 +122,15 @@ def find_entity(text=None, dictionary=None, min_ratio=None):
if not similarity_score == None:
# Find the start and end indices correctly using current_index
start_index = text.find(compared_text, current_index)
if start_index == -1:
start_index = 0
# if start_index == -1:
# start_index = 0
end_index = start_index + len(compared_text) - 1

# Update current_index to start searching for the next occurrence after the current one
current_index = end_index + 1
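# Update current_index so the next search starts in the right place. For n == 1, move just past the current match. For n > 1, rewind to the last character of the n-gram's first token, so the search for the next n-gram resumes just before the second token of the current one.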
if n == 1:
current_index = end_index + 1
else:
current_index = end_index - (len(compared_text.split(' ', 1)[1]) + 1)

result_detection['entities'].append(
{
Expand Down

0 comments on commit 5c0d892

Please sign in to comment.