diff --git a/dist/lexifuzz_ner-0.0.3-py3-none-any.whl b/dist/lexifuzz_ner-0.0.3-py3-none-any.whl
deleted file mode 100644
index a9c74a1..0000000
Binary files a/dist/lexifuzz_ner-0.0.3-py3-none-any.whl and /dev/null differ
diff --git a/dist/lexifuzz_ner-0.0.3.tar.gz b/dist/lexifuzz_ner-0.0.3.tar.gz
deleted file mode 100644
index 1f91da0..0000000
Binary files a/dist/lexifuzz_ner-0.0.3.tar.gz and /dev/null differ
diff --git a/dist/lexifuzz_ner-0.0.4-py3-none-any.whl b/dist/lexifuzz_ner-0.0.4-py3-none-any.whl
new file mode 100644
index 0000000..305961f
Binary files /dev/null and b/dist/lexifuzz_ner-0.0.4-py3-none-any.whl differ
diff --git a/dist/lexifuzz_ner-0.0.4.tar.gz b/dist/lexifuzz_ner-0.0.4.tar.gz
new file mode 100644
index 0000000..58f3699
Binary files /dev/null and b/dist/lexifuzz_ner-0.0.4.tar.gz differ
diff --git a/pyproject.toml b/pyproject.toml
index f315d2d..7210d95 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "lexifuzz-ner"
-version = "0.0.3"
+version = "0.0.4"
 authors = ["Hanif Yuli Abdillah P "]
 description = "Python package for detecting entities in text based on a dictionary and fuzzy similarity"
 readme = "README.md"
diff --git a/src/lexifuzz_ner/ner.py b/src/lexifuzz_ner/ner.py
index 60296c2..0bb5405 100644
--- a/src/lexifuzz_ner/ner.py
+++ b/src/lexifuzz_ner/ner.py
@@ -29,7 +29,6 @@ def getFuzzySimilarity(token=None, dictionary=None, min_ratio=None):
     return (match + (key, ))
 
 def handle_slicing(data=None):
-
     """
     This function takes a dictionary data as input and processes its 'entities' by sorting them based on their score in descending order.
     It then identifies entities with the highest scores, ensuring there is no overlap in their index ranges.
@@ -39,7 +38,7 @@ def handle_slicing(data=None):
     assert isinstance(data, dict), "Dictionary format should be provided in the dictionary parameter."
 
     # Sort entities by their score in descending order
-    sorted_entities = sorted(data['entities'], key=lambda x: -x['score'])
+    sorted_entities = sorted(data['entities'], key=lambda x: (-x['score'], x['index']['start'], -x['index']['end']))
 
     # Initialize a dictionary to keep track of which indices have been covered
     indices_covered = set()
@@ -53,14 +52,11 @@
         # Check if the entity's indices overlap with previously covered indices
         if all(start > end_covered or end < start_covered for start_covered, end_covered in indices_covered):
            new_entities.append(entity)
-            # Update the covered indices
            indices_covered.add((start, end))
 
    # Update the entities in the data dictionary
    data['entities'] = new_entities
-
-    # Print the modified data
    return data
 
 
 def annotate_text(entities = None):
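
Note on the `ner.py` change: the sort key in `handle_slicing` now breaks score ties deterministically, preferring the entity that starts earlier and, at the same start, the one with the longer span. The sketch below illustrates the effect; the entity shape (`score`, `index.start`, `index.end`) follows the diff, but the sample entities and scores are made up for illustration.

```python
# Sketch of the tie-breaking added to handle_slicing's sort key.
# Entity shape follows the diff; the sample data is hypothetical.
entities = [
    {'entity': 'bca',         'score': 90, 'index': {'start': 10, 'end': 12}},
    {'entity': 'bca finance', 'score': 90, 'index': {'start': 10, 'end': 20}},
    {'entity': 'mandiri',     'score': 95, 'index': {'start': 30, 'end': 36}},
]

# Old key: ties on score fell back to input order (Python's sort is stable),
# so the winner among overlapping candidates depended on detection order.
old_order = sorted(entities, key=lambda x: -x['score'])

# New key: score descending, then start ascending, then end descending;
# on equal scores the earlier-starting, longer span is considered first,
# so the non-overlap filter keeps it regardless of input order.
new_order = sorted(
    entities,
    key=lambda x: (-x['score'], x['index']['start'], -x['index']['end']),
)

print([e['entity'] for e in new_order])
# ['mandiri', 'bca finance', 'bca']
```

With the old key, `'bca'` and `'bca finance'` (equal scores, overlapping ranges) were kept or dropped depending on which one the matcher emitted first; with the new key, the longer match `'bca finance'` always wins the overlap check.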