Add methods to generate result table
haohangyan committed Aug 5, 2024
1 parent 7fecac3 commit b1ab1b3
Showing 1 changed file with 80 additions and 0 deletions.
80 changes: 80 additions & 0 deletions benchmarks/bioid_ner_benchmark.py
@@ -2,6 +2,7 @@
import json
import pathlib
import logging
import pickle
from datetime import datetime
from collections import defaultdict, Counter
import xml.etree.ElementTree as ET
@@ -51,6 +52,7 @@ def __init__(self):
        self.counts_table = None
        self.precision_recall = None
        self.false_positives_counter = Counter()
        self.result = None

    def process_xml_files(self):
        """Extract relevant information from XML files."""
@@ -206,6 +208,7 @@ def evaluate_gilda_performance(self):
metrics['top_match']['fp'] += 1

        # False negative calculation using the reference dict: count
        # reference annotations that received no matching Gilda annotation.
        for key, refs in tqdm(ref_dict.items(),
                              desc="Calculating False Negatives"):
            doc_id, figure = key[0], key[1]
@@ -292,13 +295,86 @@ def get_tables(self):
                self.precision_recall,
                self.false_positives_counter)

    def check_match(self, row):
        """Return True if any synonym CURIE appears among the grounded CURIEs."""
        obj_synonyms = row['obj_synonyms']
        groundings = row['groundings']
        if obj_synonyms is None or groundings is None:
            return False
        # Each grounding is a (curie, score) tuple; match on the CURIE.
        return any(curie in obj_synonyms for curie, _ in groundings)
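For illustration (hypothetical CURIEs and scores, not drawn from the corpus), a row matches when one of its synonym CURIEs also appears among its grounded CURIEs:

# Hypothetical row: 'HGNC:6018' occurs both as an object synonym and as a
# grounded CURIE, so check_match would return True; scores are made up.
row = {'obj_synonyms': ['HGNC:6018', 'UP:P05231'],
       'groundings': [('HGNC:6018', 0.77), ('MESH:D015850', 0.55)]}
# benchmarker.check_match(row)  ->  True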

    def generate_result_table(self):
        """Assemble a per-annotation result table that pairs each Gilda
        annotation with the corresponding reference annotation, if any."""
        # Map each reference annotation to its object CURIE and synonyms.
        ref_dict = {}
        for _, row in self.annotations_df.iterrows():
            key = (str(row['don_article']), row['figure'], row['text'],
                   row['first left'], row['last right'])
            ref_dict[key] = (row['obj'], row['obj_synonyms'])

        text_list, obj_synonyms_list, don_articles_list = [], [], []
        groundings_list, obj_list = [], []

        for (doc_id, figure), annotations in (
                tqdm(self.gilda_annotations_map.items(),
                     desc="Getting result")):
            for annotation in annotations:
                key = (doc_id, figure, annotation.text, annotation.start,
                       annotation.end)
                matching_refs = ref_dict.get(key, None)
                if matching_refs:
                    obj, obj_synonyms = matching_refs
                else:
                    obj, obj_synonyms = None, None

                # Collect each scored match as a (curie, score) tuple.
                groundings = []
                for scored_match in annotation.matches:
                    curie = f"{scored_match.term.db}:{scored_match.term.id}"
                    groundings.append((curie, scored_match.score))

                obj_list.append(obj)
                text_list.append(annotation.text)
                obj_synonyms_list.append(obj_synonyms)
                don_articles_list.append(doc_id)
                groundings_list.append(groundings)

        # Add reference annotations from figures for which Gilda produced
        # no annotations at all; these rows carry no groundings.
        for key, refs in ref_dict.items():
            doc_id, figure = key[0], key[1]
            if not self.gilda_annotations_map.get((doc_id, figure), []):
                obj_list.append(refs[0])
                text_list.append(key[2])
                obj_synonyms_list.append(refs[1])
                don_articles_list.append(doc_id)
                groundings_list.append(None)

        data = {
            'text': text_list,
            'obj': obj_list,
            'obj_synonyms': obj_synonyms_list,
            'don_article': don_articles_list,
            'groundings': groundings_list,
        }
        self.result = pd.DataFrame(data)
        self.result['match'] = self.result.apply(self.check_match, axis=1)
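As a sketch, the assembled table holds one row per Gilda annotation plus one row per reference entry from figures Gilda left unannotated; the values below are illustrative only:

#    text     obj        obj_synonyms                don_article  groundings             match
# 0  insulin  UP:P01308  ['UP:P01308', 'HGNC:6081']  3868508      [('UP:P01308', 0.99)]  True
# 1  BRCA1    HGNC:1100  ['HGNC:1100']               4411532      None                   False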


def main(results: str = RESULTS_DIR):
    results_path = os.path.expandvars(os.path.expanduser(results))
    os.makedirs(results_path, exist_ok=True)

    benchmarker = BioIDNERBenchmarker()
    benchmarker.annotate_entities_with_gilda()
    benchmarker.generate_result_table()
    benchmarker.evaluate_gilda_performance()
    counts, precision_recall, false_positives_counter = benchmarker.get_tables()

@@ -361,6 +437,10 @@ def main(results: str = RESULTS_DIR):
    counts.to_csv(result_stub.with_suffix(".counts.csv"), index=False)
    precision_recall.to_csv(result_stub.with_suffix(".precision_recall.csv"),
                            index=False)
    benchmarker.result.to_csv(result_stub.with_suffix(".ner_result.tsv"),
                              sep='\t', index=False)

    print(f'Results saved to {results_path}')
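If the saved table needs to be inspected later, it can be reloaded with pandas; a minimal sketch, with an illustrative path standing in for the actual result_stub naming:

import pandas as pd

# Illustrative path; substitute the result_stub actually used by main().
df = pd.read_csv("results/bioid_ner_benchmark.ner_result.tsv", sep='\t')
print(df['match'].mean())  # fraction of rows flagged as a match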


