Skip to content

Commit

Permalink
refactor grobid processors
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Mar 4, 2024
1 parent c08e73a commit 99b364f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 37 deletions.
2 changes: 1 addition & 1 deletion document_qa/document_qa_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1,
print("File", pdf_file_path)
filename = Path(pdf_file_path).stem
coordinates = True # if chunk_size == -1 else False
structure = self.grobid_processor.process_structure(pdf_file_path, coordinates=coordinates)
structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)

biblio = structure['biblio']
biblio['filename'] = filename.replace(" ", "_")
Expand Down
72 changes: 36 additions & 36 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def extract_quantities(client, x_all, column_text_index):

for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
text = example[column_text_index]
spans = GrobidQuantitiesProcessor(client).extract_quantities(text)
spans = GrobidQuantitiesProcessor(client).process(text)

data_record = {
"id": example[0],
Expand All @@ -83,7 +83,7 @@ def extract_materials(client, x_all, column_text_index):

for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
text = example[column_text_index]
spans = GrobidMaterialsProcessor(client).extract_materials(text)
spans = GrobidMaterialsProcessor(client).process(text)
data_record = {
"id": example[0],
"filename": example[1],
Expand Down Expand Up @@ -131,7 +131,7 @@ def __init__(self, grobid_client):
# super().__init__()
self.grobid_client = grobid_client

def process_structure(self, input_path, coordinates=False):
def process(self, input_path, coordinates=False):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
Expand All @@ -145,19 +145,19 @@ def process_structure(self, input_path, coordinates=False):
if status != 200:
return

output_data = self.parse_grobid_xml(text, coordinates=coordinates)
output_data['filename'] = Path(pdf_file).stem.replace(".tei", "")
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")

return output_data

def process_single(self, input_file):
doc = self.process_structure(input_file)
return document_object

for paragraph in doc['passages']:
entities = self.process_single_text(paragraph['text'])
paragraph['spans'] = entities

return doc
# def process_single(self, input_file):
# doc = self.process_structure(input_file)
#
# for paragraph in doc['passages']:
# entities = self.process_single_text(paragraph['text'])
# paragraph['spans'] = entities
#
# return doc

def parse_grobid_xml(self, text, coordinates=False):
output_data = OrderedDict()
Expand Down Expand Up @@ -187,10 +187,10 @@ def parse_grobid_xml(self, text, coordinates=False):
"text": f"authors: {biblio['authors']}",
"type": passage_type,
"section": "<header>",
"subSection": "<title>",
"passage_id": "htitle",
"subSection": "<authors>",
"passage_id": "hauthors",
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
blocks_header['authors']])
blocks_header['authors']])
})

passages.append({
Expand Down Expand Up @@ -293,7 +293,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
def __init__(self, grobid_quantities_client):
self.grobid_quantities_client = grobid_quantities_client

def extract_quantities(self, text):
def process(self, text):
status, result = self.grobid_quantities_client.process_text(text.strip())

if status != 200:
Expand Down Expand Up @@ -465,7 +465,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
def __init__(self, grobid_superconductors_client):
self.grobid_superconductors_client = grobid_superconductors_client

def extract_materials(self, text):
def process(self, text):
preprocessed_text = text.strip()
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
"processText_disable_linking")
Expand Down Expand Up @@ -568,17 +568,17 @@ def __init__(self, grobid_client, grobid_quantities_client=None, grobid_supercon
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)

def process_single_text(self, text):
extracted_quantities_spans = self.gqp.extract_quantities(text)
extracted_materials_spans = self.gmp.extract_materials(text)
extracted_quantities_spans = self.process_properties(text)
extracted_materials_spans = self.process_materials(text)
all_entities = extracted_quantities_spans + extracted_materials_spans
entities = self.prune_overlapping_annotations(all_entities)
return entities

def extract_quantities(self, text):
return self.gqp.extract_quantities(text)
def process_properties(self, text):
    """Extract physical-quantity/property spans from *text*.

    Thin delegation to the wrapped GrobidQuantitiesProcessor held in
    ``self.gqp`` (its ``process`` method — renamed from the former
    ``extract_quantities`` in this refactor).

    :param text: raw passage text to annotate.
    :return: whatever ``GrobidQuantitiesProcessor.process`` returns
        (a list of span annotations, per the callers in this file).
    """
    return self.gqp.process(text)

def extract_materials(self, text):
return self.gmp.extract_materials(text)
def process_materials(self, text):
    """Extract material-mention spans from *text*.

    Thin delegation to the wrapped GrobidMaterialsProcessor held in
    ``self.gmp`` (its ``process`` method — renamed from the former
    ``extract_materials`` in this refactor).

    :param text: raw passage text to annotate.
    :return: whatever ``GrobidMaterialsProcessor.process`` returns
        (a list of span annotations, per the callers in this file).
    """
    return self.gmp.process(text)

@staticmethod
def box_to_dict(box, color=None, type=None):
Expand Down Expand Up @@ -715,8 +715,8 @@ def prune_overlapping_annotations(entities: list) -> list:


class XmlProcessor(BaseProcessor):
def __init__(self, grobid_superconductors_client, grobid_quantities_client):
super().__init__(grobid_superconductors_client, grobid_quantities_client)
def __init__(self):
    """Initialize the XML processor.

    NOTE(review): this change drops the grobid_superconductors_client /
    grobid_quantities_client constructor arguments — presumably
    ``BaseProcessor.__init__`` now takes no arguments; confirm against
    the (not visible here) BaseProcessor definition.
    """
    super().__init__()

def process_structure(self, input_file):
text = ""
Expand All @@ -728,16 +728,16 @@ def process_structure(self, input_file):

return output_data

def process_single(self, input_file):
doc = self.process_structure(input_file)

for paragraph in doc['passages']:
entities = self.process_single_text(paragraph['text'])
paragraph['spans'] = entities

return doc
# def process_single(self, input_file):
# doc = self.process_structure(input_file)
#
# for paragraph in doc['passages']:
# entities = self.process_single_text(paragraph['text'])
# paragraph['spans'] = entities
#
# return doc

def parse_xml(self, text):
def process(self, text):
output_data = OrderedDict()
soup = BeautifulSoup(text, 'xml')
text_blocks_children = get_children_list_supermat(soup, verbose=False)
Expand Down

0 comments on commit 99b364f

Please sign in to comment.