def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    """Collect the paragraph or sentence nodes located in the TEI back matter.

    Iterates over the direct children of the ``TEI`` root; for every child
    named ``text`` it looks up the ``back`` elements it contains and gathers
    their ``p`` (or ``s``) descendants, preserving the order in which
    ``find_all`` yields them.

    Args:
        soup: Parsed document exposing a ``TEI`` attribute whose ``children``
            are iterable and whose elements provide ``name`` and ``find_all``.
        use_paragraphs: When true select ``p`` nodes, otherwise ``s`` nodes.
        verbose: When true, print the collected nodes before returning.

    Returns:
        The list of matching nodes. It is empty when the document has no
        ``TEI`` root, no ``text`` element, or no ``back`` section.
    """
    tag_name = "p" if use_paragraphs else "s"
    nodes = []

    # A document without a TEI root would otherwise fail on ``.children``
    # (with BeautifulSoup, accessing a tag that is absent yields None).
    # Treat that case as "no back matter" rather than raising.
    root = getattr(soup, "TEI", None)
    children = root.children if root is not None else ()

    for child in children:
        if child.name != 'text':
            continue
        for back in child.find_all("back"):
            nodes.extend(back.find_all(tag_name))

    if verbose:
        print(str(nodes))

    return nodes