Skip to content

Commit

Permalink
get data availability statement as context for QA
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jun 22, 2024
1 parent ffa83ea commit 53c8deb
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions document_qa/grobid_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def parse_grobid_xml(self, text, coordinates=False):
})

text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
text_blocks_body.extend(get_xml_nodes_back(soup, verbose=False, use_paragraphs=True))

use_paragraphs = True
if not use_paragraphs:
Expand Down Expand Up @@ -800,6 +801,20 @@ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool
return nodes


def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    """Collect the text nodes found inside the ``<back>`` sections of a TEI document.

    Every direct child of ``soup.TEI`` named ``text`` is searched for ``back``
    elements; within each of those, all elements with the selected tag are
    gathered in document order.

    Args:
        soup: Parsed document exposing a ``TEI`` root (a BeautifulSoup-like
            object offering ``children``, ``name`` and ``find_all``).
        use_paragraphs: When true, ``<p>`` elements are collected; otherwise
            ``<s>`` elements are collected (presumably sentence-level nodes).
        verbose: When true, the collected nodes are printed to stdout.

    Returns:
        A list of the matching elements, possibly empty.
    """
    wanted_tag = "p" if use_paragraphs else "s"

    collected = []
    for element in soup.TEI.children:
        # Only the <text> children of the root can hold a <back> section.
        if element.name != 'text':
            continue
        for back_section in element.find_all("back"):
            collected.extend(back_section.find_all(wanted_tag))

    if verbose:
        print(str(collected))

    return collected


def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
children = []
for child in soup.TEI.children:
Expand Down

0 comments on commit 53c8deb

Please sign in to comment.