From 9d34e2b1bad84d5834bd28e2644cf7e2349eb0d2 Mon Sep 17 00:00:00 2001
From: Manuel Rech <63170478+manuelrech@users.noreply.github.com>
Date: Wed, 20 Mar 2024 16:09:19 +0100
Subject: [PATCH 1/3] Catch parsing mistakes

Sometimes with scanned pages we get '[NO_BLOCKS] PDF parsing resulted in
empty content', and with GROBID parsing errors we get '[GENERAL] An
exception occurred while running Grobid.' To catch these errors we need
some additional logic.
---
 scipdf/pdf/parse_pdf.py | 44 ++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py
index 15152c6..57b54e6 100644
--- a/scipdf/pdf/parse_pdf.py
+++ b/scipdf/pdf/parse_pdf.py
@@ -350,27 +350,31 @@ def convert_article_soup_to_dict(article, as_list: bool = False):
         ]
     }
     """
-    article_dict = {}
-    if article is not None:
-        title = article.find("title", attrs={"type": "main"})
-        title = title.text.strip() if title is not None else ""
-
-        article_dict["title"] = title
-        article_dict["authors"] = parse_authors(article)
-        article_dict["pub_date"] = parse_date(article)
-        article_dict["abstract"] = parse_abstract(article)
-        article_dict["sections"] = parse_sections(article, as_list=as_list)
-        article_dict["references"] = parse_references(article)
-        article_dict["figures"] = parse_figure_caption(article)
-        article_dict["formulas"] = parse_formulas(article)
-
-        doi = article.find("idno", attrs={"type": "DOI"})
-        doi = doi.text if doi is not None else ""
-        article_dict["doi"] = doi
-
-        return article_dict
-    else:
+    if article is None:
         return None
+    if article.string is not None:
+        if '[NO_BLOCKS] PDF parsing resulted in empty content' in article.string or '[GENERAL] An exception occurred while running Grobid.' in article.string:
+            return None
+
+    article_dict = {}
+
+    title = article.find("title", attrs={"type": "main"})
+    title = title.text.strip() if title is not None else ""
+
+    article_dict["title"] = title
+    article_dict["authors"] = parse_authors(article)
+    article_dict["pub_date"] = parse_date(article)
+    article_dict["abstract"] = parse_abstract(article)
+    article_dict["sections"] = parse_sections(article, as_list=as_list)
+    article_dict["references"] = parse_references(article)
+    article_dict["figures"] = parse_figure_caption(article)
+    article_dict["formulas"] = parse_formulas(article)
+
+    doi = article.find("idno", attrs={"type": "DOI"})
+    doi = doi.text if doi is not None else ""
+    article_dict["doi"] = doi
+
+    return article_dict
 
 
 def parse_pdf_to_dict(


From 5a67ba8778fc6e7270904a4983aa1cdea24716ba Mon Sep 17 00:00:00 2001
From: Manuel Rech <63170478+manuelrech@users.noreply.github.com>
Date: Thu, 21 Mar 2024 09:57:54 +0100
Subject: [PATCH 2/3] Remove xml - html warning

I have removed the xml warning by setting features='xml' and making some
small adjustments.
---
 scipdf/pdf/parse_pdf.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py
index 57b54e6..944daf1 100644
--- a/scipdf/pdf/parse_pdf.py
+++ b/scipdf/pdf/parse_pdf.py
@@ -113,7 +113,7 @@ def parse_pdf(
         parsed_article = None
 
     if soup and parsed_article is not None:
-        parsed_article = BeautifulSoup(parsed_article, "lxml")
+        parsed_article = BeautifulSoup(parsed_article, features='xml')
     return parsed_article
 
 
@@ -122,7 +122,7 @@ def parse_authors(article):
    """
     Parse authors from a given BeautifulSoup of an article
     """
-    author_names = article.find("sourcedesc").findAll("persname")
+    author_names = article.find("fileDesc").find_all("persName")
     authors = []
     for author in author_names:
         firstname = author.find("forename", {"type": "first"})
@@ -143,7 +143,7 @@ def parse_date(article):
     """
     Parse date from a given BeautifulSoup of an article
     """
-    pub_date = article.find("publicationstmt")
+    pub_date = article.find("publicationStmt")
     year = pub_date.find("date")
     year = year.attrs.get("when") if year is not None else ""
     return year
@@ -280,7 +280,7 @@ def parse_figure_caption(article):
         figure_id = figure.attrs.get("xml:id") or ""
         label = figure.find("label").text
         if figure_type == "table":
-            caption = figure.find("figdesc").text
+            caption = figure.find("figDesc").text
             data = figure.table.text
         else:
             caption = figure.text
@@ -289,6 +289,7 @@
             {
                 "figure_label": label,
                 "figure_type": figure_type,
+                "figure_type_label": f"{figure_type.title()}-{label}",
                 "figure_id": figure_id,
                 "figure_caption": caption,
                 "figure_data": data,
@@ -384,7 +385,6 @@
     as_list: bool = False,
     return_coordinates: bool = True,
     grobid_url: str = GROBID_URL,
-    parse_figures: bool = True,
 ):
     """
     Parse the given PDF and return dictionary of the parsed article
@@ -459,7 +459,7 @@ def parse_figures(
             op.join(op.abspath(figure_path), ""),  # end path with "/"
         ]
         _ = subprocess.run(
-            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
+            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60
         )
         print("Done parsing figures from PDFs!")
     else:


From 0d8252d4f6e9752d01a6e010a7c334702ce19c37 Mon Sep 17 00:00:00 2001
From: Manuel Rech <63170478+manuelrech@users.noreply.github.com>
Date: Thu, 21 Mar 2024 10:30:02 +0100
Subject: [PATCH 3/3] Update checks on wrongly parsed articles

With the new xml parser we need a different checking system.
---
 scipdf/pdf/parse_pdf.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py
index 944daf1..72dca8b 100644
--- a/scipdf/pdf/parse_pdf.py
+++ b/scipdf/pdf/parse_pdf.py
@@ -351,11 +351,8 @@ def convert_article_soup_to_dict(article, as_list: bool = False):
         ]
     }
     """
-    if article is None:
+    if article is None or (article.contents == [] and article.text == ""):
         return None
-    if article.string is not None:
-        if '[NO_BLOCKS] PDF parsing resulted in empty content' in article.string or '[GENERAL] An exception occurred while running Grobid.' in article.string:
-            return None
 
     article_dict = {}