From 9d34e2b1bad84d5834bd28e2644cf7e2349eb0d2 Mon Sep 17 00:00:00 2001
From: Manuel Rech <63170478+manuelrech@users.noreply.github.com>
Date: Wed, 20 Mar 2024 16:09:19 +0100
Subject: [PATCH 1/3] Catch parsing mistakes

Sometimes with scanned pages we get '[NO_BLOCKS] PDF parsing resulted in
empty content', and with GROBID parsing errors we get '[GENERAL] An
exception occurred while running Grobid.' To catch these errors we need
some additional logic.
---
 scipdf/pdf/parse_pdf.py | 44 ++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py
index 15152c6..57b54e6 100644
--- a/scipdf/pdf/parse_pdf.py
+++ b/scipdf/pdf/parse_pdf.py
@@ -350,27 +350,31 @@ def convert_article_soup_to_dict(article, as_list: bool = False):
         ]
     }
     """
-    article_dict = {}
-    if article is not None:
-        title = article.find("title", attrs={"type": "main"})
-        title = title.text.strip() if title is not None else ""
-
-        article_dict["title"] = title
-        article_dict["authors"] = parse_authors(article)
-        article_dict["pub_date"] = parse_date(article)
-        article_dict["abstract"] = parse_abstract(article)
-        article_dict["sections"] = parse_sections(article, as_list=as_list)
-        article_dict["references"] = parse_references(article)
-        article_dict["figures"] = parse_figure_caption(article)
-        article_dict["formulas"] = parse_formulas(article)
-
-        doi = article.find("idno", attrs={"type": "DOI"})
-        doi = doi.text if doi is not None else ""
-        article_dict["doi"] = doi
-
-        return article_dict
-    else:
+    if article is None:
         return None
+    if article.string is not None:
+        if '[NO_BLOCKS] PDF parsing resulted in empty content' in article.string or '[GENERAL] An exception occurred while running Grobid.' in article.string:
+            return None
+
+    article_dict = {}
+
+    title = article.find("title", attrs={"type": "main"})
+    title = title.text.strip() if title is not None else ""
+
+    article_dict["title"] = title
+    article_dict["authors"] = parse_authors(article)
+    article_dict["pub_date"] = parse_date(article)
+    article_dict["abstract"] = parse_abstract(article)
+    article_dict["sections"] = parse_sections(article, as_list=as_list)
+    article_dict["references"] = parse_references(article)
+    article_dict["figures"] = parse_figure_caption(article)
+    article_dict["formulas"] = parse_formulas(article)
+
+    doi = article.find("idno", attrs={"type": "DOI"})
+    doi = doi.text if doi is not None else ""
+    article_dict["doi"] = doi
+
+    return article_dict
 
 
 def parse_pdf_to_dict(


From 5a67ba8778fc6e7270904a4983aa1cdea24716ba Mon Sep 17 00:00:00 2001
From: Manuel Rech <63170478+manuelrech@users.noreply.github.com>
Date: Thu, 21 Mar 2024 09:57:54 +0100
Subject: [PATCH 2/3] Remove xml - html warning

I have removed the xml warning by setting features='xml' and making some
small adjustments.
---
 scipdf/pdf/parse_pdf.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py
index 57b54e6..944daf1 100644
--- a/scipdf/pdf/parse_pdf.py
+++ b/scipdf/pdf/parse_pdf.py
@@ -113,7 +113,7 @@ def parse_pdf(
         parsed_article = None
 
     if soup and parsed_article is not None:
-        parsed_article = BeautifulSoup(parsed_article, "lxml")
+        parsed_article = BeautifulSoup(parsed_article, features='xml')
     return parsed_article
 
 
@@ -122,7 +122,7 @@ def parse_authors(article):
    """
     Parse authors from a given BeautifulSoup of an article
     """
-    author_names = article.find("sourcedesc").findAll("persname")
+    author_names = article.find("fileDesc").find_all("persName")
     authors = []
     for author in author_names:
         firstname = author.find("forename", {"type": "first"})
@@ -143,7 +143,7 @@ def parse_date(article):
     """
     Parse date from a given BeautifulSoup of an article
     """
-    pub_date = article.find("publicationstmt")
+    pub_date = article.find("publicationStmt")
     year = pub_date.find("date")
     year = year.attrs.get("when") if year is not None else ""
     return year
@@ -280,7 +280,7 @@ def parse_figure_caption(article):
         figure_id = figure.attrs.get("xml:id") or ""
         label = figure.find("label").text
         if figure_type == "table":
-            caption = figure.find("figdesc").text
+            caption = figure.find("figDesc").text
             data = figure.table.text
         else:
             caption = figure.text
@@ -289,6 +289,7 @@
             {
                 "figure_label": label,
                 "figure_type": figure_type,
+                "figure_type_label": f"{figure_type.title()}-{label}",
                 "figure_id": figure_id,
                 "figure_caption": caption,
                 "figure_data": data,
@@ -384,7 +385,6 @@
     as_list: bool = False,
     return_coordinates: bool = True,
     grobid_url: str = GROBID_URL,
-    parse_figures: bool = True,
 ):
     """
     Parse the given PDF and return dictionary of the parsed article
@@ -459,7 +459,7 @@ def parse_figures(
             op.join(op.abspath(figure_path), ""),  # end path with "/"
         ]
         _ = subprocess.run(
-            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=20
+            args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60
         )
         print("Done parsing figures from PDFs!")
     else:


From 0d8252d4f6e9752d01a6e010a7c334702ce19c37 Mon Sep 17 00:00:00 2001
From: Manuel Rech <63170478+manuelrech@users.noreply.github.com>
Date: Thu, 21 Mar 2024 10:30:02 +0100
Subject: [PATCH 3/3] Update checks on wrongly parsed articles

With the new xml parser we need a different checking system.
---
 scipdf/pdf/parse_pdf.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scipdf/pdf/parse_pdf.py b/scipdf/pdf/parse_pdf.py
index 944daf1..72dca8b 100644
--- a/scipdf/pdf/parse_pdf.py
+++ b/scipdf/pdf/parse_pdf.py
@@ -351,11 +351,8 @@ def convert_article_soup_to_dict(article, as_list: bool = False):
         ]
     }
     """
-    if article is None:
+    if article is None or (article.contents == [] and article.text == ""):
         return None
-    if article.string is not None:
-        if '[NO_BLOCKS] PDF parsing resulted in empty content' in article.string or '[GENERAL] An exception occurred while running Grobid.' in article.string:
-            return None
 
     article_dict = {}