From 7628de57ac79f3a0c34551fe1c21be0155a9700e Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Tue, 14 Mar 2023 16:56:35 -0700 Subject: [PATCH 1/2] fix for empty pages --- src/mmda/parsers/pdfplumber_parser.py | 95 +++++++++++++------ .../dictionary_word_predictor.py | 10 +- src/mmda/recipes/core_recipe.py | 1 + src/mmda/types/document.py | 3 +- 4 files changed, 78 insertions(+), 31 deletions(-) diff --git a/src/mmda/parsers/pdfplumber_parser.py b/src/mmda/parsers/pdfplumber_parser.py index ae5a4fca..be72f2e2 100644 --- a/src/mmda/parsers/pdfplumber_parser.py +++ b/src/mmda/parsers/pdfplumber_parser.py @@ -195,26 +195,57 @@ def parse(self, input_pdf_path: str) -> Document: fine_tokens=[f["text"] for f in fine_tokens], ) assert len(word_ids_of_fine_tokens) == len(fine_tokens) + # 4) normalize / clean tokens & boxes - fine_tokens = [ - { - "text": token["text"], - "fontname": token["fontname"], - "size": token["size"], - "bbox": Box.from_pdf_coordinates( - x1=float(token["x0"]), - y1=float(token["top"]), - x2=float(token["x1"]), - y2=float(token["bottom"]), - page_width=float(page.width), - page_height=float(page.height), - page=int(page_id), - ).get_relative( - page_width=float(page.width), page_height=float(page.height) - ), - } - for token in fine_tokens - ] + if len(fine_tokens) > 0: + fine_tokens = [ + { + "text": token["text"], + "fontname": token["fontname"], + "size": token["size"], + "bbox": Box.from_pdf_coordinates( + x1=float(token["x0"]), + y1=float(token["top"]), + x2=float(token["x1"]), + y2=float(token["bottom"]), + page_width=float(page.width), + page_height=float(page.height), + page=int(page_id), + ).get_relative( + page_width=float(page.width), + page_height=float(page.height) + ), + } + for token in fine_tokens + ] + else: + # this page does not have tokens, so we add a single + # fictitious token to the page. This is to ensure that + # any element that is on this page gets anchored to + # something + fine_tokens = [ + { + # adding one single character: the null character + "text": "\u0000", + "fontname": "", + "size": 0., + "bbox": Box.from_pdf_coordinates( + x1=0., + y1=0., + x2=1., + y2=1., + page_width=float(page.width), + page_height=float(page.height), + page=int(page_id), + ).get_relative( + page_width=float(page.width), + page_height=float(page.height) + ), + } + ] + word_ids_of_fine_tokens = [0] + + # 5) group tokens into lines # TODO - doesnt belong in parser; should be own predictor line_ids_of_fine_tokens = self._simple_line_detection( @@ -223,18 +254,25 @@ def parse(self, input_pdf_path: str) -> Document: y_tolerance=self.line_y_tolerance / page.height, ) assert len(line_ids_of_fine_tokens) == len(fine_tokens) + # 6) accumulate all_tokens.extend(fine_tokens) all_row_ids.extend( [i + last_row_id + 1 for i in line_ids_of_fine_tokens] ) + + # import ipdb; ipdb.set_trace() + + # 7) Update the last word id and row id for this page last_row_id = all_row_ids[-1] all_word_ids.extend( [i + last_word_id + 1 for i in word_ids_of_fine_tokens] ) last_word_id = all_word_ids[-1] + for _ in fine_tokens: all_page_ids.append(page_id) + # now turn into a beautiful document! 
         doc_json = self._convert_nested_text_to_doc_json(
             token_dicts=all_tokens,
@@ -242,6 +280,8 @@ def parse(self, input_pdf_path: str) -> Document:
             row_ids=all_row_ids,
             page_ids=all_page_ids,
         )
+        # import ipdb; ipdb.set_trace()
+
         doc = Document.from_json(doc_json)
         return doc
 
@@ -299,14 +339,15 @@ def _convert_nested_text_to_doc_json(
                 # new row
                 symbols += "\n"
                 start = end + 1
-        # handle last token
-        symbols += token_dicts[-1]["text"]
-        end = start + len(token_dicts[-1]["text"])
-        token = SpanGroup(
-            spans=[Span(start=start, end=end, box=token_dicts[-1]["bbox"])],
-            id=len(token_dicts) - 1,
-        )
-        token_annos.append(token)
+        # handle last token if we have one
+        if len(token_dicts) > 0:
+            symbols += token_dicts[-1]["text"]
+            end = start + len(token_dicts[-1]["text"])
+            last_token = SpanGroup(
+                spans=[Span(start=start, end=end, box=token_dicts[-1]["bbox"])],
+                id=len(token_dicts) - 1,
+            )
+            token_annos.append(last_token)
 
         # 2) build rows
         tokens_with_group_ids = [
diff --git a/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py b/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
index 12eca518..891ccb10 100644
--- a/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
+++ b/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
@@ -483,8 +483,9 @@ def _predict_tokens(
         #     token_id_to_word_id[i] = first_token_id
         #     word_id_to_text[first_token_id] = candidate_text
 
-        # are there any unclassified tokens?
-        assert None not in token_id_to_word_id.values()
+        if any(v is None for v in token_id_to_word_id.values()):
+            raise ValueError("Some tokens are not part of any word.")
+
         return token_id_to_word_id, word_id_to_text
 
     def _convert_to_words(
@@ -493,6 +494,11 @@ def _convert_to_words(
         token_id_to_word_id,
         word_id_to_text
     ) -> List[SpanGroup]:
+
+        if len(document.tokens) == 0:
+            # document has no tokens
+            return []
+
         words = []
         tokens_in_word = [document.tokens[0]]
         current_word_id = 0
diff --git a/src/mmda/recipes/core_recipe.py b/src/mmda/recipes/core_recipe.py
index ba49f59e..1f256b79 100644
--- a/src/mmda/recipes/core_recipe.py
+++ b/src/mmda/recipes/core_recipe.py
@@ -51,6 +51,7 @@ def from_path(self, pdfpath: str) -> Document:
         logger.info("Predicting blocks...")
         blocks = self.effdet_publaynet_predictor.predict(document=doc)
         equations = self.effdet_mfd_predictor.predict(document=doc)
+
         doc.annotate(blocks=blocks + equations)
 
         logger.info("Predicting vila...")
diff --git a/src/mmda/types/document.py b/src/mmda/types/document.py
index 5793237c..4c199e4f 100644
--- a/src/mmda/types/document.py
+++ b/src/mmda/types/document.py
@@ -105,12 +105,11 @@ def annotate(
             setattr(self, field_name, span_groups)
             self.__fields.append(field_name)
 
-    def remove(self, field_name: str):
+    def remove(self, field_name: str) -> None:
         delattr(self, field_name)
         self.__fields = [f for f in self.__fields if f != field_name]
         del self.__indexers[field_name]
 
-
     def annotate_images(
         self, images: Iterable[PILImage], is_overwrite: bool = False
     ) -> None:

From 05234f774af694c5afb2bc5b8b7873f5c01e9e39 Mon Sep 17 00:00:00 2001
From: Luca Soldaini
Date: Tue, 14 Mar 2023 20:23:47 -0700
Subject: [PATCH 2/2] raise PdfParsingError if a PDF can't be loaded

---
 src/mmda/parsers/pdfplumber_parser.py | 63 +++++++++++++++++++++------
 src/mmda/recipes/core_recipe.py       | 27 +++++++-----
 2 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/src/mmda/parsers/pdfplumber_parser.py b/src/mmda/parsers/pdfplumber_parser.py
index be72f2e2..bb11f1a9 100644
--- a/src/mmda/parsers/pdfplumber_parser.py
+++ b/src/mmda/parsers/pdfplumber_parser.py
@@ -1,8 +1,10 @@
+from contextlib import ExitStack
 import itertools
 import string
 from typing import List, Optional, Union
 
 import pdfplumber
+from pdfminer.pdfparser import PDFSyntaxError
 
 try:
     # pdfplumber >= 0.8.0
@@ -22,6 +24,10 @@
 _TOL = Union[int, float]
 
 
+class PdfParsingError(PDFSyntaxError):
+    ...
+
+
 class WordExtractorWithFontInfo(ppu.WordExtractor):
     """Override WordExtractor methods to append additional char-level info."""
 
@@ -76,10 +82,10 @@ class PDFPlumberParser(Parser):
 
     def __init__(
         self,
-        token_x_tolerance: int = 1.5,
-        token_y_tolerance: int = 2,
-        line_x_tolerance: int = 10,
-        line_y_tolerance: int = 10,
+        token_x_tolerance: Union[int, float] = 1.5,
+        token_y_tolerance: Union[int, float] = 2,
+        line_x_tolerance: Union[int, float] = 10,
+        line_y_tolerance: Union[int, float] = 10,
         keep_blank_chars: bool = False,
         use_text_flow: bool = True,
         horizontal_ltr: bool = True,
@@ -157,8 +163,26 @@ def __init__(
             split_at_punctuation = type(self).DEFAULT_PUNCTUATION_CHARS
         self.split_at_punctuation = split_at_punctuation
 
-    def parse(self, input_pdf_path: str) -> Document:
-        with pdfplumber.open(input_pdf_path) as plumber_pdf_object:
+    def parse(self, input_pdf_path: str, **_) -> Document:
+        """Parse the PDF file and return a Document object.
+
+        Args:
+            input_pdf_path (str): The path to the PDF file.
+        """
+
+        # using a stack so we can detect if the open() fails
+        with ExitStack() as stack:
+
+            try:
+                plumber_pdf_object = stack.enter_context(
+                    pdfplumber.open(input_pdf_path)
+                )
+            except PDFSyntaxError as e:
+                raise PdfParsingError(
+                    f"Failed to open '{input_pdf_path}' with pdfplumber. "
+                    "Please check that the file is a valid PDF file."
+                ) from e
+
             all_tokens = []
             all_word_ids = []
             last_word_id = -1
@@ -194,7 +218,14 @@ def parse(self, input_pdf_path: str) -> Document:
                     coarse_tokens=[c["text"] for c in coarse_tokens],
                     fine_tokens=[f["text"] for f in fine_tokens],
                 )
-                assert len(word_ids_of_fine_tokens) == len(fine_tokens)
+                if len(word_ids_of_fine_tokens) != len(fine_tokens):
+                    raise ValueError(
+                        "The length of word_ids_of_fine_tokens and fine_tokens "
+                        f"should be the same on page {page_id}, but got "
+                        f"{len(word_ids_of_fine_tokens)} and {len(fine_tokens)}. "
+                        "Please report this issue on GitHub, and include the "
+                        "pdf file that caused this error."
+                    )
 
                 # 4) normalize / clean tokens & boxes
                 if len(fine_tokens) > 0:
@@ -245,7 +276,6 @@ def parse(self, input_pdf_path: str) -> Document:
                     ]
                     word_ids_of_fine_tokens = [0]
 
-
                 # 5) group tokens into lines
                 # TODO - doesnt belong in parser; should be own predictor
                 line_ids_of_fine_tokens = self._simple_line_detection(
@@ -253,7 +283,14 @@ def parse(self, input_pdf_path: str) -> Document:
                     x_tolerance=self.line_x_tolerance / page.width,
                     y_tolerance=self.line_y_tolerance / page.height,
                 )
-                assert len(line_ids_of_fine_tokens) == len(fine_tokens)
+                if len(line_ids_of_fine_tokens) != len(fine_tokens):
+                    raise ValueError(
+                        "The length of line_ids_of_fine_tokens and fine_tokens "
+                        f"should be the same on page {page_id}, but got "
+                        f"{len(line_ids_of_fine_tokens)} and {len(fine_tokens)}. "
+                        "Please report this issue on GitHub, and include the "
+                        "pdf file that caused this error."
+ ) # 6) accumulate all_tokens.extend(fine_tokens) @@ -261,8 +298,6 @@ def parse(self, input_pdf_path: str) -> Document: [i + last_row_id + 1 for i in line_ids_of_fine_tokens] ) - # import ipdb; ipdb.set_trace() - # 7) Update the last word id and row id for this page last_row_id = all_row_ids[-1] all_word_ids.extend( @@ -280,7 +315,6 @@ def parse(self, input_pdf_path: str) -> Document: row_ids=all_row_ids, page_ids=all_page_ids, ) - # import ipdb; ipdb.set_trace() doc = Document.from_json(doc_json) return doc @@ -401,7 +435,10 @@ def _convert_nested_text_to_doc_json( } def _simple_line_detection( - self, page_tokens: List[dict], x_tolerance: int = 10, y_tolerance: int = 10 + self, + page_tokens: List[dict], + x_tolerance: Union[int, float] = 10, + y_tolerance: Union[int, float] = 10 ) -> List[int]: """Get text lines from the page_tokens. It will automatically add new lines for 1) line breaks (i.e., the current token diff --git a/src/mmda/recipes/core_recipe.py b/src/mmda/recipes/core_recipe.py index 1f256b79..5589240e 100644 --- a/src/mmda/recipes/core_recipe.py +++ b/src/mmda/recipes/core_recipe.py @@ -9,10 +9,8 @@ logger = logging.getLogger(__name__) -from mmda.types import * - - -from mmda.parsers.pdfplumber_parser import PDFPlumberParser +from mmda.types import Document +from mmda.parsers.pdfplumber_parser import PDFPlumberParser, PdfParsingError from mmda.rasterizers.rasterizer import PDF2ImageRasterizer from mmda.predictors.heuristic_predictors.dictionary_word_predictor import DictionaryWordPredictor from mmda.predictors.lp_predictors import LayoutParserPredictor @@ -22,18 +20,25 @@ class CoreRecipe(Recipe): - def __init__(self, - effdet_publaynet_predictor_path: str = 'lp://efficientdet/PubLayNet', - effdet_mfd_predictor_path: str = 'lp://efficientdet/MFD', - vila_predictor_path: str = 'allenai/ivila-row-layoutlm-finetuned-s2vl-v2'): + def __init__( + self, + effdet_publaynet_predictor_path: str = 'lp://efficientdet/PubLayNet', + effdet_mfd_predictor_path: str = 'lp://efficientdet/MFD', + vila_predictor_path: str = 'allenai/ivila-row-layoutlm-finetuned-s2vl-v2' + ): logger.info("Instantiating recipe...") self.parser = PDFPlumberParser() self.rasterizer = PDF2ImageRasterizer() self.word_predictor = DictionaryWordPredictor() self.effdet_publaynet_predictor = LayoutParserPredictor.from_pretrained( - effdet_publaynet_predictor_path) - self.effdet_mfd_predictor = LayoutParserPredictor.from_pretrained(effdet_mfd_predictor_path) - self.vila_predictor = IVILATokenClassificationPredictor.from_pretrained(vila_predictor_path) + effdet_publaynet_predictor_path + ) + self.effdet_mfd_predictor = LayoutParserPredictor.from_pretrained( + effdet_mfd_predictor_path + ) + self.vila_predictor = IVILATokenClassificationPredictor.from_pretrained( + vila_predictor_path + ) logger.info("Finished instantiating recipe") def from_path(self, pdfpath: str) -> Document:
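
Note (editor's sketch, not part of the patches): with the empty-page fix in PATCH 1/2,
a page that has no extractable text still receives a single fictitious token (the null
character "\u0000" in a unit box at the page origin), so any annotation that lands on
that page has something to anchor to. A minimal usage sketch, assuming a hypothetical
file `blank_page.pdf` whose second page is empty, and assuming SpanGroup/Span/Box
behave as shown in the diffs above:

    from mmda.parsers.pdfplumber_parser import PDFPlumberParser

    doc = PDFPlumberParser().parse(input_pdf_path="blank_page.pdf")
    # collect tokens whose box sits on page index 1 (the empty page)
    page_tokens = [t for t in doc.tokens if t.spans[0].box.page == 1]
    assert len(page_tokens) == 1  # the single placeholder token
    # the placeholder contributes the null character to the symbols string
    assert doc.symbols[page_tokens[0].spans[0].start] == "\u0000"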
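Note (editor's sketch, not part of the patches): after PATCH 2/2, callers can catch the
new PdfParsingError when a file cannot be opened; since PdfParsingError subclasses
pdfminer's PDFSyntaxError, any existing handlers keep working. A minimal sketch,
assuming a hypothetical path `broken.pdf`:

    from mmda.parsers.pdfplumber_parser import PDFPlumberParser, PdfParsingError

    parser = PDFPlumberParser()
    try:
        doc = parser.parse(input_pdf_path="broken.pdf")
    except PdfParsingError as e:
        # pdfplumber could not open the file (e.g., it is not a valid PDF)
        print(f"skipping unparseable pdf: {e}")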