From 7628de57ac79f3a0c34551fe1c21be0155a9700e Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Tue, 14 Mar 2023 16:56:35 -0700 Subject: [PATCH 1/2] fix for empty pages --- src/mmda/parsers/pdfplumber_parser.py | 95 +++++++++++++------ .../dictionary_word_predictor.py | 10 +- src/mmda/recipes/core_recipe.py | 1 + src/mmda/types/document.py | 3 +- 4 files changed, 78 insertions(+), 31 deletions(-) diff --git a/src/mmda/parsers/pdfplumber_parser.py b/src/mmda/parsers/pdfplumber_parser.py index ae5a4fca..be72f2e2 100644 --- a/src/mmda/parsers/pdfplumber_parser.py +++ b/src/mmda/parsers/pdfplumber_parser.py @@ -195,26 +195,57 @@ def parse(self, input_pdf_path: str) -> Document: fine_tokens=[f["text"] for f in fine_tokens], ) assert len(word_ids_of_fine_tokens) == len(fine_tokens) + # 4) normalize / clean tokens & boxes - fine_tokens = [ - { - "text": token["text"], - "fontname": token["fontname"], - "size": token["size"], - "bbox": Box.from_pdf_coordinates( - x1=float(token["x0"]), - y1=float(token["top"]), - x2=float(token["x1"]), - y2=float(token["bottom"]), - page_width=float(page.width), - page_height=float(page.height), - page=int(page_id), - ).get_relative( - page_width=float(page.width), page_height=float(page.height) - ), - } - for token in fine_tokens - ] + if len(fine_tokens) > 0: + fine_tokens = [ + { + "text": token["text"], + "fontname": token["fontname"], + "size": token["size"], + "bbox": Box.from_pdf_coordinates( + x1=float(token["x0"]), + y1=float(token["top"]), + x2=float(token["x1"]), + y2=float(token["bottom"]), + page_width=float(page.width), + page_height=float(page.height), + page=int(page_id), + ).get_relative( + page_width=float(page.width), + page_height=float(page.height) + ), + } + for token in fine_tokens + ] + else: + # this page does not have tokens, so we add a single + # fictitious token to the page. This is to ensure that + # any element that is on this page gets anchored to + # something + fine_tokens = [ + { + # adding one single character: the null character + "text": "\u0000", + "fontname": "", + "size": 0., + "bbox": Box.from_pdf_coordinates( + x1=0., + y1=0., + x2=1., + y2=1., + page_width=float(page.width), + page_height=float(page.height), + page=int(page_id), + ).get_relative( + page_width=float(page.width), + page_height=float(page.height) + ), + } + ] + word_ids_of_fine_tokens = [0] + + # 5) group tokens into lines # TODO - doesnt belong in parser; should be own predictor line_ids_of_fine_tokens = self._simple_line_detection( @@ -223,18 +254,25 @@ def parse(self, input_pdf_path: str) -> Document: y_tolerance=self.line_y_tolerance / page.height, ) assert len(line_ids_of_fine_tokens) == len(fine_tokens) + # 6) accumulate all_tokens.extend(fine_tokens) all_row_ids.extend( [i + last_row_id + 1 for i in line_ids_of_fine_tokens] ) + + # import ipdb; ipdb.set_trace() + + # 7) Update the last word id and row id for this page last_row_id = all_row_ids[-1] all_word_ids.extend( [i + last_word_id + 1 for i in word_ids_of_fine_tokens] ) last_word_id = all_word_ids[-1] + for _ in fine_tokens: all_page_ids.append(page_id) + # now turn into a beautiful document! 
         doc_json = self._convert_nested_text_to_doc_json(
             token_dicts=all_tokens,
@@ -242,6 +280,8 @@ def parse(self, input_pdf_path: str) -> Document:
             row_ids=all_row_ids,
             page_ids=all_page_ids,
         )
+        # import ipdb; ipdb.set_trace()
+
         doc = Document.from_json(doc_json)
         return doc
 
@@ -299,14 +339,15 @@ def _convert_nested_text_to_doc_json(
                 # new row
                 symbols += "\n"
                 start = end + 1
-        # handle last token
-        symbols += token_dicts[-1]["text"]
-        end = start + len(token_dicts[-1]["text"])
-        token = SpanGroup(
-            spans=[Span(start=start, end=end, box=token_dicts[-1]["bbox"])],
-            id=len(token_dicts) - 1,
-        )
-        token_annos.append(token)
+        # handle last token if we have one
+        if len(token_dicts) > 0:
+            symbols += token_dicts[-1]["text"]
+            end = start + len(token_dicts[-1]["text"])
+            last_token = SpanGroup(
+                spans=[Span(start=start, end=end, box=token_dicts[-1]["bbox"])],
+                id=len(token_dicts) - 1,
+            )
+            token_annos.append(last_token)
 
         # 2) build rows
         tokens_with_group_ids = [
diff --git a/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py b/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
index 12eca518..891ccb10 100644
--- a/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
+++ b/src/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
@@ -483,8 +483,9 @@ def _predict_tokens(
         #     token_id_to_word_id[i] = first_token_id
         #     word_id_to_text[first_token_id] = candidate_text
 
-        # are there any unclassified tokens?
-        assert None not in token_id_to_word_id.values()
+        if any(v is None for v in token_id_to_word_id.values()):
+            raise ValueError("Some tokens are not part of any word.")
+
         return token_id_to_word_id, word_id_to_text
 
     def _convert_to_words(
@@ -493,6 +494,11 @@ def _convert_to_words(
         token_id_to_word_id,
         word_id_to_text
     ) -> List[SpanGroup]:
+
+        if len(document.tokens) == 0:
+            # document has no tokens
+            return []
+
         words = []
         tokens_in_word = [document.tokens[0]]
         current_word_id = 0
diff --git a/src/mmda/recipes/core_recipe.py b/src/mmda/recipes/core_recipe.py
index ba49f59e..1f256b79 100644
--- a/src/mmda/recipes/core_recipe.py
+++ b/src/mmda/recipes/core_recipe.py
@@ -51,6 +51,7 @@ def from_path(self, pdfpath: str) -> Document:
         logger.info("Predicting blocks...")
         blocks = self.effdet_publaynet_predictor.predict(document=doc)
         equations = self.effdet_mfd_predictor.predict(document=doc)
+
         doc.annotate(blocks=blocks + equations)
 
         logger.info("Predicting vila...")
diff --git a/src/mmda/types/document.py b/src/mmda/types/document.py
index 5793237c..4c199e4f 100644
--- a/src/mmda/types/document.py
+++ b/src/mmda/types/document.py
@@ -105,12 +105,11 @@ def annotate(
             setattr(self, field_name, span_groups)
             self.__fields.append(field_name)
 
-    def remove(self, field_name: str):
+    def remove(self, field_name: str) -> None:
         delattr(self, field_name)
         self.__fields = [f for f in self.__fields if f != field_name]
         del self.__indexers[field_name]
 
-
     def annotate_images(
         self, images: Iterable[PILImage], is_overwrite: bool = False
     ) -> None:

From 05234f774af694c5afb2bc5b8b7873f5c01e9e39 Mon Sep 17 00:00:00 2001
From: Luca Soldaini
Date: Tue, 14 Mar 2023 20:23:47 -0700
Subject: [PATCH 2/2] raise PdfParsingError if a PDF can't be loaded

---
 src/mmda/parsers/pdfplumber_parser.py | 63 +++++++++++++++++++++------
 src/mmda/recipes/core_recipe.py       | 27 +++++++-----
 2 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/src/mmda/parsers/pdfplumber_parser.py b/src/mmda/parsers/pdfplumber_parser.py
index be72f2e2..bb11f1a9 100644
--- a/src/mmda/parsers/pdfplumber_parser.py
+++ b/src/mmda/parsers/pdfplumber_parser.py
@@ -1,8 +1,10 @@
+from contextlib import ExitStack
 import itertools
 import string
 from typing import List, Optional, Union
 
 import pdfplumber
+from pdfminer.pdfparser import PDFSyntaxError
 
 try:
     # pdfplumber >= 0.8.0
@@ -22,6 +24,10 @@
 _TOL = Union[int, float]
 
 
+class PdfParsingError(PDFSyntaxError):
+    ...
+
+
 class WordExtractorWithFontInfo(ppu.WordExtractor):
     """Override WordExtractor methods to append additional char-level info."""
 
@@ -76,10 +82,10 @@ class PDFPlumberParser(Parser):
 
     def __init__(
         self,
-        token_x_tolerance: int = 1.5,
-        token_y_tolerance: int = 2,
-        line_x_tolerance: int = 10,
-        line_y_tolerance: int = 10,
+        token_x_tolerance: Union[int, float] = 1.5,
+        token_y_tolerance: Union[int, float] = 2,
+        line_x_tolerance: Union[int, float] = 10,
+        line_y_tolerance: Union[int, float] = 10,
         keep_blank_chars: bool = False,
         use_text_flow: bool = True,
         horizontal_ltr: bool = True,
@@ -157,8 +163,26 @@ def __init__(
             split_at_punctuation = type(self).DEFAULT_PUNCTUATION_CHARS
         self.split_at_punctuation = split_at_punctuation
 
-    def parse(self, input_pdf_path: str) -> Document:
-        with pdfplumber.open(input_pdf_path) as plumber_pdf_object:
+    def parse(self, input_pdf_path: str, **_) -> Document:
+        """Parse the PDF file and return a Document object.
+
+        Args:
+            input_pdf_path (str): The path to the PDF file.
+        """
+
+        # using a stack so we can detect if the open() fails
+        with ExitStack() as stack:
+
+            try:
+                plumber_pdf_object = stack.enter_context(
+                    pdfplumber.open(input_pdf_path)
+                )
+            except PDFSyntaxError as e:
+                raise PdfParsingError(
+                    f"Failed to open '{input_pdf_path}' with pdfplumber. "
+                    "Please check that the file is a valid PDF file."
+                ) from e
+
             all_tokens = []
             all_word_ids = []
             last_word_id = -1
@@ -194,7 +218,14 @@ def parse(self, input_pdf_path: str) -> Document:
                     coarse_tokens=[c["text"] for c in coarse_tokens],
                     fine_tokens=[f["text"] for f in fine_tokens],
                 )
-                assert len(word_ids_of_fine_tokens) == len(fine_tokens)
+                if len(word_ids_of_fine_tokens) != len(fine_tokens):
+                    raise ValueError(
+                        "The length of word_ids_of_fine_tokens and fine_tokens "
+                        f"should be the same on page {page_id}, but got "
+                        f"{len(word_ids_of_fine_tokens)} and {len(fine_tokens)}. "
+                        "Please report this issue on GitHub, and include the "
+                        "pdf file that caused this error."
+                    )
 
                 # 4) normalize / clean tokens & boxes
                 if len(fine_tokens) > 0:
@@ -245,7 +276,6 @@ def parse(self, input_pdf_path: str) -> Document:
                     ]
                     word_ids_of_fine_tokens = [0]
 
-
                 # 5) group tokens into lines
                 # TODO - doesnt belong in parser; should be own predictor
                 line_ids_of_fine_tokens = self._simple_line_detection(
@@ -253,7 +283,14 @@ def parse(self, input_pdf_path: str) -> Document:
                     x_tolerance=self.line_x_tolerance / page.width,
                     y_tolerance=self.line_y_tolerance / page.height,
                 )
-                assert len(line_ids_of_fine_tokens) == len(fine_tokens)
+                if len(line_ids_of_fine_tokens) != len(fine_tokens):
+                    raise ValueError(
+                        "The length of line_ids_of_fine_tokens and fine_tokens "
+                        f"should be the same on page {page_id}, but got "
+                        f"{len(line_ids_of_fine_tokens)} and {len(fine_tokens)}. "
+                        "Please report this issue on GitHub, and include the "
+                        "pdf file that caused this error."
+ ) # 6) accumulate all_tokens.extend(fine_tokens) @@ -261,8 +298,6 @@ def parse(self, input_pdf_path: str) -> Document: [i + last_row_id + 1 for i in line_ids_of_fine_tokens] ) - # import ipdb; ipdb.set_trace() - # 7) Update the last word id and row id for this page last_row_id = all_row_ids[-1] all_word_ids.extend( @@ -280,7 +315,6 @@ def parse(self, input_pdf_path: str) -> Document: row_ids=all_row_ids, page_ids=all_page_ids, ) - # import ipdb; ipdb.set_trace() doc = Document.from_json(doc_json) return doc @@ -401,7 +435,10 @@ def _convert_nested_text_to_doc_json( } def _simple_line_detection( - self, page_tokens: List[dict], x_tolerance: int = 10, y_tolerance: int = 10 + self, + page_tokens: List[dict], + x_tolerance: Union[int, float] = 10, + y_tolerance: Union[int, float] = 10 ) -> List[int]: """Get text lines from the page_tokens. It will automatically add new lines for 1) line breaks (i.e., the current token diff --git a/src/mmda/recipes/core_recipe.py b/src/mmda/recipes/core_recipe.py index 1f256b79..5589240e 100644 --- a/src/mmda/recipes/core_recipe.py +++ b/src/mmda/recipes/core_recipe.py @@ -9,10 +9,8 @@ logger = logging.getLogger(__name__) -from mmda.types import * - - -from mmda.parsers.pdfplumber_parser import PDFPlumberParser +from mmda.types import Document +from mmda.parsers.pdfplumber_parser import PDFPlumberParser, PdfParsingError from mmda.rasterizers.rasterizer import PDF2ImageRasterizer from mmda.predictors.heuristic_predictors.dictionary_word_predictor import DictionaryWordPredictor from mmda.predictors.lp_predictors import LayoutParserPredictor @@ -22,18 +20,25 @@ class CoreRecipe(Recipe): - def __init__(self, - effdet_publaynet_predictor_path: str = 'lp://efficientdet/PubLayNet', - effdet_mfd_predictor_path: str = 'lp://efficientdet/MFD', - vila_predictor_path: str = 'allenai/ivila-row-layoutlm-finetuned-s2vl-v2'): + def __init__( + self, + effdet_publaynet_predictor_path: str = 'lp://efficientdet/PubLayNet', + effdet_mfd_predictor_path: str = 'lp://efficientdet/MFD', + vila_predictor_path: str = 'allenai/ivila-row-layoutlm-finetuned-s2vl-v2' + ): logger.info("Instantiating recipe...") self.parser = PDFPlumberParser() self.rasterizer = PDF2ImageRasterizer() self.word_predictor = DictionaryWordPredictor() self.effdet_publaynet_predictor = LayoutParserPredictor.from_pretrained( - effdet_publaynet_predictor_path) - self.effdet_mfd_predictor = LayoutParserPredictor.from_pretrained(effdet_mfd_predictor_path) - self.vila_predictor = IVILATokenClassificationPredictor.from_pretrained(vila_predictor_path) + effdet_publaynet_predictor_path + ) + self.effdet_mfd_predictor = LayoutParserPredictor.from_pretrained( + effdet_mfd_predictor_path + ) + self.vila_predictor = IVILATokenClassificationPredictor.from_pretrained( + vila_predictor_path + ) logger.info("Finished instantiating recipe") def from_path(self, pdfpath: str) -> Document:
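
Note (editor's sketch, not part of the patches): with the empty-page fix in PATCH 1/2,
a page that has no extractable text still receives a single fictitious token (the null
character "\u0000" in a unit box at the page origin), so any annotation that lands on
that page has something to anchor to. A minimal usage sketch, assuming a hypothetical
file `blank_page.pdf` whose second page is empty, and assuming SpanGroup/Span/Box
behave as shown in the diffs above:

    from mmda.parsers.pdfplumber_parser import PDFPlumberParser

    doc = PDFPlumberParser().parse(input_pdf_path="blank_page.pdf")
    # collect tokens whose box sits on page index 1 (the empty page)
    page_tokens = [t for t in doc.tokens if t.spans[0].box.page == 1]
    assert len(page_tokens) == 1  # the single placeholder token
    # the placeholder contributes the null character to the symbols string
    assert doc.symbols[page_tokens[0].spans[0].start] == "\u0000"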
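Note (editor's sketch, not part of the patches): after PATCH 2/2, callers can catch the
new PdfParsingError when a file cannot be opened; since PdfParsingError subclasses
pdfminer's PDFSyntaxError, any existing handlers keep working. A minimal sketch,
assuming a hypothetical path `broken.pdf`:

    from mmda.parsers.pdfplumber_parser import PDFPlumberParser, PdfParsingError

    parser = PDFPlumberParser()
    try:
        doc = parser.parse(input_pdf_path="broken.pdf")
    except PdfParsingError as e:
        # pdfplumber could not open the file (e.g., it is not a valid PDF)
        print(f"skipping unparseable pdf: {e}")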