Fix most common parsing errors. #209

Draft
Wants to merge 3 commits into base: main
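For reviewers, a minimal sketch of the caller-facing behavior this PR aims for: parser failures that previously surfaced as bare `PDFSyntaxError`s or assertion errors are wrapped in the new `PdfParsingError`, which subclasses `PDFSyntaxError`, so existing handlers keep working. The file path below is a hypothetical placeholder, not part of this diff.

```python
# Sketch only: "broken.pdf" stands in for any malformed or truncated PDF.
from mmda.parsers.pdfplumber_parser import PDFPlumberParser, PdfParsingError

parser = PDFPlumberParser()
try:
    doc = parser.parse(input_pdf_path="broken.pdf")
except PdfParsingError as err:
    # PdfParsingError subclasses pdfminer's PDFSyntaxError, so callers that
    # already catch PDFSyntaxError do not need to change.
    print(f"Skipping unparseable PDF: {err}")
```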
150 changes: 114 additions & 36 deletions src/mmda/parsers/pdfplumber_parser.py
@@ -1,8 +1,10 @@
from contextlib import ExitStack
import itertools
import string
from typing import List, Optional, Union

import pdfplumber
from pdfminer.pdfparser import PDFSyntaxError

try:
# pdfplumber >= 0.8.0
@@ -22,6 +24,10 @@
_TOL = Union[int, float]


class PdfParsingError(PDFSyntaxError):
...


class WordExtractorWithFontInfo(ppu.WordExtractor):
"""Override WordExtractor methods to append additional char-level info."""

@@ -76,10 +82,10 @@ class PDFPlumberParser(Parser):

def __init__(
self,
token_x_tolerance: int = 1.5,
token_y_tolerance: int = 2,
line_x_tolerance: int = 10,
line_y_tolerance: int = 10,
token_x_tolerance: Union[int, float] = 1.5,
token_y_tolerance: Union[int, float] = 2,
line_x_tolerance: Union[int, float] = 10,
line_y_tolerance: Union[int, float] = 10,
keep_blank_chars: bool = False,
use_text_flow: bool = True,
horizontal_ltr: bool = True,
@@ -157,8 +163,26 @@ def __init__(
split_at_punctuation = type(self).DEFAULT_PUNCTUATION_CHARS
self.split_at_punctuation = split_at_punctuation

def parse(self, input_pdf_path: str) -> Document:
with pdfplumber.open(input_pdf_path) as plumber_pdf_object:
def parse(self, input_pdf_path: str, **_) -> Document:
"""Parse the pdf file and return a Document object.

Args:
input_pdf_path (str): The path to the pdf file.
"""

# use an ExitStack so a failure in pdfplumber.open() can be caught separately
with ExitStack() as stack:

try:
plumber_pdf_object = stack.enter_context(
pdfplumber.open(input_pdf_path)
)
except PDFSyntaxError as e:
raise PdfParsingError(
f"Failed to open '{input_pdf_path}' with pdfplumber. "
f"Please check if the file is a valid pdf file. "
) from e

all_tokens = []
all_word_ids = []
last_word_id = -1
@@ -194,54 +218,104 @@ def parse(self, input_pdf_path: str) -> Document:
coarse_tokens=[c["text"] for c in coarse_tokens],
fine_tokens=[f["text"] for f in fine_tokens],
)
assert len(word_ids_of_fine_tokens) == len(fine_tokens)
if len(word_ids_of_fine_tokens) != len(fine_tokens):
raise ValueError(
"The length of word_ids_of_fine_tokens and fine_tokens "
f"should be the same on page {page_id}, but got "
f"{len(word_ids_of_fine_tokens)} and {len(fine_tokens)}."
"Please report this issue on Github, and include the "
"pdf file that caused this error."
)

# 4) normalize / clean tokens & boxes
fine_tokens = [
{
"text": token["text"],
"fontname": token["fontname"],
"size": token["size"],
"bbox": Box.from_pdf_coordinates(
x1=float(token["x0"]),
y1=float(token["top"]),
x2=float(token["x1"]),
y2=float(token["bottom"]),
page_width=float(page.width),
page_height=float(page.height),
page=int(page_id),
).get_relative(
page_width=float(page.width), page_height=float(page.height)
),
}
for token in fine_tokens
]
if len(fine_tokens) > 0:
fine_tokens = [
{
"text": token["text"],
"fontname": token["fontname"],
"size": token["size"],
"bbox": Box.from_pdf_coordinates(
x1=float(token["x0"]),
y1=float(token["top"]),
x2=float(token["x1"]),
y2=float(token["bottom"]),
page_width=float(page.width),
page_height=float(page.height),
page=int(page_id),
).get_relative(
page_width=float(page.width),
page_height=float(page.height)
),
}
for token in fine_tokens
]
else:
# this page does not have tokens, so we add a single
# fictitious token to the page. This is to ensure that
# any element that is on this page gets anchored to
# something
fine_tokens = [
{
# adding one single character: the null character
"text": "\u0000",
"fontname": "",
"size": 0.,
"bbox": Box.from_pdf_coordinates(
x1=0.,
y1=0.,
x2=1.,
y2=1.,
page_width=float(page.width),
page_height=float(page.height),
page=int(page_id),
).get_relative(
page_width=float(page.width),
page_height=float(page.height)
),
}
]
word_ids_of_fine_tokens = [0]

# 5) group tokens into lines
# TODO - doesnt belong in parser; should be own predictor
line_ids_of_fine_tokens = self._simple_line_detection(
page_tokens=fine_tokens,
x_tolerance=self.line_x_tolerance / page.width,
y_tolerance=self.line_y_tolerance / page.height,
)
assert len(line_ids_of_fine_tokens) == len(fine_tokens)
if len(line_ids_of_fine_tokens) != len(fine_tokens):
raise ValueError(
"The length of line_ids_of_fine_tokens and fine_tokens "
f"should be the same on page {page_id}, but got "
f"{len(line_ids_of_fine_tokens)} and {len(fine_tokens)}."
"Please report this issue on Github, and include the "
"pdf file that caused this error."
)

# 6) accumulate
all_tokens.extend(fine_tokens)
all_row_ids.extend(
[i + last_row_id + 1 for i in line_ids_of_fine_tokens]
)

# 7) Update the last word id and row id for this page
last_row_id = all_row_ids[-1]
all_word_ids.extend(
[i + last_word_id + 1 for i in word_ids_of_fine_tokens]
)
last_word_id = all_word_ids[-1]

for _ in fine_tokens:
all_page_ids.append(page_id)

# now turn into a beautiful document!
doc_json = self._convert_nested_text_to_doc_json(
token_dicts=all_tokens,
word_ids=all_word_ids,
row_ids=all_row_ids,
page_ids=all_page_ids,
)

doc = Document.from_json(doc_json)
return doc

@@ -299,14 +373,15 @@ def _convert_nested_text_to_doc_json(
# new row
symbols += "\n"
start = end + 1
# handle last token
symbols += token_dicts[-1]["text"]
end = start + len(token_dicts[-1]["text"])
token = SpanGroup(
spans=[Span(start=start, end=end, box=token_dicts[-1]["bbox"])],
id=len(token_dicts) - 1,
)
token_annos.append(token)
# handle last token if we have one
if len(token_dicts) > 0:
symbols += token_dicts[-1]["text"]
end = start + len(token_dicts[-1]["text"])
last_token = SpanGroup(
spans=[Span(start=start, end=end, box=token_dicts[-1]["bbox"])],
id=len(token_dicts) - 1,
)
token_annos.append(last_token)

# 2) build rows
tokens_with_group_ids = [
@@ -360,7 +435,10 @@ def _convert_nested_text_to_doc_json(
}

def _simple_line_detection(
self, page_tokens: List[dict], x_tolerance: int = 10, y_tolerance: int = 10
self,
page_tokens: List[dict],
x_tolerance: Union[int, float] = 10,
y_tolerance: Union[int, float] = 10
) -> List[int]:
"""Get text lines from the page_tokens.
It will automatically add new lines for 1) line breaks (i.e., the current token
@@ -483,8 +483,9 @@ def _predict_tokens(
# token_id_to_word_id[i] = first_token_id
# word_id_to_text[first_token_id] = candidate_text

# are there any unclassified tokens?
assert None not in token_id_to_word_id.values()
if any(v is None for v in token_id_to_word_id.values()):
raise ValueError("Some tokens are not part of any word.")

return token_id_to_word_id, word_id_to_text

def _convert_to_words(
@@ -493,6 +494,11 @@ def _convert_to_words(
token_id_to_word_id,
word_id_to_text
) -> List[SpanGroup]:

if len(document.tokens) == 0:
# document has no tokens
return []

words = []
tokens_in_word = [document.tokens[0]]
current_word_id = 0
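Two behavior notes on the word-predictor hunks above. First, replacing the `assert` with an explicit check keeps the guard active even when Python runs with `-O` (which strips assertions) and raises a clearer error. A self-contained illustration, using a made-up mapping:

```python
# Hypothetical mapping in which token 2 was never assigned to a word.
token_id_to_word_id = {0: 0, 1: 0, 2: None}

# Unlike the assert it replaces, this check is not stripped under `python -O`.
if any(v is None for v in token_id_to_word_id.values()):
    raise ValueError("Some tokens are not part of any word.")
```

Second, the early return in `_convert_to_words` means a document with no tokens now yields an empty word list instead of raising an `IndexError` on `document.tokens[0]`.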
28 changes: 17 additions & 11 deletions src/mmda/recipes/core_recipe.py
@@ -9,10 +9,8 @@

logger = logging.getLogger(__name__)

from mmda.types import *


from mmda.parsers.pdfplumber_parser import PDFPlumberParser
from mmda.types import Document
from mmda.parsers.pdfplumber_parser import PDFPlumberParser, PdfParsingError
from mmda.rasterizers.rasterizer import PDF2ImageRasterizer
from mmda.predictors.heuristic_predictors.dictionary_word_predictor import DictionaryWordPredictor
from mmda.predictors.lp_predictors import LayoutParserPredictor
@@ -22,18 +20,25 @@


class CoreRecipe(Recipe):
def __init__(self,
effdet_publaynet_predictor_path: str = 'lp://efficientdet/PubLayNet',
effdet_mfd_predictor_path: str = 'lp://efficientdet/MFD',
vila_predictor_path: str = 'allenai/ivila-row-layoutlm-finetuned-s2vl-v2'):
def __init__(
self,
effdet_publaynet_predictor_path: str = 'lp://efficientdet/PubLayNet',
effdet_mfd_predictor_path: str = 'lp://efficientdet/MFD',
vila_predictor_path: str = 'allenai/ivila-row-layoutlm-finetuned-s2vl-v2'
):
logger.info("Instantiating recipe...")
self.parser = PDFPlumberParser()
self.rasterizer = PDF2ImageRasterizer()
self.word_predictor = DictionaryWordPredictor()
self.effdet_publaynet_predictor = LayoutParserPredictor.from_pretrained(
effdet_publaynet_predictor_path)
self.effdet_mfd_predictor = LayoutParserPredictor.from_pretrained(effdet_mfd_predictor_path)
self.vila_predictor = IVILATokenClassificationPredictor.from_pretrained(vila_predictor_path)
effdet_publaynet_predictor_path
)
self.effdet_mfd_predictor = LayoutParserPredictor.from_pretrained(
effdet_mfd_predictor_path
)
self.vila_predictor = IVILATokenClassificationPredictor.from_pretrained(
vila_predictor_path
)
logger.info("Finished instantiating recipe")

def from_path(self, pdfpath: str) -> Document:
@@ -51,6 +56,7 @@ def from_path(self, pdfpath: str) -> Document:
logger.info("Predicting blocks...")
blocks = self.effdet_publaynet_predictor.predict(document=doc)
equations = self.effdet_mfd_predictor.predict(document=doc)

doc.annotate(blocks=blocks + equations)

logger.info("Predicting vila...")
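`core_recipe.py` now imports `PdfParsingError` alongside the parser; the hunk where it is used is not visible here, but the import suggests `CoreRecipe.from_path` is meant to surface it. A hedged sketch of how a batch caller might rely on that (the paths are placeholders, and the error handling inside `from_path` is assumed rather than shown in this diff):

```python
# Sketch only: skip PDFs that pdfplumber cannot open instead of aborting the batch.
from mmda.recipes.core_recipe import CoreRecipe
from mmda.parsers.pdfplumber_parser import PdfParsingError

recipe = CoreRecipe()
parsed = []
for path in ["paper1.pdf", "paper2.pdf"]:  # hypothetical input paths
    try:
        parsed.append(recipe.from_path(path))
    except PdfParsingError as err:
        print(f"Skipping {path}: {err}")
```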
3 changes: 1 addition & 2 deletions src/mmda/types/document.py
@@ -105,12 +105,11 @@ def annotate(
setattr(self, field_name, span_groups)
self.__fields.append(field_name)

def remove(self, field_name: str):
def remove(self, field_name: str) -> None:
delattr(self, field_name)
self.__fields = [f for f in self.__fields if f != field_name]
del self.__indexers[field_name]


def annotate_images(
self, images: Iterable[PILImage], is_overwrite: bool = False
) -> None: