Skip to content

Commit

Permalink
Merge pull request #1 from swisstopo/textract-optimisations
Browse files Browse the repository at this point in the history
Textract optimisations
  • Loading branch information
stijnvermeeren-swisstopo authored Oct 9, 2024
2 parents 6a11177 + db32c6b commit ffbe9b0
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 28 deletions.
13 changes: 6 additions & 7 deletions util/applyocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,15 @@ def __init__(
self,
textractor: Textractor,
confidence_threshold: float,
page: fitz.Page,
doc_copy: fitz.Document,
textract_doc: fitz.Document,
ignore_rects: list[fitz.Rect],
tmp_path_prefix: str
):
self.textractor = textractor
self.confidence_threshold = confidence_threshold
self.page = page
self.doc_copy = doc_copy
self.page_copy = doc_copy[0]
# single-page PDF document that will be sent to AWS Textract
self.textract_doc = textract_doc
self.page = textract_doc[0]
self.ignore_rects = ignore_rects
self.tmp_path_prefix = tmp_path_prefix

Expand Down Expand Up @@ -58,7 +57,7 @@ def apply_vertical_check(self, text_lines: list[TextLine], clip_rect: fitz.Rect)
if vertical_detected:
print(" Potential vertical text detected. Running OCR again with horizontal text masked.")
for rect in processed_rects:
self.page_copy.draw_rect(
self.page.draw_rect(
rect * self.page.derotation_matrix,
width=0,
fill=fitz.utils.getColor("white")
Expand All @@ -74,7 +73,7 @@ def _ocr_text_lines(self, clip_rect: fitz.Rect, rotate: float) -> list[TextLine]
text_lines = []
final_clip_rects = clip_rects(clip_rect)
for final_clip_rect in final_clip_rects:
new_lines = textract(self.doc_copy, self.textractor, self.tmp_file_path("pdf"), final_clip_rect, rotate)
new_lines = textract(self.textract_doc, self.textractor, self.tmp_file_path("pdf"), final_clip_rect, rotate)
text_lines = combine_text_lines(text_lines, new_lines)
return text_lines

Expand Down
11 changes: 8 additions & 3 deletions util/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,17 @@ def crop_images(page: fitz.Page, out_doc: fitz.Document):
return

images_info = {dict["xref"]: dict for dict in page.get_image_info(xrefs=True)}

for xref, dict in images_info.items():
try:
img_size = fitz.Matrix(dict["width"], dict["height"])
extracted_img = out_doc.extract_image(xref)
image_bbox = fitz.Rect(*dict["bbox"])

extension = extracted_img['ext']
if extension == 'jb2':
# Example PDF file with a JBIG2 image: A204.pdf
print(" Skipping JBIG2 image.")
continue
if extension == 'jpx':
# Some viewers, most notably the Edge browser, have problems displaying JPX images (slow / bad quality).
# Therefore, we convert them to JPG.
Expand Down Expand Up @@ -75,9 +78,11 @@ def crop_images(page: fitz.Page, out_doc: fitz.Document):
crop.transform(transform_inv)
crop.transform(img_size)

# print(extracted_img["ext"])
try:
img = fitz.Pixmap(extracted_img["image"])
img = fitz.Pixmap(out_doc, xref)
# Force the image into RGB color-space. Otherwise, colors might get distorted, e.g. in A8297.pdf.
# See also https://github.com/pymupdf/PyMuPDF/issues/725#issuecomment-730561405
img = fitz.Pixmap(fitz.csRGB, img)
except FzErrorFormat:
print(" Unsupported image format. Skipping image.")
continue
Expand Down
37 changes: 23 additions & 14 deletions util/textract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import fitz
import os
import backoff
import pymupdf
from botocore.exceptions import ClientError
from textractor import Textractor
from trp.t_pipeline import add_page_orientation
Expand All @@ -12,13 +11,10 @@
import textractcaller.t_call as t_call
import statistics


from util.readingorder import TextLine


GET_PIXMAP_ZOOM_FOR_TEXTRACT = 2
MAX_DIMENSION_PIXELS = 4000
MAX_DIMENSION_POINTS = MAX_DIMENSION_PIXELS // GET_PIXMAP_ZOOM_FOR_TEXTRACT
MAX_DIMENSION_POINTS = 2000


def textract_coordinate_transform(
Expand Down Expand Up @@ -67,16 +63,24 @@ def textract(doc: fitz.Document, extractor: Textractor, tmp_file_path: str, clip

clip_transformed = clip_rect * page.rect.torect(page.cropbox)

page.set_cropbox(clip_transformed.intersect(page.mediabox)) # TODO make more robust, e.g. 267123080-bp.pdf
# Even though the documentation says that the cropbox is always contained in the mediabox, this is not always the
# case, e.g. 267123080-bp.pdf. The discrepancies are usually very small (floating point accuracy errors?). Even so,
# a trivial call such as page.set_cropbox(page.cropbox) will fail with a "CropBox not in MediaBox" error, if this
# is the case. To avoid such errors, we take an explicit intersection with the mediabox whenever we call
# page.set_cropbox(). Possibly related to: https://github.com/pymupdf/PyMuPDF/issues/1615
page.set_cropbox(clip_transformed.intersect(page.mediabox))
page.set_rotation(page.rotation + rotate)
doc.save(tmp_file_path)
doc.save(tmp_file_path, deflate=True)

page.set_rotation(old_rotation)
page.set_cropbox(old_cropbox.intersect(page.mediabox))

document = call_textract(extractor, tmp_file_path)
os.remove(tmp_file_path)

if document is None:
return []

# Matrix to transform Textract coordinates via Pixmap coordinates back to PyMuPDF coordinates
# TODO cleanup method after removing the Pixmap creation
transform = textract_coordinate_transform(
Expand All @@ -96,13 +100,18 @@ def backoff_hdlr(details):
ClientError,
on_backoff=backoff_hdlr,
base=2)
def call_textract(extractor: Textractor, tmp_file_path: str) -> t1.Document:
j = t_call.call_textract(
input_document=tmp_file_path,
boto3_textract_client=extractor.textract_client,
call_mode=t_call.Textract_Call_Mode.FORCE_SYNC
)
t_document: t2.TDocument = t2.TDocumentSchema().load(j)
def call_textract(extractor: Textractor, tmp_file_path: str) -> t1.Document | None:
try:
j = t_call.call_textract(
input_document=tmp_file_path,
boto3_textract_client=extractor.textract_client,
call_mode=t_call.Textract_Call_Mode.FORCE_SYNC
)
t_document: t2.TDocument = t2.TDocumentSchema().load(j)
except extractor.textract_client.exceptions.InvalidParameterException:
print("Encountered InvalidParameterException from Textract. Page might require more than 10MB memory. Skipping page.")
return None

try:
t_document = add_page_orientation(t_document)
except statistics.StatisticsError:
Expand Down
8 changes: 4 additions & 4 deletions util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ def process_page(

page.clean_contents()

doc_copy = fitz.Document()
doc_copy.insert_pdf(doc, from_page=page.number, to_page=page.number)
# create a single-page PDF document that can be modified if necessary, before being sent to AWS Textract
textract_doc = fitz.Document()
textract_doc.insert_pdf(doc, from_page=page.number, to_page=page.number)

page_ocr = OCR(
textractor=extractor,
confidence_threshold=confidence_threshold,
page=page,
doc_copy=doc_copy,
textract_doc=textract_doc,
ignore_rects=ignore_rects,
tmp_path_prefix=tmp_path_prefix
)
Expand Down

0 comments on commit ffbe9b0

Please sign in to comment.