Merge pull request #1 from swisstopo/textract-optimisations

Textract optimisations
swisstopo · Oct 9, 2024 · ffbe9b0 · ffbe9b0
2 parents 6a11177 + db32c6b
commit ffbe9b0
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 28 deletions.
diff --git a/util/applyocr.py b/util/applyocr.py
@@ -10,16 +10,15 @@ def __init__(
             self,
             textractor: Textractor,
             confidence_threshold: float,
-            page: fitz.Page,
-            doc_copy: fitz.Document,
+            textract_doc: fitz.Document,
             ignore_rects: list[fitz.Rect],
             tmp_path_prefix: str
     ):
         self.textractor = textractor
         self.confidence_threshold = confidence_threshold
-        self.page = page
-        self.doc_copy = doc_copy
-        self.page_copy = doc_copy[0]
+        # single-page PDF document that will be sent to AWS Textract
+        self.textract_doc = textract_doc
+        self.page = textract_doc[0]
         self.ignore_rects = ignore_rects
         self.tmp_path_prefix = tmp_path_prefix
 
@@ -58,7 +57,7 @@ def apply_vertical_check(self, text_lines: list[TextLine], clip_rect: fitz.Rect)
         if vertical_detected:
             print("  Potential vertical text detected. Running OCR again with horizontal text masked.")
             for rect in processed_rects:
-                self.page_copy.draw_rect(
+                self.page.draw_rect(
                     rect * self.page.derotation_matrix,
                     width=0,
                     fill=fitz.utils.getColor("white")
@@ -74,7 +73,7 @@ def _ocr_text_lines(self, clip_rect: fitz.Rect, rotate: float) -> list[TextLine]
         text_lines = []
         final_clip_rects = clip_rects(clip_rect)
         for final_clip_rect in final_clip_rects:
-            new_lines = textract(self.doc_copy, self.textractor, self.tmp_file_path("pdf"), final_clip_rect, rotate)
+            new_lines = textract(self.textract_doc, self.textractor, self.tmp_file_path("pdf"), final_clip_rect, rotate)
             text_lines = combine_text_lines(text_lines, new_lines)
         return text_lines
 

diff --git a/util/crop.py b/util/crop.py
@@ -26,14 +26,17 @@ def crop_images(page: fitz.Page, out_doc: fitz.Document):
         return
 
     images_info = {dict["xref"]: dict for dict in page.get_image_info(xrefs=True)}
-
     for xref, dict in images_info.items():
         try:
             img_size = fitz.Matrix(dict["width"], dict["height"])
             extracted_img = out_doc.extract_image(xref)
             image_bbox = fitz.Rect(*dict["bbox"])
 
             extension = extracted_img['ext']
+            if extension == 'jb2':
+                # Example PDF file with a JBIG2 image: A204.pdf
+                print("  Skipping JBIG2 image.")
+                continue
             if extension == 'jpx':
                 # Some viewer, most notably the Edge browser, have problems displaying JPX images (slow / bad quality).
                 # Therefore, we convert them to JPG.
@@ -75,9 +78,11 @@ def crop_images(page: fitz.Page, out_doc: fitz.Document):
                 crop.transform(transform_inv)
                 crop.transform(img_size)
 
-                # print(extracted_img["ext"])
                 try:
-                    img = fitz.Pixmap(extracted_img["image"])
+                    img = fitz.Pixmap(out_doc, xref)
+                    # Force the image into RGB color-space. Otherwise, colors might get distorted, e.g. in A8297.pdf.
+                    # See also https://github.com/pymupdf/PyMuPDF/issues/725#issuecomment-730561405
+                    img = fitz.Pixmap(fitz.csRGB, img)
                 except FzErrorFormat:
                     print("  Unsupported image format. Skipping image.")
                     continue

diff --git a/util/textract.py b/util/textract.py
@@ -3,7 +3,6 @@
 import fitz
 import os
 import backoff
-import pymupdf
 from botocore.exceptions import ClientError
 from textractor import Textractor
 from trp.t_pipeline import add_page_orientation
@@ -12,13 +11,10 @@
 import textractcaller.t_call as t_call
 import statistics
 
-
 from util.readingorder import TextLine
 
 
-GET_PIXMAP_ZOOM_FOR_TEXTRACT = 2
-MAX_DIMENSION_PIXELS = 4000
-MAX_DIMENSION_POINTS = MAX_DIMENSION_PIXELS // GET_PIXMAP_ZOOM_FOR_TEXTRACT
+MAX_DIMENSION_POINTS = 2000
 
 
 def textract_coordinate_transform(
@@ -67,16 +63,24 @@ def textract(doc: fitz.Document, extractor: Textractor, tmp_file_path: str, clip
 
     clip_transformed = clip_rect * page.rect.torect(page.cropbox)
 
-    page.set_cropbox(clip_transformed.intersect(page.mediabox))  # TODO make more robust, e.g. 267123080-bp.pdf
+    # Even though the documentation says that the cropbox is always contained in the mediabox, this is not always the
+    # case, e.g. 267123080-bp.pdf. The discrepancies are usually very small (floating point accuracy errors?). Even so,
+    # a trivial call such as page.set_cropbox(page.cropbox) will fail with a "CropBox not in MediaBox" error if this
+    # is the case. To avoid such errors, we take an explicit intersection with the mediabox whenever we call
+    # page.set_cropbox(). Possibly related to: https://github.com/pymupdf/PyMuPDF/issues/1615
+    page.set_cropbox(clip_transformed.intersect(page.mediabox))
     page.set_rotation(page.rotation + rotate)
-    doc.save(tmp_file_path)
+    doc.save(tmp_file_path, deflate=True)
 
     page.set_rotation(old_rotation)
     page.set_cropbox(old_cropbox.intersect(page.mediabox))
 
     document = call_textract(extractor, tmp_file_path)
     os.remove(tmp_file_path)
 
+    if document is None:
+        return []
+
     # Matrix to transform Textract coordinates via Pixmap coordinates back to PyMuPDF coordinates
     # TODO cleanup method after removing the Pixmap creation
     transform = textract_coordinate_transform(
@@ -96,13 +100,18 @@ def backoff_hdlr(details):
                       ClientError,
                       on_backoff=backoff_hdlr,
                       base=2)
-def call_textract(extractor: Textractor, tmp_file_path: str) -> t1.Document:
-    j = t_call.call_textract(
-        input_document=tmp_file_path,
-        boto3_textract_client=extractor.textract_client,
-        call_mode=t_call.Textract_Call_Mode.FORCE_SYNC
-    )
-    t_document: t2.TDocument = t2.TDocumentSchema().load(j)
+def call_textract(extractor: Textractor, tmp_file_path: str) -> t1.Document | None:
+    try:
+        j = t_call.call_textract(
+            input_document=tmp_file_path,
+            boto3_textract_client=extractor.textract_client,
+            call_mode=t_call.Textract_Call_Mode.FORCE_SYNC
+        )
+        t_document: t2.TDocument = t2.TDocumentSchema().load(j)
+    except extractor.textract_client.exceptions.InvalidParameterException:
+        print("Encountered InvalidParameterException from Textract. Page might require more than 10MB memory. Skipping page.")
+        return None
+
     try:
         t_document = add_page_orientation(t_document)
     except statistics.StatisticsError:

diff --git a/util/util.py b/util/util.py
@@ -24,14 +24,14 @@ def process_page(
 
     page.clean_contents()
 
-    doc_copy = fitz.Document()
-    doc_copy.insert_pdf(doc, from_page=page.number, to_page=page.number)
+    # create a single-page PDF document that can be modified if necessary, before being sent to AWS Textract
+    textract_doc = fitz.Document()
+    textract_doc.insert_pdf(doc, from_page=page.number, to_page=page.number)
 
     page_ocr = OCR(
         textractor=extractor,
         confidence_threshold=confidence_threshold,
-        page=page,
-        doc_copy=doc_copy,
+        textract_doc=textract_doc,
         ignore_rects=ignore_rects,
         tmp_path_prefix=tmp_path_prefix
     )