Skip to content

Commit

Permalink
Merge pull request #2 from swisstopo/textract-optimisations
Browse files: browse the repository at this point in the history
cleanup textract_coordinate_transform method
  • Loading branch information
stijnvermeeren-swisstopo authored Oct 9, 2024
2 parents ffbe9b0 + 93862b3 commit 70316d3
Showing 1 changed file with 7 additions and 19 deletions.
26 changes: 7 additions & 19 deletions util/textract.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,19 @@

def textract_coordinate_transform(
    clip_rect: fitz.Rect,
    pixmap_rect: "fitz.IRect | None" = None,
    pixmap_with_margin_rect: "fitz.IRect | None" = None,
    rotate: float = 0.0,
) -> fitz.Matrix:
    """Build the matrix mapping Textract coordinates to unrotated PyMuPDF coordinates.

    Textract coordinates are taken to live in the unit square
    ``Rect(0, 0, 1, 1)``. That square is mapped onto the bounding rectangle
    of the rotated clip (the page that was sent to AWS Textract), and the
    rotation is then undone.

    Args:
        clip_rect: Clip rectangle in unrotated PyMuPDF coordinates.
        pixmap_rect: Legacy parameter. Rectangle of the rendered pixmap
            without margins. Omit together with ``pixmap_with_margin_rect``
            when no intermediate pixmap is involved.
        pixmap_with_margin_rect: Legacy parameter. Rectangle of the rendered
            pixmap including margins.
        rotate: Rotation, in degrees, that was applied to the clip before it
            was sent to Textract. Pass by keyword when the pixmap rectangles
            are omitted.

    Returns:
        A matrix that transforms Textract coordinates back to the original,
        unrotated PyMuPDF coordinates.

    Raises:
        ValueError: If only one of the two pixmap rectangles is supplied.
    """
    if (pixmap_rect is None) != (pixmap_with_margin_rect is None):
        raise ValueError(
            "pixmap_rect and pixmap_with_margin_rect must be given together or both omitted"
        )

    # The rectangle surrounding the rotated version of the clip (corresponds
    # to the page that was sent to AWS Textract).
    rotated_clip_rect = (clip_rect.quad * fitz.Matrix(1, 1).prerotate(rotate)).rect

    unit_rect = fitz.Rect(0, 0, 1, 1)
    if pixmap_rect is None:
        # Textract coordinates -> rotated PyMuPDF coordinates, directly.
        to_rotated = unit_rect.torect(rotated_clip_rect)
    else:
        # Legacy path: Textract coordinates -> pixmap coordinates (with
        # margins) -> rotated PyMuPDF coordinates. With unit rectangles for
        # both pixmap arguments this reduces to the direct mapping above.
        to_pixmap = unit_rect.torect(pixmap_with_margin_rect)
        to_rotated = to_pixmap * pixmap_rect.torect(rotated_clip_rect)

    # Undo the rotation exactly once to get back to the unrotated page.
    undo_rotation = fitz.Matrix(1, 1).prerotate(-rotate)

    return to_rotated * undo_rotation


def text_lines_from_document(
Expand Down Expand Up @@ -81,14 +75,8 @@ def textract(doc: fitz.Document, extractor: Textractor, tmp_file_path: str, clip
if document is None:
return []

# Matrix to transform Textract coordinates via Pixmap coordinates back to PyMuPDF coordinates
# TODO cleanup method after removing the Pixmap creation
transform = textract_coordinate_transform(
clip_rect=clip_rect,
pixmap_rect=fitz.IRect(0,0,1,1),
pixmap_with_margin_rect=fitz.IRect(0,0,1,1),
rotate=rotate
)
# Matrix to transform Textract coordinates back to PyMuPDF coordinates
transform = textract_coordinate_transform(clip_rect=clip_rect, rotate=rotate)
return text_lines_from_document(document, transform, rotate, doc[0].rect.height)


Expand Down

0 comments on commit 70316d3

Please sign in to comment.