Skip to content

Commit

Permalink
updated the extract_text_from_bbox
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Dec 7, 2024
1 parent d1b1cdf commit 3914d2d
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 274 deletions.
46 changes: 37 additions & 9 deletions docling_parse/extract_text_from_bbox.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
import argparse
import logging
import os

from tabulate import tabulate

from docling_parse import pdf_parser_v2 # type: ignore[attr-defined]
from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import]
from docling_parse.utils import (
create_pil_image_of_page_v2,
draw_bbox_on_page_v2,
filter_columns_v2,
)

# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def parse_args():
Expand Down Expand Up @@ -39,9 +50,9 @@ def parse_args():
parser.add_argument(
"-b",
"--bbox",
type=tuple(float, float, float, float),
type=str, # Tuple[int, int, int, int],
required=True,
help="bounding box as tuple(float, float, float, float)",
help="bounding box as str x0,y0,x1,y1",
)

# Parse the command-line arguments
Expand All @@ -54,7 +65,7 @@ def parse_args():
args.log_level,
args.input_pdf,
int(args.page),
args.bbox,
list(map(float, args.bbox.split(","))),
)


Expand All @@ -67,33 +78,50 @@ def main():
doc = None

try:
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)

doc_key = "key"
success = parser.load_document(doc_key, pdf_file)

if success == False:
logging.error("Not successful in loading document")
return

doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)

parser.unload_document(doc_key)

except Exception as exc:
logging.error(f"Could not parse pdf-document: {exc}")
return

page = doc["pages"][0]
parser.set_loglevel_with_label("info")

sanitized_cells = parser.sanitize_cells_in_bbox(
page=doc["pages"][0],
page=page,
bbox=bbox,
cell_overlap=0.9,
horizontal_cell_tolerance=1.0,
enforce_same_font=False,
space_width_factor_for_merge=1.5,
space_width_factor_for_merge_with_space=0.33,
)
print("#-cells: ", len(sanitized_cells))
print(tabulate(sanitized_cells["data"]))

new_data, new_header = filter_columns_v2(
sanitized_cells["data"],
sanitized_cells["header"],
new_header=["x0", "y0", "x1", "y1", "text"],
)

table = tabulate(new_data, new_header)

logging.info("#-cells: " + str(len(sanitized_cells["data"])))
logging.info(f"selected cells: \n\n{table}\n\n")

img = create_pil_image_of_page_v2(doc["pages"][0])
# img.show()

img = draw_bbox_on_page_v2(img, page, list(map(int, bbox)))
img.show()


if __name__ == "__main__":
Expand Down
82 changes: 63 additions & 19 deletions docling_parse/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging
from typing import Dict, List, Optional, Tuple, Union

Expand Down Expand Up @@ -158,6 +157,67 @@ def create_pil_image_of_page_v1(
return img


def filter_columns_v2(
    data: List[Tuple], header: List[str], new_header: List[str]
) -> Tuple[List[List], List[str]]:
    """Project every row of ``data`` onto the columns named in ``new_header``.

    Args:
        data: Table rows; each row is indexed positionally, in the column
            order given by ``header``.
        header: Column names of ``data``.
        new_header: Names of the columns to keep, in the desired output order.

    Returns:
        A ``(new_data, new_header)`` pair. ``new_data`` holds one list per
        input row containing only the selected values; ``new_header`` is the
        argument that was passed in, returned unchanged.

    Raises:
        ValueError: If ``data`` is non-empty and a name in ``new_header`` does
            not occur in ``header``.
    """
    # With no rows there is nothing to look up; returning early keeps an
    # unknown column name from raising for an empty table.
    if not data:
        return [], new_header

    # Resolve each requested column to its position once, instead of
    # rescanning ``header`` for every cell of every row.
    positions = [header.index(name) for name in new_header]

    new_data = [[row[pos] for pos in positions] for row in data]

    return new_data, new_header


def draw_bbox_on_page_v2(
    img: Image.Image,
    page: Dict,
    bbox: Tuple[int, int, int, int],
    category: str = "sanitized",  # original
    page_boundary: str = "crop_box",  # media_box
    ocolor: str = "red",
    fcolor: str = "white",
    alpha: float = 1.0,
) -> Image.Image:
    """Draw the outline of ``bbox`` onto ``img`` and return the image.

    The rectangle is drawn directly on ``img`` (no copy is made).

    Args:
        img: Image of the page to draw on.
        page: Page dictionary; ``page[category]["dimension"]["height"]`` is
            read to flip the y-axis.
        bbox: ``(x0, y0, x1, y1)``. Each y value is subtracted from the page
            height before drawing.
            NOTE(review): presumably the bbox uses a bottom-left origin while
            the image uses a top-left one -- confirm against the parser.
        category: Which page representation to read the dimension from,
            ``"original"`` or ``"sanitized"``.
        page_boundary: ``"crop_box"`` or ``"media_box"``. Only validated here;
            it does not influence the drawing.
        ocolor: Outline colour, any string accepted by ``ImageColor.getrgb``.
        fcolor: Fill colour. Validated only; the rectangle is currently drawn
            without a fill.
        alpha: Outline opacity; ``alpha * 255`` is clamped to ``0..255``.

    Returns:
        The same ``img`` object, with the rectangle outline drawn on it.

    Raises:
        ValueError: If ``category`` or ``page_boundary`` is not one of the
            accepted values, or if a colour string cannot be parsed.
    """
    if category not in ["original", "sanitized"]:
        raise ValueError(
            f"category {category} needs to be of `original` or `sanitized`."
        )

    if page_boundary not in ["crop_box", "media_box"]:
        raise ValueError(
            f"page_boundary {page_boundary} needs to be of `crop_box` or `media_box`."
        )

    draw = ImageDraw.Draw(img)

    page_height = page[category]["dimension"]["height"]

    # Corners with the y-axis flipped using the page height.
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    bl = (x0, page_height - y0)
    br = (x1, page_height - y0)
    tr = (x1, page_height - y1)
    tl = (x0, page_height - y1)

    alpha_channel = max(0, min(255, int(alpha * 255)))

    # ``getrgb`` may return a 4-tuple when the colour string itself carries an
    # alpha component; keep only RGB so that appending our alpha always yields
    # a valid 4-tuple.
    outl_color = ImageColor.getrgb(ocolor)[:3] + (alpha_channel,)

    # The fill is intentionally not drawn, but an invalid ``fcolor`` is still
    # rejected here so that callers get the same error as before.
    ImageColor.getrgb(fcolor)

    # Draw the rectangle as a polygon (outline only).
    draw.polygon([bl, br, tr, tl], outline=outl_color)

    return img


def create_pil_image_of_page_v2(
page: Dict,
category: str = "sanitized", # original
Expand Down Expand Up @@ -286,7 +346,7 @@ def _draw_annotations(
)

dimension = page[category]["dimension"]
logging.info(f"dimensions: {json.dumps(dimension, indent=2)}")
# logging.info(f"dimensions: {json.dumps(dimension, indent=2)}")

cells = page[category]["cells"]["data"]
cells_header = page[category]["cells"]["header"]
Expand All @@ -301,22 +361,11 @@ def _draw_annotations(
W = dimension["width"]
H = dimension["height"]

logging.info(f"width: {W}, height: {H}")
# logging.info(f"width: {W}, height: {H}")

"""
# Create a blank white image
img = Image.new("RGB", (round(W), round(H)), "white")
draw = ImageDraw.Draw(img)
"""
# Create a blank white image with RGBA mode
img = Image.new("RGBA", (round(W), round(H)), (255, 255, 255, 255))
draw = ImageDraw.Draw(img)
"""
overlay = Image.new(
"RGBA", (round(W), round(H)), (255, 255, 255, 0)
) # Transparent overlay
draw = ImageDraw.Draw(overlay)
"""

# Draw each rectangle by connecting its four points
if draw_images:
Expand Down Expand Up @@ -367,14 +416,11 @@ def _draw_annotations(
alpha=cell_alpha,
)

# Fixme: the _draw_text_in_bounding_bbox is not yet working
text = row[cells_header.index(f"text")]
if draw_cells_text and len(text) > 0:
draw = _draw_text_in_bounding_bbox(
# overlay,
img,
draw,
# bbox=(round(rect[0][0]), round(rect[0][1]), round(rect[2][0]), round(rect[2][1])),
bbox=(
round(rect[3][0]),
round(rect[3][1]),
Expand Down Expand Up @@ -483,8 +529,6 @@ def _draw_annotations(
tr = (x1, H - y1)
tl = (x0, H - y1)

logging.info(page_bbox)

outl_color = ImageColor.getrgb(cropbox_outline) + (int(cropbox_alpha * 255),)

# Draw the rectangle as a polygon
Expand Down
Loading

0 comments on commit 3914d2d

Please sign in to comment.