From a7174c501b822d3489f6dd96348d50a848c64715 Mon Sep 17 00:00:00 2001 From: Davi Barreira Date: Tue, 6 Sep 2022 11:21:41 -0300 Subject: [PATCH 1/2] :sparkles: The load_pdf function now can handle bitstreams. --- src/layoutparser/io/pdf.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/layoutparser/io/pdf.py b/src/layoutparser/io/pdf.py index ad35cf4..db14352 100644 --- a/src/layoutparser/io/pdf.py +++ b/src/layoutparser/io/pdf.py @@ -16,6 +16,7 @@ import pdfplumber import pandas as pd +from io import BytesIO from ..elements import Layout from .basic import load_dataframe @@ -82,7 +83,7 @@ def extract_words_for_page( def load_pdf( - filename: str, + filename: Union[str,bytes], load_images: bool = False, x_tolerance: int = 1.5, y_tolerance: int = 2, @@ -97,7 +98,7 @@ def load_pdf( in a list of Layout objects with the original page order. Args: - filename (str): The path to the PDF file. + filename (Union[str,bytes]): The path to the PDF file or the bitstream. load_images (bool, optional): Whether load screenshot for each page of the PDF file. When set to true, the function will return both the layout and @@ -178,6 +179,8 @@ def load_pdf( >>> lp.draw_box(pdf_images[0], pdf_layout[0]) """ + if type(filename) == bytes: + filename = BytesIO(filename) plumber_pdf_object = pdfplumber.open(filename) all_page_layout = [] @@ -207,7 +210,10 @@ def load_pdf( else: import pdf2image - pdf_images = pdf2image.convert_from_path(filename, dpi=dpi) + if type(filename) == bytes: + pdf_images = pdf2image.convert_from_bytes(filename, dpi=dpi) + else: + pdf_images = pdf2image.convert_from_path(filename, dpi=dpi) for page_id, page_image in enumerate(pdf_images): image_width, image_height = page_image.size @@ -222,4 +228,4 @@ def load_pdf( page_layout.page_data["height"] = image_height all_page_layout[page_id] = page_layout - return all_page_layout, pdf_images \ No newline at end of file + return all_page_layout, pdf_images From 99df5ef6d5cd7c23312efa9a73ae91aadc739296 Mon Sep 17 00:00:00 2001 From: Davi Barreira Date: Tue, 6 Sep 2022 11:24:11 -0300 Subject: [PATCH 2/2] :white_check_mark: Added unit test for load_pdf with bitstream. --- tests/test_io.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_io.py b/tests/test_io.py index 5679dc3..2fd0601 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -85,4 +85,15 @@ def test_empty_pdf(): assert len(pdf_layout) == 1 # Only one page page_layout = pdf_layout[0] - assert len(page_layout) == 0 # No selectable tokens on the page \ No newline at end of file + assert len(page_layout) == 0 # No selectable tokens on the page + +def test_pdf_with_bites(): + from io import BytesIO + pdf_layout = load_pdf(BytesIO("tests/fixtures/io/example.pdf").read()) + assert len(pdf_layout) == 1 + + page_layout = pdf_layout[0] + for attr_name in ["width", "height", "index"]: + assert attr_name in page_layout.page_data + + assert len(set(ele.type for ele in page_layout)) == 3