From a7174c501b822d3489f6dd96348d50a848c64715 Mon Sep 17 00:00:00 2001
From: Davi Barreira <davi@resolvvi.com>
Date: Tue, 6 Sep 2022 11:21:41 -0300
Subject: [PATCH 1/2] :sparkles: The load_pdf function now can handle
 bitstreams.

---
 src/layoutparser/io/pdf.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/layoutparser/io/pdf.py b/src/layoutparser/io/pdf.py
index ad35cf4..db14352 100644
--- a/src/layoutparser/io/pdf.py
+++ b/src/layoutparser/io/pdf.py
@@ -16,6 +16,7 @@
 
 import pdfplumber
 import pandas as pd
+from io import BytesIO
 
 from ..elements import Layout
 from .basic import load_dataframe
@@ -82,7 +83,7 @@ def extract_words_for_page(
 
 
 def load_pdf(
-    filename: str,
+    filename: Union[str,bytes],
     load_images: bool = False,
     x_tolerance: int = 1.5,
     y_tolerance: int = 2,
@@ -97,7 +98,7 @@ def load_pdf(
     in a list of Layout objects with the original page order.
 
     Args:
-        filename (str): The path to the PDF file.
+        filename (Union[str,bytes]): The path to the PDF file, or the raw bytes of its content.
         load_images (bool, optional):
             Whether load screenshot for each page of the PDF file.
             When set to true, the function will return both the layout and
@@ -178,6 +179,8 @@ def load_pdf(
         >>> lp.draw_box(pdf_images[0], pdf_layout[0])
     """
 
+    if isinstance(filename, bytes):
+        filename = BytesIO(filename)  # pdfplumber.open takes a path or a file-like object
     plumber_pdf_object = pdfplumber.open(filename)
 
     all_page_layout = []
@@ -207,7 +210,10 @@ def load_pdf(
     else:
         import pdf2image
 
-        pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
+        if isinstance(filename, BytesIO):  # raw bytes were wrapped in BytesIO before pdfplumber.open
+            pdf_images = pdf2image.convert_from_bytes(filename.getvalue(), dpi=dpi)
+        else:
+            pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
 
         for page_id, page_image in enumerate(pdf_images):
             image_width, image_height = page_image.size
@@ -222,4 +228,4 @@ def load_pdf(
                 page_layout.page_data["height"] = image_height
                 all_page_layout[page_id] = page_layout
 
-        return all_page_layout, pdf_images
\ No newline at end of file
+        return all_page_layout, pdf_images

From 99df5ef6d5cd7c23312efa9a73ae91aadc739296 Mon Sep 17 00:00:00 2001
From: Davi Barreira <davi@resolvvi.com>
Date: Tue, 6 Sep 2022 11:24:11 -0300
Subject: [PATCH 2/2] :white_check_mark: Added unit test for load_pdf with
 bitstream.

---
 tests/test_io.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/test_io.py b/tests/test_io.py
index 5679dc3..2fd0601 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -85,4 +85,15 @@ def test_empty_pdf():
     assert len(pdf_layout) == 1 # Only one page
     
     page_layout = pdf_layout[0]
-    assert len(page_layout) == 0 # No selectable tokens on the page
\ No newline at end of file
+    assert len(page_layout) == 0 # No selectable tokens on the page
+
+def test_pdf_with_bites():
+    """load_pdf must accept the raw bytes of a PDF file, not only a path."""
+    with open("tests/fixtures/io/example.pdf", "rb") as pdf_file:
+        pdf_layout = load_pdf(pdf_file.read())
+    assert len(pdf_layout) == 1
+
+    page_layout = pdf_layout[0]
+    for attr_name in ["width", "height", "index"]:
+        assert attr_name in page_layout.page_data
+    assert len({ele.type for ele in page_layout}) == 3