Layout-Parser · davibarreira · Sep 6, 2022 · Sep 6, 2022
diff --git a/src/layoutparser/io/pdf.py b/src/layoutparser/io/pdf.py
@@ -16,6 +16,7 @@
 
 import pdfplumber
 import pandas as pd
+from io import BytesIO
 
 from ..elements import Layout
 from .basic import load_dataframe
@@ -82,7 +83,7 @@ def extract_words_for_page(
 
 
 def load_pdf(
-    filename: str,
+    filename: Union[str,bytes],
     load_images: bool = False,
     x_tolerance: int = 1.5,
     y_tolerance: int = 2,
@@ -97,7 +98,7 @@ def load_pdf(
     in a list of Layout objects with the original page order.
 
     Args:
-        filename (str): The path to the PDF file.
+        filename (Union[str, bytes]): The path to the PDF file, or the raw bytes of the PDF content.
         load_images (bool, optional):
             Whether load screenshot for each page of the PDF file.
             When set to true, the function will return both the layout and
@@ -178,6 +179,8 @@ def load_pdf(
         >>> lp.draw_box(pdf_images[0], pdf_layout[0])
     """
 
+    if isinstance(filename, bytes):  # raw PDF content: wrap it in a file-like object
+        filename = BytesIO(filename)
     plumber_pdf_object = pdfplumber.open(filename)
 
     all_page_layout = []
@@ -207,7 +210,10 @@ def load_pdf(
     else:
         import pdf2image
 
-        pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
+        if isinstance(filename, BytesIO):  # bytes input was rebound to a BytesIO before pdfplumber.open
+            pdf_images = pdf2image.convert_from_bytes(filename.getvalue(), dpi=dpi)
+        else:
+            pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)
 
         for page_id, page_image in enumerate(pdf_images):
             image_width, image_height = page_image.size
@@ -222,4 +228,4 @@ def load_pdf(
                 page_layout.page_data["height"] = image_height
                 all_page_layout[page_id] = page_layout
 
-        return all_page_layout, pdf_images
+        return all_page_layout, pdf_images
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -85,4 +85,18 @@ def test_empty_pdf():
     assert len(pdf_layout) == 1 # Only one page
 
     page_layout = pdf_layout[0]
-    assert len(page_layout) == 0 # No selectable tokens on the page
+    assert len(page_layout) == 0 # No selectable tokens on the page
+
+def test_pdf_with_bites():
+    """load_pdf accepts the raw bytes of a PDF as well as a file path."""
+    # Read the fixture from disk in binary mode: BytesIO(<str path>) raises
+    # TypeError and would not read the file contents anyway.
+    with open("tests/fixtures/io/example.pdf", "rb") as pdf_file:
+        pdf_layout = load_pdf(pdf_file.read())
+    assert len(pdf_layout) == 1
+
+    page_layout = pdf_layout[0]
+    for attr_name in ["width", "height", "index"]:
+        assert attr_name in page_layout.page_data
+
+    assert len({ele.type for ele in page_layout}) == 3