Skip to content

Commit

Permalink
merged with branch
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 17, 2025
2 parents 40cba22 + 132b663 commit 3d30a01
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
6 changes: 6 additions & 0 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Parser for PDF files"""

import hashlib
import warnings
from io import BytesIO
from pathlib import Path
from typing import Dict, Iterator, List, Tuple, Union
Expand Down Expand Up @@ -283,6 +284,11 @@ def __init__(self, loglevel: str = "fatal"):
level (str): Logging level as a string.
One of ['fatal', 'error', 'warning', 'info']
"""
warnings.warn(
"This API is currently experimental and may change in upcoming versions without notice.",
category=UserWarning,
stacklevel=2,
)
self.parser = pdf_parser_v2(level=loglevel)

def set_loglevel(self, loglevel: str):
Expand Down
15 changes: 11 additions & 4 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,11 @@ def test_reference_documents_from_filenames():
) # default: True
assert pdf_doc is not None

# PdfDocument.iterate_pages() will automatically populate pages as they are yielded.
# There is no need to call PdfDocument.load_all_pages() beforehand.
for page_no, pred_page in pdf_doc.iterate_pages():
print(f" -> Page {page_no} has {len(pred_page.sanitized.cells)} cells.")

# res = page.original.render()
# res.show()

Expand Down Expand Up @@ -238,9 +241,11 @@ def test_load_lazy_or_eager():

pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename, lazy=False)

# The lazy doc has no pages populated, the eager one has them.
# The lazy doc has no pages populated, since they were never iterated so far.
# The eager doc has its pages pre-populated before the first iteration.
assert pdf_doc_case1._pages != pdf_doc_case2._pages

# This method triggers the pre-loading on the lazy document after creation.
pdf_doc_case1.load_all_pages()

# After loading the pages of the lazy doc, the two documents are equal.
Expand All @@ -262,6 +267,9 @@ def test_load_two_distinct_docs():
pdf_doc_case1.load_all_pages()
pdf_doc_case2.load_all_pages()

# The two PdfDocument instances must be non-equal. This confirms
# that no internal state is overwritten by accident when loading more than
# one document with the same DoclingPdfParser instance.
assert pdf_doc_case1._pages != pdf_doc_case2._pages


Expand All @@ -272,9 +280,8 @@ def test_serialize_and_reload():

pdf_doc: PdfDocument = parser.load(path_or_stream=filename, lazy=True)

# TODO: a proper serialization model must still be established for a full PdfDocument

page_adapter = TypeAdapter(Dict[int, ParsedPdfPage])
# We can serialize the pages dict the following way.
page_adapter = TypeAdapter(Dict[int, ParsedPage])

json_pages = page_adapter.dump_json(pdf_doc._pages)
reloaded_pages: Dict[int, ParsedPdfPage] = page_adapter.validate_json(json_pages)
Expand Down

0 comments on commit 3d30a01

Please sign in to comment.