merged with branch

Signed-off-by: Peter Staar <[email protected]>
DS4SD · Jan 17, 2025 · 3d30a01 · 3d30a01
2 parents 40cba22 + 132b663
commit 3d30a01
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 4 deletions.
diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py
@@ -1,6 +1,7 @@
 """Parser for PDF files"""
 
 import hashlib
+import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import Dict, Iterator, List, Tuple, Union
@@ -283,6 +284,11 @@ def __init__(self, loglevel: str = "fatal"):
             level (str): Logging level as a string.
                      One of ['fatal', 'error', 'warning', 'info']
         """
+        warnings.warn(
+            "This API is currently experimental and may change in upcoming versions without notice.",
+            category=UserWarning,
+            stacklevel=2,
+        )
         self.parser = pdf_parser_v2(level=loglevel)
 
     def set_loglevel(self, loglevel: str):

diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -193,8 +193,11 @@ def test_reference_documents_from_filenames():
         )  # default: True
         assert pdf_doc is not None
 
+        # PdfDocument.iterate_pages() will automatically populate pages as they are yielded.
+        # No need to call PdfDocument.load_all_pages() beforehand.
         for page_no, pred_page in pdf_doc.iterate_pages():
             print(f" -> Page {page_no} has {len(pred_page.sanitized.cells)} cells.")
+
             # res = page.original.render()
             # res.show()
 
@@ -238,9 +241,11 @@ def test_load_lazy_or_eager():
 
     pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename, lazy=False)
 
-    # The lazy doc has no pages populated, the eager one has them.
+    # The lazy doc has no pages populated, since they have not been iterated yet.
+    # The eager doc has its pages pre-populated before the first iteration.
     assert pdf_doc_case1._pages != pdf_doc_case2._pages
 
+    # This method triggers the pre-loading on the lazy document after creation.
     pdf_doc_case1.load_all_pages()
 
     # After loading the pages of the lazy doc, the two documents are equal.
@@ -262,6 +267,9 @@ def test_load_two_distinct_docs():
     pdf_doc_case1.load_all_pages()
     pdf_doc_case2.load_all_pages()
 
+    # The two PdfDocument instances must be non-equal. This confirms
+    # that no internal state is overwritten by accident when loading more than
+    # one document with the same DoclingPdfParser instance.
     assert pdf_doc_case1._pages != pdf_doc_case2._pages
 
 
@@ -272,9 +280,8 @@ def test_serialize_and_reload():
 
     pdf_doc: PdfDocument = parser.load(path_or_stream=filename, lazy=True)
 
-    # TODO a proper serialization model must be still established for a full PdfDocument
-
-    page_adapter = TypeAdapter(Dict[int, ParsedPdfPage])
+    # We can serialize the pages dict the following way.
+    page_adapter = TypeAdapter(Dict[int, ParsedPage])
 
     json_pages = page_adapter.dump_json(pdf_doc._pages)
     reloaded_pages: Dict[int, ParsedPdfPage] = page_adapter.validate_json(json_pages)