Closed
Description
The Docling and WDU Tables/OCR tests fail with the error:
RuntimeError: Cannot visualize document without images
To reproduce, update test_tables_aws.py to use Docling (the run below uses the resulting tests/test_tables_docling.py)
and run:
poetry run pytest -v tests/test_tables_docling.py
======================================================================================================================== test session starts =========================================================================================================================
platform darwin -- Python 3.11.7, pytest-7.4.4, pluggy-1.5.0 -- /Users/wai25/.pyenv/versions/3.11.7/envs/quality/bin/python
cachedir: .pytest_cache
rootdir: /Users/wai25/git/wai25/docling-eval
plugins: anyio-4.9.0, dependency-0.6.0, xdist-3.6.1
collected 1 item
tests/test_tables_docling.py::test_run_fintabnet_builder FAILED [100%]
============================================================================================================================== FAILURES ==============================================================================================================================
_____________________________________________________________________________________________________________________ test_run_fintabnet_builder _____________________________________________________________________________________________________________________
@pytest.mark.skipif(
IS_CI, reason="Skipping test in CI because the dataset is too heavy."
)
def test_run_fintabnet_builder():
target_path = Path(f"./scratch/{BenchMarkNames.FINTABNET.value}_docling/")
docling_provider = DoclingPredictionProvider(
do_visualization=True, ignore_missing_predictions=False
)
dataset = FintabNetDatasetBuilder(
target=target_path / "gt_dataset",
begin_index=1,
end_index=15,
)
dataset.save_to_disk() # does all the job of iterating the dataset, making GT+prediction records, and saving them in shards as parquet.
> docling_provider.create_prediction_dataset(
name=dataset.name,
gt_dataset_dir=target_path / "gt_dataset",
target_dataset_dir=target_path / "eval_dataset",
)
tests/test_tables_docling.py:42:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
docling_eval/prediction_providers/base_prediction_provider.py:393: in create_prediction_dataset
self.visualize_results(r, target_dataset_dir)
docling_eval/prediction_providers/base_prediction_provider.py:162: in visualize_results
save_comparison_html_with_clusters(
docling_eval/visualisation/visualisations.py:125: in save_comparison_html_with_clusters
pred_page_imgs = pred_doc.get_visualization(show_label=False)
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling_core/types/doc/document.py:4086: in get_visualization
images = visualizer.get_visualization(doc=self)
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling_core/transforms/visualizer/reading_order_visualizer.py:144: in get_visualization
self.base_visualizer.get_visualization(doc=doc, **kwargs)
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling_core/transforms/visualizer/layout_visualizer.py:212: in get_visualization
return self._draw_doc_layout(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = LayoutVisualizer(base_visualizer=None, params=Params(show_label=False))
doc = DoclingDocument(schema_name='DoclingDocument', version='1.3.0', name='file', origin=DocumentOrigin(mimetype='applicati...], key_value_items=[], form_items=[], pages={1: PageItem(size=Size(width=368.0, height=105.0), image=None, page_no=1)}), images = None
def _draw_doc_layout(
self, doc: DoclingDocument, images: Optional[dict[Optional[int], Image]] = None
):
"""Draw the document clusters and optionaly the reading order."""
clusters = []
my_images: dict[Optional[int], Image] = {}
if images is not None:
my_images = images
# Initialise `my_images` beforehand: sometimes, you have the
# page-images but no DocItems!
for page_nr, page in doc.pages.items():
page_image = doc.pages[page_nr].image
if page_image is None or (pil_img := page_image.pil_image) is None:
> raise RuntimeError("Cannot visualize document without images")
E RuntimeError: Cannot visualize document without images
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling_core/transforms/visualizer/layout_visualizer.py:136: RuntimeError
------------------------------------------------------------------------------------------------------------------------ Captured stderr call ------------------------------------------------------------------------------------------------------------------------
Processing FinTabNet dataset: 100%|██████████| 14/14 [00:00<00:00, 75.92it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 974.51ba/s]
Generating test split: 14 examples [00:00, 4290.53 examples/s]
Creating predictions: 100%|█████████████████████████████████████████████████████████████| 14/14 [00:31<00:00, 2.28s/it]
========================================================================================================================== warnings summary ==========================================================================================================================
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling_core/types/doc/document.py:4112: 1 warning
tests/test_tables_docling.py: 84 warnings
/Users/wai25/.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling_core/types/doc/document.py:4112: DeprecationWarning: deprecated
if not d.validate_tree(d.body) or not d.validate_tree(d.furniture):
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/PyPDF2/__init__.py:21
/Users/wai25/.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/PyPDF2/__init__.py:21: DeprecationWarning: PyPDF2 is deprecated. Please move to the pypdf library instead.
warnings.warn(
../../../.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/pydantic/_internal/_config.py:323
/Users/wai25/.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/pydantic/_internal/_config.py:323: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0.
warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)
tests/test_tables_docling.py::test_run_fintabnet_builder
/Users/wai25/.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling/pipeline/standard_pdf_pipeline.py:60: DeprecationWarning: Field `generate_table_images` is deprecated. To obtain table images, set `PdfPipelineOptions.generate_page_images = True` before conversion and then use the `TableItem.get_image` function.
or self.pipeline_options.generate_table_images
tests/test_tables_docling.py: 14 warnings
/Users/wai25/.pyenv/versions/3.11.7/envs/quality/lib/python3.11/site-packages/docling/pipeline/standard_pdf_pipeline.py:214: DeprecationWarning: Field `generate_table_images` is deprecated. To obtain table images, set `PdfPipelineOptions.generate_page_images = True` before conversion and then use the `TableItem.get_image` function.
or self.pipeline_options.generate_table_images
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
====================================================================================================================== short test summary info =======================================================================================================================
FAILED tests/test_tables_docling.py::test_run_fintabnet_builder - RuntimeError: Cannot visualize document without images
================================================================================================================== 1 failed, 102 warnings in 38.99s ==================================================================================================================