Skip to content

Commit

Permalink
fix: Introduce Image format options in CLI. Silence the tqdm downloading messages. (#544)
Browse files Browse the repository at this point in the history

* fix: main: Introduce format options for Image with the same pdf pipeline_options.
Add RapidOcrOptions to the Union of ocr_options for PdfPipelineOptions

Signed-off-by: Nikos Livathinos <[email protected]>

* fix: Silence the tqdm messages during the downloading of model files

Signed-off-by: Nikos Livathinos <[email protected]>

* fix: Code styling

Signed-off-by: Nikos Livathinos <[email protected]>

* fix: Use the HF API to disable the tqdm progress bars

Signed-off-by: Nikos Livathinos <[email protected]>

---------

Signed-off-by: Nikos Livathinos <[email protected]>
  • Loading branch information
nikos-livathinos authored Dec 9, 2024
1 parent aca57f0 commit 78f61a8
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 5 deletions.
10 changes: 6 additions & 4 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,11 +372,13 @@ def convert(
     else:
         raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

+    pdf_format_option = PdfFormatOption(
+        pipeline_options=pipeline_options,
+        backend=backend, # pdf_backend
+    )
     format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend, # pdf_backend
-        )
+        InputFormat.PDF: pdf_format_option,
+        InputFormat.IMAGE: pdf_format_option,
     }
     doc_converter = DocumentConverter(
         allowed_formats=from_formats,
Expand Down
6 changes: 5 additions & 1 deletion docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions):

     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
-        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+        EasyOcrOptions,
+        TesseractCliOcrOptions,
+        TesseractOcrOptions,
+        OcrMacOptions,
+        RapidOcrOptions,
     ] = Field(EasyOcrOptions(), discriminator="kind")

     images_scale: float = 1.0
Expand Down
2 changes: 2 additions & 0 deletions docling/pipeline/standard_pdf_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def download_models_hf(
         local_dir: Optional[Path] = None, force: bool = False
     ) -> Path:
         from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars

+        disable_progress_bars()
         download_path = snapshot_download(
             repo_id="ds4sd/docling-models",
             force_download=force,
Expand Down

0 comments on commit 78f61a8

Please sign in to comment.