Skip to content

Commit

Permalink
fix: Enable HTML export in CLI and add options for image mode (#513)
Browse files Browse the repository at this point in the history
* updated README

Signed-off-by: Peter Staar <[email protected]>

* removed duck in title

Signed-off-by: Peter Staar <[email protected]>

* updated the index.md

Signed-off-by: Peter Staar <[email protected]>

* updated the cli to export html

Signed-off-by: Peter Staar <[email protected]>

* added html to cli

Signed-off-by: Peter Staar <[email protected]>

* reformatted the code

Signed-off-by: Peter Staar <[email protected]>

* removed the duck emoji, added the image export mode option in the cli. Currently, the `referenced` mode seems broken

Signed-off-by: Peter Staar <[email protected]>

* cleaning up the comments

Signed-off-by: Peter Staar <[email protected]>

* reference is now working

Signed-off-by: Peter Staar <[email protected]>

* Clean up styling and docs

Signed-off-by: Christoph Auer <[email protected]>

* Pin docling-core>=2.7.1

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
PeterStaar-IBM and cau-git authored Dec 6, 2024
1 parent b730b2d commit 0d11e30
Show file tree
Hide file tree
Showing 7 changed files with 288 additions and 351 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
</a>
</p>

# 🦆 Docling
# Docling

<p align="center">
<a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
Expand All @@ -26,7 +26,7 @@ Docling parses documents and exports them to the desired format with ease and sp

## Features

* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
Expand Down
58 changes: 45 additions & 13 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Annotated, Dict, Iterable, List, Optional, Type

import typer
from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter, ValidationError

Expand Down Expand Up @@ -87,9 +88,11 @@ def export_documents(
conv_results: Iterable[ConversionResult],
output_dir: Path,
export_json: bool,
export_html: bool,
export_md: bool,
export_txt: bool,
export_doctags: bool,
image_export_mode: ImageRefMode,
):

success_count = 0
Expand All @@ -100,33 +103,45 @@ def export_documents(
success_count += 1
doc_filename = conv_res.input.file.stem

# Export Deep Search document JSON format:
# Export JSON format:
if export_json:
fname = output_dir / f"{doc_filename}.json"
with fname.open("w", encoding="utf8") as fp:
_log.info(f"writing JSON output to {fname}")
fp.write(json.dumps(conv_res.document.export_to_dict()))
_log.info(f"writing JSON output to {fname}")
conv_res.document.save_as_json(
filename=fname, image_mode=image_export_mode
)

# Export HTML format:
if export_html:
fname = output_dir / f"{doc_filename}.html"
_log.info(f"writing HTML output to {fname}")
conv_res.document.save_as_html(
filename=fname, image_mode=image_export_mode
)

# Export Text format:
if export_txt:
fname = output_dir / f"{doc_filename}.txt"
with fname.open("w", encoding="utf8") as fp:
_log.info(f"writing Text output to {fname}")
fp.write(conv_res.document.export_to_markdown(strict_text=True))
_log.info(f"writing TXT output to {fname}")
conv_res.document.save_as_markdown(
filename=fname,
strict_text=True,
image_mode=ImageRefMode.PLACEHOLDER,
)

# Export Markdown format:
if export_md:
fname = output_dir / f"{doc_filename}.md"
with fname.open("w", encoding="utf8") as fp:
_log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.document.export_to_markdown())
_log.info(f"writing Markdown output to {fname}")
conv_res.document.save_as_markdown(
filename=fname, image_mode=image_export_mode
)

# Export Document Tags format:
if export_doctags:
fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w", encoding="utf8") as fp:
_log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.document.export_to_document_tokens())
_log.info(f"writing Doc Tags output to {fname}")
conv_res.document.save_as_document_tokens(filename=fname)

else:
_log.warning(f"Document {conv_res.input.file} failed to convert.")
Expand Down Expand Up @@ -161,6 +176,13 @@ def convert(
to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. Defaults to Markdown."
),
image_export_mode: Annotated[
ImageRefMode,
typer.Option(
...,
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
),
] = ImageRefMode.EMBEDDED,
ocr: Annotated[
bool,
typer.Option(
Expand Down Expand Up @@ -299,6 +321,7 @@ def convert(
to_formats = [OutputFormat.MARKDOWN]

export_json = OutputFormat.JSON in to_formats
export_html = OutputFormat.HTML in to_formats
export_md = OutputFormat.MARKDOWN in to_formats
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats
Expand Down Expand Up @@ -330,6 +353,13 @@ def convert(
)
pipeline_options.table_structure_options.mode = table_mode

if image_export_mode != ImageRefMode.PLACEHOLDER:
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = (
True # FIXME: to be deprecated in version 3
)
pipeline_options.images_scale = 2

if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path

Expand Down Expand Up @@ -364,9 +394,11 @@ def convert(
conv_results,
output_dir=output,
export_json=export_json,
export_html=export_html,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,
image_export_mode=image_export_mode,
)

end_time = time.time() - start_time
Expand Down
1 change: 1 addition & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class InputFormat(str, Enum):
class OutputFormat(str, Enum):
MARKDOWN = "md"
JSON = "json"
HTML = "html"
TEXT = "text"
DOCTAGS = "doctags"

Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Docling parses documents and exports them to the desired format with ease and sp

## Features

* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures
* 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format
* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
Expand Down
2 changes: 1 addition & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ theme:
- toc.follow
nav:
- Home:
- "🦆 Docling": index.md
- "Docling": index.md
- Installation: installation.md
- Usage: usage.md
- CLI: cli.md
Expand Down
Loading

0 comments on commit 0d11e30

Please sign in to comment.