Skip to content

Commit

Permalink
feat: docling-parse v2 as default PDF backend (#549)
Browse files Browse the repository at this point in the history
* Move to_docling_document from ds-glm to this repo

Signed-off-by: Christoph Auer <[email protected]>

* Upgrade to ds-glm 1.0 and docling-parse 3.0

Signed-off-by: Christoph Auer <[email protected]>

* Update lock

Signed-off-by: Christoph Auer <[email protected]>

* Fix DP2 backend code, change CLI default backend

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git authored Dec 9, 2024
1 parent 9fd2cf8 commit aca57f0
Show file tree
Hide file tree
Showing 8 changed files with 506 additions and 183 deletions.
2 changes: 1 addition & 1 deletion docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser_v1
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

Expand Down
12 changes: 7 additions & 5 deletions docling/backend/docling_parse_v2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_parse.docling_parse import pdf_parser_v2
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

Expand Down Expand Up @@ -210,12 +210,14 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.parser = pdf_parser_v2("fatal")

success = False
if isinstance(path_or_stream, BytesIO):
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
self.document_hash, path_or_stream
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
elif isinstance(path_or_stream, Path):
success = self.parser.load_document(self.document_hash, str(path_or_stream))

if not success:
raise RuntimeError(
Expand Down
2 changes: 1 addition & 1 deletion docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def convert(
] = None,
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
] = PdfBackend.DLPARSE_V1,
] = PdfBackend.DLPARSE_V2,
table_mode: Annotated[
TableFormerMode,
typer.Option(..., help="The mode to use in the table structure model."),
Expand Down
10 changes: 5 additions & 5 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
Expand Down Expand Up @@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):

class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend


class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend


def _get_default_option(format: InputFormat) -> FormatOption:
Expand All @@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
Expand Down
4 changes: 2 additions & 2 deletions docling/models/ds_glm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import List, Union

from deepsearch_glm.nlp_utils import init_nlp_model
from deepsearch_glm.utils.doc_utils import to_docling_document
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
Expand All @@ -29,6 +28,7 @@
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.glm_utils import to_docling_document
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash

Expand Down Expand Up @@ -232,7 +232,7 @@ def make_spans(cell):
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)

glm_doc = self.model.apply_on_doc(ds_doc_dict)

Expand Down
Loading

0 comments on commit aca57f0

Please sign in to comment.