Skip to content

Commit

Permalink
feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)
Browse files Browse the repository at this point in the history

Signed-off-by: lucas-morin <[email protected]>
  • Loading branch information
lucas-morin authored Dec 17, 2024
1 parent e31f09f commit fd03480
Show file tree
Hide file tree
Showing 24 changed files with 31,040 additions and 4 deletions.
592 changes: 592 additions & 0 deletions docling/backend/xml/pubmed_backend.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class InputFormat(str, Enum):
DOCX = "docx"
PPTX = "pptx"
HTML = "html"
XML_PUBMED = "xml_pubmed"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
Expand All @@ -55,6 +56,7 @@ class OutputFormat(str, Enum):
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
Expand All @@ -72,6 +74,7 @@ class OutputFormat(str, Enum):
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.XML_PUBMED: ["application/xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",
Expand Down
16 changes: 13 additions & 3 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,7 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
# TODO: remove application/xml case after adding another XML parse
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
if len(formats) == 1 and mime not in ("text/plain"):
return formats[0]
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
Expand Down Expand Up @@ -325,6 +324,12 @@ def _guess_from_content(
):
input_format = InputFormat.XML_USPTO

if (
InputFormat.XML_PUBMED in formats
and "/NLM//DTD JATS" in xml_doctype
):
input_format = InputFormat.XML_PUBMED

elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO
Expand All @@ -340,7 +345,6 @@ def _mime_from_extension(ext):
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]

return mime

@staticmethod
Expand Down Expand Up @@ -370,4 +374,10 @@ def _detect_html_xhtml(
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"

p = re.compile(
r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
)
if p.search(content_str):
return "application/xml"

return None
10 changes: 9 additions & 1 deletion docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
Expand Down Expand Up @@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


class XMLPubMedFormatOption(FormatOption):
    """Format option whose defaults pair SimplePipeline with PubMedDocumentBackend."""

    # Pipeline class used by default for this option.
    pipeline_cls: Type = SimplePipeline
    # Backend class used by default; the annotation accepts any
    # AbstractDocumentBackend subclass type.
    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend


class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
Expand Down Expand Up @@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
Expand Down Expand Up @@ -171,7 +180,6 @@ def convert(
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> ConversionResult:

all_res = self.convert_all(
source=[source],
raises_on_error=raises_on_error,
Expand Down
165 changes: 165 additions & 0 deletions tests/data/groundtruth/docling_v2/elife-56337.xml.itxt

Large diffs are not rendered by default.

Loading

0 comments on commit fd03480

Please sign in to comment.