feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)

Signed-off-by: lucas-morin <[email protected]>
DS4SD · Dec 17, 2024 · fd03480 · fd03480
1 parent e31f09f
commit fd03480
Show file tree

Hide file tree

Showing 24 changed files with 31,040 additions and 4 deletions.
diff --git a/docling/backend/xml/pubmed_backend.py b/docling/backend/xml/pubmed_backend.py
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -33,6 +33,7 @@ class InputFormat(str, Enum):
     DOCX = "docx"
     PPTX = "pptx"
     HTML = "html"
+    XML_PUBMED = "xml_pubmed"
     IMAGE = "image"
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
@@ -55,6 +56,7 @@ class OutputFormat(str, Enum):
     InputFormat.PDF: ["pdf"],
     InputFormat.MD: ["md"],
     InputFormat.HTML: ["html", "htm", "xhtml"],
+    InputFormat.XML_PUBMED: ["xml", "nxml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
@@ -72,6 +74,7 @@ class OutputFormat(str, Enum):
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     ],
     InputFormat.HTML: ["text/html", "application/xhtml+xml"],
+    InputFormat.XML_PUBMED: ["application/xml"],
     InputFormat.IMAGE: [
         "image/png",
         "image/jpeg",

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -292,8 +292,7 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
         if formats:
-            # TODO: remove application/xml case after adding another XML parse
-            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
+            if len(formats) == 1 and mime not in ("text/plain"):
                 return formats[0]
             else:  # ambiguity in formats
                 return _DocumentConversionInput._guess_from_content(
@@ -325,6 +324,12 @@ def _guess_from_content(
                 ):
                     input_format = InputFormat.XML_USPTO
 
+                if (
+                    InputFormat.XML_PUBMED in formats
+                    and "/NLM//DTD JATS" in xml_doctype
+                ):
+                    input_format = InputFormat.XML_PUBMED
+
         elif mime == "text/plain":
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                 input_format = InputFormat.XML_USPTO
@@ -340,7 +345,6 @@ def _mime_from_extension(ext):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
-
         return mime
 
     @staticmethod
@@ -370,4 +374,10 @@ def _detect_html_xhtml(
         if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
             return "text/html"
 
+        p = re.compile(
+            r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
+        )
+        if p.search(content_str):
+            return "application/xml"
+
         return None
diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -15,6 +15,7 @@
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
     backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
 
 
+class XMLPubMedFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
+
+
 class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.XML_USPTO: FormatOption(
             pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
         ),
+        InputFormat.XML_PUBMED: FormatOption(
+            pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),
@@ -171,7 +180,6 @@ def convert(
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
     ) -> ConversionResult:
-
         all_res = self.convert_all(
             source=[source],
             raises_on_error=raises_on_error,

diff --git a/tests/data/groundtruth/docling_v2/elife-56337.xml.itxt b/tests/data/groundtruth/docling_v2/elife-56337.xml.itxt