refactor: simplify handling of multiple InputFormats per MIME type

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
DS4SD · Dec 17, 2024 · 3d249eb
1 parent 8b66e61
commit 3d249eb
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 29 deletions.
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from docling_core.types.doc import (
@@ -13,7 +13,6 @@
 )
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
-from typing_extensions import Self, override
 
 if TYPE_CHECKING:
     from docling.backend.pdf_backend import PdfPageBackend
@@ -29,31 +28,17 @@ class ConversionStatus(str, Enum):
 
 
 class InputFormat(str, Enum):
-    """A document format supported by document backend parsers.
-
-    The field `is_custom` indicates whether the document format is more specific than
-    the standard and content formats, typically defined by MIME types.
-    """
-
-    DOCX = "docx", False
-    PPTX = "pptx", False
-    HTML = "html", False
-    IMAGE = "image", False
-    PDF = "pdf", False
-    ASCIIDOC = "asciidoc", False
-    MD = "md", False
-    XLSX = "xlsx", False
-    XML_USPTO = "uspto", True
-
-    @override
-    def __new__(cls, value: str, _) -> Self:
-        obj = str.__new__(cls, [value])
-        obj._value_ = value
-        return obj
-
-    @override
-    def __init__(self, _, is_custom: bool) -> None:
-        self.is_custom: bool = is_custom
+    """A document format supported by document backend parsers."""
+
+    DOCX = "docx"
+    PPTX = "pptx"
+    HTML = "html"
+    IMAGE = "image"
+    PDF = "pdf"
+    ASCIIDOC = "asciidoc"
+    MD = "md"
+    XLSX = "xlsx"
+    XML_USPTO = "uspto"
 
 
 class OutputFormat(str, Enum):

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -290,9 +290,10 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
         if formats:
-            if len(formats) == 1 and not formats[0].is_custom:
+            # TODO: remove application/xml case after adding another XML parser
+            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
                 return formats[0]
-            else:  # ambiguity or custom cases
+            else:  # ambiguity in formats
                 return _DocumentConversionInput._guess_from_content(
                     content, mime, formats
                 )