From e7af164f21e68ec058cc72c03f8ceace2cbff745 Mon Sep 17 00:00:00 2001
From: Matteo-Omenetti <omenetti.matteo@gmail.com>
Date: Fri, 17 Jan 2025 14:40:25 +0100
Subject: [PATCH] added enum for programming languages

---
 docling_core/types/doc/document.py | 64 +++++++++++++++++++++++++++--
 docling_core/types/doc/labels.py   | 66 ++++++++++++++++++++++++++++++
 docs/DoclingDocument.json          | 59 ++++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 3 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 9b99d38f..8e7c178e 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -36,7 +36,7 @@
 from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.base import ImageRefMode
-from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import relative_path
 
@@ -661,7 +661,65 @@ class CodeItem(TextItem):
     label: typing.Literal[DocItemLabel.CODE] = (
         DocItemLabel.CODE  # type: ignore[assignment]
     )
-    code_language: str = "unknown"
+    code_language: typing.Literal[
+        CodeLanguageLabel.ADA,
+        CodeLanguageLabel.AWK,
+        CodeLanguageLabel.BASH,
+        CodeLanguageLabel.C,
+        CodeLanguageLabel.C_SHARP,
+        CodeLanguageLabel.C_PLUS_PLUS,
+        CodeLanguageLabel.CMAKE,
+        CodeLanguageLabel.COBOL,
+        CodeLanguageLabel.CSS,
+        CodeLanguageLabel.CEYLON,
+        CodeLanguageLabel.CLOJURE,
+        CodeLanguageLabel.CRYSTAL,
+        CodeLanguageLabel.CUDA,
+        CodeLanguageLabel.CYTHON,
+        CodeLanguageLabel.D,
+        CodeLanguageLabel.DART,
+        CodeLanguageLabel.DOCKERFILE,
+        CodeLanguageLabel.ELIXIR,
+        CodeLanguageLabel.ERLANG,
+        CodeLanguageLabel.FORTRAN,
+        CodeLanguageLabel.FORTH,
+        CodeLanguageLabel.GO,
+        CodeLanguageLabel.HTML,
+        CodeLanguageLabel.HASKELL,
+        CodeLanguageLabel.HAXE,
+        CodeLanguageLabel.JAVA,
+        CodeLanguageLabel.JAVASCRIPT,
+        CodeLanguageLabel.JULIA,
+        CodeLanguageLabel.KOTLIN,
+        CodeLanguageLabel.LISP,
+        CodeLanguageLabel.LUA,
+        CodeLanguageLabel.MATLAB,
+        CodeLanguageLabel.MOONSCRIPT,
+        CodeLanguageLabel.NIM,
+        CodeLanguageLabel.OCAML,
+        CodeLanguageLabel.OBJECTIVEC,
+        CodeLanguageLabel.OCTAVE,
+        CodeLanguageLabel.PHP,
+        CodeLanguageLabel.PASCAL,
+        CodeLanguageLabel.PERL,
+        CodeLanguageLabel.PROLOG,
+        CodeLanguageLabel.PYTHON,
+        CodeLanguageLabel.RACKET,
+        CodeLanguageLabel.RUBY,
+        CodeLanguageLabel.RUST,
+        CodeLanguageLabel.SML,
+        CodeLanguageLabel.SQL,
+        CodeLanguageLabel.SCALA,
+        CodeLanguageLabel.SCHEME,
+        CodeLanguageLabel.SWIFT,
+        CodeLanguageLabel.TYPESCRIPT,
+        CodeLanguageLabel.VISUALBASIC,
+        CodeLanguageLabel.XML,
+        CodeLanguageLabel.YAML,
+        CodeLanguageLabel.BC,
+        CodeLanguageLabel.DC,
+        CodeLanguageLabel.UNKNOWN,
+    ] = CodeLanguageLabel.UNKNOWN
 
 
 class SectionHeaderItem(TextItem):
@@ -1655,7 +1713,7 @@ def add_title(
     def add_code(
         self,
         text: str,
-        code_language: Optional[str] = None,
+        code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py
index 63101638..6573f2d7 100644
--- a/docling_core/types/doc/labels.py
+++ b/docling_core/types/doc/labels.py
@@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
     def __str__(self):
         """Get string value."""
         return str(self.value)
+
+
+class CodeLanguageLabel(str, Enum):
+    """Programming languages that can be assigned to a code item."""
+
+    ADA = "Ada"
+    AWK = "Awk"
+    BASH = "Bash"
+    C = "C"
+    C_SHARP = "C#"
+    C_PLUS_PLUS = "C++"
+    CMAKE = "CMake"
+    COBOL = "COBOL"
+    CSS = "CSS"
+    CEYLON = "Ceylon"
+    CLOJURE = "Clojure"
+    CRYSTAL = "Crystal"
+    CUDA = "Cuda"
+    CYTHON = "Cython"
+    D = "D"
+    DART = "Dart"
+    DOCKERFILE = "Dockerfile"
+    ELIXIR = "Elixir"
+    ERLANG = "Erlang"
+    FORTRAN = "FORTRAN"
+    FORTH = "Forth"
+    GO = "Go"
+    HTML = "HTML"
+    HASKELL = "Haskell"
+    HAXE = "Haxe"
+    JAVA = "Java"
+    JAVASCRIPT = "JavaScript"
+    JULIA = "Julia"
+    KOTLIN = "Kotlin"
+    LISP = "Lisp"
+    LUA = "Lua"
+    MATLAB = "Matlab"
+    MOONSCRIPT = "MoonScript"
+    NIM = "Nim"
+    OCAML = "OCaml"
+    OBJECTIVEC = "ObjectiveC"
+    OCTAVE = "Octave"
+    PHP = "PHP"
+    PASCAL = "Pascal"
+    PERL = "Perl"
+    PROLOG = "Prolog"
+    PYTHON = "Python"
+    RACKET = "Racket"
+    RUBY = "Ruby"
+    RUST = "Rust"
+    SML = "SML"
+    SQL = "SQL"
+    SCALA = "Scala"
+    SCHEME = "Scheme"
+    SWIFT = "Swift"
+    TYPESCRIPT = "TypeScript"
+    VISUALBASIC = "VisualBasic"
+    XML = "XML"
+    YAML = "YAML"
+    BC = "bc"
+    DC = "dc"
+    UNKNOWN = "unknown"  # default value of CodeItem.code_language
+
+    def __str__(self):
+        """Get string value."""
+        return str(self.value)
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index cc92a423..c51acce0 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -214,6 +214,65 @@
         },
         "code_language": {
           "default": "unknown",
+          "enum": [
+            "Ada",
+            "Awk",
+            "Bash",
+            "C",
+            "C#",
+            "C++",
+            "CMake",
+            "COBOL",
+            "CSS",
+            "Ceylon",
+            "Clojure",
+            "Crystal",
+            "Cuda",
+            "Cython",
+            "D",
+            "Dart",
+            "Dockerfile",
+            "Elixir",
+            "Erlang",
+            "FORTRAN",
+            "Forth",
+            "Go",
+            "HTML",
+            "Haskell",
+            "Haxe",
+            "Java",
+            "JavaScript",
+            "Julia",
+            "Kotlin",
+            "Lisp",
+            "Lua",
+            "Matlab",
+            "MoonScript",
+            "Nim",
+            "OCaml",
+            "ObjectiveC",
+            "Octave",
+            "PHP",
+            "Pascal",
+            "Perl",
+            "Prolog",
+            "Python",
+            "Racket",
+            "Ruby",
+            "Rust",
+            "SML",
+            "SQL",
+            "Scala",
+            "Scheme",
+            "Swift",
+            "TypeScript",
+            "VisualBasic",
+            "XML",
+            "YAML",
+            "bc",
+            "dc",
+            "unknown"
+          ],
           "title": "Code Language",
           "type": "string"
         }