From fa63048b6468f4d806a1c16f22499252f9328c7c Mon Sep 17 00:00:00 2001
From: Matteo-Omenetti
Date: Fri, 17 Jan 2025 14:40:25 +0100
Subject: [PATCH] added enum for programming languages

Signed-off-by: Matteo-Omenetti
---
 docling_core/types/doc/document.py | 64 +++++++++++++++++++++++++++--
 docling_core/types/doc/labels.py   | 66 ++++++++++++++++++++++++++++++
 docs/DoclingDocument.json          | 59 ++++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 3 deletions(-)

diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
index 9b99d38..8e7c178 100644
--- a/docling_core/types/doc/document.py
+++ b/docling_core/types/doc/document.py
@@ -36,7 +36,7 @@
 from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.base import ImageRefMode
-from docling_core.types.doc.labels import DocItemLabel, GroupLabel
+from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
 from docling_core.types.doc.tokens import DocumentToken, TableToken
 from docling_core.types.doc.utils import relative_path
 
@@ -661,7 +661,65 @@ class CodeItem(TextItem):
     label: typing.Literal[DocItemLabel.CODE] = (
         DocItemLabel.CODE  # type: ignore[assignment]
     )
-    code_language: str = "unknown"
+    code_language: typing.Literal[
+        CodeLanguageLabel.ADA,
+        CodeLanguageLabel.AWK,
+        CodeLanguageLabel.BASH,
+        CodeLanguageLabel.C,
+        CodeLanguageLabel.C_SHARP,
+        CodeLanguageLabel.C_PLUS_PLUS,
+        CodeLanguageLabel.CMAKE,
+        CodeLanguageLabel.COBOL,
+        CodeLanguageLabel.CSS,
+        CodeLanguageLabel.CEYLON,
+        CodeLanguageLabel.CLOJURE,
+        CodeLanguageLabel.CRYSTAL,
+        CodeLanguageLabel.CUDA,
+        CodeLanguageLabel.CYTHON,
+        CodeLanguageLabel.D,
+        CodeLanguageLabel.DART,
+        CodeLanguageLabel.DOCKERFILE,
+        CodeLanguageLabel.ELIXIR,
+        CodeLanguageLabel.ERLANG,
+        CodeLanguageLabel.FORTRAN,
+        CodeLanguageLabel.FORTH,
+        CodeLanguageLabel.GO,
+        CodeLanguageLabel.HTML,
+        CodeLanguageLabel.HASKELL,
+        CodeLanguageLabel.HAXE,
+        CodeLanguageLabel.JAVA,
+        CodeLanguageLabel.JAVASCRIPT,
+        CodeLanguageLabel.JULIA,
+        CodeLanguageLabel.KOTLIN,
+        CodeLanguageLabel.LISP,
+        CodeLanguageLabel.LUA,
+        CodeLanguageLabel.MATLAB,
+        CodeLanguageLabel.MOONSCRIPT,
+        CodeLanguageLabel.NIM,
+        CodeLanguageLabel.OCAML,
+        CodeLanguageLabel.OBJECTIVEC,
+        CodeLanguageLabel.OCTAVE,
+        CodeLanguageLabel.PHP,
+        CodeLanguageLabel.PASCAL,
+        CodeLanguageLabel.PERL,
+        CodeLanguageLabel.PROLOG,
+        CodeLanguageLabel.PYTHON,
+        CodeLanguageLabel.RACKET,
+        CodeLanguageLabel.RUBY,
+        CodeLanguageLabel.RUST,
+        CodeLanguageLabel.SML,
+        CodeLanguageLabel.SQL,
+        CodeLanguageLabel.SCALA,
+        CodeLanguageLabel.SCHEME,
+        CodeLanguageLabel.SWIFT,
+        CodeLanguageLabel.TYPESCRIPT,
+        CodeLanguageLabel.VISUALBASIC,
+        CodeLanguageLabel.XML,
+        CodeLanguageLabel.YAML,
+        CodeLanguageLabel.BC,
+        CodeLanguageLabel.DC,
+        CodeLanguageLabel.UNKNOWN,
+    ] = CodeLanguageLabel.UNKNOWN
 
 
 class SectionHeaderItem(TextItem):
@@ -1655,7 +1713,7 @@ def add_title(
     def add_code(
         self,
         text: str,
-        code_language: Optional[str] = None,
+        code_language: Optional[CodeLanguageLabel] = None,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[NodeItem] = None,
diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py
index 6310163..6573f2d 100644
--- a/docling_core/types/doc/labels.py
+++ b/docling_core/types/doc/labels.py
@@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
     def __str__(self):
         """Get string value."""
         return str(self.value)
+
+
+class CodeLanguageLabel(str, Enum):
+    """Programming-language labels that can be assigned to a code item."""
+
+    ADA = "Ada"
+    AWK = "Awk"
+    BASH = "Bash"
+    C = "C"
+    C_SHARP = "C#"
+    C_PLUS_PLUS = "C++"
+    CMAKE = "CMake"
+    COBOL = "COBOL"
+    CSS = "CSS"
+    CEYLON = "Ceylon"
+    CLOJURE = "Clojure"
+    CRYSTAL = "Crystal"
+    CUDA = "Cuda"
+    CYTHON = "Cython"
+    D = "D"
+    DART = "Dart"
+    DOCKERFILE = "Dockerfile"
+    ELIXIR = "Elixir"
+    ERLANG = "Erlang"
+    FORTRAN = "FORTRAN"
+    FORTH = "Forth"
+    GO = "Go"
+    HTML = "HTML"
+    HASKELL = "Haskell"
+    HAXE = "Haxe"
+    JAVA = "Java"
+    JAVASCRIPT = "JavaScript"
+    JULIA = "Julia"
+    KOTLIN = "Kotlin"
+    LISP = "Lisp"
+    LUA = "Lua"
+    MATLAB = "Matlab"
+    MOONSCRIPT = "MoonScript"
+    NIM = "Nim"
+    OCAML = "OCaml"
+    OBJECTIVEC = "ObjectiveC"
+    OCTAVE = "Octave"
+    PHP = "PHP"
+    PASCAL = "Pascal"
+    PERL = "Perl"
+    PROLOG = "Prolog"
+    PYTHON = "Python"
+    RACKET = "Racket"
+    RUBY = "Ruby"
+    RUST = "Rust"
+    SML = "SML"
+    SQL = "SQL"
+    SCALA = "Scala"
+    SCHEME = "Scheme"
+    SWIFT = "Swift"
+    TYPESCRIPT = "TypeScript"
+    VISUALBASIC = "VisualBasic"
+    XML = "XML"
+    YAML = "YAML"
+    BC = "bc"
+    DC = "dc"
+    UNKNOWN = "unknown"
+
+    def __str__(self):
+        """Get string value."""
+        return str(self.value)
diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
index cc92a42..c51acce 100644
--- a/docs/DoclingDocument.json
+++ b/docs/DoclingDocument.json
@@ -214,6 +214,65 @@
         },
         "code_language": {
           "default": "unknown",
+          "enum": [
+            "Ada",
+            "Awk",
+            "Bash",
+            "C",
+            "C#",
+            "C++",
+            "CMake",
+            "COBOL",
+            "CSS",
+            "Ceylon",
+            "Clojure",
+            "Crystal",
+            "Cuda",
+            "Cython",
+            "D",
+            "Dart",
+            "Dockerfile",
+            "Elixir",
+            "Erlang",
+            "FORTRAN",
+            "Forth",
+            "Go",
+            "HTML",
+            "Haskell",
+            "Haxe",
+            "Java",
+            "JavaScript",
+            "Julia",
+            "Kotlin",
+            "Lisp",
+            "Lua",
+            "Matlab",
+            "MoonScript",
+            "Nim",
+            "OCaml",
+            "ObjectiveC",
+            "Octave",
+            "PHP",
+            "Pascal",
+            "Perl",
+            "Prolog",
+            "Python",
+            "Racket",
+            "Ruby",
+            "Rust",
+            "SML",
+            "SQL",
+            "Scala",
+            "Scheme",
+            "Swift",
+            "TypeScript",
+            "VisualBasic",
+            "XML",
+            "YAML",
+            "bc",
+            "dc",
+            "unknown"
+          ],
           "title": "Code Language",
           "type": "string"
         }