Skip to content

Commit

Permalink
added enum for programming languages
Browse files Browse the repository at this point in the history
  • Loading branch information
Matteo-Omenetti committed Jan 17, 2025
1 parent 7dee149 commit e7af164
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 3 deletions.
64 changes: 61 additions & 3 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from docling_core.types.base import _JSON_POINTER_REGEX
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path

Expand Down Expand Up @@ -661,7 +661,65 @@ class CodeItem(TextItem):
label: typing.Literal[DocItemLabel.CODE] = (
DocItemLabel.CODE # type: ignore[assignment]
)
code_language: str = "unknown"
code_language: typing.Literal[
CodeLanguageLabel.ADA,
CodeLanguageLabel.AWK,
CodeLanguageLabel.BASH,
CodeLanguageLabel.C,
CodeLanguageLabel.C_SHARP,
CodeLanguageLabel.C_PLUS_PLUS,
CodeLanguageLabel.CMAKE,
CodeLanguageLabel.COBOL,
CodeLanguageLabel.CSS,
CodeLanguageLabel.CEYLON,
CodeLanguageLabel.CLOJURE,
CodeLanguageLabel.CRYSTAL,
CodeLanguageLabel.CUDA,
CodeLanguageLabel.CYTHON,
CodeLanguageLabel.D,
CodeLanguageLabel.DART,
CodeLanguageLabel.DOCKERFILE,
CodeLanguageLabel.ELIXIR,
CodeLanguageLabel.ERLANG,
CodeLanguageLabel.FORTRAN,
CodeLanguageLabel.FORTH,
CodeLanguageLabel.GO,
CodeLanguageLabel.HTML,
CodeLanguageLabel.HASKELL,
CodeLanguageLabel.HAXE,
CodeLanguageLabel.JAVA,
CodeLanguageLabel.JAVASCRIPT,
CodeLanguageLabel.JULIA,
CodeLanguageLabel.KOTLIN,
CodeLanguageLabel.LISP,
CodeLanguageLabel.LUA,
CodeLanguageLabel.MATLAB,
CodeLanguageLabel.MOONSCRIPT,
CodeLanguageLabel.NIM,
CodeLanguageLabel.OCAML,
CodeLanguageLabel.OBJECTIVEC,
CodeLanguageLabel.OCTAVE,
CodeLanguageLabel.PHP,
CodeLanguageLabel.PASCAL,
CodeLanguageLabel.PERL,
CodeLanguageLabel.PROLOG,
CodeLanguageLabel.PYTHON,
CodeLanguageLabel.RACKET,
CodeLanguageLabel.RUBY,
CodeLanguageLabel.RUST,
CodeLanguageLabel.SML,
CodeLanguageLabel.SQL,
CodeLanguageLabel.SCALA,
CodeLanguageLabel.SCHEME,
CodeLanguageLabel.SWIFT,
CodeLanguageLabel.TYPESCRIPT,
CodeLanguageLabel.VISUALBASIC,
CodeLanguageLabel.XML,
CodeLanguageLabel.YAML,
CodeLanguageLabel.BC,
CodeLanguageLabel.DC,
CodeLanguageLabel.UNKNOWN,
] = CodeLanguageLabel.UNKNOWN


class SectionHeaderItem(TextItem):
Expand Down Expand Up @@ -1655,7 +1713,7 @@ def add_title(
def add_code(
self,
text: str,
code_language: Optional[str] = None,
code_language: Optional[CodeLanguageLabel] = None,
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
Expand Down
66 changes: 66 additions & 0 deletions docling_core/types/doc/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
def __str__(self):
"""Get string value."""
return str(self.value)


class CodeLanguageLabel(str, Enum):
    """Closed set of programming-language labels for code snippets.

    Every member is also a ``str``; its value is the language's display
    name (for example ``"C++"`` or ``"JavaScript"``).  ``UNKNOWN`` is the
    catch-all member for snippets whose language is not identified.
    """

    # NOTE: declaration order is the iteration order of the enum, so new
    # members should be added deliberately rather than re-sorted.
    ADA = "Ada"
    AWK = "Awk"
    BASH = "Bash"
    C = "C"
    C_SHARP = "C#"
    C_PLUS_PLUS = "C++"
    CMAKE = "CMake"
    COBOL = "COBOL"
    CSS = "CSS"
    CEYLON = "Ceylon"
    CLOJURE = "Clojure"
    CRYSTAL = "Crystal"
    CUDA = "Cuda"
    CYTHON = "Cython"
    D = "D"
    DART = "Dart"
    DOCKERFILE = "Dockerfile"
    ELIXIR = "Elixir"
    ERLANG = "Erlang"
    FORTRAN = "FORTRAN"
    FORTH = "Forth"
    GO = "Go"
    HTML = "HTML"
    HASKELL = "Haskell"
    HAXE = "Haxe"
    JAVA = "Java"
    JAVASCRIPT = "JavaScript"
    JULIA = "Julia"
    KOTLIN = "Kotlin"
    LISP = "Lisp"
    LUA = "Lua"
    MATLAB = "Matlab"
    MOONSCRIPT = "MoonScript"
    NIM = "Nim"
    OCAML = "OCaml"
    OBJECTIVEC = "ObjectiveC"
    OCTAVE = "Octave"
    PHP = "PHP"
    PASCAL = "Pascal"
    PERL = "Perl"
    PROLOG = "Prolog"
    PYTHON = "Python"
    RACKET = "Racket"
    RUBY = "Ruby"
    RUST = "Rust"
    SML = "SML"
    SQL = "SQL"
    SCALA = "Scala"
    SCHEME = "Scheme"
    SWIFT = "Swift"
    TYPESCRIPT = "TypeScript"
    VISUALBASIC = "VisualBasic"
    XML = "XML"
    YAML = "YAML"
    BC = "bc"
    DC = "dc"
    UNKNOWN = "unknown"

    def __str__(self) -> str:
        """Return the member's value instead of ``ClassName.MEMBER``."""
        label = self.value
        return str(label)
59 changes: 59 additions & 0 deletions docs/DoclingDocument.json
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,65 @@
},
"code_language": {
"default": "unknown",
"enum": [
"Ada",
"Awk",
"Bash",
"C",
"C#",
"C++",
"CMake",
"COBOL",
"CSS",
"Ceylon",
"Clojure",
"Crystal",
"Cuda",
"Cython",
"D",
"Dart",
"Dockerfile",
"Elixir",
"Erlang",
"FORTRAN",
"Forth",
"Go",
"HTML",
"Haskell",
"Haxe",
"Java",
"JavaScript",
"Julia",
"Kotlin",
"Lisp",
"Lua",
"Matlab",
"MoonScript",
"Nim",
"OCaml",
"ObjectiveC",
"Octave",
"PHP",
"Pascal",
"Perl",
"Prolog",
"Python",
"Racket",
"Ruby",
"Rust",
"SML",
"SQL",
"Scala",
"Scheme",
"Swift",
"TypeScript",
"VisualBasic",
"XML",
"YAML",
"bc",
"dc",
"unknown"
],
"title": "Code Language",
"type": "string"
}
Expand Down

0 comments on commit e7af164

Please sign in to comment.