Skip to content

Commit

Permalink
feat: Add CodeItem as pydantic type, update export methods and APIs (#…
Browse files Browse the repository at this point in the history
…129)

* added code item

* added code item

* added code item

Signed-off-by: Matteo-Omenetti <[email protected]>

* added code item

Signed-off-by: Matteo-Omenetti <[email protected]>

* added code item

Signed-off-by: Matteo-Omenetti <[email protected]>

* added code item

Signed-off-by: Matteo-Omenetti <[email protected]>

* added code item

Signed-off-by: Matteo-Omenetti <[email protected]>

* add constraints to allow numpy > 2.1.0 on python3.13 and others

Signed-off-by: Michele Dolfi <[email protected]>

* Add CodeItem to ContentItem

Signed-off-by: Christoph Auer <[email protected]>

* added CodeItem in ContentItem tagged union.

* added enum for programming languages

* removed double CodeItem in ContentItem Union

* fixed type of code_language in CodeItem class

* fixed sorting of programming languages: they are now sorted by variable name instead of by string value

---------

Signed-off-by: Matteo-Omenetti <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Matteo-Omenetti <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
4 people authored Jan 17, 2025
1 parent 618df13 commit c940aa5
Show file tree
Hide file tree
Showing 8 changed files with 734 additions and 585 deletions.
1 change: 1 addition & 0 deletions docling_core/types/doc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
from .document import (
CodeItem,
DocItem,
DoclingDocument,
DocumentOrigin,
Expand Down
73 changes: 68 additions & 5 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from docling_core.types.base import _JSON_POINTER_REGEX
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path

Expand Down Expand Up @@ -597,7 +597,6 @@ class TextItem(DocItem):
DocItemLabel.CAPTION,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.CHECKBOX_UNSELECTED,
DocItemLabel.CODE,
DocItemLabel.FOOTNOTE,
DocItemLabel.FORMULA,
DocItemLabel.PAGE_FOOTER,
Expand Down Expand Up @@ -656,6 +655,15 @@ def export_to_document_tokens(
return body


class CodeItem(TextItem):
    """CodeItem."""

    # NOTE: the class docstring above is kept verbatim on purpose; it shows up
    # as the "description" of the CodeItem entry in docs/DoclingDocument.json.

    # The label is pinned to the single value DocItemLabel.CODE. The
    # ``type: ignore`` silences the assignment check raised for this field.
    label: typing.Literal[DocItemLabel.CODE] = (
        DocItemLabel.CODE  # type: ignore[assignment]
    )

    # Programming language of the snippet; UNKNOWN unless the caller sets it.
    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN


class SectionHeaderItem(TextItem):
"""SectionItem."""

Expand Down Expand Up @@ -1302,6 +1310,7 @@ class KeyValueItem(DocItem):
TextItem,
SectionHeaderItem,
ListItem,
CodeItem,
PictureItem,
TableItem,
KeyValueItem,
Expand Down Expand Up @@ -1397,7 +1406,7 @@ class DoclingDocument(BaseModel):
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []

groups: List[GroupItem] = []
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
pictures: List[PictureItem] = []
tables: List[TableItem] = []
key_value_items: List[KeyValueItem] = []
Expand Down Expand Up @@ -1643,6 +1652,46 @@ def add_title(

return text_item

def add_code(
    self,
    text: str,
    code_language: Optional[CodeLanguageLabel] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[NodeItem] = None,
):
    """Append a code item to the document and link it under a parent node.

    :param text: str: Code content stored on the new item.
    :param code_language: Optional[CodeLanguageLabel]: Language of the
        snippet; when omitted the item keeps its ``CodeItem`` default.
        (Default value = None)
    :param orig: Optional[str]: Original text; ``text`` is used instead when
        this is missing or empty. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        item's ``prov`` list when given. (Default value = None)
    :param parent: Optional[NodeItem]: Node that receives the reference to
        the new item; the document body is used when omitted.
        (Default value = None)
    :returns: The newly created ``CodeItem``.
    """
    owner = parent or self.body

    # The item is addressed by the slot it is about to occupy in self.texts.
    ref = f"#/texts/{len(self.texts)}"

    item = CodeItem(
        text=text,
        orig=orig or text,
        self_ref=ref,
        parent=owner.get_ref(),
    )
    if code_language:
        item.code_language = code_language
    if prov:
        item.prov.append(prov)

    # Register the item first, then hook its reference into the parent.
    self.texts.append(item)
    owner.children.append(RefItem(cref=ref))
    return item

def add_heading(
self,
text: str,
Expand Down Expand Up @@ -2086,7 +2135,7 @@ def export_to_markdown( # noqa: C901
text = f"{marker} {item.text}\n"
mdtexts.append(text.strip() + "\n")

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
elif isinstance(item, CodeItem) and item.label in labels:
in_list = False
text = f"```\n{item.text}\n```\n"
mdtexts.append(text)
Expand Down Expand Up @@ -2392,11 +2441,14 @@ def close_lists(
text = f"<li>{item.text}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = f"<pre><code>{item.text}</code></pre>"
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in labels:

text = f"<p>{item.text}</p>"
html_texts.append(text.strip())

elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand Down Expand Up @@ -2594,6 +2646,17 @@ def close_lists(
add_content=add_content,
add_page_index=add_page_index,
)
elif isinstance(item, CodeItem) and (item.label in labels):

result += item.export_to_document_tokens(
doc=self,
new_line=delim,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
)

elif isinstance(item, TextItem) and (item.label in labels):

Expand Down
66 changes: 66 additions & 0 deletions docling_core/types/doc/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
def __str__(self):
    """Return the member's underlying string value."""
    # The enum mixes in ``str``, so ``value`` is already a plain string.
    return self.value


class CodeLanguageLabel(str, Enum):
    """CodeLanguageLabel."""

    # NOTE: the class docstring above and the declaration order below are
    # mirrored by the "CodeLanguageLabel" entry in docs/DoclingDocument.json
    # (its "description" and "enum" list), so keep both in sync with it.

    ADA = "Ada"
    AWK = "Awk"
    BASH = "Bash"
    BC = "bc"
    C = "C"
    C_SHARP = "C#"
    C_PLUS_PLUS = "C++"
    CMAKE = "CMake"
    COBOL = "COBOL"
    CSS = "CSS"
    CEYLON = "Ceylon"
    CLOJURE = "Clojure"
    CRYSTAL = "Crystal"
    CUDA = "Cuda"
    CYTHON = "Cython"
    D = "D"
    DART = "Dart"
    DC = "dc"
    DOCKERFILE = "Dockerfile"
    ELIXIR = "Elixir"
    ERLANG = "Erlang"
    FORTRAN = "FORTRAN"
    FORTH = "Forth"
    GO = "Go"
    HTML = "HTML"
    HASKELL = "Haskell"
    HAXE = "Haxe"
    JAVA = "Java"
    JAVASCRIPT = "JavaScript"
    JULIA = "Julia"
    KOTLIN = "Kotlin"
    LISP = "Lisp"
    LUA = "Lua"
    MATLAB = "Matlab"
    MOONSCRIPT = "MoonScript"
    NIM = "Nim"
    OCAML = "OCaml"
    OBJECTIVEC = "ObjectiveC"
    OCTAVE = "Octave"
    PHP = "PHP"
    PASCAL = "Pascal"
    PERL = "Perl"
    PROLOG = "Prolog"
    PYTHON = "Python"
    RACKET = "Racket"
    RUBY = "Ruby"
    RUST = "Rust"
    SML = "SML"
    SQL = "SQL"
    SCALA = "Scala"
    SCHEME = "Scheme"
    SWIFT = "Swift"
    TYPESCRIPT = "TypeScript"
    # Fallback label; CodeItem.code_language defaults to it.
    UNKNOWN = "unknown"
    VISUALBASIC = "VisualBasic"
    XML = "XML"
    YAML = "YAML"

    def __str__(self):
        """Return the member's underlying string value."""
        # The str mixin guarantees ``value`` is already a plain string.
        return self.value
131 changes: 130 additions & 1 deletion docs/DoclingDocument.json
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,133 @@
"title": "ChartStackedBar",
"type": "object"
},
"CodeItem": {
"additionalProperties": false,
"description": "CodeItem.",
"properties": {
"self_ref": {
"pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$",
"title": "Self Ref",
"type": "string"
},
"parent": {
"anyOf": [
{
"$ref": "#/$defs/RefItem"
},
{
"type": "null"
}
],
"default": null
},
"children": {
"default": [],
"items": {
"$ref": "#/$defs/RefItem"
},
"title": "Children",
"type": "array"
},
"label": {
"const": "code",
"default": "code",
"title": "Label",
"type": "string"
},
"prov": {
"default": [],
"items": {
"$ref": "#/$defs/ProvenanceItem"
},
"title": "Prov",
"type": "array"
},
"orig": {
"title": "Orig",
"type": "string"
},
"text": {
"title": "Text",
"type": "string"
},
"code_language": {
"$ref": "#/$defs/CodeLanguageLabel",
"default": "unknown"
}
},
"required": [
"self_ref",
"orig",
"text"
],
"title": "CodeItem",
"type": "object"
},
"CodeLanguageLabel": {
"description": "CodeLanguageLabel.",
"enum": [
"Ada",
"Awk",
"Bash",
"bc",
"C",
"C#",
"C++",
"CMake",
"COBOL",
"CSS",
"Ceylon",
"Clojure",
"Crystal",
"Cuda",
"Cython",
"D",
"Dart",
"dc",
"Dockerfile",
"Elixir",
"Erlang",
"FORTRAN",
"Forth",
"Go",
"HTML",
"Haskell",
"Haxe",
"Java",
"JavaScript",
"Julia",
"Kotlin",
"Lisp",
"Lua",
"Matlab",
"MoonScript",
"Nim",
"OCaml",
"ObjectiveC",
"Octave",
"PHP",
"Pascal",
"Perl",
"Prolog",
"Python",
"Racket",
"Ruby",
"Rust",
"SML",
"SQL",
"Scala",
"Scheme",
"Swift",
"TypeScript",
"unknown",
"VisualBasic",
"XML",
"YAML"
],
"title": "CodeLanguageLabel",
"type": "string"
},
"CoordOrigin": {
"description": "CoordOrigin.",
"enum": [
Expand Down Expand Up @@ -1266,7 +1393,6 @@
"caption",
"checkbox_selected",
"checkbox_unselected",
"code",
"footnote",
"formula",
"page_footer",
Expand Down Expand Up @@ -1375,6 +1501,9 @@
},
{
"$ref": "#/$defs/TextItem"
},
{
"$ref": "#/$defs/CodeItem"
}
]
},
Expand Down
Loading

0 comments on commit c940aa5

Please sign in to comment.