Skip to content

Commit 5df614b

Browse files
committed
Draft new docling document format, pydantic model and tests
Signed-off-by: Christoph Auer <[email protected]>
1 parent 1ed846c commit 5df614b

File tree

5 files changed

+420
-0
lines changed

5 files changed

+420
-0
lines changed

docling_core/types/newdoc/__init__.py

Whitespace-only changes.

docling_core/types/newdoc/base.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import copy
2+
from enum import Enum
3+
from typing import Tuple
4+
5+
from pydantic import BaseModel
6+
7+
8+
## All copied from docling
9+
class CoordOrigin(str, Enum):
    """Corner of the page that serves as the (0, 0) point of a coordinate system.

    Members are also plain strings, so they compare equal to their names.
    """

    # y coordinates are measured from the top edge of the page.
    TOPLEFT = "TOPLEFT"
    # y coordinates are measured from the bottom edge of the page.
    BOTTOMLEFT = "BOTTOMLEFT"
12+
13+
14+
class Size(BaseModel):
    """Width/height pair; both dimensions default to ``0.0``."""

    width: float = 0.0
    height: float = 0.0
17+
18+
19+
class BoundingBox(BaseModel):
    """Axis-aligned rectangle given by its left, top, right and bottom edges.

    ``coord_origin`` states which page corner is (0, 0). A well-formed box
    has ``l <= r`` and, following the ordering that ``from_tuple`` enforces,
    ``t <= b`` for ``TOPLEFT`` and ``b <= t`` for ``BOTTOMLEFT``.
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent ``r - l`` (negative if the edges are swapped)."""
        return self.r - self.l

    @property
    def height(self):
        """Absolute vertical extent, independent of the coordinate origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a deep copy whose four edges are multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a deep copy with edges divided by the page dimensions.

        Horizontal edges are divided by ``page_size.width`` and vertical
        edges by ``page_size.height``.

        Raises:
            ZeroDivisionError: if the relevant page dimension is zero
                (which is the default value of ``Size``).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self) -> Tuple[float, float, float, float]:
        """Return the edges as a 4-tuple.

        The order is ``(l, t, r, b)`` for ``TOPLEFT`` boxes and
        ``(l, b, r, t)`` for ``BOTTOMLEFT`` boxes.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from the first four entries of ``coord``.

        ``coord`` is read as ``(l, t, r, b)`` for ``TOPLEFT`` and as
        ``(l, b, r, t)`` for ``BOTTOMLEFT``. Swapped edges are reordered so
        that ``l <= r`` and the vertical edges follow the origin's ordering.
        Returns ``None`` when ``origin`` matches neither member.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b
        else:
            return None

        # Use ``cls`` so subclasses get an instance of their own type.
        return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the area of the box.

        The vertical extent is ``b - t`` for ``TOPLEFT`` and ``t - b`` for
        ``BOTTOMLEFT``, so a well-formed box yields a non-negative area in
        either coordinate system.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.r - self.l) * (self.t - self.b)
        return (self.r - self.l) * (self.b - self.t)

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area with ``other``, or ``0.0`` if disjoint.

        The vertical ordering is taken from ``self.coord_origin``; both
        boxes are expected to use the same origin. The result is not
        meaningful when the origins differ, because no page height is
        available here to convert between them.
        """
        # Horizontal overlap does not depend on the origin.
        width = min(self.r, other.r) - max(self.l, other.l)

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # y grows upwards: the top edge carries the larger value.
            height = min(self.t, other.t) - max(self.b, other.b)
        else:
            # y grows downwards: the bottom edge carries the larger value.
            height = min(self.b, other.b) - max(self.t, other.t)

        # If the bounding boxes do not overlap, width or height is <= 0.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
        """Express the box relative to the bottom-left page corner.

        Returns ``self`` (not a copy) when the box already uses
        ``BOTTOMLEFT``; otherwise returns a new box whose vertical edges are
        mirrored as ``page_height - y``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height: float) -> "BoundingBox":
        """Express the box relative to the top-left page corner.

        Returns ``self`` (not a copy) when the box already uses ``TOPLEFT``;
        otherwise returns a new box whose vertical edges are mirrored as
        ``page_height - y``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.TOPLEFT,
            )

docling_core/types/newdoc/document.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from typing import Any, Dict, List, Optional, Tuple, Union
2+
3+
from pydantic import AnyUrl, BaseModel, Field
4+
5+
from docling_core.types.newdoc.base import BoundingBox, Size
6+
7+
8+
class FigureData(BaseModel):  # TBD
    """Placeholder for a figure's payload; no fields are declared yet."""
10+
11+
12+
class TableData(BaseModel):  # TBD
    """Placeholder for a table's payload; no fields are declared yet."""
14+
15+
16+
class RefItem(BaseModel):
    """By-reference pointer to an item stored in a document collection.

    The reference string is populated from the ``$ref`` key and has the
    form ``<prefix>/<collection>/<index>``, e.g. ``/texts/0``.
    """

    cref: str = Field(alias="$ref")

    def resolve(self, doc: "DoclingDocument"):
        """Return the object that this reference points to inside ``doc``.

        Args:
            doc: Object exposing the referenced collection as an attribute.

        Returns:
            ``getattr(doc, <collection>)[<index>]``.

        Raises:
            ValueError: if ``cref`` does not consist of exactly three
                ``/``-separated parts or the index is not an integer.
            AttributeError: if ``doc`` has no attribute named after the
                collection.
            IndexError: if the index is out of range for the collection.
        """
        parts = self.cref.split("/")
        if len(parts) != 3:
            raise ValueError(
                f"Malformed reference {self.cref!r}; "
                "expected the form '/<collection>/<index>'"
            )

        # The leading part (text before the first "/") is ignored.
        _, path, index_str = parts
        return getattr(doc, path)[int(index_str)]
24+
25+
26+
class ImageRef(BaseModel):
    """Description of an image together with the URI where it is stored."""

    # Image format name, e.g. "png".
    format: str
    # Resolution of the image in dots per inch.
    dpi: int
    # Image dimensions.
    size: Size
    # Location of the image data.
    uri: AnyUrl
31+
32+
33+
class ProvenanceItem(BaseModel):
    """Where an item comes from: page number, bounding box and char span."""

    page_no: int
    bbox: BoundingBox
    # Pair of integers; presumably (start, end) offsets into the item's
    # original text -- TODO confirm.
    charspan: Tuple[int, int]
37+
38+
39+
class DocItem(BaseModel):
    """Fields shared by every document item.

    All fields are declared without defaults; ``parent`` may be ``None``.
    """

    # Format spec: {document_hash}{json-path}
    dloc: str
    hash: int
    label: str
    # Reference to the enclosing item, or None.
    parent: Optional[RefItem]
    # References to nested items.
    children: List[RefItem]
    # Provenance records for this item.
    prov: List[ProvenanceItem]
46+
47+
48+
class TextItem(DocItem):
    """Document item that carries a text string in two representations."""

    # Untreated representation.
    orig: str
    # Sanitized representation.
    text: str
51+
52+
53+
class FloatingItem(DocItem):
    """Document item with a caption, references, footnotes, data and image.

    Caption, references and footnotes may each be given either by
    reference (``RefItem``) or inline (``TextItem``).
    """

    caption: Optional[Union[RefItem, TextItem]]
    references: List[Union[RefItem, TextItem]]
    footnotes: List[Union[RefItem, TextItem]]
    # Untyped payload.
    data: Any
    image: Optional[ImageRef]
59+
60+
61+
class FigureItem(DocItem):
    """Document item whose payload is a ``FigureData`` instance."""

    # NOTE(review): this derives from DocItem, not FloatingItem, so no
    # caption/references/footnotes/image fields are declared here --
    # confirm that this is intended.
    data: FigureData
63+
64+
65+
class TableItem(DocItem):
    """Document item whose payload is a ``TableData`` instance."""

    # NOTE(review): this derives from DocItem, not FloatingItem, so no
    # caption/references/footnotes/image fields are declared here --
    # confirm that this is intended.
    data: TableData
67+
68+
69+
class KeyValueItem(DocItem):
    """Key-value document item; adds no fields beyond ``DocItem`` so far."""
71+
72+
73+
# Type alias covering the concrete item models declared above.
ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
74+
75+
76+
class DocumentContent(BaseModel):
    """Container for document items; every collection defaults to empty.

    ``furniture`` and ``body`` hold references only, while the remaining
    lists hold the item objects themselves.
    """

    # By-reference collections.
    furniture: List[RefItem] = []
    body: List[RefItem] = []

    # Collections holding the actual item data.
    texts: List[TextItem] = []
    figures: List[FigureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []
83+
84+
85+
class PageItem(DocumentContent):
    """Per-page content plus page-level metadata."""

    # Page hash.
    hash: str
    # Page dimensions.
    size: Size
    # Image associated with the page, or None.
    image: Optional[ImageRef]
    num_elements: int
90+
91+
92+
class DoclingDocument(DocumentContent):
    """Top-level document: content collections plus document-wide fields."""

    # Both fields are untyped for now.
    description: Any
    file_info: Any
    # Pages keyed by an integer; empty as default.
    pages: Dict[int, PageItem] = {}

test/data/newdoc/dummy_doc.yaml

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
---
2+
## Document with content + layout info
3+
description: { } # DescriptionType - TBD
4+
file_info: # FileInfoType - TBD
5+
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
6+
furniture: # Headers, footers, framing, navigation elements, all other non-body text
7+
- $ref: "/texts/0"
8+
9+
body: # All elements in other arrays, by-reference only
10+
- $ref: "/texts/1"
11+
- $ref: "/figures/0"
12+
- $ref: "/texts/2"
13+
- $ref: "/texts/3"
14+
- $ref: "/tables/0"
15+
16+
texts: # All elements that have a text-string representation, with actual data
17+
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
18+
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
19+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
20+
hash: 132103230
21+
label: "page_header"
22+
parent: null
23+
children: [ ]
24+
prov:
25+
- page_no: 1
26+
bbox:
27+
l: 21.3
28+
t: 52.3
29+
b: 476.2
30+
r: 35.2
31+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
32+
- orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
33+
text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
34+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
35+
hash: 2349732 # uint64 hash of dloc
36+
label: "title"
37+
parent: null
38+
children: [ ]
39+
prov: # must exist, can be empty
40+
- page_no: 1
41+
bbox:
42+
l: 65.0
43+
t: 30.1
44+
b: 53.4
45+
r: 623.2
46+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
47+
- orig: "OPERATION (cont.)" # nested inside the figure
48+
text: "OPERATION (cont.)"
49+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
50+
hash: 6978483
51+
label: "section_header"
52+
parent:
53+
$ref: "/figures/0"
54+
children: [ ]
55+
prov:
56+
- page_no: 1
57+
bbox:
58+
l: 323.0
59+
t: 354.3
60+
b: 334.4
61+
r: 376.0
62+
charspan: [ 0,734 ]
63+
- orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
64+
text: "Figure 1: Four examples of complex page layouts across different document categories"
65+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
66+
hash: 6978483
67+
label: "caption"
68+
parent:
69+
$ref: "/figures/0"
70+
children: [ ]
71+
prov:
72+
- page_no: 1
73+
bbox:
74+
l: 323.0
75+
t: 354.3
76+
b: 334.4
77+
r: 376.0
78+
coord_origin: "BOTTOMLEFT"
79+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
80+
81+
82+
tables: # All tables...
83+
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0"
84+
hash: 98574
85+
label: "table"
86+
parent: null
87+
children: [ ]
88+
caption:
89+
$ref: "/texts/3"
90+
references:
91+
- $ref: "/text/??"
92+
footnotes:
93+
- $ref: "/text/??"
94+
image:
95+
format: png
96+
dpi: 72
97+
size:
98+
width: 231
99+
height: 351
100+
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
101+
#alternatives: base64 encoded string
102+
data: # TableData Type
103+
grid: [ [ ] ] # list-of-list of TableCell type
104+
otsl: "<fcel><ecel>..." # OTSL token string
105+
html: "" # ??
106+
prov:
107+
- page_no: 1
108+
bbox:
109+
l: 323.0
110+
t: 354.3
111+
b: 334.4
112+
r: 376.0
113+
coord_origin: "BOTTOMLEFT"
114+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
115+
116+
figures: # All figures...
117+
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
118+
hash: 7782482
119+
label: "figure"
120+
parent: null
121+
caption:
122+
$ref: "/texts/2"
123+
references:
124+
- $ref: "/text/??"
125+
footnotes:
126+
- $ref: "/text/??"
127+
128+
data: # FigureData Type
129+
classification: "illustration"
130+
confidence: 0.78
131+
description: "...."
132+
# content structure?
133+
image:
134+
format: png
135+
dpi: 72
136+
size:
137+
width: 231
138+
height: 351
139+
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
140+
#alternatives: base64 encoded string
141+
children:
142+
- $ref: "/texts/2"
143+
prov:
144+
- page_no: 1
145+
bbox:
146+
l: 456.3
147+
t: 145.8
148+
b: 623.4
149+
r: 702.5
150+
charspan: [ 0,288 ]
151+
152+
key_value_items: [ ] # All KV-items
153+
154+
# We should consider this for pages
155+
pages: # Optional, for layout documents
156+
1:
157+
hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
158+
size:
159+
width: 768.23
160+
height: 583.15
161+
image:
162+
format: png
163+
dpi: 144
164+
size:
165+
width: 1536
166+
height: 1166
167+
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
168+
#alternatives: base64 encoded string
169+
num_elements: 23

0 commit comments

Comments
 (0)