Skip to content

Commit bfeb2db

Browse files
authored
fix: include titles to chunk heading metadata (#62)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 1126410 commit bfeb2db

File tree

4 files changed

+289
-8
lines changed

4 files changed

+289
-8
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -183,14 +183,15 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
183183
)
184184
list_items = [] # reset
185185

186-
if isinstance(
187-
item, SectionHeaderItem
188-
) or ( # TODO remove when all captured as SectionHeaderItem:
186+
if isinstance(item, SectionHeaderItem) or (
189187
isinstance(item, TextItem)
190-
and item.label == DocItemLabel.SECTION_HEADER
188+
and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
191189
):
192-
# TODO second branch not needed once cleanup above complete:
193-
level = item.level if isinstance(item, SectionHeaderItem) else 1
190+
level = (
191+
item.level
192+
if isinstance(item, SectionHeaderItem)
193+
else (0 if item.label == DocItemLabel.TITLE else 1)
194+
)
194195
heading_by_level[level] = item.text
195196

196197
# remove headings of higher level as they just went out of scope

test/data/chunker/0_inp_dl_doc.json

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -547,7 +547,7 @@
547547
"$ref": "#/body"
548548
},
549549
"children": [],
550-
"label": "section_header",
550+
"label": "title",
551551
"prov": [
552552
{
553553
"page_no": 1,
@@ -652,7 +652,7 @@
652652
},
653653
"children": [],
654654
"label": "section_header",
655-
"level": 2,
655+
"level": 1,
656656
"prov": [
657657
{
658658
"page_no": 1,
@@ -705,6 +705,7 @@
705705
},
706706
"children": [],
707707
"label": "section_header",
708+
"level": 1,
708709
"prov": [
709710
{
710711
"page_no": 1,
@@ -1017,6 +1018,7 @@
10171018
},
10181019
"children": [],
10191020
"label": "section_header",
1021+
"level": 1,
10201022
"prov": [
10211023
{
10221024
"page_no": 2,
@@ -1147,6 +1149,7 @@
11471149
},
11481150
"children": [],
11491151
"label": "section_header",
1152+
"level": 1,
11501153
"prov": [
11511154
{
11521155
"page_no": 2,
@@ -1199,6 +1202,7 @@
11991202
},
12001203
"children": [],
12011204
"label": "section_header",
1205+
"level": 2,
12021206
"prov": [
12031207
{
12041208
"page_no": 2,
@@ -1381,6 +1385,7 @@
13811385
},
13821386
"children": [],
13831387
"label": "section_header",
1388+
"level": 2,
13841389
"prov": [
13851390
{
13861391
"page_no": 3,
@@ -1433,6 +1438,7 @@
14331438
},
14341439
"children": [],
14351440
"label": "section_header",
1441+
"level": 3,
14361442
"prov": [
14371443
{
14381444
"page_no": 3,
@@ -1511,6 +1517,7 @@
15111517
},
15121518
"children": [],
15131519
"label": "section_header",
1520+
"level": 3,
15141521
"prov": [
15151522
{
15161523
"page_no": 3,
@@ -1615,6 +1622,7 @@
16151622
},
16161623
"children": [],
16171624
"label": "section_header",
1625+
"level": 3,
16181626
"prov": [
16191627
{
16201628
"page_no": 4,
@@ -1693,6 +1701,7 @@
16931701
},
16941702
"children": [],
16951703
"label": "section_header",
1704+
"level": 2,
16961705
"prov": [
16971706
{
16981707
"page_no": 4,
@@ -1745,6 +1754,7 @@
17451754
},
17461755
"children": [],
17471756
"label": "section_header",
1757+
"level": 2,
17481758
"prov": [
17491759
{
17501760
"page_no": 4,
@@ -1823,6 +1833,7 @@
18231833
},
18241834
"children": [],
18251835
"label": "section_header",
1836+
"level": 1,
18261837
"prov": [
18271838
{
18281839
"page_no": 4,
@@ -2005,6 +2016,7 @@
20052016
},
20062017
"children": [],
20072018
"label": "section_header",
2019+
"level": 1,
20082020
"prov": [
20092021
{
20102022
"page_no": 5,
@@ -2057,6 +2069,7 @@
20572069
},
20582070
"children": [],
20592071
"label": "section_header",
2072+
"level": 1,
20602073
"prov": [
20612074
{
20622075
"page_no": 5,
@@ -2135,6 +2148,7 @@
21352148
},
21362149
"children": [],
21372150
"label": "section_header",
2151+
"level": 1,
21382152
"prov": [
21392153
{
21402154
"page_no": 5,
@@ -2655,6 +2669,7 @@
26552669
},
26562670
"children": [],
26572671
"label": "section_header",
2672+
"level": 1,
26582673
"prov": [
26592674
{
26602675
"page_no": 7,

0 commit comments

Comments
 (0)