Skip to content

Commit

Permalink
fix: include titles to chunk heading metadata (#62)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Nov 1, 2024
1 parent 1126410 commit bfeb2db
Show file tree
Hide file tree
Showing 4 changed files with 289 additions and 8 deletions.
13 changes: 7 additions & 6 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,14 +183,15 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
)
list_items = [] # reset

- if isinstance(
- item, SectionHeaderItem
- ) or ( # TODO remove when all captured as SectionHeaderItem:
+ if isinstance(item, SectionHeaderItem) or (
  isinstance(item, TextItem)
- and item.label == DocItemLabel.SECTION_HEADER
+ and item.label in [DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE]
  ):
- # TODO second branch not needed once cleanup above complete:
- level = item.level if isinstance(item, SectionHeaderItem) else 1
+ level = (
+ item.level
+ if isinstance(item, SectionHeaderItem)
+ else (0 if item.label == DocItemLabel.TITLE else 1)
+ )
heading_by_level[level] = item.text

# remove headings of higher level as they just went out of scope
Expand Down
19 changes: 17 additions & 2 deletions test/data/chunker/0_inp_dl_doc.json
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,7 @@
"$ref": "#/body"
  },
  "children": [],
- "label": "section_header",
+ "label": "title",
"prov": [
{
"page_no": 1,
Expand Down Expand Up @@ -652,7 +652,7 @@
  },
  "children": [],
  "label": "section_header",
- "level": 2,
+ "level": 1,
"prov": [
{
"page_no": 1,
Expand Down Expand Up @@ -705,6 +705,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 1,
Expand Down Expand Up @@ -1017,6 +1018,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 2,
Expand Down Expand Up @@ -1147,6 +1149,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 2,
Expand Down Expand Up @@ -1199,6 +1202,7 @@
},
"children": [],
"label": "section_header",
"level": 2,
"prov": [
{
"page_no": 2,
Expand Down Expand Up @@ -1381,6 +1385,7 @@
},
"children": [],
"label": "section_header",
"level": 2,
"prov": [
{
"page_no": 3,
Expand Down Expand Up @@ -1433,6 +1438,7 @@
},
"children": [],
"label": "section_header",
"level": 3,
"prov": [
{
"page_no": 3,
Expand Down Expand Up @@ -1511,6 +1517,7 @@
},
"children": [],
"label": "section_header",
"level": 3,
"prov": [
{
"page_no": 3,
Expand Down Expand Up @@ -1615,6 +1622,7 @@
},
"children": [],
"label": "section_header",
"level": 3,
"prov": [
{
"page_no": 4,
Expand Down Expand Up @@ -1693,6 +1701,7 @@
},
"children": [],
"label": "section_header",
"level": 2,
"prov": [
{
"page_no": 4,
Expand Down Expand Up @@ -1745,6 +1754,7 @@
},
"children": [],
"label": "section_header",
"level": 2,
"prov": [
{
"page_no": 4,
Expand Down Expand Up @@ -1823,6 +1833,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 4,
Expand Down Expand Up @@ -2005,6 +2016,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 5,
Expand Down Expand Up @@ -2057,6 +2069,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 5,
Expand Down Expand Up @@ -2135,6 +2148,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 5,
Expand Down Expand Up @@ -2655,6 +2669,7 @@
},
"children": [],
"label": "section_header",
"level": 1,
"prov": [
{
"page_no": 7,
Expand Down
Loading

0 comments on commit bfeb2db

Please sign in to comment.