Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: add the contentlayer to html-backend #1040

Merged
Merged 6 commits on Mar 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 62 additions & 14 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
Expand Down Expand Up @@ -66,7 +67,8 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}."
"Could not initialize HTML backend for file with "
f"hash {self.document_hash}."
) from e

@override
Expand Down Expand Up @@ -109,14 +111,21 @@ def convert(self) -> DoclingDocument:
# TODO: remove style to avoid losing text from tags like i, b, span, ...
for br in content("br"):
br.replace_with(NavigableString("\n"))

headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
f"Cannot convert doc with {self.document_hash} because the backend "
"failed to init."
)
return doc

def walk(self, tag: Tag, doc: DoclingDocument) -> None:

# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
Expand All @@ -143,8 +152,9 @@ def walk(self, tag: Tag, doc: DoclingDocument) -> None:
if text and tag.name in ["div"]:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.PARAGRAPH,
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
text = ""

Expand All @@ -166,7 +176,7 @@ def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
elif tag.name == "figure":
self.handle_figure(tag, doc)
elif tag.name == "img":
self.handle_image(doc)
self.handle_image(tag, doc)
else:
self.walk(tag, doc)

Expand Down Expand Up @@ -197,12 +207,17 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
text = element.text.strip()

if hlevel == 1:
self.content_layer = ContentLayer.BODY

for key in self.parents.keys():
self.parents[key] = None

self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
)
else:
if hlevel > self.level:
Expand All @@ -213,6 +228,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
content_layer=self.content_layer,
)
self.level = hlevel

Expand All @@ -228,6 +244,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
content_layer=self.content_layer,
)

def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
Expand All @@ -236,24 +253,35 @@ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
return
text = element.text.strip()
if text:
doc.add_code(parent=self.parents[self.level], text=text)
doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)

def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
if text:
doc.add_text(parent=self.parents[self.level], label=label, text=text)
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)

def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""

if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
)
elif element.name == "ol":
start_attr = element.get("start")
Expand All @@ -267,6 +295,7 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
parent=self.parents[self.level],
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer,
)
self.level += 1

Expand Down Expand Up @@ -315,6 +344,7 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer,
)
self.level += 1

Expand All @@ -336,6 +366,7 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer,
)
else:
_log.debug(f"list-item has no text: {element}")
Expand Down Expand Up @@ -439,7 +470,11 @@ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
table_data = HTMLDocumentBackend.parse_table_data(element)

if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level])
doc.add_table(
data=table_data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)

def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation."""
Expand Down Expand Up @@ -479,20 +514,33 @@ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:

contains_captions = element.find(["figcaption"])
if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None)
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
else:
texts = []
for item in contains_captions:
texts.append(item.text)

fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
label=DocItemLabel.CAPTION,
text=("".join(texts)).strip(),
content_layer=self.content_layer,
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
content_layer=self.content_layer,
)

def handle_image(self, doc: DoclingDocument) -> None:
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)
_log.debug(f"ignoring <img> tags at the moment: {element}")

doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/example_01.html.itxt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Introduction
item-2 at level 2: paragraph: This is the first paragraph of the introduction.
item-2 at level 2: text: This is the first paragraph of the introduction.
item-3 at level 2: section_header: Background
item-4 at level 3: paragraph: Some background information here.
item-4 at level 3: text: Some background information here.
item-5 at level 3: picture
item-6 at level 3: list: group list
item-7 at level 4: list_item: First item in unordered list
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/example_01.html.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
Expand Down Expand Up @@ -126,7 +126,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/example_02.html.itxt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Introduction
item-2 at level 2: paragraph: This is the first paragraph of the introduction.
item-2 at level 2: text: This is the first paragraph of the introduction.
item-3 at level 2: section_header: Background
item-4 at level 3: paragraph: Some background information here.
item-4 at level 3: text: Some background information here.
item-5 at level 3: list: group list
item-6 at level 4: list_item: First item in unordered list
item-7 at level 4: list_item: Second item in unordered list
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/example_02.html.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
Expand Down Expand Up @@ -123,7 +123,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/example_03.html.itxt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Example Document
item-2 at level 2: section_header: Introduction
item-3 at level 3: paragraph: This is the first paragraph of the introduction.
item-3 at level 3: text: This is the first paragraph of the introduction.
item-4 at level 2: section_header: Background
item-5 at level 3: paragraph: Some background information here.
item-5 at level 3: text: Some background information here.
item-6 at level 3: list: group list
item-7 at level 4: list_item: First item in unordered list
item-8 at level 5: list: group list
Expand Down
4 changes: 2 additions & 2 deletions tests/data/groundtruth/docling_v2/example_03.html.json
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
Expand Down Expand Up @@ -177,7 +177,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."
Expand Down
10 changes: 5 additions & 5 deletions tests/data/groundtruth/docling_v2/example_06.html.itxt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: This is a div with text.
item-2 at level 1: paragraph: This is another div with text.
item-3 at level 1: paragraph: This is a regular paragraph.
item-4 at level 1: paragraph: This is a third div
item-1 at level 1: text: This is a div with text.
item-2 at level 1: text: This is another div with text.
item-3 at level 1: text: This is a regular paragraph.
item-4 at level 1: text: This is a third div
with a new line.
item-5 at level 1: paragraph: This is a fourth div with a bold paragraph.
item-5 at level 1: text: This is a fourth div with a bold paragraph.
10 changes: 5 additions & 5 deletions tests/data/groundtruth/docling_v2/example_06.html.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a div with text.",
"text": "This is a div with text."
Expand All @@ -58,7 +58,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is another div with text.",
"text": "This is another div with text."
Expand All @@ -70,7 +70,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a regular paragraph.",
"text": "This is a regular paragraph."
Expand All @@ -82,7 +82,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a third div\nwith a new line.",
"text": "This is a third div\nwith a new line."
Expand All @@ -94,7 +94,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is a fourth div with a bold paragraph.",
"text": "This is a fourth div with a bold paragraph."
Expand Down
Loading