DS4SD · PeterStaar-IBM · Mar 2, 2025 · Feb 23, 2025 · Feb 24, 2025 · Feb 24, 2025
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
@@ -15,6 +15,7 @@
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -66,7 +67,8 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
                     self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
             raise RuntimeError(
-                f"Could not initialize HTML backend for file with hash {self.document_hash}."
+                "Could not initialize HTML backend for file with "
+                f"hash {self.document_hash}."
             ) from e
 
     @override
@@ -109,14 +111,21 @@ def convert(self) -> DoclingDocument:
             # TODO: remove style to avoid losing text from tags like i, b, span, ...
             for br in content("br"):
                 br.replace_with(NavigableString("\n"))
+
+            headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+            self.content_layer = (
+                ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            )
             self.walk(content, doc)
         else:
             raise RuntimeError(
-                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+                f"Cannot convert doc with {self.document_hash} because the backend "
+                "failed to init."
             )
         return doc
 
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+
         # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:
@@ -143,8 +152,9 @@ def walk(self, tag: Tag, doc: DoclingDocument) -> None:
                     if text and tag.name in ["div"]:
                         doc.add_text(
                             parent=self.parents[self.level],
-                            label=DocItemLabel.PARAGRAPH,
+                            label=DocItemLabel.TEXT,
                             text=text,
+                            content_layer=self.content_layer,
                         )
                     text = ""
 
@@ -166,7 +176,7 @@ def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
         elif tag.name == "figure":
             self.handle_figure(tag, doc)
         elif tag.name == "img":
-            self.handle_image(doc)
+            self.handle_image(tag, doc)
         else:
             self.walk(tag, doc)
 
@@ -197,12 +207,17 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
         text = element.text.strip()
 
         if hlevel == 1:
+            self.content_layer = ContentLayer.BODY
+
             for key in self.parents.keys():
                 self.parents[key] = None
 
             self.level = 1
             self.parents[self.level] = doc.add_text(
-                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+                parent=self.parents[0],
+                label=DocItemLabel.TITLE,
+                text=text,
+                content_layer=self.content_layer,
             )
         else:
             if hlevel > self.level:
@@ -213,6 +228,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
                         name=f"header-{i}",
                         label=GroupLabel.SECTION,
                         parent=self.parents[i - 1],
+                        content_layer=self.content_layer,
                     )
                 self.level = hlevel
 
@@ -228,6 +244,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
                 parent=self.parents[hlevel - 1],
                 text=text,
                 level=hlevel,
+                content_layer=self.content_layer,
             )
 
     def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -236,24 +253,35 @@ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
             return
         text = element.text.strip()
         if text:
-            doc.add_code(parent=self.parents[self.level], text=text)
+            doc.add_code(
+                parent=self.parents[self.level],
+                text=text,
+                content_layer=self.content_layer,
+            )
 
     def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles paragraph tags (p)."""
         if element.text is None:
             return
         text = element.text.strip()
-        label = DocItemLabel.PARAGRAPH
         if text:
-            doc.add_text(parent=self.parents[self.level], label=label, text=text)
+            doc.add_text(
+                parent=self.parents[self.level],
+                label=DocItemLabel.TEXT,
+                text=text,
+                content_layer=self.content_layer,
+            )
 
     def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles list tags (ul, ol) and their list items."""
 
         if element.name == "ul":
             # create a list group
             self.parents[self.level + 1] = doc.add_group(
-                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+                parent=self.parents[self.level],
+                name="list",
+                label=GroupLabel.LIST,
+                content_layer=self.content_layer,
             )
         elif element.name == "ol":
             start_attr = element.get("start")
@@ -267,6 +295,7 @@ def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
                 parent=self.parents[self.level],
                 name="ordered list" + (f" start {start}" if start != 1 else ""),
                 label=GroupLabel.ORDERED_LIST,
+                content_layer=self.content_layer,
             )
         self.level += 1
 
@@ -315,6 +344,7 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
                     enumerated=enumerated,
                     marker=marker,
                     parent=parent,
+                    content_layer=self.content_layer,
                 )
                 self.level += 1
 
@@ -336,6 +366,7 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
                 enumerated=enumerated,
                 marker=marker,
                 parent=parent,
+                content_layer=self.content_layer,
             )
         else:
             _log.debug(f"list-item has no text: {element}")
@@ -439,7 +470,11 @@ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
         table_data = HTMLDocumentBackend.parse_table_data(element)
 
         if table_data is not None:
-            doc.add_table(data=table_data, parent=self.parents[self.level])
+            doc.add_table(
+                data=table_data,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )
 
     def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
         """Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -479,20 +514,33 @@ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
 
         contains_captions = element.find(["figcaption"])
         if not isinstance(contains_captions, Tag):
-            doc.add_picture(parent=self.parents[self.level], caption=None)
+            doc.add_picture(
+                parent=self.parents[self.level],
+                caption=None,
+                content_layer=self.content_layer,
+            )
         else:
             texts = []
             for item in contains_captions:
                 texts.append(item.text)
 
             fig_caption = doc.add_text(
-                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
+                label=DocItemLabel.CAPTION,
+                text=("".join(texts)).strip(),
+                content_layer=self.content_layer,
             )
             doc.add_picture(
                 parent=self.parents[self.level],
                 caption=fig_caption,
+                content_layer=self.content_layer,
             )
 
-    def handle_image(self, doc: DoclingDocument) -> None:
+    def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles image tags (img)."""
-        doc.add_picture(parent=self.parents[self.level], caption=None)
+        _log.debug(f"ignoring <img> tags at the moment: {element}")
+
+        doc.add_picture(
+            parent=self.parents[self.level],
+            caption=None,
+            content_layer=self.content_layer,
+        )
diff --git a/tests/data/groundtruth/docling_v2/example_01.html.itxt b/tests/data/groundtruth/docling_v2/example_01.html.itxt
@@ -1,8 +1,8 @@
 item-0 at level 0: unspecified: group _root_
   item-1 at level 1: title: Introduction
-    item-2 at level 2: paragraph: This is the first paragraph of the introduction.
+    item-2 at level 2: text: This is the first paragraph of the introduction.
     item-3 at level 2: section_header: Background
-      item-4 at level 3: paragraph: Some background information here.
+      item-4 at level 3: text: Some background information here.
       item-5 at level 3: picture
       item-6 at level 3: list: group list
         item-7 at level 4: list_item: First item in unordered list

diff --git a/tests/data/groundtruth/docling_v2/example_01.html.json b/tests/data/groundtruth/docling_v2/example_01.html.json
@@ -88,7 +88,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is the first paragraph of the introduction.",
       "text": "This is the first paragraph of the introduction."
@@ -126,7 +126,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "Some background information here.",
       "text": "Some background information here."

diff --git a/tests/data/groundtruth/docling_v2/example_02.html.itxt b/tests/data/groundtruth/docling_v2/example_02.html.itxt
@@ -1,8 +1,8 @@
 item-0 at level 0: unspecified: group _root_
   item-1 at level 1: title: Introduction
-    item-2 at level 2: paragraph: This is the first paragraph of the introduction.
+    item-2 at level 2: text: This is the first paragraph of the introduction.
     item-3 at level 2: section_header: Background
-      item-4 at level 3: paragraph: Some background information here.
+      item-4 at level 3: text: Some background information here.
       item-5 at level 3: list: group list
         item-6 at level 4: list_item: First item in unordered list
         item-7 at level 4: list_item: Second item in unordered list

diff --git a/tests/data/groundtruth/docling_v2/example_02.html.json b/tests/data/groundtruth/docling_v2/example_02.html.json
@@ -88,7 +88,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is the first paragraph of the introduction.",
       "text": "This is the first paragraph of the introduction."
@@ -123,7 +123,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "Some background information here.",
       "text": "Some background information here."

diff --git a/tests/data/groundtruth/docling_v2/example_03.html.itxt b/tests/data/groundtruth/docling_v2/example_03.html.itxt
@@ -1,9 +1,9 @@
 item-0 at level 0: unspecified: group _root_
   item-1 at level 1: title: Example Document
     item-2 at level 2: section_header: Introduction
-      item-3 at level 3: paragraph: This is the first paragraph of the introduction.
+      item-3 at level 3: text: This is the first paragraph of the introduction.
     item-4 at level 2: section_header: Background
-      item-5 at level 3: paragraph: Some background information here.
+      item-5 at level 3: text: Some background information here.
       item-6 at level 3: list: group list
         item-7 at level 4: list_item: First item in unordered list
           item-8 at level 5: list: group list

diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json
@@ -142,7 +142,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is the first paragraph of the introduction.",
       "text": "This is the first paragraph of the introduction."
@@ -177,7 +177,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "Some background information here.",
       "text": "Some background information here."

diff --git a/tests/data/groundtruth/docling_v2/example_06.html.itxt b/tests/data/groundtruth/docling_v2/example_06.html.itxt
@@ -1,7 +1,7 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: paragraph: This is a div with text.
-  item-2 at level 1: paragraph: This is another div with text.
-  item-3 at level 1: paragraph: This is a regular paragraph.
-  item-4 at level 1: paragraph: This is a third div
+  item-1 at level 1: text: This is a div with text.
+  item-2 at level 1: text: This is another div with text.
+  item-3 at level 1: text: This is a regular paragraph.
+  item-4 at level 1: text: This is a third div
 with a new line.
-  item-5 at level 1: paragraph: This is a fourth div with a bold paragraph.
+  item-5 at level 1: text: This is a fourth div with a bold paragraph.
diff --git a/tests/data/groundtruth/docling_v2/example_06.html.json b/tests/data/groundtruth/docling_v2/example_06.html.json
@@ -46,7 +46,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is a div with text.",
       "text": "This is a div with text."
@@ -58,7 +58,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is another div with text.",
       "text": "This is another div with text."
@@ -70,7 +70,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is a regular paragraph.",
       "text": "This is a regular paragraph."
@@ -82,7 +82,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is a third div\nwith a new line.",
       "text": "This is a third div\nwith a new line."
@@ -94,7 +94,7 @@
       },
       "children": [],
       "content_layer": "body",
-      "label": "paragraph",
+      "label": "text",
       "prov": [],
       "orig": "This is a fourth div with a bold paragraph.",
       "text": "This is a fourth div with a bold paragraph."