Skip to content

Commit

Permalink
reformatted code of html backend
Browse files (browse the repository at this point in the history)
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Feb 24, 2025
1 parent ecf20cb commit e96ed30
Showing 1 changed file with 47 additions and 19 deletions.
66 changes: 47 additions & 19 deletions docling/backend/html_backend.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,8 +4,6 @@
from typing import Optional, Union, cast

from bs4 import BeautifulSoup, NavigableString, PageElement, Tag

from docling_core.types.doc.document import (ContentLayer)
from docling_core.types.doc import (
DocItem,
DocItemLabel,
Expand All @@ -16,6 +14,7 @@
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
Expand Down Expand Up @@ -88,7 +87,7 @@ def convert(self) -> DoclingDocument:

if self.is_valid():
self.content_layer = ContentLayer.FURNITURE

assert self.soup is not None
content = self.soup.body or self.soup
# Replace <br> tags with newline characters
Expand Down Expand Up @@ -164,13 +163,16 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:

if hlevel == 1:
self.content_layer = ContentLayer.BODY

for key, val in self.parents.items():
self.parents[key] = None

self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text, content_layer=self.content_layer
parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
)
else:
if hlevel > self.level:
Expand All @@ -181,7 +183,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
content_layer=self.content_layer
content_layer=self.content_layer,
)
self.level = hlevel

Expand All @@ -197,7 +199,7 @@ def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
content_layer=self.content_layer
content_layer=self.content_layer,
)

def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
Expand All @@ -206,7 +208,11 @@ def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
return
text = element.text.strip()
if text:
doc.add_code(parent=self.parents[self.level], text=text, content_layer=self.content_layer)
doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)

def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
Expand All @@ -215,23 +221,31 @@ def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
text = element.text.strip()
label = DocItemLabel.TEXT
if text:
doc.add_text(parent=self.parents[self.level], label=label, text=text, content_layer=self.content_layer)
doc.add_text(
parent=self.parents[self.level],
label=label,
text=text,
content_layer=self.content_layer,
)

def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""

if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST, content_layer=self.content_layer
parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
)
elif element.name == "ol":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer
content_layer=self.content_layer,
)
self.level += 1

Expand Down Expand Up @@ -272,7 +286,7 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer
content_layer=self.content_layer,
)
self.level += 1

Expand All @@ -294,7 +308,7 @@ def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer
content_layer=self.content_layer,
)
else:
_log.warning(f"list-item has no text: {element}")
Expand Down Expand Up @@ -398,7 +412,11 @@ def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
table_data = HTMLDocumentBackend.parse_table_data(element)

if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level], content_layer=self.content_layer)
doc.add_table(
data=table_data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)

def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation."""
Expand Down Expand Up @@ -438,23 +456,33 @@ def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:

contains_captions = element.find(["figcaption"])
if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
else:
texts = []
for item in contains_captions:
texts.append(item.text)

fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip(), content_layer=self.content_layer
label=DocItemLabel.CAPTION,
text=("".join(texts)).strip(),
content_layer=self.content_layer,
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
content_layer=self.content_layer
content_layer=self.content_layer,
)

def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
_log.warning(f"ignoring <img> tags at the moment: {element}")

doc.add_picture(parent=self.parents[self.level], caption=None, content_layer=self.content_layer)

doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)

0 comments on commit e96ed30

Please sign in to comment.