Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/update html parser with h1 #240

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 102 additions & 31 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from bs4 import BeautifulSoup
from bs4.element import Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
Expand All @@ -21,7 +23,12 @@


class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
skip_furniture: bool = True,
):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
self.soup = None
Expand All @@ -35,17 +42,21 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.parents[i] = None
self.labels = {} # type: ignore

self.skip_furniture = skip_furniture

try:
if isinstance(self.path_or_stream, BytesIO):
_log.debug("reading from BytesIO")
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
_log.debug("reading from file")
with open(self.path_or_stream, "r", encoding="utf-8") as fr:
html_content = fr.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}."
f"Could not initialize HTML backend for file with hash '{self.document_hash}'."
) from e

def is_valid(self) -> bool:
Expand Down Expand Up @@ -81,6 +92,10 @@ def convert(self) -> DoclingDocument:
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")

self.contains_h1 = bool(self.soup.find("h1")) and self.skip_furniture
self.detected_h1 = False

doc = self.walk(self.soup.body, doc)
else:
raise RuntimeError(
Expand All @@ -90,46 +105,91 @@ def convert(self) -> DoclingDocument:

def walk(self, element, doc):
    """Recursively traverse *element*, dispatching each node to analyse_element.

    Tags with children are iterated child-by-child; childless tags are
    analysed directly; any other node type (e.g. NavigableString) is ignored.
    Traversal is best-effort: errors in a subtree are logged and swallowed so
    one bad branch does not abort the whole conversion.

    Returns the (mutated) document, enabling call-chaining.
    """
    try:
        if isinstance(element, Tag) and any(element.children):
            # Iterate over the element's children in document order.
            for idx, child in enumerate(element.children):
                try:
                    self.analyse_element(child, idx, doc)
                except Exception as exc:
                    _log.info(f" -> error treating child: {exc}")
                    raise exc
        elif isinstance(element, Tag):
            # Childless tag: analyse it directly.
            try:
                self.analyse_element(element, 0, doc)
            except Exception as exc:
                _log.info(f" -> error treating elem: {exc}")
                raise exc
        else:
            # Non-Tag nodes (strings, comments) carry no structure here.
            _log.debug(f"ignoring element of type {type(element)}")

    except Exception:
        # Swallow deliberately: traversal is best-effort by design.
        _log.debug(f"error walking element: {type(element)}")

    return doc

def is_body(self):
    """Return True once body content should be emitted.

    When the document contains an <h1> (and furniture-skipping set
    contains_h1), everything before the first detected <h1> is treated as
    page furniture and suppressed; afterwards content is emitted. Documents
    without an <h1> are emitted in full.
    """
    # (not A) or (A and B) simplifies to (not A) or B — same truth table.
    return (not self.contains_h1) or self.detected_h1

def analyse_element(self, element, idx, doc):
    """Inspect one node and route it to the matching content handler.

    Content handlers (headers, paragraphs, lists, tables, figures, images,
    svg) run only when is_body() is True, i.e. after the first <h1> when
    furniture-skipping applies. A <section data-content="..."> gets its
    embedded HTML parsed and analysed recursively; any other tag is walked
    into via walk().
    """
    if element.name is not None:
        # f-string instead of positional varargs: logging treats extra
        # positional args as %-format arguments, not as text to join.
        _log.debug("\t" * self.level + f"{idx}\t{element.name} ({self.level})")

    # Histogram of encountered tag names (diagnostics only).
    if element.name in self.labels:
        self.labels[element.name] += 1
    else:
        self.labels[element.name] = 1

    if element.name in ["h1"]:
        # The first <h1> flips the furniture/body switch.
        self.detected_h1 = True

    if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        if self.is_body():
            self.handle_header(element, idx, doc)
    elif element.name in ["p"]:
        if self.is_body():
            self.handle_paragraph(element, idx, doc)
    elif element.name in ["ul", "ol"]:
        if self.is_body():
            self.handle_list(element, idx, doc)
    elif element.name in ["li"]:
        if self.is_body():
            self.handle_listitem(element, idx, doc)
    elif element.name == "table":
        if self.is_body():
            self.handle_table(element, idx, doc)
    elif element.name == "figure":
        if self.is_body():
            self.handle_figure(element, idx, doc)
    elif element.name == "img":
        if self.is_body():
            self.handle_image(element, idx, doc)
    elif element.name == "svg":
        if self.is_body():
            self.handle_svg(element, idx, doc)

    elif (
        isinstance(element, Tag)
        and element.name in ["section"]
        and element.has_attr("data-content")
    ):
        try:
            # The data-content attribute carries embedded HTML; parse it
            # and analyse the resulting top-level nodes recursively.
            data_content = element["data-content"]
            content_soup = BeautifulSoup(data_content, "html.parser")
            for jdx, content in enumerate(content_soup):
                self.analyse_element(content, jdx, doc)
        except Exception:
            # Narrowed from a bare except; malformed embedded HTML is
            # skipped on a best-effort basis.
            _log.debug("could not parse the `data-content` attribute")

        # Also walk the section's own children.
        self.walk(element, doc)

    else:
        self.walk(element, doc)

def handle_paragraph(self, element, idx, doc):
    """Handles paragraph tags (p): emit non-empty text as a paragraph item."""
    if element.text is None:
        return

    text = element.text.strip()
    if len(text) == 0:
        # Skip whitespace-only paragraphs.
        return

    label = DocItemLabel.PARAGRAPH
    doc.add_text(parent=self.parents[self.level], label=label, text=text)

def handle_list(self, element, idx, doc):
Expand Down Expand Up @@ -250,8 +312,9 @@ def handle_listitem(self, element, idx, doc):
# we need to extract it recursively
text = self.extract_text_recursively(element)
# Flatten text, remove break lines:
text = text.replace("\n", "").replace("\r", "")
text = text.replace("\n", " ").replace("\r", "")
text = " ".join(text.split()).strip()
text = re.sub(r"\s{2,}", " ", text)

marker = ""
enumerated = False
Expand All @@ -276,18 +339,22 @@ def handle_listitem(self, element, idx, doc):

elif isinstance(element.text, str):
text = element.text.strip()
text = text.replace("\n", " ").replace("\r", "")
text = re.sub(r"\s{2,}", " ", text)

marker = ""
enumerated = False
if parent_list_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)

if len(text) > 0:
doc.add_list_item(
text=text,
enumerated=enumerated,
marker=marker,
parent=self.parents[self.level],
)
else:
_log.warn("list-item has no text: ", element)

Expand Down Expand Up @@ -427,3 +494,7 @@ def handle_figure(self, element, idx, doc):
def handle_image(self, element, idx, doc):
    """Handles image tags (img)."""
    # Anchor an uncaptioned picture at the currently-open parent node.
    current_parent = self.parents[self.level]
    doc.add_picture(parent=current_parent, caption=None)

def handle_svg(self, element, idx, doc):
    """Handles svg tags."""
    # Inline SVG is represented the same way as a raster image:
    # an uncaptioned picture under the current parent.
    current_parent = self.parents[self.level]
    doc.add_picture(parent=current_parent, caption=None)
Loading