Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add export to xml and html #17

Merged
merged 13 commits into from
Sep 18, 2024
Prev Previous commit
Next Next commit
added the DocumentTokens class
Signed-off-by: Peter Staar <[email protected]>
PeterStaar-IBM committed Sep 9, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 4ca481e80ccf390c17309a06b15189865396f552
67 changes: 67 additions & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
"""Models for the Docling Document data type."""

from datetime import datetime
from enum import Enum
from typing import Generic, Optional, Union

from pydantic import (
@@ -346,6 +347,72 @@ def from_dict(cls, data):
return data


class DocumentToken(Enum):
    """Enumeration of the markup tokens used for an LLM friendly Document."""

    # Document envelope.
    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    # Front-matter fields.
    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    # Body elements.
    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(cls):
        """Return all special document tokens as a list of strings.

        The list holds the enum values in declaration order, then the
        indexed row/col tokens for indices 0-99, then the indexed
        section-header tokens for indices 0-5.
        """
        # Fixed tokens, one per enum member.
        fixed_tokens = [member.value for member in cls]

        # Per index: opening row, closing row, opening col, closing col.
        grid_tokens = [
            tag
            for idx in range(100)
            for tag in (
                f"<row_{idx}>",
                f"</row_{idx}>",
                f"<col_{idx}>",
                f"</col_{idx}>",
            )
        ]

        # Per index: opening and closing section-header tag.
        header_tokens = [
            tag
            for level in range(6)
            for tag in (
                f"<section-header-{level}>",
                f"</section-header-{level}>",
            )
        ]

        return fixed_tokens + grid_tokens + header_tokens


class ExportedCCSDocument(
MinimalDocument,
Generic[
34 changes: 34 additions & 0 deletions docling_core/types/rec/statement.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
#

"""Define the model Statement."""
from enum import Enum
from typing import Generic

from pydantic import Field
@@ -21,6 +22,39 @@
from docling_core.types.rec.subject import Subject


class StatementToken(Enum):
    """Enumeration of the markup tokens used for LLM friendly statements."""

    # Container for a collection of statements.
    BEG_STATEMENTS = "<statements>"
    END_STATEMENTS = "</statements>"

    # A single statement.
    BEG_STATEMENT = "<statement>"
    END_STATEMENT = "</statement>"

    BEG_PROV = "<prov>"
    END_PROV = "</prov>"

    BEG_SUBJECT = "<subject>"
    END_SUBJECT = "</subject>"

    BEG_PREDICATE = "<predicate>"
    END_PREDICATE = "</predicate>"

    BEG_PROPERTY = "<property>"
    END_PROPERTY = "</property>"

    BEG_VALUE = "<value>"
    END_VALUE = "</value>"

    BEG_UNIT = "<unit>"
    END_UNIT = "</unit>"

    @classmethod
    def get_special_tokens(cls):
        """Return all special statement tokens, in declaration order."""
        collected = []
        for member in cls:
            collected.append(member.value)
        return collected


class Statement(
Attribute,
Generic[