From 1ccaeaef92ae2ad85d9a3d697270c0ff15747b42 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Wed, 11 Sep 2024 10:26:48 +0200 Subject: [PATCH] make typing py3.9-compatible Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/chunker/hierarchical_chunker.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docling_core/chunker/hierarchical_chunker.py b/docling_core/chunker/hierarchical_chunker.py index 73b3881d..76f8f3aa 100644 --- a/docling_core/chunker/hierarchical_chunker.py +++ b/docling_core/chunker/hierarchical_chunker.py @@ -9,7 +9,7 @@ import logging from enum import Enum -from typing import Any, Iterator +from typing import Any, Iterator, Optional, Union import pandas as pd from pydantic import BaseModel, PositiveInt @@ -53,11 +53,11 @@ class _NodeName(str, Enum): } @classmethod - def _norm(cls, text: str | None) -> str | None: + def _norm(cls, text: Optional[str]) -> Optional[str]: return text.lower() if text is not None else None @classmethod - def _convert_table_to_dataframe(cls, table: Table) -> pd.DataFrame | None: + def _convert_table_to_dataframe(cls, table: Table) -> Optional[pd.DataFrame]: if table.data: table_content = [[cell.text for cell in row] for row in table.data] return pd.DataFrame(table_content) @@ -65,8 +65,8 @@ def _convert_table_to_dataframe(cls, table: Table) -> pd.DataFrame | None: return None @classmethod - def _triplet_serialize(cls, table) -> str | None: - output_text: str | None = None + def _triplet_serialize(cls, table) -> Optional[str]: + output_text: Optional[str] = None table_df = cls._convert_table_to_dataframe(table) if table_df is not None and table_df.shape[0] > 1 and table_df.shape[1] > 1: rows = [item.strip() for item in table_df.iloc[:, 0].to_list()] @@ -87,7 +87,7 @@ def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str: return f"$.{path_prefix}[{pos}]" class _MainTextItemNode(BaseModel): - parent: int | None = None + parent: Optional[int] = None children: list[int] = [] class _TitleInfo(BaseModel): @@ -95,7 +95,7 @@ class _TitleInfo(BaseModel): path_in_doc: str class _GlobalContext(BaseModel): - title: _HC._TitleInfo | None = None + title: Optional[_HC._TitleInfo] = None class _DocContext(BaseModel): dmap: dict[int, _HC._MainTextItemNode] # main text element context @@ -276,13 +276,13 @@ def _build_chunk( idx: int, delim: str, rec: bool = False, - ) -> Chunk | None: + ) -> Optional[Chunk]: texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec) concat = delim.join([t.text for t in texts if t.text]) assert doc.main_text is not None if len(concat) >= self.min_chunk_len: orig_item = doc.main_text[idx] - item: BaseText | Table + item: Union[BaseText, Table] if isinstance(orig_item, Ref): if _HC._norm(orig_item.obj_type) == _HC._NodeType.TABLE and doc.tables: pos = int(orig_item.ref.split("/")[2])