From 156a98a18882d53b1cb03c548ce7af80c11f5175 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sat, 26 Oct 2024 07:26:44 +0200 Subject: [PATCH 1/6] added ReadingOrder model Signed-off-by: Peter Staar --- .gitignore | 2 + .../reading_order/ReadingOrder.py | 159 ++++++++++++++++++ docling_ibm_models/reading_order/__init__.py | 0 3 files changed, 161 insertions(+) create mode 100644 docling_ibm_models/reading_order/ReadingOrder.py create mode 100644 docling_ibm_models/reading_order/__init__.py diff --git a/.gitignore b/.gitignore index 34c70f7..9474b33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +*~ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docling_ibm_models/reading_order/ReadingOrder.py b/docling_ibm_models/reading_order/ReadingOrder.py new file mode 100644 index 0000000..c62832c --- /dev/null +++ b/docling_ibm_models/reading_order/ReadingOrder.py @@ -0,0 +1,159 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# +import os +from collections.abc import Iterable +from typing import Union + +@dataclass +class PageElement: + + cid: int # conversion id + pid: int # page-id + + x0: float # lower-left x + y0: float # lower-left y + + x1: float # upper-right x + y1: float # upper-right y + + label: str # layout label + + def overlaps_x(self, other: PageElement) -> bool: + return True + + def overlaps_y(self, other: PageElement) -> bool: + return True + + def is_strictly_left_of(self, other: PageElement) -> bool: + return True + + def is_strictly_right_of(self, other: PageElement) -> bool: + return True + + def is_strictly_below(self, other: PageElement) -> bool: + return True + + def is_strictly_above(self, other: PageElement) -> bool: + return True + + def follows_maintext_order(self, other: PageElement) -> bool: + return True + +class ReadingOrder: + r""" + Rule based reading order for DoclingDocument + """ + + def __init__(self): + self.page_elements: Dict[int, List[PageElement]] = {} + + def predict(self, conv_res: ConversionResult) -> DoclingDocument: + r""" + Reorder the output of the + """ + doc_elems = self._to_page_elements(conv_res) + + for pid, page_elems in doc_elems.items(): + + h2i_map, i2h_map = init_h2i_map(page_elems) + + l2r_map, r2l_map = init_l2r_map(page_elems) + + up_map, dn_map = init_ud_maps(page_elems) + + doc = DoclingDocument() + return doc + + def _to_page_elements(self, conv_res:ConversionResult): + + self.page_elements = {} + self.page_elements = {p.page_no: [] for p in conv_res.pages} + + for elem_id, element in enumerate(conv_res.assembled.elements): + # Convert bboxes to lower-left origin. + bbox = DsBoundingBox( + element.cluster.bbox.to_bottom_left_origin( + page_no_to_page[element.page_no].size.height + ).as_tuple() + ) + + elem = PageElement(cid=cid, pid=element.page_no, + x0=bbox[0], y0=bbox[1], x1=bbox[2], y1=bbox[3], + label=element.label) + + self.page_elements[element.page_no].append(elem) + + def _init_h2i_map(self, page_elems): + h2i_map = {} + i2h_map = {} + + for i,pelem in enumerate(page_elems): + h2i_map[pelem.cid] = i + i2h_map[i] = pelem.cid + + return h2i_map, i2h_map + + def _init_l2r_map(self, page_elems): + l2r_map = {} + r2l_map = {} + + for i,pelem_i in enumerate(page_elems): + for j,pelem_j in enumerate(page_elems): + + if(pelem_i.follows_maintext_order(pelem_j) and + pelem_i.is_strictly_left_of(pelem_j) and + pelem_i.overlaps_y(pelem_j, 0.8)): + l2r_map[i] = j; + r2l_map[j] = i; + + return l2r_map, r2l_map + + def _init_ud_maps(self, page_elems): + up_map = {} + dn_map = {} + + for i,pelem_i in enumerate(page_elems): + up_map[i] = [] + dn_map[i] = [] + + for j,pelem_j in enumerate(page_elems): + + if(j in r2l_map): + i = r2l_map[j] + + dn_map[i] = [j] + up_map[j] = [i] + + continue + + for i,pelem_i in enumerate(page_elems): + + if i==j: + continue + + is_horizontally_connected:bool = False; + is_i_just_above_j:bool = (pelem_i.overlaps_x(pelem_j) and pelem_i.is_strictly_above(pelem_j)); + + for w,pelem_w in enumerate(page_elems): + + if(not is_horizontally_connected): + is_horizontally_connected = pelem_w.is_horizontally_connected(pelem_i, pelem_j); + + # ensure there is no other element that is between i and j vertically + if(is_i_just_above_j and (pelem_i.overlaps_x(pelem_w) or pelem_j.overlaps_x(pelem_w))): + i_above_w:bool = pelem_i.is_strictly_above(pelem_w); + w_above_j:bool = pelem_w.is_strictly_above(pelem_j); + + is_i_just_above_j:bool = (not (i_above_w and w_above_j)); + + if(is_i_just_above_j): + + while(i in l2r_map): + i = l2r_map[i]; + + dn_map[i].append(j) + up_map[j].append(i) + + return up_map, dn_map diff --git a/docling_ibm_models/reading_order/__init__.py b/docling_ibm_models/reading_order/__init__.py new file mode 100644 index 0000000..e69de29 From 93aaa57878e7385f2b8c36c494703455ef68d287 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sat, 26 Oct 2024 08:32:56 +0200 Subject: [PATCH 2/6] updated the ReadinOrder Signed-off-by: Peter Staar --- .../reading_order/ReadingOrder.py | 95 +++++++++++++++---- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/docling_ibm_models/reading_order/ReadingOrder.py b/docling_ibm_models/reading_order/ReadingOrder.py index c62832c..07caa66 100644 --- a/docling_ibm_models/reading_order/ReadingOrder.py +++ b/docling_ibm_models/reading_order/ReadingOrder.py @@ -9,6 +9,8 @@ @dataclass class PageElement: + eps: float = 1.e-3 + cid: int # conversion id pid: int # page-id @@ -20,27 +22,67 @@ class PageElement: label: str # layout label - def overlaps_x(self, other: PageElement) -> bool: - return True + def follows_maintext_order(self, rhs: PageElement) -> bool: + return (self.cid+1==rhs.cid) + + def overlaps(self, rhs: PageElement) -> bool: + return (self.overlaps_x(rhs) and self.overlaps_y(rhs)) - def overlaps_y(self, other: PageElement) -> bool: - return True + def overlaps_x(self, rhs: PageElement) -> bool: + return ((self.x0<=rhs.x0 and rhs.x0 bool: + return ((self.y0<=rhs.y0 and rhs.y0 bool: - return True + def overlaps_y_with_iou(self, rhs: PageElement, iou:float) -> bool: + return False + + def is_left_of(self, rhs: PageElement) -> bool: + return (self.x0 bool: + def is_strictly_left_of(self, rhs: PageElement) -> bool: + return (self.x1+self.eps bool: return True - def is_strictly_below(self, other: PageElement) -> bool: + def is_strictly_right_of(self, rhs: PageElement) -> bool: + return True + """ + + """ + def is_below(self, rhs: PageElement) -> bool: return True - def is_strictly_above(self, other: PageElement) -> bool: + def is_strictly_below(self, rhs: PageElement) -> bool: return True + """ + + def is_above(self, rhs: PageElement) -> bool: + return (self.y0>rhs.y0) + + def is_strictly_above(self, rhs: PageElement) -> bool: + (self.y0+self.eps>rhs.y1) - def follows_maintext_order(self, other: PageElement) -> bool: - return True + def is_horizontally_connected(self, elem_i: PageElement, elem_j: PageElement) -> bool: + min_ij:float = min(elem_i.y0, elem_j.y0) + max_ij:float = max(elem_i.y1, elem_j.y1) + + if(self.y0min_ij): # overlap_y + return False + if(self.x0elem_j.x0): + return True + + return False + class ReadingOrder: r""" Rule based reading order for DoclingDocument @@ -57,11 +99,13 @@ def predict(self, conv_res: ConversionResult) -> DoclingDocument: for pid, page_elems in doc_elems.items(): - h2i_map, i2h_map = init_h2i_map(page_elems) + h2i_map, i2h_map = self.init_h2i_map(page_elems) + + l2r_map, r2l_map = self.init_l2r_map(page_elems) - l2r_map, r2l_map = init_l2r_map(page_elems) + up_map, dn_map = self.init_ud_maps(page_elems) - up_map, dn_map = init_ud_maps(page_elems) + heads = self.find_heads(page_elems, h2i_map, i2h_map, up_map, dn_map) doc = DoclingDocument() return doc @@ -85,7 +129,7 @@ def _to_page_elements(self, conv_res:ConversionResult): self.page_elements[element.page_no].append(elem) - def _init_h2i_map(self, page_elems): + def _init_h2i_map(self, page_elems: List[PageElement]): h2i_map = {} i2h_map = {} @@ -95,7 +139,7 @@ def _init_h2i_map(self, page_elems): return h2i_map, i2h_map - def _init_l2r_map(self, page_elems): + def _init_l2r_map(self, page_elems: List[PageElement]): l2r_map = {} r2l_map = {} @@ -110,7 +154,7 @@ def _init_l2r_map(self, page_elems): return l2r_map, r2l_map - def _init_ud_maps(self, page_elems): + def _init_ud_maps(self, page_elems: List[PageElement]): up_map = {} dn_map = {} @@ -157,3 +201,20 @@ def _init_ud_maps(self, page_elems): up_map[j].append(i) return up_map, dn_map + + def find_heads(self, page_elems, h2i_map, i2h_map, up_map, dn_map): + heads:list[int] = [] + + head_provs = [] + for key,vals in up_map.items(): + if(len(vals)==0): + head_provs.append(page_elems[key]) + + sorted(head_provs, key=lambda); + + for item in head_provs.items(): + heads.append(h2i_map[item.cid)) + + return heads + + From 20fa95012fff5da70afdae84213d603f22cadc26 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sun, 27 Oct 2024 06:30:31 +0100 Subject: [PATCH 3/6] finished the first porting of the reading-order Signed-off-by: Peter Staar --- .../reading_order/ReadingOrder.py | 184 +++++++++++++++--- 1 file changed, 153 insertions(+), 31 deletions(-) diff --git a/docling_ibm_models/reading_order/ReadingOrder.py b/docling_ibm_models/reading_order/ReadingOrder.py index 07caa66..956d544 100644 --- a/docling_ibm_models/reading_order/ReadingOrder.py +++ b/docling_ibm_models/reading_order/ReadingOrder.py @@ -3,6 +3,9 @@ # SPDX-License-Identifier: MIT # import os + +import copy + from collections.abc import Iterable from typing import Union @@ -22,6 +25,16 @@ class PageElement: label: str # layout label + def __lt__(self, other): + if self.pid==other.pid: + + if self.overlaps_x(other): + return self.y0 > other.y0 + else: + return self.x0 < other.x0 + else: + return self.pid bool: return (self.cid+1==rhs.cid) @@ -49,22 +62,6 @@ def is_left_of(self, rhs: PageElement) -> bool: def is_strictly_left_of(self, rhs: PageElement) -> bool: return (self.x1+self.eps bool: - return True - - def is_strictly_right_of(self, rhs: PageElement) -> bool: - return True - """ - - """ - def is_below(self, rhs: PageElement) -> bool: - return True - - def is_strictly_below(self, rhs: PageElement) -> bool: - return True - """ - def is_above(self, rhs: PageElement) -> bool: return (self.y0>rhs.y0) @@ -99,14 +96,33 @@ def predict(self, conv_res: ConversionResult) -> DoclingDocument: for pid, page_elems in doc_elems.items(): - h2i_map, i2h_map = self.init_h2i_map(page_elems) + h2i_map, i2h_map = self._init_h2i_map(page_elems) - l2r_map, r2l_map = self.init_l2r_map(page_elems) + l2r_map, r2l_map = self._init_l2r_map(page_elems) - up_map, dn_map = self.init_ud_maps(page_elems) + up_map, dn_map = self._init_ud_maps(page_elems, l2r_map) - heads = self.find_heads(page_elems, h2i_map, i2h_map, up_map, dn_map) + if True: + dilated_page_elems = copy.deepcopy(page_elems) # deep-copy + self._do_horizontal_dilation(page_elems, dilated_page_elems, up_map, dn_map); + + # redo with dilated provs + up_map={} + dn_map={} + all_up_map={} + self._init_ud_maps_v2(dilated_page_elems, l2r_map, r2l_map, up_map, dn_map) + heads = self._find_heads(page_elems, h2i_map, i2h_map, up_map, dn_map) + + self._sort_ud_maps(provs, h2i_map, i2h_map, up_map, dn_map); + order = self._find_order(provs, heads, up_map, dn_map); + + sorted_page_elems=[]; + for ind in order: + sorted_page_elems.append(self.page_elems[ind]); + + doc_elems[pid] = result + doc = DoclingDocument() return doc @@ -153,10 +169,10 @@ def _init_l2r_map(self, page_elems: List[PageElement]): r2l_map[j] = i; return l2r_map, r2l_map - - def _init_ud_maps(self, page_elems: List[PageElement]): - up_map = {} - dn_map = {} + + def _init_ud_maps(self, page_elems: List[PageElement], l2r_map: Dict[int, int]): + up_map: dict[int, list[int]] = {} + dn_map: dict[int, list[int]] = {} for i,pelem_i in enumerate(page_elems): up_map[i] = [] @@ -192,9 +208,9 @@ def _init_ud_maps(self, page_elems: List[PageElement]): is_i_just_above_j:bool = (not (i_above_w and w_above_j)); - if(is_i_just_above_j): + if is_i_just_above_j: - while(i in l2r_map): + while i in l2r_map: i = l2r_map[i]; dn_map[i].append(j) @@ -202,19 +218,125 @@ def _init_ud_maps(self, page_elems: List[PageElement]): return up_map, dn_map - def find_heads(self, page_elems, h2i_map, i2h_map, up_map, dn_map): + def _do_horizontal_dilation(self, page_elems, dilated_page_elems, up_map, dn_map): + dilated_page_elems = page_elems # // deep-copy + + for i,pelem_i in enumerate(dilated_page_elems): + + x0 = pelem_i.x0; + y0 = pelem_i.y0; + + x1 = pelem_i.x1; + y1 = pelem_i.y1; + + if i in up_map: + pelem_up = page_elems[up_map[i][0]]] + + x0 = min(x0, pelem_up.x0); + x1 = max(x1, pelem_up.x1); + + if i in dn_map: + pelem_dn = page_elems[dn_map[i][0]]] + + x0 = min(x0, pelem_dn.x0); + x1 = max(x1, pelem_dn.x1); + + pelem_i.x0 = x0 + pelem_i.x1 = x1 + + overlaps_with_rest:bool = False; + for j,pelem_j in enumerate(page_elems): + + if i==j: + continue; + + if not overlaps_with_rest: + overlaps_with_rest = pelem_j.overlaps(pelem_i); + + # update + if(not overlaps_with_rest): + dilated_page_elems[i].x0 = x0 + dilated_page_elems[i].y0 = y0 + dilated_page_elems[i].x1 = x1 + dilated_page_elems[i].y1 = y1 + + def _find_heads(self, page_elems, h2i_map, i2h_map, up_map, dn_map): heads:list[int] = [] - head_provs = [] + head_page_elems = [] for key,vals in up_map.items(): if(len(vals)==0): - head_provs.append(page_elems[key]) + head_page_elems.append(page_elems[key]) - sorted(head_provs, key=lambda); + sorted(head_page_elems, key=lambda); - for item in head_provs.items(): + for item in head_page_elems: heads.append(h2i_map[item.cid)) return heads + def _sort_ud_maps(self, provs, h2i_map, i2h_map, up_map, dn_map): + for ind_i,vals in dn_map.items(): + + child_provs={}; + for ind_j in vals: + child_provs.push_back(provs[ind_j]); + + sorted(child_provs) + + dn_map[ind_i] = [] + for child in child_provs: + dn_map[ind_i].append(h2i_map[child.cid]); + + def _find_order(self, provs, heads, up_map, dn_map): + order: list[int] = []; + + visited: list[bool] = [False for _ in provs]; + + for j in heads: + + if not visited[j]: + + order.append(j); + visited[j] = true; + + self.depth_first_search_downwards(j, order, visited, dn_map, up_map); + + if len(order)!=len(provs): + _log.error("something went wrong") + + return order; + + def _depth_first_search_upwards(j: int, + order: list[int], + visited: list[bool], + dn_map: dict[int, list[int]], + up_map: dict[int, list[int]]): + """depth_first_search_upwards""" + k = j + + auto& inds = up_map.at(j); + for ind in inds: + if not visited[ind]: + return self.depth_first_search_upwards(ind, order, visited, dn_map, up_map) + + return k; + + def _depth_first_search_downwards(j: int, + order: list[int], + visited: list[bool], + dn_map: dict[int, list[int]], + up_map: dict[int, list[int]]): + """depth_first_search_downwards""" + + inds: list[int] = dn_map[j] + + for i in inds: + k:int = self._depth_first_search_upwards(i, order, visited, dn_map, up_map) + + if not visited[k]: + order.append(k) + visited[k] = True + + self._depth_first_search_downwards(k, order, visited, dn_map, up_map); From ee95a52cc21f0e9f2d1122e252424e0facac57a1 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Oct 2024 06:04:22 +0100 Subject: [PATCH 4/6] added a test and refactored the reading-order-model Signed-off-by: Peter Staar --- .../{ReadingOrder.py => reading_order_rb.py} | 208 +++++++++--------- tests/test_reading_order.py | 95 ++++++++ 2 files changed, 198 insertions(+), 105 deletions(-) rename docling_ibm_models/reading_order/{ReadingOrder.py => reading_order_rb.py} (62%) create mode 100644 tests/test_reading_order.py diff --git a/docling_ibm_models/reading_order/ReadingOrder.py b/docling_ibm_models/reading_order/reading_order_rb.py similarity index 62% rename from docling_ibm_models/reading_order/ReadingOrder.py rename to docling_ibm_models/reading_order/reading_order_rb.py index 956d544..e35a5e6 100644 --- a/docling_ibm_models/reading_order/ReadingOrder.py +++ b/docling_ibm_models/reading_order/reading_order_rb.py @@ -9,21 +9,23 @@ from collections.abc import Iterable from typing import Union +from dataclasses import dataclass + @dataclass class PageElement: eps: float = 1.e-3 - cid: int # conversion id - pid: int # page-id + cid: int = -1# conversion id + pid: int = -1# page-id - x0: float # lower-left x - y0: float # lower-left y + x0: float = -1.0# lower-left x + y0: float = -1.0# lower-left y - x1: float # upper-right x - y1: float # upper-right y + x1: float = -1.0 # upper-right x + y1: float = -1.0 # upper-right y - label: str # layout label + label: str = "" # layout label def __lt__(self, other): if self.pid==other.pid: @@ -35,97 +37,93 @@ def __lt__(self, other): else: return self.pid bool: + def follows_maintext_order(self, rhs) -> bool: return (self.cid+1==rhs.cid) - def overlaps(self, rhs: PageElement) -> bool: + def overlaps(self, rhs) -> bool: return (self.overlaps_x(rhs) and self.overlaps_y(rhs)) - def overlaps_x(self, rhs: PageElement) -> bool: + def overlaps_x(self, rhs) -> bool: return ((self.x0<=rhs.x0 and rhs.x0 bool: + def overlaps_y(self, rhs) -> bool: return ((self.y0<=rhs.y0 and rhs.y0 bool: + def overlaps_y_with_iou(self, rhs, iou:float) -> bool: return False - def is_left_of(self, rhs: PageElement) -> bool: + def is_left_of(self, rhs) -> bool: return (self.x0 bool: - return (self.x1+self.eps bool: + return ((self.x1+self.eps) bool: + def is_above(self, rhs) -> bool: return (self.y0>rhs.y0) - def is_strictly_above(self, rhs: PageElement) -> bool: - (self.y0+self.eps>rhs.y1) + def is_strictly_above(self, rhs) -> bool: + return ((self.y0+self.eps)>rhs.y1) - def is_horizontally_connected(self, elem_i: PageElement, elem_j: PageElement) -> bool: + def is_horizontally_connected(self, elem_i, elem_j) -> bool: min_ij:float = min(elem_i.y0, elem_j.y0) max_ij:float = max(elem_i.y1, elem_j.y1) - - if(self.y0min_ij): # overlap_y - return False - - if(self.x0elem_j.x0): - return True - - return False -class ReadingOrder: + if self.y0min_ij: # overlap_y + return False + + if self.x0elem_j.x0: + return True + + return False + +class ReadingOrderPredictor: r""" Rule based reading order for DoclingDocument """ def __init__(self): - self.page_elements: Dict[int, List[PageElement]] = {} + return - def predict(self, conv_res: ConversionResult) -> DoclingDocument: + def predict_page(self, page_elems: list[PageElement]) -> list[PageElement]: r""" Reorder the output of the """ - doc_elems = self._to_page_elements(conv_res) + #doc_elems = self._to_page_elements(conv_res) - for pid, page_elems in doc_elems.items(): + h2i_map, i2h_map = self._init_h2i_map(page_elems) - h2i_map, i2h_map = self._init_h2i_map(page_elems) + l2r_map, r2l_map = self._init_l2r_map(page_elems) - l2r_map, r2l_map = self._init_l2r_map(page_elems) + up_map, dn_map = self._init_ud_maps(page_elems, l2r_map) - up_map, dn_map = self._init_ud_maps(page_elems, l2r_map) - - if True: - dilated_page_elems = copy.deepcopy(page_elems) # deep-copy - self._do_horizontal_dilation(page_elems, dilated_page_elems, up_map, dn_map); + if True: + dilated_page_elems = copy.deepcopy(page_elems) # deep-copy + self._do_horizontal_dilation(page_elems, dilated_page_elems, up_map, dn_map); - # redo with dilated provs - up_map={} - dn_map={} - all_up_map={} - self._init_ud_maps_v2(dilated_page_elems, l2r_map, r2l_map, up_map, dn_map) - - heads = self._find_heads(page_elems, h2i_map, i2h_map, up_map, dn_map) - - self._sort_ud_maps(provs, h2i_map, i2h_map, up_map, dn_map); - order = self._find_order(provs, heads, up_map, dn_map); - - sorted_page_elems=[]; - for ind in order: - sorted_page_elems.append(self.page_elems[ind]); + # redo with dilated provs + up_map={} + dn_map={} + all_up_map={} + self._init_ud_maps_v2(dilated_page_elems, l2r_map, r2l_map, up_map, dn_map) - doc_elems[pid] = result + heads = self._find_heads(page_elems, h2i_map, i2h_map, up_map, dn_map) - doc = DoclingDocument() - return doc + self._sort_ud_maps(provs, h2i_map, i2h_map, up_map, dn_map); + order = self._find_order(provs, heads, up_map, dn_map); + + sorted_page_elems: list[PageElement] = []; + for ind in order: + sorted_page_elems.append(self.page_elems[ind]); + return sorted_page_elems + + """ def _to_page_elements(self, conv_res:ConversionResult): self.page_elements = {} @@ -144,8 +142,9 @@ def _to_page_elements(self, conv_res:ConversionResult): label=element.label) self.page_elements[element.page_no].append(elem) - - def _init_h2i_map(self, page_elems: List[PageElement]): + """ + + def _init_h2i_map(self, page_elems: list[PageElement]): h2i_map = {} i2h_map = {} @@ -155,7 +154,7 @@ def _init_h2i_map(self, page_elems: List[PageElement]): return h2i_map, i2h_map - def _init_l2r_map(self, page_elems: List[PageElement]): + def _init_l2r_map(self, page_elems: list[PageElement]): l2r_map = {} r2l_map = {} @@ -170,7 +169,7 @@ def _init_l2r_map(self, page_elems: List[PageElement]): return l2r_map, r2l_map - def _init_ud_maps(self, page_elems: List[PageElement], l2r_map: Dict[int, int]): + def _init_ud_maps(self, page_elems: list[PageElement], l2r_map: dict[int, int]): up_map: dict[int, list[int]] = {} dn_map: dict[int, list[int]] = {} @@ -220,42 +219,42 @@ def _init_ud_maps(self, page_elems: List[PageElement], l2r_map: Dict[int, int]): def _do_horizontal_dilation(self, page_elems, dilated_page_elems, up_map, dn_map): dilated_page_elems = page_elems # // deep-copy - + for i,pelem_i in enumerate(dilated_page_elems): - x0 = pelem_i.x0; - y0 = pelem_i.y0; - - x1 = pelem_i.x1; - y1 = pelem_i.y1; - - if i in up_map: - pelem_up = page_elems[up_map[i][0]]] - - x0 = min(x0, pelem_up.x0); - x1 = max(x1, pelem_up.x1); + x0 = pelem_i.x0; + y0 = pelem_i.y0; - if i in dn_map: - pelem_dn = page_elems[dn_map[i][0]]] - - x0 = min(x0, pelem_dn.x0); - x1 = max(x1, pelem_dn.x1); + x1 = pelem_i.x1; + y1 = pelem_i.y1; + + if i in up_map: + pelem_up = page_elems[up_map[i][0]] + + x0 = min(x0, pelem_up.x0) + x1 = max(x1, pelem_up.x1) - pelem_i.x0 = x0 + if i in dn_map: + pelem_dn = page_elems[dn_map[i][0]] + + x0 = min(x0, pelem_dn.x0) + x1 = max(x1, pelem_dn.x1) + + pelem_i.x0 = x0 pelem_i.x1 = x1 - - overlaps_with_rest:bool = False; + + overlaps_with_rest:bool = False for j,pelem_j in enumerate(page_elems): - - if i==j: - continue; + + if i==j: + continue - if not overlaps_with_rest: - overlaps_with_rest = pelem_j.overlaps(pelem_i); - + if not overlaps_with_rest: + overlaps_with_rest = pelem_j.overlaps(pelem_i) + # update if(not overlaps_with_rest): - dilated_page_elems[i].x0 = x0 + dilated_page_elems[i].x0 = x0 dilated_page_elems[i].y0 = y0 dilated_page_elems[i].x1 = x1 dilated_page_elems[i].y1 = y1 @@ -268,44 +267,43 @@ def _find_heads(self, page_elems, h2i_map, i2h_map, up_map, dn_map): if(len(vals)==0): head_page_elems.append(page_elems[key]) - sorted(head_page_elems, key=lambda); + sorted(head_page_elems) # this will invokde __lt__ from PageElements for item in head_page_elems: - heads.append(h2i_map[item.cid)) + heads.append(h2i_map[item.cid]) return heads def _sort_ud_maps(self, provs, h2i_map, i2h_map, up_map, dn_map): for ind_i,vals in dn_map.items(): - child_provs={}; + child_provs={} for ind_j in vals: - child_provs.push_back(provs[ind_j]); + child_provs.push_back(provs[ind_j]) sorted(child_provs) dn_map[ind_i] = [] for child in child_provs: - dn_map[ind_i].append(h2i_map[child.cid]); + dn_map[ind_i].append(h2i_map[child.cid]) def _find_order(self, provs, heads, up_map, dn_map): - order: list[int] = []; + order: list[int] = [] - visited: list[bool] = [False for _ in provs]; + visited: list[bool] = [False for _ in provs] for j in heads: if not visited[j]: - order.append(j); - visited[j] = true; - + order.append(j) + visited[j] = true self.depth_first_search_downwards(j, order, visited, dn_map, up_map); if len(order)!=len(provs): _log.error("something went wrong") - return order; + return order def _depth_first_search_upwards(j: int, order: list[int], @@ -315,13 +313,13 @@ def _depth_first_search_upwards(j: int, """depth_first_search_upwards""" k = j - - auto& inds = up_map.at(j); + + inds = up_map.at(j) for ind in inds: - if not visited[ind]: - return self.depth_first_search_upwards(ind, order, visited, dn_map, up_map) + if not visited[ind]: + return self.depth_first_search_upwards(ind, order, visited, dn_map, up_map) - return k; + return k def _depth_first_search_downwards(j: int, order: list[int], @@ -333,10 +331,10 @@ def _depth_first_search_downwards(j: int, inds: list[int] = dn_map[j] for i in inds: - k:int = self._depth_first_search_upwards(i, order, visited, dn_map, up_map) + k:int = self._depth_first_search_upwards(i, order, visited, dn_map, up_map) if not visited[k]: order.append(k) visited[k] = True - self._depth_first_search_downwards(k, order, visited, dn_map, up_map); + self._depth_first_search_downwards(k, order, visited, dn_map, up_map) diff --git a/tests/test_reading_order.py b/tests/test_reading_order.py new file mode 100644 index 0000000..ee29168 --- /dev/null +++ b/tests/test_reading_order.py @@ -0,0 +1,95 @@ +# +# Copyright IBM Corp. 2024 - 2024 +# SPDX-License-Identifier: MIT +# +import os +import json + +import numpy as np +import pytest +from PIL import Image + +from huggingface_hub import snapshot_download + +import docling_ibm_models.layoutmodel.layout_predictor as lp +from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor + +from docling_ibm_models.reading_order.reading_order_rb import PageElement, ReadingOrderPredictor + +@pytest.fixture(scope="module") +def init() -> dict: + r""" + Initialize the testing environment + """ + # This config is missing the keys: "artifact_path", "info1.torch_file", "info2.torch_file" + init = { + "num_threads": 1, + "test_imgs": [ + "tests/test_data/samples/ADS.2007.page_123.png", + ], + "info1": { + "use_cpu_only": True, + "image_size": 640, + "threshold": 0.6, + }, + "info2": { + "use_cpu_only": True, + "image_size": 640, + "threshold": 0.6, + }, + "pred_bboxes": 9, + } + + # Download models from HF + download_path = snapshot_download(repo_id="ds4sd/docling-models") + artifact_path = os.path.join(download_path, "model_artifacts/layout/beehive_v0.0.5_pt") + + # Add the missing config keys + init["artifact_path"] = artifact_path + init["info1"]["torch_file"] = os.path.join(artifact_path, lp.MODEL_CHECKPOINT_FN) + init["info2"]["torch_file"] = os.path.join(artifact_path, lp.MODEL_CHECKPOINT_FN) + + return init + + +def run_layoutpredictor(init: dict): + r""" + Unit test for the LayoutPredictor + """ + # Initialize LayoutPredictor with envvars + os.environ["USE_CPU_ONLY"] = "" + os.environ["OMP_NUM_THREADS"] = "2" + lpredictor = LayoutPredictor(init["artifact_path"]) + assert init["info1"] == lpredictor.info() + + # Initialize LayoutPredictor with optional parameters + lpredictor = LayoutPredictor( + init["artifact_path"], use_cpu_only=True + ) + assert init["info2"] == lpredictor.info() + + # Unsupported input image + is_exception = False + try: + for pred in lpredictor.predict("wrong"): + pass + except TypeError: + is_exception = True + assert is_exception + + # Predict on the test image + for img_fn in init["test_imgs"]: + with Image.open(img_fn) as img: + # Load images as PIL objects + for i, pred in enumerate(lpredictor.predict(img)): + print("PIL pred: {}".format(pred)) + yield pred + +def test_readingorder(): + + romodel = ReadingOrderPredictor() + + for pred in run_layoutpredictor(init): + print(pred.keys()) + + assert True From 860ab954e1825b1943e496e2167817f00df84385 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 28 Oct 2024 08:46:59 +0100 Subject: [PATCH 5/6] tests scripts are WIP Signed-off-by: Peter Staar --- tests/test_layout_predictor.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_layout_predictor.py b/tests/test_layout_predictor.py index 0a65ee1..e89f16f 100644 --- a/tests/test_layout_predictor.py +++ b/tests/test_layout_predictor.py @@ -77,12 +77,27 @@ def test_layoutpredictor(init: dict): # Predict on the test image for img_fn in init["test_imgs"]: + + true_layout_fn = img_fn+".json" with Image.open(img_fn) as img: + pred_layout=[] + # Load images as PIL objects for i, pred in enumerate(lpredictor.predict(img)): print("PIL pred: {}".format(pred)) + pred_layout.append(pred) assert i + 1 == init["pred_bboxes"] + if os.path.exists(true_layout_fn): + with open(true_layout_fn, "r") as fr: + true_layout = json.load(fr) + + # FIXME: write a simple test to check all objects are found + else: + with open(true_layout_fn, "w") as fw: + fw.write(json.dumps(pred_layout, indent=4)) + + # Load images as numpy arrays np_arr = np.asarray(img) for i, pred in enumerate(lpredictor.predict(np_arr)): From 7e3a2026504b62d27c9675b8ebb2958a8860b8fc Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 12 Nov 2024 05:36:11 +0100 Subject: [PATCH 6/6] first running reading order model Signed-off-by: Peter Staar --- .../reading_order/reading_order_rb.py | 64 +++++++++++-------- tests/test_layout_predictor.py | 13 +++- tests/test_reading_order.py | 50 ++++++++++----- 3 files changed, 80 insertions(+), 47 deletions(-) diff --git a/docling_ibm_models/reading_order/reading_order_rb.py b/docling_ibm_models/reading_order/reading_order_rb.py index e35a5e6..d44cac5 100644 --- a/docling_ibm_models/reading_order/reading_order_rb.py +++ b/docling_ibm_models/reading_order/reading_order_rb.py @@ -56,7 +56,18 @@ def overlaps_y(self, rhs) -> bool: (rhs.y0<=self.y1 and self.y1 bool: - return False + if self.overlaps_y(rhs): + + u0 = min(self.y0, rhs.y0); + u1 = max(self.y1, rhs.y1); + + i0 = max(self.y0, rhs.y0); + i1 = min(self.y1, rhs.y1); + + iou_ = float(i1-i0)/float(u1-u0); + return (iou_)>iou; + + return False; def is_left_of(self, rhs) -> bool: return (self.x0 list[PageElement]: l2r_map, r2l_map = self._init_l2r_map(page_elems) - up_map, dn_map = self._init_ud_maps(page_elems, l2r_map) + up_map, dn_map = self._init_ud_maps(page_elems, l2r_map, r2l_map) if True: dilated_page_elems = copy.deepcopy(page_elems) # deep-copy self._do_horizontal_dilation(page_elems, dilated_page_elems, up_map, dn_map); # redo with dilated provs - up_map={} - dn_map={} - all_up_map={} - self._init_ud_maps_v2(dilated_page_elems, l2r_map, r2l_map, up_map, dn_map) + up_map, dn_map = self._init_ud_maps(dilated_page_elems, l2r_map, r2l_map) heads = self._find_heads(page_elems, h2i_map, i2h_map, up_map, dn_map) - self._sort_ud_maps(provs, h2i_map, i2h_map, up_map, dn_map); - order = self._find_order(provs, heads, up_map, dn_map); + self._sort_ud_maps(page_elems, h2i_map, i2h_map, up_map, dn_map); + order = self._find_order(page_elems, heads, up_map, dn_map); sorted_page_elems: list[PageElement] = []; for ind in order: - sorted_page_elems.append(self.page_elems[ind]); + sorted_page_elems.append(page_elems[ind]); return sorted_page_elems @@ -163,13 +171,15 @@ def _init_l2r_map(self, page_elems: list[PageElement]): if(pelem_i.follows_maintext_order(pelem_j) and pelem_i.is_strictly_left_of(pelem_j) and - pelem_i.overlaps_y(pelem_j, 0.8)): + pelem_i.overlaps_y_with_iou(pelem_j, 0.8)): l2r_map[i] = j; r2l_map[j] = i; return l2r_map, r2l_map - def _init_ud_maps(self, page_elems: list[PageElement], l2r_map: dict[int, int]): + def _init_ud_maps(self, page_elems: list[PageElement], + l2r_map: dict[int, int], + r2l_map: dict[int, int]): up_map: dict[int, list[int]] = {} dn_map: dict[int, list[int]] = {} @@ -179,7 +189,7 @@ def _init_ud_maps(self, page_elems: list[PageElement], l2r_map: dict[int, int]): for j,pelem_j in enumerate(page_elems): - if(j in r2l_map): + if j in r2l_map: i = r2l_map[j] dn_map[i] = [j] @@ -228,13 +238,13 @@ def _do_horizontal_dilation(self, page_elems, dilated_page_elems, up_map, dn_map x1 = pelem_i.x1; y1 = pelem_i.y1; - if i in up_map: + if i in up_map and len(up_map[i])>0: pelem_up = page_elems[up_map[i][0]] x0 = min(x0, pelem_up.x0) x1 = max(x1, pelem_up.x1) - if i in dn_map: + if i in dn_map and len(dn_map[i])>0: pelem_dn = page_elems[dn_map[i][0]] x0 = min(x0, pelem_dn.x0) @@ -297,19 +307,19 @@ def _find_order(self, provs, heads, up_map, dn_map): if not visited[j]: order.append(j) - visited[j] = true - self.depth_first_search_downwards(j, order, visited, dn_map, up_map); + visited[j] = True + self._depth_first_search_downwards(j, order, visited, dn_map, up_map); if len(order)!=len(provs): _log.error("something went wrong") return order - def _depth_first_search_upwards(j: int, - order: list[int], - visited: list[bool], - dn_map: dict[int, list[int]], - up_map: dict[int, list[int]]): + def _depth_first_search_upwards(self, j: int, + order: list[int], + visited: list[bool], + dn_map: dict[int, list[int]], + up_map: dict[int, list[int]]): """depth_first_search_upwards""" k = j @@ -317,15 +327,15 @@ def _depth_first_search_upwards(j: int, inds = up_map.at(j) for ind in inds: if not visited[ind]: - return self.depth_first_search_upwards(ind, order, visited, dn_map, up_map) + return self._depth_first_search_upwards(ind, order, visited, dn_map, up_map) return k - def _depth_first_search_downwards(j: int, - order: list[int], - visited: list[bool], - dn_map: dict[int, list[int]], - up_map: dict[int, list[int]]): + def _depth_first_search_downwards(self, j: int, + order: list[int], + visited: list[bool], + dn_map: dict[int, list[int]], + up_map: dict[int, list[int]]): """depth_first_search_downwards""" inds: list[int] = dn_map[j] diff --git a/tests/test_layout_predictor.py b/tests/test_layout_predictor.py index e89f16f..d18420d 100644 --- a/tests/test_layout_predictor.py +++ b/tests/test_layout_predictor.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: MIT # import os +import json import numpy as np import pytest @@ -77,7 +78,7 @@ def test_layoutpredictor(init: dict): # Predict on the test image for img_fn in init["test_imgs"]: - + true_layout_fn = img_fn+".json" with Image.open(img_fn) as img: pred_layout=[] @@ -85,7 +86,14 @@ def test_layoutpredictor(init: dict): # Load images as PIL objects for i, pred in enumerate(lpredictor.predict(img)): print("PIL pred: {}".format(pred)) - pred_layout.append(pred) + pred_layout.append({ + "label": pred["label"], + "t": pred["t"].item(), + "b": pred["b"].item(), + "l": pred["l"].item(), + "r": pred["r"].item(), + }) + print(pred_layout) assert i + 1 == init["pred_bboxes"] if os.path.exists(true_layout_fn): @@ -96,7 +104,6 @@ def test_layoutpredictor(init: dict): else: with open(true_layout_fn, "w") as fw: fw.write(json.dumps(pred_layout, indent=4)) - # Load images as numpy arrays np_arr = np.asarray(img) diff --git a/tests/test_reading_order.py b/tests/test_reading_order.py index ee29168..b2265c8 100644 --- a/tests/test_reading_order.py +++ b/tests/test_reading_order.py @@ -4,6 +4,7 @@ # import os import json +import glob import numpy as np import pytest @@ -24,9 +25,7 @@ def init() -> dict: # This config is missing the keys: "artifact_path", "info1.torch_file", "info2.torch_file" init = { "num_threads": 1, - "test_imgs": [ - "tests/test_data/samples/ADS.2007.page_123.png", - ], + "test_imgs": sorted(glob.glob("tests/test_data/samples/*.png")), "info1": { "use_cpu_only": True, "image_size": 640, @@ -39,7 +38,7 @@ def init() -> dict: }, "pred_bboxes": 9, } - + # Download models from HF download_path = snapshot_download(repo_id="ds4sd/docling-models") artifact_path = os.path.join(download_path, "model_artifacts/layout/beehive_v0.0.5_pt") @@ -48,11 +47,11 @@ def init() -> dict: init["artifact_path"] = artifact_path init["info1"]["torch_file"] = os.path.join(artifact_path, lp.MODEL_CHECKPOINT_FN) init["info2"]["torch_file"] = os.path.join(artifact_path, lp.MODEL_CHECKPOINT_FN) - + return init -def run_layoutpredictor(init: dict): +def test_readingorder(init: dict): r""" Unit test for the LayoutPredictor """ @@ -77,19 +76,36 @@ def run_layoutpredictor(init: dict): is_exception = True assert is_exception + # Init the reading-order model + romodel = ReadingOrderPredictor() + # Predict on the test image for img_fn in init["test_imgs"]: + print(img_fn) + with Image.open(img_fn) as img: + pred_layout=[] + # Load images as PIL objects for i, pred in enumerate(lpredictor.predict(img)): - print("PIL pred: {}".format(pred)) - yield pred - -def test_readingorder(): - - romodel = ReadingOrderPredictor() - - for pred in run_layoutpredictor(init): - print(pred.keys()) - - assert True + pred_layout.append({ + "label": pred["label"], + "t": pred["t"].item(), + "b": pred["b"].item(), + "l": pred["l"].item(), + "r": pred["r"].item(), + }) + print(json.dumps(pred_layout, indent=2)) + + page_elements = [] + for cid, item in enumerate(pred_layout): + page_elements.append(PageElement(cid=cid, pid=0, + x0=item["l"], y0=item["r"], + x1=item["b"], y1=item["t"], + label=item["label"])) + + print(page_elements) + + ordered_elements = romodel.predict_page(page_elements) + + print(ordered_elements)