Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] fixing boxes that are breaking spp runs #229

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
147 changes: 147 additions & 0 deletions analysis/box_indexer/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""

Benchmarking different ways of indexing and searching over Boxes

RTree
INFO:root:RTreeIndexer took 0.23895001411437988 seconds to index 10000 boxes)
INFO:root:RTreeIndexer took 188.41757702827454 seconds for 1000 queries

INFO:root:RTreeIndexer took 0.24275708198547363 seconds to index 10000 boxes)
INFO:root:RTreeIndexer took 1.1544108390808105 seconds for 1000 queries

KDTree
Stopped — the implementation was kind of tricky.

Numpy array
INFO:root:NumpyIndexer took 0.0027608871459960938 seconds to index 10000 boxes)
INFO:root:NumpyIndexer took 187.8270788192749 seconds for 1000 queries

INFO:root:NumpyIndexer took 0.0034339427947998047 seconds to index 10000 boxes)
INFO:root:NumpyIndexer took 0.07767200469970703 seconds for 1000 queries

"""

import logging
import time
from typing import List, Set, Tuple

import numpy as np
from rtree import index
from sklearn.neighbors import KDTree

from mmda.types.box import Box

logging.basicConfig(level=logging.INFO)

def _random_boxes(n: int) -> List[Box]:
    """Build `n` boxes with uniformly random position/size on page 0."""
    return [
        Box(
            l=np.random.rand(),
            t=np.random.rand(),
            w=np.random.rand(),
            h=np.random.rand(),
            page=0,
        )
        for _ in range(n)
    ]


# boxes to be indexed, and boxes to search with
boxes = _random_boxes(10000)
queries = _random_boxes(1000)


class RTreeIndexer:
    """Spatial index over Boxes backed by an R-tree.

    Indexed entries are identified by their position in the input list.
    """

    def __init__(self, boxes: List[Box]):
        self.boxes = boxes
        self.rtree = index.Index(interleaved=True)
        for box_id, box in enumerate(boxes):
            # Box.coordinates is already the (x1, y1, x2, y2) tuple the
            # interleaved R-tree expects.
            self.rtree.insert(box_id, box.coordinates)

    def find(self, query: Box) -> List[int]:
        """Return ids of all indexed boxes whose bounds intersect `query`."""
        return list(self.rtree.intersection(query.coordinates))


class NumpyIndexer:
    """Spatial index over Boxes backed by parallel NumPy coordinate arrays.

    Stores one array per corner coordinate; queries are a single vectorized
    comparison over all indexed boxes.
    """

    def __init__(self, boxes: List[Box]):
        self.boxes = boxes
        x1 = np.array([b.l for b in boxes])
        y1 = np.array([b.t for b in boxes])
        self.np_boxes_x1 = x1
        self.np_boxes_y1 = y1
        self.np_boxes_x2 = x1 + np.array([b.w for b in boxes])
        self.np_boxes_y2 = y1 + np.array([b.h for b in boxes])

    def find(self, query: Box) -> List[int]:
        """Return ids of all indexed boxes whose bounds intersect `query`."""
        qx1, qy1, qx2, qy2 = query.coordinates
        # Two intervals overlap iff each one starts before the other ends
        # (boundaries touching counts as overlap).
        overlapping = (
            (self.np_boxes_x1 <= qx2)
            & (qx1 <= self.np_boxes_x2)
            & (self.np_boxes_y1 <= qy2)
            & (qy1 <= self.np_boxes_y2)
        )
        return np.flatnonzero(overlapping).tolist()


def bulk_query(indexer, boxes, queries, is_validate: bool = True):
    """Run every query through `indexer.find`, optionally validating results.

    Args:
        indexer: object exposing find(query) -> iterable of indices into `boxes`.
        boxes: the indexed boxes; each must provide is_overlap(other).
        queries: boxes to search with.
        is_validate: when True, brute-force check every box against every
            query result via is_overlap (slow; dominates the benchmark).

    Raises:
        AssertionError: if validation is on and the indexer's results
            disagree with the brute-force is_overlap check.
    """
    for q in queries:
        # Materialize results as a set: membership tests in the validation
        # loop below become O(1) instead of O(len(found)) per box, avoiding
        # an accidental quadratic blow-up.
        found = set(indexer.find(q))
        if is_validate:
            for i, box in enumerate(boxes):
                if i in found:
                    assert box.is_overlap(q)
                else:
                    assert not box.is_overlap(q)


def benchmark_rtree(boxes, queries, is_validate: bool = True):
    """Benchmark RTreeIndexer: time index construction, then the queries.

    Args:
        boxes: boxes to index.
        queries: boxes to search with.
        is_validate: forwarded to bulk_query; brute-force validation
            dominates runtime when enabled.
    """
    # indexing time -- log before starting the clock so the logging call
    # itself isn't included in the measurement
    logging.info("Starting benchmarking")
    start = time.time()
    rtree_indexer = RTreeIndexer(boxes)
    end = time.time()
    # (fixed stray ')' that used to trail the message)
    logging.info(
        f"RTreeIndexer took {end - start} seconds to index {len(boxes)} boxes"
    )

    # searching time
    logging.info("Starting benchmarking")
    start = time.time()
    bulk_query(rtree_indexer, boxes, queries, is_validate=is_validate)
    end = time.time()
    logging.info(f"RTreeIndexer took {end - start} seconds for {len(queries)} queries")


def benchmark_numpy(boxes, queries, is_validate: bool = True):
    """Benchmark NumpyIndexer: time index construction, then the queries.

    Args:
        boxes: boxes to index.
        queries: boxes to search with.
        is_validate: forwarded to bulk_query; brute-force validation
            dominates runtime when enabled.
    """
    # indexing time -- log before starting the clock so the logging call
    # itself isn't included in the measurement
    logging.info("Starting benchmarking")
    start = time.time()
    numpy_indexer = NumpyIndexer(boxes)
    end = time.time()
    # (fixed stray ')' that used to trail the message)
    logging.info(
        f"NumpyIndexer took {end - start} seconds to index {len(boxes)} boxes"
    )

    # searching time
    logging.info("Starting benchmarking")
    start = time.time()
    bulk_query(numpy_indexer, boxes, queries, is_validate=is_validate)
    end = time.time()
    logging.info(f"NumpyIndexer took {end - start} seconds for {len(queries)} queries")


# Run each benchmark twice: first with brute-force validation of every
# result set, then again timing the raw query path alone.
for validate in (True, False):
    benchmark_rtree(boxes=boxes, queries=queries, is_validate=validate)
    benchmark_numpy(boxes=boxes, queries=queries, is_validate=validate)
2 changes: 2 additions & 0 deletions analysis/box_indexer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
scikit-learn
rtree
4 changes: 2 additions & 2 deletions src/mmda/eval/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ def levenshtein(

def box_overlap(box: Box, container: Box) -> float:
"""Returns the percentage of area of a box inside of a container."""
bl, bt, bw, bh = box.xywh
bl, bt, bw, bh = box.l, box.t, box.w, box.h
br = bl + bw
bb = bt + bh

cl, ct, cw, ch = container.xywh
cl, ct, cw, ch = container.l, container.t, container.w, container.h
cr = cl + cw
cb = ct + ch

Expand Down
122 changes: 122 additions & 0 deletions src/mmda/parsers/NOTES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# notes


## bounding boxes


**1. Problem**

Token boxes from PDFPlumber aren't necessarily disjoint, which is annoying.

Here's an example (converted to xy coordinates for ease):

```
Token=0
(0.8155551504075421,
0.9343934350623241,
0.8317358800186805,
0.9445089287110585,
'Vol')

Token=1
(0.8315880494325322,
0.9343934350623241,
0.8343161957041776,
0.9445089287110585,
'.')
```

We see that Token0 and Token1 are on the same line given their ymin and ymax hovering at 0.93-0.94. They should be disjoint but Token0's xmax (0.8317) extends into Token1's xmin (0.8316). This should never be the case.

The weird thing is though, while boxes are kind of messed up like this, the text isn't. The two tokens together spell out `Vol.` correctly.

Why is this bad? Because it messes up our ability to lookup overlapping entities based on bounding box overlap. This is *especially* bad when we're trying to go from a bounding box (e.g. from LayoutParser) to token spans.

**2. Solutions**

There are a few solutions:

1. Fix this in PDFPlumberParser with posthoc operator over bounding boxes to enforce disjointness.

➔ Kind of problematic because requires so much reprocessing.

➔ May also make it harder to keep in-sync w/ PDFPlumber over time. Posthoc corrections maybe should still be decoupled from the parser itself?

2. Apply fixes at `Document.from_json()` time to adjust all token bounding boxes "inward".

➔ Not as risky. Somewhat hacky but fairly straightforward. Problem is it doesn't quite fix the data, which is still being generated with weird boxes.

3. Overlapping bounding boxes primarily affects vision-based lookup methods (e.g. give me all entities within a visual region). A different way to fix this is to base all bounding box-based lookup methods not on the bounding boxes, but instead on the box centroid or something else.

➔ Not sure whether it would work. Seems dependent on box quality. For example, if the box is extremely off-center, then it's not like this would solve anything. And it's actually quite a bit of code to refactor. Let's rule it out.


Thinking about it, #2 is pretty reasonable for now given that we're aiming for overall stability. We can worry about #1 later.

**3. Implementation details of Approach 2**

There are a few ways to do #2 actually.

If the Boxes are actually pretty good, but only overlap with each other slightly (e.g. boundaries kinda fuzzy), then the easiest way to fix things is to shrink all boxes to avoid all overlapping regions.

But if the Boxes are pretty poor quality (imagine something that's really big/off-center), then there's no real way to resolve that box without actually pulling up the original image and trying to do some pixel-based localization.

We *really* don't want to do the latter, so let's investigate whether the former is the more common case.

See this example where we cluster all the token boxes on a page and only shade the ones that have some overlap with another box: ![image](fixtures/token-box-overlap.png).

The overlapping regions are really small. Let's go with the easier implementation.


**4. Global epsilon-based adjustment to all boxes**

Before we do that, there are actually boxes out of PDFPlumber that are perfectly shared borders:

```
{'x1': 0.38126804444444445, 'y1': 0.12202539797979782, 'x2': 0.3861468209150327, 'y2': 0.13334661010100995}
{'x1': 0.3861468209150327, 'y1': 0.12202539797979782, 'x2': 0.42277427189542477, 'y2': 0.13334661010100995}
```

As you can see, `x2` of the first box is exactly `x1` of the second box.

One way to make this way less annoying is to apply a very small shrinkage. How big is this? Well, looking at the size of typical images rendered at `dpi=72`, we're probably looking at pages that have dimension `(800, 620)` at most. So conservatively, we can set an `epsilon=1e-4` without worrying about it being perceptible.

For example, just doing something like this:
```
BUFFER = 1e-4
for token in doc.tokens:
token.box_group.boxes[0].l += BUFFER
token.box_group.boxes[0].t += BUFFER
token.box_group.boxes[0].w -= 2 * BUFFER
token.box_group.boxes[0].h -= 2 * BUFFER
```

will fix a lot of the overlapping boxes from the first figure:

![image](fixtures/token-box-overlap-fixed-with-epsilon.png)


**5. Clustering-based solution**

Now how do we fix the remaining boxes?

One thought is -- Can we do a fast thing just writing rules to compare `xmin, xmax, ymin, ymax` and directly make adjustments to those boundaries? Probably, but it gets confusing really quickly. Consider this case:

![image](fixtures/overlap-tokens-edge-case.png)

The problem is that we don't really want to work off clusters of boxes, but actually *pairs* of boxes that overlap. But the O(N^2) over all boxes on a page is kind of costly to check. So let's pre-cluster and then perform the O(N^2) within each cluster.


The technique looks something like:

![image](fixtures/overlap-token-strategy.png)

The figure above is what would happen if you just swapped the correct coordinates of the overlapping boxes. It does create more whitespace between, which may not be desirable, but it's also easier to follow. You can instead replace the relevant box coordinates with something newly calculated (e.g. split the difference).

**6. Boxes that aren't even on the page**

If we look at PDF in test fixtures `4be952924cd565488b4a239dc6549095029ee578.pdf`, we'll actually find weird boxes that come out of PDFPlumber that are off the page:

```

```
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/mmda/parsers/fixtures/token-box-overlap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 17 additions & 13 deletions src/mmda/parsers/pdfplumber_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import itertools
import string
from typing import List, Optional, Union, Tuple
from typing import List, Optional, Tuple, Union

import pdfplumber

Expand Down Expand Up @@ -201,24 +201,28 @@ def parse(self, input_pdf_path: str) -> Document:
)
assert len(word_ids_of_fine_tokens) == len(fine_tokens)
# 4) normalize / clean tokens & boxes
boxes = [
Box.from_coordinates(
x1=float(token["x0"]),
y1=float(token["top"]),
x2=float(token["x1"]),
y2=float(token["bottom"]),
page=int(page_id),
).get_relative(
page_width=float(page.width), page_height=float(page.height)
)
for token in fine_tokens
]
# 4.5) reformat
# TODO: remove tokens that are 'off page'
fine_tokens = [
{
"text": token["text"],
"fontname": token["fontname"],
"size": token["size"],
"bbox": Box.from_pdf_coordinates(
x1=float(token["x0"]),
y1=float(token["top"]),
x2=float(token["x1"]),
y2=float(token["bottom"]),
page_width=float(page.width),
page_height=float(page.height),
page=int(page_id),
).get_relative(
page_width=float(page.width), page_height=float(page.height)
),
"bbox": box,
}
for token in fine_tokens
for box, token in zip(boxes, fine_tokens)
]
# 5) group tokens into lines
# TODO - doesnt belong in parser; should be own predictor
Expand Down
Loading