Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] fixing boxes that are breaking spp runs #229

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
147 changes: 147 additions & 0 deletions analysis/box_indexer/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""

Benchmarking different ways of indexing and searching over Boxes

RTree
INFO:root:RTreeIndexer took 0.23895001411437988 seconds to index 10000 boxes)
INFO:root:RTreeIndexer took 188.41757702827454 seconds for 1000 queries

INFO:root:RTreeIndexer took 0.24275708198547363 seconds to index 10000 boxes)
INFO:root:RTreeIndexer took 1.1544108390808105 seconds for 1000 queries

KDTree
Stopped — the implementation was kind of tricky.

Numpy array
INFO:root:NumpyIndexer took 0.0027608871459960938 seconds to index 10000 boxes)
INFO:root:NumpyIndexer took 187.8270788192749 seconds for 1000 queries

INFO:root:NumpyIndexer took 0.0034339427947998047 seconds to index 10000 boxes)
INFO:root:NumpyIndexer took 0.07767200469970703 seconds for 1000 queries

"""

import logging
import time
from typing import List, Set, Tuple

import numpy as np
from rtree import index
from sklearn.neighbors import KDTree

from mmda.types.box import Box

logging.basicConfig(level=logging.INFO)

def _random_boxes(n: int) -> List[Box]:
    """Build `n` boxes with uniformly random position/size on page 0."""
    return [
        Box(
            l=np.random.rand(),
            t=np.random.rand(),
            w=np.random.rand(),
            h=np.random.rand(),
            page=0,
        )
        for _ in range(n)
    ]


# boxes to be indexed, and boxes to search with
boxes = _random_boxes(10000)
queries = _random_boxes(1000)


class RTreeIndexer:
    """Spatial index over Boxes backed by an R-tree.

    Indexed entries are identified by their position in the input list.
    """

    def __init__(self, boxes: List[Box]):
        self.boxes = boxes
        self.rtree = index.Index(interleaved=True)
        for box_id, box in enumerate(boxes):
            # Box.coordinates is already the (x1, y1, x2, y2) tuple the
            # interleaved R-tree expects.
            self.rtree.insert(box_id, box.coordinates)

    def find(self, query: Box) -> List[int]:
        """Return ids of all indexed boxes whose bounds intersect `query`."""
        return list(self.rtree.intersection(query.coordinates))


class NumpyIndexer:
    """Spatial index over Boxes backed by parallel NumPy coordinate arrays.

    Stores one array per corner coordinate; queries are a single vectorized
    comparison over all indexed boxes.
    """

    def __init__(self, boxes: List[Box]):
        self.boxes = boxes
        x1 = np.array([b.l for b in boxes])
        y1 = np.array([b.t for b in boxes])
        self.np_boxes_x1 = x1
        self.np_boxes_y1 = y1
        self.np_boxes_x2 = x1 + np.array([b.w for b in boxes])
        self.np_boxes_y2 = y1 + np.array([b.h for b in boxes])

    def find(self, query: Box) -> List[int]:
        """Return ids of all indexed boxes whose bounds intersect `query`."""
        qx1, qy1, qx2, qy2 = query.coordinates
        # Two intervals overlap iff each one starts before the other ends
        # (boundaries touching counts as overlap).
        overlapping = (
            (self.np_boxes_x1 <= qx2)
            & (qx1 <= self.np_boxes_x2)
            & (self.np_boxes_y1 <= qy2)
            & (qy1 <= self.np_boxes_y2)
        )
        return np.flatnonzero(overlapping).tolist()


def bulk_query(indexer, boxes, queries, is_validate: bool = True):
    """Run every query through `indexer.find`, optionally validating results.

    Args:
        indexer: object exposing find(query) -> iterable of indices into `boxes`.
        boxes: the indexed boxes; each must provide is_overlap(other).
        queries: boxes to search with.
        is_validate: when True, brute-force check every box against every
            query result via is_overlap (slow; dominates the benchmark).

    Raises:
        AssertionError: if validation is on and the indexer's results
            disagree with the brute-force is_overlap check.
    """
    for q in queries:
        # Materialize results as a set: membership tests in the validation
        # loop below become O(1) instead of O(len(found)) per box, avoiding
        # an accidental quadratic blow-up.
        found = set(indexer.find(q))
        if is_validate:
            for i, box in enumerate(boxes):
                if i in found:
                    assert box.is_overlap(q)
                else:
                    assert not box.is_overlap(q)


def benchmark_rtree(boxes, queries, is_validate: bool = True):
    """Benchmark RTreeIndexer: time index construction, then the queries.

    Args:
        boxes: boxes to index.
        queries: boxes to search with.
        is_validate: forwarded to bulk_query; brute-force validation
            dominates runtime when enabled.
    """
    # indexing time -- log before starting the clock so the logging call
    # itself isn't included in the measurement
    logging.info("Starting benchmarking")
    start = time.time()
    rtree_indexer = RTreeIndexer(boxes)
    end = time.time()
    # (fixed stray ')' that used to trail the message)
    logging.info(
        f"RTreeIndexer took {end - start} seconds to index {len(boxes)} boxes"
    )

    # searching time
    logging.info("Starting benchmarking")
    start = time.time()
    bulk_query(rtree_indexer, boxes, queries, is_validate=is_validate)
    end = time.time()
    logging.info(f"RTreeIndexer took {end - start} seconds for {len(queries)} queries")


def benchmark_numpy(boxes, queries, is_validate: bool = True):
    """Benchmark NumpyIndexer: time index construction, then the queries.

    Args:
        boxes: boxes to index.
        queries: boxes to search with.
        is_validate: forwarded to bulk_query; brute-force validation
            dominates runtime when enabled.
    """
    # indexing time -- log before starting the clock so the logging call
    # itself isn't included in the measurement
    logging.info("Starting benchmarking")
    start = time.time()
    numpy_indexer = NumpyIndexer(boxes)
    end = time.time()
    # (fixed stray ')' that used to trail the message)
    logging.info(
        f"NumpyIndexer took {end - start} seconds to index {len(boxes)} boxes"
    )

    # searching time
    logging.info("Starting benchmarking")
    start = time.time()
    bulk_query(numpy_indexer, boxes, queries, is_validate=is_validate)
    end = time.time()
    logging.info(f"NumpyIndexer took {end - start} seconds for {len(queries)} queries")


# Run each benchmark twice: first with brute-force validation of every
# result set, then again timing the raw query path alone.
for validate in (True, False):
    benchmark_rtree(boxes=boxes, queries=queries, is_validate=validate)
    benchmark_numpy(boxes=boxes, queries=queries, is_validate=validate)
2 changes: 2 additions & 0 deletions analysis/box_indexer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
scikit-learn
rtree
4 changes: 2 additions & 2 deletions src/mmda/eval/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ def levenshtein(

def box_overlap(box: Box, container: Box) -> float:
"""Returns the percentage of area of a box inside of a container."""
bl, bt, bw, bh = box.xywh
bl, bt, bw, bh = box.l, box.t, box.w, box.h
br = bl + bw
bb = bt + bh

cl, ct, cw, ch = container.xywh
cl, ct, cw, ch = container.l, container.t, container.w, container.h
cr = cl + cw
cb = ct + ch

Expand Down
122 changes: 122 additions & 0 deletions src/mmda/parsers/NOTES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# notes


## bounding boxes


**1. Problem**

Token boxes from PDFPlumber aren't necessarily disjoint, which is annoying.

Here's an example (converted to xy coordinates for ease):

```
Token=0
(0.8155551504075421,
0.9343934350623241,
0.8317358800186805,
0.9445089287110585,
'Vol')

Token=1
(0.8315880494325322,
0.9343934350623241,
0.8343161957041776,
0.9445089287110585,
'.')
```

We see that Token0 and Token1 are on the same line given their ymin and ymax hovering at 0.93-0.94. They should be disjoint but Token0's xmax (0.8317) extends into Token1's xmin (0.8316). This should never be the case.

The weird thing is though, while boxes are kind of messed up like this, the text isn't. The two tokens together spell out `Vol.` correctly.

Why is this bad? Because it messes up our ability to lookup overlapping entities based on bounding box overlap. This is *especially* bad when we're trying to go from a bounding box (e.g. from LayoutParser) to token spans.

**2. Solutions**

There are a few solutions:

1. Fix this in PDFPlumberParser with posthoc operator over bounding boxes to enforce disjointness.

➔ Kind of problematic because requires so much reprocessing.

➔ May also make it harder to keep in-sync w/ PDFPlumber over time. Posthoc corrections maybe should still be decoupled from the parser itself?

2. Apply fixes at `Document.from_json()` time to adjust all token bounding boxes "inward".

➔ Not as risky. Somewhat hacky but fairly straightforward. Problem is it doesn't quite fix the data, which is still being generated with weird boxes.

3. Overlapping bounding boxes primarily affects vision-based lookup methods (e.g. give me all entities within a visual region). A different way to fix this is to base all bounding box-based lookup methods not on the bounding boxes, but instead on the box centroid or something else.

➔ Not sure whether it would work. Seems dependent on box quality. For example, if the box is extremely off-center, then it's not like this would solve anything. And it's actually quite a bit of code to refactor. Let's rule it out.


Thinking about it, #2 is pretty reasonable for now given that we're aiming for overall stability. We can worry about #1 later.

**3. Implementation details of Approach 2**

There are a few ways to do #2 actually.

If the Boxes are actually pretty good, but only overlap with each other slightly (e.g. boundaries kinda fuzzy), then the easiest way to fix things is to shrink all boxes to avoid all overlapping regions.

But if the Boxes are pretty poor quality (imagine something that's really big/off-center), then there's no real way to resolve that box without actually pulling up the original image and trying to do some pixel-based localization.

We *really* don't want to do the latter, so let's investigate whether the former is the more common case.

See this example where we cluster all the token boxes on a page and only shade the ones that have some overlap with another box: ![image](fixtures/token-box-overlap.png).

The overlapping regions are really small. Let's go with the easier implementation.


**4. Global epsilon-based adjustment to all boxes**

Before we do that, there are actually boxes out of PDFPlumber that are perfectly shared borders:

```
{'x1': 0.38126804444444445, 'y1': 0.12202539797979782, 'x2': 0.3861468209150327, 'y2': 0.13334661010100995}
{'x1': 0.3861468209150327, 'y1': 0.12202539797979782, 'x2': 0.42277427189542477, 'y2': 0.13334661010100995}
```

As you can see, `x2` of the first box is exactly `x1` of the second box.

One way to make this way less annoying is to apply a very small shrinkage. How big is this? Well, looking at the size of typical images rendered at `dpi=72`, we're probably looking at pages that have dimension `(800, 620)` at most. So conservatively, we can set an `epsilon=1e-4` without worrying about it being perceptible.

For example, just doing something like this:
```
BUFFER = 1e-4
for token in doc.tokens:
token.box_group.boxes[0].l += BUFFER
token.box_group.boxes[0].t += BUFFER
token.box_group.boxes[0].w -= 2 * BUFFER
token.box_group.boxes[0].h -= 2 * BUFFER
```

will fix a lot of the overlapping boxes from the first figure:

![image](fixtures/token-box-overlap-fixed-with-epsilon.png)


**5. Clustering-based solution**

Now how do we fix the remaining boxes?

One thought is -- Can we do a fast thing just writing rules to compare `xmin, xmax, ymin, ymax` and directly make adjustments to those boundaries? Probably, but it gets confusing really quickly. Consider this case:

![image](fixtures/overlap-tokens-edge-case.png)

The problem is that we don't really want to work off clusters of boxes, but actually *pairs* of boxes that overlap. But the O(N^2) over all boxes on a page is kind of costly to check. So let's pre-cluster and then perform the O(N^2) within each cluster.


The technique looks something like:

![image](fixtures/overlap-token-strategy.png)

The figure above is what would happen if you just swapped the correct coordinates of the overlapping boxes. It does create more whitespace between, which may not be desirable, but it's also easier to follow. You can instead replace the relevant box coordinates with something newly calculated (e.g. split the difference).

**6. Boxes that aren't even on the page**

If we look at PDF in test fixtures `4be952924cd565488b4a239dc6549095029ee578.pdf`, we'll actually find weird boxes that come out of PDFPlumber that are off the page:

```

```
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/mmda/parsers/fixtures/token-box-overlap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 17 additions & 13 deletions src/mmda/parsers/pdfplumber_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import itertools
import string
from typing import List, Optional, Union, Tuple
from typing import List, Optional, Tuple, Union

import pdfplumber

Expand Down Expand Up @@ -201,24 +201,28 @@ def parse(self, input_pdf_path: str) -> Document:
)
assert len(word_ids_of_fine_tokens) == len(fine_tokens)
# 4) normalize / clean tokens & boxes
boxes = [
Box.from_coordinates(
x1=float(token["x0"]),
y1=float(token["top"]),
x2=float(token["x1"]),
y2=float(token["bottom"]),
page=int(page_id),
).get_relative(
page_width=float(page.width), page_height=float(page.height)
)
for token in fine_tokens
]
# 4.5) reformat
# TODO: remove tokens that are 'off page'
fine_tokens = [
{
"text": token["text"],
"fontname": token["fontname"],
"size": token["size"],
"bbox": Box.from_pdf_coordinates(
x1=float(token["x0"]),
y1=float(token["top"]),
x2=float(token["x1"]),
y2=float(token["bottom"]),
page_width=float(page.width),
page_height=float(page.height),
page=int(page_id),
).get_relative(
page_width=float(page.width), page_height=float(page.height)
),
"bbox": box,
}
for token in fine_tokens
for box, token in zip(boxes, fine_tokens)
]
# 5) group tokens into lines
# TODO - doesnt belong in parser; should be own predictor
Expand Down
Loading