Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kylel/2023/hardening codebase #259

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f391c74
formatting
kyleclo Jul 5, 2023
bd0c3ab
add functions to Box; reformatting
kyleclo Jul 5, 2023
326bf43
add tests for box
kyleclo Jul 5, 2023
a62faf4
tests for span
kyleclo Jul 5, 2023
c0f283e
fix tests for span; add flag for merging Box behavior
kyleclo Jul 5, 2023
66ec632
fix tests for Box
kyleclo Jul 5, 2023
128ed4f
bugfix annotation
kyleclo Jul 5, 2023
966dc13
comment out unfinished test
kyleclo Jul 5, 2023
78b7c3b
fix bug; span equality
kyleclo Jul 5, 2023
013b512
add boxgroup indexer; add tests for indexers
kyleclo Jul 5, 2023
caf81c3
bugfix unit test
kyleclo Jul 5, 2023
d7afec3
bugfix; metrics
kyleclo Jul 5, 2023
37b223e
bugfix test
kyleclo Jul 5, 2023
439307b
CHANGE! word predictor doesnt return merged Box anymore as part of it…
kyleclo Jul 6, 2023
db07e68
reformat; remove unused imports from Doc
kyleclo Jul 6, 2023
039f0dc
reformatting
kyleclo Jul 6, 2023
d163044
reformat
kyleclo Jul 6, 2023
524e260
move MergeSpan into figure table heuristic predictor
kyleclo Jul 6, 2023
d26a379
revert commit
kyleclo Jul 6, 2023
bb4b740
bugfix; remove repeat method in Span
kyleclo Jul 6, 2023
070ce04
add test for unsorted span merging
kyleclo Jul 6, 2023
e8953fc
update tools tests so Spans have Boxes
kyleclo Jul 6, 2023
dc70d9a
fix bug in tools; should just use the librarys span merging function
kyleclo Jul 6, 2023
1069e52
reformat test_tools; bugfix need to check Box equality via .coordinates
kyleclo Jul 6, 2023
1d546cf
minor fix in test; equivalence of Span and Box
kyleclo Jul 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/mmda/eval/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ def levenshtein(

def box_overlap(box: Box, container: Box) -> float:
"""Returns the percentage of area of a box inside of a container."""
bl, bt, bw, bh = box.xywh
bl, bt, bw, bh = box.l, box.t, box.w, box.h
br = bl + bw
bb = bt + bh

cl, ct, cw, ch = container.xywh
cl, ct, cw, ch = container.l, container.t, container.w, container.h
cr = cl + cw
cb = ct + ch

Expand Down
8 changes: 6 additions & 2 deletions src/mmda/predictors/sklearn_predictors/svm_word_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,10 @@ def _create_words(
else:
spans = [
Span.small_spans_to_big_span(
spans=[span for token in tokens_in_word for span in token.spans]
spans=[
span for token in tokens_in_word for span in token.spans
],
merge_boxes=False,
)
]
metadata = (
Expand All @@ -471,7 +474,8 @@ def _create_words(
# last bit
spans = [
Span.small_spans_to_big_span(
spans=[span for token in tokens_in_word for span in token.spans]
spans=[span for token in tokens_in_word for span in token.spans],
merge_boxes=False,
)
]
metadata = (
Expand Down
76 changes: 40 additions & 36 deletions src/mmda/types/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
__all__ = ["Annotation", "BoxGroup", "SpanGroup", "Relation"]



def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
"""Warns when a deepcopy is performed on an Annotation."""

Expand All @@ -34,15 +33,14 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
warnings.warn(msg, UserWarning, stacklevel=2)



class Annotation:
"""Annotation is intended for storing model predictions for a document."""

def __init__(
self,
id: Optional[int] = None,
doc: Optional['Document'] = None,
metadata: Optional[Metadata] = None
self,
id: Optional[int] = None,
doc: Optional["Document"] = None,
metadata: Optional[Metadata] = None,
):
self.id = id
self.doc = doc
Expand Down Expand Up @@ -77,40 +75,44 @@ def __getattr__(self, field: str) -> List["Annotation"]:
return self.__getattribute__(field)



class BoxGroup(Annotation):
def __init__(
self,
boxes: List[Box],
id: Optional[int] = None,
doc: Optional['Document'] = None,
metadata: Optional[Metadata] = None,
self,
boxes: List[Box],
id: Optional[int] = None,
doc: Optional["Document"] = None,
metadata: Optional[Metadata] = None,
allow_overlap: Optional[bool] = False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allow_overlap=False as the default is a behavioral change and will almost certainly break stuff in practice. Can we switch the default to True?

):
self.boxes = boxes
if not allow_overlap:
clusters = Box.cluster_boxes(boxes=boxes)
if any([len(cluster) > 1 for cluster in clusters]):
raise ValueError(
"BoxGroup does not allow overlapping boxes. "
"Consider setting allow_overlap=True."
)
super().__init__(id=id, doc=doc, metadata=metadata)

def to_json(self) -> Dict:
box_group_dict = dict(
boxes=[box.to_json() for box in self.boxes],
id=self.id,
metadata=self.metadata.to_json()
metadata=self.metadata.to_json(),
)
return {
key: value for key, value in box_group_dict.items() if value
} # only serialize non-null values

@classmethod
def from_json(cls, box_group_dict: Dict) -> "BoxGroup":

if "metadata" in box_group_dict:
metadata_dict = box_group_dict["metadata"]
else:
# this fallback is necessary to ensure compatibility with box
# groups that were create before the metadata migration and
# therefore have "type" in the root of the json dict instead.
metadata_dict = {
"type": box_group_dict.get("type", None)
}
metadata_dict = {"type": box_group_dict.get("type", None)}

return cls(
boxes=[
Expand All @@ -132,7 +134,7 @@ def __deepcopy__(self, memo):
box_group = BoxGroup(
boxes=deepcopy(self.boxes, memo),
id=self.id,
metadata=deepcopy(self.metadata, memo)
metadata=deepcopy(self.metadata, memo),
)

# Don't copy an attached document
Expand All @@ -150,25 +152,30 @@ def type(self, type: Union[str, None]) -> None:


class SpanGroup(Annotation):

def __init__(
self,
spans: List[Span],
box_group: Optional[BoxGroup] = None,
id: Optional[int] = None,
doc: Optional['Document'] = None,
metadata: Optional[Metadata] = None,
self,
spans: List[Span],
box_group: Optional[BoxGroup] = None,
id: Optional[int] = None,
doc: Optional["Document"] = None,
metadata: Optional[Metadata] = None,
allow_overlap: Optional[bool] = False,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same story here as BoxGroup -- False as the default is a breaking change in behavior, and there's no telling how much data we already have that violates it.

):
self.spans = spans
if not allow_overlap:
clusters = Span.cluster_spans(spans=spans)
if any([len(cluster) > 1 for cluster in clusters]):
raise ValueError(
"SpanGroup does not allow overlapping spans. "
"Consider setting allow_overlap=True."
)
self.box_group = box_group
super().__init__(id=id, doc=doc, metadata=metadata)

@property
def symbols(self) -> List[str]:
if self.doc is not None:
return [
self.doc.symbols[span.start: span.end] for span in self.spans
]
return [self.doc.symbols[span.start : span.end] for span in self.spans]
else:
return []

Expand All @@ -187,12 +194,10 @@ def to_json(self) -> Dict:
spans=[span.to_json() for span in self.spans],
id=self.id,
metadata=self.metadata.to_json(),
box_group=self.box_group.to_json() if self.box_group else None
box_group=self.box_group.to_json() if self.box_group else None,
)
return {
key: value
for key, value in span_group_dict.items()
if value is not None
key: value for key, value in span_group_dict.items() if value is not None
} # only serialize non-null values

@classmethod
Expand All @@ -211,7 +216,7 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup":
# therefore have "id", "type" in the root of the json dict instead.
metadata_dict = {
"type": span_group_dict.get("type", None),
"text": span_group_dict.get("text", None)
"text": span_group_dict.get("text", None),
}

return cls(
Expand Down Expand Up @@ -256,7 +261,7 @@ def __deepcopy__(self, memo):
spans=deepcopy(self.spans, memo),
id=self.id,
metadata=deepcopy(self.metadata, memo),
box_group=deepcopy(self.box_group, memo)
box_group=deepcopy(self.box_group, memo),
)

# Don't copy an attached document
Expand Down Expand Up @@ -284,6 +289,5 @@ def text(self, text: Union[str, None]) -> None:
self.metadata.text = text



class Relation(Annotation):
pass
pass
Loading