From d9a64ace3a2d1daa5ea4636b67dd7983943025da Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 21:53:25 -0700 Subject: [PATCH 01/21] simplify annotation --- mmda/types/annotation.py | 103 ++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index d8280cc3..a0325079 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -21,12 +21,9 @@ from mmda.types.document import Document -__all__ = ["Annotation", "BoxGroup", "SpanGroup"] +__all__ = ["Annotation", "BoxGroup", "SpanGroup", "Relation"] -def default_factory(): - return str(uuid4()) - def warn_deepcopy_of_annotation(obj: "Annotation") -> None: """Warns when a deepcopy is performed on an Annotation.""" @@ -39,49 +36,42 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None: warnings.warn(msg, UserWarning, stacklevel=2) -@dataclass + class Annotation: """Annotation is intended for storing model predictions for a document.""" - # TODO[kylel] - remove UUID from this class, as you explained to me (luca) - # it is about 10% of the wall time in processing a document - uuid: str = field(default_factory=default_factory) - doc: Optional["Document"] = field(default=None, init=False) - metadata: Metadata = field(default_factory=Metadata) + def __init__( + self, + id: Optional[int] = None, + doc: Optional['Document'] = None, + metadata: Optional[Metadata] = None + ): + self.id = id + self.doc = doc + self.metadata = metadata if metadata else Metadata() @abstractmethod def to_json(self) -> Dict: pass - # TODO[shannon] make this as an abstract method after implementing - # get_symbols for BoxGroup - def get_symbols(self) -> str: # type: ignore - pass - @classmethod @abstractmethod def from_json(cls, annotation_dict: Dict) -> "Annotation": pass - @property - def key_prefix(self) -> str: - return f"{self.__class__.__name__}|{self.uuid}|" - def attach_doc(self, doc: "Document") -> None: if not 
self.doc: self.doc = doc else: - raise AttributeError( - "This annotation already has an attached document" - ) + raise AttributeError("This annotation already has an attached document") # TODO[kylel] - comment explaining def __getattr__(self, field: str) -> List["Annotation"]: if self.doc is None: raise ValueError("This annotation is not attached to a document") - if self.key_prefix + field in self.doc.fields: - return self.doc.find_overlapping(self, self.key_prefix + field) + if field in self.doc.fields: + return self.doc.find_overlapping(self, field) if field in self.doc.fields: return self.doc.find_overlapping(self, field) @@ -95,18 +85,24 @@ def __getattr__(self, field: str) -> List["Annotation"]: # useful because it keeps backward compatibility with the old API, while # migrating id and type to metadata. @store_field_in_metadata("type") -@store_field_in_metadata("id") -@dataclass class BoxGroup(Annotation): - boxes: List[Box] = field(default_factory=list) - id: Optional[int] = None - type: Optional[str] = None + def __init__( + self, + boxes: List[Box], + type: Optional[str] = None, + id: Optional[int] = None, + doc: Optional['Document'] = None, + metadata: Optional[Metadata] = None, + ): + self.boxes = boxes + self.type = type + super().__init__(id=id, doc=doc, metadata=metadata) def to_json(self) -> Dict: box_group_dict = dict( boxes=[box.to_json() for box in self.boxes], - metadata=self.metadata.to_json(), - uuid=self.uuid, + id=self.id, + metadata=self.metadata.to_json() ) return { key: value for key, value in box_group_dict.items() if value @@ -122,7 +118,6 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup": # groups that were create before the metadata migration and # therefore have "id", "type" in the root of the json dict instead. 
metadata_dict = { - "id": box_group_dict.get("id", None), "type": box_group_dict.get("type", None), "text": box_group_dict.get("text", None) } @@ -134,8 +129,8 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup": # minimally serialize when running to_json() for box_dict in box_group_dict.get("boxes", []) ], + id=box_group_dict.get("id", None), metadata=Metadata.from_json(metadata_dict), - uuid=box_group_dict.get("uuid", str(uuid4())), ) def __getitem__(self, key: int): @@ -146,8 +141,8 @@ def __deepcopy__(self, memo): box_group = BoxGroup( boxes=deepcopy(self.boxes, memo), - metadata=deepcopy(self.metadata, memo), - uuid=self.uuid, + id=self.id, + metadata=deepcopy(self.metadata, memo) ) # Don't copy an attached document @@ -177,18 +172,22 @@ def _text_span_group_getter(span_group: "SpanGroup") -> str: # and use a custom getter to obtain the text from symbols if the text # is not explicitly set. @store_field_in_metadata("type") -@store_field_in_metadata("id") @store_field_in_metadata("text", getter_fn=_text_span_group_getter) -@dataclass class SpanGroup(Annotation): - spans: List[Span] = field(default_factory=list) - - # TODO[kylel] - implement default behavior for box_group - box_group: Optional[BoxGroup] = None - id: Optional[int] = None - type: Optional[str] = None - text: Optional[str] = None + def __init__( + self, + spans: List[Span], + type: Optional[str] = None, + text: Optional[str] = None, + id: Optional[int] = None, + doc: Optional['Document'] = None, + metadata: Optional[Metadata] = None, + ): + self.spans = spans + self.type = type + self.text = text + super().__init__(id=id, doc=doc, metadata=metadata) @property def symbols(self) -> List[str]: @@ -212,9 +211,9 @@ def annotate( def to_json(self) -> Dict: span_group_dict = dict( spans=[span.to_json() for span in self.spans], + id=self.id, metadata=self.metadata.to_json(), - box_group=self.box_group.to_json() if self.box_group else None, - uuid=self.uuid, + box_group=self.box_group.to_json() if 
self.box_group else None ) return { key: value @@ -237,7 +236,6 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup": # groups that were create before the metadata migration and # therefore have "id", "type" in the root of the json dict instead. metadata_dict = { - "id": span_group_dict.get("id", None), "type": span_group_dict.get("type", None), "text": span_group_dict.get("text", None) } @@ -247,9 +245,9 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup": Span.from_json(span_dict=span_dict) for span_dict in span_group_dict["spans"] ], + id=span_group_dict.get("id", None), metadata=Metadata.from_json(metadata_dict), box_group=box_group, - uuid=span_group_dict.get("uuid", str(uuid4())), ) def __getitem__(self, key: int): @@ -282,12 +280,17 @@ def __deepcopy__(self, memo): span_group = SpanGroup( spans=deepcopy(self.spans, memo), + id=self.id, metadata=deepcopy(self.metadata, memo), - box_group=deepcopy(self.box_group, memo), - uuid=self.uuid, + box_group=deepcopy(self.box_group, memo) ) # Don't copy an attached document span_group.doc = self.doc return span_group + + + +class Relation(Annotation): + pass \ No newline at end of file From b17bbb736250574fa9135f5652967306d9616d92 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 21:55:30 -0700 Subject: [PATCH 02/21] remove unused imports --- mmda/types/annotation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index a0325079..54e4d8bc 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -9,9 +9,7 @@ import warnings from abc import abstractmethod from copy import deepcopy -from dataclasses import dataclass, field from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union -from uuid import uuid4 from mmda.types.box import Box from mmda.types.metadata import Metadata, store_field_in_metadata From 3132290f77de17cae3a3f740b568651321622050 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:31:12 
-0700 Subject: [PATCH 03/21] remove metadata anno in favor of getter/setters --- mmda/types/annotation.py | 72 +++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index 54e4d8bc..c288c157 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -12,7 +12,7 @@ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union from mmda.types.box import Box -from mmda.types.metadata import Metadata, store_field_in_metadata +from mmda.types.metadata import Metadata from mmda.types.span import Span if TYPE_CHECKING: @@ -77,17 +77,11 @@ def __getattr__(self, field: str) -> List["Annotation"]: return self.__getattribute__(field) -# NOTE[LucaS]: by using the store_field_in_metadata decorator, we are -# able to store id and type in the metadata of BoxGroup, while keeping it -# accessible via SpanGroup.id and SpanGroup.type respectively. This is -# useful because it keeps backward compatibility with the old API, while -# migrating id and type to metadata. -@store_field_in_metadata("type") + class BoxGroup(Annotation): def __init__( self, boxes: List[Box], - type: Optional[str] = None, id: Optional[int] = None, doc: Optional['Document'] = None, metadata: Optional[Metadata] = None, @@ -114,10 +108,9 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup": else: # this fallback is necessary to ensure compatibility with box # groups that were create before the metadata migration and - # therefore have "id", "type" in the root of the json dict instead. + # therefore have "type" in the root of the json dict instead. 
metadata_dict = { - "type": box_group_dict.get("type", None), - "text": box_group_dict.get("text", None) + "type": box_group_dict.get("type", None) } return cls( @@ -148,43 +141,27 @@ def __deepcopy__(self, memo): return box_group + @property + def type(self) -> str: + return self.metadata.get("type", None) + + @type.setter + def type(self, type: Union[str, None]) -> None: + self.metadata.type = type + -def _text_span_group_getter(span_group: "SpanGroup") -> str: - """Getter used to obtain a textual representation of a SpanGroup. - - When SpanGroup.text is not set, this function uses the SpanGroup's - symbols to generate approximate a text. However, if text is set, - this function returns it instead. - """ - maybe_text = span_group.metadata.get("text", None) - return maybe_text if maybe_text else " ".join(span_group.symbols) - - -# NOTE[@soldni]: by using the store_field_in_metadata decorator, we are -# able to store id and type in the metadata of BoxGroup, while keeping it -# accessible via SpanGroup.id and SpanGroup.type respectively. This is -# useful because it keeps backward compatibility with the old API, while -# migrating id and type to metadata. -# -# Futhermore, we also store the text of the SpanGroup in the metadata, -# and use a custom getter to obtain the text from symbols if the text -# is not explicitly set. 
-@store_field_in_metadata("type") -@store_field_in_metadata("text", getter_fn=_text_span_group_getter) class SpanGroup(Annotation): def __init__( self, spans: List[Span], - type: Optional[str] = None, - text: Optional[str] = None, + box_group: Optional[BoxGroup] = None, id: Optional[int] = None, doc: Optional['Document'] = None, metadata: Optional[Metadata] = None, ): self.spans = spans - self.type = type - self.text = text + self.box_group = box_group super().__init__(id=id, doc=doc, metadata=metadata) @property @@ -202,7 +179,7 @@ def annotate( if self.doc is None: raise ValueError("SpanGroup has no attached document!") - key_remaps = {self.key_prefix + k: v for k, v in kwargs.items()} + key_remaps = {k: v for k, v in kwargs.items()} self.doc.annotate(is_overwrite=is_overwrite, **key_remaps) @@ -288,6 +265,25 @@ def __deepcopy__(self, memo): return span_group + @property + def type(self) -> str: + return self.metadata.get("type", None) + + @type.setter + def type(self, type: Union[str, None]) -> None: + self.metadata.type = type + + @property + def text(self) -> str: + maybe_text = self.metadata.get("text", None) + if maybe_text is None: + return " ".join(self.symbols) + return maybe_text + + @text.setter + def text(self, text: Union[str, None]) -> None: + self.metadata.text = text + class Relation(Annotation): From 055dfea02494da8315f9bcaf5d92e679abf6f674 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:31:30 -0700 Subject: [PATCH 04/21] remove spangroup nesting --- tests/test_types/test_span_group.py | 87 ----------------------------- 1 file changed, 87 deletions(-) diff --git a/tests/test_types/test_span_group.py b/tests/test_types/test_span_group.py index 9c63db5d..6068b5ee 100644 --- a/tests/test_types/test_span_group.py +++ b/tests/test_types/test_span_group.py @@ -23,92 +23,5 @@ def test_annotation_attaches_document(self): span_group = self.doc.tokens[0] self.assertEqual(["This", "is"], span_group.symbols) - def 
test_annotation_allows_nesting(self): - span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)]) - nested_span_group = SpanGroup(id=2, spans=[Span(0, 4)], text="This") - - self.doc.annotate(tokens=[span_group]) - - span_group = self.doc.tokens[0] - span_group.annotate(capitalized=[nested_span_group]) - - nested_span_group = span_group.capitalized[0] - self.assertEqual("This", nested_span_group.text) - self.assertEqual(["This"], nested_span_group.symbols) - - def test_serialization_with_nesting(self): - span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)]) - nested_span_group = SpanGroup(id=2, spans=[Span(0, 4)], text="This") - - self.doc.annotate(tokens=[span_group]) - - span_group = self.doc.tokens[0] - span_group.annotate(capitalized=[nested_span_group]) - - json_repr = self.doc.to_json() - new_doc = Document.from_json(json_repr) - - span_group = new_doc.tokens[0] - self.assertEqual(["This", "is"], span_group.symbols) - - nested_span_group = span_group.capitalized[0] - self.assertEqual("This", nested_span_group.text) - self.assertEqual(["This"], nested_span_group.symbols) - - def test_deep_nesting_without_id_conflicts(self): - span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)]) - nested_span_group = SpanGroup( - id=2, spans=[Span(0, 4), Span(5, 7)], text="This is" - ) - deep_span_group = SpanGroup(id=3, spans=[Span(0, 4)], text="This") - - self.doc.annotate(tokens=[span_group]) - - span_group = self.doc.tokens[0] - span_group.annotate(capitalized=[nested_span_group]) - - nested_span_group = span_group.capitalized[0] - nested_span_group.annotate(deep=[deep_span_group]) - - json_repr = self.doc.to_json() - new_doc = Document.from_json(json_repr) - - span_group = new_doc.tokens[0] - self.assertEqual(["This", "is"], span_group.symbols) - - nested_span_group = span_group.capitalized[0] - self.assertEqual("This is", nested_span_group.text) - self.assertEqual(["This", "is"], nested_span_group.symbols) - - deep_span_group = nested_span_group.deep[0] - 
self.assertEqual("This", deep_span_group.text) - self.assertEqual(["This"], deep_span_group.symbols) - - def test_deep_nesting_with_id_conflicts(self): - span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)]) - nested_span_group = SpanGroup( - id=1, spans=[Span(0, 4), Span(5, 7)], text="This is" - ) - deep_span_group = SpanGroup(id=1, spans=[Span(0, 4)], text="This") - - self.doc.annotate(tokens=[span_group]) - - span_group = self.doc.tokens[0] - span_group.annotate(capitalized=[nested_span_group]) - - nested_span_group = span_group.capitalized[0] - nested_span_group.annotate(deep=[deep_span_group]) - - json_repr = self.doc.to_json() - new_doc = Document.from_json(json_repr) - - span_group = new_doc.tokens[0] - self.assertEqual(["This", "is"], span_group.symbols) - nested_span_group = span_group.capitalized[0] - self.assertEqual("This is", nested_span_group.text) - self.assertEqual(["This", "is"], nested_span_group.symbols) - deep_span_group = nested_span_group.deep[0] - self.assertEqual("This", deep_span_group.text) - self.assertEqual(["This"], deep_span_group.symbols) From 813e13475cd87997b63b474a4d5400edd1ab266e Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:38:57 -0700 Subject: [PATCH 05/21] fix grobid parser tests --- mmda/parsers/grobid_parser.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mmda/parsers/grobid_parser.py b/mmda/parsers/grobid_parser.py index b2a5fe43..a358c34e 100644 --- a/mmda/parsers/grobid_parser.py +++ b/mmda/parsers/grobid_parser.py @@ -109,7 +109,9 @@ def _get_title(self, root: et.Element) -> SpanGroup: tokens = text.split() spans = _get_token_spans(text, tokens) - return SpanGroup(spans=spans, text=text) + sg = SpanGroup(spans=spans) + sg.text = text + return sg def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup: matches = root.findall(".//tei:profileDesc//tei:abstract//", NS) @@ -122,4 +124,6 @@ def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup: 
tokens = text.split() spans = _get_token_spans(text, tokens, offset=offset) - return SpanGroup(spans=spans, text=text) + sg = SpanGroup(spans=spans) + sg.text = text + return sg From affc3e84711c4a7a4a932cbc4ccaaddf81f96c31 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:43:23 -0700 Subject: [PATCH 06/21] fix tests for dict word predictor --- .../dictionary_word_predictor.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py index 6d8bd4f6..8ee3d617 100644 --- a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py +++ b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py @@ -145,23 +145,16 @@ def predict(self, document: Document) -> List[SpanGroup]: or combined_no_hyphen.lower() in self.dictionary or combined_no_hyphen.lower() in local_dictionary ): - combined_text = curr_row_last_token_text[:-1] + self._token_text( - next_row_first_token - ) - span_group = SpanGroup( - spans=curr_row_last_token.spans + next_row_first_token.spans, - text=combined_text, - ) + combined_text = curr_row_last_token_text[:-1] + \ + self._token_text(next_row_first_token) else: # Use the combined, hyphenated word instead (e.g., few-shot) - combined_text = curr_row_last_token_text + self._token_text( - next_row_first_token - ) - span_group = SpanGroup( - spans=curr_row_last_token.spans + next_row_first_token.spans, - text=combined_text, - ) - + combined_text = curr_row_last_token_text + \ + self._token_text(next_row_first_token) + span_group = SpanGroup( + spans=curr_row_last_token.spans + next_row_first_token.spans + ) + span_group.text = combined_text words.append(span_group) # add IDs to each word @@ -174,7 +167,9 @@ def _token_text(self, token: SpanGroup) -> str: return "".join(token.symbols) def _copy_token_with_text(self, token: SpanGroup) -> SpanGroup: - return 
SpanGroup(spans=token.spans, text=self._token_text(token)) + sg = SpanGroup(spans=token.spans) + sg.text = self._token_text(token) + return sg def _row_pairs(self, document): for i in range(0, len(document.rows) - 1): From 87de5c900af862ee0b0c7d2c1cfae912e1efdea4 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:57:22 -0700 Subject: [PATCH 07/21] fix bug in grobidparser --- mmda/parsers/grobid_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmda/parsers/grobid_parser.py b/mmda/parsers/grobid_parser.py index a358c34e..8dad2734 100644 --- a/mmda/parsers/grobid_parser.py +++ b/mmda/parsers/grobid_parser.py @@ -23,7 +23,8 @@ def _null_span_group() -> SpanGroup: - return SpanGroup(spans=[], text="") + sg = SpanGroup(spans=[]) + return sg def _get_token_spans(text: str, tokens: List[str], offset: int = 0) -> List[int]: From 92b79d432342e845c48b8ec168d8738ca05de2a5 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:57:39 -0700 Subject: [PATCH 08/21] fix bug; forgot to remove type from init in boxgroup --- mmda/types/annotation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index c288c157..7949c7e2 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -87,7 +87,6 @@ def __init__( metadata: Optional[Metadata] = None, ): self.boxes = boxes - self.type = type super().__init__(id=id, doc=doc, metadata=metadata) def to_json(self) -> Dict: From 70bc6bf094714701f1f01109dbb3c2e5d9d59378 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 22:58:08 -0700 Subject: [PATCH 09/21] make tests in json conversion more lenient; doesnt need to be exactly the same object, just the values need to be the same after to/from json --- tests/test_types/test_json_conversion.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py index 
416358b4..e7a5f27d 100644 --- a/tests/test_types/test_json_conversion.py +++ b/tests/test_types/test_json_conversion.py @@ -16,13 +16,15 @@ def test_span_group_conversion(): - sg = SpanGroup(id=3, metadata=Metadata.from_json({"text": "test"})) + sg = SpanGroup(spans=[], id=3, metadata=Metadata.from_json({"text": "test"})) sg2 = SpanGroup.from_json(sg.to_json()) - assert sg2 == sg + assert sg2.to_json() == sg.to_json() + assert sg2.__dict__ == sg.__dict__ - bg = BoxGroup(metadata=Metadata.from_json({"text": "test", "id": 1})) + bg = BoxGroup(boxes=[], metadata=Metadata.from_json({"text": "test", "id": 1})) bg2 = BoxGroup.from_json(bg.to_json()) - assert bg2 == bg + assert bg2.to_json() == bg.to_json() + assert bg2.__dict__ == bg.__dict__ def test_doc_conversion(): @@ -54,7 +56,6 @@ def test_doc_conversion(): for orig_sg, new_sg in field_it: # for each pair, they should have same metadata (type, id, - # and optionally, text), same spans, and same uuid. + # and optionally, text) and same spans. 
assert orig_sg.metadata == new_sg.metadata - assert orig_sg.uuid == new_sg.uuid assert orig_sg.spans == new_sg.spans From 34d6fc619143a9340a4987c42a8e5d3b1de85341 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Wed, 5 Oct 2022 23:23:01 -0700 Subject: [PATCH 10/21] oops forgot to commit --- tests/test_internal_ai2/test_api.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_internal_ai2/test_api.py b/tests/test_internal_ai2/test_api.py index b3873573..75c72a77 100644 --- a/tests/test_internal_ai2/test_api.py +++ b/tests/test_internal_ai2/test_api.py @@ -65,8 +65,5 @@ def test_equivalence(self): }) sg_ann_2 = ClassificationSpanGroup.from_mmda(sg_ann).to_mmda() - # we need to manually set the uuids to be equal - # because by default they are randomly generated - sg_ann.uuid = sg_ann_2.uuid = 'manually-fix-to-avoid-randomness' - - self.assertEqual(sg_ann, sg_ann_2) + self.assertDictEqual(sg_ann.to_json(), sg_ann_2.to_json()) + self.assertDictEqual(sg_ann.__dict__, sg_ann_2.__dict__) From 32a2e119e62f1dfee9166d01c548a1ff02a363f8 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 6 Oct 2022 11:39:51 -0700 Subject: [PATCH 11/21] change internal ai2 test api --- tests/test_internal_ai2/test_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_internal_ai2/test_api.py b/tests/test_internal_ai2/test_api.py index 75c72a77..6cd2f4f8 100644 --- a/tests/test_internal_ai2/test_api.py +++ b/tests/test_internal_ai2/test_api.py @@ -19,7 +19,8 @@ class TestApi(unittest.TestCase): def test_vanilla_span_group(self) -> None: sg_ann = mmda_ann.SpanGroup.from_json({ 'spans': [{'start': 0, 'end': 1}], - 'metadata': {'text': 'hello', 'id': 1} + 'id': 1, + 'metadata': {'text': 'hello', 'id': 999} # note id not used; it's just in metadata }) sg_api = mmda_api.SpanGroup.from_mmda(sg_ann) From d763d097f296ce588d8619a9c1c94f9ba2aab8b1 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 6 Oct 2022 11:46:41 -0700 Subject: [PATCH 12/21] 
bugfix; vila model SpanGroup creation had type --- .../hf_predictors/token_classification_predictor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mmda/predictors/hf_predictors/token_classification_predictor.py b/mmda/predictors/hf_predictors/token_classification_predictor.py index 801dfbad..1570d815 100644 --- a/mmda/predictors/hf_predictors/token_classification_predictor.py +++ b/mmda/predictors/hf_predictors/token_classification_predictor.py @@ -106,8 +106,9 @@ def postprocess(self, document: Document, model_predictions) -> List[SpanGroup]: start = min([ele.start for ele in cur_spans]) end = max([ele.end for ele in cur_spans]) - prediction_spans.append(SpanGroup(spans=[Span(start, end)], type=label)) - + sg = SpanGroup(spans=[Span(start, end)]) + sg.type = label + prediction_spans.append(sg) return prediction_spans From 98da4ead49c04db0c989e64a61cf28efbba64eea Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 6 Oct 2022 13:26:07 -0700 Subject: [PATCH 13/21] modify metadata constructor to take args; adjust all tests/models that instantiate spangroups --- README.md | 27 ++++++++++++++++++- mmda/parsers/grobid_parser.py | 8 +++--- .../dictionary_word_predictor.py | 4 +-- .../token_classification_predictor.py | 4 +-- .../hf_predictors/vila_predictor.py | 4 ++- mmda/predictors/lp_predictors.py | 6 ++--- mmda/types/metadata.py | 4 +++ tests/test_predictors/test_vila_predictors.py | 13 ++++++--- 8 files changed, 51 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index cb8dc804..3567ccc3 100644 --- a/README.md +++ b/README.md @@ -111,8 +111,31 @@ A key aspect of using this library is understanding how these different fields a +#### 4. What's in a `SpanGroup`? -#### 4. 
Adding a new SpanGroup field +Each `SpanGroup` object stores information about its contents and position: + +* `.spans: List[Span]`. A `Span` is a pointer into `Document.symbols` (that is, `Span(start=0, end=5)` corresponds to `symbols[0:5]`) together with a single `Box` representing its position & rectangular region on the page. + +* `.box_group: BoxGroup`. A `BoxGroup` object stores `.boxes: List[Box]`. + +* `.metadata: Metadata`. A free-form container for additional annotation fields (for example `type` and `text`); it supports dot access and dict-like access. + + * **Span-Box Coupling:** Every `Span` is associated with a single `Box`, and not a `BoxGroup`. In this library, we restrict all of our `Span` to be units that can be represented by a single rectangular box. This is instead of allowing *any* (start, end), which would result in spans that can't necessarily be cleanly represented by a single box. + * + +**FAQs** + +Q: Why do we need `BoxGroup` if we already have `Box` in each `Span`? + +A: Let's consider a `SpanGroup` object representing a single sentence in a paper. We know a single `Box` can't properly cover a sentence, because sentences can wrap rows & even cross columns/pages: + +* One way to represent the visual area of that sentence is to take the union of all `Box` in every involved `Span` -- this leaves us with many rectangles. +* Another way is to synthesize all those `Box` into one giant `Box` (which might even overlap other text outside of this sentence). +* Finally, a third way is to synthesize all the `Box` of tokens on the same row into one `Box`, but keep `Box` on different rows separate. None of these ways is the single correct answer, which is why a `SpanGroup` keeps its own `BoxGroup` in addition to the `Box` on each `Span`. + + +#### 5. Adding a new SpanGroup field Not all Documents will have all segmentations available at creation time. You may need to load new fields to an existing `Document`. 
This is where `Predictor` comes in: @@ -127,6 +150,8 @@ output = predictor.predict(document=doc) + + ## Parsers * [PDFPlumber](https://github.com/jsvine/pdfplumber) - MIT License diff --git a/mmda/parsers/grobid_parser.py b/mmda/parsers/grobid_parser.py index 8dad2734..8b028c34 100644 --- a/mmda/parsers/grobid_parser.py +++ b/mmda/parsers/grobid_parser.py @@ -15,7 +15,7 @@ from mmda.parsers.parser import Parser from mmda.types.annotation import SpanGroup from mmda.types.document import Document -from mmda.types.names import Symbols +from mmda.types.metadata import Metadata from mmda.types.span import Span DEFAULT_API = "http://localhost:8070/api/processHeaderDocument" @@ -110,8 +110,7 @@ def _get_title(self, root: et.Element) -> SpanGroup: tokens = text.split() spans = _get_token_spans(text, tokens) - sg = SpanGroup(spans=spans) - sg.text = text + sg = SpanGroup(spans=spans, metadata=Metadata(text=text)) return sg def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup: @@ -125,6 +124,5 @@ def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup: tokens = text.split() spans = _get_token_spans(text, tokens, offset=offset) - sg = SpanGroup(spans=spans) - sg.text = text + sg = SpanGroup(spans=spans, metadata=Metadata(text=text)) return sg diff --git a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py index 8ee3d617..f8d565ad 100644 --- a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py +++ b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py @@ -8,6 +8,7 @@ from typing import Optional, Set, List from mmda.predictors.base_predictors.base_predictor import BasePredictor +from mmda.types.metadata import Metadata from mmda.types.annotation import Annotation, Span, SpanGroup from mmda.types.document import Document from mmda.types.names import Rows, Tokens @@ -167,8 +168,7 @@ def _token_text(self, token: SpanGroup) -> str: return 
"".join(token.symbols) def _copy_token_with_text(self, token: SpanGroup) -> SpanGroup: - sg = SpanGroup(spans=token.spans) - sg.text = self._token_text(token) + sg = SpanGroup(spans=token.spans, metadata=Metadata(text=self._token_text(token))) return sg def _row_pairs(self, document): diff --git a/mmda/predictors/hf_predictors/token_classification_predictor.py b/mmda/predictors/hf_predictors/token_classification_predictor.py index 1570d815..4e37ca91 100644 --- a/mmda/predictors/hf_predictors/token_classification_predictor.py +++ b/mmda/predictors/hf_predictors/token_classification_predictor.py @@ -12,6 +12,7 @@ from mmda.types.names import * from mmda.types.annotation import Annotation, Span, SpanGroup from mmda.types.document import Document +from mmda.types.metadata import Metadata from mmda.predictors.hf_predictors.utils import ( convert_document_page_to_pdf_dict, convert_sequence_tagging_to_spans, @@ -106,8 +107,7 @@ def postprocess(self, document: Document, model_predictions) -> List[SpanGroup]: start = min([ele.start for ele in cur_spans]) end = max([ele.end for ele in cur_spans]) - sg = SpanGroup(spans=[Span(start, end)]) - sg.type = label + sg = SpanGroup(spans=[Span(start, end)], metadata=Metadata(type=label)) prediction_spans.append(sg) return prediction_spans diff --git a/mmda/predictors/hf_predictors/vila_predictor.py b/mmda/predictors/hf_predictors/vila_predictor.py index 5cc1a411..a8b78fa1 100644 --- a/mmda/predictors/hf_predictors/vila_predictor.py +++ b/mmda/predictors/hf_predictors/vila_predictor.py @@ -19,6 +19,7 @@ from mmda.types.names import * from mmda.types.annotation import Annotation, Span, SpanGroup +from mmda.types.metadata import Metadata from mmda.types.document import Document from mmda.predictors.hf_predictors.utils import ( convert_document_page_to_pdf_dict, @@ -167,7 +168,8 @@ def postprocess( start = min([ele.start for ele in cur_spans]) end = max([ele.end for ele in cur_spans]) - 
prediction_spans.append(SpanGroup(spans=[Span(start, end)], type=label)) + sg = SpanGroup(spans=[Span(start, end)], metadata=Metadata(type=label)) + prediction_spans.append(sg) return prediction_spans diff --git a/mmda/predictors/lp_predictors.py b/mmda/predictors/lp_predictors.py index d41ab62a..878fb7be 100644 --- a/mmda/predictors/lp_predictors.py +++ b/mmda/predictors/lp_predictors.py @@ -3,10 +3,8 @@ from tqdm import tqdm import layoutparser as lp +from mmda.types import Document, Box, BoxGroup, Metadata from mmda.types.names import * -from mmda.types.document import Document -from mmda.types.box import Box -from mmda.types.annotation import BoxGroup, Annotation from mmda.predictors.base_predictors.base_predictor import BasePredictor @@ -83,7 +81,7 @@ def postprocess(self, page_height=page_height, ) ], - type=block.type, + metadata=Metadata(type=block.type) ) for block in model_outputs ] diff --git a/mmda/types/metadata.py b/mmda/types/metadata.py index c33c6aae..26e77103 100644 --- a/mmda/types/metadata.py +++ b/mmda/types/metadata.py @@ -29,6 +29,10 @@ class Metadata: """An object that contains metadata for an annotation. 
It supports dot access and dict-like access.""" + def __init__(self, **kwargs): + for k, v in kwargs.items(): + self.set(k, v) + @overload def get(self, key: str) -> Any: """Get value with name `key` in metadata; diff --git a/tests/test_predictors/test_vila_predictors.py b/tests/test_predictors/test_vila_predictors.py index 2698ac2d..eeffd98e 100644 --- a/tests/test_predictors/test_vila_predictors.py +++ b/tests/test_predictors/test_vila_predictors.py @@ -1,4 +1,6 @@ -import json +import json +import os +import pathlib from PIL import Image @@ -13,6 +15,9 @@ ) +os.chdir(pathlib.Path(__file__).parent) + + DOCBANK_LABEL_MAP = { "0": "paragraph", "1": "title", @@ -61,8 +66,8 @@ def test_vila_predictors(): pdfplumber_parser = PDFPlumberParser() rasterizer = PDF2ImageRasterizer() - doc = pdfplumber_parser.parse(input_pdf_path="tests/fixtures/1903.10676.pdf") - images = rasterizer.rasterize(input_pdf_path="tests/fixtures/1903.10676.pdf", dpi=72) + doc = pdfplumber_parser.parse(input_pdf_path="../fixtures/1903.10676.pdf") + images = rasterizer.rasterize(input_pdf_path="../fixtures/1903.10676.pdf", dpi=72) doc.annotate_images(images) layout_regions = layout_predictor.predict(doc) @@ -124,7 +129,7 @@ def test_vila_predictors(): def test_vila_predictors_with_special_unicode_inputs(): - test_doc_path = "tests/fixtures/unicode-test.json" + test_doc_path = "../fixtures/unicode-test.json" with open(test_doc_path, 'r') as fp: res = json.load(fp) From 9d09af7858a3c23b021f238e17f413d4e00a09b1 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 6 Oct 2022 16:28:55 -0700 Subject: [PATCH 14/21] add basic relation json conversion; WIP --- mmda/types/__init__.py | 5 +- mmda/types/annotation.py | 74 ++++++++++++++++++------ mmda/types/names.py | 22 ++++++- tests/test_types/test_json_conversion.py | 16 ++++- 4 files changed, 93 insertions(+), 24 deletions(-) diff --git a/mmda/types/__init__.py b/mmda/types/__init__.py index d0f3929c..24dcf0aa 100644 --- a/mmda/types/__init__.py +++ 
b/mmda/types/__init__.py @@ -1,5 +1,5 @@ from mmda.types.document import Document -from mmda.types.annotation import SpanGroup, BoxGroup +from mmda.types.annotation import SpanGroup, BoxGroup, Relation from mmda.types.span import Span from mmda.types.box import Box from mmda.types.image import PILImage @@ -12,5 +12,6 @@ 'Span', 'Box', 'PILImage', - 'Metadata' + 'Metadata', + "Relation" ] \ No newline at end of file diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index 7949c7e2..28feec7c 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -5,12 +5,15 @@ Collections of Annotations are how one constructs a new Iterable of Group-type objects within the Document +@kylel, @lucas + """ import warnings from abc import abstractmethod from copy import deepcopy from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union +from mmda.types.names import from mmda.types.box import Box from mmda.types.metadata import Metadata from mmda.types.span import Span @@ -18,11 +21,9 @@ if TYPE_CHECKING: from mmda.types.document import Document - __all__ = ["Annotation", "BoxGroup", "SpanGroup", "Relation"] - def warn_deepcopy_of_annotation(obj: "Annotation") -> None: """Warns when a deepcopy is performed on an Annotation.""" @@ -34,7 +35,6 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None: warnings.warn(msg, UserWarning, stacklevel=2) - class Annotation: """Annotation is intended for storing model predictions for a document.""" @@ -77,7 +77,6 @@ def __getattr__(self, field: str) -> List["Annotation"]: return self.__getattribute__(field) - class BoxGroup(Annotation): def __init__( self, @@ -150,7 +149,6 @@ def type(self, type: Union[str, None]) -> None: class SpanGroup(Annotation): - def __init__( self, spans: List[Span], @@ -172,16 +170,6 @@ def symbols(self) -> List[str]: else: return [] - def annotate( - self, is_overwrite: bool = False, **kwargs: Iterable["Annotation"] - ) -> None: - if self.doc is None: - raise 
ValueError("SpanGroup has no attached document!") - - key_remaps = {k: v for k, v in kwargs.items()} - - self.doc.annotate(is_overwrite=is_overwrite, **key_remaps) - def to_json(self) -> Dict: span_group_dict = dict( spans=[span.to_json() for span in self.spans], @@ -284,6 +272,58 @@ def text(self, text: Union[str, None]) -> None: self.metadata.text = text - class Relation(Annotation): - pass \ No newline at end of file + def __init__( + self, + query: SpanGroup, + value: SpanGroup, + id: Optional[int] = None, + doc: Optional['Document'] = None, + metadata: Optional[Metadata] = None + ): + if query.id is None: + raise ValueError(f'Relation requires the query {query} to have an ID') + if value.id is None: + raise ValueError(f'Relation requires the value {value} to have an ID') + self.query = query + self.value = value + super().__init__(id=id, doc=doc, metadata=metadata) + + def to_json(self, is_minimal: Optional[bool] = True) -> Dict: + if is_minimal: + relation_dict = dict( + query=self.query.id, + value=self.value.id, + id=self.id, + metadata=self.metadata.to_json() + ) + else: + relation_dict = dict( + query=self.query.to_json(), + value=self.value.to_json(), + id=self.id, + metadata=self.metadata.to_json() + ) + return { + key: value + for key, value in relation_dict.items() + if value is not None + } # only serialize non-null values + + @classmethod + def from_json(cls, relation_dict: Dict, is_minimal: Optional[bool] = True) -> "Relation": + metadata_dict = relation_dict.get('metadata', {}) + if is_minimal: + return cls( + query=SpanGroup.from_json(span_group_dict=relation_dict['query']), + value=SpanGroup.from_json(span_group_dict=relation_dict['value']), + id=relation_dict.get("id", None), + metadata=Metadata.from_json(metadata_dict) + ) + else: + return cls( + query=SpanGroup.from_json(span_group_dict=relation_dict['query']), + value=SpanGroup.from_json(span_group_dict=relation_dict['value']), + id=relation_dict.get("id", None), + 
metadata=Metadata.from_json(metadata_dict) + ) diff --git a/mmda/types/names.py b/mmda/types/names.py index 49460dbe..fbfd9327 100644 --- a/mmda/types/names.py +++ b/mmda/types/names.py @@ -11,8 +11,26 @@ Images = 'images' Pages = 'pages' -Tokens = 'tokens' Rows = 'rows' -Sentences = 'sents' Blocks = 'blocks' + +Tokens = 'tokens' Words = 'words' +Sentences = 'sents' +Paragraphs = 'paras' +SectionHeadings = 'secs' + +Figures = 'figures' +Tables = 'tables' +Captions = 'captions' + +BibEntries = 'bibs' +CiteMentions = 'cites' +ReferenceMentions = 'refs' + +# singletons +Title = 'title' +Abstract = 'abstract' + +# relations +SectionParagraphs \ No newline at end of file diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py index e7a5f27d..ee98e03d 100644 --- a/tests/test_types/test_json_conversion.py +++ b/tests/test_types/test_json_conversion.py @@ -8,7 +8,7 @@ import json from pathlib import Path -from mmda.types import BoxGroup, SpanGroup, Document, Metadata +from mmda.types import BoxGroup, SpanGroup, Document, Metadata, Relation from mmda.parsers import PDFPlumberParser @@ -16,17 +16,27 @@ def test_span_group_conversion(): - sg = SpanGroup(spans=[], id=3, metadata=Metadata.from_json({"text": "test"})) + sg = SpanGroup(spans=[], id=3, metadata=Metadata(text='test')) sg2 = SpanGroup.from_json(sg.to_json()) assert sg2.to_json() == sg.to_json() assert sg2.__dict__ == sg.__dict__ - bg = BoxGroup(boxes=[], metadata=Metadata.from_json({"text": "test", "id": 1})) + bg = BoxGroup(boxes=[], metadata=Metadata(text='test')) bg2 = BoxGroup.from_json(bg.to_json()) assert bg2.to_json() == bg.to_json() assert bg2.__dict__ == bg.__dict__ +def test_relation_conversion(): + r = Relation( + query=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), + value=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), + id=999, + metadata=Metadata(type='something') + ) + r.to_json() + + def test_doc_conversion(): pdfparser = 
PDFPlumberParser() From e827f0211ce74179b8e8f4513a0d01c2e04954c4 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Mon, 10 Oct 2022 12:10:25 -0700 Subject: [PATCH 15/21] WIP; for relations, handle field-aware id --- mmda/types/annotation.py | 25 +++++++++++++++++-------- mmda/types/names.py | 6 +++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index 28feec7c..8bcd33d1 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -42,11 +42,13 @@ def __init__( self, id: Optional[int] = None, doc: Optional['Document'] = None, + field: Optional[str] = None, metadata: Optional[Metadata] = None ): self.id = id self.doc = doc self.metadata = metadata if metadata else Metadata() + self.field = field @abstractmethod def to_json(self) -> Dict: @@ -83,10 +85,11 @@ def __init__( boxes: List[Box], id: Optional[int] = None, doc: Optional['Document'] = None, + field: Optional[str] = None, metadata: Optional[Metadata] = None, ): self.boxes = boxes - super().__init__(id=id, doc=doc, metadata=metadata) + super().__init__(id=id, doc=doc, field=field, metadata=metadata) def to_json(self) -> Dict: box_group_dict = dict( @@ -131,6 +134,7 @@ def __deepcopy__(self, memo): box_group = BoxGroup( boxes=deepcopy(self.boxes, memo), id=self.id, + field=self.field, metadata=deepcopy(self.metadata, memo) ) @@ -155,18 +159,17 @@ def __init__( box_group: Optional[BoxGroup] = None, id: Optional[int] = None, doc: Optional['Document'] = None, + field: Optional[str] = None, metadata: Optional[Metadata] = None, ): self.spans = spans self.box_group = box_group - super().__init__(id=id, doc=doc, metadata=metadata) + super().__init__(id=id, doc=doc, field=field, metadata=metadata) @property def symbols(self) -> List[str]: if self.doc is not None: - return [ - self.doc.symbols[span.start: span.end] for span in self.spans - ] + return [self.doc.symbols[span.start: span.end] for span in self.spans] else: return [] @@ -243,6 +246,7 
@@ def __deepcopy__(self, memo): span_group = SpanGroup( spans=deepcopy(self.spans, memo), id=self.id, + field=self.field, metadata=deepcopy(self.metadata, memo), box_group=deepcopy(self.box_group, memo) ) @@ -279,6 +283,7 @@ def __init__( value: SpanGroup, id: Optional[int] = None, doc: Optional['Document'] = None, + field: Optional[str] = None, metadata: Optional[Metadata] = None ): if query.id is None: @@ -287,13 +292,17 @@ def __init__( raise ValueError(f'Relation requires the value {value} to have an ID') self.query = query self.value = value - super().__init__(id=id, doc=doc, metadata=metadata) + super().__init__(id=id, doc=doc, field=field, metadata=metadata) + + @classmethod + def entity_id(cls, entity: SpanGroup) -> str: + return f'{entity.field}-{entity.id}' def to_json(self, is_minimal: Optional[bool] = True) -> Dict: if is_minimal: relation_dict = dict( - query=self.query.id, - value=self.value.id, + query=Relation.entity_id(self.query), + value=Relation.entity_id(self.value), id=self.id, metadata=self.metadata.to_json() ) diff --git a/mmda/types/names.py b/mmda/types/names.py index fbfd9327..020debb2 100644 --- a/mmda/types/names.py +++ b/mmda/types/names.py @@ -1,6 +1,6 @@ """ -Names of fields, as strings +Names of Annotations, as strings @kylel @@ -17,8 +17,8 @@ Tokens = 'tokens' Words = 'words' Sentences = 'sents' +Sections = 'secs' Paragraphs = 'paras' -SectionHeadings = 'secs' Figures = 'figures' Tables = 'tables' @@ -33,4 +33,4 @@ Abstract = 'abstract' # relations -SectionParagraphs \ No newline at end of file +RefersTo = 'refers_to' \ No newline at end of file From 2577c7d199cf7e89b7aeb6d74b7c3f1d0d16bcfa Mon Sep 17 00:00:00 2001 From: kyleclo Date: Tue, 11 Oct 2022 15:42:56 -0700 Subject: [PATCH 16/21] wip; minor cleanup --- mmda/parsers/pdfplumber_parser.py | 10 +++------- mmda/types/annotation.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/mmda/parsers/pdfplumber_parser.py b/mmda/parsers/pdfplumber_parser.py index 
7e8d44c6..44645a8f 100644 --- a/mmda/parsers/pdfplumber_parser.py +++ b/mmda/parsers/pdfplumber_parser.py @@ -99,7 +99,9 @@ def __init__( self.split_at_punctuation = split_at_punctuation def parse(self, input_pdf_path: str) -> Document: - doc = self._load_pdf_as_doc(input_pdf_path) + page_to_line_to_tokens = self._load_pdf_tokens(input_pdf_path) + doc_json = self._convert_nested_text_to_doc_json(page_to_line_to_tokens) + doc = Document.from_json(doc_json) return doc def _load_page_tokens( @@ -238,12 +240,6 @@ def _convert_nested_text_to_doc_json(self, page_to_row_to_tokens: Dict) -> Dict: Rows: [row.to_json() for row in row_annos], } - def _load_pdf_as_doc(self, input_pdf_path: str) -> Document: - page_to_line_to_tokens = self._load_pdf_tokens(input_pdf_path) - doc_json = self._convert_nested_text_to_doc_json(page_to_line_to_tokens) - doc = Document.from_json(doc_json) - return doc - def _simple_line_detection( self, page_tokens: List[Dict], diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index c7eea81d..5edeec47 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -13,7 +13,7 @@ from copy import deepcopy from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union -from mmda.types.names import + from mmda.types.box import Box from mmda.types.metadata import Metadata from mmda.types.span import Span From e5d1d2811ad551b580eef84f2c24952428dc31b9 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 13 Oct 2022 13:57:15 -0700 Subject: [PATCH 17/21] add AnnotationName class; add lookup method to Document based on name; base Relation class on storage of these names; define to and from_json --- mmda/types/annotation.py | 120 +++++++++++++++-------- mmda/types/document.py | 74 +++++++------- tests/test_types/test_json_conversion.py | 2 +- 3 files changed, 118 insertions(+), 78 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index 5edeec47..ad2f3113 100644 --- a/mmda/types/annotation.py +++ 
b/mmda/types/annotation.py @@ -8,12 +8,12 @@ @kylel, @lucas """ +import logging import warnings from abc import abstractmethod from copy import deepcopy from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union - from mmda.types.box import Box from mmda.types.metadata import Metadata from mmda.types.span import Span @@ -35,6 +35,23 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None: warnings.warn(msg, UserWarning, stacklevel=2) +class AnnotationName: + """Stores a name that uniquely identifies this Annotation within a Document""" + + def __init__(self, field: str, id: int): + self.field = field + self.id = id + + def __str__(self) -> str: + return f"{self.field}-{self.id}" + + @classmethod + def from_str(cls, s: str) -> 'AnnotationName': + field, id = s.split('-') + id = int(id) + return AnnotationName(field=field, id=id) + + class Annotation: """Annotation is intended for storing model predictions for a document.""" @@ -47,35 +64,50 @@ def __init__( ): self.id = id self.doc = doc - self.metadata = metadata if metadata else Metadata() self.field = field + self.metadata = metadata if metadata else Metadata() @abstractmethod def to_json(self) -> Dict: - pass + raise NotImplementedError @classmethod @abstractmethod def from_json(cls, annotation_dict: Dict) -> "Annotation": - pass + raise NotImplementedError - def attach_doc(self, doc: "Document") -> None: + @property + def name(self) -> Optional[AnnotationName]: + if self.field and self.id: + return AnnotationName(field=self.field, id=self.id) + else: + return None + + def _attach_doc(self, doc: "Document", field: str) -> None: if not self.doc: self.doc = doc + self.field = field else: raise AttributeError("This annotation already has an attached document") - # TODO[kylel] - comment explaining - def __getattr__(self, field: str) -> List["Annotation"]: - if self.doc is None: - raise ValueError("This annotation is not attached to a document") + def _get_siblings(self) -> List['Annotation']: + 
"""This method gets all other objects sharing the same field as the current object. + Only works after a Document has been attached, which is how objects learn their `field`.""" + if not self.doc: + raise AttributeError("This annotation does not have an attached document") + return self.doc.__getattribute__(self.field) - if field in self.doc.fields: - return self.doc.find_overlapping(self, field) + def __getattr__(self, field: str) -> List["Annotation"]: + """This method allows jumping from an object of one field to all overlapping + objects of another field. For example `page.tokens` jumps from a particular page + to all its intersecting tokens.""" + if not self.doc: + raise AttributeError("This annotation does not have an attached document") if field in self.doc.fields: return self.doc.find_overlapping(self, field) + # TODO[kylel] - when does this ever get called? infinite loop? return self.__getattribute__(field) @@ -92,6 +124,7 @@ def __init__( super().__init__(id=id, doc=doc, field=field, metadata=metadata) def to_json(self) -> Dict: + """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat""" box_group_dict = dict( boxes=[box.to_json() for box in self.boxes], id=self.id, @@ -145,15 +178,16 @@ def __deepcopy__(self, memo): @property def type(self) -> str: + logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`') return self.metadata.get("type", None) @type.setter def type(self, type: Union[str, None]) -> None: + logging.warning(msg='`.type` to be deprecated in future versions. 
Use `.metadata.type`') self.metadata.type = type class SpanGroup(Annotation): - def __init__( self, spans: List[Span], @@ -174,17 +208,8 @@ def symbols(self) -> List[str]: else: return [] - def annotate( - self, is_overwrite: bool = False, **kwargs: Iterable["Annotation"] - ) -> None: - if self.doc is None: - raise ValueError("SpanGroup has no attached document!") - - key_remaps = {k: v for k, v in kwargs.items()} - - self.doc.annotate(is_overwrite=is_overwrite, **key_remaps) - def to_json(self) -> Dict: + """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat""" span_group_dict = dict( spans=[span.to_json() for span in self.spans], id=self.id, @@ -210,7 +235,7 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup": else: # this fallback is necessary to ensure compatibility with span # groups that were create before the metadata migration and - # therefore have "id", "type" in the root of the json dict instead. + # therefore have "type" in the root of the json dict instead. metadata_dict = { "type": span_group_dict.get("type", None), "text": span_group_dict.get("text", None) @@ -269,10 +294,12 @@ def __deepcopy__(self, memo): @property def type(self) -> str: + logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`') return self.metadata.get("type", None) @type.setter def type(self, type: Union[str, None]) -> None: + logging.warning(msg='`.type` to be deprecated in future versions. 
Use `.metadata.type`') self.metadata.type = type @property @@ -290,36 +317,33 @@ def text(self, text: Union[str, None]) -> None: class Relation(Annotation): def __init__( self, - query: SpanGroup, + key: SpanGroup, value: SpanGroup, id: Optional[int] = None, doc: Optional['Document'] = None, field: Optional[str] = None, metadata: Optional[Metadata] = None ): - if query.id is None: - raise ValueError(f'Relation requires the query {query} to have an ID') - if value.id is None: - raise ValueError(f'Relation requires the value {value} to have an ID') - self.query = query + if key.name is None: + raise ValueError(f'Relation requires the key {key} to have a `.name`') + if value.name is None: + raise ValueError(f'Relation requires the value {value} to have a `.name`') + self.key = key self.value = value super().__init__(id=id, doc=doc, field=field, metadata=metadata) - @classmethod - def entity_id(cls, entity: SpanGroup) -> str: - return f'{entity.field}-{entity.id}' - def to_json(self, is_minimal: Optional[bool] = True) -> Dict: + """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat""" if is_minimal: relation_dict = dict( - query=Relation.entity_id(self.query), - value=Relation.entity_id(self.value), + key=self.key.name, + value=self.value.name, id=self.id, metadata=self.metadata.to_json() ) else: relation_dict = dict( - query=self.query.to_json(), + key=self.key.to_json(), value=self.value.to_json(), id=self.id, metadata=self.metadata.to_json() @@ -331,19 +355,29 @@ def to_json(self, is_minimal: Optional[bool] = True) -> Dict: } # only serialize non-null values @classmethod - def from_json(cls, relation_dict: Dict, is_minimal: Optional[bool] = True) -> "Relation": - metadata_dict = relation_dict.get('metadata', {}) + def from_json( + cls, + relation_dict: Dict, + is_minimal: Optional[bool] = True, + doc: Optional['Document'] = None, + ) -> "Relation": if is_minimal: + if not doc: + raise ValueError( + f"Creating a Relation from a minimal 
JSON requires Document `doc` " + f"otherwise, no way to know what the key {relation_dict['key']} " + f"or value {relation_dict['value']}" + ) return cls( - query=SpanGroup.from_json(span_group_dict=relation_dict['query']), - value=SpanGroup.from_json(span_group_dict=relation_dict['value']), + key=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['key'])), + value=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['value'])), id=relation_dict.get("id", None), - metadata=Metadata.from_json(metadata_dict) + metadata=Metadata.from_json(relation_dict.get('metadata', {})) ) else: return cls( - query=SpanGroup.from_json(span_group_dict=relation_dict['query']), + key=SpanGroup.from_json(span_group_dict=relation_dict['key']), value=SpanGroup.from_json(span_group_dict=relation_dict['value']), id=relation_dict.get("id", None), - metadata=Metadata.from_json(metadata_dict) + metadata=Metadata.from_json(relation_dict.get('metadata', {})) ) diff --git a/mmda/types/document.py b/mmda/types/document.py index cbd00655..b0d6f3da 100644 --- a/mmda/types/document.py +++ b/mmda/types/document.py @@ -9,7 +9,7 @@ from copy import deepcopy from typing import Dict, Iterable, List, Optional -from mmda.types.annotation import Annotation, BoxGroup, SpanGroup +from mmda.types.annotation import Annotation, BoxGroup, SpanGroup, AnnotationName from mmda.types.image import PILImage from mmda.types.indexers import Indexer, SpanGroupIndexer from mmda.types.names import Images, Symbols @@ -17,7 +17,6 @@ class Document: - SPECIAL_FIELDS = [Symbols, Images] UNALLOWED_FIELD_NAMES = ["fields"] @@ -32,35 +31,35 @@ def fields(self) -> List[str]: return self.__fields # TODO: extend implementation to support DocBoxGroup - def find_overlapping(self, query: Annotation, field_name: str) -> List[Annotation]: + def find_overlapping(self, query: Annotation, field: str) -> List[Annotation]: if not isinstance(query, SpanGroup): raise NotImplementedError( f"Currently only supports 
query of type SpanGroup" ) - return self.__indexers[field_name].find(query=query) + return self.__indexers[field].find(query=query) def annotate( - self, is_overwrite: bool = False, **kwargs: Iterable[Annotation] + self, is_overwrite: bool = False, **kwargs: Iterable[Annotation] ) -> None: """Annotate the fields for document symbols (correlating the annotations with the symbols) and store them into the papers. """ # 1) check validity of field names - for field_name in kwargs.keys(): + for field in kwargs.keys(): assert ( - field_name not in self.SPECIAL_FIELDS - ), f"The field_name {field_name} should not be in {self.SPECIAL_FIELDS}." + field not in self.SPECIAL_FIELDS + ), f"The field {field} should not be in {self.SPECIAL_FIELDS}." - if field_name in self.fields: + if field in self.fields: # already existing field, check if ok overriding if not is_overwrite: raise AssertionError( - f"This field name {field_name} already exists. To override, set `is_overwrite=True`" + f"This field name {field} already exists. To override, set `is_overwrite=True`" ) - elif field_name in dir(self): + elif field in dir(self): # not an existing field, but a reserved class method name raise AssertionError( - f"The field_name {field_name} should not conflict with existing class properties" + f"The field {field} should not conflict with existing class properties" ) # Kyle's preserved comment: @@ -68,39 +67,39 @@ def annotate( # overhead on large documents. 
# 2) register fields into Document - for field_name, annotations in kwargs.items(): + for field, annotations in kwargs.items(): if len(annotations) == 0: - warnings.warn(f"The annotations is empty for the field {field_name}") - setattr(self, field_name, []) - self.__fields.append(field_name) + warnings.warn(f"The annotations is empty for the field {field}") + setattr(self, field, []) + self.__fields.append(field) continue annotation_types = {type(a) for a in annotations} assert ( len(annotation_types) == 1 - ), f"Annotations in field_name {field_name} more than 1 type: {annotation_types}" + ), f"Annotations in field {field} more than 1 type: {annotation_types}" annotation_type = annotation_types.pop() if annotation_type == SpanGroup: span_groups = self._annotate_span_group( - span_groups=annotations, field_name=field_name + span_groups=annotations, field=field ) elif annotation_type == BoxGroup: # TODO: not good. BoxGroups should be stored on their own, not auto-generating SpanGroups. span_groups = self._annotate_box_group( - box_groups=annotations, field_name=field_name + box_groups=annotations, field=field ) else: raise NotImplementedError( - f"Unsupported annotation type {annotation_type} for {field_name}" + f"Unsupported annotation type {annotation_type} for {field}" ) # register fields - setattr(self, field_name, span_groups) - self.__fields.append(field_name) + setattr(self, field, span_groups) + self.__fields.append(field) def annotate_images( - self, images: Iterable[PILImage], is_overwrite: bool = False + self, images: Iterable[PILImage], is_overwrite: bool = False ) -> None: if not is_overwrite and len(self.images) > 0: raise AssertionError( @@ -122,7 +121,7 @@ def annotate_images( self.images = images def _annotate_span_group( - self, span_groups: List[SpanGroup], field_name: str + self, span_groups: List[SpanGroup], field: str ) -> List[SpanGroup]: """Annotate the Document using a bunch of span groups. 
It will associate the annotations with the document symbols. @@ -131,15 +130,15 @@ def _annotate_span_group( # 1) add Document to each SpanGroup for span_group in span_groups: - span_group.attach_doc(doc=self) + span_group._attach_doc(doc=self, field=field) # 2) Build fast overlap lookup index - self.__indexers[field_name] = SpanGroupIndexer(span_groups) + self.__indexers[field] = SpanGroupIndexer(span_groups) return span_groups def _annotate_box_group( - self, box_groups: List[BoxGroup], field_name: str + self, box_groups: List[BoxGroup], field: str ) -> List[SpanGroup]: """Annotate the Document using a bunch of box groups. It will associate the annotations with the document symbols. @@ -177,7 +176,7 @@ def _annotate_box_group( derived_span_groups.append( SpanGroup( spans=MergeSpans(list_of_spans=all_token_spans_with_box_group, index_distance=1) - .merge_neighbor_spans_by_symbol_distance(), box_group=box_group, + .merge_neighbor_spans_by_symbol_distance(), box_group=box_group, # id = box_id, ) # TODO Right now we cannot assign the box id, or otherwise running doc.blocks will @@ -195,7 +194,7 @@ def _annotate_box_group( span_group.id = box_id return self._annotate_span_group( - span_groups=derived_span_groups, field_name=field_name + span_groups=derived_span_groups, field=field ) # @@ -245,16 +244,23 @@ def from_json(cls, doc_dict: Dict) -> "Document": ) # 2) convert span group dicts to span gropus - field_name_to_span_groups = {} - for field_name, span_group_dicts in doc_dict.items(): - if field_name not in doc.SPECIAL_FIELDS: + field_to_span_groups = {} + for field, span_group_dicts in doc_dict.items(): + if field not in doc.SPECIAL_FIELDS: span_groups = [ SpanGroup.from_json(span_group_dict=span_group_dict) for span_group_dict in span_group_dicts ] - field_name_to_span_groups[field_name] = span_groups + field_to_span_groups[field] = span_groups # 3) load annotations for each field - doc.annotate(**field_name_to_span_groups) + doc.annotate(**field_to_span_groups) 
return doc + + def locate_annotation(self, name: AnnotationName) -> Annotation: + candidates = self.__getattribute__(name.field) + matched_annotations = [c for c in candidates if c.id == name.id] + assert len(matched_annotations) <= 1, \ + f"Multiple annotations in field {name.field} with same ID {name.id}" + return matched_annotations[0] diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py index ee98e03d..63b6a3fc 100644 --- a/tests/test_types/test_json_conversion.py +++ b/tests/test_types/test_json_conversion.py @@ -29,7 +29,7 @@ def test_span_group_conversion(): def test_relation_conversion(): r = Relation( - query=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), + key=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), value=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), id=999, metadata=Metadata(type='something') From f83eb00b4f5b5da30f8f99672703ee5babf3e847 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 13 Oct 2022 17:44:09 -0700 Subject: [PATCH 18/21] add relation test; remove ability to create relation from json --- mmda/types/annotation.py | 18 ++++++------- tests/test_types/test_json_conversion.py | 34 ++++++++++++++++++++---- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index ad2f3113..c505ced2 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -336,8 +336,8 @@ def to_json(self, is_minimal: Optional[bool] = True) -> Dict: """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat""" if is_minimal: relation_dict = dict( - key=self.key.name, - value=self.value.name, + key=str(self.key.name), + value=str(self.value.name), id=self.id, metadata=self.metadata.to_json() ) @@ -368,16 +368,14 @@ def from_json( f"otherwise, no way to know what the key {relation_dict['key']} " f"or value {relation_dict['value']}" ) + key_name = AnnotationName.from_str(s=relation_dict['key']) 
+ value_name = AnnotationName.from_str(s=relation_dict['value']) return cls( - key=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['key'])), - value=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['value'])), + key=doc.locate_annotation(name=key_name), + value=doc.locate_annotation(name=value_name), id=relation_dict.get("id", None), metadata=Metadata.from_json(relation_dict.get('metadata', {})) ) else: - return cls( - key=SpanGroup.from_json(span_group_dict=relation_dict['key']), - value=SpanGroup.from_json(span_group_dict=relation_dict['value']), - id=relation_dict.get("id", None), - metadata=Metadata.from_json(relation_dict.get('metadata', {})) - ) + raise NotImplementedError(f'Not currently supported. Awkward to build relations' + f'without an existing Document object that stores fields.') \ No newline at end of file diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py index 63b6a3fc..c14a36e7 100644 --- a/tests/test_types/test_json_conversion.py +++ b/tests/test_types/test_json_conversion.py @@ -11,7 +11,6 @@ from mmda.types import BoxGroup, SpanGroup, Document, Metadata, Relation from mmda.parsers import PDFPlumberParser - PDFFILEPATH = Path(__file__).parent / "../fixtures/1903.10676.pdf" @@ -29,12 +28,37 @@ def test_span_group_conversion(): def test_relation_conversion(): r = Relation( - key=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), - value=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')), + key=SpanGroup(spans=[], id=3, metadata=Metadata(foobar='test'), field='abc'), + value=SpanGroup(spans=[], id=5, metadata=Metadata(foobar='test'), field='xyz'), id=999, - metadata=Metadata(type='something') + metadata=Metadata(blabla='something') ) - r.to_json() + + # minimal to & from JSON (default behavior) + r_dict_minimal = { + 'key': 'abc-3', + 'value': 'xyz-5', + 'id': 999, + 'metadata': {'blabla': 'something'} + } + assert r.to_json() == 
r.to_json(is_minimal=True) == r_dict_minimal + + doc = Document.from_json(doc_dict={ + 'symbols': 'asdfasdf', + 'abc': [{'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}}], + 'xyz': [{'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}] + }) + assert r_dict_minimal == r.from_json(r_dict_minimal, is_minimal=True, doc=doc).to_json() == \ + r.from_json(r_dict_minimal, doc=doc).to_json() + + # full to JSON + r_dict_full = { + 'key': {'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}}, + 'value': {'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}, + 'id': 999, + 'metadata': {'blabla': 'something'} + } + assert r.to_json(is_minimal=False) == r_dict_full def test_doc_conversion(): From 84682ee20f8ae666058442ebeb535049603728e4 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 13 Oct 2022 18:08:43 -0700 Subject: [PATCH 19/21] remove unused ways to from JSON for relations --- mmda/types/annotation.py | 51 ++++++++---------------- tests/test_types/test_json_conversion.py | 15 ++----- 2 files changed, 19 insertions(+), 47 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index c505ced2..ee67f964 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -332,22 +332,14 @@ def __init__( self.value = value super().__init__(id=id, doc=doc, field=field, metadata=metadata) - def to_json(self, is_minimal: Optional[bool] = True) -> Dict: + def to_json(self) -> Dict: """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat""" - if is_minimal: - relation_dict = dict( - key=str(self.key.name), - value=str(self.value.name), - id=self.id, - metadata=self.metadata.to_json() - ) - else: - relation_dict = dict( - key=self.key.to_json(), - value=self.value.to_json(), - id=self.id, - metadata=self.metadata.to_json() - ) + relation_dict = dict( + key=str(self.key.name), + value=str(self.value.name), + id=self.id, + metadata=self.metadata.to_json() + ) return { key: value for key, value in relation_dict.items() @@ 
-358,24 +350,13 @@ def to_json(self, is_minimal: Optional[bool] = True) -> Dict: def from_json( cls, relation_dict: Dict, - is_minimal: Optional[bool] = True, - doc: Optional['Document'] = None, + doc: 'Document', ) -> "Relation": - if is_minimal: - if not doc: - raise ValueError( - f"Creating a Relation from a minimal JSON requires Document `doc` " - f"otherwise, no way to know what the key {relation_dict['key']} " - f"or value {relation_dict['value']}" - ) - key_name = AnnotationName.from_str(s=relation_dict['key']) - value_name = AnnotationName.from_str(s=relation_dict['value']) - return cls( - key=doc.locate_annotation(name=key_name), - value=doc.locate_annotation(name=value_name), - id=relation_dict.get("id", None), - metadata=Metadata.from_json(relation_dict.get('metadata', {})) - ) - else: - raise NotImplementedError(f'Not currently supported. Awkward to build relations' - f'without an existing Document object that stores fields.') \ No newline at end of file + key_name = AnnotationName.from_str(s=relation_dict['key']) + value_name = AnnotationName.from_str(s=relation_dict['value']) + return cls( + key=doc.locate_annotation(name=key_name), + value=doc.locate_annotation(name=value_name), + id=relation_dict.get("id", None), + metadata=Metadata.from_json(relation_dict.get('metadata', {})) + ) diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py index c14a36e7..6cd3e608 100644 --- a/tests/test_types/test_json_conversion.py +++ b/tests/test_types/test_json_conversion.py @@ -34,31 +34,22 @@ def test_relation_conversion(): metadata=Metadata(blabla='something') ) - # minimal to & from JSON (default behavior) + # to & from JSON r_dict_minimal = { 'key': 'abc-3', 'value': 'xyz-5', 'id': 999, 'metadata': {'blabla': 'something'} } - assert r.to_json() == r.to_json(is_minimal=True) == r_dict_minimal + assert r.to_json() == r_dict_minimal doc = Document.from_json(doc_dict={ 'symbols': 'asdfasdf', 'abc': [{'spans': [], 'id': 3, 
'metadata': {'foobar': 'test'}}], 'xyz': [{'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}] }) - assert r_dict_minimal == r.from_json(r_dict_minimal, is_minimal=True, doc=doc).to_json() == \ - r.from_json(r_dict_minimal, doc=doc).to_json() + assert r_dict_minimal == r.from_json(r_dict_minimal, doc=doc).to_json() - # full to JSON - r_dict_full = { - 'key': {'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}}, - 'value': {'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}, - 'id': 999, - 'metadata': {'blabla': 'something'} - } - assert r.to_json(is_minimal=False) == r_dict_full def test_doc_conversion(): From 818349bf9f90536b1f98d1df689e9c386a738341 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Thu, 20 Oct 2022 23:39:45 -0700 Subject: [PATCH 20/21] replace getattribute with getattr --- mmda/types/annotation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index ee67f964..7db9a591 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -95,7 +95,7 @@ def _get_siblings(self) -> List['Annotation']: Only works after a Document has been attached, which is how objects learn their `field`.""" if not self.doc: raise AttributeError("This annotation does not have an attached document") - return self.doc.__getattribute__(self.field) + return self.doc.__getattr__(self.field) def __getattr__(self, field: str) -> List["Annotation"]: """This method allows jumping from an object of one field to all overlapping @@ -108,7 +108,7 @@ def __getattr__(self, field: str) -> List["Annotation"]: return self.doc.find_overlapping(self, field) # TODO[kylel] - when does this ever get called? infinite loop? 
- return self.__getattribute__(field) + return self.__getattr__(field) class BoxGroup(Annotation): From 5820934a72622e5101043f55c5c454ee77dacb16 Mon Sep 17 00:00:00 2001 From: kyleclo Date: Fri, 21 Oct 2022 11:06:46 -0700 Subject: [PATCH 21/21] return empty list if no getattr match --- mmda/types/annotation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index 7db9a591..5d925d83 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -106,9 +106,8 @@ def __getattr__(self, field: str) -> List["Annotation"]: if field in self.doc.fields: return self.doc.find_overlapping(self, field) - - # TODO[kylel] - when does this ever get called? infinite loop? - return self.__getattr__(field) + else: + return [] class BoxGroup(Annotation):