From d9a64ace3a2d1daa5ea4636b67dd7983943025da Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 21:53:25 -0700
Subject: [PATCH 01/21] simplify annotation

---
 mmda/types/annotation.py | 103 ++++++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index d8280cc3..a0325079 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -21,12 +21,9 @@
     from mmda.types.document import Document
 
 
-__all__ = ["Annotation", "BoxGroup", "SpanGroup"]
+__all__ = ["Annotation", "BoxGroup", "SpanGroup", "Relation"]
 
 
-def default_factory():
-    return str(uuid4())
-
 
 def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
     """Warns when a deepcopy is performed on an Annotation."""
@@ -39,49 +36,42 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
     warnings.warn(msg, UserWarning, stacklevel=2)
 
 
-@dataclass
+
 class Annotation:
     """Annotation is intended for storing model predictions for a document."""
 
-    # TODO[kylel] - remove UUID from this class, as you explained to me (luca)
-    # it is about 10% of the wall time in processing a document
-    uuid: str = field(default_factory=default_factory)
-    doc: Optional["Document"] = field(default=None, init=False)
-    metadata: Metadata = field(default_factory=Metadata)
+    def __init__(
+            self,
+            id: Optional[int] = None,
+            doc: Optional['Document'] = None,
+            metadata: Optional[Metadata] = None
+    ):
+        self.id = id
+        self.doc = doc
+        self.metadata = metadata if metadata else Metadata()
 
     @abstractmethod
     def to_json(self) -> Dict:
         pass
 
-    # TODO[shannon] make this as an abstract method after implementing
-    # get_symbols for BoxGroup
-    def get_symbols(self) -> str:  # type: ignore
-        pass
-
     @classmethod
     @abstractmethod
     def from_json(cls, annotation_dict: Dict) -> "Annotation":
         pass
 
-    @property
-    def key_prefix(self) -> str:
-        return f"{self.__class__.__name__}|{self.uuid}|"
-
     def attach_doc(self, doc: "Document") -> None:
         if not self.doc:
             self.doc = doc
         else:
-            raise AttributeError(
-                "This annotation already has an attached document"
-            )
+            raise AttributeError("This annotation already has an attached document")
 
     # TODO[kylel] - comment explaining
     def __getattr__(self, field: str) -> List["Annotation"]:
         if self.doc is None:
             raise ValueError("This annotation is not attached to a document")
 
-        if self.key_prefix + field in self.doc.fields:
-            return self.doc.find_overlapping(self, self.key_prefix + field)
+        if field in self.doc.fields:
+            return self.doc.find_overlapping(self, field)
 
         if field in self.doc.fields:
             return self.doc.find_overlapping(self, field)
@@ -95,18 +85,24 @@ def __getattr__(self, field: str) -> List["Annotation"]:
 # useful because it keeps backward compatibility with the old API, while
 # migrating id and type to metadata.
 @store_field_in_metadata("type")
-@store_field_in_metadata("id")
-@dataclass
 class BoxGroup(Annotation):
-    boxes: List[Box] = field(default_factory=list)
-    id: Optional[int] = None
-    type: Optional[str] = None
+    def __init__(
+            self,
+            boxes: List[Box],
+            type: Optional[str] = None,
+            id: Optional[int] = None,
+            doc: Optional['Document'] = None,
+            metadata: Optional[Metadata] = None,
+    ):
+        self.boxes = boxes
+        self.type = type
+        super().__init__(id=id, doc=doc, metadata=metadata)
 
     def to_json(self) -> Dict:
         box_group_dict = dict(
             boxes=[box.to_json() for box in self.boxes],
-            metadata=self.metadata.to_json(),
-            uuid=self.uuid,
+            id=self.id,
+            metadata=self.metadata.to_json()
         )
         return {
             key: value for key, value in box_group_dict.items() if value
@@ -122,7 +118,6 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup":
             # groups that were create before the metadata migration and
             # therefore have "id", "type" in the root of the json dict instead.
             metadata_dict = {
-                "id": box_group_dict.get("id", None),
                 "type": box_group_dict.get("type", None),
                 "text": box_group_dict.get("text", None)
             }
@@ -134,8 +129,8 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup":
                 # minimally serialize when running to_json()
                 for box_dict in box_group_dict.get("boxes", [])
             ],
+            id=box_group_dict.get("id", None),
             metadata=Metadata.from_json(metadata_dict),
-            uuid=box_group_dict.get("uuid", str(uuid4())),
         )
 
     def __getitem__(self, key: int):
@@ -146,8 +141,8 @@ def __deepcopy__(self, memo):
 
         box_group = BoxGroup(
             boxes=deepcopy(self.boxes, memo),
-            metadata=deepcopy(self.metadata, memo),
-            uuid=self.uuid,
+            id=self.id,
+            metadata=deepcopy(self.metadata, memo)
         )
 
         # Don't copy an attached document
@@ -177,18 +172,22 @@ def _text_span_group_getter(span_group: "SpanGroup") -> str:
 # and use a custom getter to obtain the text from symbols if the text
 # is not explicitly set.
 @store_field_in_metadata("type")
-@store_field_in_metadata("id")
 @store_field_in_metadata("text", getter_fn=_text_span_group_getter)
-@dataclass
 class SpanGroup(Annotation):
-    spans: List[Span] = field(default_factory=list)
-
-    # TODO[kylel] - implement default behavior for box_group
-    box_group: Optional[BoxGroup] = None
 
-    id: Optional[int] = None
-    type: Optional[str] = None
-    text: Optional[str] = None
+    def __init__(
+            self,
+            spans: List[Span],
+            type: Optional[str] = None,
+            text: Optional[str] = None,
+            id: Optional[int] = None,
+            doc: Optional['Document'] = None,
+            metadata: Optional[Metadata] = None,
+    ):
+        self.spans = spans
+        self.type = type
+        self.text = text
+        super().__init__(id=id, doc=doc, metadata=metadata)
 
     @property
     def symbols(self) -> List[str]:
@@ -212,9 +211,9 @@ def annotate(
     def to_json(self) -> Dict:
         span_group_dict = dict(
             spans=[span.to_json() for span in self.spans],
+            id=self.id,
             metadata=self.metadata.to_json(),
-            box_group=self.box_group.to_json() if self.box_group else None,
-            uuid=self.uuid,
+            box_group=self.box_group.to_json() if self.box_group else None
         )
         return {
             key: value
@@ -237,7 +236,6 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup":
             # groups that were create before the metadata migration and
             # therefore have "id", "type" in the root of the json dict instead.
             metadata_dict = {
-                "id": span_group_dict.get("id", None),
                 "type": span_group_dict.get("type", None),
                 "text": span_group_dict.get("text", None)
             }
@@ -247,9 +245,9 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup":
                 Span.from_json(span_dict=span_dict)
                 for span_dict in span_group_dict["spans"]
             ],
+            id=span_group_dict.get("id", None),
             metadata=Metadata.from_json(metadata_dict),
             box_group=box_group,
-            uuid=span_group_dict.get("uuid", str(uuid4())),
         )
 
     def __getitem__(self, key: int):
@@ -282,12 +280,17 @@ def __deepcopy__(self, memo):
 
         span_group = SpanGroup(
             spans=deepcopy(self.spans, memo),
+            id=self.id,
             metadata=deepcopy(self.metadata, memo),
-            box_group=deepcopy(self.box_group, memo),
-            uuid=self.uuid,
+            box_group=deepcopy(self.box_group, memo)
         )
 
         # Don't copy an attached document
         span_group.doc = self.doc
 
         return span_group
+
+
+
+class Relation(Annotation):
+    pass
\ No newline at end of file

From b17bbb736250574fa9135f5652967306d9616d92 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 21:55:30 -0700
Subject: [PATCH 02/21] remove unused imports

---
 mmda/types/annotation.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index a0325079..54e4d8bc 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -9,9 +9,7 @@
 import warnings
 from abc import abstractmethod
 from copy import deepcopy
-from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
-from uuid import uuid4
 
 from mmda.types.box import Box
 from mmda.types.metadata import Metadata, store_field_in_metadata

From 3132290f77de17cae3a3f740b568651321622050 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:31:12 -0700
Subject: [PATCH 03/21] remove metadata anno in favor of getter/setters

---
 mmda/types/annotation.py | 72 +++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index 54e4d8bc..c288c157 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -12,7 +12,7 @@
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
 
 from mmda.types.box import Box
-from mmda.types.metadata import Metadata, store_field_in_metadata
+from mmda.types.metadata import Metadata
 from mmda.types.span import Span
 
 if TYPE_CHECKING:
@@ -77,17 +77,11 @@ def __getattr__(self, field: str) -> List["Annotation"]:
         return self.__getattribute__(field)
 
 
-# NOTE[LucaS]: by using the store_field_in_metadata decorator, we are
-# able to store id and type in the metadata of BoxGroup, while keeping it
-# accessible via SpanGroup.id and SpanGroup.type respectively. This is
-# useful because it keeps backward compatibility with the old API, while
-# migrating id and type to metadata.
-@store_field_in_metadata("type")
+
 class BoxGroup(Annotation):
     def __init__(
             self,
             boxes: List[Box],
-            type: Optional[str] = None,
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
             metadata: Optional[Metadata] = None,
@@ -114,10 +108,9 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup":
         else:
             # this fallback is necessary to ensure compatibility with box
             # groups that were create before the metadata migration and
-            # therefore have "id", "type" in the root of the json dict instead.
+            # therefore have "type" in the root of the json dict instead.
             metadata_dict = {
-                "type": box_group_dict.get("type", None),
-                "text": box_group_dict.get("text", None)
+                "type": box_group_dict.get("type", None)
             }
 
         return cls(
@@ -148,43 +141,27 @@ def __deepcopy__(self, memo):
 
         return box_group
 
+    @property
+    def type(self) -> str:
+        return self.metadata.get("type", None)
+
+    @type.setter
+    def type(self, type: Union[str, None]) -> None:
+        self.metadata.type = type
+
 
-def _text_span_group_getter(span_group: "SpanGroup") -> str:
-    """Getter used to obtain a textual representation of a SpanGroup.
-
-    When SpanGroup.text is not set, this function uses the SpanGroup's
-    symbols to generate approximate a text. However, if text is set,
-    this function returns it instead.
-    """
-    maybe_text = span_group.metadata.get("text", None)
-    return maybe_text if maybe_text else " ".join(span_group.symbols)
-
-
-# NOTE[@soldni]: by using the store_field_in_metadata decorator, we are
-# able to store id and type in the metadata of BoxGroup, while keeping it
-# accessible via SpanGroup.id and SpanGroup.type respectively. This is
-# useful because it keeps backward compatibility with the old API, while
-# migrating id and type to metadata.
-#
-# Futhermore, we also store the text of the SpanGroup in the metadata,
-# and use a custom getter to obtain the text from symbols if the text
-# is not explicitly set.
-@store_field_in_metadata("type")
-@store_field_in_metadata("text", getter_fn=_text_span_group_getter)
 class SpanGroup(Annotation):
 
     def __init__(
             self,
             spans: List[Span],
-            type: Optional[str] = None,
-            text: Optional[str] = None,
+            box_group: Optional[BoxGroup] = None,
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
             metadata: Optional[Metadata] = None,
     ):
         self.spans = spans
-        self.type = type
-        self.text = text
+        self.box_group = box_group
         super().__init__(id=id, doc=doc, metadata=metadata)
 
     @property
@@ -202,7 +179,7 @@ def annotate(
         if self.doc is None:
             raise ValueError("SpanGroup has no attached document!")
 
-        key_remaps = {self.key_prefix + k: v for k, v in kwargs.items()}
+        key_remaps = {k: v for k, v in kwargs.items()}
 
         self.doc.annotate(is_overwrite=is_overwrite, **key_remaps)
 
@@ -288,6 +265,25 @@ def __deepcopy__(self, memo):
 
         return span_group
 
+    @property
+    def type(self) -> str:
+        return self.metadata.get("type", None)
+
+    @type.setter
+    def type(self, type: Union[str, None]) -> None:
+        self.metadata.type = type
+
+    @property
+    def text(self) -> str:
+        maybe_text = self.metadata.get("text", None)
+        if maybe_text is None:
+            return " ".join(self.symbols)
+        return maybe_text
+
+    @text.setter
+    def text(self, text: Union[str, None]) -> None:
+        self.metadata.text = text
+
 
 
 class Relation(Annotation):

From 055dfea02494da8315f9bcaf5d92e679abf6f674 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:31:30 -0700
Subject: [PATCH 04/21] remove spangroup nesting

---
 tests/test_types/test_span_group.py | 87 -----------------------------
 1 file changed, 87 deletions(-)

diff --git a/tests/test_types/test_span_group.py b/tests/test_types/test_span_group.py
index 9c63db5d..6068b5ee 100644
--- a/tests/test_types/test_span_group.py
+++ b/tests/test_types/test_span_group.py
@@ -23,92 +23,5 @@ def test_annotation_attaches_document(self):
         span_group = self.doc.tokens[0]
         self.assertEqual(["This", "is"], span_group.symbols)
 
-    def test_annotation_allows_nesting(self):
-        span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)])
-        nested_span_group = SpanGroup(id=2, spans=[Span(0, 4)], text="This")
-
-        self.doc.annotate(tokens=[span_group])
-
-        span_group = self.doc.tokens[0]
-        span_group.annotate(capitalized=[nested_span_group])
-
-        nested_span_group = span_group.capitalized[0]
-        self.assertEqual("This", nested_span_group.text)
-        self.assertEqual(["This"], nested_span_group.symbols)
-
-    def test_serialization_with_nesting(self):
-        span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)])
-        nested_span_group = SpanGroup(id=2, spans=[Span(0, 4)], text="This")
-
-        self.doc.annotate(tokens=[span_group])
-
-        span_group = self.doc.tokens[0]
-        span_group.annotate(capitalized=[nested_span_group])
-
-        json_repr = self.doc.to_json()
-        new_doc = Document.from_json(json_repr)
-
-        span_group = new_doc.tokens[0]
-        self.assertEqual(["This", "is"], span_group.symbols)
-
-        nested_span_group = span_group.capitalized[0]
-        self.assertEqual("This", nested_span_group.text)
-        self.assertEqual(["This"], nested_span_group.symbols)
-
-    def test_deep_nesting_without_id_conflicts(self):
-        span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)])
-        nested_span_group = SpanGroup(
-            id=2, spans=[Span(0, 4), Span(5, 7)], text="This is"
-        )
-        deep_span_group = SpanGroup(id=3, spans=[Span(0, 4)], text="This")
-
-        self.doc.annotate(tokens=[span_group])
-
-        span_group = self.doc.tokens[0]
-        span_group.annotate(capitalized=[nested_span_group])
-
-        nested_span_group = span_group.capitalized[0]
-        nested_span_group.annotate(deep=[deep_span_group])
-
-        json_repr = self.doc.to_json()
-        new_doc = Document.from_json(json_repr)
-
-        span_group = new_doc.tokens[0]
-        self.assertEqual(["This", "is"], span_group.symbols)
-
-        nested_span_group = span_group.capitalized[0]
-        self.assertEqual("This is", nested_span_group.text)
-        self.assertEqual(["This", "is"], nested_span_group.symbols)
-
-        deep_span_group = nested_span_group.deep[0]
-        self.assertEqual("This", deep_span_group.text)
-        self.assertEqual(["This"], deep_span_group.symbols)
-
-    def test_deep_nesting_with_id_conflicts(self):
-        span_group = SpanGroup(id=1, spans=[Span(0, 4), Span(5, 7)])
-        nested_span_group = SpanGroup(
-            id=1, spans=[Span(0, 4), Span(5, 7)], text="This is"
-        )
-        deep_span_group = SpanGroup(id=1, spans=[Span(0, 4)], text="This")
-
-        self.doc.annotate(tokens=[span_group])
-
-        span_group = self.doc.tokens[0]
-        span_group.annotate(capitalized=[nested_span_group])
-
-        nested_span_group = span_group.capitalized[0]
-        nested_span_group.annotate(deep=[deep_span_group])
-
-        json_repr = self.doc.to_json()
-        new_doc = Document.from_json(json_repr)
-
-        span_group = new_doc.tokens[0]
-        self.assertEqual(["This", "is"], span_group.symbols)
 
-        nested_span_group = span_group.capitalized[0]
-        self.assertEqual("This is", nested_span_group.text)
-        self.assertEqual(["This", "is"], nested_span_group.symbols)
 
-        deep_span_group = nested_span_group.deep[0]
-        self.assertEqual("This", deep_span_group.text)
-        self.assertEqual(["This"], deep_span_group.symbols)

From 813e13475cd87997b63b474a4d5400edd1ab266e Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:38:57 -0700
Subject: [PATCH 05/21] fix grobid parser tests

---
 mmda/parsers/grobid_parser.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mmda/parsers/grobid_parser.py b/mmda/parsers/grobid_parser.py
index b2a5fe43..a358c34e 100644
--- a/mmda/parsers/grobid_parser.py
+++ b/mmda/parsers/grobid_parser.py
@@ -109,7 +109,9 @@ def _get_title(self, root: et.Element) -> SpanGroup:
         tokens = text.split()
         spans = _get_token_spans(text, tokens)
 
-        return SpanGroup(spans=spans, text=text)
+        sg = SpanGroup(spans=spans)
+        sg.text = text
+        return sg
 
     def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup:
         matches = root.findall(".//tei:profileDesc//tei:abstract//", NS)
@@ -122,4 +124,6 @@ def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup:
         tokens = text.split()
         spans = _get_token_spans(text, tokens, offset=offset)
 
-        return SpanGroup(spans=spans, text=text)
+        sg = SpanGroup(spans=spans)
+        sg.text = text
+        return sg

From affc3e84711c4a7a4a932cbc4ccaaddf81f96c31 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:43:23 -0700
Subject: [PATCH 06/21] fix tests for dict word predictor

---
 .../dictionary_word_predictor.py              | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
index 6d8bd4f6..8ee3d617 100644
--- a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
+++ b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
@@ -145,23 +145,16 @@ def predict(self, document: Document) -> List[SpanGroup]:
                 or combined_no_hyphen.lower() in self.dictionary
                 or combined_no_hyphen.lower() in local_dictionary
             ):
-                combined_text = curr_row_last_token_text[:-1] + self._token_text(
-                    next_row_first_token
-                )
-                span_group = SpanGroup(
-                    spans=curr_row_last_token.spans + next_row_first_token.spans,
-                    text=combined_text,
-                )
+                combined_text = curr_row_last_token_text[:-1] + \
+                                self._token_text(next_row_first_token)
             else:
                 # Use the combined, hyphenated word instead (e.g., few-shot)
-                combined_text = curr_row_last_token_text + self._token_text(
-                    next_row_first_token
-                )
-                span_group = SpanGroup(
-                    spans=curr_row_last_token.spans + next_row_first_token.spans,
-                    text=combined_text,
-                )
-
+                combined_text = curr_row_last_token_text + \
+                                self._token_text(next_row_first_token)
+            span_group = SpanGroup(
+                spans=curr_row_last_token.spans + next_row_first_token.spans
+            )
+            span_group.text = combined_text
             words.append(span_group)
 
         # add IDs to each word
@@ -174,7 +167,9 @@ def _token_text(self, token: SpanGroup) -> str:
         return "".join(token.symbols)
 
     def _copy_token_with_text(self, token: SpanGroup) -> SpanGroup:
-        return SpanGroup(spans=token.spans, text=self._token_text(token))
+        sg = SpanGroup(spans=token.spans)
+        sg.text = self._token_text(token)
+        return sg
 
     def _row_pairs(self, document):
         for i in range(0, len(document.rows) - 1):

From 87de5c900af862ee0b0c7d2c1cfae912e1efdea4 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:57:22 -0700
Subject: [PATCH 07/21] fix bug in grobidparser

---
 mmda/parsers/grobid_parser.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mmda/parsers/grobid_parser.py b/mmda/parsers/grobid_parser.py
index a358c34e..8dad2734 100644
--- a/mmda/parsers/grobid_parser.py
+++ b/mmda/parsers/grobid_parser.py
@@ -23,7 +23,8 @@
 
 
 def _null_span_group() -> SpanGroup:
-    return SpanGroup(spans=[], text="")
+    sg = SpanGroup(spans=[])
+    return sg
 
 
 def _get_token_spans(text: str, tokens: List[str], offset: int = 0) -> List[int]:

From 92b79d432342e845c48b8ec168d8738ca05de2a5 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:57:39 -0700
Subject: [PATCH 08/21] fix bug; forgot to remove type from init in boxgroup

---
 mmda/types/annotation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index c288c157..7949c7e2 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -87,7 +87,6 @@ def __init__(
             metadata: Optional[Metadata] = None,
     ):
         self.boxes = boxes
-        self.type = type
         super().__init__(id=id, doc=doc, metadata=metadata)
 
     def to_json(self) -> Dict:

From 70bc6bf094714701f1f01109dbb3c2e5d9d59378 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 22:58:08 -0700
Subject: [PATCH 09/21] make tests in json conversion more lenient; doesn't need
 to be exactly the same object, just the values need to be the same after
 to/from json

---
 tests/test_types/test_json_conversion.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py
index 416358b4..e7a5f27d 100644
--- a/tests/test_types/test_json_conversion.py
+++ b/tests/test_types/test_json_conversion.py
@@ -16,13 +16,15 @@
 
 
 def test_span_group_conversion():
-    sg = SpanGroup(id=3, metadata=Metadata.from_json({"text": "test"}))
+    sg = SpanGroup(spans=[], id=3, metadata=Metadata.from_json({"text": "test"}))
     sg2 = SpanGroup.from_json(sg.to_json())
-    assert sg2 == sg
+    assert sg2.to_json() == sg.to_json()
+    assert sg2.__dict__ == sg.__dict__
 
-    bg = BoxGroup(metadata=Metadata.from_json({"text": "test", "id": 1}))
+    bg = BoxGroup(boxes=[], metadata=Metadata.from_json({"text": "test", "id": 1}))
     bg2 = BoxGroup.from_json(bg.to_json())
-    assert bg2 == bg
+    assert bg2.to_json() == bg.to_json()
+    assert bg2.__dict__ == bg.__dict__
 
 
 def test_doc_conversion():
@@ -54,7 +56,6 @@ def test_doc_conversion():
 
         for orig_sg, new_sg in field_it:
             # for each pair, they should have same metadata (type, id,
-            # and optionally, text), same spans, and same uuid.
+            # and optionally, text) and same spans.
             assert orig_sg.metadata == new_sg.metadata
-            assert orig_sg.uuid == new_sg.uuid
             assert orig_sg.spans == new_sg.spans

From 34d6fc619143a9340a4987c42a8e5d3b1de85341 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Wed, 5 Oct 2022 23:23:01 -0700
Subject: [PATCH 10/21] oops forgot to commit

---
 tests/test_internal_ai2/test_api.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/test_internal_ai2/test_api.py b/tests/test_internal_ai2/test_api.py
index b3873573..75c72a77 100644
--- a/tests/test_internal_ai2/test_api.py
+++ b/tests/test_internal_ai2/test_api.py
@@ -65,8 +65,5 @@ def test_equivalence(self):
         })
         sg_ann_2 = ClassificationSpanGroup.from_mmda(sg_ann).to_mmda()
 
-        # we need to manually set the uuids to be equal
-        # because by default they are randomly generated
-        sg_ann.uuid = sg_ann_2.uuid = 'manually-fix-to-avoid-randomness'
-
-        self.assertEqual(sg_ann, sg_ann_2)
+        self.assertDictEqual(sg_ann.to_json(), sg_ann_2.to_json())
+        self.assertDictEqual(sg_ann.__dict__, sg_ann_2.__dict__)

From 32a2e119e62f1dfee9166d01c548a1ff02a363f8 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 6 Oct 2022 11:39:51 -0700
Subject: [PATCH 11/21] change internal ai2 test api

---
 tests/test_internal_ai2/test_api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_internal_ai2/test_api.py b/tests/test_internal_ai2/test_api.py
index 75c72a77..6cd2f4f8 100644
--- a/tests/test_internal_ai2/test_api.py
+++ b/tests/test_internal_ai2/test_api.py
@@ -19,7 +19,8 @@ class TestApi(unittest.TestCase):
     def test_vanilla_span_group(self) -> None:
         sg_ann = mmda_ann.SpanGroup.from_json({
             'spans': [{'start': 0, 'end': 1}],
-            'metadata': {'text': 'hello', 'id': 1}
+            'id': 1,
+            'metadata': {'text': 'hello', 'id': 999}    # note id not used; it's just in metadata
         })
 
         sg_api = mmda_api.SpanGroup.from_mmda(sg_ann)

From d763d097f296ce588d8619a9c1c94f9ba2aab8b1 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 6 Oct 2022 11:46:41 -0700
Subject: [PATCH 12/21] bugfix; vila model SpanGroup creation had type

---
 .../hf_predictors/token_classification_predictor.py          | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mmda/predictors/hf_predictors/token_classification_predictor.py b/mmda/predictors/hf_predictors/token_classification_predictor.py
index 801dfbad..1570d815 100644
--- a/mmda/predictors/hf_predictors/token_classification_predictor.py
+++ b/mmda/predictors/hf_predictors/token_classification_predictor.py
@@ -106,8 +106,9 @@ def postprocess(self, document: Document, model_predictions) -> List[SpanGroup]:
 
             start = min([ele.start for ele in cur_spans])
             end = max([ele.end for ele in cur_spans])
-            prediction_spans.append(SpanGroup(spans=[Span(start, end)], type=label))
-
+            sg = SpanGroup(spans=[Span(start, end)])
+            sg.type = label
+            prediction_spans.append(sg)
         return prediction_spans
 
 

From 98da4ead49c04db0c989e64a61cf28efbba64eea Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 6 Oct 2022 13:26:07 -0700
Subject: [PATCH 13/21] modify metadata constructor to take args; adjust all
 tests/models that instantiate spangroups

---
 README.md                                     | 27 ++++++++++++++++++-
 mmda/parsers/grobid_parser.py                 |  8 +++---
 .../dictionary_word_predictor.py              |  4 +--
 .../token_classification_predictor.py         |  4 +--
 .../hf_predictors/vila_predictor.py           |  4 ++-
 mmda/predictors/lp_predictors.py              |  6 ++---
 mmda/types/metadata.py                        |  4 +++
 tests/test_predictors/test_vila_predictors.py | 13 ++++++---
 8 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index cb8dc804..3567ccc3 100644
--- a/README.md
+++ b/README.md
@@ -111,8 +111,31 @@ A key aspect of using this library is understanding how these different fields a
 
 
 
+#### 4. What's in a `SpanGroup`?
 
-#### 4. Adding a new SpanGroup field
+Each `SpanGroup` object stores information about its contents and position:
+
+* `.spans: List[Span]`, A `Span` is a pointer into `Document.symbols` (that is, `Span(start=0, end=5)` corresponds to `symbols[0:5]`) and a single `Box` representing its position & rectangular region on the page.
+
+* `.box_group: BoxGroup`, A `BoxGroup` object stores `.boxes: List[Box]`.  
+
+* `.metadata: Metadata`, A free-form container for any additional attributes of the group, such as its `type` or `text`.
+
+    * **Span-Box Coupling:** Every `Span` is associated with a single `Box`, and not a `BoxGroup`. In this library, we restrict all of our `Span` to be units that can be represented by a single rectangular box. This is instead of allowing *any* (start, end) which would result in spans that can't necessarily be cleanly represented by a single box.
+    * 
+
+**FAQS**
+
+Q. Why do we need `BoxGroup` if we already have `Box` in each `Span`?
+
+A: Let's consider a `SpanGroup` object representing a single sentence in a paper. We know a single `Box` can't properly cover a sentence, because sentences can wrap rows & even cross columns/page:
+
+* One way to represent the visual area of that sentence is to take the Union of all `Box` in every involved `Span` -- This leaves us with many rectangles. 
+* But another way is to synthesize all those `Box` into one giant `Box` (which might even overlap other text outside of this sentence).
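+* Finally, a third way is to synthesize all the `Box` of tokens on the same row into one `Box`, but keep `Box` on different rows separate. None of these ways is the single correct choice, so a `SpanGroup` stores its visual region explicitly in a `BoxGroup` instead of always deriving it from the `Box` of each `Span`.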
+    
+
+#### 5. Adding a new SpanGroup field
 
 Not all Documents will have all segmentations available at creation time. You may need to load new fields to an existing `Document`. This is where `Predictor` comes in:
 
@@ -127,6 +150,8 @@ output = predictor.predict(document=doc)
  
 
 
+
+
 ## Parsers
 
 * [PDFPlumber](https://github.com/jsvine/pdfplumber) - MIT License    
diff --git a/mmda/parsers/grobid_parser.py b/mmda/parsers/grobid_parser.py
index 8dad2734..8b028c34 100644
--- a/mmda/parsers/grobid_parser.py
+++ b/mmda/parsers/grobid_parser.py
@@ -15,7 +15,7 @@
 from mmda.parsers.parser import Parser
 from mmda.types.annotation import SpanGroup
 from mmda.types.document import Document
-from mmda.types.names import Symbols
+from mmda.types.metadata import Metadata
 from mmda.types.span import Span
 
 DEFAULT_API = "http://localhost:8070/api/processHeaderDocument"
@@ -110,8 +110,7 @@ def _get_title(self, root: et.Element) -> SpanGroup:
         tokens = text.split()
         spans = _get_token_spans(text, tokens)
 
-        sg = SpanGroup(spans=spans)
-        sg.text = text
+        sg = SpanGroup(spans=spans, metadata=Metadata(text=text))
         return sg
 
     def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup:
@@ -125,6 +124,5 @@ def _get_abstract(self, root: et.Element, offset: int) -> SpanGroup:
         tokens = text.split()
         spans = _get_token_spans(text, tokens, offset=offset)
 
-        sg = SpanGroup(spans=spans)
-        sg.text = text
+        sg = SpanGroup(spans=spans, metadata=Metadata(text=text))
         return sg
diff --git a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
index 8ee3d617..f8d565ad 100644
--- a/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
+++ b/mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
@@ -8,6 +8,7 @@
 from typing import Optional, Set, List
 
 from mmda.predictors.base_predictors.base_predictor import BasePredictor
+from mmda.types.metadata import Metadata
 from mmda.types.annotation import Annotation, Span, SpanGroup
 from mmda.types.document import Document
 from mmda.types.names import Rows, Tokens
@@ -167,8 +168,7 @@ def _token_text(self, token: SpanGroup) -> str:
         return "".join(token.symbols)
 
     def _copy_token_with_text(self, token: SpanGroup) -> SpanGroup:
-        sg = SpanGroup(spans=token.spans)
-        sg.text = self._token_text(token)
+        sg = SpanGroup(spans=token.spans, metadata=Metadata(text=self._token_text(token)))
         return sg
 
     def _row_pairs(self, document):
diff --git a/mmda/predictors/hf_predictors/token_classification_predictor.py b/mmda/predictors/hf_predictors/token_classification_predictor.py
index 1570d815..4e37ca91 100644
--- a/mmda/predictors/hf_predictors/token_classification_predictor.py
+++ b/mmda/predictors/hf_predictors/token_classification_predictor.py
@@ -12,6 +12,7 @@
 from mmda.types.names import *
 from mmda.types.annotation import Annotation, Span, SpanGroup
 from mmda.types.document import Document
+from mmda.types.metadata import Metadata
 from mmda.predictors.hf_predictors.utils import (
     convert_document_page_to_pdf_dict,
     convert_sequence_tagging_to_spans,
@@ -106,8 +107,7 @@ def postprocess(self, document: Document, model_predictions) -> List[SpanGroup]:
 
             start = min([ele.start for ele in cur_spans])
             end = max([ele.end for ele in cur_spans])
-            sg = SpanGroup(spans=[Span(start, end)])
-            sg.type = label
+            sg = SpanGroup(spans=[Span(start, end)], metadata=Metadata(type=label))
             prediction_spans.append(sg)
         return prediction_spans
 
diff --git a/mmda/predictors/hf_predictors/vila_predictor.py b/mmda/predictors/hf_predictors/vila_predictor.py
index 5cc1a411..a8b78fa1 100644
--- a/mmda/predictors/hf_predictors/vila_predictor.py
+++ b/mmda/predictors/hf_predictors/vila_predictor.py
@@ -19,6 +19,7 @@
 
 from mmda.types.names import *
 from mmda.types.annotation import Annotation, Span, SpanGroup
+from mmda.types.metadata import Metadata
 from mmda.types.document import Document
 from mmda.predictors.hf_predictors.utils import (
     convert_document_page_to_pdf_dict,
@@ -167,7 +168,8 @@ def postprocess(
 
             start = min([ele.start for ele in cur_spans])
             end = max([ele.end for ele in cur_spans])
-            prediction_spans.append(SpanGroup(spans=[Span(start, end)], type=label))
+            sg = SpanGroup(spans=[Span(start, end)], metadata=Metadata(type=label))
+            prediction_spans.append(sg)
 
         return prediction_spans
 
diff --git a/mmda/predictors/lp_predictors.py b/mmda/predictors/lp_predictors.py
index d41ab62a..878fb7be 100644
--- a/mmda/predictors/lp_predictors.py
+++ b/mmda/predictors/lp_predictors.py
@@ -3,10 +3,8 @@
 from tqdm import tqdm
 import layoutparser as lp
 
+from mmda.types import Document, Box, BoxGroup, Metadata
 from mmda.types.names import *
-from mmda.types.document import Document
-from mmda.types.box import Box
-from mmda.types.annotation import BoxGroup, Annotation
 from mmda.predictors.base_predictors.base_predictor import BasePredictor
 
 
@@ -83,7 +81,7 @@ def postprocess(self,
                         page_height=page_height,
                     )
                 ],
-                type=block.type,
+                metadata=Metadata(type=block.type)
             )
             for block in model_outputs
         ]
diff --git a/mmda/types/metadata.py b/mmda/types/metadata.py
index c33c6aae..26e77103 100644
--- a/mmda/types/metadata.py
+++ b/mmda/types/metadata.py
@@ -29,6 +29,10 @@ class Metadata:
     """An object that contains metadata for an annotation.
     It supports dot access and dict-like access."""
 
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            self.set(k, v)
+
     @overload
     def get(self, key: str) -> Any:
         """Get value with name `key` in metadata;
diff --git a/tests/test_predictors/test_vila_predictors.py b/tests/test_predictors/test_vila_predictors.py
index 2698ac2d..eeffd98e 100644
--- a/tests/test_predictors/test_vila_predictors.py
+++ b/tests/test_predictors/test_vila_predictors.py
@@ -1,4 +1,6 @@
-import json 
+import json
+import os
+import pathlib
 
 from PIL import Image
 
@@ -13,6 +15,9 @@
 )
 
 
+os.chdir(pathlib.Path(__file__).parent)
+
+
 DOCBANK_LABEL_MAP = {
     "0": "paragraph",
     "1": "title",
@@ -61,8 +66,8 @@ def test_vila_predictors():
     pdfplumber_parser = PDFPlumberParser()
     rasterizer = PDF2ImageRasterizer()
 
-    doc = pdfplumber_parser.parse(input_pdf_path="tests/fixtures/1903.10676.pdf")
-    images = rasterizer.rasterize(input_pdf_path="tests/fixtures/1903.10676.pdf", dpi=72)
+    doc = pdfplumber_parser.parse(input_pdf_path="../fixtures/1903.10676.pdf")
+    images = rasterizer.rasterize(input_pdf_path="../fixtures/1903.10676.pdf", dpi=72)
     doc.annotate_images(images)
 
     layout_regions = layout_predictor.predict(doc)
@@ -124,7 +129,7 @@ def test_vila_predictors():
 
 def test_vila_predictors_with_special_unicode_inputs():
     
-    test_doc_path = "tests/fixtures/unicode-test.json"
+    test_doc_path = "../fixtures/unicode-test.json"
     
     with open(test_doc_path, 'r') as fp:
         res = json.load(fp)

From 9d09af7858a3c23b021f238e17f413d4e00a09b1 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 6 Oct 2022 16:28:55 -0700
Subject: [PATCH 14/21] add basic relation json conversion; WIP

---
 mmda/types/__init__.py                   |  5 +-
 mmda/types/annotation.py                 | 74 ++++++++++++++++++------
 mmda/types/names.py                      | 22 ++++++-
 tests/test_types/test_json_conversion.py | 16 ++++-
 4 files changed, 93 insertions(+), 24 deletions(-)

diff --git a/mmda/types/__init__.py b/mmda/types/__init__.py
index d0f3929c..24dcf0aa 100644
--- a/mmda/types/__init__.py
+++ b/mmda/types/__init__.py
@@ -1,5 +1,5 @@
 from mmda.types.document import Document
-from mmda.types.annotation import SpanGroup, BoxGroup
+from mmda.types.annotation import SpanGroup, BoxGroup, Relation
 from mmda.types.span import Span
 from mmda.types.box import Box
 from mmda.types.image import PILImage
@@ -12,5 +12,6 @@
     'Span',
     'Box',
     'PILImage',
-    'Metadata'
+    'Metadata',
+    "Relation"
 ]
\ No newline at end of file
diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index 7949c7e2..28feec7c 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -5,12 +5,15 @@
 Collections of Annotations are how one constructs a new
 Iterable of Group-type objects within the Document
 
+@kylel, @lucas
+
 """
 import warnings
 from abc import abstractmethod
 from copy import deepcopy
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
 
+from mmda.types.names import
 from mmda.types.box import Box
 from mmda.types.metadata import Metadata
 from mmda.types.span import Span
@@ -18,11 +21,9 @@
 if TYPE_CHECKING:
     from mmda.types.document import Document
 
-
 __all__ = ["Annotation", "BoxGroup", "SpanGroup", "Relation"]
 
 
-
 def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
     """Warns when a deepcopy is performed on an Annotation."""
 
@@ -34,7 +35,6 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
     warnings.warn(msg, UserWarning, stacklevel=2)
 
 
-
 class Annotation:
     """Annotation is intended for storing model predictions for a document."""
 
@@ -77,7 +77,6 @@ def __getattr__(self, field: str) -> List["Annotation"]:
         return self.__getattribute__(field)
 
 
-
 class BoxGroup(Annotation):
     def __init__(
             self,
@@ -150,7 +149,6 @@ def type(self, type: Union[str, None]) -> None:
 
 
 class SpanGroup(Annotation):
-
     def __init__(
             self,
             spans: List[Span],
@@ -172,16 +170,6 @@ def symbols(self) -> List[str]:
         else:
             return []
 
-    def annotate(
-        self, is_overwrite: bool = False, **kwargs: Iterable["Annotation"]
-    ) -> None:
-        if self.doc is None:
-            raise ValueError("SpanGroup has no attached document!")
-
-        key_remaps = {k: v for k, v in kwargs.items()}
-
-        self.doc.annotate(is_overwrite=is_overwrite, **key_remaps)
-
     def to_json(self) -> Dict:
         span_group_dict = dict(
             spans=[span.to_json() for span in self.spans],
@@ -284,6 +272,58 @@ def text(self, text: Union[str, None]) -> None:
         self.metadata.text = text
 
 
-
 class Relation(Annotation):
-    pass
\ No newline at end of file
+    def __init__(
+            self,
+            query: SpanGroup,
+            value: SpanGroup,
+            id: Optional[int] = None,
+            doc: Optional['Document'] = None,
+            metadata: Optional[Metadata] = None
+    ):
+        if query.id is None:
+            raise ValueError(f'Relation requires the query {query} to have an ID')
+        if value.id is None:
+            raise ValueError(f'Relation requires the value {value} to have an ID')
+        self.query = query
+        self.value = value
+        super().__init__(id=id, doc=doc, metadata=metadata)
+
+    def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
+        if is_minimal:
+            relation_dict = dict(
+                query=self.query.id,
+                value=self.value.id,
+                id=self.id,
+                metadata=self.metadata.to_json()
+            )
+        else:
+            relation_dict = dict(
+                query=self.query.to_json(),
+                value=self.value.to_json(),
+                id=self.id,
+                metadata=self.metadata.to_json()
+            )
+        return {
+            key: value
+            for key, value in relation_dict.items()
+            if value is not None
+        }  # only serialize non-null values
+
+    @classmethod
+    def from_json(cls, relation_dict: Dict, is_minimal: Optional[bool] = True) -> "Relation":
+        metadata_dict = relation_dict.get('metadata', {})
+        if is_minimal:
+            return cls(
+                query=SpanGroup.from_json(span_group_dict=relation_dict['query']),
+                value=SpanGroup.from_json(span_group_dict=relation_dict['value']),
+                id=relation_dict.get("id", None),
+                metadata=Metadata.from_json(metadata_dict)
+            )
+        else:
+            return cls(
+                query=SpanGroup.from_json(span_group_dict=relation_dict['query']),
+                value=SpanGroup.from_json(span_group_dict=relation_dict['value']),
+                id=relation_dict.get("id", None),
+                metadata=Metadata.from_json(metadata_dict)
+            )
diff --git a/mmda/types/names.py b/mmda/types/names.py
index 49460dbe..fbfd9327 100644
--- a/mmda/types/names.py
+++ b/mmda/types/names.py
@@ -11,8 +11,26 @@
 Images = 'images'
 
 Pages = 'pages'
-Tokens = 'tokens'
 Rows = 'rows'
-Sentences = 'sents'
 Blocks = 'blocks'
+
+Tokens = 'tokens'
 Words = 'words'
+Sentences = 'sents'
+Paragraphs = 'paras'
+SectionHeadings = 'secs'
+
+Figures = 'figures'
+Tables = 'tables'
+Captions = 'captions'
+
+BibEntries = 'bibs'
+CiteMentions = 'cites'
+ReferenceMentions = 'refs'
+
+# singletons
+Title = 'title'
+Abstract = 'abstract'
+
+# relations
+SectionParagraphs
\ No newline at end of file
diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py
index e7a5f27d..ee98e03d 100644
--- a/tests/test_types/test_json_conversion.py
+++ b/tests/test_types/test_json_conversion.py
@@ -8,7 +8,7 @@
 import json
 from pathlib import Path
 
-from mmda.types import BoxGroup, SpanGroup, Document, Metadata
+from mmda.types import BoxGroup, SpanGroup, Document, Metadata, Relation
 from mmda.parsers import PDFPlumberParser
 
 
@@ -16,17 +16,27 @@
 
 
 def test_span_group_conversion():
-    sg = SpanGroup(spans=[], id=3, metadata=Metadata.from_json({"text": "test"}))
+    sg = SpanGroup(spans=[], id=3, metadata=Metadata(text='test'))
     sg2 = SpanGroup.from_json(sg.to_json())
     assert sg2.to_json() == sg.to_json()
     assert sg2.__dict__ == sg.__dict__
 
-    bg = BoxGroup(boxes=[], metadata=Metadata.from_json({"text": "test", "id": 1}))
+    bg = BoxGroup(boxes=[], metadata=Metadata(text='test'))
     bg2 = BoxGroup.from_json(bg.to_json())
     assert bg2.to_json() == bg.to_json()
     assert bg2.__dict__ == bg.__dict__
 
 
+def test_relation_conversion():
+    r = Relation(
+        query=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
+        value=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
+        id=999,
+        metadata=Metadata(type='something')
+    )
+    r.to_json()
+
+
 def test_doc_conversion():
     pdfparser = PDFPlumberParser()
 

From e827f0211ce74179b8e8f4513a0d01c2e04954c4 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Mon, 10 Oct 2022 12:10:25 -0700
Subject: [PATCH 15/21] WIP; for relations, handle field-aware id

---
 mmda/types/annotation.py | 25 +++++++++++++++++--------
 mmda/types/names.py      |  6 +++---
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index 28feec7c..8bcd33d1 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -42,11 +42,13 @@ def __init__(
             self,
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
+            field: Optional[str] = None,
             metadata: Optional[Metadata] = None
     ):
         self.id = id
         self.doc = doc
         self.metadata = metadata if metadata else Metadata()
+        self.field = field
 
     @abstractmethod
     def to_json(self) -> Dict:
@@ -83,10 +85,11 @@ def __init__(
             boxes: List[Box],
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
+            field: Optional[str] = None,
             metadata: Optional[Metadata] = None,
     ):
         self.boxes = boxes
-        super().__init__(id=id, doc=doc, metadata=metadata)
+        super().__init__(id=id, doc=doc, field=field, metadata=metadata)
 
     def to_json(self) -> Dict:
         box_group_dict = dict(
@@ -131,6 +134,7 @@ def __deepcopy__(self, memo):
         box_group = BoxGroup(
             boxes=deepcopy(self.boxes, memo),
             id=self.id,
+            field=self.field,
             metadata=deepcopy(self.metadata, memo)
         )
 
@@ -155,18 +159,17 @@ def __init__(
             box_group: Optional[BoxGroup] = None,
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
+            field: Optional[str] = None,
             metadata: Optional[Metadata] = None,
     ):
         self.spans = spans
         self.box_group = box_group
-        super().__init__(id=id, doc=doc, metadata=metadata)
+        super().__init__(id=id, doc=doc, field=field, metadata=metadata)
 
     @property
     def symbols(self) -> List[str]:
         if self.doc is not None:
-            return [
-                self.doc.symbols[span.start: span.end] for span in self.spans
-            ]
+            return [self.doc.symbols[span.start: span.end] for span in self.spans]
         else:
             return []
 
@@ -243,6 +246,7 @@ def __deepcopy__(self, memo):
         span_group = SpanGroup(
             spans=deepcopy(self.spans, memo),
             id=self.id,
+            field=self.field,
             metadata=deepcopy(self.metadata, memo),
             box_group=deepcopy(self.box_group, memo)
         )
@@ -279,6 +283,7 @@ def __init__(
             value: SpanGroup,
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
+            field: Optional[str] = None,
             metadata: Optional[Metadata] = None
     ):
         if query.id is None:
@@ -287,13 +292,17 @@ def __init__(
             raise ValueError(f'Relation requires the value {value} to have an ID')
         self.query = query
         self.value = value
-        super().__init__(id=id, doc=doc, metadata=metadata)
+        super().__init__(id=id, doc=doc, field=field, metadata=metadata)
+
+    @classmethod
+    def entity_id(cls, entity: SpanGroup) -> str:
+        return f'{entity.field}-{entity.id}'
 
     def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
         if is_minimal:
             relation_dict = dict(
-                query=self.query.id,
-                value=self.value.id,
+                query=Relation.entity_id(self.query),
+                value=Relation.entity_id(self.value),
                 id=self.id,
                 metadata=self.metadata.to_json()
             )
diff --git a/mmda/types/names.py b/mmda/types/names.py
index fbfd9327..020debb2 100644
--- a/mmda/types/names.py
+++ b/mmda/types/names.py
@@ -1,6 +1,6 @@
 """
 
-Names of fields, as strings
+Names of Annotations, as strings
 
 @kylel
 
@@ -17,8 +17,8 @@
 Tokens = 'tokens'
 Words = 'words'
 Sentences = 'sents'
+Sections = 'secs'
 Paragraphs = 'paras'
-SectionHeadings = 'secs'
 
 Figures = 'figures'
 Tables = 'tables'
@@ -33,4 +33,4 @@
 Abstract = 'abstract'
 
 # relations
-SectionParagraphs
\ No newline at end of file
+RefersTo = 'refers_to'
\ No newline at end of file

From 2577c7d199cf7e89b7aeb6d74b7c3f1d0d16bcfa Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Tue, 11 Oct 2022 15:42:56 -0700
Subject: [PATCH 16/21] wip; minor cleanup

---
 mmda/parsers/pdfplumber_parser.py | 10 +++-------
 mmda/types/annotation.py          |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/mmda/parsers/pdfplumber_parser.py b/mmda/parsers/pdfplumber_parser.py
index 7e8d44c6..44645a8f 100644
--- a/mmda/parsers/pdfplumber_parser.py
+++ b/mmda/parsers/pdfplumber_parser.py
@@ -99,7 +99,9 @@ def __init__(
         self.split_at_punctuation = split_at_punctuation
 
     def parse(self, input_pdf_path: str) -> Document:
-        doc = self._load_pdf_as_doc(input_pdf_path)
+        page_to_line_to_tokens = self._load_pdf_tokens(input_pdf_path)
+        doc_json = self._convert_nested_text_to_doc_json(page_to_line_to_tokens)
+        doc = Document.from_json(doc_json)
         return doc
 
     def _load_page_tokens(
@@ -238,12 +240,6 @@ def _convert_nested_text_to_doc_json(self, page_to_row_to_tokens: Dict) -> Dict:
             Rows: [row.to_json() for row in row_annos],
         }
 
-    def _load_pdf_as_doc(self, input_pdf_path: str) -> Document:
-        page_to_line_to_tokens = self._load_pdf_tokens(input_pdf_path)
-        doc_json = self._convert_nested_text_to_doc_json(page_to_line_to_tokens)
-        doc = Document.from_json(doc_json)
-        return doc
-
     def _simple_line_detection(
             self,
             page_tokens: List[Dict],
diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index c7eea81d..5edeec47 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -13,7 +13,7 @@
 from copy import deepcopy
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
 
-from mmda.types.names import
+
 from mmda.types.box import Box
 from mmda.types.metadata import Metadata
 from mmda.types.span import Span

From e5d1d2811ad551b580eef84f2c24952428dc31b9 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 13 Oct 2022 13:57:15 -0700
Subject: [PATCH 17/21] add AnnotationName class; add lookup method to Document
 based on name; base Relation class on storage of these names; define to and
 from_json

---
 mmda/types/annotation.py                 | 120 +++++++++++++++--------
 mmda/types/document.py                   |  74 +++++++-------
 tests/test_types/test_json_conversion.py |   2 +-
 3 files changed, 118 insertions(+), 78 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index 5edeec47..ad2f3113 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -8,12 +8,12 @@
 @kylel, @lucas
 
 """
+import logging
 import warnings
 from abc import abstractmethod
 from copy import deepcopy
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
 
-
 from mmda.types.box import Box
 from mmda.types.metadata import Metadata
 from mmda.types.span import Span
@@ -35,6 +35,23 @@ def warn_deepcopy_of_annotation(obj: "Annotation") -> None:
     warnings.warn(msg, UserWarning, stacklevel=2)
 
 
+class AnnotationName:
+    """Stores a name that uniquely identifies this Annotation within a Document"""
+
+    def __init__(self, field: str, id: int):
+        self.field = field
+        self.id = id
+
+    def __str__(self) -> str:
+        return f"{self.field}-{self.id}"
+
+    @classmethod
+    def from_str(cls, s: str) -> 'AnnotationName':
+        field, id = s.split('-')
+        id = int(id)
+        return AnnotationName(field=field, id=id)
+
+
 class Annotation:
     """Annotation is intended for storing model predictions for a document."""
 
@@ -47,35 +64,50 @@ def __init__(
     ):
         self.id = id
         self.doc = doc
-        self.metadata = metadata if metadata else Metadata()
         self.field = field
+        self.metadata = metadata if metadata else Metadata()
 
     @abstractmethod
     def to_json(self) -> Dict:
-        pass
+        raise NotImplementedError
 
     @classmethod
     @abstractmethod
     def from_json(cls, annotation_dict: Dict) -> "Annotation":
-        pass
+        raise NotImplementedError
 
-    def attach_doc(self, doc: "Document") -> None:
+    @property
+    def name(self) -> Optional[AnnotationName]:
+        if self.field and self.id:
+            return AnnotationName(field=self.field, id=self.id)
+        else:
+            return None
+
+    def _attach_doc(self, doc: "Document", field: str) -> None:
         if not self.doc:
             self.doc = doc
+            self.field = field
         else:
             raise AttributeError("This annotation already has an attached document")
 
-    # TODO[kylel] - comment explaining
-    def __getattr__(self, field: str) -> List["Annotation"]:
-        if self.doc is None:
-            raise ValueError("This annotation is not attached to a document")
+    def _get_siblings(self) -> List['Annotation']:
+        """This method gets all other objects sharing the same field as the current object.
+        Only works after a Document has been attached, which is how objects learn their `field`."""
+        if not self.doc:
+            raise AttributeError("This annotation does not have an attached document")
+        return self.doc.__getattribute__(self.field)
 
-        if field in self.doc.fields:
-            return self.doc.find_overlapping(self, field)
+    def __getattr__(self, field: str) -> List["Annotation"]:
+        """This method allows jumping from an object of one field to all overlapping
+        objects of another field. For example `page.tokens` jumps from a particular page
+        to all its intersecting tokens."""
+        if not self.doc:
+            raise AttributeError("This annotation does not have an attached document")
 
         if field in self.doc.fields:
             return self.doc.find_overlapping(self, field)
 
+        # TODO[kylel] - when does this ever get called? infinite loop?
         return self.__getattribute__(field)
 
 
@@ -92,6 +124,7 @@ def __init__(
         super().__init__(id=id, doc=doc, field=field, metadata=metadata)
 
     def to_json(self) -> Dict:
+        """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat"""
         box_group_dict = dict(
             boxes=[box.to_json() for box in self.boxes],
             id=self.id,
@@ -145,15 +178,16 @@ def __deepcopy__(self, memo):
 
     @property
     def type(self) -> str:
+        logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`')
         return self.metadata.get("type", None)
 
     @type.setter
     def type(self, type: Union[str, None]) -> None:
+        logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`')
         self.metadata.type = type
 
 
 class SpanGroup(Annotation):
-
     def __init__(
             self,
             spans: List[Span],
@@ -174,17 +208,8 @@ def symbols(self) -> List[str]:
         else:
             return []
 
-    def annotate(
-        self, is_overwrite: bool = False, **kwargs: Iterable["Annotation"]
-    ) -> None:
-        if self.doc is None:
-            raise ValueError("SpanGroup has no attached document!")
-
-        key_remaps = {k: v for k, v in kwargs.items()}
-
-        self.doc.annotate(is_overwrite=is_overwrite, **key_remaps)
-
     def to_json(self) -> Dict:
+        """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat"""
         span_group_dict = dict(
             spans=[span.to_json() for span in self.spans],
             id=self.id,
@@ -210,7 +235,7 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup":
         else:
             # this fallback is necessary to ensure compatibility with span
             # groups that were create before the metadata migration and
-            # therefore have "id", "type" in the root of the json dict instead.
+            # therefore have "type" in the root of the json dict instead.
             metadata_dict = {
                 "type": span_group_dict.get("type", None),
                 "text": span_group_dict.get("text", None)
@@ -269,10 +294,12 @@ def __deepcopy__(self, memo):
 
     @property
     def type(self) -> str:
+        logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`')
         return self.metadata.get("type", None)
 
     @type.setter
     def type(self, type: Union[str, None]) -> None:
+        logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`')
         self.metadata.type = type
 
     @property
@@ -290,36 +317,33 @@ def text(self, text: Union[str, None]) -> None:
 class Relation(Annotation):
     def __init__(
             self,
-            query: SpanGroup,
+            key: SpanGroup,
             value: SpanGroup,
             id: Optional[int] = None,
             doc: Optional['Document'] = None,
             field: Optional[str] = None,
             metadata: Optional[Metadata] = None
     ):
-        if query.id is None:
-            raise ValueError(f'Relation requires the query {query} to have an ID')
-        if value.id is None:
-            raise ValueError(f'Relation requires the value {value} to have an ID')
-        self.query = query
+        if key.name is None:
+            raise ValueError(f'Relation requires the key {key} to have a `.name`')
+        if value.name is None:
+            raise ValueError(f'Relation requires the value {value} to have a `.name`')
+        self.key = key
         self.value = value
         super().__init__(id=id, doc=doc, field=field, metadata=metadata)
 
-    @classmethod
-    def entity_id(cls, entity: SpanGroup) -> str:
-        return f'{entity.field}-{entity.id}'
-
     def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
+        """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat"""
         if is_minimal:
             relation_dict = dict(
-                query=Relation.entity_id(self.query),
-                value=Relation.entity_id(self.value),
+                key=self.key.name,
+                value=self.value.name,
                 id=self.id,
                 metadata=self.metadata.to_json()
             )
         else:
             relation_dict = dict(
-                query=self.query.to_json(),
+                key=self.key.to_json(),
                 value=self.value.to_json(),
                 id=self.id,
                 metadata=self.metadata.to_json()
@@ -331,19 +355,29 @@ def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
         }  # only serialize non-null values
 
     @classmethod
-    def from_json(cls, relation_dict: Dict, is_minimal: Optional[bool] = True) -> "Relation":
-        metadata_dict = relation_dict.get('metadata', {})
+    def from_json(
+            cls,
+            relation_dict: Dict,
+            is_minimal: Optional[bool] = True,
+            doc: Optional['Document'] = None,
+    ) -> "Relation":
         if is_minimal:
+            if not doc:
+                raise ValueError(
+                    f"Creating a Relation from a minimal JSON requires Document `doc` "
+                    f"otherwise, no way to know what the key {relation_dict['key']} "
+                    f"or value {relation_dict['value']}"
+                )
             return cls(
-                query=SpanGroup.from_json(span_group_dict=relation_dict['query']),
-                value=SpanGroup.from_json(span_group_dict=relation_dict['value']),
+                key=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['key'])),
+                value=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['value'])),
                 id=relation_dict.get("id", None),
-                metadata=Metadata.from_json(metadata_dict)
+                metadata=Metadata.from_json(relation_dict.get('metadata', {}))
             )
         else:
             return cls(
-                query=SpanGroup.from_json(span_group_dict=relation_dict['query']),
+                key=SpanGroup.from_json(span_group_dict=relation_dict['key']),
                 value=SpanGroup.from_json(span_group_dict=relation_dict['value']),
                 id=relation_dict.get("id", None),
-                metadata=Metadata.from_json(metadata_dict)
+                metadata=Metadata.from_json(relation_dict.get('metadata', {}))
             )
diff --git a/mmda/types/document.py b/mmda/types/document.py
index cbd00655..b0d6f3da 100644
--- a/mmda/types/document.py
+++ b/mmda/types/document.py
@@ -9,7 +9,7 @@
 from copy import deepcopy
 from typing import Dict, Iterable, List, Optional
 
-from mmda.types.annotation import Annotation, BoxGroup, SpanGroup
+from mmda.types.annotation import Annotation, BoxGroup, SpanGroup, AnnotationName
 from mmda.types.image import PILImage
 from mmda.types.indexers import Indexer, SpanGroupIndexer
 from mmda.types.names import Images, Symbols
@@ -17,7 +17,6 @@
 
 
 class Document:
-
     SPECIAL_FIELDS = [Symbols, Images]
     UNALLOWED_FIELD_NAMES = ["fields"]
 
@@ -32,35 +31,35 @@ def fields(self) -> List[str]:
         return self.__fields
 
     # TODO: extend implementation to support DocBoxGroup
-    def find_overlapping(self, query: Annotation, field_name: str) -> List[Annotation]:
+    def find_overlapping(self, query: Annotation, field: str) -> List[Annotation]:
         if not isinstance(query, SpanGroup):
             raise NotImplementedError(
                 f"Currently only supports query of type SpanGroup"
             )
-        return self.__indexers[field_name].find(query=query)
+        return self.__indexers[field].find(query=query)
 
     def annotate(
-        self, is_overwrite: bool = False, **kwargs: Iterable[Annotation]
+            self, is_overwrite: bool = False, **kwargs: Iterable[Annotation]
     ) -> None:
         """Annotate the fields for document symbols (correlating the annotations with the
         symbols) and store them into the papers.
         """
         # 1) check validity of field names
-        for field_name in kwargs.keys():
+        for field in kwargs.keys():
             assert (
-                field_name not in self.SPECIAL_FIELDS
-            ), f"The field_name {field_name} should not be in {self.SPECIAL_FIELDS}."
+                field not in self.SPECIAL_FIELDS
+            ), f"The field {field} should not be in {self.SPECIAL_FIELDS}."
 
-            if field_name in self.fields:
+            if field in self.fields:
                 # already existing field, check if ok overriding
                 if not is_overwrite:
                     raise AssertionError(
-                        f"This field name {field_name} already exists. To override, set `is_overwrite=True`"
+                        f"This field name {field} already exists. To override, set `is_overwrite=True`"
                     )
-            elif field_name in dir(self):
+            elif field in dir(self):
                 # not an existing field, but a reserved class method name
                 raise AssertionError(
-                    f"The field_name {field_name} should not conflict with existing class properties"
+                    f"The field {field} should not conflict with existing class properties"
                 )
 
         # Kyle's preserved comment:
@@ -68,39 +67,39 @@ def annotate(
         # overhead on large documents.
 
         # 2) register fields into Document
-        for field_name, annotations in kwargs.items():
+        for field, annotations in kwargs.items():
             if len(annotations) == 0:
-                warnings.warn(f"The annotations is empty for the field {field_name}")
-                setattr(self, field_name, [])
-                self.__fields.append(field_name)
+                warnings.warn(f"The annotations is empty for the field {field}")
+                setattr(self, field, [])
+                self.__fields.append(field)
                 continue
 
             annotation_types = {type(a) for a in annotations}
             assert (
                 len(annotation_types) == 1
-            ), f"Annotations in field_name {field_name} more than 1 type: {annotation_types}"
+            ), f"Annotations in field {field} more than 1 type: {annotation_types}"
             annotation_type = annotation_types.pop()
 
             if annotation_type == SpanGroup:
                 span_groups = self._annotate_span_group(
-                    span_groups=annotations, field_name=field_name
+                    span_groups=annotations, field=field
                 )
             elif annotation_type == BoxGroup:
                 # TODO: not good. BoxGroups should be stored on their own, not auto-generating SpanGroups.
                 span_groups = self._annotate_box_group(
-                    box_groups=annotations, field_name=field_name
+                    box_groups=annotations, field=field
                 )
             else:
                 raise NotImplementedError(
-                    f"Unsupported annotation type {annotation_type} for {field_name}"
+                    f"Unsupported annotation type {annotation_type} for {field}"
                 )
 
             # register fields
-            setattr(self, field_name, span_groups)
-            self.__fields.append(field_name)
+            setattr(self, field, span_groups)
+            self.__fields.append(field)
 
     def annotate_images(
-        self, images: Iterable[PILImage], is_overwrite: bool = False
+            self, images: Iterable[PILImage], is_overwrite: bool = False
     ) -> None:
         if not is_overwrite and len(self.images) > 0:
             raise AssertionError(
@@ -122,7 +121,7 @@ def annotate_images(
         self.images = images
 
     def _annotate_span_group(
-        self, span_groups: List[SpanGroup], field_name: str
+            self, span_groups: List[SpanGroup], field: str
     ) -> List[SpanGroup]:
         """Annotate the Document using a bunch of span groups.
         It will associate the annotations with the document symbols.
@@ -131,15 +130,15 @@ def _annotate_span_group(
 
         # 1) add Document to each SpanGroup
         for span_group in span_groups:
-            span_group.attach_doc(doc=self)
+            span_group._attach_doc(doc=self, field=field)
 
         # 2) Build fast overlap lookup index
-        self.__indexers[field_name] = SpanGroupIndexer(span_groups)
+        self.__indexers[field] = SpanGroupIndexer(span_groups)
 
         return span_groups
 
     def _annotate_box_group(
-        self, box_groups: List[BoxGroup], field_name: str
+            self, box_groups: List[BoxGroup], field: str
     ) -> List[SpanGroup]:
         """Annotate the Document using a bunch of box groups.
         It will associate the annotations with the document symbols.
@@ -177,7 +176,7 @@ def _annotate_box_group(
             derived_span_groups.append(
                 SpanGroup(
                     spans=MergeSpans(list_of_spans=all_token_spans_with_box_group, index_distance=1)
-                    .merge_neighbor_spans_by_symbol_distance(), box_group=box_group,
+                        .merge_neighbor_spans_by_symbol_distance(), box_group=box_group,
                     # id = box_id,
                 )
                 # TODO Right now we cannot assign the box id, or otherwise running doc.blocks will
@@ -195,7 +194,7 @@ def _annotate_box_group(
             span_group.id = box_id
 
         return self._annotate_span_group(
-            span_groups=derived_span_groups, field_name=field_name
+            span_groups=derived_span_groups, field=field
         )
 
     #
@@ -245,16 +244,23 @@ def from_json(cls, doc_dict: Dict) -> "Document":
             )
 
         # 2) convert span group dicts to span gropus
-        field_name_to_span_groups = {}
-        for field_name, span_group_dicts in doc_dict.items():
-            if field_name not in doc.SPECIAL_FIELDS:
+        field_to_span_groups = {}
+        for field, span_group_dicts in doc_dict.items():
+            if field not in doc.SPECIAL_FIELDS:
                 span_groups = [
                     SpanGroup.from_json(span_group_dict=span_group_dict)
                     for span_group_dict in span_group_dicts
                 ]
-                field_name_to_span_groups[field_name] = span_groups
+                field_to_span_groups[field] = span_groups
 
         # 3) load annotations for each field
-        doc.annotate(**field_name_to_span_groups)
+        doc.annotate(**field_to_span_groups)
 
         return doc
+
+    def locate_annotation(self, name: AnnotationName) -> Annotation:
+        """Return the unique annotation in field `name.field` whose ID is `name.id`; raise IndexError otherwise."""
+        matched_annotations = [c for c in getattr(self, name.field) if c.id == name.id]
+        if len(matched_annotations) != 1:  # zero matches used to surface as a bare "list index out of range"
+            raise IndexError(f"Found {len(matched_annotations)} annotations in field {name.field} with ID {name.id}; expected 1")
+        return matched_annotations[0]
diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py
index ee98e03d..63b6a3fc 100644
--- a/tests/test_types/test_json_conversion.py
+++ b/tests/test_types/test_json_conversion.py
@@ -29,7 +29,7 @@ def test_span_group_conversion():
 
 def test_relation_conversion():
     r = Relation(
-        query=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
+        key=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
         value=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
         id=999,
         metadata=Metadata(type='something')

From f83eb00b4f5b5da30f8f99672703ee5babf3e847 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 13 Oct 2022 17:44:09 -0700
Subject: [PATCH 18/21] add relation test; remove ability to create relation
 from json

---
 mmda/types/annotation.py                 | 18 ++++++-------
 tests/test_types/test_json_conversion.py | 34 ++++++++++++++++++++----
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index ad2f3113..c505ced2 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -336,8 +336,8 @@ def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
         """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat"""
         if is_minimal:
             relation_dict = dict(
-                key=self.key.name,
-                value=self.value.name,
+                key=str(self.key.name),
+                value=str(self.value.name),
                 id=self.id,
                 metadata=self.metadata.to_json()
             )
@@ -368,16 +368,14 @@ def from_json(
                     f"otherwise, no way to know what the key {relation_dict['key']} "
                     f"or value {relation_dict['value']}"
                 )
+            key_name = AnnotationName.from_str(s=relation_dict['key'])
+            value_name = AnnotationName.from_str(s=relation_dict['value'])
             return cls(
-                key=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['key'])),
-                value=doc.locate_annotation(name=AnnotationName.from_str(s=relation_dict['value'])),
+                key=doc.locate_annotation(name=key_name),
+                value=doc.locate_annotation(name=value_name),
                 id=relation_dict.get("id", None),
                 metadata=Metadata.from_json(relation_dict.get('metadata', {}))
             )
         else:
-            return cls(
-                key=SpanGroup.from_json(span_group_dict=relation_dict['key']),
-                value=SpanGroup.from_json(span_group_dict=relation_dict['value']),
-                id=relation_dict.get("id", None),
-                metadata=Metadata.from_json(relation_dict.get('metadata', {}))
-            )
+            raise NotImplementedError(f'Not currently supported. Awkward to build relations'
+                                      f'without an existing Document object that stores fields.')
\ No newline at end of file
diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py
index 63b6a3fc..c14a36e7 100644
--- a/tests/test_types/test_json_conversion.py
+++ b/tests/test_types/test_json_conversion.py
@@ -11,7 +11,6 @@
 from mmda.types import BoxGroup, SpanGroup, Document, Metadata, Relation
 from mmda.parsers import PDFPlumberParser
 
-
 PDFFILEPATH = Path(__file__).parent / "../fixtures/1903.10676.pdf"
 
 
@@ -29,12 +28,37 @@ def test_span_group_conversion():
 
 def test_relation_conversion():
     r = Relation(
-        key=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
-        value=SpanGroup(spans=[], id=3, metadata=Metadata(text='test')),
+        key=SpanGroup(spans=[], id=3, metadata=Metadata(foobar='test'), field='abc'),
+        value=SpanGroup(spans=[], id=5, metadata=Metadata(foobar='test'), field='xyz'),
         id=999,
-        metadata=Metadata(type='something')
+        metadata=Metadata(blabla='something')
     )
-    r.to_json()
+
+    # minimal to & from JSON (default behavior)
+    r_dict_minimal = {
+        'key': 'abc-3',
+        'value': 'xyz-5',
+        'id': 999,
+        'metadata': {'blabla': 'something'}
+    }
+    assert r.to_json() == r.to_json(is_minimal=True) == r_dict_minimal
+
+    doc = Document.from_json(doc_dict={
+        'symbols': 'asdfasdf',
+        'abc': [{'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}}],
+        'xyz': [{'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}]
+    })
+    assert r_dict_minimal == r.from_json(r_dict_minimal, is_minimal=True, doc=doc).to_json() == \
+           r.from_json(r_dict_minimal, doc=doc).to_json()
+
+    # full to JSON
+    r_dict_full = {
+        'key': {'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}},
+        'value': {'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}},
+        'id': 999,
+        'metadata': {'blabla': 'something'}
+    }
+    assert r.to_json(is_minimal=False) == r_dict_full
 
 
 def test_doc_conversion():

From 84682ee20f8ae666058442ebeb535049603728e4 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 13 Oct 2022 18:08:43 -0700
Subject: [PATCH 19/21] remove unused ways to from JSON for relations

---
 mmda/types/annotation.py                 | 51 ++++++++----------------
 tests/test_types/test_json_conversion.py | 15 ++-----
 2 files changed, 19 insertions(+), 47 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index c505ced2..ee67f964 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -332,22 +332,14 @@ def __init__(
         self.value = value
         super().__init__(id=id, doc=doc, field=field, metadata=metadata)
 
-    def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
+    def to_json(self) -> Dict:
         """Note: even if `doc` or `field` are attached, don't include in JSON to avoid bloat"""
-        if is_minimal:
-            relation_dict = dict(
-                key=str(self.key.name),
-                value=str(self.value.name),
-                id=self.id,
-                metadata=self.metadata.to_json()
-            )
-        else:
-            relation_dict = dict(
-                key=self.key.to_json(),
-                value=self.value.to_json(),
-                id=self.id,
-                metadata=self.metadata.to_json()
-            )
+        relation_dict = dict(
+            key=str(self.key.name),
+            value=str(self.value.name),
+            id=self.id,
+            metadata=self.metadata.to_json()
+        )
         return {
             key: value
             for key, value in relation_dict.items()
@@ -358,24 +350,13 @@ def to_json(self, is_minimal: Optional[bool] = True) -> Dict:
     def from_json(
             cls,
             relation_dict: Dict,
-            is_minimal: Optional[bool] = True,
-            doc: Optional['Document'] = None,
+            doc: 'Document',
     ) -> "Relation":
-        if is_minimal:
-            if not doc:
-                raise ValueError(
-                    f"Creating a Relation from a minimal JSON requires Document `doc` "
-                    f"otherwise, no way to know what the key {relation_dict['key']} "
-                    f"or value {relation_dict['value']}"
-                )
-            key_name = AnnotationName.from_str(s=relation_dict['key'])
-            value_name = AnnotationName.from_str(s=relation_dict['value'])
-            return cls(
-                key=doc.locate_annotation(name=key_name),
-                value=doc.locate_annotation(name=value_name),
-                id=relation_dict.get("id", None),
-                metadata=Metadata.from_json(relation_dict.get('metadata', {}))
-            )
-        else:
-            raise NotImplementedError(f'Not currently supported. Awkward to build relations'
-                                      f'without an existing Document object that stores fields.')
\ No newline at end of file
+        key_name = AnnotationName.from_str(s=relation_dict['key'])
+        value_name = AnnotationName.from_str(s=relation_dict['value'])
+        return cls(
+            key=doc.locate_annotation(name=key_name),
+            value=doc.locate_annotation(name=value_name),
+            id=relation_dict.get("id", None),
+            metadata=Metadata.from_json(relation_dict.get('metadata', {}))
+        )
diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py
index c14a36e7..6cd3e608 100644
--- a/tests/test_types/test_json_conversion.py
+++ b/tests/test_types/test_json_conversion.py
@@ -34,31 +34,22 @@ def test_relation_conversion():
         metadata=Metadata(blabla='something')
     )
 
-    # minimal to & from JSON (default behavior)
+    # to & from JSON
     r_dict_minimal = {
         'key': 'abc-3',
         'value': 'xyz-5',
         'id': 999,
         'metadata': {'blabla': 'something'}
     }
-    assert r.to_json() == r.to_json(is_minimal=True) == r_dict_minimal
+    assert r.to_json() == r_dict_minimal
 
     doc = Document.from_json(doc_dict={
         'symbols': 'asdfasdf',
         'abc': [{'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}}],
         'xyz': [{'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}]
     })
-    assert r_dict_minimal == r.from_json(r_dict_minimal, is_minimal=True, doc=doc).to_json() == \
-           r.from_json(r_dict_minimal, doc=doc).to_json()
+    assert r_dict_minimal == r.from_json(r_dict_minimal, doc=doc).to_json()
 
-    # full to JSON
-    r_dict_full = {
-        'key': {'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}},
-        'value': {'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}},
-        'id': 999,
-        'metadata': {'blabla': 'something'}
-    }
-    assert r.to_json(is_minimal=False) == r_dict_full
 
 
 def test_doc_conversion():

From 818349bf9f90536b1f98d1df689e9c386a738341 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Thu, 20 Oct 2022 23:39:45 -0700
Subject: [PATCH 20/21] replace getattribute with getattr

---
 mmda/types/annotation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index ee67f964..7db9a591 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -95,7 +95,7 @@ def _get_siblings(self) -> List['Annotation']:
         Only works after a Document has been attached, which is how objects learn their `field`."""
         if not self.doc:
             raise AttributeError("This annotation does not have an attached document")
-        return self.doc.__getattribute__(self.field)
+        return self.doc.__getattr__(self.field)
 
     def __getattr__(self, field: str) -> List["Annotation"]:
         """This method allows jumping from an object of one field to all overlapping
@@ -108,7 +108,7 @@ def __getattr__(self, field: str) -> List["Annotation"]:
             return self.doc.find_overlapping(self, field)
 
         # TODO[kylel] - when does this ever get called? infinite loop?
-        return self.__getattribute__(field)
+        return self.__getattr__(field)
 
 
 class BoxGroup(Annotation):

From 5820934a72622e5101043f55c5c454ee77dacb16 Mon Sep 17 00:00:00 2001
From: kyleclo <kyleclo@uw.edu>
Date: Fri, 21 Oct 2022 11:06:46 -0700
Subject: [PATCH 21/21] return empty list if no getattr match

---
 mmda/types/annotation.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py
index 7db9a591..5d925d83 100644
--- a/mmda/types/annotation.py
+++ b/mmda/types/annotation.py
@@ -106,9 +106,8 @@ def __getattr__(self, field: str) -> List["Annotation"]:
 
         if field in self.doc.fields:
             return self.doc.find_overlapping(self, field)
-
-        # TODO[kylel] - when does this ever get called? infinite loop?
-        return self.__getattr__(field)
+        else:
+            return []
 
 
 class BoxGroup(Annotation):