diff --git a/mmda/types/annotation.py b/mmda/types/annotation.py index ee67f964..f5b1fb5d 100644 --- a/mmda/types/annotation.py +++ b/mmda/types/annotation.py @@ -65,7 +65,7 @@ def __init__( self.id = id self.doc = doc self.field = field - self.metadata = metadata if metadata else Metadata() + self.metadata = metadata @abstractmethod def to_json(self) -> Dict: @@ -128,10 +128,11 @@ def to_json(self) -> Dict: box_group_dict = dict( boxes=[box.to_json() for box in self.boxes], id=self.id, - metadata=self.metadata.to_json() + metadata=self.metadata.to_json() if self.metadata else None ) return { - key: value for key, value in box_group_dict.items() if value + key: value for key, value in box_group_dict.items() + if value is not None } # only serialize non-null values @classmethod @@ -146,6 +147,8 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup": metadata_dict = { "type": box_group_dict.get("type", None) } + metadata_dict = {key: value for key, value in metadata_dict.items() if value is not None} + metadata = Metadata.from_json(metadata_dict) if metadata_dict else None return cls( boxes=[ @@ -155,9 +158,10 @@ def from_json(cls, box_group_dict: Dict) -> "BoxGroup": for box_dict in box_group_dict.get("boxes", []) ], id=box_group_dict.get("id", None), - metadata=Metadata.from_json(metadata_dict), + metadata=metadata, ) + def __getitem__(self, key: int): return self.boxes[key] @@ -179,12 +183,15 @@ def __deepcopy__(self, memo): @property def type(self) -> str: logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`') - return self.metadata.get("type", None) + return self.metadata.get("type", None) if self.metadata else None @type.setter def type(self, type: Union[str, None]) -> None: logging.warning(msg='`.type` to be deprecated in future versions. 
Use `.metadata.type`') - self.metadata.type = type + if self.metadata: + self.metadata.type = type + else: + self.metadata = Metadata(type=type) class SpanGroup(Annotation): @@ -213,7 +220,7 @@ def to_json(self) -> Dict: span_group_dict = dict( spans=[span.to_json() for span in self.spans], id=self.id, - metadata=self.metadata.to_json(), + metadata=self.metadata.to_json() if self.metadata else None, box_group=self.box_group.to_json() if self.box_group else None ) return { @@ -240,6 +247,8 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup": "type": span_group_dict.get("type", None), "text": span_group_dict.get("text", None) } + metadata_dict = {key: value for key, value in metadata_dict.items() if value is not None} + metadata = Metadata.from_json(metadata_dict) if metadata_dict else None return cls( spans=[ @@ -247,7 +256,7 @@ def from_json(cls, span_group_dict: Dict) -> "SpanGroup": for span_dict in span_group_dict["spans"] ], id=span_group_dict.get("id", None), - metadata=Metadata.from_json(metadata_dict), + metadata=metadata, box_group=box_group, ) @@ -295,23 +304,31 @@ def __deepcopy__(self, memo): @property def type(self) -> str: logging.warning(msg='`.type` to be deprecated in future versions. Use `.metadata.type`') - return self.metadata.get("type", None) + return self.metadata.get("type", None) if self.metadata else None @type.setter def type(self, type: Union[str, None]) -> None: logging.warning(msg='`.type` to be deprecated in future versions. 
Use `.metadata.type`') - self.metadata.type = type + if self.metadata: + self.metadata.type = type + else: + self.metadata = Metadata(type=type) @property def text(self) -> str: - maybe_text = self.metadata.get("text", None) - if maybe_text is None: - return " ".join(self.symbols) - return maybe_text + if self.metadata: + maybe_text = self.metadata.get("text", None) + if maybe_text: + return maybe_text + # default behavior is convenient + return " ".join(self.symbols) @text.setter def text(self, text: Union[str, None]) -> None: - self.metadata.text = text + if self.metadata: + self.metadata.text = text + else: + self.metadata = Metadata(text=text) class Relation(Annotation): @@ -338,7 +355,7 @@ def to_json(self) -> Dict: key=str(self.key.name), value=str(self.value.name), id=self.id, - metadata=self.metadata.to_json() + metadata=self.metadata.to_json() if self.metadata else None ) return { key: value diff --git a/tests/test_types/test_annotation.py b/tests/test_types/test_annotation.py deleted file mode 100644 index 83e345fb..00000000 --- a/tests/test_types/test_annotation.py +++ /dev/null @@ -1,36 +0,0 @@ -from mmda.types.annotation import BoxGroup -from mmda.types.box import Box -import unittest - - -class TestBoxGroup(unittest.TestCase): - def setUp(cls) -> None: - cls.box_group_json = {'boxes': [{'left': 0.1, - 'top': 0.6, - 'width': 0.36, - 'height': 0.221, - 'page': 0}], - 'id': None, - 'type': 'Text'} - - def test_from_json(self): - self.assertIsInstance(BoxGroup.from_json(self.box_group_json), BoxGroup) - self.assertEqual(BoxGroup.from_json(self.box_group_json).boxes, - [Box(l=0.1, t=0.6, w=0.36, h=0.221, page=0)]) - - self.assertEqual(BoxGroup.from_json(self.box_group_json).id, None) - self.assertEqual(BoxGroup.from_json(self.box_group_json).type, 'Text') - - def test_to_json(self): - boxgroup = BoxGroup.from_json(self.box_group_json) - - self.assertIsInstance(boxgroup.to_json(), dict) - self.assertEqual(boxgroup.to_json()['boxes'], - [{'left': 0.1, - 
'top': 0.6, - 'width': 0.36, - 'height': 0.221, - 'page': 0}]) - - assert 'boxes' in boxgroup.to_json() - assert 'metadata' in boxgroup.to_json() diff --git a/tests/test_types/test_box.py b/tests/test_types/test_box.py deleted file mode 100644 index 8528ecc6..00000000 --- a/tests/test_types/test_box.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -from mmda.types import box as mmda_box - - -class TestBox(unittest.TestCase): - def setUp(cls) -> None: - cls.box_dict = {'left': 0.2, - 'top': 0.09, - 'width': 0.095, - 'height': 0.017, - 'page': 0} - cls.box = mmda_box.Box(l=0.2, t=0.09, w=0.095, h=0.017, page=0) - - def test_from_json(self): - self.assertEqual(self.box.from_json(self.box_dict), self.box) - - def test_to_json(self): - self.assertEqual(self.box.to_json(), self.box_dict) diff --git a/tests/test_types/test_document.py b/tests/test_types/test_document.py index ea953ba6..dc3ec0d5 100644 --- a/tests/test_types/test_document.py +++ b/tests/test_types/test_document.py @@ -9,3 +9,4 @@ def test__empty_annotations_work(self): annotations = [] doc.annotate(my_cool_field=annotations) self.assertEqual(doc.my_cool_field, []) + diff --git a/tests/test_types/test_json_conversion.py b/tests/test_types/test_json_conversion.py index 6cd3e608..cc5ad534 100644 --- a/tests/test_types/test_json_conversion.py +++ b/tests/test_types/test_json_conversion.py @@ -1,86 +1,163 @@ ''' Description: Test whether all properties for an mmda doc are preserved when converting to json and back. 
-Author: @soldni +Author: @soldni, @kylel ''' +import unittest import json -from pathlib import Path -from mmda.types import BoxGroup, SpanGroup, Document, Metadata, Relation -from mmda.parsers import PDFPlumberParser - -PDFFILEPATH = Path(__file__).parent / "../fixtures/1903.10676.pdf" - - -def test_span_group_conversion(): - sg = SpanGroup(spans=[], id=3, metadata=Metadata(text='test')) - sg2 = SpanGroup.from_json(sg.to_json()) - assert sg2.to_json() == sg.to_json() - assert sg2.__dict__ == sg.__dict__ - - bg = BoxGroup(boxes=[], metadata=Metadata(text='test')) - bg2 = BoxGroup.from_json(bg.to_json()) - assert bg2.to_json() == bg.to_json() - assert bg2.__dict__ == bg.__dict__ - - -def test_relation_conversion(): - r = Relation( - key=SpanGroup(spans=[], id=3, metadata=Metadata(foobar='test'), field='abc'), - value=SpanGroup(spans=[], id=5, metadata=Metadata(foobar='test'), field='xyz'), - id=999, - metadata=Metadata(blabla='something') - ) - - # to & from JSON - r_dict_minimal = { - 'key': 'abc-3', - 'value': 'xyz-5', - 'id': 999, - 'metadata': {'blabla': 'something'} - } - assert r.to_json() == r_dict_minimal - - doc = Document.from_json(doc_dict={ - 'symbols': 'asdfasdf', - 'abc': [{'spans': [], 'id': 3, 'metadata': {'foobar': 'test'}}], - 'xyz': [{'spans': [], 'id': 5, 'metadata': {'foobar': 'test'}}] - }) - assert r_dict_minimal == r.from_json(r_dict_minimal, doc=doc).to_json() - - - -def test_doc_conversion(): - pdfparser = PDFPlumberParser() - - orig_doc = pdfparser.parse(input_pdf_path=str(PDFFILEPATH)) - - json_doc = json.dumps(orig_doc.to_json()) - new_doc = Document.from_json(json.loads(json_doc)) - - # We can't just have a `assert new_doc == orig_doc` statement since - # internal references to the doc itself (e.g. `doc.tokens[0].doc`) - # would make it fail. instead, we compare specific elements of the doc. 
- - # compare just token representation and name of fields - assert orig_doc.symbols == new_doc.symbols - assert orig_doc.fields == new_doc.fields - - for field_name in orig_doc.fields: - # this iterates over all span group for this field in both docs - field_it = zip( - getattr(orig_doc, field_name), - getattr(new_doc, field_name) - ) - - # type annotations to keep mypy quiet - orig_sg: SpanGroup - new_sg: SpanGroup - - for orig_sg, new_sg in field_it: - # for each pair, they should have same metadata (type, id, - # and optionally, text) and same spans. - assert orig_sg.metadata == new_sg.metadata - assert orig_sg.spans == new_sg.spans +from mmda.types import Span, Box, BoxGroup, SpanGroup, Document, Metadata, Relation + + +class TestJSONConversion(unittest.TestCase): + + def test_boxes(self): + # minimal box + b = Box(l=0.0, t=0.1, w=0.2, h=0.3, page=4) + b_dict = {'left': 0.0, 'top': 0.1, 'width': 0.2, 'height': 0.3, 'page': 4} + assert b.to_json() == b_dict + assert Box.from_json(box_dict=b_dict).to_json() == b_dict + + def test_spans(self): + # minimal span + s = Span(start=0, end=2) + s_dict = {'start': 0, 'end': 2} + assert s.to_json() == s_dict + assert Span.from_json(span_dict=s_dict).to_json() == s_dict + + # contains boxes + s = Span(start=0, end=2, box=Box(l=0.0, t=0.1, w=0.2, h=0.3, page=4)) + s_dict = {'start': 0, 'end': 2, + 'box': {'left': 0.0, 'top': 0.1, 'width': 0.2, 'height': 0.3, 'page': 4}} + assert s.to_json() == s_dict + assert Span.from_json(span_dict=s_dict).to_json() == s_dict + + def test_metadata(self): + # empty metadata + m = Metadata() + m_dict = {} + assert m.to_json() == m_dict + assert Metadata.from_json(di=m_dict).to_json() == m_dict + + # null-valued metadata + m = Metadata(foo=None, bar=None) + m_dict = {'foo': None, 'bar': None} + assert m.to_json() == m_dict + assert Metadata.from_json(di=m_dict).to_json() == m_dict + + # meaningful metadata + m = Metadata(foo='xyz', bar='abc') + m_dict = {'foo': 'xyz', 'bar': 'abc'} + 
assert m.to_json() == m_dict + assert Metadata.from_json(di=m_dict).to_json() == m_dict + + def test_box_groups(self): + # minimal box group + bg = BoxGroup(boxes=[]) + bg_dict = {'boxes': []} + assert bg.to_json() == bg_dict + assert BoxGroup.from_json(box_group_dict=bg_dict).to_json() == bg_dict + + # slightly more stuff in box group + bg = BoxGroup(boxes=[], id=999, doc=Document(symbols='doesnt-matter-what-goes-here'), + field='also-ignored', metadata=Metadata(foo='bar')) + bg_dict = {'boxes': [], 'id': 999, 'metadata': {'foo': 'bar'}} + assert bg.to_json() == bg_dict + assert BoxGroup.from_json(box_group_dict=bg_dict).to_json() == bg_dict + + # add boxes to boxgroup + bg = BoxGroup(boxes=[Box(l=0.0, t=0.1, w=0.2, h=0.3, page=4), + Box(l=0.5, t=0.6, w=0.7, h=0.8, page=9)]) + bg_dict = {'boxes': [{'left': 0.0, 'top': 0.1, 'width': 0.2, 'height': 0.3, 'page': 4}, + {'left': 0.5, 'top': 0.6, 'width': 0.7, 'height': 0.8, 'page': 9}]} + assert bg.to_json() == bg_dict + assert BoxGroup.from_json(box_group_dict=bg_dict).to_json() == bg_dict + + def test_span_groups(self): + # minimal span group + sg = SpanGroup(spans=[]) + sg_dict = {'spans': []} + assert sg.to_json() == sg_dict + assert SpanGroup.from_json(span_group_dict=sg_dict).to_json() == sg_dict + + # slightly more stuff in span group + sg = SpanGroup(spans=[], id=999, doc=Document(symbols='doesnt-matter-what-goes-here'), + field='also-ignored', metadata=Metadata(foo='bar')) + sg_dict = {'spans': [], 'id': 999, 'metadata': {'foo': 'bar'}} + assert sg.to_json() == sg_dict + assert SpanGroup.from_json(span_group_dict=sg_dict).to_json() == sg_dict + + # add spans to spangroup + sg = SpanGroup(spans=[Span(start=0, end=2), Span(start=3, end=4)]) + sg_dict = {'spans': [{'start': 0, 'end': 2}, {'start': 3, 'end': 4}]} + assert sg.to_json() == sg_dict + assert SpanGroup.from_json(span_group_dict=sg_dict).to_json() == sg_dict + + # contains boxgroup + sg = SpanGroup(spans=[], box_group=BoxGroup(boxes=[])) + sg_dict = 
{'spans': [], 'box_group': {'boxes': []}} + assert sg.to_json() == sg_dict + assert SpanGroup.from_json(span_group_dict=sg_dict).to_json() == sg_dict + + def test_documents(self): + # minimal doc + doc = Document(symbols='a b c d e f g') + doc_dict = {'symbols': 'a b c d e f g'} + assert doc.to_json() == doc_dict + assert Document.from_json(doc_dict=doc_dict).to_json() == doc_dict + + # doc with span group + doc_dict = {'symbols': 'a b c d e f g', 'stuff': [{'spans': []}]} + doc = Document.from_json(doc_dict) + assert doc.fields == ['stuff'] # from_json() should apply .annotate() + assert len(doc.stuff) == 1 + assert doc.stuff[0].to_json() == {'spans': []} + assert doc.to_json() == doc_dict + + def test_relations(self): + # minimal relation still requires SpanGroup to have names + with self.assertRaises(ValueError): + Relation(key=SpanGroup(spans=[]), value=SpanGroup(spans=[])) + + # minimal relation working example + sg1 = SpanGroup(spans=[], id=123, field='abc') + sg2 = SpanGroup(spans=[], id=999, field='xyz') + r = Relation(key=sg1, value=sg2) + r_dict = {'key': 'abc-123', 'value': 'xyz-999'} + assert r.to_json() == r_dict + + # to test `from_json()` we need a Document annotated with the related units + doc = Document(symbols='a b c d e f g') + sg1 = SpanGroup(spans=[], id=123) + sg2 = SpanGroup(spans=[], id=999) + doc.annotate(abc=[sg1]) + doc.annotate(xyz=[sg2]) + r_dict = {'key': 'abc-123', 'value': 'xyz-999'} + assert Relation.from_json(relation_dict=r_dict, doc=doc).to_json() == r_dict + + # `from_json()` should fail if Document isn't coherent with fieldnames + doc = Document(symbols='a b c d e f g') + sg1 = SpanGroup(spans=[], id=123) + sg2 = SpanGroup(spans=[], id=999) + doc.annotate(wrongname=[sg1]) + doc.annotate(alsowrongname=[sg2]) + r_dict = {'key': 'abc-123', 'value': 'xyz-999'} + with self.assertRaises(AttributeError): + Relation.from_json(relation_dict=r_dict, doc=doc) + + # relations can have metadata too + sg1 = SpanGroup(spans=[], id=123, 
field='abc') + sg2 = SpanGroup(spans=[], id=999, field='xyz') + r = Relation(key=sg1, value=sg2, id=40404, + doc=Document(symbols='doesnt-get-used-when-to-json'), field='same-here', + metadata=Metadata(foo='bar')) + r_dict = {'key': 'abc-123', 'value': 'xyz-999', 'id': 40404, 'metadata': {'foo': 'bar'}} + assert r.to_json() == r_dict + doc = Document(symbols='a b c d e f g') + sg1 = SpanGroup(spans=[], id=123) + sg2 = SpanGroup(spans=[], id=999) + doc.annotate(abc=[sg1]) + doc.annotate(xyz=[sg2]) + assert Relation.from_json(relation_dict=r_dict, doc=doc).to_json() == r_dict diff --git a/tests/test_types/test_span.py b/tests/test_types/test_span.py deleted file mode 100644 index fd7fb145..00000000 --- a/tests/test_types/test_span.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest -from mmda.types import span as mmda_span -from mmda.types import box as mmda_box - - -class TestBox(unittest.TestCase): - def setUp(cls): - cls.span = mmda_span.Span(start=0, end=0) - cls.span_dict = {'start': 0, - 'end': 8, - 'box': {'left': 0.2, - 'top': 0.09, - 'width': 0.095, - 'height': 0.017, - 'page': 0}} - - def test_from_json(self): - self.assertEqual(self.span.from_json(self.span_dict), - mmda_span.Span(start=0, end=8, box=mmda_box.Box(l=0.2, t=0.09, w=0.095, h=0.017, page=0))) - - def test_to_json(self): - self.assertEqual(self.span.from_json(self.span_dict).to_json(), - self.span_dict) -