From 0cfed4559bb1ccc23b35322adbe336c39842a4e1 Mon Sep 17 00:00:00 2001 From: Twin Karmakharm Date: Thu, 25 May 2023 23:55:49 +0100 Subject: [PATCH 1/6] Resolves #345 fixed username anonymization --- backend/models.py | 5 +++-- backend/tests/test_models.py | 15 ++++++++++----- teamware/settings/base.py | 5 +++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/backend/models.py b/backend/models.py index d8e44ed1..f35726fa 100644 --- a/backend/models.py +++ b/backend/models.py @@ -1019,8 +1019,9 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): annotation_sets = {} for annotation in annotations: a_data = annotation.data + anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" annotation_set = { - "name": annotation.user.id if anonymize else annotation.user.username, + "name": anonymized_name if anonymize else annotation.user.username, "annotations": [ { "type": "Document", @@ -1035,7 +1036,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): ], "next_annid": 1, } - annotation_sets[annotation.user.username] = annotation_set + annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set doc_dict["annotation_sets"] = annotation_sets # Add to the export the lists (possibly empty) of users who rejected, diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py index 2002937e..1f56cf4c 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1098,8 +1098,9 @@ def test_get_annotations_for_user_in_project(self): class TestDocumentAnnotationModelExport(TestCase): def setUp(self): + self.unanonymized_prefix = "namedperson" self.test_user = get_user_model().objects.create(username="project_creator") - self.annotator_names = [f"anno{i}" for i in range(3)] + self.annotator_names = [f"{self.unanonymized_prefix}{i}" for i in range(3)] self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names] 
self.annotator_ids = [a.id for a in self.annotators] self.project = Project.objects.create(owner=self.test_user) @@ -1233,7 +1234,8 @@ def test_export_raw_anonymized(self): doc_dict = document.get_doc_annotation_dict("raw", anonymize=True) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), int)) + self.assertFalse(aset_key.startswith(self.unanonymized_prefix)) + self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix)) self.check_teamware_status(doc_dict, self.annotator_ids) @@ -1243,7 +1245,8 @@ def test_export_raw_deanonymized(self): doc_dict = document.get_doc_annotation_dict("raw", anonymize=False) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), str)) + self.assertTrue(aset_key.startswith(self.unanonymized_prefix)) + self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix)) # for non-anonymized export the rejected/aborted/timed_out status # uses names rather than ID numbers @@ -1255,7 +1258,8 @@ def test_export_gate_anonymized(self): doc_dict = document.get_doc_annotation_dict("gate", anonymize=True) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), int)) + self.assertFalse(aset_key.startswith(self.unanonymized_prefix)) + self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix)) self.check_teamware_status(doc_dict["features"], self.annotator_ids) @@ -1265,7 +1269,8 @@ def test_export_gate_deanonymized(self): doc_dict = document.get_doc_annotation_dict("gate", anonymize=False) for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue(isinstance(aset_data.get("name", None), str)) + self.assertTrue(aset_key.startswith(self.unanonymized_prefix)) + self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix)) # for non-anonymized export 
the rejected/aborted/timed_out status # uses names rather than ID numbers diff --git a/teamware/settings/base.py b/teamware/settings/base.py index fccf1e4f..66b0fe46 100644 --- a/teamware/settings/base.py +++ b/teamware/settings/base.py @@ -265,6 +265,11 @@ DELETED_USER_LASTNAME = "Deleted" DELETED_USER_EMAIL_DOMAIN = "teamware-deleted.com" +""" +Anonymization settings +""" +ANONYMIZATION_PREFIX = "annotator" + """ Frontend dev server configuration """ From d91a5549f799dcd458ca009766c36e7b896d0306 Mon Sep 17 00:00:00 2001 From: Ian Roberts Date: Mon, 26 Feb 2024 16:28:10 +0000 Subject: [PATCH 2/6] Use the same ANONYMIZATION_PREFIX in teamware_status section --- backend/models.py | 4 ++-- backend/tests/test_models.py | 12 ++++++------ .../documents_annotations_management.md | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/models.py b/backend/models.py index f35726fa..fb17e60b 100644 --- a/backend/models.py +++ b/backend/models.py @@ -1008,7 +1008,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): annotation_dict["duration_seconds"] = annotation.time_to_complete if anonymize: - annotation_sets[str(annotation.user.id)] = annotation_dict + annotation_sets[f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"] = annotation_dict else: annotation_sets[annotation.user.username] = annotation_dict @@ -1048,7 +1048,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): ("aborted", Annotation.ABORTED), ]: teamware_status[key] = [ - annotation.user.id if anonymize else annotation.user.username + f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" if anonymize else annotation.user.username for annotation in self.annotations.filter(status=status) ] if json_format == "csv": diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py index 1f56cf4c..4a70def3 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1102,7 +1102,7 @@ def setUp(self): self.test_user = 
get_user_model().objects.create(username="project_creator") self.annotator_names = [f"{self.unanonymized_prefix}{i}" for i in range(3)] self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names] - self.annotator_ids = [a.id for a in self.annotators] + self.anon_annotator_names = [f"{settings.ANONYMIZATION_PREFIX}{a.id}" for a in self.annotators] self.project = Project.objects.create(owner=self.test_user) for i in range(10): document = Document.objects.create( @@ -1157,7 +1157,7 @@ def test_export_raw(self): self.assertTrue("feature3" in doc_dict) self.check_raw_gate_annotation_formatting(doc_dict) - self.check_teamware_status(doc_dict, self.annotator_ids) + self.check_teamware_status(doc_dict, self.anon_annotator_names) def test_export_gate(self): @@ -1174,7 +1174,7 @@ def test_export_gate(self): self.assertTrue("feature3" in doc_features) self.check_raw_gate_annotation_formatting(doc_dict) - self.check_teamware_status(doc_features, self.annotator_ids) + self.check_teamware_status(doc_features, self.anon_annotator_names) def check_raw_gate_annotation_formatting(self, doc_dict): self.assertTrue("annotation_sets" in doc_dict) @@ -1226,7 +1226,7 @@ def test_export_csv(self): self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str)) self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str)) - self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids)) + self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.anon_annotator_names)) def test_export_raw_anonymized(self): @@ -1237,7 +1237,7 @@ def test_export_raw_anonymized(self): self.assertFalse(aset_key.startswith(self.unanonymized_prefix)) self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix)) - self.check_teamware_status(doc_dict, self.annotator_ids) + self.check_teamware_status(doc_dict, self.anon_annotator_names) def test_export_raw_deanonymized(self): @@ -1261,7 +1261,7 @@ def 
test_export_gate_anonymized(self): self.assertFalse(aset_key.startswith(self.unanonymized_prefix)) self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix)) - self.check_teamware_status(doc_dict["features"], self.annotator_ids) + self.check_teamware_status(doc_dict["features"], self.anon_annotator_names) def test_export_gate_deanonymized(self): diff --git a/docs/docs/manageradminguide/documents_annotations_management.md b/docs/docs/manageradminguide/documents_annotations_management.md index c73fbb5a..5b55ce84 100644 --- a/docs/docs/manageradminguide/documents_annotations_management.md +++ b/docs/docs/manageradminguide/documents_annotations_management.md @@ -234,7 +234,7 @@ You can choose how documents are exported: In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`). - The field `teamware_status` gives the ids or usernames (depending on the "anonymize" setting) of those annotators + The field `teamware_status` gives the usernames or anonymous IDs (depending on the "anonymize" setting) of those annotators who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the project, or "aborted" for some other reason (e.g. they were removed from the project). 
From f650d872c2c6fa8f2745aa987282caf575b8498e Mon Sep 17 00:00:00 2001 From: Twin Karmakharm Date: Fri, 26 May 2023 00:28:13 +0100 Subject: [PATCH 3/6] #346 Prevent double nesting of features field --- backend/models.py | 7 +++++-- backend/tests/test_models.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/backend/models.py b/backend/models.py index fb17e60b..29a39dbe 100644 --- a/backend/models.py +++ b/backend/models.py @@ -980,9 +980,12 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): if json_format == "raw" or json_format == "csv": doc_dict = self.data.copy() elif json_format == "gate": + # GATE json format are expected to have an existing "features" field + features_dict = self.data["features"] if "features" in self.data and isinstance(self.data["features"], dict) else {} - ignore_keys = {"text", self.project.document_id_field} - features_dict = {key: value for key, value in self.data.items() if key not in ignore_keys} + # Add any non-compliant top-level fields into the "features" field instead + ignore_keys = {"text", "features", self.project.document_id_field} + features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys}) doc_dict = { "text": self.data["text"], diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py index 4a70def3..a2d3652c 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1113,6 +1113,11 @@ def setUp(self): "feature1": "Testvalue 1", "feature2": "Testvalue 1", "feature3": "Testvalue 1", + "features": { + "gate_format_feature1": "Gate feature test value", + "gate_format_feature2": "Gate feature test value", + "gate_format_feature3": "Gate feature test value", + } } ) @@ -1148,6 +1153,8 @@ def setUp(self): def test_export_raw(self): for document in self.project.documents.all(): + # Fields should remain exactly the same as what's been uploaded + # aside from annotation_sets doc_dict = 
document.get_doc_annotation_dict("raw") print(doc_dict) self.assertTrue("id" in doc_dict) @@ -1155,6 +1162,11 @@ def test_export_raw(self): self.assertTrue("feature1" in doc_dict) self.assertTrue("feature2" in doc_dict) self.assertTrue("feature3" in doc_dict) + self.assertTrue("features" in doc_dict) + doc_features = doc_dict["features"] + self.assertTrue("gate_format_feature1" in doc_features) + self.assertTrue("gate_format_feature2" in doc_features) + self.assertTrue("gate_format_feature3" in doc_features) self.check_raw_gate_annotation_formatting(doc_dict) self.check_teamware_status(doc_dict, self.anon_annotator_names) @@ -1162,6 +1174,8 @@ def test_export_raw(self): def test_export_gate(self): for document in self.project.documents.all(): + # All top-level fields apart from name, text, features and annotation_sets should be + # nested inside the features field doc_dict = document.get_doc_annotation_dict("gate") print(doc_dict) @@ -1172,6 +1186,10 @@ def test_export_gate(self): self.assertTrue("feature1" in doc_features) self.assertTrue("feature2" in doc_features) self.assertTrue("feature3" in doc_features) + self.assertFalse("features" in doc_features, "Double nesting of features field") + self.assertTrue("gate_format_feature1" in doc_features) + self.assertTrue("gate_format_feature2" in doc_features) + self.assertTrue("gate_format_feature3" in doc_features) self.check_raw_gate_annotation_formatting(doc_dict) self.check_teamware_status(doc_features, self.anon_annotator_names) From b94e22d2672e400d60ba2fad9faf72482fc2b8bf Mon Sep 17 00:00:00 2001 From: Twin Karmakharm Date: Fri, 26 May 2023 01:56:49 +0100 Subject: [PATCH 4/6] #348 merge existing and new annotation fields --- backend/models.py | 17 +++--- backend/tests/test_models.py | 103 +++++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 25 deletions(-) diff --git a/backend/models.py b/backend/models.py index 29a39dbe..24ffc0b6 100644 --- a/backend/models.py +++ b/backend/models.py @@ 
-981,24 +981,25 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): doc_dict = self.data.copy() elif json_format == "gate": # GATE json format are expected to have an existing "features" field - features_dict = self.data["features"] if "features" in self.data and isinstance(self.data["features"], dict) else {} + features_dict = dict(self.data["features"]) if "features" in self.data and isinstance(self.data["features"], dict) else {} # Add any non-compliant top-level fields into the "features" field instead - ignore_keys = {"text", "features", self.project.document_id_field} + ignore_keys = {"text", "features", "offset_type", "annotation_sets", self.project.document_id_field} features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys}) doc_dict = { "text": self.data["text"], "features": features_dict, - "offset_type": "p", + "offset_type": self.data["offset_type"] if "offset_type" in self.data else "p", # Use original offset type "name": get_value_from_key_path(self.data, self.project.document_id_field) } # Insert annotation sets into the doc dict annotations = self.annotations.filter(status=Annotation.COMPLETED) if json_format == "csv": + # Gets pre-existing annotations + annotation_sets = dict(self.data["annotations"]) if "annotations" in self.data else {} # Format annotations for CSV export - annotation_sets = {} for annotation in annotations: a_data = annotation.data annotation_dict = {} @@ -1018,8 +1019,9 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True): doc_dict["annotations"] = annotation_sets else: + # Gets pre-existing annotations + annotation_sets = dict(self.data["annotation_sets"]) if "annotation_sets" in self.data else {} # Format for JSON in line with GATE formatting - annotation_sets = {} for annotation in annotations: a_data = annotation.data anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" @@ -1032,14 +1034,13 @@ def get_doc_annotation_dict(self, 
json_format="raw", anonymize=True): "end": 0, "id": 0, "duration_seconds": annotation.time_to_complete, - "features": { - "label": a_data - } + "features": a_data } ], "next_annid": 1, } annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set + doc_dict["annotation_sets"] = annotation_sets # Add to the export the lists (possibly empty) of users who rejected, diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py index a2d3652c..c5ddccfe 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1117,8 +1117,52 @@ def setUp(self): "gate_format_feature1": "Gate feature test value", "gate_format_feature2": "Gate feature test value", "gate_format_feature3": "Gate feature test value", + }, + "offset_type": "x", + "annotations": { + "existing_annotator1": { + "sentiment": "positive" + }, + f"2": { + "sentiment": "positive" + } + + }, + "annotation_sets": { + "existing_annotator1": { + "name": "existing_annotator1", + "annotations": [ + { + "type": "Document", + "start": 0, + "end": 10, + "id": 0, + "features": { + "sentiment": "positive" + } + } + ], + "next_annid": 1 + }, + f"{settings.ANONYMIZATION_PREFIX}2": { + "name": f"{settings.ANONYMIZATION_PREFIX}1", + "annotations": [ + { + "type": "Document", + "start": 0, + "end": 10, + "id": 0, + "features": { + "sentiment": "positive" + } + } + ], + "next_annid": 1 + } + } + } ) @@ -1163,11 +1207,14 @@ def test_export_raw(self): self.assertTrue("feature2" in doc_dict) self.assertTrue("feature3" in doc_dict) self.assertTrue("features" in doc_dict) + self.assertTrue("offset_type" in doc_dict) + self.assertTrue("annotations" in doc_dict) doc_features = doc_dict["features"] self.assertTrue("gate_format_feature1" in doc_features) self.assertTrue("gate_format_feature2" in doc_features) self.assertTrue("gate_format_feature3" in doc_features) + self.check_raw_gate_annotation_formatting(doc_dict) self.check_teamware_status(doc_dict, 
self.anon_annotator_names) @@ -1181,12 +1228,16 @@ def test_export_gate(self): self.assertTrue("text" in doc_dict) self.assertTrue("features" in doc_dict) + self.assertFalse("annotations" in doc_dict) + self.assertEqual(doc_dict["offset_type"], "x") doc_features = doc_dict["features"] self.assertTrue("id" in doc_features) self.assertTrue("feature1" in doc_features) self.assertTrue("feature2" in doc_features) self.assertTrue("feature3" in doc_features) + self.assertTrue("annotations" in doc_features) self.assertFalse("features" in doc_features, "Double nesting of features field") + self.assertFalse("offset_type" in doc_features, "Double nesting of offset_type field") self.assertTrue("gate_format_feature1" in doc_features) self.assertTrue("gate_format_feature2" in doc_features) self.assertTrue("gate_format_feature3" in doc_features) @@ -1194,25 +1245,40 @@ def test_export_gate(self): self.check_raw_gate_annotation_formatting(doc_dict) self.check_teamware_status(doc_features, self.anon_annotator_names) + def test_export_gate_with_no_offset_type(self): + + for document in self.project.documents.all(): + document.data.pop("offset_type") + + doc_dict = document.get_doc_annotation_dict("gate") + self.assertEqual(doc_dict["offset_type"], "p", "offset_type should default to p") + + def check_raw_gate_annotation_formatting(self, doc_dict): self.assertTrue("annotation_sets" in doc_dict) - self.assertTrue(len(doc_dict["annotation_sets"]) == 3) + self.assertEqual(len(doc_dict["annotation_sets"]), 4) # Test annotation formatting for aset_key, aset_data in doc_dict["annotation_sets"].items(): - self.assertTrue("name" in aset_data) - self.assertTrue("annotations" in aset_data) - self.assertEqual(len(aset_data["annotations"]), 1) - anno_dict = aset_data["annotations"][0] - self.assertTrue("type" in anno_dict) - self.assertTrue("start" in anno_dict) - self.assertTrue("end" in anno_dict) - self.assertTrue("id" in anno_dict) - self.assertTrue("features" in anno_dict) - 
self.assertTrue("label" in anno_dict["features"]) - label_dict = anno_dict["features"]["label"] - self.assertTrue("text1" in label_dict) - self.assertTrue("checkbox1" in label_dict) + if aset_key != "existing_annotator1": + self.assertTrue("name" in aset_data) + self.assertTrue("annotations" in aset_data) + self.assertEqual(len(aset_data["annotations"]), 1) + anno_dict = aset_data["annotations"][0] + self.assertTrue("type" in anno_dict) + self.assertTrue("start" in anno_dict) + self.assertTrue("end" in anno_dict) + self.assertTrue("id" in anno_dict) + self.assertTrue("features" in anno_dict) + features_dict = anno_dict["features"] + self.assertTrue("text1" in features_dict) + self.assertTrue("checkbox1" in features_dict) + else: + # Check that existing annotation from document upload is carried over + self.assertEqual(aset_data["annotations"][0]["features"]["sentiment"], "positive") + + + def check_teamware_status(self, containing_dict, expected_value): self.assertTrue("teamware_status" in containing_dict) @@ -1238,11 +1304,14 @@ def test_export_csv(self): self.assertTrue("feature2" in doc_dict) self.assertTrue("feature3" in doc_dict) self.assertTrue("annotations" in doc_dict) - self.assertTrue(len(doc_dict["annotations"]) == 3) + self.assertEqual(len(doc_dict["annotations"]), 4) anno_set_dict = doc_dict["annotations"] for set_key in anno_set_dict: - self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str)) - self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str)) + if set_key != "existing_annotator1": + self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str)) + self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str)) + else: + self.assertEqual(anno_set_dict[set_key]["sentiment"], "positive") self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.anon_annotator_names)) From d34021d5708eab7bb143261fbe79a51a8f5fe66b Mon Sep 17 00:00:00 2001 From: Twin Karmakharm Date: Fri, 26 May 2023 16:59:49 +0100 Subject: 
[PATCH 5/6] Fixed implementation of export tests --- backend/tests/test_models.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/backend/tests/test_models.py b/backend/tests/test_models.py index c5ddccfe..2758df2c 100644 --- a/backend/tests/test_models.py +++ b/backend/tests/test_models.py @@ -1123,7 +1123,7 @@ def setUp(self): "existing_annotator1": { "sentiment": "positive" }, - f"2": { + f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": { "sentiment": "positive" } @@ -1144,8 +1144,8 @@ def setUp(self): ], "next_annid": 1 }, - f"{settings.ANONYMIZATION_PREFIX}2": { - "name": f"{settings.ANONYMIZATION_PREFIX}1", + f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": { + "name": f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}", "annotations": [ { "type": "Document", @@ -1254,9 +1254,9 @@ def test_export_gate_with_no_offset_type(self): self.assertEqual(doc_dict["offset_type"], "p", "offset_type should default to p") - def check_raw_gate_annotation_formatting(self, doc_dict): + def check_raw_gate_annotation_formatting(self, doc_dict: dict): self.assertTrue("annotation_sets" in doc_dict) - self.assertEqual(len(doc_dict["annotation_sets"]), 4) + self.assertEqual(len(doc_dict["annotation_sets"]), 4, doc_dict) # Test annotation formatting for aset_key, aset_data in doc_dict["annotation_sets"].items(): @@ -1304,7 +1304,7 @@ def test_export_csv(self): self.assertTrue("feature2" in doc_dict) self.assertTrue("feature3" in doc_dict) self.assertTrue("annotations" in doc_dict) - self.assertEqual(len(doc_dict["annotations"]), 4) + self.assertEqual(len(doc_dict["annotations"]), 4, doc_dict) anno_set_dict = doc_dict["annotations"] for set_key in anno_set_dict: if set_key != "existing_annotator1": @@ -1318,6 +1318,10 @@ def test_export_csv(self): def test_export_raw_anonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + 
document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("raw", anonymize=True) for aset_key, aset_data in doc_dict["annotation_sets"].items(): @@ -1329,6 +1333,10 @@ def test_export_raw_anonymized(self): def test_export_raw_deanonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("raw", anonymize=False) for aset_key, aset_data in doc_dict["annotation_sets"].items(): @@ -1342,6 +1350,10 @@ def test_export_raw_deanonymized(self): def test_export_gate_anonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("gate", anonymize=True) for aset_key, aset_data in doc_dict["annotation_sets"].items(): @@ -1353,6 +1365,10 @@ def test_export_gate_anonymized(self): def test_export_gate_deanonymized(self): for document in self.project.documents.all(): + # Mask any existing annotations that came with the document upload + document.data.pop("annotation_sets") + document.save() + doc_dict = document.get_doc_annotation_dict("gate", anonymize=False) for aset_key, aset_data in doc_dict["annotation_sets"].items(): From e289227c19b4cc3e2ebb3531aa42b55bac88b5e9 Mon Sep 17 00:00:00 2001 From: Twin Karmakharm Date: Fri, 26 May 2023 17:20:36 +0100 Subject: [PATCH 6/6] Updated docs on how the documents and annotation are now exported --- .../documents_annotations_management.md | 40 ++++++++++--------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/docs/docs/manageradminguide/documents_annotations_management.md b/docs/docs/manageradminguide/documents_annotations_management.md index 5b55ce84..7b852340 100644 --- a/docs/docs/manageradminguide/documents_annotations_management.md +++ 
b/docs/docs/manageradminguide/documents_annotations_management.md @@ -187,12 +187,11 @@ possible to determine which documents were annotated by _the same person_, just You can choose how documents are exported: * `.json` & `.jsonl` - JSON or JSON Lines files can be generated in the format of: - * `raw` - Exports unmodified JSON. If you've originally uploaded in GATE format then choose this option. - - An additional field named `annotation_sets` is added for storing annotations. The annotations are laid out in the - same way as GATE JSON format. For example if a document has been annotated by `user1` with labels and values - `text`:`Annotation text`, `radio`:`val3`, and `checkbox`:`["val2", "val4"]`, the non-anonymous export might look - like this: + * `raw` - Exports the original `JSON` combined with an additional field named `annotation_sets` for storing + annotations. The annotations are laid out in the same way as GATE + [bdocjs](https://gatenlp.github.io/gateplugin-Format_Bdoc/bdoc_document.html) format. For example if a document + has been annotated by `user1` with labels and values `text`:`Annotation text`, `radio`:`val3`, and + `checkbox`:`["val2", "val4"]`, the non-anonymous export might look like this: ```json { @@ -210,14 +209,12 @@ You can choose how documents are exported: "end":10, "id":0, "features":{ - "label":{ - "text":"Annotation text", - "radio":"val3", - "checkbox":[ - "val2", - "val4" - ] - } + "text":"Annotation text", + "radio":"val3", + "checkbox":[ + "val2", + "val4" + ] } } ], @@ -232,16 +229,18 @@ You can choose how documents are exported: } ``` - In anonymous mode the name `user1` would instead be the user's opaque numeric identifier (e.g. `105`). + In anonymous mode the name `user1` would instead be derived from the user's opaque numeric identifier (e.g. + `annotator105`). 
The field `teamware_status` gives the usernames or anonymous IDs (depending on the "anonymize" setting) of those annotators who rejected the document, "timed out" because they did not complete their annotation in the time allowed by the project, or "aborted" for some other reason (e.g. they were removed from the project). - * `gate` - Convert documents to GATE JSON format and export. A `name` field is added that takes the ID value from the - ID field specified in the project configuration. Fields apart from `text` and the ID field specified in the project - config are placed in the `features` field, as is the `teamware_status` information. An `annotation_sets` field is - added for storing annotations. + * `gate` - Convert documents to GATE [bdocjs](https://gatenlp.github.io/gateplugin-Format_Bdoc/bdoc_document.html) + format and export. A `name` field is added that takes the `ID` value from the `ID field` specified in the + **project configuration**. Any top-level fields apart from `text`, `features`, `offset_type`, `annotation_sets`, + and the ID field specified in the project config are placed in the `features` field, as is the `teamware_status` + information. An `annotation_sets` field is added for storing annotations if it doesn't already exist. For example in the case of this uploaded JSON document: ```json @@ -271,6 +270,9 @@ You can choose how documents are exported: columns with the header of `annotations.username.label` and the status information is in columns named `teamware_status.rejected_by`, `teamware_status.timed_out` and `teamware_status.aborted`. +**Note: Documents that contain existing annotations (i.e. the `annotation_sets` field for `JSON` or `annotations` for `CSV`) are merged with the new sets of annotations. Be aware that if the document has a new annotation from an annotator with the same +username, the previous annotation will be overwritten. 
Existing annotations are also not anonymized when exporting the document.** + ## Deleting documents and annotations It is possible to click on the top left of corner of documents and annotations to select it, then click on the