Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for various export related issues #377

Merged
merged 6 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions backend/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,22 +980,26 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
if json_format == "raw" or json_format == "csv":
doc_dict = self.data.copy()
elif json_format == "gate":
# GATE JSON documents are expected to have an existing "features" field
features_dict = dict(self.data["features"]) if "features" in self.data and isinstance(self.data["features"], dict) else {}

ignore_keys = {"text", self.project.document_id_field}
features_dict = {key: value for key, value in self.data.items() if key not in ignore_keys}
# Add any non-compliant top-level fields into the "features" field instead
ignore_keys = {"text", "features", "offset_type", "annotation_sets", self.project.document_id_field}
features_dict.update({key: value for key, value in self.data.items() if key not in ignore_keys})

doc_dict = {
"text": self.data["text"],
"features": features_dict,
"offset_type": "p",
"offset_type": self.data["offset_type"] if "offset_type" in self.data else "p", # Use original offset type
"name": get_value_from_key_path(self.data, self.project.document_id_field)
}

# Insert annotation sets into the doc dict
annotations = self.annotations.filter(status=Annotation.COMPLETED)
if json_format == "csv":
# Gets pre-existing annotations
annotation_sets = dict(self.data["annotations"]) if "annotations" in self.data else {}
# Format annotations for CSV export
annotation_sets = {}
for annotation in annotations:
a_data = annotation.data
annotation_dict = {}
Expand All @@ -1008,34 +1012,35 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
annotation_dict["duration_seconds"] = annotation.time_to_complete

if anonymize:
annotation_sets[str(annotation.user.id)] = annotation_dict
annotation_sets[f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"] = annotation_dict
else:
annotation_sets[annotation.user.username] = annotation_dict

doc_dict["annotations"] = annotation_sets

else:
# Gets pre-existing annotations
annotation_sets = dict(self.data["annotation_sets"]) if "annotation_sets" in self.data else {}
# Format for JSON in line with GATE formatting
annotation_sets = {}
for annotation in annotations:
a_data = annotation.data
anonymized_name = f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}"
annotation_set = {
"name": annotation.user.id if anonymize else annotation.user.username,
"name": anonymized_name if anonymize else annotation.user.username,
"annotations": [
{
"type": "Document",
"start": 0,
"end": 0,
"id": 0,
"duration_seconds": annotation.time_to_complete,
"features": {
"label": a_data
}
"features": a_data
}
],
"next_annid": 1,
}
annotation_sets[annotation.user.username] = annotation_set
annotation_sets[anonymized_name if anonymize else annotation.user.username] = annotation_set

doc_dict["annotation_sets"] = annotation_sets

# Add to the export the lists (possibly empty) of users who rejected,
Expand All @@ -1047,7 +1052,7 @@ def get_doc_annotation_dict(self, json_format="raw", anonymize=True):
("aborted", Annotation.ABORTED),
]:
teamware_status[key] = [
annotation.user.id if anonymize else annotation.user.username
f"{settings.ANONYMIZATION_PREFIX}{annotation.user.id}" if anonymize else annotation.user.username
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated the teamware_status to use the same anonymized names as everything else.

for annotation in self.annotations.filter(status=status)
]
if json_format == "csv":
Expand Down
166 changes: 137 additions & 29 deletions backend/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,10 +1098,11 @@ def test_get_annotations_for_user_in_project(self):
class TestDocumentAnnotationModelExport(TestCase):

def setUp(self):
self.unanonymized_prefix = "namedperson"
self.test_user = get_user_model().objects.create(username="project_creator")
self.annotator_names = [f"anno{i}" for i in range(3)]
self.annotator_names = [f"{self.unanonymized_prefix}{i}" for i in range(3)]
self.annotators = [get_user_model().objects.create(username=u) for u in self.annotator_names]
self.annotator_ids = [a.id for a in self.annotators]
self.anon_annotator_names = [f"{settings.ANONYMIZATION_PREFIX}{a.id}" for a in self.annotators]
self.project = Project.objects.create(owner=self.test_user)
for i in range(10):
document = Document.objects.create(
Expand All @@ -1112,6 +1113,55 @@ def setUp(self):
"feature1": "Testvalue 1",
"feature2": "Testvalue 1",
"feature3": "Testvalue 1",
"features": {
"gate_format_feature1": "Gate feature test value",
"gate_format_feature2": "Gate feature test value",
"gate_format_feature3": "Gate feature test value",
},
"offset_type": "x",
"annotations": {
"existing_annotator1": {
"sentiment": "positive"
},
f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": {
"sentiment": "positive"
}

},
"annotation_sets": {
"existing_annotator1": {
"name": "existing_annotator1",
"annotations": [
{
"type": "Document",
"start": 0,
"end": 10,
"id": 0,
"features": {
"sentiment": "positive"
}
}
],
"next_annid": 1
},
f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}": {
"name": f"{settings.ANONYMIZATION_PREFIX}{self.annotators[0].pk}",
"annotations": [
{
"type": "Document",
"start": 0,
"end": 10,
"id": 0,
"features": {
"sentiment": "positive"
}
}
],
"next_annid": 1
}

}


}
)
Expand Down Expand Up @@ -1147,53 +1197,88 @@ def setUp(self):
def test_export_raw(self):

for document in self.project.documents.all():
# Fields should remain exactly the same as what's been uploaded
# aside from annotation_sets
doc_dict = document.get_doc_annotation_dict("raw")
print(doc_dict)
self.assertTrue("id" in doc_dict)
self.assertTrue("text" in doc_dict)
self.assertTrue("feature1" in doc_dict)
self.assertTrue("feature2" in doc_dict)
self.assertTrue("feature3" in doc_dict)
self.assertTrue("features" in doc_dict)
self.assertTrue("offset_type" in doc_dict)
self.assertTrue("annotations" in doc_dict)
doc_features = doc_dict["features"]
self.assertTrue("gate_format_feature1" in doc_features)
self.assertTrue("gate_format_feature2" in doc_features)
self.assertTrue("gate_format_feature3" in doc_features)


self.check_raw_gate_annotation_formatting(doc_dict)
self.check_teamware_status(doc_dict, self.annotator_ids)
self.check_teamware_status(doc_dict, self.anon_annotator_names)

def test_export_gate(self):

for document in self.project.documents.all():
# All top-level fields apart from name, text, features and annotation_sets should be
# nested inside the features field
doc_dict = document.get_doc_annotation_dict("gate")
print(doc_dict)

self.assertTrue("text" in doc_dict)
self.assertTrue("features" in doc_dict)
self.assertFalse("annotations" in doc_dict)
self.assertEqual(doc_dict["offset_type"], "x")
doc_features = doc_dict["features"]
self.assertTrue("id" in doc_features)
self.assertTrue("feature1" in doc_features)
self.assertTrue("feature2" in doc_features)
self.assertTrue("feature3" in doc_features)
self.assertTrue("annotations" in doc_features)
self.assertFalse("features" in doc_features, "Double nesting of features field")
self.assertFalse("offset_type" in doc_features, "Double nesting of offset_type field")
self.assertTrue("gate_format_feature1" in doc_features)
self.assertTrue("gate_format_feature2" in doc_features)
self.assertTrue("gate_format_feature3" in doc_features)

self.check_raw_gate_annotation_formatting(doc_dict)
self.check_teamware_status(doc_features, self.annotator_ids)
self.check_teamware_status(doc_features, self.anon_annotator_names)

def test_export_gate_with_no_offset_type(self):

def check_raw_gate_annotation_formatting(self, doc_dict):
for document in self.project.documents.all():
document.data.pop("offset_type")

doc_dict = document.get_doc_annotation_dict("gate")
self.assertEqual(doc_dict["offset_type"], "p", "offset_type should default to p")


def check_raw_gate_annotation_formatting(self, doc_dict: dict):
self.assertTrue("annotation_sets" in doc_dict)
self.assertTrue(len(doc_dict["annotation_sets"]) == 3)
self.assertEqual(len(doc_dict["annotation_sets"]), 4, doc_dict)

# Test annotation formatting
for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue("name" in aset_data)
self.assertTrue("annotations" in aset_data)
self.assertEqual(len(aset_data["annotations"]), 1)
anno_dict = aset_data["annotations"][0]
self.assertTrue("type" in anno_dict)
self.assertTrue("start" in anno_dict)
self.assertTrue("end" in anno_dict)
self.assertTrue("id" in anno_dict)
self.assertTrue("features" in anno_dict)
self.assertTrue("label" in anno_dict["features"])
label_dict = anno_dict["features"]["label"]
self.assertTrue("text1" in label_dict)
self.assertTrue("checkbox1" in label_dict)
if aset_key != "existing_annotator1":
self.assertTrue("name" in aset_data)
self.assertTrue("annotations" in aset_data)
self.assertEqual(len(aset_data["annotations"]), 1)
anno_dict = aset_data["annotations"][0]
self.assertTrue("type" in anno_dict)
self.assertTrue("start" in anno_dict)
self.assertTrue("end" in anno_dict)
self.assertTrue("id" in anno_dict)
self.assertTrue("features" in anno_dict)
features_dict = anno_dict["features"]
self.assertTrue("text1" in features_dict)
self.assertTrue("checkbox1" in features_dict)
else:
# Check that existing annotation from document upload is carried over
self.assertEqual(aset_data["annotations"][0]["features"]["sentiment"], "positive")




def check_teamware_status(self, containing_dict, expected_value):
self.assertTrue("teamware_status" in containing_dict)
Expand All @@ -1219,31 +1304,44 @@ def test_export_csv(self):
self.assertTrue("feature2" in doc_dict)
self.assertTrue("feature3" in doc_dict)
self.assertTrue("annotations" in doc_dict)
self.assertTrue(len(doc_dict["annotations"]) == 3)
self.assertEqual(len(doc_dict["annotations"]), 4, doc_dict)
anno_set_dict = doc_dict["annotations"]
for set_key in anno_set_dict:
self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
if set_key != "existing_annotator1":
self.assertTrue(isinstance(anno_set_dict[set_key]["text1"], str))
self.assertTrue(isinstance(anno_set_dict[set_key]["checkbox1"], str))
else:
self.assertEqual(anno_set_dict[set_key]["sentiment"], "positive")

self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.annotator_ids))
self.check_teamware_status(doc_dict, ",".join(str(i) for i in self.anon_annotator_names))

def test_export_raw_anonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("raw", anonymize=True)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), int))
self.assertFalse(aset_key.startswith(self.unanonymized_prefix))
self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix))

self.check_teamware_status(doc_dict, self.annotator_ids)
self.check_teamware_status(doc_dict, self.anon_annotator_names)

def test_export_raw_deanonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("raw", anonymize=False)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), str))
self.assertTrue(aset_key.startswith(self.unanonymized_prefix))
self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix))

# for non-anonymized export the rejected/aborted/timed_out status
# uses names rather than ID numbers
Expand All @@ -1252,20 +1350,30 @@ def test_export_raw_deanonymized(self):
def test_export_gate_anonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("gate", anonymize=True)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), int))
self.assertFalse(aset_key.startswith(self.unanonymized_prefix))
self.assertFalse(aset_data.get("name", None).startswith(self.unanonymized_prefix))

self.check_teamware_status(doc_dict["features"], self.annotator_ids)
self.check_teamware_status(doc_dict["features"], self.anon_annotator_names)

def test_export_gate_deanonymized(self):

for document in self.project.documents.all():
# Mask any existing annotations that came with the document upload
document.data.pop("annotation_sets")
document.save()

doc_dict = document.get_doc_annotation_dict("gate", anonymize=False)

for aset_key, aset_data in doc_dict["annotation_sets"].items():
self.assertTrue(isinstance(aset_data.get("name", None), str))
self.assertTrue(aset_key.startswith(self.unanonymized_prefix))
self.assertTrue(aset_data.get("name", None).startswith(self.unanonymized_prefix))

# for non-anonymized export the rejected/aborted/timed_out status
# uses names rather than ID numbers
Expand Down
Loading
Loading