shapiromatron · rabstejnek · Feb 4, 2025 · Feb 6, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/hawc/apps/lit/api.py b/hawc/apps/lit/api.py
@@ -18,7 +18,7 @@
 from ..assessment.constants import AssessmentViewSetPermissions
 from ..assessment.models import Assessment
 from ..common.api import OncePerMinuteThrottle, PaginationWithCount
-from ..common.helper import FlatExport, cacheable
+from ..common.helper import FlatExport, cacheable, tryParseInt
 from ..common.renderers import PandasRenderers
 from ..common.serializers import UnusedSerializer
 from ..common.views import create_object_log
@@ -426,3 +426,35 @@ def id_search(self, request, db_id: str, id: str):
             df=qs.global_df(),
             filename=f"global-reference-data-{id}",
         )
+
+
+class DuplicateViewSet(
+    BaseAssessmentViewSet,
+):
+    """POST-only API for resolving duplicate candidate groups."""
+
+    model = models.DuplicateCandidateGroup
+    http_method_names = ["post"]
+
+    @action(
+        detail=True, methods=("post",), action_perms=AssessmentViewSetPermissions.CAN_EDIT_OBJECT
+    )
+    def resolve_duplicate(self, request, pk):
+        """Resolve a single duplicate candidate group.
+
+        Reads two form fields from ``request.POST``:
+
+        - ``resolution``: the literal ``"none"`` marks the group as a false
+          positive; an integer is treated as the id of the candidate reference
+          to keep as primary.
+        - ``notes``: optional free text stored with the resolution.
+
+        Returns:
+            ``{"status": "ok"}`` on success. A 400 response with an ``error``
+            message when ``resolution`` is missing or unparseable, or when the
+            model rejects it (for example the primary is not a candidate).
+
+        Raises:
+            PermissionDenied: if ``assessment.user_can_edit_object`` rejects the
+                requesting user.
+        """
+        instance = self.get_object()
+        assessment = instance.assessment
+        if not assessment.user_can_edit_object(request.user):
+            raise PermissionDenied()
+        resolution = request.POST.get("resolution")
+        notes = request.POST.get("notes", "")
+        if resolution == "none":
+            resolve_kwargs = {"resolution": constants.DuplicateResolution.FALSE_POSITIVE}
+        elif (primary_id := tryParseInt(resolution)) is not None:
+            resolve_kwargs = {
+                "resolution": constants.DuplicateResolution.RESOLVED,
+                "primary_id": primary_id,
+            }
+        else:
+            # never report success when nothing was resolved
+            return Response(
+                {
+                    "status": "error",
+                    "error": "resolution must be 'none' or a candidate reference id.",
+                },
+                status=400,
+            )
+        try:
+            instance.resolve(resolving_user=request.user, notes=notes, **resolve_kwargs)
+        except ValueError as err:
+            # the model raises ValueError for invalid input; report it as a
+            # client error instead of letting it surface as a server error
+            return Response({"status": "error", "error": str(err)}, status=400)
+        return Response({"status": "ok"})
diff --git a/hawc/apps/lit/constants.py b/hawc/apps/lit/constants.py
@@ -29,6 +29,12 @@ class SearchType(models.TextChoices):
     IMPORT = "i", "Import"
 
 
+class DuplicateResolution(models.IntegerChoices):
+    # Resolution state stored on a duplicate candidate group.
+    # UNRESOLVED is the model field default; the model's resolve() refuses it as a target.
+    UNRESOLVED = 0, "Unresolved"
+    # RESOLVED requires a primary reference chosen from the group's candidates.
+    RESOLVED = 1, "Resolved"  # TODO: change to "primary identified"
+    # FALSE_POSITIVE is what the API records when the submitted resolution is "none".
+    FALSE_POSITIVE = 2, "False positive"
+
+
 # generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/
 DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$")
 DOI_EXTRACT = re.compile(r"10\.\d{4,9}/[^\s]+")

diff --git a/hawc/apps/lit/migrations/0025_dedupesettings_duplicatecandidates.py b/hawc/apps/lit/migrations/0025_dedupesettings_duplicatecandidates.py
@@ -0,0 +1,83 @@
+# Generated by Django 5.1.4 on 2025-02-03 08:23
+
+import django.contrib.postgres.fields
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # First iteration of the duplicate-tracking schema: creates DedupeSettings
+    # (primary key only) and DuplicateCandidates. Both models are deleted again
+    # by migration 0027, which replaces them with DuplicateCandidateGroup.
+    dependencies = [
+        ("assessment", "0047_alter_labeleditem_options"),
+        ("lit", "0024_workflows"),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="DedupeSettings",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+            ],
+        ),
+        migrations.CreateModel(
+            name="DuplicateCandidates",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "resolution",
+                    models.PositiveSmallIntegerField(
+                        choices=[
+                            (0, "Unresolved"),
+                            (1, "Resolved"),
+                            (2, "False positive"),
+                        ],
+                        default=0,
+                    ),
+                ),
+                # candidates are stored as a unique integer array rather than a
+                # relation (presumably reference ids -- confirm against the model)
+                (
+                    "candidates",
+                    django.contrib.postgres.fields.ArrayField(
+                        base_field=models.IntegerField(), size=None, unique=True
+                    ),
+                ),
+                # plain nullable integer, not a foreign key
+                ("primary", models.IntegerField(null=True)),
+                ("notes", models.TextField(blank=True)),
+                ("created", models.DateTimeField(auto_now_add=True)),
+                ("last_updated", models.DateTimeField(auto_now=True)),
+                (
+                    "assessment",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="duplicates",
+                        to="assessment.assessment",
+                    ),
+                ),
+                (
+                    "resolving_user",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="resolved_duplicates",
+                        to=settings.AUTH_USER_MODEL,
+                    ),
+                ),
+            ],
+        ),
+    ]
diff --git a/hawc/apps/lit/migrations/0026_reference_hidden.py b/hawc/apps/lit/migrations/0026_reference_hidden.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.1.4 on 2025-02-05 17:43
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Adds the boolean `hidden` column to Reference; existing rows get False.
+    dependencies = [
+        ("lit", "0025_dedupesettings_duplicatecandidates"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="reference",
+            name="hidden",
+            field=models.BooleanField(default=False),
+        ),
+    ]
diff --git a/hawc/apps/lit/migrations/0027_duplicatecandidategroup_delete_dedupesettings_and_more.py b/hawc/apps/lit/migrations/0027_duplicatecandidategroup_delete_dedupesettings_and_more.py
@@ -0,0 +1,80 @@
+# Generated by Django 5.1.4 on 2025-02-10 12:18
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Replaces the models from migration 0025: creates DuplicateCandidateGroup,
+    # where candidates is a many-to-many to Reference and primary is a nullable
+    # foreign key, then deletes DedupeSettings and DuplicateCandidates.
+    # The operations contain no data migration, so rows in the deleted models
+    # are not carried over.
+    dependencies = [
+        ("assessment", "0047_alter_labeleditem_options"),
+        ("lit", "0026_reference_hidden"),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name="DuplicateCandidateGroup",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "resolution",
+                    models.PositiveSmallIntegerField(
+                        choices=[
+                            (0, "Unresolved"),
+                            (1, "Resolved"),
+                            (2, "False positive"),
+                        ],
+                        default=0,
+                    ),
+                ),
+                ("notes", models.TextField(blank=True)),
+                ("created", models.DateTimeField(auto_now_add=True)),
+                ("last_updated", models.DateTimeField(auto_now=True)),
+                (
+                    "assessment",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="duplicates",
+                        to="assessment.assessment",
+                    ),
+                ),
+                (
+                    "candidates",
+                    models.ManyToManyField(related_name="duplicate_candidates", to="lit.reference"),
+                ),
+                # SET_NULL: deleting the primary reference keeps the group row
+                (
+                    "primary",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="duplicate_primaries",
+                        to="lit.reference",
+                    ),
+                ),
+                (
+                    "resolving_user",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="resolved_duplicates",
+                        to=settings.AUTH_USER_MODEL,
+                    ),
+                ),
+            ],
+        ),
+        migrations.DeleteModel(
+            name="DedupeSettings",
+        ),
+        migrations.DeleteModel(
+            name="DuplicateCandidates",
+        ),
+    ]
diff --git a/hawc/apps/lit/models.py b/hawc/apps/lit/models.py
@@ -1,6 +1,7 @@
 import html
 import json
 import logging
+import random
 import re
 from copy import copy
 from math import ceil
@@ -849,6 +850,7 @@ class Reference(models.Model):
         null=True,
         help_text="Used internally for determining when reference was " "originally added",
     )
+    hidden = models.BooleanField(default=False)
 
     BREADCRUMB_PARENT = "assessment"
 
@@ -1458,6 +1460,82 @@ def get_description(self) -> str:
         )
 
 
+class DuplicateCandidateGroup(models.Model):
+    """A group of references in one assessment flagged as possible duplicates.
+
+    A group is created unresolved. Resolving it either records one candidate as
+    the primary reference (the other candidates are marked hidden) or marks the
+    whole group as a false positive.
+    """
+
+    assessment = models.ForeignKey(
+        "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates"
+    )
+    resolution = models.PositiveSmallIntegerField(
+        choices=constants.DuplicateResolution, default=constants.DuplicateResolution.UNRESOLVED
+    )
+    resolving_user = models.ForeignKey(
+        HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates"
+    )
+    candidates = models.ManyToManyField(Reference, related_name="duplicate_candidates")
+    primary = models.ForeignKey(
+        Reference, null=True, on_delete=models.SET_NULL, related_name="duplicate_primaries"
+    )
+    notes = models.TextField(blank=True)
+    created = models.DateTimeField(auto_now_add=True)
+    last_updated = models.DateTimeField(auto_now=True)
+
+    @property
+    def secondaries(self):
+        """Return the candidate references other than the selected primary."""
+        return self.candidates.exclude(pk=self.primary_id)
+
+    def get_assessment(self):
+        """Return the assessment this group belongs to."""
+        return self.assessment
+
+    @classmethod
+    def validate_candidates(cls, candidates: list[int]) -> bool:
+        """Return True if no existing group has exactly this set of candidates.
+
+        Args:
+            candidates: reference ids proposed for a new group.
+        """
+        # restrict to groups of the same size, then require every id to be a member
+        qs = cls.objects.annotate(candidates_count=models.Count("candidates")).filter(
+            candidates_count=len(candidates)
+        )
+        for candidate in candidates:
+            qs = qs.filter(candidates=candidate)
+        return not qs.exists()
+
+    @classmethod
+    def find_duplicate_candidate_groups(cls, references) -> list[list[dict]]:
+        """Propose groups of candidate duplicates from ``references``.
+
+        The selection is random: up to three pairs are drawn, and no similarity
+        comparison between references is performed.
+
+        Args:
+            references: iterable of reference records (e.g. a ``values()``
+                queryset); it is materialized into a list once.
+
+        Returns:
+            A list of groups, each a list of two distinct items from
+            ``references``; empty if fewer than two references are given.
+        """
+        num_candidates = 2
+        # materialize once: random.sample needs a real sequence, and this
+        # avoids re-evaluating a lazy iterable for every pick
+        pool = list(references)
+        if len(pool) < num_candidates:
+            return []
+        # floor division -- range() rejects the float produced by "/"
+        num_groups = min(3, len(pool) // num_candidates)
+        # sample without replacement so a reference is never grouped with itself
+        return [random.sample(pool, k=num_candidates) for _ in range(num_groups)]  # noqa: S311
+
+    @classmethod
+    def create_duplicate_candidate_groups(cls, assessment_id: int):
+        """Queue the background task that creates candidate groups for an assessment."""
+        tasks.create_duplicate_candidate_groups.delay(assessment_id)
+
+    def _update_references(self):
+        """Hide the secondary references and make sure the primary is visible.
+
+        Expects ``self.primary`` to be set.
+        """
+        duplicate_ids = self.secondaries.values_list("pk", flat=True)
+        self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True)
+        # if a "hidden" reference was selected as primary, unhide it
+        if self.primary.hidden:
+            self.primary.hidden = False
+            self.primary.save()
+
+    def resolve(
+        self,
+        resolution: constants.DuplicateResolution,
+        resolving_user: HAWCUser,
+        primary_id: int | None = None,
+        notes: str = "",
+    ):
+        """Record a resolution for this group and save it.
+
+        Args:
+            resolution: the new state; must not be ``UNRESOLVED``.
+            resolving_user: user recorded as having resolved the group.
+            primary_id: id of the candidate to keep as primary; required when
+                ``resolution`` is ``RESOLVED`` and ignored otherwise.
+            notes: free text stored on the group.
+
+        Raises:
+            ValueError: if ``resolution`` is ``UNRESOLVED``, or if it is
+                ``RESOLVED`` and ``primary_id`` is missing or not a candidate.
+        """
+        # function-scope import keeps this block self-contained
+        from django.db import transaction
+
+        if resolution == constants.DuplicateResolution.UNRESOLVED:
+            raise ValueError("Resolution must not be unresolved.")
+        if resolution == constants.DuplicateResolution.RESOLVED:
+            if primary_id is None:
+                raise ValueError("Primary must not be None if duplicate identified.")
+            # membership test in the database instead of loading every candidate id
+            if not self.candidates.filter(pk=primary_id).exists():
+                raise ValueError("Primary must be a candidate.")
+        # hiding references and saving the group must succeed or fail together
+        with transaction.atomic():
+            if resolution == constants.DuplicateResolution.RESOLVED:
+                self.primary_id = primary_id
+                self._update_references()
+            self.resolution = resolution
+            self.resolving_user = resolving_user
+            self.notes = notes
+            self.save()
+
+
 reversion.register(LiteratureAssessment)
 reversion.register(Search)
 reversion.register(ReferenceFilterTag)

diff --git a/hawc/apps/lit/tasks.py b/hawc/apps/lit/tasks.py
@@ -142,3 +142,29 @@ def fix_pubmed_without_content():
     logger.info(f"Attempting to update pubmed content for {num_ids} identifiers")
     if num_ids > 0:
         Identifiers.update_pubmed_content(ids)
+
+
+@shared_task
+def create_duplicate_candidate_groups(assessment_id: int):
+    """Create duplicate candidate groups for the references of an assessment.
+
+    Asks ``DuplicateCandidateGroup.find_duplicate_candidate_groups`` for
+    proposals and keeps those that ``validate_candidates`` accepts. The groups
+    and their candidate links are then inserted in a single transaction.
+
+    Args:
+        assessment_id: primary key of the assessment to process.
+    """
+    DuplicateCandidateGroup = apps.get_model("lit", "DuplicateCandidateGroup")
+    assessment = apps.get_model("assessment", "Assessment").objects.get(pk=assessment_id)
+    references = assessment.references.values("pk", "title")
+
+    # validate_candidates only sees groups already in the database, so repeats
+    # within this batch have to be filtered here
+    seen: set[frozenset[int]] = set()
+    candidate_groups: list[list[int]] = []
+    for group in DuplicateCandidateGroup.find_duplicate_candidate_groups(references):
+        # drop repeated ids; a repeated (group, reference) link cannot be inserted twice
+        ref_ids = list(dict.fromkeys(ref["pk"] for ref in group))
+        key = frozenset(ref_ids)
+        if len(ref_ids) < 2 or key in seen:
+            continue
+        seen.add(key)
+        if DuplicateCandidateGroup.validate_candidates(ref_ids):
+            candidate_groups.append(ref_ids)
+
+    if not candidate_groups:
+        return
+
+    with transaction.atomic():
+        objs = DuplicateCandidateGroup.objects.bulk_create(
+            [DuplicateCandidateGroup(assessment=assessment) for _ in candidate_groups]
+        )
+        through = DuplicateCandidateGroup.candidates.through
+        through.objects.bulk_create(
+            [
+                through(duplicatecandidategroup_id=obj.pk, reference_id=ref_id)
+                # strict: a length mismatch would silently drop candidate links
+                for obj, ref_ids in zip(objs, candidate_groups, strict=True)
+                for ref_id in ref_ids
+            ]
+        )