shapiromatron · rabstejnek · Feb 4, 2025 · Feb 6, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/hawc/apps/lit/api.py b/hawc/apps/lit/api.py
@@ -18,7 +18,7 @@
 from ..assessment.constants import AssessmentViewSetPermissions
 from ..assessment.models import Assessment
 from ..common.api import OncePerMinuteThrottle, PaginationWithCount
-from ..common.helper import FlatExport, cacheable
+from ..common.helper import FlatExport, cacheable, tryParseInt
 from ..common.renderers import PandasRenderers
 from ..common.serializers import UnusedSerializer
 from ..common.views import create_object_log
@@ -426,3 +426,28 @@ def id_search(self, request, db_id: str, id: str):
             df=qs.global_df(),
             filename=f"global-reference-data-{id}",
         )
+
+
+
+
+class DuplicateViewSet(
+    BaseAssessmentViewSet,
+):
+    model = models.DuplicateCandidates
+    http_method_names = ["post"]
+
+    @action(
+        detail=True, methods=("post",), action_perms=AssessmentViewSetPermissions.CAN_EDIT_OBJECT
+    )
+    def resolve_duplicate(self, request, pk):
+        instance = self.get_object()
+        assessment = instance.assessment
+        if not assessment.user_can_edit_object(self.request.user):
+            raise PermissionDenied()
+        resolution = request.POST.get("resolution")
+        notes = request.POST.get("notes","")
+        if resolution == "none":
+            instance.resolve(resolution=constants.DuplicateResolution.FALSE_POSITIVE,notes=notes)
+        if (resolution:=tryParseInt(resolution)) is not None:
+            instance.resolve(resolution=constants.DuplicateResolution.RESOLVED,primary=resolution,notes=notes)
+        return Response({"status": "ok"})
diff --git a/hawc/apps/lit/constants.py b/hawc/apps/lit/constants.py
@@ -28,6 +28,10 @@ class SearchType(models.TextChoices):
     SEARCH = "s", "Search"
     IMPORT = "i", "Import"
 
+class DuplicateResolution(models.IntegerChoices):
+    """Resolution state of a group of duplicate candidates."""
+
+    UNRESOLVED = 0, "Unresolved"
+    RESOLVED = 1, "Resolved" # TODO: change to "primary identified"
+    FALSE_POSITIVE = 2, "False positive"
 
 # generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/
 DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$")

diff --git a/hawc/apps/lit/migrations/0025_dedupesettings_duplicatecandidates.py b/hawc/apps/lit/migrations/0025_dedupesettings_duplicatecandidates.py
@@ -0,0 +1,84 @@
+# Generated by Django 5.1.4 on 2025-02-03 08:23
+
+import django.contrib.postgres.fields
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Create the ``DedupeSettings`` and ``DuplicateCandidates`` models."""
+
+    dependencies = [
+        ("assessment", "0047_alter_labeleditem_options"),
+        ("lit", "0024_workflows"),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        # DedupeSettings is created with only its auto-generated id column.
+        migrations.CreateModel(
+            name="DedupeSettings",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+            ],
+        ),
+        migrations.CreateModel(
+            name="DuplicateCandidates",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "resolution",
+                    models.PositiveSmallIntegerField(
+                        choices=[
+                            (0, "Unresolved"),
+                            (1, "Resolved"),
+                            (2, "False positive"),
+                        ],
+                        default=0,
+                    ),
+                ),
+                # NOTE(review): uniqueness applies to the whole array value; presumably
+                # element order matters for the comparison - confirm against PostgreSQL.
+                (
+                    "candidates",
+                    django.contrib.postgres.fields.ArrayField(
+                        base_field=models.IntegerField(), size=None, unique=True
+                    ),
+                ),
+                # Plain integer column, not a foreign key.
+                ("primary", models.IntegerField(null=True)),
+                ("notes", models.TextField(blank=True)),
+                ("created", models.DateTimeField(auto_now_add=True)),
+                ("last_updated", models.DateTimeField(auto_now=True)),
+                (
+                    "assessment",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="duplicates",
+                        to="assessment.assessment",
+                    ),
+                ),
+                (
+                    "resolving_user",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="resolved_duplicates",
+                        to=settings.AUTH_USER_MODEL,
+                    ),
+                ),
+            ],
+        ),
+    ]
diff --git a/hawc/apps/lit/migrations/0026_reference_hidden.py b/hawc/apps/lit/migrations/0026_reference_hidden.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1.4 on 2025-02-05 17:43
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the boolean ``hidden`` field (default ``False``) to ``Reference``."""
+
+    dependencies = [
+        ("lit", "0025_dedupesettings_duplicatecandidates"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="reference",
+            name="hidden",
+            field=models.BooleanField(default=False),
+        ),
+    ]
diff --git a/hawc/apps/lit/models.py b/hawc/apps/lit/models.py
@@ -6,6 +6,7 @@
 from math import ceil
 from typing import Self
 from urllib import parse
+import random
 
 from celery import chain
 from celery.result import ResultBase
@@ -849,6 +850,7 @@ class Reference(models.Model):
         null=True,
         help_text="Used internally for determining when reference was " "originally added",
     )
+    hidden = models.BooleanField(default=False)
 
     BREADCRUMB_PARENT = "assessment"
 
@@ -1458,6 +1460,101 @@ def get_description(self) -> str:
         )
 
 
+# add parameters/select on literatureassessment model
+class DedupeSettings(models.Model):
+    """Placeholder model for deduplication settings.
+
+    No database fields are declared yet: ``assessment`` and ``parameters`` are bare
+    annotations only, and ``build_deduper`` currently returns ``None``.
+    """
+    # deduper for use in an assessment
+    # for first pass maybe we just have a global deduper, or static choices, so that we don't have to build this
+    assessment:"Assessment"
+    parameters:dict # list of parameters for deduplication? ie schema of dedupe modules to use?
+
+    def build_deduper(self):
+        """Not implemented yet; intended to build a deduper from ``self.parameters``."""
+        # return deduper instance using self.parameters
+        return
+
+
+# SOFT DELETES
+
+class SortedArrayField(ArrayField):
+    """Placeholder ``ArrayField`` subclass; it adds no behavior yet."""
+    pass
+
+class DuplicateCandidates(models.Model):
+    assessment = models.ForeignKey(
+        "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates"
+    )
+    resolution = models.PositiveSmallIntegerField(
+        choices=constants.DuplicateResolution,
+        default=constants.DuplicateResolution.UNRESOLVED
+    )
+    resolving_user = models.ForeignKey(HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates")
+    candidates = ArrayField(models.IntegerField(),unique=True)
+    primary = models.IntegerField(null=True)
+    notes = models.TextField(blank=True)
+    created = models.DateTimeField(auto_now_add=True)
+    last_updated = models.DateTimeField(auto_now=True)
+
+    def get_assessment(self):
+        return self.assessment
+
+    @classmethod
+    def foobar(cls,assessment):
+        references = assessment.references.values("pk","title")
+        candidate_groups = cls.random_execute(references)
+        cls.objects.bulk_create([cls(assessment=assessment,candidates=[ref["pk"] for ref in group]) for group in candidate_groups])
+
+    @classmethod
+    def random_execute(cls,references)->list[list[dict]]:
+        num_candidates = 2
+        if len(references)<num_candidates:
+            return []
+        num_groups = min(3,len(references)/num_candidates)
+        return [random.choices(references,k=num_candidates) for i in range(num_groups)]
+
+    def generate_unique_identifier(self):
+        return sorted(self.candidates)
+
+    def _update_references(self):
+        # TODO also make primary not hidden? may be unnecessary
+        duplicate_ids = set(self.candidates)-{self.primary}
+        self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True)
+
+    def resolve(self,resolution:constants.DuplicateResolution,primary:int=None,notes:str=""):
+        if resolution == constants.DuplicateResolution.UNRESOLVED:
+            raise ValueError("Resolution must not be unresolved.")
+        if resolution == constants.DuplicateResolution.RESOLVED:
+            if primary is None:
+                raise ValueError("Primary must not be None if duplicate identified.")
+            if primary not in self.candidates:
+                raise ValueError("Primary must be a candidate.")
+            self.primary = primary
+            #self._update_references()
+        self.resolution = resolution
+        self.notes = notes
+        self.save()
+
+# where to put execute method? literatureassessment, manager for dupes model
+
+
+# DuplicateCandidateGroup
+
+"""
+WORKFLOW
+
+User defines deduper for use in assessment
+User executes a session that uses a defined deduper
+Session stores list of identified candidate duplicate groups
+User resolves duplicates in a session; if group status != unresolved, it shows up on this page
+Perhaps a separate session page of resolved groups? i.e. an "in progress" list view and a "done" list view
+Multiple resolutions at once? Or more like the screen page in LLR where it's do one, click for next (look at conflict resolution)
+Should this workflow do anything proactive? i.e. let's say a candidate group is identified as a false positive, is it a big deal if it shows up again if a user executes another session w/ same settings? (yes)
+Single user right? Not like conflict resolution? THIS IS CORRECT
+Do we want this workflow to also happen on import? That would look slightly different
+    Though maybe we could just have it happen automatically AFTER import, that way it would use the same workflow
+    If used on import, do we add "choose a deduper" option to created search? or maybe "default" attribute to deduper, whichever one is "default" is used?
+    Each assessment has undeletable "default" deduper, maybe add noop setting choice for deduper for people who don't want it running on imports?
+"""
+
+
+
 reversion.register(LiteratureAssessment)
 reversion.register(Search)
 reversion.register(ReferenceFilterTag)

diff --git a/hawc/apps/lit/templates/lit/_duplicate_candidates.html b/hawc/apps/lit/templates/lit/_duplicate_candidates.html
@@ -0,0 +1,18 @@
+<div hx-target="this" hx-swap="delete swap:1s">
+    {# Form for one duplicate-candidate group: choose "No duplicates" or the candidate to keep as primary. #}
+    <form hx-post="{% url 'lit:api:duplicate-resolve-duplicate' object.pk %}">
+        <div>
+            {# `required` on the radio group blocks submitting with nothing selected. #}
+            <input type="radio" id="none-{{object.pk}}" name="resolution" value="none" required />
+            <label for="none-{{object.pk}}">No duplicates</label>
+        </div>
+        {% for candidate in object.candidates %}
+        <div>
+            <input type="radio" id="primary-{{object.pk}}-{{candidate}}" name="resolution" value="{{candidate}}" required />
+            <label for="primary-{{object.pk}}-{{candidate}}">{{candidate}}</label>
+        </div>
+        {% endfor %}
+        <textarea name="notes" placeholder="Notes"></textarea>
+        <button type="submit">Resolve</button>
+    </form>
+</div>
diff --git a/hawc/apps/lit/templates/lit/duplicate_candidates.html b/hawc/apps/lit/templates/lit/duplicate_candidates.html
@@ -0,0 +1,17 @@
+{% extends 'assessment-rooted.html' %}
+
+{% load bs4 %}
+{% block content %}
+  {# Lists duplicate-candidate groups, each rendered as a resolution form by the included partial. #}
+  <div class="d-flex">
+    <h2>Duplicate candidates</h2>
+    {% actions %}
+      <a class="dropdown-item" href="{% url 'lit:duplicate-task' assessment.pk %}">Run deduplication</a>
+    {% endactions %}
+  </div>
+  {# The partial renders a <div> per group, so the container is a <div> rather than a <ul>. #}
+  <div class="list-group list-group-flush my-3">
+    {% for object in object_list %}
+        {% include 'lit/_duplicate_candidates.html' %}
+    {% endfor %}
+  </div>
+  {% include "includes/paginator.html" with plural_object_name="duplicate groups" %}
+{% endblock content %}
diff --git a/hawc/apps/lit/templates/lit/duplicate_candidates_2.html b/hawc/apps/lit/templates/lit/duplicate_candidates_2.html
@@ -0,0 +1,28 @@
+{% extends 'assessment-rooted.html' %}
+
+{% load bs4 %}
+{% block content %}
+  {# Read-only list of duplicate-candidate groups with their resolution, candidates, primary and notes. #}
+  <div class="d-flex">
+    <h2>Resolved duplicates</h2>
+  </div>
+  <ul class="list-group list-group-flush my-3">
+    {% for object in object_list %}
+      {# Each group is one list item so the <ul> only has <li> children. #}
+      <li class="list-group-item">
+        <div>
+          Group {{object.pk}}
+        </div>
+        <div>
+          {# Show the choice label instead of the stored integer. #}
+          {{object.get_resolution_display}}
+        </div>
+        <div>
+          {{object.candidates}}
+        </div>
+        <div>
+          {{object.primary}}
+        </div>
+        <div>
+          {{object.notes}}
+        </div>
+      </li>
+    {% endfor %}
+  </ul>
+  {% include "includes/paginator.html" with plural_object_name="duplicate groups" %}
+{% endblock content %}
diff --git a/hawc/apps/lit/urls.py b/hawc/apps/lit/urls.py
@@ -8,6 +8,7 @@
 router.register(r"reference", api.ReferenceViewSet, basename="reference")
 router.register(r"search", api.SearchViewSet, basename="search")
 router.register(r"tags", api.ReferenceFilterTagViewSet, basename="tags")
+router.register(r"duplicate", api.DuplicateViewSet, basename="duplicate")
 
 app_name = "lit"
 urlpatterns = [
@@ -163,4 +164,19 @@
         name="workflow-htmx",
     ),
     path("api/", include((router.urls, "api"))),
+    path(
+        "assessment/<int:pk>/duplicate-candidates/",
+        views.DuplicateCandidatesList.as_view(),
+        name="duplicate-candidates",
+    ),
+    path(
+        "assessment/<int:pk>/duplicate-candidates2/",
+        views.DuplicateCandidatesList2.as_view(),
+        name="duplicate-candidates2",
+    ),
+    path(
+        "assessment/<int:pk>/duplicate-task/",
+        views.DuplicateTask.as_view(),
+        name="duplicate-task",
+    ),
 ]