diff --git a/hawc/apps/lit/api.py b/hawc/apps/lit/api.py
index c407254202..18ef4ed3ed 100644
--- a/hawc/apps/lit/api.py
+++ b/hawc/apps/lit/api.py
@@ -18,7 +18,7 @@
 from ..assessment.constants import AssessmentViewSetPermissions
 from ..assessment.models import Assessment
 from ..common.api import OncePerMinuteThrottle, PaginationWithCount
-from ..common.helper import FlatExport, cacheable
+from ..common.helper import FlatExport, cacheable, tryParseInt
 from ..common.renderers import PandasRenderers
 from ..common.serializers import UnusedSerializer
 from ..common.views import create_object_log
@@ -426,3 +426,35 @@ def id_search(self, request, db_id: str, id: str):
             df=qs.global_df(),
             filename=f"global-reference-data-{id}",
         )
+
+
+class DuplicateViewSet(
+    BaseAssessmentViewSet,
+):
+    model = models.DuplicateCandidateGroup
+    http_method_names = ["post"]
+
+    @action(
+        detail=True, methods=("post",), action_perms=AssessmentViewSetPermissions.CAN_EDIT_OBJECT
+    )
+    def resolve_duplicate(self, request, pk):
+        instance = self.get_object()
+        assessment = instance.assessment
+        if not assessment.user_can_edit_object(request.user):
+            raise PermissionDenied()
+        resolution = request.POST.get("resolution")
+        notes = request.POST.get("notes", "")
+        if resolution == "none":
+            instance.resolve(
+                resolution=constants.DuplicateResolution.FALSE_POSITIVE,
+                resolving_user=request.user,
+                notes=notes,
+            )
+        if (resolution := tryParseInt(resolution)) is not None:
+            instance.resolve(
+                resolution=constants.DuplicateResolution.RESOLVED,
+                resolving_user=request.user,
+                primary_id=resolution,
+                notes=notes,
+            )
+        return Response({"status": "ok"})
diff --git a/hawc/apps/lit/constants.py b/hawc/apps/lit/constants.py
index 1c51472140..9c3c1321ac 100644
--- a/hawc/apps/lit/constants.py
+++ b/hawc/apps/lit/constants.py
@@ -29,6 +29,12 @@ class SearchType(models.TextChoices):
     IMPORT = "i", "Import"
 
 
+class DuplicateResolution(models.IntegerChoices):
+    UNRESOLVED = 0, "Unresolved"
+    RESOLVED = 1, "Primary identified"  # TODO: change to "primary identified"
+    FALSE_POSITIVE = 2, "False positive"
+
+
 # generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/
 DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$")
 DOI_EXTRACT = re.compile(r"10\.\d{4,9}/[^\s]+")
diff --git a/hawc/apps/lit/filterset.py b/hawc/apps/lit/filterset.py
index eca6cba7f6..ab6d48c309 100644
--- a/hawc/apps/lit/filterset.py
+++ b/hawc/apps/lit/filterset.py
@@ -8,10 +8,11 @@
     ArrowOrderingFilter,
     BaseFilterSet,
     ExpandableFilterForm,
+    InlineFilterForm,
     PaginationFilter,
     filter_noop,
 )
-from . import models
+from . import constants, models
 
 
 class ReferenceFilterSet(BaseFilterSet):
@@ -345,3 +346,34 @@ class Meta:
     def filter_tag(self, queryset, name, value):
         include_descendants = self.data.get("include_descendants", False)
         return queryset.with_tag(tag=value, descendants=include_descendants)
+
+
+class DuplicateCandidateGroupFilterSet(BaseFilterSet):
+    candidate_search = df.CharFilter(
+        method="filter_search",
+        label="Title/Author/Year",
+        help_text="Filter citations (authors, year, title)",
+    )
+    resolution = df.ChoiceFilter(
+        empty_label="- Resolution -",
+        choices=constants.DuplicateResolution.choices,
+    )
+
+    class Meta:
+        model = models.DuplicateCandidateGroup
+        form = InlineFilterForm
+        fields = [
+            "candidate_search",
+            "resolution",
+        ]
+        main_field = "candidate_search"
+        appended_fields = [
+            "resolution",
+        ]
+
+    def filter_queryset(self, queryset):
+        queryset = super().filter_queryset(queryset)
+        return queryset.filter(assessment=self.assessment)
+
+    def filter_search(self, queryset, name, value):
+        return queryset.filter(candidates__in=models.Reference.objects.full_text_search(value))
diff --git a/hawc/apps/lit/migrations/0025_reference_hidden_duplicatecandidategroup.py b/hawc/apps/lit/migrations/0025_reference_hidden_duplicatecandidategroup.py
new file mode 100644
index 0000000000..136d812a05
--- /dev/null
+++ b/hawc/apps/lit/migrations/0025_reference_hidden_duplicatecandidategroup.py
@@ -0,0 +1,79 @@
+# Generated by Django 5.1.4 on 2025-02-13 12:51
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("assessment", "0047_alter_labeleditem_options"),
+        ("lit", "0024_workflows"),
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="reference",
+            name="hidden",
+            field=models.BooleanField(default=False),
+        ),
+        migrations.CreateModel(
+            name="DuplicateCandidateGroup",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "resolution",
+                    models.PositiveSmallIntegerField(
+                        choices=[
+                            (0, "Unresolved"),
+                            (1, "Primary identified"),
+                            (2, "False positive"),
+                        ],
+                        default=0,
+                    ),
+                ),
+                ("notes", models.TextField(blank=True)),
+                ("created", models.DateTimeField(auto_now_add=True)),
+                ("last_updated", models.DateTimeField(auto_now=True)),
+                (
+                    "assessment",
+                    models.ForeignKey(
+                        on_delete=django.db.models.deletion.CASCADE,
+                        related_name="duplicates",
+                        to="assessment.assessment",
+                    ),
+                ),
+                (
+                    "candidates",
+                    models.ManyToManyField(related_name="duplicate_candidates", to="lit.reference"),
+                ),
+                (
+                    "primary",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="duplicate_primaries",
+                        to="lit.reference",
+                    ),
+                ),
+                (
+                    "resolving_user",
+                    models.ForeignKey(
+                        null=True,
+                        on_delete=django.db.models.deletion.SET_NULL,
+                        related_name="resolved_duplicates",
+                        to=settings.AUTH_USER_MODEL,
+                    ),
+                ),
+            ],
+        ),
+    ]
diff --git a/hawc/apps/lit/models.py b/hawc/apps/lit/models.py
index ed396ec210..d21c36f67f 100644
--- a/hawc/apps/lit/models.py
+++ b/hawc/apps/lit/models.py
@@ -1,6 +1,7 @@
 import html
 import json
 import logging
+import random
 import re
 from copy import copy
 from math import ceil
@@ -849,6 +850,7 @@ class Reference(models.Model):
         null=True,
         help_text="Used internally for determining when reference was " "originally added",
     )
+    hidden = models.BooleanField(default=False)
 
     BREADCRUMB_PARENT = "assessment"
 
@@ -1458,6 +1460,82 @@ def get_description(self) -> str:
         )
 
 
+class DuplicateCandidateGroup(models.Model):
+    assessment = models.ForeignKey(
+        "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates"
+    )
+    resolution = models.PositiveSmallIntegerField(
+        choices=constants.DuplicateResolution, default=constants.DuplicateResolution.UNRESOLVED
+    )
+    resolving_user = models.ForeignKey(
+        HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates"
+    )
+    candidates = models.ManyToManyField(Reference, related_name="duplicate_candidates")
+    primary = models.ForeignKey(
+        Reference, null=True, on_delete=models.SET_NULL, related_name="duplicate_primaries"
+    )
+    notes = models.TextField(blank=True)
+    created = models.DateTimeField(auto_now_add=True)
+    last_updated = models.DateTimeField(auto_now=True)
+
+    @property
+    def secondaries(self):
+        return self.candidates.exclude(pk=self.primary_id)
+
+    def get_assessment(self):
+        return self.assessment
+
+    @classmethod
+    def validate_candidates(cls, candidates: list[int]):
+        qs = cls.objects.annotate(candidates_count=models.Count("candidates")).filter(
+            candidates_count=len(candidates)
+        )
+        for candidate in candidates:
+            qs = qs.filter(candidates=candidate)
+        return not qs.exists()
+
+    @classmethod
+    def find_duplicate_candidate_groups(cls, references) -> list[list[dict]]:
+        num_candidates = 2
+        if len(references) < num_candidates:
+            return []
+        num_groups = min(3, len(references) // num_candidates)
+        return [random.choices(references, k=num_candidates) for i in range(num_groups)]  # noqa: S311
+
+    @classmethod
+    def create_duplicate_candidate_groups(cls, assessment_id: int):
+        tasks.create_duplicate_candidate_groups.delay(assessment_id)
+
+    def _update_references(self):
+        duplicate_ids = self.secondaries.values_list("pk", flat=True)
+        self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True)
+        # if a "hidden" reference was selected as primary, unhide it
+        if self.primary.hidden:
+            self.primary.hidden = False
+            self.primary.save()
+
+    def resolve(
+        self,
+        resolution: constants.DuplicateResolution,
+        resolving_user: HAWCUser,
+        primary_id: int | None = None,
+        notes: str = "",
+    ):
+        if resolution == constants.DuplicateResolution.UNRESOLVED:
+            raise ValueError("Resolution must not be unresolved.")
+        if resolution == constants.DuplicateResolution.RESOLVED:
+            if primary_id is None:
+                raise ValueError("Primary must not be None if duplicate identified.")
+            if primary_id not in self.candidates.values_list("pk", flat=True):
+                raise ValueError("Primary must be a candidate.")
+            self.primary_id = primary_id
+            self._update_references()
+        self.resolution = resolution
+        self.resolving_user = resolving_user
+        self.notes = notes
+        self.save()
+
+
 reversion.register(LiteratureAssessment)
 reversion.register(Search)
 reversion.register(ReferenceFilterTag)
diff --git a/hawc/apps/lit/tasks.py b/hawc/apps/lit/tasks.py
index 3304a67cac..37535e0d7b 100644
--- a/hawc/apps/lit/tasks.py
+++ b/hawc/apps/lit/tasks.py
@@ -142,3 +142,29 @@ def fix_pubmed_without_content():
     logger.info(f"Attempting to update pubmed content for {num_ids} identifiers")
     if num_ids > 0:
         Identifiers.update_pubmed_content(ids)
+
+
+@shared_task
+def create_duplicate_candidate_groups(assessment_id: int):
+    DuplicateCandidateGroup = apps.get_model("lit", "DuplicateCandidateGroup")
+    assessment = apps.get_model("assessment", "Assessment").objects.get(pk=assessment_id)
+    references = assessment.references.values("pk", "title")
+    candidate_groups = DuplicateCandidateGroup.find_duplicate_candidate_groups(references)
+    candidate_groups = [
+        group
+        for group in candidate_groups
+        if DuplicateCandidateGroup.validate_candidates([ref["pk"] for ref in group])
+    ]
+    with transaction.atomic():
+        objs = DuplicateCandidateGroup.objects.bulk_create(
+            [DuplicateCandidateGroup(assessment=assessment) for group in candidate_groups]
+        )
+        DuplicateCandidateGroup.candidates.through.objects.bulk_create(
+            [
+                DuplicateCandidateGroup.candidates.through(
+                    duplicatecandidategroup_id=obj.pk, reference_id=ref["pk"]
+                )
+                for obj, group in zip(objs, candidate_groups, strict=False)
+                for ref in group
+            ]
+        )
diff --git a/hawc/apps/lit/templates/lit/duplicate_resolution.html b/hawc/apps/lit/templates/lit/duplicate_resolution.html
new file mode 100644
index 0000000000..e2b6e0c7d7
--- /dev/null
+++ b/hawc/apps/lit/templates/lit/duplicate_resolution.html
@@ -0,0 +1,44 @@
+{% extends 'assessment-rooted.html' %}
+
+{% load bs4 %}
+{% block content %}
+