diff --git a/hawc/apps/lit/api.py b/hawc/apps/lit/api.py index c407254202..18ef4ed3ed 100644 --- a/hawc/apps/lit/api.py +++ b/hawc/apps/lit/api.py @@ -18,7 +18,7 @@ from ..assessment.constants import AssessmentViewSetPermissions from ..assessment.models import Assessment from ..common.api import OncePerMinuteThrottle, PaginationWithCount -from ..common.helper import FlatExport, cacheable +from ..common.helper import FlatExport, cacheable, tryParseInt from ..common.renderers import PandasRenderers from ..common.serializers import UnusedSerializer from ..common.views import create_object_log @@ -426,3 +426,35 @@ def id_search(self, request, db_id: str, id: str): df=qs.global_df(), filename=f"global-reference-data-{id}", ) + + +class DuplicateViewSet( + BaseAssessmentViewSet, +): + model = models.DuplicateCandidateGroup + http_method_names = ["post"] + + @action( + detail=True, methods=("post",), action_perms=AssessmentViewSetPermissions.CAN_EDIT_OBJECT + ) + def resolve_duplicate(self, request, pk): + instance = self.get_object() + assessment = instance.assessment + if not assessment.user_can_edit_object(request.user): + raise PermissionDenied() + resolution = request.POST.get("resolution") + notes = request.POST.get("notes", "") + if resolution == "none": + instance.resolve( + resolution=constants.DuplicateResolution.FALSE_POSITIVE, + resolving_user=request.user, + notes=notes, + ) + if (resolution := tryParseInt(resolution)) is not None: + instance.resolve( + resolution=constants.DuplicateResolution.RESOLVED, + resolving_user=request.user, + primary_id=resolution, + notes=notes, + ) + return Response({"status": "ok"}) diff --git a/hawc/apps/lit/constants.py b/hawc/apps/lit/constants.py index 1c51472140..9c3c1321ac 100644 --- a/hawc/apps/lit/constants.py +++ b/hawc/apps/lit/constants.py @@ -29,6 +29,12 @@ class SearchType(models.TextChoices): IMPORT = "i", "Import" +class DuplicateResolution(models.IntegerChoices): + UNRESOLVED = 0, "Unresolved" + RESOLVED = 1, 
"Primary identified" # TODO: change to "primary identified" + FALSE_POSITIVE = 2, "False positive" + + # generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/ DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$") DOI_EXTRACT = re.compile(r"10\.\d{4,9}/[^\s]+") diff --git a/hawc/apps/lit/filterset.py b/hawc/apps/lit/filterset.py index eca6cba7f6..ab6d48c309 100644 --- a/hawc/apps/lit/filterset.py +++ b/hawc/apps/lit/filterset.py @@ -8,10 +8,11 @@ ArrowOrderingFilter, BaseFilterSet, ExpandableFilterForm, + InlineFilterForm, PaginationFilter, filter_noop, ) -from . import models +from . import constants, models class ReferenceFilterSet(BaseFilterSet): @@ -345,3 +346,34 @@ class Meta: def filter_tag(self, queryset, name, value): include_descendants = self.data.get("include_descendants", False) return queryset.with_tag(tag=value, descendants=include_descendants) + + +class DuplicateCandidateGroupFilterSet(BaseFilterSet): + candidate_search = df.CharFilter( + method="filter_search", + label="Title/Author/Year", + help_text="Filter citations (authors, year, title)", + ) + resolution = df.ChoiceFilter( + empty_label="- Resolution -", + choices=constants.DuplicateResolution.choices, + ) + + class Meta: + model = models.DuplicateCandidateGroup + form = InlineFilterForm + fields = [ + "candidate_search", + "resolution", + ] + main_field = "candidate_search" + appended_fields = [ + "resolution", + ] + + def filter_queryset(self, queryset): + queryset = super().filter_queryset(queryset) + return queryset.filter(assessment=self.assessment) + + def filter_search(self, queryset, name, value): + return queryset.filter(candidates__in=models.Reference.objects.full_text_search(value)).distinct() diff --git a/hawc/apps/lit/migrations/0025_reference_hidden_duplicatecandidategroup.py b/hawc/apps/lit/migrations/0025_reference_hidden_duplicatecandidategroup.py new file mode 100644 index 0000000000..136d812a05 --- /dev/null +++ 
b/hawc/apps/lit/migrations/0025_reference_hidden_duplicatecandidategroup.py @@ -0,0 +1,79 @@ +# Generated by Django 5.1.4 on 2025-02-13 12:51 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("assessment", "0047_alter_labeleditem_options"), + ("lit", "0024_workflows"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AddField( + model_name="reference", + name="hidden", + field=models.BooleanField(default=False), + ), + migrations.CreateModel( + name="DuplicateCandidateGroup", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "resolution", + models.PositiveSmallIntegerField( + choices=[ + (0, "Unresolved"), + (1, "Primary identified"), + (2, "False positive"), + ], + default=0, + ), + ), + ("notes", models.TextField(blank=True)), + ("created", models.DateTimeField(auto_now_add=True)), + ("last_updated", models.DateTimeField(auto_now=True)), + ( + "assessment", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="duplicates", + to="assessment.assessment", + ), + ), + ( + "candidates", + models.ManyToManyField(related_name="duplicate_candidates", to="lit.reference"), + ), + ( + "primary", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="duplicate_primaries", + to="lit.reference", + ), + ), + ( + "resolving_user", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="resolved_duplicates", + to=settings.AUTH_USER_MODEL, + ), + ), + ], + ), + ] diff --git a/hawc/apps/lit/models.py b/hawc/apps/lit/models.py index ed396ec210..d21c36f67f 100644 --- a/hawc/apps/lit/models.py +++ b/hawc/apps/lit/models.py @@ -1,6 +1,7 @@ import html import json import logging +import random import re from copy 
import copy from math import ceil @@ -849,6 +850,7 @@ class Reference(models.Model): null=True, help_text="Used internally for determining when reference was " "originally added", ) + hidden = models.BooleanField(default=False) BREADCRUMB_PARENT = "assessment" @@ -1458,6 +1460,82 @@ def get_description(self) -> str: ) +class DuplicateCandidateGroup(models.Model): + assessment = models.ForeignKey( + "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates" + ) + resolution = models.PositiveSmallIntegerField( + choices=constants.DuplicateResolution, default=constants.DuplicateResolution.UNRESOLVED + ) + resolving_user = models.ForeignKey( + HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates" + ) + candidates = models.ManyToManyField(Reference, related_name="duplicate_candidates") + primary = models.ForeignKey( + Reference, null=True, on_delete=models.SET_NULL, related_name="duplicate_primaries" + ) + notes = models.TextField(blank=True) + created = models.DateTimeField(auto_now_add=True) + last_updated = models.DateTimeField(auto_now=True) + + @property + def secondaries(self): + return self.candidates.exclude(pk=self.primary_id) + + def get_assessment(self): + return self.assessment + + @classmethod + def validate_candidates(cls, candidates: list[int]): + qs = cls.objects.annotate(candidates_count=models.Count("candidates")).filter( + candidates_count=len(candidates) + ) + for candidate in candidates: + qs = qs.filter(candidates=candidate) + return not qs.exists() + + @classmethod + def find_duplicate_candidate_groups(cls, references) -> list[list[dict]]: + num_candidates = 2 + if len(references) < num_candidates: + return [] + num_groups = min(3, len(references) // num_candidates) + return [random.sample(references, k=num_candidates) for i in range(num_groups)] # noqa: S311 + + @classmethod + def create_duplicate_candidate_groups(cls, assessment_id: int): + 
tasks.create_duplicate_candidate_groups.delay(assessment_id) + + def _update_references(self): + duplicate_ids = self.secondaries.values_list("pk", flat=True) + self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True) + # if a "hidden" reference was selected as primary, unhide it + if self.primary.hidden: + self.primary.hidden = False + self.primary.save() + + def resolve( + self, + resolution: constants.DuplicateResolution, + resolving_user: HAWCUser, + primary_id: int | None = None, + notes: str = "", + ): + if resolution == constants.DuplicateResolution.UNRESOLVED: + raise ValueError("Resolution must not be unresolved.") + if resolution == constants.DuplicateResolution.RESOLVED: + if primary_id is None: + raise ValueError("Primary must not be None if duplicate identified.") + if primary_id not in self.candidates.values_list("pk", flat=True): + raise ValueError("Primary must be a candidate.") + self.primary_id = primary_id + self._update_references() + self.resolution = resolution + self.resolving_user = resolving_user + self.notes = notes + self.save() + + reversion.register(LiteratureAssessment) reversion.register(Search) reversion.register(ReferenceFilterTag) diff --git a/hawc/apps/lit/tasks.py b/hawc/apps/lit/tasks.py index 3304a67cac..37535e0d7b 100644 --- a/hawc/apps/lit/tasks.py +++ b/hawc/apps/lit/tasks.py @@ -142,3 +142,29 @@ def fix_pubmed_without_content(): logger.info(f"Attempting to update pubmed content for {num_ids} identifiers") if num_ids > 0: Identifiers.update_pubmed_content(ids) + + +@shared_task +def create_duplicate_candidate_groups(assessment_id: int): + DuplicateCandidateGroup = apps.get_model("lit", "DuplicateCandidateGroup") + assessment = apps.get_model("assessment", "Assessment").objects.get(pk=assessment_id) + references = assessment.references.values("pk", "title") + candidate_groups = DuplicateCandidateGroup.find_duplicate_candidate_groups(references) + candidate_groups = [ + group + for group in candidate_groups 
+ if DuplicateCandidateGroup.validate_candidates([ref["pk"] for ref in group]) + ] + with transaction.atomic(): + objs = DuplicateCandidateGroup.objects.bulk_create( + [DuplicateCandidateGroup(assessment=assessment) for group in candidate_groups] + ) + DuplicateCandidateGroup.candidates.through.objects.bulk_create( + [ + DuplicateCandidateGroup.candidates.through( + duplicatecandidategroup_id=obj.pk, reference_id=ref["pk"] + ) + for obj, group in zip(objs, candidate_groups, strict=False) + for ref in group + ] + ) diff --git a/hawc/apps/lit/templates/lit/duplicate_resolution.html b/hawc/apps/lit/templates/lit/duplicate_resolution.html new file mode 100644 index 0000000000..e2b6e0c7d7 --- /dev/null +++ b/hawc/apps/lit/templates/lit/duplicate_resolution.html @@ -0,0 +1,44 @@ +{% extends 'assessment-rooted.html' %} + +{% load bs4 %} +{% block content %} +
+

Duplicate candidates

+ {% actions %} + Resolved duplicates + Identify duplicates + {% endactions %} +
+ + {% include "includes/paginator.html" with plural_object_name="duplicate groups" %} +{% endblock content %} diff --git a/hawc/apps/lit/templates/lit/overview.html b/hawc/apps/lit/templates/lit/overview.html index 57a7bcbb31..d83f3c885f 100644 --- a/hawc/apps/lit/templates/lit/overview.html +++ b/hawc/apps/lit/templates/lit/overview.html @@ -27,6 +27,11 @@

Literature Review

Tag manually added references Tag untagged references Upload full text URLs + {% if obj_perms.edit %} + Duplicate resolution + {% else %} + Resolved duplicates + {% endif %} Exports Download all references diff --git a/hawc/apps/lit/templates/lit/resolved_duplicates.html b/hawc/apps/lit/templates/lit/resolved_duplicates.html new file mode 100644 index 0000000000..c0675ef738 --- /dev/null +++ b/hawc/apps/lit/templates/lit/resolved_duplicates.html @@ -0,0 +1,38 @@ +{% extends 'assessment-rooted.html' %} + +{% load bs4 %} +{% block content %} +
+

Resolved duplicates

+ {% if obj_perms.edit %} + {% actions %} + Duplicate resolution + {% endactions %} + {% endif %} +
+ {% include 'common/inline_filter_form.html' %} + + {% include "includes/paginator.html" with plural_object_name="duplicate groups" %} +{% endblock content %} diff --git a/hawc/apps/lit/urls.py b/hawc/apps/lit/urls.py index 137e197ee4..560e926a7c 100644 --- a/hawc/apps/lit/urls.py +++ b/hawc/apps/lit/urls.py @@ -8,6 +8,7 @@ router.register(r"reference", api.ReferenceViewSet, basename="reference") router.register(r"search", api.SearchViewSet, basename="search") router.register(r"tags", api.ReferenceFilterTagViewSet, basename="tags") +router.register(r"duplicate", api.DuplicateViewSet, basename="duplicate") app_name = "lit" urlpatterns = [ @@ -163,4 +164,19 @@ name="workflow-htmx", ), path("api/", include((router.urls, "api"))), + path( + "assessment//duplicate-resolution/", + views.DuplicateResolution.as_view(), + name="duplicate-resolution", + ), + path( + "assessment//resolved-duplicates/", + views.ResolvedDuplicates.as_view(), + name="resolved-duplicates", + ), + path( + "assessment//identify-duplicates/", + views.IdentifyDuplicates.as_view(), + name="identify-duplicates", + ), ] diff --git a/hawc/apps/lit/views.py b/hawc/apps/lit/views.py index 0eea38e364..b1d9297d60 100644 --- a/hawc/apps/lit/views.py +++ b/hawc/apps/lit/views.py @@ -10,7 +10,7 @@ from django.template import loader from django.urls import reverse, reverse_lazy from django.utils.decorators import method_decorator -from django.views.generic import TemplateView +from django.views.generic import TemplateView, View from ..assessment.constants import AssessmentViewPermissions from ..assessment.models import Assessment @@ -25,6 +25,7 @@ BaseFilterList, BaseList, BaseUpdate, + MessageMixin, create_object_log, htmx_required, ) @@ -1250,3 +1251,67 @@ def venn_reference_list(self, request, *args, **kwargs): "qs": models.Reference.objects.assessment_qs(self.assessment.id).filter(id__in=ids) } return render(request, "lit/components/venn_reference_list.html", context=context) + + +class 
DuplicateResolution(BaseList): + parent_model = Assessment + model = models.DuplicateCandidateGroup + template_name = "lit/duplicate_resolution.html" + breadcrumb_active_name = "Duplicate resolution" + assessment_permission = AssessmentViewPermissions.TEAM_MEMBER + + paginate_by = 5 + + def get_queryset(self): + return ( + super() + .get_queryset() + .filter(assessment=self.assessment) + .filter(resolution=constants.DuplicateResolution.UNRESOLVED) + .prefetch_related("candidates", "candidates__identifiers", "candidates__tags") + ) + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + context["breadcrumbs"] = lit_overview_crumbs( + self.request.user, self.assessment, "Duplicate resolution" + ) + return context + + +class ResolvedDuplicates(BaseFilterList): + parent_model = Assessment + model = models.DuplicateCandidateGroup + filterset_class = filterset.DuplicateCandidateGroupFilterSet + template_name = "lit/resolved_duplicates.html" + breadcrumb_active_name = "Resolved duplicates" + assessment_permission = AssessmentViewPermissions.TEAM_MEMBER + + def get_queryset(self): + return ( + super() + .get_queryset() + .filter(assessment=self.assessment) + .exclude(resolution=constants.DuplicateResolution.UNRESOLVED) + ) + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + context["breadcrumbs"] = lit_overview_crumbs( + self.request.user, self.assessment, "Resolved duplicates" + ) + context["resolution_state"] = constants.DuplicateResolution + return context + + +class IdentifyDuplicates(MessageMixin, View): + success_message = "Duplicate identification requested." 
+ + def get(self, request, *args, **kwargs): + assessment = get_object_or_404(Assessment, pk=kwargs["pk"]) + if not assessment.user_can_edit_object(request.user): + raise PermissionDenied() + url = reverse("lit:duplicate-resolution", args=(assessment.pk,)) + models.DuplicateCandidateGroup.create_duplicate_candidate_groups(assessment.pk) + self.send_message() + return HttpResponseRedirect(url)