Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deduplication workflow #1165

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion hawc/apps/lit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from ..assessment.constants import AssessmentViewSetPermissions
from ..assessment.models import Assessment
from ..common.api import OncePerMinuteThrottle, PaginationWithCount
from ..common.helper import FlatExport, cacheable
from ..common.helper import FlatExport, cacheable, tryParseInt
from ..common.renderers import PandasRenderers
from ..common.serializers import UnusedSerializer
from ..common.views import create_object_log
Expand Down Expand Up @@ -426,3 +426,35 @@ def id_search(self, request, db_id: str, id: str):
df=qs.global_df(),
filename=f"global-reference-data-{id}",
)


class DuplicateViewSet(
    BaseAssessmentViewSet,
):
    """POST-only API for acting on ``DuplicateCandidateGroup`` instances."""

    model = models.DuplicateCandidateGroup
    http_method_names = ["post"]

    @action(
        detail=True, methods=("post",), action_perms=AssessmentViewSetPermissions.CAN_EDIT_OBJECT
    )
    def resolve_duplicate(self, request, pk):
        """Resolve one duplicate candidate group.

        Reads two form fields from ``request.POST``:

        * ``resolution`` -- the literal string ``"none"`` marks the group as a
          false positive; an integer is taken as the primary key of the
          reference to keep as the primary.
        * ``notes`` -- optional free text stored on the group (default ``""``).

        Returns ``{"status": "ok"}`` on success. Any other ``resolution`` value,
        or a value rejected by ``DuplicateCandidateGroup.resolve`` (which raises
        ``ValueError``), yields a 400 response with ``{"status": "error"}``
        instead of silently reporting success or surfacing as a server error.

        Raises:
            PermissionDenied: if the user cannot edit objects in the assessment.
        """
        instance = self.get_object()
        assessment = instance.assessment
        if not assessment.user_can_edit_object(request.user):
            raise PermissionDenied()
        resolution = request.POST.get("resolution")
        notes = request.POST.get("notes", "")

        if resolution == "none":
            resolve_kwargs = {"resolution": constants.DuplicateResolution.FALSE_POSITIVE}
        # NOTE(review): assumes tryParseInt returns None for unparseable input -- confirm
        elif (primary_id := tryParseInt(resolution)) is not None:
            resolve_kwargs = {
                "resolution": constants.DuplicateResolution.RESOLVED,
                "primary_id": primary_id,
            }
        else:
            return Response(
                {
                    "status": "error",
                    "detail": "resolution must be 'none' or a candidate reference id.",
                },
                status=400,
            )

        try:
            instance.resolve(resolving_user=request.user, notes=notes, **resolve_kwargs)
        except ValueError as err:
            # resolve() validates before mutating anything, so nothing was changed
            return Response({"status": "error", "detail": str(err)}, status=400)
        return Response({"status": "ok"})
6 changes: 6 additions & 0 deletions hawc/apps/lit/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ class SearchType(models.TextChoices):
IMPORT = "i", "Import"


class DuplicateResolution(models.IntegerChoices):
    """Resolution state of a duplicate candidate group.

    Each member pairs the stored integer value with its display label.
    """

    # default state: nobody has reviewed the group yet
    UNRESOLVED = 0, "Unresolved"
    RESOLVED = 1, "Resolved"  # TODO: change to "primary identified"
    # the group was reviewed and judged not to contain duplicates
    FALSE_POSITIVE = 2, "False positive"


# generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/
# Both patterns match "10." + 4-9 digits + "/" + one or more non-whitespace characters.
# DOI_EXACT is anchored at both ends, so the whole string must be the DOI.
DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$")
# DOI_EXTRACT is unanchored, so it can locate a DOI inside surrounding text.
DOI_EXTRACT = re.compile(r"10\.\d{4,9}/[^\s]+")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Generated by Django 5.1.4 on 2025-02-03 08:23

import django.contrib.postgres.fields
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):
    # Creates two models: DedupeSettings (primary key only) and
    # DuplicateCandidates (a resolution record tied to an assessment).

    # Requires the assessment and lit apps at the listed migrations, plus the
    # configured (swappable) user model for the resolving_user foreign key.
    dependencies = [
        ("assessment", "0047_alter_labeleditem_options"),
        ("lit", "0024_workflows"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # DedupeSettings has no fields besides its auto-created id.
        migrations.CreateModel(
            name="DedupeSettings",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="DuplicateCandidates",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # resolution state; defaults to 0 ("Unresolved")
                (
                    "resolution",
                    models.PositiveSmallIntegerField(
                        choices=[
                            (0, "Unresolved"),
                            (1, "Resolved"),
                            (2, "False positive"),
                        ],
                        default=0,
                    ),
                ),
                # candidates are stored as a unique Postgres integer array, and
                # primary as a plain nullable integer; neither is a foreign key
                (
                    "candidates",
                    django.contrib.postgres.fields.ArrayField(
                        base_field=models.IntegerField(), size=None, unique=True
                    ),
                ),
                ("primary", models.IntegerField(null=True)),
                ("notes", models.TextField(blank=True)),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("last_updated", models.DateTimeField(auto_now=True)),
                # rows are removed together with their assessment
                (
                    "assessment",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="duplicates",
                        to="assessment.assessment",
                    ),
                ),
                # deleting the user keeps the row and nulls this column
                (
                    "resolving_user",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="resolved_duplicates",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
    ]
17 changes: 17 additions & 0 deletions hawc/apps/lit/migrations/0026_reference_hidden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 5.1.4 on 2025-02-05 17:43

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds a boolean "hidden" column to the lit Reference model.

    dependencies = [
        ("lit", "0025_dedupesettings_duplicatecandidates"),
    ]

    operations = [
        # default=False, so existing references are backfilled as not hidden
        migrations.AddField(
            model_name="reference",
            name="hidden",
            field=models.BooleanField(default=False),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Generated by Django 5.1.4 on 2025-02-10 12:18

import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):
    # Creates DuplicateCandidateGroup, whose candidates/primary are real
    # relations to lit.Reference, and deletes the DedupeSettings and
    # DuplicateCandidates models. No data is copied between models here.

    dependencies = [
        ("assessment", "0047_alter_labeleditem_options"),
        ("lit", "0026_reference_hidden"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name="DuplicateCandidateGroup",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # resolution state; defaults to 0 ("Unresolved")
                (
                    "resolution",
                    models.PositiveSmallIntegerField(
                        choices=[
                            (0, "Unresolved"),
                            (1, "Resolved"),
                            (2, "False positive"),
                        ],
                        default=0,
                    ),
                ),
                ("notes", models.TextField(blank=True)),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("last_updated", models.DateTimeField(auto_now=True)),
                # groups are removed together with their assessment
                (
                    "assessment",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="duplicates",
                        to="assessment.assessment",
                    ),
                ),
                # the references that make up this group
                (
                    "candidates",
                    models.ManyToManyField(related_name="duplicate_candidates", to="lit.reference"),
                ),
                # nullable; deleting the referenced reference nulls this column
                (
                    "primary",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="duplicate_primaries",
                        to="lit.reference",
                    ),
                ),
                # deleting the user keeps the group and nulls this column
                (
                    "resolving_user",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="resolved_duplicates",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
        # DeleteModel drops the tables of both models
        migrations.DeleteModel(
            name="DedupeSettings",
        ),
        migrations.DeleteModel(
            name="DuplicateCandidates",
        ),
    ]
78 changes: 78 additions & 0 deletions hawc/apps/lit/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import html
import json
import logging
import random
import re
from copy import copy
from math import ceil
Expand Down Expand Up @@ -849,6 +850,7 @@ class Reference(models.Model):
null=True,
help_text="Used internally for determining when reference was " "originally added",
)
hidden = models.BooleanField(default=False)

BREADCRUMB_PARENT = "assessment"

Expand Down Expand Up @@ -1458,6 +1460,82 @@ def get_description(self) -> str:
)


class DuplicateCandidateGroup(models.Model):
    """A group of references in one assessment flagged as possible duplicates.

    A group starts out unresolved. ``resolve`` records the reviewer's decision:
    either one candidate is chosen as ``primary`` (the other candidates are
    then marked hidden), or the group is recorded as a false positive.
    """

    assessment = models.ForeignKey(
        "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates"
    )
    resolution = models.PositiveSmallIntegerField(
        choices=constants.DuplicateResolution, default=constants.DuplicateResolution.UNRESOLVED
    )
    resolving_user = models.ForeignKey(
        HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates"
    )
    candidates = models.ManyToManyField(Reference, related_name="duplicate_candidates")
    primary = models.ForeignKey(
        Reference, null=True, on_delete=models.SET_NULL, related_name="duplicate_primaries"
    )
    notes = models.TextField(blank=True)
    created = models.DateTimeField(auto_now_add=True)
    last_updated = models.DateTimeField(auto_now=True)

    @property
    def secondaries(self):
        """Return the candidates other than the primary, as a queryset."""
        return self.candidates.exclude(pk=self.primary_id)

    def get_assessment(self):
        """Return the assessment this group belongs to."""
        return self.assessment

    @classmethod
    def validate_candidates(cls, candidates: list[int]):
        """Return True if no saved group has exactly this set of candidates.

        Args:
            candidates: primary keys of the references in the proposed group.
        """
        # first narrow to groups of the same size ...
        qs = cls.objects.annotate(candidates_count=models.Count("candidates")).filter(
            candidates_count=len(candidates)
        )
        # ... then require every proposed candidate to be a member
        for candidate in candidates:
            qs = qs.filter(candidates=candidate)
        return not qs.exists()

    @classmethod
    def find_duplicate_candidate_groups(cls, references) -> list[list[dict]]:
        """Propose groups of possible duplicates from ``references``.

        The selection is random: up to three groups of two distinct references
        are drawn, and no similarity comparison is performed here.

        Args:
            references: a sized iterable of reference records (e.g. a
                ``values()`` queryset).

        Returns:
            A list of groups, each a list of two distinct items from
            ``references``; empty when fewer than two references are given.
        """
        num_candidates = 2
        # materialize once: random.sample needs a real sequence
        pool = list(references)
        if len(pool) < num_candidates:
            return []
        # floor division keeps this an int; true division produced a float that
        # range() rejects whenever fewer than six references were supplied
        num_groups = min(3, len(pool) // num_candidates)
        # sample() draws without replacement so a group never pairs a reference
        # with itself
        return [random.sample(pool, k=num_candidates) for _ in range(num_groups)]  # noqa: S311

    @classmethod
    def create_duplicate_candidate_groups(cls, assessment_id: int):
        """Queue the background task that builds candidate groups for an assessment."""
        tasks.create_duplicate_candidate_groups.delay(assessment_id)

    def _update_references(self):
        """Hide the secondary references and make sure the primary is visible."""
        duplicate_ids = self.secondaries.values_list("pk", flat=True)
        self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True)
        # if a "hidden" reference was selected as primary, unhide it
        if self.primary.hidden:
            self.primary.hidden = False
            self.primary.save()

    def resolve(
        self,
        resolution: constants.DuplicateResolution,
        resolving_user: HAWCUser,
        primary_id: int | None = None,
        notes: str = "",
    ):
        """Record a reviewer's decision for this group and save it.

        Args:
            resolution: the outcome; must not be ``UNRESOLVED``.
            resolving_user: the user making the decision.
            primary_id: primary key of the candidate kept as primary; required
                when ``resolution`` is ``RESOLVED``.
            notes: free-text notes stored on the group.

        Raises:
            ValueError: if ``resolution`` is ``UNRESOLVED``, or if it is
                ``RESOLVED`` and ``primary_id`` is missing or not a candidate.
        """
        if resolution == constants.DuplicateResolution.UNRESOLVED:
            raise ValueError("Resolution must not be unresolved.")
        if resolution == constants.DuplicateResolution.RESOLVED:
            if primary_id is None:
                raise ValueError("Primary must not be None if duplicate identified.")
            if not self.candidates.filter(pk=primary_id).exists():
                raise ValueError("Primary must be a candidate.")
            self.primary_id = primary_id
            self._update_references()
        self.resolution = resolution
        self.resolving_user = resolving_user
        self.notes = notes
        self.save()


reversion.register(LiteratureAssessment)
reversion.register(Search)
reversion.register(ReferenceFilterTag)
Expand Down
26 changes: 26 additions & 0 deletions hawc/apps/lit/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,29 @@ def fix_pubmed_without_content():
logger.info(f"Attempting to update pubmed content for {num_ids} identifiers")
if num_ids > 0:
Identifiers.update_pubmed_content(ids)


@shared_task
def create_duplicate_candidate_groups(assessment_id: int):
    """Create duplicate candidate groups for one assessment.

    Asks ``DuplicateCandidateGroup.find_duplicate_candidate_groups`` for
    proposed groups over the assessment's references, drops proposals that are
    unusable, and saves the rest (group rows plus their candidate links) in a
    single transaction.

    A proposal is dropped when it has fewer than two distinct references, when
    an identical set was already accepted earlier in this run, or when
    ``validate_candidates`` reports that an identical group is already saved.

    Args:
        assessment_id: primary key of the assessment to process.
    """
    DuplicateCandidateGroup = apps.get_model("lit", "DuplicateCandidateGroup")
    assessment = apps.get_model("assessment", "Assessment").objects.get(pk=assessment_id)
    references = assessment.references.values("pk", "title")
    proposed_groups = DuplicateCandidateGroup.find_duplicate_candidate_groups(references)

    seen: set[frozenset] = set()
    accepted: list[list[int]] = []
    for group in proposed_groups:
        # drop repeated references within a group, keeping first-seen order
        pks = list(dict.fromkeys(ref["pk"] for ref in group))
        key = frozenset(pks)
        # validate_candidates only sees saved rows, so identical proposals
        # within this batch have to be caught here
        if len(pks) < 2 or key in seen:
            continue
        seen.add(key)
        if DuplicateCandidateGroup.validate_candidates(pks):
            accepted.append(pks)
    if not accepted:
        return

    through = DuplicateCandidateGroup.candidates.through
    with transaction.atomic():
        objs = DuplicateCandidateGroup.objects.bulk_create(
            [DuplicateCandidateGroup(assessment=assessment) for _ in accepted]
        )
        through.objects.bulk_create(
            [
                through(duplicatecandidategroup_id=obj.pk, reference_id=ref_pk)
                # one created object per accepted group; strict guards that pairing
                for obj, pks in zip(objs, accepted, strict=True)
                for ref_pk in pks
            ]
        )
Loading
Loading