Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deduplication workflow #1165

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion hawc/apps/lit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from ..assessment.constants import AssessmentViewSetPermissions
from ..assessment.models import Assessment
from ..common.api import OncePerMinuteThrottle, PaginationWithCount
from ..common.helper import FlatExport, cacheable
from ..common.helper import FlatExport, cacheable, tryParseInt
from ..common.renderers import PandasRenderers
from ..common.serializers import UnusedSerializer
from ..common.views import create_object_log
Expand Down Expand Up @@ -426,3 +426,28 @@ def id_search(self, request, db_id: str, id: str):
df=qs.global_df(),
filename=f"global-reference-data-{id}",
)




class DuplicateViewSet(
    BaseAssessmentViewSet,
):
    """POST-only API endpoints for acting on duplicate candidate groups."""

    model = models.DuplicateCandidates
    http_method_names = ["post"]

    @action(
        detail=True, methods=("post",), action_perms=AssessmentViewSetPermissions.CAN_EDIT_OBJECT
    )
    def resolve_duplicate(self, request, pk):
        """Resolve one duplicate candidate group from a submitted form.

        Reads two POST fields:

        * ``resolution``: the literal ``"none"`` to mark the group as a false
          positive, or the integer id of the candidate chosen as primary.
        * ``notes``: optional free text stored with the resolution.

        Returns ``{"status": "ok"}`` on success. A missing or unparseable
        ``resolution``, or one rejected by ``instance.resolve`` (for example a
        primary that is not one of the group's candidates), yields a 400
        response instead of a silent no-op or an unhandled exception.

        Raises:
            PermissionDenied: if the requesting user cannot edit the assessment.
        """
        instance = self.get_object()
        assessment = instance.assessment
        if not assessment.user_can_edit_object(self.request.user):
            raise PermissionDenied()

        raw_resolution = request.POST.get("resolution")
        notes = request.POST.get("notes", "")

        if raw_resolution == "none":
            instance.resolve(
                resolution=constants.DuplicateResolution.FALSE_POSITIVE, notes=notes
            )
            return Response({"status": "ok"})

        # Anything else must be a candidate id; tryParseInt is expected to give
        # None for missing/non-numeric input (the original code relied on this too).
        primary = tryParseInt(raw_resolution)
        if primary is None:
            return Response(
                {"status": "error", "detail": "Resolution must be 'none' or a candidate id."},
                status=400,
            )

        try:
            instance.resolve(
                resolution=constants.DuplicateResolution.RESOLVED,
                primary=primary,
                notes=notes,
            )
        except ValueError as err:
            # resolve() validates that the primary belongs to the group.
            return Response({"status": "error", "detail": str(err)}, status=400)
        return Response({"status": "ok"})
4 changes: 4 additions & 0 deletions hawc/apps/lit/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class SearchType(models.TextChoices):
SEARCH = "s", "Search"
IMPORT = "i", "Import"

class DuplicateResolution(models.IntegerChoices):
    """Resolution states stored for a group of duplicate candidates."""

    UNRESOLVED = 0, "Unresolved"
    RESOLVED = 1, "Resolved"  # TODO: change to "primary identified"
    FALSE_POSITIVE = 2, "False positive"

# generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/
DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Generated by Django 5.1.4 on 2025-02-03 08:23

import django.contrib.postgres.fields
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the DedupeSettings and DuplicateCandidates models."""

    dependencies = [
        ("assessment", "0047_alter_labeleditem_options"),
        ("lit", "0024_workflows"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # DedupeSettings is created with only its primary key column.
        migrations.CreateModel(
            name="DedupeSettings",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
            ],
        ),
        migrations.CreateModel(
            name="DuplicateCandidates",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "resolution",
                    models.PositiveSmallIntegerField(
                        choices=[
                            (0, "Unresolved"),
                            (1, "Resolved"),
                            (2, "False positive"),
                        ],
                        default=0,
                    ),
                ),
                # Integer array with a uniqueness constraint on the whole array value.
                (
                    "candidates",
                    django.contrib.postgres.fields.ArrayField(
                        base_field=models.IntegerField(), size=None, unique=True
                    ),
                ),
                ("primary", models.IntegerField(null=True)),
                ("notes", models.TextField(blank=True)),
                ("created", models.DateTimeField(auto_now_add=True)),
                ("last_updated", models.DateTimeField(auto_now=True)),
                (
                    "assessment",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="duplicates",
                        to="assessment.assessment",
                    ),
                ),
                # Nullable so the row survives deletion of the resolving user (SET_NULL).
                (
                    "resolving_user",
                    models.ForeignKey(
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name="resolved_duplicates",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
        ),
    ]
18 changes: 18 additions & 0 deletions hawc/apps/lit/migrations/0026_reference_hidden.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 5.1.4 on 2025-02-05 17:43

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the boolean ``hidden`` field (default False) to the Reference model."""

    dependencies = [
        ("lit", "0025_dedupesettings_duplicatecandidates"),
    ]

    operations = [
        migrations.AddField(
            model_name="reference",
            name="hidden",
            field=models.BooleanField(default=False),
        ),
    ]
97 changes: 97 additions & 0 deletions hawc/apps/lit/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from math import ceil
from typing import Self
from urllib import parse
import random

from celery import chain
from celery.result import ResultBase
Expand Down Expand Up @@ -849,6 +850,7 @@ class Reference(models.Model):
null=True,
help_text="Used internally for determining when reference was " "originally added",
)
hidden = models.BooleanField(default=False)

BREADCRUMB_PARENT = "assessment"

Expand Down Expand Up @@ -1458,6 +1460,101 @@ def get_description(self) -> str:
)


# add parameters/select on literatureassessment model
class DedupeSettings(models.Model):
    """Placeholder for the deduper configuration used within an assessment.

    For a first pass a single global deduper (or a static set of choices) may
    be enough, in which case this model would not need to be built out.
    """

    # Bare annotations only: neither line declares a Django field yet.
    assessment: "Assessment"
    # list of parameters for deduplication? ie schema of dedupe modules to use?
    parameters: dict

    def build_deduper(self):
        """Build a deduper from ``self.parameters``; not implemented, returns None."""
        # return deduper instance using self.parameters
        return None


# SOFT DELETES

class SortedArrayField(ArrayField):
    """Placeholder ArrayField subclass; adds no behavior of its own yet."""

class DuplicateCandidates(models.Model):
    """A group of reference ids flagged as possible duplicates of one another.

    ``candidates`` holds the reference ids in the group. ``resolution`` records
    whether the group is still unresolved, has had a primary reference chosen,
    or was judged a false positive. ``primary`` is the chosen reference id when
    one has been identified.
    """

    assessment = models.ForeignKey(
        "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates"
    )
    resolution = models.PositiveSmallIntegerField(
        choices=constants.DuplicateResolution,
        default=constants.DuplicateResolution.UNRESOLVED,
    )
    resolving_user = models.ForeignKey(
        HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates"
    )
    candidates = ArrayField(models.IntegerField(), unique=True)
    primary = models.IntegerField(null=True)
    notes = models.TextField(blank=True)
    created = models.DateTimeField(auto_now_add=True)
    last_updated = models.DateTimeField(auto_now=True)

    def get_assessment(self):
        """Return the assessment this candidate group belongs to."""
        return self.assessment

    @classmethod
    def foobar(cls, assessment):
        """Create placeholder candidate groups for ``assessment``.

        Groups come from ``random_execute``; this is a stand-in until a real
        deduper exists.
        """
        references = assessment.references.values("pk", "title")
        candidate_groups = cls.random_execute(references)
        cls.objects.bulk_create(
            [
                # Store ids sorted so the unique constraint on ``candidates``
                # treats the same set of references as the same group
                # regardless of the order they were found in.
                cls(assessment=assessment, candidates=sorted(ref["pk"] for ref in group))
                for group in candidate_groups
            ],
            # A group that already exists (or was drawn twice in this batch)
            # is skipped rather than failing the whole insert.
            ignore_conflicts=True,
        )

    @classmethod
    def random_execute(cls, references) -> list[list[dict]]:
        """Return up to three random groups of two distinct references each.

        Args:
            references: iterable of reference dicts (e.g. a ``values()`` queryset).

        Returns:
            A list of groups, or an empty list when there are fewer references
            than one group needs.
        """
        num_candidates = 2
        max_groups = 3
        # Materialize once: random.sample needs a real sequence, and this
        # avoids re-querying a queryset for every len()/index access.
        population = list(references)
        if len(population) < num_candidates:
            return []
        # Floor division keeps this an int; range() rejects floats.
        num_groups = min(max_groups, len(population) // num_candidates)
        # sample() draws without replacement, so a group never pairs a
        # reference with itself.
        return [random.sample(population, k=num_candidates) for _ in range(num_groups)]

    def generate_unique_identifier(self):
        """Return the candidate ids in sorted order."""
        return sorted(self.candidates)

    def _update_references(self):
        """Mark every candidate other than the primary as hidden."""
        # TODO also make primary not hidden? may be unnecessary
        duplicate_ids = set(self.candidates) - {self.primary}
        self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True)

    def resolve(
        self,
        resolution: constants.DuplicateResolution,
        primary: int | None = None,
        notes: str = "",
    ):
        """Record a resolution for this group and save it.

        Args:
            resolution: the new state; must not be ``UNRESOLVED``.
            primary: id of the reference to keep; required (and must be one of
                ``candidates``) when ``resolution`` is ``RESOLVED``.
            notes: free text stored alongside the resolution.

        Raises:
            ValueError: if ``resolution`` is ``UNRESOLVED``, or if it is
                ``RESOLVED`` and ``primary`` is missing or not a candidate.
        """
        if resolution == constants.DuplicateResolution.UNRESOLVED:
            raise ValueError("Resolution must not be unresolved.")
        if resolution == constants.DuplicateResolution.RESOLVED:
            if primary is None:
                raise ValueError("Primary must not be None if duplicate identified.")
            if primary not in self.candidates:
                raise ValueError("Primary must be a candidate.")
            self.primary = primary
            # TODO: hiding the non-primary references is intentionally disabled
            # for now: self._update_references()
        self.resolution = resolution
        self.notes = notes
        self.save()

    # where to put execute method? literatureassessment, manager for dupes model

# where to put execute method? literatureassessment, manager for dupes model


# DuplicateCandidateGroup

"""
WORKFLOW

User defines deduper for use in assessment
User executes a session that uses a defined deduper
Session stores list of identified candidate duplicate groups
User resolves duplicates in a session; if group status != unresolved, it shows up on this page
Perhaps a separate session page of resolved groups? ie an "in progress" list view and a "done" list view
Multiple resolutions at once? Or more like screen page in LLR where it's do one, click for next (look at conflict resolution)
Should this workflow do anything proactive? ie lets say a candidate group is identified false positive, is it a big deal if it shows up again if a user executes another session w/ same settings? (yes)
Single user right? Not like conflict resolution? THIS IS CORRECT
Do we want this workflow to also happen on import? That would look slightly different
Though maybe we could just have it happen automatically AFTER import, that way it would use the same workflow
If used on import, do we add "choose a deduper" option to created search? or maybe "default" attribute to deduper, whichever one is "default" is used?
Each assessment has undeletable "default" deduper, maybe add noop setting choice for deduper for people who don't want it running on imports?
"""



reversion.register(LiteratureAssessment)
reversion.register(Search)
reversion.register(ReferenceFilterTag)
Expand Down
18 changes: 18 additions & 0 deletions hawc/apps/lit/templates/lit/_duplicate_candidates.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<li class="list-group-item" hx-target="this" hx-swap="delete swap:1s">
  {# One candidate group: choose the primary reference or flag the group as having no duplicates. #}
  <form hx-post="{% url 'lit:api:duplicate-resolve-duplicate' object.pk %}">
    <div>
      <label for="none-{{object.pk}}">
        <input type="radio" id="none-{{object.pk}}" name="resolution" value="none" required />
        No duplicates
      </label>
    </div>
    {% for candidate in object.candidates %}
      <div>
        <label for="primary-{{object.pk}}-{{candidate}}">
          <input type="radio" id="primary-{{object.pk}}-{{candidate}}" name="resolution" value="{{candidate}}" required />
          {{candidate}}
        </label>
      </div>
    {% endfor %}
    <textarea name="notes" placeholder="Notes" aria-label="Notes"></textarea>
    <button type="submit">Resolve</button>
  </form>
</li>
17 changes: 17 additions & 0 deletions hawc/apps/lit/templates/lit/duplicate_candidates.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{% extends 'assessment-rooted.html' %}
{# Lists the duplicate candidate groups in object_list, rendering each with lit/_duplicate_candidates.html; the actions menu links to the lit:duplicate-task URL. #}

{% load bs4 %}
{% block content %}
<div class="d-flex">
<h2>Duplicate candidates</h2>
{% actions %}
<a class="dropdown-item" href="{% url 'lit:duplicate-task' assessment.pk %}">Run deduplication</a>
{% endactions %}
</div>
<ul class="list-group list-group-flush my-3">
{% for object in object_list %}
{% include 'lit/_duplicate_candidates.html' %}
{% endfor %}
</ul>
{% include "includes/paginator.html" with plural_object_name="duplicate groups" %}
{% endblock content %}
28 changes: 28 additions & 0 deletions hawc/apps/lit/templates/lit/duplicate_candidates_2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{% extends 'assessment-rooted.html' %}
{# Read-only listing of duplicate candidate groups: resolution label, candidate ids, primary id and notes. #}

{% load bs4 %}
{% block content %}
  <div class="d-flex">
    <h2>Resolved duplicates</h2>
  </div>
  <ul class="list-group list-group-flush my-3">
    {% for object in object_list %}
      {# Each group is a list item so the surrounding <ul> contains valid children. #}
      <li class="list-group-item">
        <div>
          Group {{object.pk}}
        </div>
        <div>
          {# Show the human-readable choice label rather than the stored integer. #}
          {{object.get_resolution_display}}
        </div>
        <div>
          {{object.candidates}}
        </div>
        <div>
          {{object.primary}}
        </div>
        <div>
          {{object.notes}}
        </div>
      </li>
    {% empty %}
      <li class="list-group-item">No resolved duplicate groups.</li>
    {% endfor %}
  </ul>
  {% include "includes/paginator.html" with plural_object_name="duplicate groups" %}
{% endblock content %}
16 changes: 16 additions & 0 deletions hawc/apps/lit/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
router.register(r"reference", api.ReferenceViewSet, basename="reference")
router.register(r"search", api.SearchViewSet, basename="search")
router.register(r"tags", api.ReferenceFilterTagViewSet, basename="tags")
router.register(r"duplicate", api.DuplicateViewSet, basename="duplicate")

app_name = "lit"
urlpatterns = [
Expand Down Expand Up @@ -163,4 +164,19 @@
name="workflow-htmx",
),
path("api/", include((router.urls, "api"))),
path(
"assessment/<int:pk>/duplicate-candidates/",
views.DuplicateCandidatesList.as_view(),
name="duplicate-candidates",
),
path(
"assessment/<int:pk>/duplicate-candidates2/",
views.DuplicateCandidatesList2.as_view(),
name="duplicate-candidates2",
),
path(
"assessment/<int:pk>/duplicate-task/",
views.DuplicateTask.as_view(),
name="duplicate-task",
),
]
Loading
Loading