Skip to content

Commit

Permalink
Changed model fields for candidates/primary to relation based, cleanu…
Browse files Browse the repository at this point in the history
…p & improvements
  • Loading branch information
rabstejnek committed Feb 11, 2025
1 parent 27c0a25 commit 2ac87e6
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 115 deletions.
16 changes: 9 additions & 7 deletions hawc/apps/lit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,12 +428,10 @@ def id_search(self, request, db_id: str, id: str):
)




class DuplicateViewSet(
BaseAssessmentViewSet,
):
model = models.DuplicateCandidates
model = models.DuplicateCandidateGroup
http_method_names = ["post"]

@action(
Expand All @@ -445,9 +443,13 @@ def resolve_duplicate(self, request, pk):
if not assessment.user_can_edit_object(self.request.user):
raise PermissionDenied()
resolution = request.POST.get("resolution")
notes = request.POST.get("notes","")
notes = request.POST.get("notes", "")
if resolution == "none":
instance.resolve(resolution=constants.DuplicateResolution.FALSE_POSITIVE,notes=notes)
if (resolution:=tryParseInt(resolution)) is not None:
instance.resolve(resolution=constants.DuplicateResolution.RESOLVED,primary=resolution,notes=notes)
instance.resolve(resolution=constants.DuplicateResolution.FALSE_POSITIVE, notes=notes)
if (resolution := tryParseInt(resolution)) is not None:
instance.resolve(
resolution=constants.DuplicateResolution.RESOLVED,
primary_id=resolution,
notes=notes,
)
return Response({"status": "ok"})
4 changes: 3 additions & 1 deletion hawc/apps/lit/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@ class SearchType(models.TextChoices):
SEARCH = "s", "Search"
IMPORT = "i", "Import"


class DuplicateResolution(models.IntegerChoices):
    """Resolution states for a group of duplicate candidate references."""

    UNRESOLVED = (0, "Unresolved")
    # TODO: change to "primary identified"
    RESOLVED = (1, "Resolved")
    FALSE_POSITIVE = (2, "False positive")


# generalized/adapted from https://www.crossref.org/blog/dois-and-matching-regular-expressions/
DOI_EXACT = re.compile(r"^10\.\d{4,9}/[^\s]+$")
DOI_EXTRACT = re.compile(r"10\.\d{4,9}/[^\s]+")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@


class Migration(migrations.Migration):

dependencies = [
("assessment", "0047_alter_labeleditem_options"),
("lit", "0024_workflows"),
Expand Down
1 change: 0 additions & 1 deletion hawc/apps/lit/migrations/0026_reference_hidden.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class Migration(migrations.Migration):

dependencies = [
("lit", "0025_dedupesettings_duplicatecandidates"),
]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Generated by Django 5.1.4 on 2025-02-10 12:18

import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


# Schema migration for the lit app: creates the DuplicateCandidateGroup model
# and drops the DedupeSettings and DuplicateCandidates models.
class Migration(migrations.Migration):
# Must run after the previous lit migration, the assessment app migration
# that provides the Assessment table, and the (swappable) user model.
dependencies = [
("assessment", "0047_alter_labeleditem_options"),
("lit", "0026_reference_hidden"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]

operations = [
# New relation-based model: candidates are a many-to-many to lit.reference
# and primary is a nullable foreign key to lit.reference.
migrations.CreateModel(
name="DuplicateCandidateGroup",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
# Same values/labels as DuplicateResolution in lit/constants.py.
"resolution",
models.PositiveSmallIntegerField(
choices=[
(0, "Unresolved"),
(1, "Resolved"),
(2, "False positive"),
],
default=0,
),
),
("notes", models.TextField(blank=True)),
("created", models.DateTimeField(auto_now_add=True)),
("last_updated", models.DateTimeField(auto_now=True)),
(
# Deleting an assessment deletes its duplicate groups.
"assessment",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="duplicates",
to="assessment.assessment",
),
),
(
"candidates",
models.ManyToManyField(related_name="duplicate_candidates", to="lit.reference"),
),
(
# Nullable; set to NULL if the referenced reference is deleted.
"primary",
models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="duplicate_primaries",
to="lit.reference",
),
),
(
# Nullable; set to NULL if the referenced user is deleted.
"resolving_user",
models.ForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="resolved_duplicates",
to=settings.AUTH_USER_MODEL,
),
),
],
),
# NOTE(review): both models are dropped without a preceding data migration,
# so any existing DuplicateCandidates rows are not copied into
# DuplicateCandidateGroup - confirm this data loss is intended.
migrations.DeleteModel(
name="DedupeSettings",
),
migrations.DeleteModel(
name="DuplicateCandidates",
),
]
125 changes: 59 additions & 66 deletions hawc/apps/lit/models.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import html
import json
import logging
import random
import re
from copy import copy
from math import ceil
from typing import Self
from urllib import parse
import random

from celery import chain
from celery.result import ResultBase
Expand Down Expand Up @@ -1460,100 +1460,93 @@ def get_description(self) -> str:
)


# add parameters/select on literatureassessment model
class DedupeSettings(models.Model):
    """Deduper settings for use in an assessment (stub).

    For a first pass a global deduper, or a set of static choices, may be
    enough, in which case this model would not have to be built.
    """

    # Bare annotations only; no model fields are declared on this class yet.
    assessment: "Assessment"
    # List of parameters for deduplication? i.e. schema of dedupe modules to use?
    parameters: dict

    def build_deduper(self):
        """Placeholder for building a deduper instance from ``self.parameters``.

        Not implemented: currently returns ``None``.
        """
        return None


# SOFT DELETES

class SortedArrayField(ArrayField):
    """Placeholder ``ArrayField`` subclass; it adds no behavior of its own."""

class DuplicateCandidateGroup(models.Model):
    """A group of references in one assessment that may be duplicates of each other.

    A group is created ``UNRESOLVED``. :meth:`resolve` either marks it as a
    false positive, or records one candidate as the ``primary`` reference and
    hides the remaining candidates.
    """

    assessment = models.ForeignKey(
        "assessment.Assessment", on_delete=models.CASCADE, related_name="duplicates"
    )
    resolution = models.PositiveSmallIntegerField(
        choices=constants.DuplicateResolution, default=constants.DuplicateResolution.UNRESOLVED
    )
    resolving_user = models.ForeignKey(
        HAWCUser, null=True, on_delete=models.SET_NULL, related_name="resolved_duplicates"
    )
    candidates = models.ManyToManyField(Reference, related_name="duplicate_candidates")
    primary = models.ForeignKey(
        Reference, null=True, on_delete=models.SET_NULL, related_name="duplicate_primaries"
    )
    notes = models.TextField(blank=True)
    created = models.DateTimeField(auto_now_add=True)
    last_updated = models.DateTimeField(auto_now=True)

    @property
    def secondaries(self):
        """Return a queryset of the candidates other than the primary reference."""
        return self.candidates.exclude(pk=self.primary_id)

    def get_assessment(self):
        """Return the assessment this group belongs to."""
        return self.assessment

    @classmethod
    def validate_candidates(cls, candidates: list[int]) -> bool:
        """Return True if no existing group has exactly this set of candidates.

        Args:
            candidates: Reference primary keys proposed for a new group.
                Repeated ids are counted once.
        """
        candidate_ids = set(candidates)
        # Keep groups with the same number of candidates, then require every
        # proposed id to be present; any surviving row is an exact match.
        qs = cls.objects.annotate(candidates_count=models.Count("candidates")).filter(
            candidates_count=len(candidate_ids)
        )
        for candidate_id in candidate_ids:
            qs = qs.filter(candidates=candidate_id)
        return not qs.exists()

    @classmethod
    def find_duplicate_candidate_groups(cls, references) -> list[list[dict]]:
        """Return up to three randomly chosen groups of two references each.

        This is a random stand-in for real duplicate detection.

        Args:
            references: An iterable of reference records (dicts with a "pk" key
                when called from ``create_duplicate_candidate_groups``).

        Returns:
            A list of groups; each group is a list of distinct records drawn
            from ``references``. Empty if there are fewer than two records.
        """
        num_candidates = 2
        max_groups = 3
        # random.sample needs a real sequence; this also evaluates a queryset once.
        pool = list(references)
        if len(pool) < num_candidates:
            return []
        # Floor division keeps this an int; true division produced a float that
        # range() rejects whenever fewer than six references were supplied.
        num_groups = min(max_groups, len(pool) // num_candidates)
        # Sample without replacement so a reference cannot be paired with itself.
        return [random.sample(pool, k=num_candidates) for _ in range(num_groups)]

    @classmethod
    def create_duplicate_candidate_groups(cls, assessment) -> list["DuplicateCandidateGroup"]:
        """Find candidate groups for an assessment and persist the new ones.

        Groups whose exact candidate set already exists, or which repeat an
        earlier group in the same batch, are skipped.

        Args:
            assessment: The assessment whose references are searched.

        Returns:
            The newly created groups, in the order they were found.
        """
        from django.db import transaction

        references = assessment.references.values("pk", "title")
        seen: set[frozenset] = set()
        candidate_groups = []
        for group in cls.find_duplicate_candidate_groups(references):
            pks = frozenset(ref["pk"] for ref in group)
            if pks in seen or not cls.validate_candidates(list(pks)):
                continue
            seen.add(pks)
            candidate_groups.append(group)

        through = cls.candidates.through
        # Create the groups and their candidate rows together so a failure
        # cannot leave behind groups with no candidates.
        with transaction.atomic():
            objs = cls.objects.bulk_create([cls(assessment=assessment) for _ in candidate_groups])
            through.objects.bulk_create(
                [
                    through(duplicatecandidategroup_id=obj.pk, reference_id=ref["pk"])
                    for obj, group in zip(objs, candidate_groups, strict=True)
                    for ref in group
                ]
            )
        return objs

    def generate_unique_identifier(self):
        """Return the candidate reference ids as a sorted list."""
        # The related manager itself is not iterable; sort the candidate pks.
        return sorted(self.candidates.values_list("pk", flat=True))

    def _update_references(self):
        """Hide the non-primary candidates and make sure the primary is visible.

        Expects ``self.primary_id`` to already be set to one of the candidates.
        """
        duplicate_ids = self.secondaries.values_list("pk", flat=True)
        self.assessment.references.filter(pk__in=duplicate_ids).update(hidden=True)
        # if a "hidden" reference was selected as primary, unhide it
        if self.primary.hidden:
            self.primary.hidden = False
            self.primary.save()

    def resolve(
        self,
        resolution: constants.DuplicateResolution,
        primary_id: int | None = None,
        notes: str = "",
        resolving_user: HAWCUser | None = None,
    ):
        """Record how this group was resolved and save it.

        Args:
            resolution: The new resolution; must not be ``UNRESOLVED``.
            primary_id: Primary key of the candidate chosen as primary.
                Required when ``resolution`` is ``RESOLVED``.
            notes: Free-text notes stored on the group.
            resolving_user: The user resolving the group; stored when given.

        Raises:
            ValueError: If ``resolution`` is ``UNRESOLVED``, or it is
                ``RESOLVED`` and ``primary_id`` is missing or not a candidate.
        """
        from django.db import transaction

        is_resolved = resolution == constants.DuplicateResolution.RESOLVED
        if resolution == constants.DuplicateResolution.UNRESOLVED:
            raise ValueError("Resolution must not be unresolved.")
        if is_resolved:
            if primary_id is None:
                raise ValueError("Primary must not be None if duplicate identified.")
            if not self.candidates.filter(pk=primary_id).exists():
                raise ValueError("Primary must be a candidate.")

        # Hide/unhide references and save the group as one unit so a failed
        # save cannot leave references hidden for an unresolved group.
        with transaction.atomic():
            if is_resolved:
                self.primary_id = primary_id
                self._update_references()
            # NOTE(review): a previously set primary is left in place when the
            # group is marked a false positive - confirm whether it should be cleared.
            self.resolution = resolution
            self.notes = notes
            if resolving_user is not None:
                self.resolving_user = resolving_user
            self.save()

# where to put execute method? literatureassessment, manager for dupes model


# DuplicateCandidateGroup

"""
WORKFLOW
User defines a deduper for use in an assessment
User executes a session that uses a defined deduper
Session stores the list of identified candidate duplicate groups
User resolves duplicates in a session; if group status != unresolved, it shows up on this page
Perhaps a separate session page of resolved groups? i.e. an "in progress" list view and a "done" list view
Multiple resolutions at once? Or more like the screen page in LLR where it's do one, click for next (look at conflict resolution)
Should this workflow do anything proactive? i.e. let's say a candidate group is identified as a false positive, is it a big deal if it shows up again if a user executes another session w/ the same settings? (yes)
Single user, right? Not like conflict resolution? THIS IS CORRECT
Do we want this workflow to also happen on import? That would look slightly different
Though maybe we could just have it happen automatically AFTER import; that way it would use the same workflow
If used on import, do we add a "choose a deduper" option to the created search? Or maybe a "default" attribute on the deduper, and whichever one is "default" is used?
Each assessment has an undeletable "default" deduper; maybe add a no-op setting choice for the deduper for people who don't want it running on imports?
"""



reversion.register(LiteratureAssessment)
reversion.register(Search)
Expand Down
32 changes: 17 additions & 15 deletions hawc/apps/lit/templates/lit/_duplicate_candidates.html
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
{# Resolution form for one duplicate candidate group; the wrapper is removed once the POST succeeds. #}
<div hx-target="this" hx-swap="delete swap:1s">
  <form hx-post="{% url 'lit:api:duplicate-resolve-duplicate' object.pk %}">
    <div>
      <label>
        {# "required" on the radio group blocks submitting with no choice selected #}
        <input type="radio" id="none-{{object.pk}}" name="resolution" value="none" required />
        No duplicates
      </label>
    </div>
    {% for candidate in object.candidates.all %}
      <div>
        <input type="radio" id="primary-{{object.pk}}-{{candidate.pk}}" name="resolution" value="{{candidate.pk}}" required />
        <label for="primary-{{object.pk}}-{{candidate.pk}}" style="display:inline-block; vertical-align: top;">
          {% include 'lit/_reference_with_tags.html' with ref=candidate %}
        </label>
      </div>
    {% endfor %}
    <textarea name="notes" placeholder="Notes"></textarea>
    <button type="submit">Resolve</button>
  </form>
</div>
2 changes: 1 addition & 1 deletion hawc/apps/lit/templates/lit/duplicate_candidates.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ <h2>Duplicate candidates</h2>
</div>
<ul class="list-group list-group-flush my-3">
{% for object in object_list %}
{% include 'lit/_duplicate_candidates.html' %}
{% include 'lit/_duplicate_candidates.html' %}
{% endfor %}
</ul>
{% include "includes/paginator.html" with plural_object_name="duplicate groups" %}
Expand Down
30 changes: 15 additions & 15 deletions hawc/apps/lit/templates/lit/duplicate_candidates_2.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,21 @@ <h2>Resolved duplicates</h2>
</div>
<ul class="list-group list-group-flush my-3">
{% for object in object_list %}
  {# One entry per duplicate group: id, resolution label, candidates, primary, notes #}
  <div>
    Group {{object.pk}}
  </div>
  <div>
    {{object.get_resolution_display}}
  </div>
  <div>
    {# candidates is a many-to-many relation, so list its members rather than the manager #}
    {{object.candidates.all|join:", "}}
  </div>
  <div>
    {{object.primary}}
  </div>
  <div>
    {{object.notes}}
  </div>
{% endfor %}
</ul>
{% include "includes/paginator.html" with plural_object_name="duplicate groups" %}
Expand Down
Loading

0 comments on commit 2ac87e6

Please sign in to comment.