Skip to content

Commit

Permalink
Add new sourceOnly parameter to duplicates API, see #1380
Browse files Browse the repository at this point in the history
  • Loading branch information
mdoering committed Nov 26, 2024
1 parent c6516dc commit fa77b14
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 22 deletions.
15 changes: 10 additions & 5 deletions dao/src/main/java/life/catalogue/dao/DuplicateDao.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ public static class DuplicateRequest {
Integer sourceDatasetKey;
// optional sector to restrict the duplicates to a single sector. Requires datasetKey to be a project or release
Integer sectorKey;
// optional restriction to require ALL records of a duplicate to come from a given source or sector.
// requires sourceDatasetKey or sectorKey to be present
Boolean sourceOnly;
// optional restriction to uni/bi/trinomials only
NameCategory category;
// optional restriction on ranks
Set<Rank> ranks;
Expand All @@ -99,6 +101,7 @@ public static class DuplicateRequest {
* @param datasetKey the dataset to be analyzed
* @param sourceDatasetKey the source dataset within a project to analyze. Requires datasetKey to be a project or release
* @param sectorKey optional sector to restrict the duplicates to a single sector. Requires datasetKey to be a project or release
* @param sourceOnly optional restriction to require ALL records of a duplicate to come from a given source or sector. Requires sourceDatasetKey or sectorKey to be present.
* @param category optional restriction to uni/bi/trinomials only
* @param ranks optional restriction on ranks
* @param status optional restriction on usage status
Expand All @@ -117,6 +120,7 @@ public DuplicateRequest(@QueryParam("entity") EntityType entity,
@PathParam("key") int datasetKey,
@QueryParam("sourceDatasetKey") Integer sourceDatasetKey,
@QueryParam("sectorKey") Integer sectorKey,
@QueryParam("sourceOnly") Boolean sourceOnly,
@QueryParam("category") NameCategory category,
@QueryParam("rank") Set<Rank> ranks,
@QueryParam("status") Set<TaxonomicStatus> status,
Expand All @@ -132,6 +136,7 @@ public DuplicateRequest(@QueryParam("entity") EntityType entity,
this.datasetKey = datasetKey;
this.sourceDatasetKey = sourceDatasetKey;
this.sectorKey = sectorKey;
this.sourceOnly = sourceOnly;
this.category = category;
this.ranks = ranks;
this.status = status;
Expand Down Expand Up @@ -191,8 +196,8 @@ public int count(DuplicateRequest req) {
return mapper.countNames(req.mode, req.query, req.minSize, req.datasetKey, req.category, req.ranks, req.authorshipDifferent, req.rankDifferent,
req.codeDifferent);
} else {
return mapper.count(req.mode, req.query, req.minSize, req.datasetKey, req.sourceDatasetKey, req.sectorKey, req.category, req.ranks, req.status,
req.authorshipDifferent, req.acceptedDifferent, req.rankDifferent, req.codeDifferent, req.withDecision, req.projectKey);
return mapper.count(req.mode, req.query, req.minSize, req.datasetKey, req.sourceDatasetKey, req.sectorKey, req.sourceOnly, req.category, req.ranks,
req.status, req.authorshipDifferent, req.acceptedDifferent, req.rankDifferent, req.codeDifferent, req.withDecision, req.projectKey);
}
}
}
Expand Down Expand Up @@ -256,8 +261,8 @@ private List<Duplicate> findOrList(DuplicateRequest req, @Nullable Page page) {
dupsTmp = mapper.duplicateNames(req.mode, req.query, req.minSize, req.datasetKey, req.category, req.ranks, req.authorshipDifferent, req.rankDifferent,
req.codeDifferent, page);
} else {
dupsTmp = mapper.duplicates(req.mode, req.query, req.minSize, req.datasetKey, req.sourceDatasetKey, req.sectorKey, req.category, req.ranks, req.status,
req.authorshipDifferent, req.acceptedDifferent, req.rankDifferent, req.codeDifferent, req.withDecision, req.projectKey, page);
dupsTmp = mapper.duplicates(req.mode, req.query, req.minSize, req.datasetKey, req.sourceDatasetKey, req.sectorKey, req.sourceOnly, req.category, req.ranks,
req.status, req.authorshipDifferent, req.acceptedDifferent, req.rankDifferent, req.codeDifferent, req.withDecision, req.projectKey, page);
}
if (dupsTmp.isEmpty()) {
return Collections.EMPTY_LIST;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ List<Duplicate.Mybatis> duplicates(@Param("mode") MatchingMode mode,
@Param("datasetKey") int datasetKey,
@Param("sourceDatasetKey") Integer sourceDatasetKey,
@Param("sectorKey") Integer sectorKey,
@Param("sourceOnly") Boolean sourceOnly,
@Param("category") NameCategory category,
@Param("ranks") Set<Rank> ranks,
@Param("status") Set<TaxonomicStatus> status,
Expand All @@ -78,6 +79,7 @@ Integer count(@Param("mode") MatchingMode mode,
@Param("datasetKey") int datasetKey,
@Param("sourceDatasetKey") Integer sourceDatasetKey,
@Param("sectorKey") Integer sectorKey,
@Param("sourceOnly") Boolean sourceOnly,
@Param("category") NameCategory category,
@Param("ranks") Set<Rank> ranks,
@Param("status") Set<TaxonomicStatus> status,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,14 @@
<if test="status != null and !status.isEmpty()">
AND u.status IN (<foreach collection="status" item="st" separator=",">#{st}::TAXONOMICSTATUS</foreach>)
</if>
<if test="sectorKey != null and sourceOnly != null and sourceOnly">
<!-- all names must come from the requested sector -->
AND u.sector_key = #{sectorKey}
</if>
<if test="sourceDatasetKey != null and sourceOnly != null and sourceOnly">
<!-- all names must come from the requested source dataset -->
AND s.subject_dataset_key = #{sourceDatasetKey}
</if>
</where>
GROUP BY
<include refid="keyCol"/>
Expand All @@ -217,11 +225,11 @@
<!-- make sure all requested status are covered at least once -->
AND array_agg(u.status) @> array[<foreach collection="status" item="st" separator=",">#{st}::TAXONOMICSTATUS</foreach>]
</if>
<if test="sectorKey != null">
<if test="sectorKey != null and (sourceOnly == null or !sourceOnly)">
<!-- make sure at least one name comes from the requested sector -->
AND array_agg(u.sector_key) @> array[${sectorKey}]
</if>
<if test="sourceDatasetKey != null">
<if test="sourceDatasetKey != null and (sourceOnly == null or !sourceOnly)">
<!-- make sure at least one name comes from the requested source dataset -->
AND array_agg(s.subject_dataset_key) @> array[${sourceDatasetKey}]
</if>
Expand Down
6 changes: 3 additions & 3 deletions dao/src/test/java/life/catalogue/dao/DuplicateDaoTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public void duplicatesIAE() {
dm.list(new Page(1000)).forEach(d -> System.out.println(d.getKey() + " -> " + d.getOrigin()));
}
// no catalogue/project given but filtering decisions
var req = new DuplicateDao.DuplicateRequest(EntityType.NAME_USAGE, MatchingMode.STRICT, null, null, 1001, null, null, null, null, null, null, null, null, null, true, null);
var req = new DuplicateDao.DuplicateRequest(EntityType.NAME_USAGE, MatchingMode.STRICT, null, null, 1001, null, null, null, null, null, null, null, null, null, null, true, null);
dao.page(req, null);
}

Expand Down Expand Up @@ -219,7 +219,7 @@ private List<Duplicate> find(MatchingMode mode, Integer minSize, int datasetKey,
} else {
watch.resume();
}
var req = new DuplicateDao.DuplicateRequest(EntityType.NAME_USAGE, mode, null, minSize, datasetKey, sourceDatasetKey, null, category, ranks, status,
var req = new DuplicateDao.DuplicateRequest(EntityType.NAME_USAGE, mode, null, minSize, datasetKey, sourceDatasetKey, null, null, category, ranks, status,
authorshipDifferent, acceptedDifferent, null, null, withDecision, Datasets.COL);
ResultPage<Duplicate> result = dao.page(req, page);
watch.suspend();
Expand All @@ -233,7 +233,7 @@ private List<Duplicate> findNames(MatchingMode mode, Integer minSize, int datase
watch.resume();
}

var req = new DuplicateDao.DuplicateRequest(EntityType.NAME, mode, null, minSize, datasetKey, null, null, category, ranks,
var req = new DuplicateDao.DuplicateRequest(EntityType.NAME, mode, null, minSize, datasetKey, null, null, null, category, ranks,
null, authorshipDifferent, null, null, null, null, Datasets.COL);
ResultPage<Duplicate> result = dao.page(req, page);
watch.suspend();
Expand Down
34 changes: 22 additions & 12 deletions dao/src/test/java/life/catalogue/db/mapper/DuplicateMapperTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,51 +158,61 @@ public void usagesByIds() {
public void duplicates() {
Set<TaxonomicStatus> status = new HashSet<>();
status.add(TaxonomicStatus.PROVISIONALLY_ACCEPTED);
List<Duplicate.Mybatis> dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, NameCategory.BINOMIAL,
List<Duplicate.Mybatis> dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
Sets.newHashSet(Rank.SPECIES), status, false, null, null, null, false, Datasets.COL,
new Page(0, 2));
assertEquals(2, dups.size());
for (Duplicate.Mybatis d : dups) {
assertFalse(d.getUsages().isEmpty());
assertNotNull(d.getKey());
}
assertEquals((Integer) 3, mapper.count(MatchingMode.STRICT, null, 2, datasetKey, null, null, NameCategory.BINOMIAL,
assertEquals((Integer) 3, mapper.count(MatchingMode.STRICT, null, 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
Sets.newHashSet(Rank.SPECIES), status, false, null, null, null, false, Datasets.COL));

// all accepted, so not different
// https://github.com/Sp2000/colplus-backend/issues/456
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, NameCategory.BINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
Sets.newHashSet(Rank.SPECIES), status, false, true, null, null, false, Datasets.COL,
new Page(0, 2));
assertEquals(2, dups.size());
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, NameCategory.BINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
Sets.newHashSet(Rank.SPECIES), status, false, false, null, null, false, Datasets.COL,
new Page(0, 2));
assertEquals(0, dups.size());
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, NameCategory.BINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
Sets.newHashSet(Rank.SPECIES), null, null, false, null, null, null, Datasets.COL,
new Page(0, 2));
assertEquals(1, dups.size());
assertEquals("achillea nigra", dups.get(0).getKey());

// https://github.com/Sp2000/colplus-backend/issues/457
// Aspidoscelis deppii subsp. schizophorus
dups = mapper.duplicates(MatchingMode.STRICT, null, 3, datasetKey, null, null, NameCategory.TRINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, null, 3, datasetKey, null, null, false, NameCategory.TRINOMIAL,
Sets.newHashSet(Rank.SUBSPECIES), null, true, null, null, null, null, Datasets.COL,
new Page(0, 5));
assertEquals(1, dups.size());

dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, 999, null, null,
dups = mapper.duplicates(MatchingMode.STRICT, null, 3, datasetKey, null, null, true, NameCategory.TRINOMIAL,
Sets.newHashSet(Rank.SUBSPECIES), null, true, null, null, null, null, Datasets.COL,
new Page(0, 5));
assertEquals(1, dups.size());

dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, 999, null, null, null,
null, null, true, null, null, null, null, null,
new Page(0, 5));
assertEquals(0, dups.size());

dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, null, 999, null, null,
null, null, true, null, null, null, null, null,
new Page(0, 5));
assertEquals(0, dups.size());

dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, null, 999, null,
dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, 999, 999, null, null,
null, null, true, null, null, null, null, null,
new Page(0, 5));
assertEquals(0, dups.size());

dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, 999, 999, null,
dups = mapper.duplicates(MatchingMode.FUZZY, null, 2, datasetKey, 999, 999, true, null,
null, null, true, null, null, null, null, null,
new Page(0, 5));
assertEquals(0, dups.size());
Expand Down Expand Up @@ -243,19 +253,19 @@ public void duplicateNames() {

// https://github.com/Sp2000/colplus-backend/issues/457
// Achillea asplenifolia
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, NameCategory.BINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, null, 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
Sets.newHashSet(Rank.SPECIES_AGGREGATE), null, true, null, null, null, null, Datasets.COL,
new Page(0, 5));
assertEquals(1, dups.size());

// Achillea
dups = mapper.duplicates(MatchingMode.STRICT, "Achillea", 2, datasetKey, null, null, NameCategory.BINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, "Achillea", 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
null, null, true, null, null, null, null, Datasets.COL,
new Page(0, 5));
assertEquals(2, dups.size());

// Achillea asplenifolia
dups = mapper.duplicates(MatchingMode.STRICT, "Achillea asp", 2, datasetKey, null, null, NameCategory.BINOMIAL,
dups = mapper.duplicates(MatchingMode.STRICT, "Achillea asp", 2, datasetKey, null, null, null, NameCategory.BINOMIAL,
null, null, true, null, null, null, null, Datasets.COL,
new Page(0, 5));
assertEquals(1, dups.size());
Expand Down

0 comments on commit fa77b14

Please sign in to comment.