diff --git a/api/src/main/java/life/catalogue/api/model/FormattableName.java b/api/src/main/java/life/catalogue/api/model/FormattableName.java index a1980b8c7..f51d7d328 100644 --- a/api/src/main/java/life/catalogue/api/model/FormattableName.java +++ b/api/src/main/java/life/catalogue/api/model/FormattableName.java @@ -14,6 +14,8 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.Lists; +import org.gbif.nameparser.api.NameType; + /** * Most of the Name class with all getters needed to format a Name using the NameFormatter. */ @@ -34,6 +36,8 @@ default Boolean isOriginalSpelling() { String getUnparsed(); + NameType getType(); + /** * @return the terminal epithet. Infraspecific epithet if existing, the species epithet or null */ @@ -123,6 +127,6 @@ default List nameParts() { */ @JsonIgnore default String getScientificNameNormalized() { - return SciNameNormalizer.normalize(getScientificName()); + return SciNameNormalizer.normalize(getScientificName(), getType()); } } diff --git a/api/src/main/java/life/catalogue/api/model/IndexName.java b/api/src/main/java/life/catalogue/api/model/IndexName.java index 823e64827..975082f91 100644 --- a/api/src/main/java/life/catalogue/api/model/IndexName.java +++ b/api/src/main/java/life/catalogue/api/model/IndexName.java @@ -4,10 +4,7 @@ import life.catalogue.common.tax.NameFormatter; import life.catalogue.common.text.StringUtils; -import org.gbif.nameparser.api.Authorship; -import org.gbif.nameparser.api.NamePart; -import org.gbif.nameparser.api.NomCode; -import org.gbif.nameparser.api.Rank; +import org.gbif.nameparser.api.*; import java.util.Objects; @@ -38,6 +35,7 @@ public class IndexName extends DataEntity implements FormattableName { private String authorship; @Nonnull private Rank rank; + private NameType type; private String uninomial; private String genus; private String infragenericEpithet; // we only use this for true infrageneric names, not bi-/trinomials! 
@@ -60,6 +58,7 @@ public IndexName(IndexName other) { this.scientificName = other.scientificName; this.authorship = other.authorship; this.rank = other.rank; + this.type = other.type; this.uninomial = other.uninomial; this.genus = other.genus; this.infragenericEpithet = other.infragenericEpithet; @@ -83,6 +82,7 @@ public IndexName(Name n) { this.scientificName = n.getScientificName(); this.authorship = n.getAuthorship(); setRank(n.getRank()); + this.type = n.getType(); this.uninomial = n.getUninomial(); this.genus = n.getGenus(); this.infragenericEpithet = n.getInfragenericEpithet(); @@ -112,6 +112,7 @@ public IndexName(Name n, int key) { public static IndexName newCanonical(IndexName n) { IndexName cn = new IndexName(); cn.setRank(CANONICAL_RANK); + cn.setType(n.getType()); // we keep a canonical infrageneric name in uninomial and ignore its genus placement! if (n.getInfragenericEpithet() != null && n.isInfrageneric()) { cn.setUninomial(n.getInfragenericEpithet()); @@ -293,6 +294,15 @@ public String getUnparsed() { return null; } + @Override + public NameType getType() { + return type; + } + + public void setType(NameType type) { + this.type = type; + } + public void setCultivarEpithet(String cultivarEpithet) { this.cultivarEpithet = cultivarEpithet; } @@ -377,29 +387,15 @@ String scientificNameHtml(){ @Override public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof IndexName)) return false; + if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; IndexName indexName = (IndexName) o; - return Objects.equals(key, indexName.key) && - Objects.equals(canonicalId, indexName.canonicalId) && - scientificName.equals(indexName.scientificName) && - Objects.equals(authorship, indexName.authorship) && - rank == indexName.rank && - Objects.equals(uninomial, indexName.uninomial) && - Objects.equals(genus, indexName.genus) && - Objects.equals(infragenericEpithet, indexName.infragenericEpithet) && - 
Objects.equals(specificEpithet, indexName.specificEpithet) && - Objects.equals(infraspecificEpithet, indexName.infraspecificEpithet) && - Objects.equals(cultivarEpithet, indexName.cultivarEpithet) && - Objects.equals(combinationAuthorship, indexName.combinationAuthorship) && - Objects.equals(basionymAuthorship, indexName.basionymAuthorship) && - Objects.equals(sanctioningAuthor, indexName.sanctioningAuthor); + return Objects.equals(key, indexName.key) && Objects.equals(canonicalId, indexName.canonicalId) && Objects.equals(scientificName, indexName.scientificName) && Objects.equals(authorship, indexName.authorship) && rank == indexName.rank && type == indexName.type && Objects.equals(uninomial, indexName.uninomial) && Objects.equals(genus, indexName.genus) && Objects.equals(infragenericEpithet, indexName.infragenericEpithet) && Objects.equals(specificEpithet, indexName.specificEpithet) && Objects.equals(infraspecificEpithet, indexName.infraspecificEpithet) && Objects.equals(cultivarEpithet, indexName.cultivarEpithet) && Objects.equals(combinationAuthorship, indexName.combinationAuthorship) && Objects.equals(basionymAuthorship, indexName.basionymAuthorship) && Objects.equals(sanctioningAuthor, indexName.sanctioningAuthor); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), key, canonicalId, scientificName, authorship, rank, uninomial, genus, infragenericEpithet, specificEpithet, infraspecificEpithet, cultivarEpithet, combinationAuthorship, basionymAuthorship, sanctioningAuthor); + return Objects.hash(super.hashCode(), key, canonicalId, scientificName, authorship, rank, type, uninomial, genus, infragenericEpithet, specificEpithet, infraspecificEpithet, cultivarEpithet, combinationAuthorship, basionymAuthorship, sanctioningAuthor); } @Override @@ -413,6 +409,8 @@ public String toString() { sb.append(" [CANONICAL]"); } else { sb.append(getLabelWithRank()); + sb.append(" cid="); + sb.append(getCanonicalId()); } return sb.toString(); } diff --git 
a/api/src/main/java/life/catalogue/common/io/TabReader.java b/api/src/main/java/life/catalogue/common/io/TabReader.java index 71aa6a265..163ea1c48 100644 --- a/api/src/main/java/life/catalogue/common/io/TabReader.java +++ b/api/src/main/java/life/catalogue/common/io/TabReader.java @@ -26,6 +26,10 @@ public static TabReader csv(Reader reader, int skip) throws IOException { return csv(new ReaderInputStream(reader, StandardCharsets.UTF_8), StandardCharsets.UTF_8, skip, 2); } + public static TabReader csv(File file, int skip) throws IOException { + return csv(file, StandardCharsets.UTF_8, skip, 2); + } + public static TabReader csv(File file, Charset charset, int skip) throws IOException { return csv(file, charset, skip, 2); } @@ -42,6 +46,10 @@ public static TabReader tab(Reader reader, int skip) throws IOException { return tab(new ReaderInputStream(reader, StandardCharsets.UTF_8), StandardCharsets.UTF_8, skip, 2); } + public static TabReader tab(File file, int skip) throws IOException { + return tab(file, StandardCharsets.UTF_8, skip, 2); + } + public static TabReader tab(File file, Charset charset, int skip) throws IOException { return tab(file, charset, skip, 2); } diff --git a/api/src/main/java/life/catalogue/common/tax/SciNameNormalizer.java b/api/src/main/java/life/catalogue/common/tax/SciNameNormalizer.java index 5f0966c1f..acc952674 100644 --- a/api/src/main/java/life/catalogue/common/tax/SciNameNormalizer.java +++ b/api/src/main/java/life/catalogue/common/tax/SciNameNormalizer.java @@ -1,5 +1,7 @@ package life.catalogue.common.tax; +import org.gbif.nameparser.api.NameType; + import java.util.regex.Pattern; import static org.apache.commons.lang3.StringUtils.trimToNull; @@ -96,50 +98,35 @@ public static String normalizeWhitespaceAndPunctuation(String s) { * The return will be a strictly ASCII encoded string. 
*/ public static String normalize(String s) { - return normalize(s, false, true); + return normalize(s, null, true); } - - /** - * Normalizes an entire name string including monomials and genus parts of a name. - */ - public static String normalizeAll(String s) { - return normalize(s, true, true); + + public static String normalize(String s, NameType type) { + return normalize(s, type, true); } - - private static String normalize(String s, boolean normMonomials, boolean stemming) { + + private static String normalize(String s, NameType type, boolean stemEpithets) { if (!hasContent(s)) return ""; s = normalizedAscii(s); // Remove a hybrid cross, or a likely hybrid cross. s = removeHybridMarker(s); - - // Only for bi/trinomials, otherwise we mix up ranks. - if (normMonomials) { - s = normStrongly(s, stemming); - - } else if (s.indexOf(' ') > 2) { - String[] parts = s.split(" +"); - StringBuilder sb = new StringBuilder(); - sb.append(parts[0]); - for (int i = 1; i < parts.length; i++) { - sb.append(" "); - if (Character.isLowerCase(parts[i].charAt(0))) { - sb.append(normStrongly(parts[i], stemming)); - } else { - sb.append(parts[i]); - } + + // correct common misspellings + if (type != null && type.isParsable()) { + s = normSpellings(s); + + // apply stemming only for epithets, never monomials! 
+ if (stemEpithets && s.indexOf(' ') > 2) { + s = stemEpithet(s); } - s = sb.toString(); } - + return s.trim(); } - - private static String normStrongly(String s, boolean stemming) { - if (stemming) { - s = stemEpithet(s); - } + + private static String normSpellings(String s) { // normalize frequent variations of i s = i.matcher(s).replaceAll("i"); // remove repeated letters→leters in binomials @@ -152,7 +139,7 @@ private static String normStrongly(String s, boolean stemming) { * Stems and normalizes some few, but frequent misspellings */ public static String normalizeEpithet(String epithet) { - return normStrongly(epithet, true); + return stemEpithet(normSpellings(epithet)); } /** diff --git a/api/src/test/java/life/catalogue/common/tax/SciNameNormalizerTest.java b/api/src/test/java/life/catalogue/common/tax/SciNameNormalizerTest.java index f5fb7f296..3f352e7f9 100644 --- a/api/src/test/java/life/catalogue/common/tax/SciNameNormalizerTest.java +++ b/api/src/test/java/life/catalogue/common/tax/SciNameNormalizerTest.java @@ -1,11 +1,16 @@ package life.catalogue.common.tax; +import life.catalogue.common.io.TabReader; + import org.junit.Test; +import java.io.*; + import static org.junit.Assert.assertEquals; public class SciNameNormalizerTest { + @Test public void removeHybridMarker() throws Exception { assertEquals("Abies", SciNameNormalizer.removeHybridMarker("Abies")); @@ -18,7 +23,7 @@ public void removeHybridMarker() throws Exception { public void testNormalize() throws Exception { assertEquals("", SciNameNormalizer.normalize("")); assertEquals("Abies", SciNameNormalizer.normalize("Abies ")); - assertEquals("Abiies", SciNameNormalizer.normalize("Abiies ")); + assertEquals("Abies", SciNameNormalizer.normalize("Abiies ")); assertEquals("Abyes", SciNameNormalizer.normalize("Abyes ")); assertEquals("Abyes alb", SciNameNormalizer.normalize("Abyes albus")); assertEquals("Abyes albiet", SciNameNormalizer.normalize("Abyes albieta")); @@ -63,41 +68,41 @@ public void 
testNormalize() throws Exception { @Test public void testNormalizeAll() throws Exception { - assertEquals("", SciNameNormalizer.normalizeAll("")); - assertEquals("Abies", SciNameNormalizer.normalizeAll("Abies ")); - assertEquals("Abies", SciNameNormalizer.normalizeAll("Abiies ")); - assertEquals("Abies", SciNameNormalizer.normalizeAll("Abyes ")); - assertEquals("Abies alb", SciNameNormalizer.normalizeAll("Abyes albus")); - assertEquals("Abies albiet", SciNameNormalizer.normalizeAll("Abyes albieta")); - assertEquals("Abies albiet", SciNameNormalizer.normalizeAll("Abies albijeta")); - assertEquals("Abies albiet", SciNameNormalizer.normalizeAll("Abies albyeta")); - assertEquals("Abies alb", SciNameNormalizer.normalizeAll(" \txAbies × ållbbus\t")); - - assertEquals("Abies alb", SciNameNormalizer.normalizeAll(" \txAbies × ållbbus\t")); - assertEquals("Rachis takt", SciNameNormalizer.normalizeAll("Rhachis taktos")); - - assertEquals("Hieracium sabaud", SciNameNormalizer.normalizeAll("Hieracium sabaudum")); - assertEquals("Hieracium scorzoneraefoli", SciNameNormalizer.normalizeAll("Hieracium scorzoneræfolium")); - assertEquals("Hieracium scorzonerifoli", SciNameNormalizer.normalizeAll("Hieracium scorzonerifolium")); - assertEquals("Macrozamia platirach", SciNameNormalizer.normalizeAll("Macrozamia platyrachis")); - assertEquals("Macrozamia platirach", SciNameNormalizer.normalizeAll("Macrozamia platyrhachis")); - assertEquals("Cicas circinal", SciNameNormalizer.normalizeAll("Cycas circinalis")); - assertEquals("Cicas circinal", SciNameNormalizer.normalizeAll("Cycas circinnalis")); - assertEquals("Isolona perier", SciNameNormalizer.normalizeAll("Isolona perieri")); - assertEquals("Isolona perier", SciNameNormalizer.normalizeAll("Isolona perrieri")); - assertEquals("Isolona perier", SciNameNormalizer.normalizeAll("Isolona perrierii")); - - assertEquals("Carex caiouet", SciNameNormalizer.normalizeAll("Carex ×cayouettei")); - assertEquals("Platanus hispanic", 
SciNameNormalizer.normalizeAll("Platanus x hispanica")); + assertEquals("", SciNameNormalizer.normalize("")); + assertEquals("Abies", SciNameNormalizer.normalize("Abies ")); + assertEquals("Abies", SciNameNormalizer.normalize("Abiies ")); + assertEquals("Abies", SciNameNormalizer.normalize("Abyes ")); + assertEquals("Abies alb", SciNameNormalizer.normalize("Abyes albus")); + assertEquals("Abies albiet", SciNameNormalizer.normalize("Abyes albieta")); + assertEquals("Abies albiet", SciNameNormalizer.normalize("Abies albijeta")); + assertEquals("Abies albiet", SciNameNormalizer.normalize("Abies albyeta")); + assertEquals("Abies alb", SciNameNormalizer.normalize(" \txAbies × ållbbus\t")); + + assertEquals("Abies alb", SciNameNormalizer.normalize(" \txAbies × ållbbus\t")); + assertEquals("Rachis takt", SciNameNormalizer.normalize("Rhachis taktos")); + + assertEquals("Hieracium sabaud", SciNameNormalizer.normalize("Hieracium sabaudum")); + assertEquals("Hieracium scorzoneraefoli", SciNameNormalizer.normalize("Hieracium scorzoneræfolium")); + assertEquals("Hieracium scorzonerifoli", SciNameNormalizer.normalize("Hieracium scorzonerifolium")); + assertEquals("Macrozamia platirach", SciNameNormalizer.normalize("Macrozamia platyrachis")); + assertEquals("Macrozamia platirach", SciNameNormalizer.normalize("Macrozamia platyrhachis")); + assertEquals("Cicas circinal", SciNameNormalizer.normalize("Cycas circinalis")); + assertEquals("Cicas circinal", SciNameNormalizer.normalize("Cycas circinnalis")); + assertEquals("Isolona perier", SciNameNormalizer.normalize("Isolona perieri")); + assertEquals("Isolona perier", SciNameNormalizer.normalize("Isolona perrieri")); + assertEquals("Isolona perier", SciNameNormalizer.normalize("Isolona perrierii")); + + assertEquals("Carex caiouet", SciNameNormalizer.normalize("Carex ×cayouettei")); + assertEquals("Platanus hispanic", SciNameNormalizer.normalize("Platanus x hispanica")); // https://github.com/gbif/checklistbank/issues/7 - 
assertEquals("Eragrostis brown", SciNameNormalizer.normalizeAll("Eragrostis brownii")); - assertEquals("Eragrostis brown", SciNameNormalizer.normalizeAll("Eragrostis brownei")); + assertEquals("Eragrostis brown", SciNameNormalizer.normalize("Eragrostis brownii")); + assertEquals("Eragrostis brown", SciNameNormalizer.normalize("Eragrostis brownei")); } @Test public void testHybridCross() throws Exception { - assertEquals("xcayouettei", SciNameNormalizer.normalize("xcayouettei")); - assertEquals("cayouettei", SciNameNormalizer.normalize("×cayouettei")); + assertEquals("xcaiouetei", SciNameNormalizer.normalize("xcayouettei")); + assertEquals("caiouetei", SciNameNormalizer.normalize("×cayouettei")); assertEquals("Carex xcaiouet", SciNameNormalizer.normalize("Carex xcayouettei")); assertEquals("Carex caiouet", SciNameNormalizer.normalize("Carex ×cayouettei")); @@ -113,8 +118,8 @@ public void testHybridCross() throws Exception { @Test public void testNonAscii() throws Exception { - assertEquals("Cem Andrexi", SciNameNormalizer.normalize("Çem Ándrexï")); - assertEquals("SOEZsoezY¥µAAAAAAAECEEEEIIIIDNOOOOOOUUUUYssaaaaaaaeceeeeiiiidnoooooouuuuyy", SciNameNormalizer.normalize("ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ")); + assertEquals("CemAndrexi", SciNameNormalizer.normalize("Çem_Ándrexï")); + assertEquals("SOEZsoezY¥µAECEIDNOUYsaeceidnoui", SciNameNormalizer.normalize("ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ")); } @Test diff --git a/dao/src/main/java/life/catalogue/es/NameStrings.java b/dao/src/main/java/life/catalogue/es/NameStrings.java index 89120aa98..e5c659476 100644 --- a/dao/src/main/java/life/catalogue/es/NameStrings.java +++ b/dao/src/main/java/life/catalogue/es/NameStrings.java @@ -10,7 +10,7 @@ import static life.catalogue.es.ddl.Analyzer.SCINAME_AUTO_COMPLETE; import static life.catalogue.es.ddl.Analyzer.SCINAME_IGNORE_CASE; -import static life.catalogue.es.nu.NameUsageWrapperConverter.normalizeWeakly; 
+import static life.catalogue.es.nu.NameUsageWrapperConverter.normalize; /** * An object embedded within the name usage document solely aimed at optimizing searchability. The name strings within this class do not @@ -44,17 +44,17 @@ public NameStrings(Name name) { } if (!StringUtils.isBlank(name.getGenus())) { genusLetter = Character.toLowerCase(name.getGenus().charAt(0)); - genusOrMonomial = getStrings(name.getGenus(), normalizeWeakly(name.getGenus())); + genusOrMonomial = getStrings(name.getGenus(), normalize(name.getGenus())); } else if (!StringUtils.isBlank(name.getUninomial())) { - genusOrMonomial = getStrings(name.getUninomial().toLowerCase(), normalizeWeakly(name.getUninomial())); + genusOrMonomial = getStrings(name.getUninomial().toLowerCase(), normalize(name.getUninomial())); } // we used to use the strong normaliser to index species/infraspecific epithets... // But that caused more problems than it helped... if (!StringUtils.isBlank(name.getSpecificEpithet())) { - specificEpithet = getStrings(name.getSpecificEpithet().toLowerCase(), normalizeWeakly(name.getSpecificEpithet())); + specificEpithet = getStrings(name.getSpecificEpithet().toLowerCase(), normalize(name.getSpecificEpithet())); } if (!StringUtils.isBlank(name.getInfraspecificEpithet())) { - infraspecificEpithet = getStrings(name.getInfraspecificEpithet().toLowerCase(), normalizeWeakly(name.getInfraspecificEpithet())); + infraspecificEpithet = getStrings(name.getInfraspecificEpithet().toLowerCase(), normalize(name.getInfraspecificEpithet())); } } diff --git a/dao/src/main/java/life/catalogue/es/nu/FuzzyMatcher.java b/dao/src/main/java/life/catalogue/es/nu/FuzzyMatcher.java index 3c26e9983..b62acf711 100644 --- a/dao/src/main/java/life/catalogue/es/nu/FuzzyMatcher.java +++ b/dao/src/main/java/life/catalogue/es/nu/FuzzyMatcher.java @@ -4,8 +4,7 @@ import life.catalogue.es.query.BoolQuery; import life.catalogue.es.query.Query; -import static 
life.catalogue.es.nu.NameUsageWrapperConverter.normalizeStrongly; -import static life.catalogue.es.nu.NameUsageWrapperConverter.normalizeWeakly; +import static life.catalogue.es.nu.NameUsageWrapperConverter.normalize; /** * Abstract base class for fuzzy matching. @@ -20,10 +19,10 @@ abstract class FuzzyMatcher extends QMatcher implements MatcherMixIn { @Override Query matchAsMonomial() { String[] terms = request.getSciNameSearchTerms(); - String termWN = normalizeWeakly(terms[0]); + String termWN = normalize(terms[0]); // we used to use the strongly normalised terms to index/query species/infraspecific epithets. // But that caused more problems than it helped... - String termSN = normalizeStrongly(terms[0]); + String termSN = NameUsageWrapperConverter.normalize(terms[0]); return sciNameBaseQuery() .subquery(new BoolQuery() // Prefer subspecies over species and species over genera .should(matchAsEpithet(FLD_SUBSPECIES, termWN).withBoost(1.2)) @@ -34,10 +33,10 @@ Query matchAsMonomial() { @Override Query matchAsBinomial() { String[] terms = request.getSciNameSearchTerms(); - String term0WN = normalizeWeakly(terms[0]); - String term0SN = normalizeStrongly(terms[0]); - String term1WN = normalizeWeakly(terms[1]); - String term1SN = normalizeStrongly(terms[1]); + String term0WN = normalize(terms[0]); + String term0SN = NameUsageWrapperConverter.normalize(terms[0]); + String term1WN = normalize(terms[1]); + String term1SN = NameUsageWrapperConverter.normalize(terms[1]); return sciNameBaseQuery() .subquery(new BoolQuery() .must(matchAsGenericEpithet(term0WN)) @@ -64,9 +63,9 @@ Query matchAsBinomial() { @Override Query matchAsTrinomial() { String[] terms = request.getSciNameSearchTerms(); - String term0WN = normalizeWeakly(terms[0]); - String term1SN = normalizeStrongly(terms[1]); - String term2SN = normalizeStrongly(terms[2]); + String term0WN = normalize(terms[0]); + String term1SN = NameUsageWrapperConverter.normalize(terms[1]); + String term2SN = 
NameUsageWrapperConverter.normalize(terms[2]); return sciNameBaseQuery() .subquery(new BoolQuery() .must(matchAsGenericEpithet(term0WN)) diff --git a/dao/src/main/java/life/catalogue/es/nu/NameUsageWrapperConverter.java b/dao/src/main/java/life/catalogue/es/nu/NameUsageWrapperConverter.java index 55e6b2cfa..43a3b995e 100644 --- a/dao/src/main/java/life/catalogue/es/nu/NameUsageWrapperConverter.java +++ b/dao/src/main/java/life/catalogue/es/nu/NameUsageWrapperConverter.java @@ -67,25 +67,15 @@ public static NameUsageWrapper decode(String payload) throws IOException { } /** - * Provides a weakly normalized version of the provided string. Used to index generic epithets. See {@link NameStrings}. + * Provides a normalized version of the provided string. Used to index generic epithets. See {@link NameStrings}. */ - public static String normalizeWeakly(String s) { + public static String normalize(String s) { if (s == null) { return null; } return SciNameNormalizer.normalize(s.toLowerCase()); } - /** - * Provides a strongly normalized version of the provided string. Used to index specific epithets and infraspecific epithets. - */ - public static String normalizeStrongly(String s) { - if (s == null) { - return null; - } - return SciNameNormalizer.normalizeAll(s.toLowerCase()); - } - /** * Extracts the classification from the provided document. * diff --git a/dao/src/main/java/life/catalogue/es/nu/SimpleMatcher.java b/dao/src/main/java/life/catalogue/es/nu/SimpleMatcher.java index dc3123a5c..48c4849f5 100644 --- a/dao/src/main/java/life/catalogue/es/nu/SimpleMatcher.java +++ b/dao/src/main/java/life/catalogue/es/nu/SimpleMatcher.java @@ -5,7 +5,7 @@ import life.catalogue.es.query.BoolQuery; import life.catalogue.es.query.Query; -import static life.catalogue.es.nu.NameUsageWrapperConverter.normalizeWeakly; +import static life.catalogue.es.nu.NameUsageWrapperConverter.normalize; /** * Abstract base class for non-fuzzy matching. 
Search terms are not normalized, so they can only hit the non-normalized versions of the @@ -22,7 +22,7 @@ abstract class SimpleMatcher extends QMatcher implements MatcherMixIn { @Override Query matchAsMonomial() { String[] terms = request.getSciNameSearchTerms(); - String term0 = normalizeWeakly(terms[0]); + String term0 = normalize(terms[0]); return sciNameBaseQuery() .subquery(new BoolQuery() // Prefer genus over species over subspecies .should(matchAsEpithet(FLD_SUBSPECIES, term0).withBoost(1.0)) @@ -33,8 +33,8 @@ Query matchAsMonomial() { @Override Query matchAsBinomial() { String[] terms = request.getSciNameSearchTerms(); - String term0 = normalizeWeakly(terms[0]); - String term1 = normalizeWeakly(terms[1]); + String term0 = normalize(terms[0]); + String term1 = normalize(terms[1]); return sciNameBaseQuery() .subquery(new BoolQuery() .must(matchAsGenericEpithet(term0)) diff --git a/dao/src/main/java/life/catalogue/es/nu/search/NameUsageHighlighter.java b/dao/src/main/java/life/catalogue/es/nu/search/NameUsageHighlighter.java index dfd027688..ab0acd7a4 100644 --- a/dao/src/main/java/life/catalogue/es/nu/search/NameUsageHighlighter.java +++ b/dao/src/main/java/life/catalogue/es/nu/search/NameUsageHighlighter.java @@ -5,6 +5,8 @@ import life.catalogue.api.search.NameUsageSearchResponse; import life.catalogue.api.search.NameUsageWrapper; +import life.catalogue.es.nu.NameUsageWrapperConverter; + import org.gbif.nameparser.api.Authorship; import java.util.Set; @@ -17,8 +19,7 @@ import static life.catalogue.api.search.NameUsageSearchRequest.SearchContent.AUTHORSHIP; import static life.catalogue.api.search.NameUsageSearchRequest.SearchContent.SCIENTIFIC_NAME; import static life.catalogue.common.collection.CollectionUtils.isEmpty; -import static life.catalogue.es.nu.NameUsageWrapperConverter.normalizeStrongly; -import static life.catalogue.es.nu.NameUsageWrapperConverter.normalizeWeakly; +import static life.catalogue.es.nu.NameUsageWrapperConverter.normalize; /* * A 
DIY highlighter we use in stead of Elasticsearch's highlight capabilities. @@ -54,8 +55,8 @@ class NameUsageHighlighter { pattern = Pattern.compile(Pattern.quote(request.getQ().toLowerCase())); } if (sc.contains(SCIENTIFIC_NAME)) { - String qWN = normalizeWeakly(request.getQ()); - String qSN = normalizeStrongly(request.getQ()); + String qWN = normalize(request.getQ()); + String qSN = NameUsageWrapperConverter.normalize(request.getQ()); patternWN = Pattern.compile(Pattern.quote(qWN)); patternSN = qWN.equals(qSN) ? null : Pattern.compile(Pattern.quote(qSN)); } @@ -89,11 +90,11 @@ private void highlightAuthorShip(NameUsageWrapper nuw) { private void highlightScientificName(NameUsageWrapper nuw) { String original = nuw.getUsage().getName().getScientificName(); - Matcher matcher = patternWN.matcher(normalizeWeakly(original)); + Matcher matcher = patternWN.matcher(normalize(original)); String highlighted = highlight(original, matcher); if (highlighted.length() == original.length() && patternSN != null) { // Then no highlighting took place; let's try with the strongly normalized name - matcher = patternSN.matcher(normalizeStrongly(original)); + matcher = patternSN.matcher(NameUsageWrapperConverter.normalize(original)); highlighted = highlight(original, matcher); } nuw.getUsage().getName().setScientificName(highlighted); diff --git a/dao/src/main/java/life/catalogue/matching/nidx/NameIndexChronicleStore.java b/dao/src/main/java/life/catalogue/matching/nidx/NameIndexChronicleStore.java index 50c65aa05..07d45e78d 100644 --- a/dao/src/main/java/life/catalogue/matching/nidx/NameIndexChronicleStore.java +++ b/dao/src/main/java/life/catalogue/matching/nidx/NameIndexChronicleStore.java @@ -317,9 +317,8 @@ public IndexName read(Bytes in, @Nullable IndexName using) { int size = in.readInt(); byte[] bytes = new byte[size]; in.read(bytes); - if (using != null) { - System.out.println("WARN: IndexName instance existing: " + using); - } + // kryo creates a new instance anyways, so we 
always create a new instance + // we can only reuse any existing object (happens always in memory mode) with even more effort return kryo.readObject(new Input(bytes), IndexName.class); } finally { if (kryo != null) { diff --git a/dao/src/main/java/life/catalogue/matching/nidx/NameIndexImpl.java b/dao/src/main/java/life/catalogue/matching/nidx/NameIndexImpl.java index 37f6db7db..3b230995e 100644 --- a/dao/src/main/java/life/catalogue/matching/nidx/NameIndexImpl.java +++ b/dao/src/main/java/life/catalogue/matching/nidx/NameIndexImpl.java @@ -108,7 +108,8 @@ public NameMatch match(Name name, boolean allowInserts, boolean verbose) throws if (name.getRank() == null) { name.setRank(IndexName.CANONICAL_RANK); } - List candidates = store.get(key(name)); + var key = key(name); + List candidates = store.get(key); if (candidates != null && !candidates.isEmpty()) { m = matchCandidates(name, candidates); if (verbose) { @@ -493,7 +494,7 @@ private void createCanonical(NamesIndexMapper nim, String key, IndexName cn){ */ private static String key(FormattableName n) { String origName = NameFormatter.canonicalName(n); - return UnicodeUtils.replaceNonAscii(SciNameNormalizer.normalize(UnicodeUtils.decompose(origName)).toLowerCase(), '*'); + return UnicodeUtils.replaceNonAscii(SciNameNormalizer.normalize(UnicodeUtils.decompose(origName), n.getType()).toLowerCase(), '*'); } /** diff --git a/dao/src/main/java/life/catalogue/matching/nidx/NameIndexKryoPool.java b/dao/src/main/java/life/catalogue/matching/nidx/NameIndexKryoPool.java index d3e9ea5e1..45b5bdd5a 100644 --- a/dao/src/main/java/life/catalogue/matching/nidx/NameIndexKryoPool.java +++ b/dao/src/main/java/life/catalogue/matching/nidx/NameIndexKryoPool.java @@ -9,6 +9,7 @@ import life.catalogue.common.kryo.FastUtilsSerializers; import org.gbif.nameparser.api.Authorship; +import org.gbif.nameparser.api.NameType; import org.gbif.nameparser.api.Rank; import java.time.LocalDateTime; @@ -33,6 +34,7 @@ public Kryo create() { 
kryo.register(IndexName.class); kryo.register(Authorship.class); kryo.register(Rank.class); + kryo.register(NameType.class); kryo.register(LocalDateTime.class); kryo.register(ArrayList.class); kryo.register(HashMap.class); diff --git a/dao/src/main/java/life/catalogue/matching/nidx/NamesIndexConfig.java b/dao/src/main/java/life/catalogue/matching/nidx/NamesIndexConfig.java index c2ea7a2f3..f21187ae0 100644 --- a/dao/src/main/java/life/catalogue/matching/nidx/NamesIndexConfig.java +++ b/dao/src/main/java/life/catalogue/matching/nidx/NamesIndexConfig.java @@ -12,9 +12,16 @@ public class NamesIndexConfig { public enum Store {MAPDB, CHRONICLE} public static NamesIndexConfig memory(int poolsize){ + return memory(poolsize, -1); + } + + public static NamesIndexConfig memory(int poolsize, int maxEntries){ var cfg = new NamesIndexConfig(); cfg.file = null; cfg.kryoPoolSize = poolsize; + if (maxEntries > 0) { + cfg.maxEntries = maxEntries; + } return cfg; } @@ -42,12 +49,12 @@ public static NamesIndexConfig file(File location, int poolsize){ public int kryoPoolSize = 1024; @NotNull - public Store type = Store.MAPDB; + public Store type = Store.CHRONICLE; /** * Maximum numbers of names index entries supported by a chronicle store */ @Min(1_000) - public int maxEntries = 1_000; + public int maxEntries = 10_000; } diff --git a/dao/src/main/resources/life/catalogue/db/dbschema.md b/dao/src/main/resources/life/catalogue/db/dbschema.md index 3cf93ad47..6f2729b25 100644 --- a/dao/src/main/resources/life/catalogue/db/dbschema.md +++ b/dao/src/main/resources/life/catalogue/db/dbschema.md @@ -14,6 +14,10 @@ and done it manually. So we can as well log changes here. 
### PROD changes +#### 2024-12-05 names index type +``` +ALTER TABLE names_index ADD COLUMN type NAMETYPE; +``` #### 2024-12-05 taxon metrics ``` diff --git a/dao/src/main/resources/life/catalogue/db/dbschema.sql b/dao/src/main/resources/life/catalogue/db/dbschema.sql index 6680af07b..5993c29bd 100644 --- a/dao/src/main/resources/life/catalogue/db/dbschema.sql +++ b/dao/src/main/resources/life/catalogue/db/dbschema.sql @@ -1106,6 +1106,7 @@ CREATE TABLE names_index ( id SERIAL PRIMARY KEY, canonical_id INTEGER NOT NULL REFERENCES names_index, rank RANK NOT NULL, + type NAMETYPE, created TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(), modified TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW(), scientific_name TEXT NOT NULL, diff --git a/dao/src/main/resources/life/catalogue/db/mapper/NamesIndexMapper.xml b/dao/src/main/resources/life/catalogue/db/mapper/NamesIndexMapper.xml index f0fb3b4a0..38c8a9515 100644 --- a/dao/src/main/resources/life/catalogue/db/mapper/NamesIndexMapper.xml +++ b/dao/src/main/resources/life/catalogue/db/mapper/NamesIndexMapper.xml @@ -12,6 +12,7 @@ scientific_name, authorship, rank, + type, uninomial, genus, infrageneric_epithet, @@ -31,6 +32,7 @@ #{scientificName}, #{authorship}, #{rank}::RANK, + #{type}::NAMETYPE, #{uninomial}, #{genus}, #{infragenericEpithet}, diff --git a/dao/src/test/java/life/catalogue/es/nu/NameUsageWrapperConverterTest.java b/dao/src/test/java/life/catalogue/es/nu/NameUsageWrapperConverterTest.java index 84a2fd668..17fea46b1 100644 --- a/dao/src/test/java/life/catalogue/es/nu/NameUsageWrapperConverterTest.java +++ b/dao/src/test/java/life/catalogue/es/nu/NameUsageWrapperConverterTest.java @@ -53,58 +53,58 @@ public void roundtripPayload() throws IOException { } @Test - public void testNormalizeWeakly1() { - String s = NameUsageWrapperConverter.normalizeWeakly("Larus"); + public void testNormalize1() { + String s = NameUsageWrapperConverter.normalize("Larus"); assertEquals("larus", s); } @Test - public void testNormalizeWeakly2() { - 
String s = NameUsageWrapperConverter.normalizeWeakly("等待"); + public void testNormalize2() { + String s = NameUsageWrapperConverter.normalize("等待"); assertEquals("等待", s); } @Test - public void testNormalizeWeakly3() { - String s = NameUsageWrapperConverter.normalizeWeakly("sérieux"); + public void testNormalize3() { + String s = NameUsageWrapperConverter.normalize("sérieux"); assertEquals("serieux", s); } @Test - public void testNormalizeStrongly1a() { - String s = NameUsageWrapperConverter.normalizeStrongly("Larus"); + public void testNormalize1A() { + String s = NameUsageWrapperConverter.normalize("Larus"); System.out.println(s); assertEquals("lar", s); } @Test - public void testNormalizeStrongly1b() { - String s = NameUsageWrapperConverter.normalizeStrongly("Larus fuscus"); + public void testNormalize1B() { + String s = NameUsageWrapperConverter.normalize("Larus fuscus"); assertEquals("larus fusc", s); } @Test - public void testNormalizeStrongly1c() { - String s = NameUsageWrapperConverter.normalizeStrongly("Larus fuscus fuscus"); + public void testNormalize1C() { + String s = NameUsageWrapperConverter.normalize("Larus fuscus fuscus"); System.out.println(s); assertEquals("larus fuscus fusc", s); } @Test - public void testNormalizeStrongly2() { - String s = NameUsageWrapperConverter.normalizeStrongly("等待"); + public void testNormalize2b() { + String s = NameUsageWrapperConverter.normalize("等待"); assertEquals("等待", s); } @Test - public void testNormalizeStrongly3() { - String s = NameUsageWrapperConverter.normalizeStrongly("sérieux"); + public void testNormalize3b() { + String s = NameUsageWrapperConverter.normalize("sérieux"); assertEquals("serieux", s); } @Test - public void testNormalizeStrongly4() { - String s = NameUsageWrapperConverter.normalizeStrongly("sylvestris"); + public void testNormalize4() { + String s = NameUsageWrapperConverter.normalize("sylvestris"); System.out.println(s); assertEquals("silvestr", s); } diff --git 
a/dao/src/test/java/life/catalogue/matching/NameIndexImplTest.java b/dao/src/test/java/life/catalogue/matching/NameIndexImplTest.java index 952f5fcc8..dcf321bd2 100644 --- a/dao/src/test/java/life/catalogue/matching/NameIndexImplTest.java +++ b/dao/src/test/java/life/catalogue/matching/NameIndexImplTest.java @@ -1,13 +1,12 @@ package life.catalogue.matching; import life.catalogue.api.TestEntityGenerator; -import life.catalogue.api.model.IndexName; -import life.catalogue.api.model.Name; -import life.catalogue.api.model.NameMatch; -import life.catalogue.api.model.VerbatimRecord; +import life.catalogue.api.model.*; import life.catalogue.api.vocab.MatchType; import life.catalogue.api.vocab.Origin; +import life.catalogue.common.io.TabReader; import life.catalogue.common.tax.AuthorshipNormalizer; +import life.catalogue.common.tax.SciNameNormalizer; import life.catalogue.db.mapper.NamesIndexMapper; import life.catalogue.matching.nidx.NameIndex; import life.catalogue.matching.nidx.NameIndexFactory; @@ -15,8 +14,14 @@ import life.catalogue.matching.nidx.NamesIndexConfig; import life.catalogue.parser.NameParser; +import life.catalogue.parser.RankParser; + +import life.catalogue.parser.UnparsableException; + import org.gbif.nameparser.api.*; +import java.io.File; +import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -28,6 +33,9 @@ import org.apache.ibatis.cursor.Cursor; import org.apache.ibatis.session.SqlSession; import org.apache.ibatis.session.SqlSessionFactory; + +import org.gbif.nameparser.util.RankUtils; + import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -64,7 +72,7 @@ public IndexName answer(InvocationOnMock invocation) { }} ).when(mapper).create(any()); - ni = NameIndexFactory.build(NamesIndexConfig.memory(512), factory, aNormalizer).started(); + ni = NameIndexFactory.build(NamesIndexConfig.memory(512, 10_000_000), factory, aNormalizer).started(); assertEquals(0, ni.size()); } @@ -834,5 
+842,50 @@ private NameMatch match(String name, Rank rank) throws InterruptedException { NameMatch m = ni.match(name(name, rank), false, true); return m; } - + + + /** + * Reads a simple ColDP NameUsage file and adds 4 more columns to it: + * 1) normalised name + * 2) name type + * 3) nidx id + * 4) nidx canonical id + */ + @Test + public void procColDP() throws Exception { + File dir = new File("/Users/markus/Downloads/col12"); + File fni = new File(dir, "NameUsage.tsv"); + File fno = new File(dir, "NameUsageOut.tsv"); + try ( + TabReader reader = TabReader.tab(fni, 1); + FileWriter fw = new FileWriter(fno) + ) { + for (var row : reader) { + fw.write(processLine(row) + "\n"); + } + } + // report nidx + System.out.println("\n\n+++++ NIDX +++++"); + System.out.println("Total records: " + ni.store().count()); + + // avoid large dumps + ni.close(); + ni = null; + } + + private String processLine(String[] row) throws UnparsableException, InterruptedException { + // col:ID col:parentID col:status col:rank col:scientificName col:authorship + String[] extra = new String[4]; + Rank rank = RankParser.PARSER.parse(row[3]).get(); + Name n = TestEntityGenerator.setUserDate(NameParser.PARSER.parse(row[4], row[5], rank, null, VerbatimRecord.VOID).get().getName()); + n.setRank(rank); + extra[0] = SciNameNormalizer.normalize(row[4], n.getType()); + var nm = ni.match(n, true, false); + extra[1] = n.getType().toString(); + if (nm.hasMatch()) { + extra[2] = nm.getNameKey().toString(); + extra[3] = nm.getCanonicalNameKey().toString(); + } + return String.join("\t", row) + "\t" + String.join("\t", extra); + } } \ No newline at end of file diff --git a/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java b/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java index 5e372ce60..7a94b1e45 100644 --- a/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java +++ b/webservice/src/test/java/life/catalogue/assembly/SectorSyncMergeIT.java @@ 
-83,6 +83,7 @@ public class SectorSyncMergeIT extends SectorSyncTestBase { @Parameterized.Parameters public static Collection data() { return Arrays.asList(new Object[][] { + {"ismaridae", List.of("taxref", "dyntaxa", "artsnaven")}, {"bolyeriidae", List.of("itis", "reptiledb", "uksi", "pbdb")}, {"myosotis", List.of("taxref", "uksi", "pbdb", "bavaria")}, {"tetralobus", List.of("wfo", "bouchard", "plazi")}, diff --git a/webservice/src/test/java/life/catalogue/importer/PgImportIT.java b/webservice/src/test/java/life/catalogue/importer/PgImportIT.java index f421d6536..ea4748f47 100644 --- a/webservice/src/test/java/life/catalogue/importer/PgImportIT.java +++ b/webservice/src/test/java/life/catalogue/importer/PgImportIT.java @@ -620,7 +620,7 @@ public void coldpProperties() throws Exception { public void testExternalManually() throws Exception { dataset.setType(DatasetType.TAXONOMIC); - normalizeAndImportFolder(new File("/Users/markus/Downloads/dataset-53133"), DWCA); + normalizeAndImportFolder(new File("/Users/markus/Downloads/dataset-307245"), COLDP); //normalizeAndImport(URI.create("https://bdj.pensoft.net/lib/ajax_srv/archive_download.php?archive_type=2&document_id=80487"), DWCA); //normalizeAndImport(URI.create("https://tb.plazi.org/GgServer/dwca/CB7EFFE7FFD3FFB3E551FFBDFF9C916F.zip"), DWCA); //normalizeAndImport(URI.create("https://github.com/mdoering/data-ina/archive/master.zip"), COLDP); diff --git a/webservice/src/test/resources/txtree/ismaridae/artsnaven.txtree b/webservice/src/test/resources/txtree/ismaridae/artsnaven.txtree new file mode 100644 index 000000000..eab0f014a --- /dev/null +++ b/webservice/src/test/resources/txtree/ismaridae/artsnaven.txtree @@ -0,0 +1,7 @@ +Animalia [kingdom] + Arthropoda [phylum] + Insecta [class] + Hymenoptera [order] + Apocrita [suborder] + Diaprioidea [superfamily] + Ismariidae [family] \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/ismaridae/dyntaxa.txtree 
b/webservice/src/test/resources/txtree/ismaridae/dyntaxa.txtree new file mode 100644 index 000000000..7c384dc73 --- /dev/null +++ b/webservice/src/test/resources/txtree/ismaridae/dyntaxa.txtree @@ -0,0 +1,10 @@ +Animalia [kingdom] + Arthropoda [phylum] + Hexapoda [subphylum] + Insecta Linnaeus, 1758 [class] + Hymenoptera Linnaeus, 1758 [order] + Apocrita Gerstaecker, 1867 [suborder] + Parasitica [infraorder] + Diaprioidea Sharkey, 2007 [superfamily] + Ismaridae Thomson, 1858 [family] + =Ismarinae Thomson, 1858 [family] \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/ismaridae/expected.txtree b/webservice/src/test/resources/txtree/ismaridae/expected.txtree new file mode 100644 index 000000000..f333fd433 --- /dev/null +++ b/webservice/src/test/resources/txtree/ismaridae/expected.txtree @@ -0,0 +1,9 @@ +Biota [unranked] + Animalia [kingdom] + Arthropoda [phylum] + Hexapoda [subphylum] + Insecta [class] + Hymenoptera [order] + Diaprioidea [superfamily] + Ismaridae Thomson, 1858 [family] + =Ismarinae Thomson, 1858 [family] diff --git a/webservice/src/test/resources/txtree/ismaridae/project.txtree b/webservice/src/test/resources/txtree/ismaridae/project.txtree new file mode 100644 index 000000000..1411b3424 --- /dev/null +++ b/webservice/src/test/resources/txtree/ismaridae/project.txtree @@ -0,0 +1,8 @@ +Biota [unranked] + Animalia [kingdom] + Arthropoda [phylum] + Hexapoda [subphylum] + Insecta [class] + Hymenoptera [order] + Diaprioidea [superfamily] + Ismaridae [family] \ No newline at end of file diff --git a/webservice/src/test/resources/txtree/ismaridae/readme.md b/webservice/src/test/resources/txtree/ismaridae/readme.md new file mode 100644 index 000000000..4a91b83a9 --- /dev/null +++ b/webservice/src/test/resources/txtree/ismaridae/readme.md @@ -0,0 +1,2 @@ +Family misspellings of Ismariidae that should be caught by the nidx alone +https://github.com/CatalogueOfLife/data/issues/889 \ No newline at end of file diff --git
a/webservice/src/test/resources/txtree/ismaridae/taxref.txtree b/webservice/src/test/resources/txtree/ismaridae/taxref.txtree new file mode 100644 index 000000000..7cdb8f6a7 --- /dev/null +++ b/webservice/src/test/resources/txtree/ismaridae/taxref.txtree @@ -0,0 +1,20 @@ +Biota Endl. (D.Don) [domain] + Animalia Linnaeus, 1758 [kingdom] + Eumetazoa Bütschli, 1910 [subkingdom] + Bilateria Haeckel, 1874 [unranked] + Protostomia Grobben, 1908 [infrakingdom] + Cuticulata [unranked] + Ecdysozoa Aguinaldo, Turbeville, Linford, Rivera, Garey, Raff & Lake, 1997 [unranked] + Panarthropoda Nielsen, 1995 [unranked] + Arthropoda Latreille, 1829 [phylum] + Pancrustacea Zrzavý & Štys, 1997 [subphylum] + Altocrustacea Regier, Schultz, Zwick, Hussey, Ball, Wetzer, Martin & Cunningham, 2010 [infraphylum] + Hexapoda Blainville, 1816 [superclass] + Insecta Linnaeus, 1758 [class] + Dicondylia Hennig, 1953 [infraclass] + Pterygota Brauer, 1885 [infraclass] + Neoptera Martynov, 1923 [unranked] + Hymenoptera Linnaeus, 1758 [order] + Apocrita Gerstäcker, 1867 [suborder] + Diaprioidea Haliday, 1833 [superfamily] + Ismaridae Thomson, 1858 [family] \ No newline at end of file