Skip to content

Commit

Permalink
Test normalisation of monomial names, see CatalogueOfLife/data#892
Browse files Browse the repository at this point in the history
  • Loading branch information
mdoering committed Jan 7, 2025
1 parent 9ac10fc commit 961e9a8
Show file tree
Hide file tree
Showing 27 changed files with 276 additions and 158 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.collect.Lists;

import org.gbif.nameparser.api.NameType;

/**
* Most of the Name class with all getters needed to format a Name using the NameFormatter.
*/
Expand All @@ -34,6 +36,8 @@ default Boolean isOriginalSpelling() {

String getUnparsed();

NameType getType();

/**
* @return the terminal epithet: the infraspecific epithet if it exists, otherwise the species epithet, or null if neither exists
*/
Expand Down Expand Up @@ -123,6 +127,6 @@ default List<String> nameParts() {
*/
@JsonIgnore
default String getScientificNameNormalized() {
return SciNameNormalizer.normalize(getScientificName());
return SciNameNormalizer.normalize(getScientificName(), getType());
}
}
40 changes: 19 additions & 21 deletions api/src/main/java/life/catalogue/api/model/IndexName.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
import life.catalogue.common.tax.NameFormatter;
import life.catalogue.common.text.StringUtils;

import org.gbif.nameparser.api.Authorship;
import org.gbif.nameparser.api.NamePart;
import org.gbif.nameparser.api.NomCode;
import org.gbif.nameparser.api.Rank;
import org.gbif.nameparser.api.*;

import java.util.Objects;

Expand Down Expand Up @@ -38,6 +35,7 @@ public class IndexName extends DataEntity<Integer> implements FormattableName {
private String authorship;
@Nonnull
private Rank rank;
private NameType type;
private String uninomial;
private String genus;
private String infragenericEpithet; // we only use this for true infrageneric names, not bi-/trinomials!
Expand All @@ -60,6 +58,7 @@ public IndexName(IndexName other) {
this.scientificName = other.scientificName;
this.authorship = other.authorship;
this.rank = other.rank;
this.type = other.type;
this.uninomial = other.uninomial;
this.genus = other.genus;
this.infragenericEpithet = other.infragenericEpithet;
Expand All @@ -83,6 +82,7 @@ public IndexName(Name n) {
this.scientificName = n.getScientificName();
this.authorship = n.getAuthorship();
setRank(n.getRank());
this.type = n.getType();
this.uninomial = n.getUninomial();
this.genus = n.getGenus();
this.infragenericEpithet = n.getInfragenericEpithet();
Expand Down Expand Up @@ -112,6 +112,7 @@ public IndexName(Name n, int key) {
public static IndexName newCanonical(IndexName n) {
IndexName cn = new IndexName();
cn.setRank(CANONICAL_RANK);
cn.setType(n.getType());
// we keep a canonical infrageneric name in uninomial and ignore its genus placement!
if (n.getInfragenericEpithet() != null && n.isInfrageneric()) {
cn.setUninomial(n.getInfragenericEpithet());
Expand Down Expand Up @@ -293,6 +294,15 @@ public String getUnparsed() {
return null;
}

/**
 * @return the name type currently stored in the {@code type} field of this index name; may be null
 *         since the field carries no {@code @Nonnull} annotation
 */
@Override
public NameType getType() {
return type;
}

/**
 * Sets the name type of this index name.
 *
 * @param type the name type to store; assigned as is, without validation
 */
public void setType(NameType type) {
this.type = type;
}

/**
 * Sets the cultivar epithet of this index name.
 *
 * @param cultivarEpithet the cultivar epithet to store; assigned as is, without validation
 */
public void setCultivarEpithet(String cultivarEpithet) {
this.cultivarEpithet = cultivarEpithet;
}
Expand Down Expand Up @@ -377,29 +387,15 @@ String scientificNameHtml(){

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof IndexName)) return false;
if (o == null || getClass() != o.getClass()) return false;
if (!super.equals(o)) return false;
IndexName indexName = (IndexName) o;
return Objects.equals(key, indexName.key) &&
Objects.equals(canonicalId, indexName.canonicalId) &&
scientificName.equals(indexName.scientificName) &&
Objects.equals(authorship, indexName.authorship) &&
rank == indexName.rank &&
Objects.equals(uninomial, indexName.uninomial) &&
Objects.equals(genus, indexName.genus) &&
Objects.equals(infragenericEpithet, indexName.infragenericEpithet) &&
Objects.equals(specificEpithet, indexName.specificEpithet) &&
Objects.equals(infraspecificEpithet, indexName.infraspecificEpithet) &&
Objects.equals(cultivarEpithet, indexName.cultivarEpithet) &&
Objects.equals(combinationAuthorship, indexName.combinationAuthorship) &&
Objects.equals(basionymAuthorship, indexName.basionymAuthorship) &&
Objects.equals(sanctioningAuthor, indexName.sanctioningAuthor);
return Objects.equals(key, indexName.key) && Objects.equals(canonicalId, indexName.canonicalId) && Objects.equals(scientificName, indexName.scientificName) && Objects.equals(authorship, indexName.authorship) && rank == indexName.rank && type == indexName.type && Objects.equals(uninomial, indexName.uninomial) && Objects.equals(genus, indexName.genus) && Objects.equals(infragenericEpithet, indexName.infragenericEpithet) && Objects.equals(specificEpithet, indexName.specificEpithet) && Objects.equals(infraspecificEpithet, indexName.infraspecificEpithet) && Objects.equals(cultivarEpithet, indexName.cultivarEpithet) && Objects.equals(combinationAuthorship, indexName.combinationAuthorship) && Objects.equals(basionymAuthorship, indexName.basionymAuthorship) && Objects.equals(sanctioningAuthor, indexName.sanctioningAuthor);
}

@Override
public int hashCode() {
return Objects.hash(super.hashCode(), key, canonicalId, scientificName, authorship, rank, uninomial, genus, infragenericEpithet, specificEpithet, infraspecificEpithet, cultivarEpithet, combinationAuthorship, basionymAuthorship, sanctioningAuthor);
return Objects.hash(super.hashCode(), key, canonicalId, scientificName, authorship, rank, type, uninomial, genus, infragenericEpithet, specificEpithet, infraspecificEpithet, cultivarEpithet, combinationAuthorship, basionymAuthorship, sanctioningAuthor);
}

@Override
Expand All @@ -413,6 +409,8 @@ public String toString() {
sb.append(" [CANONICAL]");
} else {
sb.append(getLabelWithRank());
sb.append(" cid=");
sb.append(getCanonicalId());
}
return sb.toString();
}
Expand Down
8 changes: 8 additions & 0 deletions api/src/main/java/life/catalogue/common/io/TabReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ public static TabReader csv(Reader reader, int skip) throws IOException {
return csv(new ReaderInputStream(reader, StandardCharsets.UTF_8), StandardCharsets.UTF_8, skip, 2);
}

/**
 * Convenience overload that reads the given file as UTF-8.
 * Delegates to the four-argument {@code csv} overload, passing {@code 2} as its last argument.
 *
 * @param file the file to read
 * @param skip forwarded unchanged to the four-argument overload
 *             (presumably the number of leading rows to skip; confirm against that overload)
 * @return the reader created by the four-argument overload
 * @throws IOException propagated from the four-argument overload
 */
public static TabReader csv(File file, int skip) throws IOException {
// NOTE(review): the meaning of the constant 2 is defined by the four-argument overload, which is not visible here
return csv(file, StandardCharsets.UTF_8, skip, 2);
}

/**
 * Convenience overload that reads the given file with an explicit charset.
 * Delegates to the four-argument {@code csv} overload, passing {@code 2} as its last argument.
 *
 * @param file the file to read
 * @param charset the character encoding forwarded to the four-argument overload
 * @param skip forwarded unchanged to the four-argument overload
 *             (presumably the number of leading rows to skip; confirm against that overload)
 * @return the reader created by the four-argument overload
 * @throws IOException propagated from the four-argument overload
 */
public static TabReader csv(File file, Charset charset, int skip) throws IOException {
// NOTE(review): the meaning of the constant 2 is defined by the four-argument overload, which is not visible here
return csv(file, charset, skip, 2);
}
Expand All @@ -42,6 +46,10 @@ public static TabReader tab(Reader reader, int skip) throws IOException {
return tab(new ReaderInputStream(reader, StandardCharsets.UTF_8), StandardCharsets.UTF_8, skip, 2);
}

/**
 * Convenience overload that reads the given file as UTF-8.
 * Delegates to the four-argument {@code tab} overload, passing {@code 2} as its last argument.
 *
 * @param file the file to read
 * @param skip forwarded unchanged to the four-argument overload
 *             (presumably the number of leading rows to skip; confirm against that overload)
 * @return the reader created by the four-argument overload
 * @throws IOException propagated from the four-argument overload
 */
public static TabReader tab(File file, int skip) throws IOException {
// NOTE(review): the meaning of the constant 2 is defined by the four-argument overload, which is not visible here
return tab(file, StandardCharsets.UTF_8, skip, 2);
}

/**
 * Convenience overload that reads the given file with an explicit charset.
 * Delegates to the four-argument {@code tab} overload, passing {@code 2} as its last argument.
 *
 * @param file the file to read
 * @param charset the character encoding forwarded to the four-argument overload
 * @param skip forwarded unchanged to the four-argument overload
 *             (presumably the number of leading rows to skip; confirm against that overload)
 * @return the reader created by the four-argument overload
 * @throws IOException propagated from the four-argument overload
 */
public static TabReader tab(File file, Charset charset, int skip) throws IOException {
// NOTE(review): the meaning of the constant 2 is defined by the four-argument overload, which is not visible here
return tab(file, charset, skip, 2);
}
Expand Down
53 changes: 20 additions & 33 deletions api/src/main/java/life/catalogue/common/tax/SciNameNormalizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package life.catalogue.common.tax;

import org.gbif.nameparser.api.NameType;

import java.util.regex.Pattern;

import static org.apache.commons.lang3.StringUtils.trimToNull;
Expand Down Expand Up @@ -96,50 +98,35 @@ public static String normalizeWhitespaceAndPunctuation(String s) {
* The return will be a strictly ASCII encoded string.
*/
public static String normalize(String s) {
return normalize(s, false, true);
return normalize(s, null, true);
}

/**
* Normalizes an entire name string including monomials and genus parts of a name.
*/
public static String normalizeAll(String s) {
return normalize(s, true, true);

/**
 * Normalizes a scientific name string, taking the type of the name into account.
 * Delegates to the private three-argument {@code normalize} with epithet stemming enabled.
 *
 * NOTE(review): the private overload appears to apply spelling normalization and stemming only
 * when {@code type} is non-null and {@code type.isParsable()} is true, so a null type would
 * skip both steps. Confirm this is intended for callers that have no type at hand.
 *
 * @param s the name string to normalize
 * @param type the name type forwarded to the private overload; may be null
 * @return the normalized string as produced by the private overload
 */
public static String normalize(String s, NameType type) {
return normalize(s, type, true);
}
private static String normalize(String s, boolean normMonomials, boolean stemming) {

private static String normalize(String s, NameType type, boolean stemEpithets) {
if (!hasContent(s)) return "";

s = normalizedAscii(s);

// Remove a hybrid cross, or a likely hybrid cross.
s = removeHybridMarker(s);

// Only for bi/trinomials, otherwise we mix up ranks.
if (normMonomials) {
s = normStrongly(s, stemming);

} else if (s.indexOf(' ') > 2) {
String[] parts = s.split(" +");
StringBuilder sb = new StringBuilder();
sb.append(parts[0]);
for (int i = 1; i < parts.length; i++) {
sb.append(" ");
if (Character.isLowerCase(parts[i].charAt(0))) {
sb.append(normStrongly(parts[i], stemming));
} else {
sb.append(parts[i]);
}

// correct common misspellings
if (type != null && type.isParsable()) {
s = normSpellings(s);

// apply stemming only for epithets, never monomials!
if (stemEpithets && s.indexOf(' ') > 2) {
s = stemEpithet(s);
}
s = sb.toString();
}

return s.trim();
}

private static String normStrongly(String s, boolean stemming) {
if (stemming) {
s = stemEpithet(s);
}

private static String normSpellings(String s) {
// normalize frequent variations of i
s = i.matcher(s).replaceAll("i");
// remove repeated letters→leters in binomials
Expand All @@ -152,7 +139,7 @@ private static String normStrongly(String s, boolean stemming) {
* Stems and normalizes some few, but frequent misspellings
*/
public static String normalizeEpithet(String epithet) {
return normStrongly(epithet, true);
return stemEpithet(normSpellings(epithet));
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
package life.catalogue.common.tax;

import life.catalogue.common.io.TabReader;

import org.junit.Test;

import java.io.*;

import static org.junit.Assert.assertEquals;


public class SciNameNormalizerTest {

@Test
public void removeHybridMarker() throws Exception {
assertEquals("Abies", SciNameNormalizer.removeHybridMarker("Abies"));
Expand All @@ -18,7 +23,7 @@ public void removeHybridMarker() throws Exception {
public void testNormalize() throws Exception {
assertEquals("", SciNameNormalizer.normalize(""));
assertEquals("Abies", SciNameNormalizer.normalize("Abies "));
assertEquals("Abiies", SciNameNormalizer.normalize("Abiies "));
assertEquals("Abies", SciNameNormalizer.normalize("Abiies "));
assertEquals("Abyes", SciNameNormalizer.normalize("Abyes "));
assertEquals("Abyes alb", SciNameNormalizer.normalize("Abyes albus"));
assertEquals("Abyes albiet", SciNameNormalizer.normalize("Abyes albieta"));
Expand Down Expand Up @@ -63,41 +68,41 @@ public void testNormalize() throws Exception {

@Test
public void testNormalizeAll() throws Exception {
assertEquals("", SciNameNormalizer.normalizeAll(""));
assertEquals("Abies", SciNameNormalizer.normalizeAll("Abies "));
assertEquals("Abies", SciNameNormalizer.normalizeAll("Abiies "));
assertEquals("Abies", SciNameNormalizer.normalizeAll("Abyes "));
assertEquals("Abies alb", SciNameNormalizer.normalizeAll("Abyes albus"));
assertEquals("Abies albiet", SciNameNormalizer.normalizeAll("Abyes albieta"));
assertEquals("Abies albiet", SciNameNormalizer.normalizeAll("Abies albijeta"));
assertEquals("Abies albiet", SciNameNormalizer.normalizeAll("Abies albyeta"));
assertEquals("Abies alb", SciNameNormalizer.normalizeAll(" \txAbies × ållbbus\t"));

assertEquals("Abies alb", SciNameNormalizer.normalizeAll(" \txAbies × ållbbus\t"));
assertEquals("Rachis takt", SciNameNormalizer.normalizeAll("Rhachis taktos"));

assertEquals("Hieracium sabaud", SciNameNormalizer.normalizeAll("Hieracium sabaudum"));
assertEquals("Hieracium scorzoneraefoli", SciNameNormalizer.normalizeAll("Hieracium scorzoneræfolium"));
assertEquals("Hieracium scorzonerifoli", SciNameNormalizer.normalizeAll("Hieracium scorzonerifolium"));
assertEquals("Macrozamia platirach", SciNameNormalizer.normalizeAll("Macrozamia platyrachis"));
assertEquals("Macrozamia platirach", SciNameNormalizer.normalizeAll("Macrozamia platyrhachis"));
assertEquals("Cicas circinal", SciNameNormalizer.normalizeAll("Cycas circinalis"));
assertEquals("Cicas circinal", SciNameNormalizer.normalizeAll("Cycas circinnalis"));
assertEquals("Isolona perier", SciNameNormalizer.normalizeAll("Isolona perieri"));
assertEquals("Isolona perier", SciNameNormalizer.normalizeAll("Isolona perrieri"));
assertEquals("Isolona perier", SciNameNormalizer.normalizeAll("Isolona perrierii"));

assertEquals("Carex caiouet", SciNameNormalizer.normalizeAll("Carex ×cayouettei"));
assertEquals("Platanus hispanic", SciNameNormalizer.normalizeAll("Platanus x hispanica"));
assertEquals("", SciNameNormalizer.normalize(""));
assertEquals("Abies", SciNameNormalizer.normalize("Abies "));
assertEquals("Abies", SciNameNormalizer.normalize("Abiies "));
assertEquals("Abies", SciNameNormalizer.normalize("Abyes "));
assertEquals("Abies alb", SciNameNormalizer.normalize("Abyes albus"));
assertEquals("Abies albiet", SciNameNormalizer.normalize("Abyes albieta"));
assertEquals("Abies albiet", SciNameNormalizer.normalize("Abies albijeta"));
assertEquals("Abies albiet", SciNameNormalizer.normalize("Abies albyeta"));
assertEquals("Abies alb", SciNameNormalizer.normalize(" \txAbies × ållbbus\t"));

assertEquals("Abies alb", SciNameNormalizer.normalize(" \txAbies × ållbbus\t"));
assertEquals("Rachis takt", SciNameNormalizer.normalize("Rhachis taktos"));

assertEquals("Hieracium sabaud", SciNameNormalizer.normalize("Hieracium sabaudum"));
assertEquals("Hieracium scorzoneraefoli", SciNameNormalizer.normalize("Hieracium scorzoneræfolium"));
assertEquals("Hieracium scorzonerifoli", SciNameNormalizer.normalize("Hieracium scorzonerifolium"));
assertEquals("Macrozamia platirach", SciNameNormalizer.normalize("Macrozamia platyrachis"));
assertEquals("Macrozamia platirach", SciNameNormalizer.normalize("Macrozamia platyrhachis"));
assertEquals("Cicas circinal", SciNameNormalizer.normalize("Cycas circinalis"));
assertEquals("Cicas circinal", SciNameNormalizer.normalize("Cycas circinnalis"));
assertEquals("Isolona perier", SciNameNormalizer.normalize("Isolona perieri"));
assertEquals("Isolona perier", SciNameNormalizer.normalize("Isolona perrieri"));
assertEquals("Isolona perier", SciNameNormalizer.normalize("Isolona perrierii"));

assertEquals("Carex caiouet", SciNameNormalizer.normalize("Carex ×cayouettei"));
assertEquals("Platanus hispanic", SciNameNormalizer.normalize("Platanus x hispanica"));
// https://github.com/gbif/checklistbank/issues/7
assertEquals("Eragrostis brown", SciNameNormalizer.normalizeAll("Eragrostis brownii"));
assertEquals("Eragrostis brown", SciNameNormalizer.normalizeAll("Eragrostis brownei"));
assertEquals("Eragrostis brown", SciNameNormalizer.normalize("Eragrostis brownii"));
assertEquals("Eragrostis brown", SciNameNormalizer.normalize("Eragrostis brownei"));
}

@Test
public void testHybridCross() throws Exception {
assertEquals("xcayouettei", SciNameNormalizer.normalize("xcayouettei"));
assertEquals("cayouettei", SciNameNormalizer.normalize("×cayouettei"));
assertEquals("xcaiouetei", SciNameNormalizer.normalize("xcayouettei"));
assertEquals("caiouetei", SciNameNormalizer.normalize("×cayouettei"));

assertEquals("Carex xcaiouet", SciNameNormalizer.normalize("Carex xcayouettei"));
assertEquals("Carex caiouet", SciNameNormalizer.normalize("Carex ×cayouettei"));
Expand All @@ -113,8 +118,8 @@ public void testHybridCross() throws Exception {

@Test
public void testNonAscii() throws Exception {
assertEquals("Cem Andrexi", SciNameNormalizer.normalize("Çem Ándrexï"));
assertEquals("SOEZsoezY¥µAAAAAAAECEEEEIIIIDNOOOOOOUUUUYssaaaaaaaeceeeeiiiidnoooooouuuuyy", SciNameNormalizer.normalize("ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ"));
assertEquals("CemAndrexi", SciNameNormalizer.normalize("Çem_Ándrexï"));
assertEquals("SOEZsoezY¥µAECEIDNOUYsaeceidnoui", SciNameNormalizer.normalize("ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ"));
}

@Test
Expand Down
10 changes: 5 additions & 5 deletions dao/src/main/java/life/catalogue/es/NameStrings.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import static life.catalogue.es.ddl.Analyzer.SCINAME_AUTO_COMPLETE;
import static life.catalogue.es.ddl.Analyzer.SCINAME_IGNORE_CASE;
import static life.catalogue.es.nu.NameUsageWrapperConverter.normalizeWeakly;
import static life.catalogue.es.nu.NameUsageWrapperConverter.normalize;

/**
* An object embedded within the name usage document solely aimed at optimizing searchability. The name strings within this class do not
Expand Down Expand Up @@ -44,17 +44,17 @@ public NameStrings(Name name) {
}
if (!StringUtils.isBlank(name.getGenus())) {
genusLetter = Character.toLowerCase(name.getGenus().charAt(0));
genusOrMonomial = getStrings(name.getGenus(), normalizeWeakly(name.getGenus()));
genusOrMonomial = getStrings(name.getGenus(), normalize(name.getGenus()));
} else if (!StringUtils.isBlank(name.getUninomial())) {
genusOrMonomial = getStrings(name.getUninomial().toLowerCase(), normalizeWeakly(name.getUninomial()));
genusOrMonomial = getStrings(name.getUninomial().toLowerCase(), normalize(name.getUninomial()));
}
// we used to use the strong normaliser to index species/infraspecific epithets...
// But that caused more problems than it helped...
if (!StringUtils.isBlank(name.getSpecificEpithet())) {
specificEpithet = getStrings(name.getSpecificEpithet().toLowerCase(), normalizeWeakly(name.getSpecificEpithet()));
specificEpithet = getStrings(name.getSpecificEpithet().toLowerCase(), normalize(name.getSpecificEpithet()));
}
if (!StringUtils.isBlank(name.getInfraspecificEpithet())) {
infraspecificEpithet = getStrings(name.getInfraspecificEpithet().toLowerCase(), normalizeWeakly(name.getInfraspecificEpithet()));
infraspecificEpithet = getStrings(name.getInfraspecificEpithet().toLowerCase(), normalize(name.getInfraspecificEpithet()));
}
}

Expand Down
Loading

0 comments on commit 961e9a8

Please sign in to comment.