Skip to content

Commit

Permalink
Improve CsvReader to pad rows with missing columns with empty columns…
Browse files Browse the repository at this point in the history
… instead of skipping them. Behaves similarly to gbif dwca-io, see https://github.com/CatalogueOfLife/data/issues/785
  • Loading branch information
mdoering committed Oct 30, 2024
1 parent 1a7c922 commit 0674e07
Show file tree
Hide file tree
Showing 11 changed files with 122 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ private static boolean exists(String x) {
}

@VisibleForTesting
static String expandAbbreviatedGenus(String scientificName, String genus) {
public static String expandAbbreviatedGenus(String scientificName, String genus) {
if (exists(scientificName) && exists(genus) && !scientificName.equalsIgnoreCase(genus)) {
String[] parts = scientificName.split(" +", 2);
String genusCorrect = StringUtils.capitalize(genus.trim().toLowerCase());
Expand Down
38 changes: 24 additions & 14 deletions reader/src/main/java/life/catalogue/csv/CsvReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,10 @@ public boolean hasNext() {
return row != null;
}

private String[] readCompleteRow() {
/**
* Updates this.row with a new row record which might span several lines
*/
private boolean readCompleteRow() {
String[] newRow;
// we might have a leftover from the last multiline join
if (queuedRow != null){
Expand All @@ -651,7 +654,7 @@ private String[] readCompleteRow() {
newRow = iter.next();
}
// try to read next line and append it for strayed multiline data if the column numbers match the header
while (newRow != null && newRow.length > 1 && newRow.length < maxIdx + 1 && iter.hasNext()) {
while (newRow != null && newRow.length > 1 && newRow.length < maxIdx + 1 && iter.hasNext() && queuedRow == null) {
String[] nextRow = iter.next();
// merging 2 rows reduces the columns by 1
if (nextRow != null && nextRow.length > 0) {
Expand All @@ -667,19 +670,25 @@ private String[] readCompleteRow() {
} else {
// save newRow for next round...
queuedRow = nextRow;
return newRow;
row = newRow;
return true;
}
}
}
return newRow;
row = newRow;
return true;
}

/**
 * Tells whether another row can still be obtained, either from the
 * underlying iterator or from a row queued earlier.
 *
 * @return true if the iterator has more rows or queuedRow is set
 */
private boolean iterHasMore() {
  // ask the iterator first, exactly as before; the queued row is only the fallback
  if (iter.hasNext()) {
    return true;
  }
  return queuedRow != null;
}

private void nextRow() {
skippedLast = false;
if (iter.hasNext()) {
while (iter.hasNext() && isEmpty(row = readCompleteRow(), true));
if (iterHasMore()) {
while (iterHasMore() && readCompleteRow() && rowIsEmpty(true));
// if the last rows were empty we would get the last non-empty row again, clear it in that case!
if (!iter.hasNext() && isEmpty(row, false)) {
if (!iter.hasNext() && rowIsEmpty(false)) {
row = null;
} else {
records++;
Expand All @@ -697,22 +706,23 @@ private void nextRow() {
}
}

private boolean isEmpty(String[] row, boolean log) {
private boolean rowIsEmpty(boolean log) {
if (row == null) {
// ignore this row, don't log
} else if (row.length < maxIdx + 1) {
if (log) {
skippedLast = true;
skipped++;
LOG.info("{} skip line {} with too few columns (found {}, expected {})", filename, iter.getContext().currentLine()-1, row.length, maxIdx + 1);
}
} else if (isAllNull(row)) {
if (log) {
skippedLast = true;
skipped++;
LOG.debug("{} skip line {} with only empty columns", filename, iter.getContext().currentLine());
}
} else {
// expand row with empty columns if too small
if (row.length < maxIdx + 1) {
row = Arrays.copyOf(row, maxIdx + 1);
if (log) {
LOG.debug("{} line {} with too few columns (found {}, expected {})", filename, iter.getContext().currentLine() - 1, row.length, maxIdx + 1);
}
}
return false;
}
return true;
Expand Down
8 changes: 4 additions & 4 deletions reader/src/test/java/life/catalogue/csv/CsvReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,15 @@ public void strayLines() throws Exception {
ids.add(r.get(DcTerm.identifier));
System.out.println(r);
});
assertEquals(30, cnt.get());
assertEquals(1, skipped.get()); // 23341
assertEquals(30, ids.size());
assertEquals(31, cnt.get());
assertEquals(0, skipped.get()); // 23341 is padded, not skipped
assertEquals(31, ids.size());
assertTrue(ids.contains("6"));
assertTrue(ids.contains("303989"));
assertTrue(ids.contains("23340"));
assertTrue(ids.contains("303988"));
assertTrue(ids.contains("303984"));
assertFalse(ids.contains("23341")); // skipped
assertTrue(ids.contains("23341")); // padded with empty columns
}

@Test
Expand Down
15 changes: 15 additions & 0 deletions reader/src/test/java/life/catalogue/csv/DwcaReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,21 @@ public void dwca41() throws Exception {
assertEquals('\n', format.getNormalizedNewline());
}

@Test
public void dwca52() throws Exception {
  DwcaReader dwca = DwcaReader.from(Resources.toPath("dwca/52"));
  assertTrue(dwca.coreSchema().isTsv());

  // the fixture holds 11 data rows; each one must yield a scientific name and an ID
  final AtomicInteger seen = new AtomicInteger();
  dwca.stream(DwcTerm.Taxon).forEach(rec -> {
    seen.incrementAndGet();
    assertNotNull(rec.get(DwcTerm.scientificName));
    assertNotNull(rec.get(DwcaTerm.ID));
  });
  assertEquals(11, seen.get());
}

@Test
public void dwca14() throws Exception {
DwcaReader reader = DwcaReader.from(Resources.toPath("dwca/14"));
Expand Down
3 changes: 3 additions & 0 deletions reader/src/test/resources/dwca/52/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Test dwca data for UNITE with broken parentNameUsageID records,
but for which there is a working acceptedNameUsageID record.
https://github.com/CatalogueOfLife/data/issues/785
16 changes: 16 additions & 0 deletions reader/src/test/resources/dwca/52/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Taxon">
<files>
<location>taxon.csv</location>
</files>
<id index="0" />
<field index="1" term="http://rs.tdwg.org/dwc/terms/taxonID"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/parentNameUsageID"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/acceptedNameUsageID"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
<field index="6" term="http://purl.org/dc/terms/references"/>
<field index="7" term="http://purl.org/dc/terms/taxonRemarks"/>
</core>
</archive>

12 changes: 12 additions & 0 deletions reader/src/test/resources/dwca/52/taxon.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
id taxonID parentNameUsageID acceptedNameUsageID scientificName taxonRank references taxonRemarks
SH0887892.09FU SH0887892.09FU 763571 SH0879619.10FU SH0887892.09FU species hypothesis dx.doi.org/10.15156/BIO/SH0887892.09FU 1.5%
SH0879619.10FU SH0879619.10FU SH0097986.10FU SH0879619.10FU species hypothesis dx.doi.org/10.15156/BIO/SH0879619.10FU 1.5%
SH0097986.10FU SH0097986.10FU 763571 SH0097986.10FU species hypothesis dx.doi.org/10.15156/BIO/SH0097986.10FU 3.0%
SH0763571.10FU SH0763571.10FU SH0023252.10FU SH0763571.10FU species hypothesis dx.doi.org/10.15156/BIO/SH0763571.10FU 1.5%
763571 763571 760612 Podila humilis (Linnem. ex W. Gams) Vandepol & Bonito Species
760612 760612 119792 Podila Stajich, Vandepol & Bonito Genus
119792 119792 119237 Mortierellaceae Luerss. Family
119237 119237 672213 Mortierellales Caval.-Sm. Order
672213 672213 671028 Mortierellomycetes Doweld Class
671028 671028 118934 Mortierellomycota Tedersoo, Sánchez-Ramírez, Kõljalg, Bahram, Döring, Schigel, T. May, M. Ryberg & Abarenkov Phylum
118934 118934 Fungi R.T. Moore Kingdom
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,22 @@ public void subgenera() throws Exception {
}
}

/**
 * Normalizes test archive 52 with default dataset settings and expects
 * 11 usages in the store afterwards.
 * See https://github.com/CatalogueOfLife/data/issues/785
 */
@Test
public void unite() throws Exception {
  normalize(52, new DatasetSettings());
  printTree();

  // the usage count is read inside a transaction
  try (Transaction tx = store.getNeo().beginTx()) {
    assertEquals(11, store.usages().all().count());
  }
}

@Test
@Ignore
public void testExternal() throws Exception {
Expand Down
3 changes: 3 additions & 0 deletions webservice/src/test/resources/dwca/52/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Test dwca data for UNITE with broken parentNameUsageID records,
but for which there is a working acceptedNameUsageID record.
https://github.com/CatalogueOfLife/data/issues/785
16 changes: 16 additions & 0 deletions webservice/src/test/resources/dwca/52/meta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">
<core encoding="UTF-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Taxon">
<files>
<location>taxon.csv</location>
</files>
<id index="0" />
<field index="1" term="http://rs.tdwg.org/dwc/terms/taxonID"/>
<field index="2" term="http://rs.tdwg.org/dwc/terms/parentNameUsageID"/>
<field index="3" term="http://rs.tdwg.org/dwc/terms/acceptedNameUsageID"/>
<field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
<field index="5" term="http://rs.tdwg.org/dwc/terms/taxonRank"/>
<field index="6" term="http://purl.org/dc/terms/references"/>
<field index="7" term="http://purl.org/dc/terms/taxonRemarks"/>
</core>
</archive>

12 changes: 12 additions & 0 deletions webservice/src/test/resources/dwca/52/taxon.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
id taxonID parentNameUsageID acceptedNameUsageID scientificName taxonRank references taxonRemarks
SH0887892.09FU SH0887892.09FU 763571 SH0879619.10FU SH0887892.09FU species hypothesis dx.doi.org/10.15156/BIO/SH0887892.09FU 1.5%
SH0879619.10FU SH0879619.10FU SH0097986.10FU SH0879619.10FU species hypothesis dx.doi.org/10.15156/BIO/SH0879619.10FU 1.5%
SH0097986.10FU SH0097986.10FU 763571 SH0097986.10FU species hypothesis dx.doi.org/10.15156/BIO/SH0097986.10FU 3.0%
SH0763571.10FU SH0763571.10FU SH0023252.10FU SH0763571.10FU species hypothesis dx.doi.org/10.15156/BIO/SH0763571.10FU 1.5%
763571 763571 760612 Podila humilis (Linnem. ex W. Gams) Vandepol & Bonito Species
760612 760612 119792 Podila Stajich, Vandepol & Bonito Genus
119792 119792 119237 Mortierellaceae Luerss. Family
119237 119237 672213 Mortierellales Caval.-Sm. Order
672213 672213 671028 Mortierellomycetes Doweld Class
671028 671028 118934 Mortierellomycota Tedersoo, Sánchez-Ramírez, Kõljalg, Bahram, Döring, Schigel, T. May, M. Ryberg & Abarenkov Phylum
118934 118934 Fungi R.T. Moore Kingdom

0 comments on commit 0674e07

Please sign in to comment.