diff --git a/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java b/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java index 8958ab530f..62fde434c8 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java @@ -285,7 +285,7 @@ public void extendLastLaboratory(String theExtend) { } } - public boolean notNull() { + public boolean isNotNull() { return !((departments == null) & (institutions == null) & (laboratories == null) & @@ -451,7 +451,7 @@ public int nbStructures() { @Deprecated public String toTEI() { StringBuilder tei = new StringBuilder(); - if (!notNull()) { + if (!isNotNull()) { return null; } else { tei.append(" tokenizations if (labeledTokens == null) labeledTokens = new TreeMap<>(); - List theTokenList = tokenizations == null ? new ArrayList<>() : tokenizations; + List theTokenList = null; + if (tokenizations == null) + theTokenList = new ArrayList<>(); + else + theTokenList = tokenizations; + + List theExistingTokenList = labeledTokens.get(label.getLabel()); + if (theExistingTokenList != null) { + theExistingTokenList.addAll(theTokenList); + theTokenList = theExistingTokenList; + } + labeledTokens.put(label.getLabel(), theTokenList); } + public List getLabeledResult(TaggingLabel label) { + return labeledTokens.get(label.getLabel()); + } + } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java index a7756ec18c..c0ad5ebe35 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java @@ -10,7 +10,12 @@ import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.TextUtilities; import org.grobid.core.utilities.UnicodeUtil; +import 
org.grobid.core.utilities.LayoutTokensUtil; import org.grobid.core.engines.tagging.GenericTaggerUtils; +import org.grobid.core.tokenization.TaggingTokenCluster; +import org.grobid.core.tokenization.TaggingTokenClusteror; +import org.grobid.core.engines.label.TaggingLabel; +import org.grobid.core.engines.label.TaggingLabels; import java.util.ArrayList; import java.util.List; @@ -23,7 +28,8 @@ public AffiliationAddressParser() { super(GrobidModels.AFFILIATION_ADDRESS); } - public ArrayList processing(String input) { + public List processing(String input) { + List results = null; try { if ((input == null) || (input.length() == 0)) { return null; @@ -45,10 +51,13 @@ public ArrayList processing(String input) { String header = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions); String res = label(header); - return resultBuilder(res, tokenizations, false); // don't use pre-labels + //return resultBuilder(res, tokenizations, false); // don't use pre-labels + + results = resultExtractionLayoutTokens(res, tokenizations); } catch (Exception e) { throw new GrobidException("An exception occurred while running Grobid.", e); } + return results; } protected static List getAffiliationBlocks(List tokenizations) { @@ -162,9 +171,10 @@ else if (lastLabel != null) { //System.out.println(affiliationBlocks.toString()); } - private ArrayList processingReflow(List affiliationBlocks, List tokenizations) { + private List processingReflow(List affiliationBlocks, List tokenizations) { String res = runReflow(affiliationBlocks, tokenizations); - return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow + //return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow + return resultExtractionLayoutTokens(res, tokenizations); } private String runReflow(List affiliationBlocks, @@ -174,20 +184,139 @@ private String runReflow(List affiliationBlocks, 
placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations)); List> allTokens = new ArrayList>(); allTokens.add(tokenizations); - String header = + String affiliationSequenceWithFeatures = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions); - if ((header == null) || (header.trim().length() == 0)) { + if ((affiliationSequenceWithFeatures == null) || (affiliationSequenceWithFeatures.trim().length() == 0)) { return null; } - return label(header); + //System.out.println(affiliationSequenceWithFeatures); + + return label(affiliationSequenceWithFeatures); } catch (Exception e) { - throw new GrobidException("An exception occured while running Grobid.", e); + throw new GrobidException("An exception occured while running Grobid at the affiliation-address labeling task.", e); } } + /** + * Extract results from a labeled sequence. + * + * @param result labeled sequence + * @param tokenizations list of tokens + * @return list of Affiliation objects + */ + protected List resultExtractionLayoutTokens(String result, + List tokenizations) { + List affiliations = new ArrayList<>(); + if (result == null) + return affiliations; + + Affiliation affiliation = new Affiliation(); + + //System.out.println(result); + + TaggingLabel lastClusterLabel = null; + TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.AFFILIATION_ADDRESS, result, tokenizations); + + String tokenLabel = null; + List clusters = clusteror.cluster(); + for (TaggingTokenCluster cluster : clusters) { + if (cluster == null) { + continue; + } + + TaggingLabel clusterLabel = cluster.getTaggingLabel(); + Engine.getCntManager().i(clusterLabel); + //String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens())); + String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens()); + //String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); + //String 
clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens()); + + List tokens = cluster.concatTokens(); + + if (clusterLabel.equals(TaggingLabels.AFFILIATION_MARKER)) { + if (affiliation.getMarker() != null) { + if (affiliation.isNotNull()) { + affiliations.add(affiliation); + } + affiliation = new Affiliation(); + } + affiliation.setMarker(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_MARKER, tokens); + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_INSTITUTION)) { + if (affiliation.getInstitutions() != null && affiliation.getInstitutions().size()>0) { + + } + affiliation.addInstitution(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_INSTITUTION, tokens); + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_DEPARTMENT)) { + if (affiliation.getDepartments() != null && affiliation.getDepartments().size()>0) { + + } + affiliation.addDepartment(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_DEPARTMENT, tokens); + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_LABORATORY)) { + if (affiliation.getLaboratories() != null && affiliation.getLaboratories().size()>0) { + + } + affiliation.addLaboratory(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_LABORATORY, tokens); + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_COUNTRY)) { + if (affiliation.getCountry() != null) { + + } + affiliation.setCountry(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_COUNTRY, tokens); + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_POSTCODE)) { + if (affiliation.getPostCode() != null) { + + } + affiliation.setPostCode(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_POSTCODE, tokens); + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_POSTBOX)) { + if (affiliation.getPostBox() != null) { + + } + affiliation.setPostBox(clusterContent); + 
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_POSTBOX, tokens); + + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_REGION)) { + if (affiliation.getRegion() != null) { + + } + affiliation.setRegion(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_REGION, tokens); + + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_SETTLEMENT)) { + if (affiliation.getSettlement() != null) { + + } + affiliation.setSettlement(clusterContent); + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_SETTLEMENT, tokens); + + } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_ADDRESSLINE)) { + if (affiliation.getAddressString() != null) { + affiliation.setAddressString(affiliation.getAddressString() + " " + clusterContent); + } else { + affiliation.setAddressString(clusterContent); + } + affiliation.addLabeledResult(TaggingLabels.AFFILIATION_ADDRESSLINE, tokens); + } + } + + // last affiliation + if (affiliation.isNotNull()) { + affiliations.add(affiliation); + } + + return affiliations; + } + + /** + * DEPRECATED + **/ protected ArrayList resultBuilder(String result, List tokenizations, boolean usePreLabel) { @@ -223,7 +352,7 @@ protected ArrayList resultBuilder(String result, String line = st2.nextToken(); Integer lineCountInt = lineCount; if (line.trim().length() == 0) { - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) { fullAffiliations = new ArrayList(); } @@ -291,7 +420,7 @@ protected ArrayList resultBuilder(String result, } if (newMarker) { - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); fullAffiliations.add(aff); @@ -316,7 +445,7 @@ protected ArrayList resultBuilder(String result, if (s1.equals("I-") && (localFeatures.contains("LINESTART"))) { // new affiliation - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); fullAffiliations.add(aff); @@ -332,7 +461,7 @@ protected 
ArrayList resultBuilder(String result, } else if (s1.equals("I-") && hasInstitution && hasAddress && (!lastTag.equals(""))) { // new affiliation - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) { fullAffiliations = new ArrayList(); } @@ -416,7 +545,7 @@ protected ArrayList resultBuilder(String result, if ((s1.equals("I-")) && (localFeatures.contains("LINESTART")) ) { - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); fullAffiliations.add(aff); @@ -432,7 +561,7 @@ protected ArrayList resultBuilder(String result, } } else if ((s1.equals("I-")) && hasDepartment && hasAddress && !lastTag.equals("")) { - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) { fullAffiliations = new ArrayList(); } @@ -465,7 +594,7 @@ protected ArrayList resultBuilder(String result, if ((s1.equals("I-")) && hasAddress && (localFeatures.contains("LINESTART")) ) { - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); fullAffiliations.add(aff); @@ -505,7 +634,7 @@ protected ArrayList resultBuilder(String result, if (s1.equals("I-") && (localFeatures.contains("LINESTART"))) { // new affiliation - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); fullAffiliations.add(aff); @@ -524,7 +653,7 @@ protected ArrayList resultBuilder(String result, && hasAddress && (!lastTag.equals(""))) { // new affiliation - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); fullAffiliations.add(aff); @@ -704,7 +833,7 @@ protected ArrayList resultBuilder(String result, lineCount++; newMarker = false; } - if (aff.notNull()) { + if (aff.isNotNull()) { if (fullAffiliations == null) fullAffiliations = new ArrayList(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java 
b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java index 14c599458b..8ebf46c2b3 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java @@ -470,10 +470,10 @@ public List processingReferenceSection(DocumentSource documentSource /** * Extract results from a labeled sequence. * - * @param result result + * @param result labeled sequence * @param volumePostProcess whether post process volume * @param tokenizations list of tokens - * @return bibilio item + * @return biblio item */ public BiblioItem resultExtractionLayoutTokens(String result, boolean volumePostProcess, diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index 3a2429c7b1..5004f6dc1c 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -303,7 +303,7 @@ public MutablePair,List,List,List,List(label.getGrobidModel(), label.getLabel()), label); } @@ -422,6 +443,18 @@ protected static void register(TaggingLabel label) { register(FUNDING_AFFILIATION); register(FUNDING_INSTITUTION); register(FUNDING_OTHER); + + // affiliation-address + register(AFFILIATION_MARKER); + register(AFFILIATION_INSTITUTION); + register(AFFILIATION_DEPARTMENT); + register(AFFILIATION_LABORATORY); + register(AFFILIATION_COUNTRY); + register(AFFILIATION_POSTCODE); + register(AFFILIATION_POSTBOX); + register(AFFILIATION_REGION); + register(AFFILIATION_SETTLEMENT); + register(AFFILIATION_ADDRESSLINE); } protected TaggingLabels() { diff --git a/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java b/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java index 6abeb3d579..1e3efdcaaf 100755 
--- a/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java +++ b/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java @@ -109,8 +109,6 @@ public String printVector() { else res.append(" 0\n"); - - return res.toString(); } diff --git a/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenClusteror.java b/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenClusteror.java index a15b50da1e..28f28d5ea7 100644 --- a/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenClusteror.java +++ b/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenClusteror.java @@ -71,6 +71,10 @@ public List cluster() { TaggingTokenCluster curCluster = new TaggingTokenCluster(it.peek().getTaggingLabel()); while (it.hasNext()) { LabeledTokensContainer cont = it.next(); + if (cont == null) { + // this should not happen, but for the sake of paranoia, we skip + continue; + } if (begin || cont.isBeginning() || cont.getTaggingLabel() != curCluster.getTaggingLabel()) { curCluster = new TaggingTokenCluster(cont.getTaggingLabel()); result.add(curCluster);