diff --git a/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java b/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java index 091fdbfd29..59ede39920 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java @@ -192,6 +192,7 @@ public void setAffiliationString(String s) { public void setRawAffiliationString(String s) { rawAffiliationString = s; + rawAffiliationString = rawAffiliationString.replaceAll("( )+", " "); } public void setInstitutions(List affs) { diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index 032436872a..c3bfdf5130 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -51,6 +51,9 @@ public class BiblioItem { // map of labels (e.g. or <abstract>) to LayoutToken private Map<String, List<LayoutToken>> labeledTokens; + // accumulation of the LayoutTokens for sequences of affiliation/address + private List<List<LayoutToken>> affiliationAddresslabeledTokens; + /** * The following are internal working structures not meant to be used outside. * For collecting layout tokens of the various bibliographical component, @@ -4419,6 +4422,13 @@ public void generalResultMappingHeader(String labeledResult, List<LayoutToken> t theList = theList == null ? new ArrayList<>() : theList; theList.addAll(clusterTokens); labeledTokens.put(clusterLabel.getLabel(), theList); + + if (clusterLabel.equals(TaggingLabels.HEADER_AFFILIATION) || clusterLabel.equals(TaggingLabels.HEADER_ADDRESS)) { + if (affiliationAddresslabeledTokens == null) + affiliationAddresslabeledTokens = new ArrayList<>(); + if (!affiliationAddresslabeledTokens.contains(clusterTokens)) + affiliationAddresslabeledTokens.add(clusterTokens); + } } } @@ -4458,4 +4468,8 @@ public String getAvailabilityStmt() { public void setAvailabilityStmt(String availabilityStmt) { this.availabilityStmt = availabilityStmt; } + + public List<List<LayoutToken>> getAffiliationAddresslabeledTokens() { + return affiliationAddresslabeledTokens; + } } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java index 6c6c343ffd..1e13f9612c 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java @@ -79,6 +79,71 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizatio return affiliationBlocks; } + protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutToken>> tokenizations) { + ArrayList<String> affiliationBlocks = new ArrayList<String>(); + int end = 0; + for(List<LayoutToken> tokenizationSegment : tokenizations) { + if (tokenizationSegment == null || tokenizationSegment.size() == 0) + continue; + + // if we have an offset shit, we introduce a segmentation of the affiliation block + LayoutToken startToken = tokenizationSegment.get(0); + int start = startToken.getOffset(); + if (start-end > 2) + affiliationBlocks.add("\n"); + + for(LayoutToken tok : tokenizationSegment) { + if (tok.getText().length() == 0) + continue; + + if (!tok.getText().equals(" ")) { + if (tok.getText().equals("\n")) { + affiliationBlocks.add("@newline"); + } else + affiliationBlocks.add(tok + " <affiliation>"); + } + end = tok.getOffset(); + } + } + return affiliationBlocks; + } + + public List<Affiliation> processingLayoutTokens(List<List<LayoutToken>> tokenizations) { + List<Affiliation> results = null; + try { + if ((tokenizations == null) || (tokenizations.size() == 0)) { + return null; + } + + List<LayoutToken> tokenizationsAffiliation = new ArrayList<>(); + for (List<LayoutToken> tokenization : tokenizations) { +//System.out.println(tokenization.toString()); + tokenizationsAffiliation.addAll(tokenization); + } + + List<String> affiliationBlocks = getAffiliationBlocksFromSegments(tokenizations); + +//System.out.println(affiliationBlocks.toString()); + + List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>(); + List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>(); + placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation)); + countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation)); + List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>(); + allTokens.add(tokenizationsAffiliation); + String affiliationSequenceWithFeatures = + FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions); + + String res = label(affiliationSequenceWithFeatures); + //return resultBuilder(res, tokenizations, false); // don't use pre-labels + + results = resultExtractionLayoutTokens(res, tokenizationsAffiliation); + } catch (Exception e) { + throw new GrobidException("An exception occurred while running Grobid.", e); + } + return results; + } + /** * Post processing of extracted field affiliation and address. * Here the input string to be processed comes from a previous parser: the segmentation @@ -86,30 +151,37 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizatio * We also need to keep the original tokenization information to recreate the exact * initial string. */ - public List<Affiliation> processReflow(String result, List<LayoutToken> tokenizations) { + /*public List<Affiliation> processReflow(String result, List<LayoutToken> tokenizations) { if ((result == null) || (result.length() == 0)) { return null; } - List<String> affiliationBlocks = new ArrayList<String>(); + List<String> affiliationFeatureLines = new ArrayList<String>(); List<LayoutToken> subTokenizations = new ArrayList<LayoutToken>(); - filterAffiliationAddress(result, tokenizations, affiliationBlocks, subTokenizations); + filterAffiliationAddress(result, tokenizations, affiliationFeatureLines, subTokenizations); - return processingReflow(affiliationBlocks, subTokenizations); - } + System.out.println(affiliationFeatureLines.toString()); + System.out.println(subTokenizations.toString()); + + return processingReflow(affiliationFeatureLines, subTokenizations); + }*/ - private void filterAffiliationAddress(String result, + /*private void filterAffiliationAddress(String result, List<LayoutToken> tokenizations, - List<String> affiliationBlocks, + List<String> affiliationFeatureLines, List<LayoutToken> subTokenizations) { + // result is the header feature matrix with labels + // tokenizations is the layout tokens of the full header + // affiliationFeatureLines is wehre to put the lines with header labels affiliation or address + // subTokenizations is where to put the layout tokens corresponding to what is labeled with header labels affiliation or address StringTokenizer st = new StringTokenizer(result, "\n"); String lastLabel = null; - int p = 0; + int p = 0; // index in the tokenizations list List<LayoutToken> tokenizationsBuffer = null; while (st.hasMoreTokens() && (p < tokenizations.size())) { String line = st.nextToken(); if (line.trim().length() == 0) { - affiliationBlocks.add("\n"); + affiliationFeatureLines.add("\n"); lastLabel = null; } else { @@ -144,23 +216,29 @@ private void filterAffiliationAddress(String result, p = p0; continue; } - } + } int ll = s.length; String label = s[ll-1]; + if ((label.indexOf("affiliation") == -1) && (label.indexOf("address") == -1)) { + // not affiliation/address input + if (lastLabel != null) { + affiliationFeatureLines.add("\n"); + } + lastLabel = null; + continue; + } + if ((tokOriginal != null) && ( ((label.indexOf("affiliation") != -1) || (label.indexOf("address") != -1)) )) { - affiliationBlocks.add(tokOriginal + " " + label); + affiliationFeatureLines.add(tokOriginal + " " + label); // add the content of tokenizationsBuffer for(LayoutToken tokk : tokenizationsBuffer) { subTokenizations.add(tokk); } if (tokenizationsBuffer.size() > 0 && isEndLine) { - affiliationBlocks.add("@newline"); + affiliationFeatureLines.add("@newline"); } - } - else if (lastLabel != null) { - affiliationBlocks.add("\n"); - } + } if ((label.indexOf("affiliation") != -1) || (label.indexOf("address") != -1)) { lastLabel = label; @@ -172,15 +250,15 @@ else if (lastLabel != null) { //System.out.println(subTokenizations.toString()); //System.out.println(affiliationBlocks.toString()); - } + }*/ - private List<Affiliation> processingReflow(List<String> affiliationBlocks, List<LayoutToken> tokenizations) { - String res = runReflow(affiliationBlocks, tokenizations); + /*private List<Affiliation> processingReflow(List<String> affiliationFeatureLines, List<LayoutToken> tokenizations) { + String res = runReflow(affiliationFeatureLines, tokenizations); //return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow return resultExtractionLayoutTokens(res, tokenizations); - } + }*/ - private String runReflow(List<String> affiliationBlocks, + /*private String runReflow(List<String> affiliationFeatureLines, List<LayoutToken> tokenizations) { try { List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>(); @@ -190,7 +268,7 @@ private String runReflow(List<String> affiliationBlocks, List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>(); allTokens.add(tokenizations); String affiliationSequenceWithFeatures = - FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions); + FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationFeatureLines, allTokens, placesPositions, countriesPositions); if ((affiliationSequenceWithFeatures == null) || (affiliationSequenceWithFeatures.trim().length() == 0)) { return null; @@ -202,7 +280,7 @@ private String runReflow(List<String> affiliationBlocks, } catch (Exception e) { throw new GrobidException("An exception occured while running Grobid at the affiliation-address labeling task.", e); } - } + }*/ /** * Extract results from a labeled sequence. @@ -211,20 +289,20 @@ private String runReflow(List<String> affiliationBlocks, * @param tokenizations list of tokens * @return lis of Affiliation objects */ - protected List<Affiliation> resultExtractionLayoutTokens(String result, - List<LayoutToken> tokenizations) { + protected List<Affiliation> resultExtractionLayoutTokens(String result, List<LayoutToken> tokenizations) { List<Affiliation> affiliations = new ArrayList<>(); if (result == null) return affiliations; Affiliation affiliation = new Affiliation(); - System.out.println(result); +//System.out.println(result); TaggingLabel lastClusterLabel = null; TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.AFFILIATION_ADDRESS, result, tokenizations); String tokenLabel = null; + boolean newline = true; List<TaggingTokenCluster> clusters = clusteror.cluster(); for (TaggingTokenCluster cluster : clusters) { if (cluster == null) { @@ -242,12 +320,15 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result, List<LayoutToken> tokens = cluster.concatTokens(); if (clusterLabel.equals(TaggingLabels.AFFILIATION_MARKER)) { - if (affiliation.getMarker() != null) { + // if an affiliation has already a merker, or if a marker start a line, + // we introduce a new affiliation + if (affiliation.getMarker() != null || newline) { if (affiliation.isNotNull()) { affiliations.add(affiliation); } affiliation = new Affiliation(); } + affiliation.setMarker(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_MARKER, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_INSTITUTION)) { @@ -331,6 +412,21 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result, if (!clusterLabel.equals(TaggingLabels.OTHER) && affiliation.isNotNull()) { affiliation.appendLayoutTokens(tokens); } + + if (!clusterLabel.equals(TaggingLabels.AFFILIATION_MARKER)) { + if (affiliation.getRawAffiliationString() == null) { + affiliation.setRawAffiliationString(clusterContent); + } else { + affiliation.setRawAffiliationString(affiliation.getRawAffiliationString() + " " + clusterContent); + } + } + + newline = false; + if (tokens.size() > 0) { + LayoutToken lastToken = tokens.get(tokens.size()-1); + if (lastToken.getText() != null && lastToken.getText().equals("\n")) + newline = true; + } } // last affiliation @@ -885,17 +981,47 @@ protected ArrayList<Affiliation> resultBuilder(String result, /** * Extract results from a labelled header in the training format without any string modification. */ - public StringBuilder trainingExtraction(String result, - List<LayoutToken> tokenizations) { - if ((result == null) || (result.length() == 0)) { + public StringBuilder trainingExtraction(List<LayoutToken> tokenizationsAffiliation) { + /*if ((result == null) || (result.length() == 0)) { + return null; + }*/ + + if (tokenizationsAffiliation == null || tokenizationsAffiliation.size() == 0) return null; + + List<String> affiliationBlocks = getAffiliationBlocks(tokenizationsAffiliation); + List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>(); + List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>(); + placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation)); + countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation)); + List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>(); + allTokens.add(tokenizationsAffiliation); + + String affiliationSequenceWithFeatures = null; + try { + affiliationSequenceWithFeatures = + FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions); + } catch(Exception e) { + throw new GrobidException("An exception occurred while running Grobid.", e); } - List<String> affiliationBlocks = new ArrayList<String>(); - List<LayoutToken> tokenizationsAffiliation = new ArrayList<LayoutToken>(); + if (affiliationSequenceWithFeatures == null) { + return null; + } + + String resultAffiliation = label(affiliationSequenceWithFeatures); + //return resultBuilder(res, tokenizations, false); // don't use pre-labels + + //results = resultExtractionLayoutTokens(res, tokenizations); + + + + + //List<String> affiliationBlocks = new ArrayList<String>(); + //List<LayoutToken> tokenizationsAffiliation = new ArrayList<LayoutToken>(); - filterAffiliationAddress(result, tokenizations, affiliationBlocks, tokenizationsAffiliation); - String resultAffiliation = runReflow(affiliationBlocks, tokenizationsAffiliation); + //filterAffiliationAddress(result, tokenizations, affiliationBlocks, tokenizationsAffiliation); + //String resultAffiliation = runReflow(affiliationBlocks, tokenizationsAffiliation); StringBuilder bufferAffiliation = new StringBuilder(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 08fabc0540..d6252f88fe 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -1353,6 +1353,8 @@ public Document createTraining(File inputFile, writer.close(); String rese = parsers.getHeaderParser().label(header); + BiblioItem resHeader = new BiblioItem(); + resHeader = parsers.getHeaderParser().resultExtraction(rese, headerTokenizations, resHeader); // buffer for the header block StringBuilder bufferHeader = parsers.getHeaderParser().trainingExtraction(rese, headerTokenizations); @@ -1362,8 +1364,9 @@ public Document createTraining(File inputFile, } // buffer for the affiliation+address block + List<LayoutToken> tokenizationsAffiliation = resHeader.getLayoutTokens(TaggingLabels.HEADER_AFFILIATION); StringBuilder bufferAffiliation = - parsers.getAffiliationAddressParser().trainingExtraction(rese, headerTokenizations); + parsers.getAffiliationAddressParser().trainingExtraction(tokenizationsAffiliation); // buffer for the date block StringBuilder bufferDate = null; diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index ac1a58decc..2db68597cc 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -204,8 +204,12 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, // remove invalid authors (no last name, noise, etc.) resHeader.setFullAuthors(Person.sanityCheck(resHeader.getFullAuthors())); + //List<LayoutToken> tokenizationsAffiliation = resHeader.getLayoutTokens(TaggingLabels.HEADER_AFFILIATION); + List<List<LayoutToken>> tokenizationsAffiliation = resHeader.getAffiliationAddresslabeledTokens(); + //resHeader.setFullAffiliations( + // parsers.getAffiliationAddressParser().processReflow(res, tokenizations)); resHeader.setFullAffiliations( - parsers.getAffiliationAddressParser().processReflow(res, tokenizations)); + parsers.getAffiliationAddressParser().processingLayoutTokens(tokenizationsAffiliation)); resHeader.attachEmails(); boolean attached = false; if (fragmentedAuthors && !hasMarker) { diff --git a/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenSynchronizer.java b/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenSynchronizer.java index 2a7a7f1fb0..cb9a656b5c 100644 --- a/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenSynchronizer.java +++ b/grobid-core/src/main/java/org/grobid/core/tokenization/TaggingTokenSynchronizer.java @@ -124,8 +124,10 @@ private String prepareErrorMessage(int preTokenizationPtr) { StringBuilder sb = new StringBuilder(); for (int i = Math.max(0, tokensAndLabelsPtr - limit); i < Math.min(tokensAndLabelsPtr + limit, tokensAndLabels.size()); i++) { Triple<String, String, String> s = tokensAndLabels.get(i); - String str = i == tokensAndLabelsPtr ? "-->\t'" + s.getA() + "'" : "\t'" + s.getA() + "'"; - sb.append(str).append("\n"); + if (s != null) { + String str = i == tokensAndLabelsPtr ? "-->\t'" + s.getA() + "'" : "\t'" + s.getA() + "'"; + sb.append(str).append("\n"); + } } StringBuilder sb2 = new StringBuilder(); diff --git a/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java b/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java index 6b5c254bc3..3f4b2d657d 100644 --- a/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java +++ b/grobid-core/src/test/java/org/grobid/core/engines/AffiliationAddressParserTest.java @@ -106,7 +106,7 @@ private List<Affiliation> processLabelResults( LOGGER.debug("tokenizations: {}", tokenizations); List<String> affiliationBlocks = getAffiliationBlocksWithLineFeed(tokenizations); String header = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress( - affiliationBlocks, Arrays.asList(tokenizations), NO_PLACES_POSITIONS + affiliationBlocks, Arrays.asList(tokenizations), NO_PLACES_POSITIONS, NO_PLACES_POSITIONS ); LOGGER.debug("header: {}", header); String labelResult = addLabelsToFeatures(header, labels); diff --git a/grobid-core/src/test/java/org/grobid/core/test/TestAffiliationAddressParser.java b/grobid-core/src/test/java/org/grobid/core/test/TestAffiliationAddressParser.java index 196fdc0dad..cc7a52ee0c 100755 --- a/grobid-core/src/test/java/org/grobid/core/test/TestAffiliationAddressParser.java +++ b/grobid-core/src/test/java/org/grobid/core/test/TestAffiliationAddressParser.java @@ -25,10 +25,9 @@ public static void tearDown(){ GrobidFactory.reset(); } - @Test + //@Test public void testParser() throws Exception { - String affiliationSequence1 = "Atomic Physics Division, Department of Atomic Physics and Luminescence, " + "Faculty of Applied Physics and Mathematics, Gdansk University of " + "Technology, Narutowicza 11/12, 80-233 Gdansk, Poland";