diff --git a/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java b/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java index 62fde434c8..4683918388 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Affiliation.java @@ -154,6 +154,7 @@ public void setAddressString(String s) { } public void setCountry(String s) { + s = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n","[({.,])}: \n"); country = s; } @@ -170,10 +171,12 @@ public void setPostBox(String s) { } public void setRegion(String s) { + s = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n","[({.,])}: \n"); region = s; } public void setSettlement(String s) { + s = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n","[({.,])}: \n"); settlement = s; } @@ -219,6 +222,9 @@ public void addLaboratory(String aff) { laboratories.add(TextUtilities.cleanField(aff, true)); } + /** + * DEPRECATED + **/ public void extendFirstInstitution(String theExtend) { if (institutions == null) { institutions = new ArrayList(); @@ -230,6 +236,9 @@ public void extendFirstInstitution(String theExtend) { } } + /** + * DEPRECATED + **/ public void extendLastInstitution(String theExtend) { if (institutions == null) { institutions = new ArrayList(); @@ -241,6 +250,9 @@ public void extendLastInstitution(String theExtend) { } } + /** + * DEPRECATED + **/ public void extendFirstDepartment(String theExtend) { if (departments == null) { departments = new ArrayList(); @@ -252,6 +264,9 @@ public void extendFirstDepartment(String theExtend) { } } + /** + * DEPRECATED + **/ public void extendLastDepartment(String theExtend) { if (departments == null) { departments = new ArrayList(); @@ -263,6 +278,9 @@ public void extendLastDepartment(String theExtend) { } } + /** + * DEPRECATED + **/ public void extendFirstLaboratory(String theExtend) { if (laboratories == null) { laboratories = new ArrayList(); @@ -274,6 +292,9 @@ public void extendFirstLaboratory(String theExtend) { } } + /** + * DEPRECATED + **/ public void extendLastLaboratory(String theExtend) { if (laboratories == null) { laboratories = new ArrayList(); @@ -286,19 +307,32 @@ public void extendLastLaboratory(String theExtend) { } public boolean isNotNull() { - return !((departments == null) & - (institutions == null) & - (laboratories == null) & - (country == null) & - (postCode == null) & - (postBox == null) & - (region == null) & - (settlement == null) & - (addrLine == null) & - (affiliationString == null) & + return !((departments == null) && + (institutions == null) && + (laboratories == null) && + (country == null) && + (postCode == null) && + (postBox == null) && + (region == null) && + (settlement == null) && + (addrLine == null) && + (affiliationString == null) && (addressString == null)); } + public boolean hasAddress() { + if (country != null || + postCode != null || + postBox != null || + settlement != null || + addrLine != null || + region != null || + addressString != null) { + return true; + } else + return false; + } + public void setFailAffiliation(boolean b) { failAffiliation = b; } @@ -448,7 +482,7 @@ public int nbStructures() { return nbStruct; } - @Deprecated + /*@Deprecated public String toTEI() { StringBuilder tei = new StringBuilder(); if (!isNotNull()) { @@ -535,7 +569,7 @@ public String toTEI() { } return tei.toString(); - } + }*/ public static String toTEI(Affiliation aff, int nbTag) { StringBuffer tei = new StringBuffer(); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java index c0ad5ebe35..da7a2ca915 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/AffiliationAddressParser.java @@ -45,12 +45,15 @@ public List processing(String input) { List affiliationBlocks = getAffiliationBlocks(tokenizations); List> placesPositions = new ArrayList>(); - placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations)); + List> countriesPositions = new ArrayList>(); + placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizations)); + countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizations)); List> allTokens = new ArrayList>(); allTokens.add(tokenizations); - String header = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions); + String affiliationSequenceWithFeatures = + FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions); - String res = label(header); + String res = label(affiliationSequenceWithFeatures); //return resultBuilder(res, tokenizations, false); // don't use pre-labels results = resultExtractionLayoutTokens(res, tokenizations); @@ -181,11 +184,13 @@ private String runReflow(List affiliationBlocks, List tokenizations) { try { List> placesPositions = new ArrayList>(); - placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations)); + List> countriesPositions = new ArrayList>(); + placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizations)); + countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizations)); List> allTokens = new ArrayList>(); allTokens.add(tokenizations); String affiliationSequenceWithFeatures = - FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions); + FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions); if ((affiliationSequenceWithFeatures == null) || (affiliationSequenceWithFeatures.trim().length() == 0)) { return null; @@ -214,7 +219,7 @@ protected List resultExtractionLayoutTokens(String result, Affiliation affiliation = new Affiliation(); - //System.out.println(result); + System.out.println(result); TaggingLabel lastClusterLabel = null; TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.AFFILIATION_ADDRESS, result, tokenizations); @@ -229,8 +234,8 @@ protected List resultExtractionLayoutTokens(String result, TaggingLabel clusterLabel = cluster.getTaggingLabel(); Engine.getCntManager().i(clusterLabel); - //String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens())); - String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens()); + String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens())); + //String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens()); //String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens()); //String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens()); @@ -247,53 +252,71 @@ protected List resultExtractionLayoutTokens(String result, affiliation.addLabeledResult(TaggingLabels.AFFILIATION_MARKER, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_INSTITUTION)) { if (affiliation.getInstitutions() != null && affiliation.getInstitutions().size()>0) { - + if (affiliation.hasAddress()) { + // new affiliation + if (affiliation.isNotNull()) { + affiliations.add(affiliation); + } + affiliation = new Affiliation(); + } } affiliation.addInstitution(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_INSTITUTION, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_DEPARTMENT)) { if (affiliation.getDepartments() != null && affiliation.getDepartments().size()>0) { - + if (affiliation.hasAddress()) { + // new affiliation + if (affiliation.isNotNull()) { + affiliations.add(affiliation); + } + affiliation = new Affiliation(); + } } affiliation.addDepartment(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_DEPARTMENT, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_LABORATORY)) { if (affiliation.getLaboratories() != null && affiliation.getLaboratories().size()>0) { - + if (affiliation.hasAddress()) { + // new affiliation + if (affiliation.isNotNull()) { + affiliations.add(affiliation); + } + affiliation = new Affiliation(); + } } affiliation.addLaboratory(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_LABORATORY, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_COUNTRY)) { - if (affiliation.getCountry() != null) { - - } - affiliation.setCountry(clusterContent); + if (affiliation.getCountry() != null) + affiliation.setCountry(affiliation.getCountry() + " " + clusterContent); + else + affiliation.setCountry(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_COUNTRY, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_POSTCODE)) { - if (affiliation.getPostCode() != null) { - - } - affiliation.setPostCode(clusterContent); + if (affiliation.getPostCode() != null) + affiliation.setPostCode(affiliation.getPostCode() + " " + clusterContent); + else + affiliation.setPostCode(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_POSTCODE, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_POSTBOX)) { - if (affiliation.getPostBox() != null) { - - } - affiliation.setPostBox(clusterContent); + if (affiliation.getPostBox() != null) + affiliation.setPostBox(affiliation.getPostBox() + " " + clusterContent); + else + affiliation.setPostBox(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_POSTBOX, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_REGION)) { - if (affiliation.getRegion() != null) { - - } - affiliation.setRegion(clusterContent); + if (affiliation.getRegion() != null) + affiliation.setRegion(affiliation.getRegion() + " " + clusterContent); + else + affiliation.setRegion(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_REGION, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_SETTLEMENT)) { - if (affiliation.getSettlement() != null) { - - } - affiliation.setSettlement(clusterContent); + if (affiliation.getSettlement() != null) + affiliation.setSettlement(affiliation.getSettlement() + " " + clusterContent); + else + affiliation.setSettlement(clusterContent); affiliation.addLabeledResult(TaggingLabels.AFFILIATION_SETTLEMENT, tokens); } else if (clusterLabel.equals(TaggingLabels.AFFILIATION_ADDRESSLINE)) { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java b/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java index 60c5a15830..ee0f89decb 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/ProcessEngine.java @@ -270,7 +270,7 @@ public void processAffiliation(final GrobidMainArgs pGbdArgs) throws Exception { if (isEmpty(result)) { throw new GrobidResourceException("Cannot read the input data for affiliations. Check the documentation. "); } - IOUtilities.writeInFile(pGbdArgs.getPath2Output() + File.separator + "result", result.get(0).toTEI()); + IOUtilities.writeInFile(pGbdArgs.getPath2Output() + File.separator + "result", Affiliation.toTEI(result.get(0),0)); } /** diff --git a/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java b/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java index 1e3efdcaaf..614786385e 100755 --- a/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java +++ b/grobid-core/src/main/java/org/grobid/core/features/FeaturesVectorAffiliationAddress.java @@ -117,7 +117,8 @@ public String printVector() { */ static public String addFeaturesAffiliationAddress(List lines, List> allTokens, - List> locationPlaces) throws Exception { + List> locationPlaces, + List> countriesPositions) throws Exception { if (locationPlaces == null) { throw new GrobidException("At least one list of gazetter matches positions is null."); } @@ -128,17 +129,21 @@ static public String addFeaturesAffiliationAddress(List lines, StringBuffer result = new StringBuffer(); List block = null; boolean isPlace = false; + boolean isCountry = false; String lineStatus = "LINESTART"; int locPlace = 0; List currentLocationPlaces = locationPlaces.get(locPlace); + List currentCountryPlaces = countriesPositions.get(locPlace); List tokens = allTokens.get(locPlace); int currentPosPlaces = 0; + int currentPosCountries = 0; int mm = 0; // position of the token in the current sentence String line = null; for (int i = 0; i < lines.size(); i++) { line = lines.get(i); isPlace = false; + isCountry = false; if (line.equals("\n")) { result.append("\n \n"); continue; @@ -176,6 +181,30 @@ static public String addFeaturesAffiliationAddress(List lines, } } + // check the position of matches for country names + skipTest = false; + if ((currentCountryPlaces != null) && (currentCountryPlaces.size() > 0)) { + if (currentPosCountries == currentCountryPlaces.size() - 1) { + if (currentCountryPlaces.get(currentPosCountries).end < mm) { + skipTest = true; + } + } + if (!skipTest) { + for (int j = currentPosCountries; j < currentCountryPlaces.size(); j++) { + if ((currentCountryPlaces.get(j).start <= mm) && + (currentCountryPlaces.get(j).end >= mm)) { + isCountry = true; + currentPosCountries = j; + break; + } else if (currentCountryPlaces.get(j).start > mm) { + isCountry = false; + currentPosCountries = j; + break; + } + } + } + } + if (line.trim().contains("@newline")) { lineStatus = "LINESTART"; continue; @@ -185,8 +214,10 @@ static public String addFeaturesAffiliationAddress(List lines, result.append("\n"); lineStatus = "LINESTART"; currentLocationPlaces = locationPlaces.get(locPlace); + currentCountryPlaces = countriesPositions.get(locPlace); tokens = allTokens.get(locPlace); currentPosPlaces = 0; + currentPosCountries = 0; locPlace++; mm = 0; } else { @@ -202,7 +233,7 @@ static public String addFeaturesAffiliationAddress(List lines, } } - FeaturesVectorAffiliationAddress vector = addFeaturesAffiliationAddress(line, lineStatus, isPlace); + FeaturesVectorAffiliationAddress vector = addFeaturesAffiliationAddress(line, lineStatus, isPlace, isCountry); result.append(vector.printVector()); if (lineStatus.equals("LINESTART")) { @@ -220,7 +251,8 @@ static public String addFeaturesAffiliationAddress(List lines, static private FeaturesVectorAffiliationAddress addFeaturesAffiliationAddress(String line, String lineStatus, - boolean isPlace) { + boolean isPlace, + boolean isCountry) { FeatureFactory featureFactory = FeatureFactory.getInstance(); FeaturesVectorAffiliationAddress featuresVector = new FeaturesVectorAffiliationAddress(); @@ -288,13 +320,12 @@ else if (featureFactory.test_digit(word)) if (featuresVector.punctType == null) featuresVector.punctType = "NOPUNCT"; - if (featureFactory.test_country(word)) { - featuresVector.countryName = true; - } - - if (isPlace) { + if (isPlace) featuresVector.locationName = true; - } + + //if (featureFactory.test_country(word)) + if (isCountry) + featuresVector.countryName = true; featuresVector.wordShape = TextUtilities.wordShape(word); } diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index a25f4f9879..67d55330fd 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -58,6 +58,7 @@ public class Lexicon { private FastMatcher cityPattern = null; private FastMatcher organisationPattern = null; private FastMatcher locationPattern = null; + private FastMatcher countryPattern = null; private FastMatcher orgFormPattern = null; private FastMatcher collaborationPattern = null; @@ -186,6 +187,7 @@ private void initCountryCodes() { LOGGER.info("Initiating country codes"); countryCodes = new HashMap(); countries = new HashSet(); + countryPattern = new FastMatcher(); LOGGER.info("End of initialization of country codes"); } @@ -220,6 +222,10 @@ private void addCountryCodes(String path) { throw new GrobidResourceException("Cannot close all streams.", e); } } + + for (String country : countries) { + countryPattern.loadTerm(country, GrobidAnalyzer.getInstance(), false, false); // ignore delimiters, not case sensitive + } } public String getCountryCode(String country) { @@ -227,6 +233,18 @@ public String getCountryCode(String country) { return code; } + public void initCountryPatterns() { + if (countries == null || countries.size() == 0) { + // it should never be the case + addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + + "lexicon"+File.separator+"countries"+File.separator+"CountryCodes.xml"); + } + + for (String country : countries) { + countryPattern.loadTerm(country, GrobidAnalyzer.getInstance(), false, false); // ignore delimiters, not case sensitive + } + } + public final void addFirstNames(String path) { File file = new File(path); if (!file.exists()) { @@ -727,6 +745,18 @@ public List tokenPositionsOrganisationNames(List s) return results; } + /** + * Soft look-up in country name gazetteer for a given list of LayoutToken objects + * with token positions + */ + public List tokenPositionsCountryNames(List s) { + if (countryPattern == null) { + initCountryPatterns(); + } + List results = countryPattern.matchLayoutToken(s); + return results; + } + /** * Soft look-up in organisation names gazetteer for a string. * It return a list of positions referring to the character positions within the string. diff --git a/grobid-home/lexicon/countries/CountryCodes.xml b/grobid-home/lexicon/countries/CountryCodes.xml index 9dfba20fd1..d7c5ac90f3 100755 --- a/grobid-home/lexicon/countries/CountryCodes.xml +++ b/grobid-home/lexicon/countries/CountryCodes.xml @@ -119,9 +119,10 @@ AR ARG ARGENTINE + Argentine ARGENTINA Argentina - República Argentina + República Argentina AM @@ -141,6 +142,7 @@ AU AUS AUSTRALIE + Australie AUSTRALIA Australia @@ -148,6 +150,7 @@ AT AUT AUTRICHE + Autriche AUSTRIA Austria Österreich @@ -198,8 +201,9 @@ BE BEL BELGIQUE + Belgique BELGIUM - Belgium + Belgium BZ @@ -255,8 +259,9 @@ BR BRA BRÉSIL + Brésil BRAZIL - Brazil + Brazil Brasil @@ -306,8 +311,9 @@ CA CAN CANADA + Canada CANADA - Canada + Canada CV @@ -326,14 +332,15 @@ CHL CHILI CHILE - Chile + Chile CN CHN CHINE + Chine CHINA - China + China People’s Republic of China People’s Republic China Peoples R China @@ -520,6 +527,7 @@ ÉTATS-UNIS UNITED STATES United States + United States of America USA U.S.A U.S.A. @@ -1023,6 +1031,7 @@ MX MEX MEXIQUE + Mexique MEXICO Mexico México @@ -1128,8 +1137,9 @@ NO NOR NORVÈGE + Norvège NORWAY - Norway + Norway Norwegen diff --git a/grobid-home/lexicon/countries/location.country b/grobid-home/lexicon/countries/location.country index 3cf0cb374c..fc7b6ac36d 100644 --- a/grobid-home/lexicon/countries/location.country +++ b/grobid-home/lexicon/countries/location.country @@ -134,6 +134,7 @@ Grand Duchy of Frankfurt Grand Duchy of Hesse Grand Duchy of Kraków Grand Duchy of Lithuania +Grand Duchy of Luxembourg Grand Duchy of Poznań Grand Duchy of Tuscany Grand Duchy of Würzburg diff --git a/grobid-home/lexicon/places/location.txt b/grobid-home/lexicon/places/location.txt index 0f7693e3cb..2987e55497 100644 --- a/grobid-home/lexicon/places/location.txt +++ b/grobid-home/lexicon/places/location.txt @@ -92247,7 +92247,6 @@ United States UNITED STATES Inc U.S. United States of America -United States of America Ap Tau Pai A Guarda Crossfield, Alberta diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java index 5ff1b5cc67..b7e8470377 100644 --- a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java +++ b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessString.java @@ -216,13 +216,19 @@ public Response processAffiliations(String affiliation) { affiliation = affiliation.replaceAll("\\t", " "); List affiliationList = engine.processAffiliation(affiliation); - if (affiliationList != null) { - for(Affiliation affi : affiliationList) { - if (retVal == null) { - retVal = ""; + if (affiliationList != null) { + if (retVal == null) { + retVal = ""; + } + if (affiliationList.size() == 1) { + retVal += Affiliation.toTEI(affiliationList.get(0),0); + } else { + retVal += "\n"; + for(Affiliation affi : affiliationList) { + retVal += Affiliation.toTEI(affi,1); } - retVal += affi.toTEI(); - } + retVal += "\n"; + } } if (GrobidRestUtils.isResultNullOrEmpty(retVal)) { response = Response.status(Status.NO_CONTENT).build(); diff --git a/grobid-trainer/resources/dataset/affiliation-address/corpus/55000370.affiliation.tei.xml b/grobid-trainer/resources/dataset/affiliation-address/corpus/55000370.affiliation.tei.xml index 1081c32b44..339ff9afbf 100644 --- a/grobid-trainer/resources/dataset/affiliation-address/corpus/55000370.affiliation.tei.xml +++ b/grobid-trainer/resources/dataset/affiliation-address/corpus/55000370.affiliation.tei.xml @@ -20,7 +20,8 @@ - The Journal of Nuclear Medicine Department ofNuclear Medicine, + The Journal of Nuclear Medicine + Department ofNuclear Medicine, The University ofMassachusetts Medical Center,
diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/AffiliationAddressTrainer.java b/grobid-trainer/src/main/java/org/grobid/trainer/AffiliationAddressTrainer.java index 0bdc8908ba..3ce9fc5444 100755 --- a/grobid-trainer/src/main/java/org/grobid/trainer/AffiliationAddressTrainer.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/AffiliationAddressTrainer.java @@ -98,6 +98,7 @@ public boolean accept(File dir, String name) { // get a factory for SAX parser SAXParserFactory spf = SAXParserFactory.newInstance(); List> placesPositions = null; + List> countriesPositions = null; List> allTokens = null; int n = 0; @@ -115,11 +116,12 @@ public boolean accept(File dir, String name) { final List labeled = parser2.getLabeledResult(); allTokens = parser2.getAllTokens(); placesPositions = parser2.getPlacesPositions(); + countriesPositions = parser2.getCountriesPositions(); totalExamples += parser2.n; // we can now add the features String affAdd = FeaturesVectorAffiliationAddress - .addFeaturesAffiliationAddress(labeled, allTokens, placesPositions); + .addFeaturesAffiliationAddress(labeled, allTokens, placesPositions, countriesPositions); // format with features for sequence tagging... // given the split ratio we write either in the training file or the evaluation file diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/sax/TEIAffiliationAddressSaxParser.java b/grobid-trainer/src/main/java/org/grobid/trainer/sax/TEIAffiliationAddressSaxParser.java index a584237077..807e843dc7 100755 --- a/grobid-trainer/src/main/java/org/grobid/trainer/sax/TEIAffiliationAddressSaxParser.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/sax/TEIAffiliationAddressSaxParser.java @@ -31,6 +31,7 @@ public class TEIAffiliationAddressSaxParser extends DefaultHandler { private List labeled = null; // store line by line the labeled data public List> placesPositions = null; // list of offset positions of place names + public List> countriesPositions = null; // list of offset positions of country names public List> allTokens = null; //private Writer writerAddress = null; // writer for the address model @@ -46,11 +47,18 @@ public void setTEIHeaderOutput(Writer writ) { public TEIAffiliationAddressSaxParser() { labeled = new ArrayList(); placesPositions = new ArrayList>(); + countriesPositions = new ArrayList>(); allTokens = new ArrayList>(); } public void characters(char[] buffer, int start, int length) { - accumulator.append(buffer, start, length); + StringBuffer localBuffer = new StringBuffer(); + localBuffer.append(buffer, start, length); + String localText = localBuffer.toString(); + localText = localText.replace("\n\t", " "); + localText = localText.replaceAll("( )+", " "); + accumulator.append(localText); + //accumulator.append(buffer, start, length); //if (allContent != null) { // allContent.append(buffer, start, length); //} @@ -68,6 +76,10 @@ public List> getPlacesPositions() { return placesPositions; } + public List> getCountriesPositions() { + return countriesPositions; + } + public List> getAllTokens() { return allTokens; } @@ -96,7 +108,8 @@ public void endElement(java.lang.String uri, accumulator.setLength(0); } else if (qName.equals("lb") || qName.equals("pb")) { // we note a line break - accumulator.append(" @newline "); + //accumulator.append(" @newline "); + accumulator.append("\n"); } else if (qName.equals("affiliation")) { String text = getText(); if (text.length() > 0) { @@ -129,8 +142,8 @@ public void endElement(java.lang.String uri, allString = allString.replace("@newline", "\n"); //List toto = lexicon.tokenPositionsCityNames(allString); List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(allString); - List toto = lexicon.tokenPositionsCityNames(tokens); - placesPositions.add(toto); + placesPositions.add(lexicon.tokenPositionsLocationNames(tokens)); + countriesPositions.add(lexicon.tokenPositionsCountryNames(tokens)); allTokens.add(tokens); allContent = null; allString = null; @@ -221,16 +234,24 @@ private void writeField(String text) { StringTokenizer st = new StringTokenizer(text, " \n\t" + TextUtilities.fullPunctuations, true); boolean begin = true; while (st.hasMoreTokens()) { - String tok = st.nextToken().trim(); + String tok = st.nextToken(); + if (tok.equals("\n")) { + labeled.add("@newline"); + continue; + } + + tok = tok.trim(); if (tok.length() == 0) { continue; } - if (tok.equals("@newline")) { + + /*if (tok.equals("@newline")) { labeled.add("@newline"); } else if (tok.equals("+PAGE+")) { // page break - no influence here labeled.add("@newline"); - } else { + } else*/ + { String content = tok; int i = 0; if (content.length() > 0) {