Skip to content

Commit

Permalink
fix features for affiliation
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Dec 12, 2023
1 parent 742ca4b commit 68360d9
Show file tree
Hide file tree
Showing 12 changed files with 234 additions and 76 deletions.
58 changes: 46 additions & 12 deletions grobid-core/src/main/java/org/grobid/core/data/Affiliation.java
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ public void setAddressString(String s) {
}

/**
 * Stores the country name after cleaning its boundaries.
 *
 * The value is passed through
 * {@code TextUtilities.removeLeadingAndTrailingChars} using the same
 * punctuation/whitespace character set for the leading and the trailing side.
 *
 * @param s raw country string
 */
public void setCountry(String s) {
    country = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n", "[({.,])}: \n");
}

Expand All @@ -170,10 +171,12 @@ public void setPostBox(String s) {
}

/**
 * Stores the region after cleaning its boundaries.
 *
 * The value is passed through
 * {@code TextUtilities.removeLeadingAndTrailingChars} using the same
 * punctuation/whitespace character set for the leading and the trailing side.
 *
 * @param s raw region string
 */
public void setRegion(String s) {
    region = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n", "[({.,])}: \n");
}

/**
 * Stores the settlement after cleaning its boundaries.
 *
 * The value is passed through
 * {@code TextUtilities.removeLeadingAndTrailingChars} using the same
 * punctuation/whitespace character set for the leading and the trailing side.
 *
 * @param s raw settlement string
 */
public void setSettlement(String s) {
    settlement = TextUtilities.removeLeadingAndTrailingChars(s, "[({.,])}: \n", "[({.,])}: \n");
}

Expand Down Expand Up @@ -219,6 +222,9 @@ public void addLaboratory(String aff) {
laboratories.add(TextUtilities.cleanField(aff, true));
}

/**
* DEPRECATED
**/
public void extendFirstInstitution(String theExtend) {
if (institutions == null) {
institutions = new ArrayList<String>();
Expand All @@ -230,6 +236,9 @@ public void extendFirstInstitution(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendLastInstitution(String theExtend) {
if (institutions == null) {
institutions = new ArrayList<String>();
Expand All @@ -241,6 +250,9 @@ public void extendLastInstitution(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendFirstDepartment(String theExtend) {
if (departments == null) {
departments = new ArrayList<String>();
Expand All @@ -252,6 +264,9 @@ public void extendFirstDepartment(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendLastDepartment(String theExtend) {
if (departments == null) {
departments = new ArrayList<String>();
Expand All @@ -263,6 +278,9 @@ public void extendLastDepartment(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendFirstLaboratory(String theExtend) {
if (laboratories == null) {
laboratories = new ArrayList<String>();
Expand All @@ -274,6 +292,9 @@ public void extendFirstLaboratory(String theExtend) {
}
}

/**
* DEPRECATED
**/
public void extendLastLaboratory(String theExtend) {
if (laboratories == null) {
laboratories = new ArrayList<String>();
Expand All @@ -286,19 +307,32 @@ public void extendLastLaboratory(String theExtend) {
}

public boolean isNotNull() {
return !((departments == null) &
(institutions == null) &
(laboratories == null) &
(country == null) &
(postCode == null) &
(postBox == null) &
(region == null) &
(settlement == null) &
(addrLine == null) &
(affiliationString == null) &
return !((departments == null) &&
(institutions == null) &&
(laboratories == null) &&
(country == null) &&
(postCode == null) &&
(postBox == null) &&
(region == null) &&
(settlement == null) &&
(addrLine == null) &&
(affiliationString == null) &&
(addressString == null));
}

/**
 * Tells whether any address-related field has been set on this affiliation.
 *
 * @return {@code true} if at least one of country, postCode, postBox,
 *         settlement, addrLine, region or addressString is non-null,
 *         {@code false} when all of them are null
 */
public boolean hasAddress() {
    return country != null
        || postCode != null
        || postBox != null
        || settlement != null
        || addrLine != null
        || region != null
        || addressString != null;
}

public void setFailAffiliation(boolean b) {
failAffiliation = b;
}
Expand Down Expand Up @@ -448,7 +482,7 @@ public int nbStructures() {
return nbStruct;
}

@Deprecated
/*@Deprecated
public String toTEI() {
StringBuilder tei = new StringBuilder();
if (!isNotNull()) {
Expand Down Expand Up @@ -535,7 +569,7 @@ public String toTEI() {
}
return tei.toString();
}
}*/

public static String toTEI(Affiliation aff, int nbTag) {
StringBuffer tei = new StringBuffer();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,15 @@ public List<Affiliation> processing(String input) {

List<String> affiliationBlocks = getAffiliationBlocks(tokenizations);
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations));
List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizations));
countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizations));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
allTokens.add(tokenizations);
String header = FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions);
String affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);

String res = label(header);
String res = label(affiliationSequenceWithFeatures);
//return resultBuilder(res, tokenizations, false); // don't use pre-labels

results = resultExtractionLayoutTokens(res, tokenizations);
Expand Down Expand Up @@ -181,11 +184,13 @@ private String runReflow(List<String> affiliationBlocks,
List<LayoutToken> tokenizations) {
try {
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations));
List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizations));
countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizations));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
allTokens.add(tokenizations);
String affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions);
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);

if ((affiliationSequenceWithFeatures == null) || (affiliationSequenceWithFeatures.trim().length() == 0)) {
return null;
Expand Down Expand Up @@ -214,7 +219,7 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result,

Affiliation affiliation = new Affiliation();

//System.out.println(result);
System.out.println(result);

TaggingLabel lastClusterLabel = null;
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.AFFILIATION_ADDRESS, result, tokenizations);
Expand All @@ -229,8 +234,8 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result,
TaggingLabel clusterLabel = cluster.getTaggingLabel();
Engine.getCntManager().i(clusterLabel);

//String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens()));
String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens());
String clusterContent = LayoutTokensUtil.normalizeText(LayoutTokensUtil.toText(cluster.concatTokens()));
//String clusterContent = LayoutTokensUtil.toText(cluster.concatTokens());
//String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
//String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens());

Expand All @@ -247,53 +252,71 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result,
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_MARKER, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_INSTITUTION)) {
if (affiliation.getInstitutions() != null && affiliation.getInstitutions().size()>0) {

if (affiliation.hasAddress()) {
// new affiliation
if (affiliation.isNotNull()) {
affiliations.add(affiliation);
}
affiliation = new Affiliation();
}
}
affiliation.addInstitution(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_INSTITUTION, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_DEPARTMENT)) {
if (affiliation.getDepartments() != null && affiliation.getDepartments().size()>0) {

if (affiliation.hasAddress()) {
// new affiliation
if (affiliation.isNotNull()) {
affiliations.add(affiliation);
}
affiliation = new Affiliation();
}
}
affiliation.addDepartment(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_DEPARTMENT, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_LABORATORY)) {
if (affiliation.getLaboratories() != null && affiliation.getLaboratories().size()>0) {

if (affiliation.hasAddress()) {
// new affiliation
if (affiliation.isNotNull()) {
affiliations.add(affiliation);
}
affiliation = new Affiliation();
}
}
affiliation.addLaboratory(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_LABORATORY, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_COUNTRY)) {
if (affiliation.getCountry() != null) {

}
affiliation.setCountry(clusterContent);
if (affiliation.getCountry() != null)
affiliation.setCountry(affiliation.getCountry() + " " + clusterContent);
else
affiliation.setCountry(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_COUNTRY, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_POSTCODE)) {
if (affiliation.getPostCode() != null) {

}
affiliation.setPostCode(clusterContent);
if (affiliation.getPostCode() != null)
affiliation.setPostCode(affiliation.getPostCode() + " " + clusterContent);
else
affiliation.setPostCode(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_POSTCODE, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_POSTBOX)) {
if (affiliation.getPostBox() != null) {

}
affiliation.setPostBox(clusterContent);
if (affiliation.getPostBox() != null)
affiliation.setPostBox(affiliation.getPostBox() + " " + clusterContent);
else
affiliation.setPostBox(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_POSTBOX, tokens);

} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_REGION)) {
if (affiliation.getRegion() != null) {

}
affiliation.setRegion(clusterContent);
if (affiliation.getRegion() != null)
affiliation.setRegion(affiliation.getRegion() + " " + clusterContent);
else
affiliation.setRegion(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_REGION, tokens);

} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_SETTLEMENT)) {
if (affiliation.getSettlement() != null) {

}
affiliation.setSettlement(clusterContent);
if (affiliation.getSettlement() != null)
affiliation.setSettlement(affiliation.getSettlement() + " " + clusterContent);
else
affiliation.setSettlement(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_SETTLEMENT, tokens);

} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_ADDRESSLINE)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ public void processAffiliation(final GrobidMainArgs pGbdArgs) throws Exception {
if (isEmpty(result)) {
throw new GrobidResourceException("Cannot read the input data for affiliations. Check the documentation. ");
}
IOUtilities.writeInFile(pGbdArgs.getPath2Output() + File.separator + "result", result.get(0).toTEI());
IOUtilities.writeInFile(pGbdArgs.getPath2Output() + File.separator + "result", Affiliation.toTEI(result.get(0),0));
}

/**
Expand Down
Loading

0 comments on commit 68360d9

Please sign in to comment.