Skip to content

Commit

Permalink
big fixing for updated affiliation parser
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Dec 14, 2023
1 parent d7b2ff1 commit fb621d6
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ public void setAffiliationString(String s) {

/**
 * Sets the raw affiliation string, collapsing every run of consecutive
 * space characters into a single space.
 *
 * @param s the raw affiliation text; when null, the field is simply set to null
 */
public void setRawAffiliationString(String s) {
    // Guard against null: calling replaceAll on a null reference would throw a
    // NullPointerException, which a plain setter should not do.
    rawAffiliationString = (s == null) ? null : s.replaceAll("( )+", " ");
}

public void setInstitutions(List<String> affs) {
Expand Down
14 changes: 14 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ public class BiblioItem {
// map of labels (e.g. <title> or <abstract>) to LayoutToken
private Map<String, List<LayoutToken>> labeledTokens;

// accumulation of the LayoutTokens for sequences of affiliation/address
private List<List<LayoutToken>> affiliationAddresslabeledTokens;

/**
* The following are internal working structures not meant to be used outside.
* For collecting layout tokens of the various bibliographical component,
Expand Down Expand Up @@ -4419,6 +4422,13 @@ public void generalResultMappingHeader(String labeledResult, List<LayoutToken> t
theList = theList == null ? new ArrayList<>() : theList;
theList.addAll(clusterTokens);
labeledTokens.put(clusterLabel.getLabel(), theList);

if (clusterLabel.equals(TaggingLabels.HEADER_AFFILIATION) || clusterLabel.equals(TaggingLabels.HEADER_ADDRESS)) {
if (affiliationAddresslabeledTokens == null)
affiliationAddresslabeledTokens = new ArrayList<>();
if (!affiliationAddresslabeledTokens.contains(clusterTokens))
affiliationAddresslabeledTokens.add(clusterTokens);
}
}
}

Expand Down Expand Up @@ -4458,4 +4468,8 @@ public String getAvailabilityStmt() {
/**
 * Sets the availability statement stored on this item.
 *
 * @param availabilityStmt the value assigned to the availabilityStmt field
 */
public void setAvailabilityStmt(String availabilityStmt) {
this.availabilityStmt = availabilityStmt;
}

/**
 * Returns the accumulated layout token sequences collected for the
 * affiliation/address parts of the header.
 *
 * @return the list of token sequences as currently stored in the field; the field
 *         is declared without an initializer, so this may be null if nothing was
 *         accumulated
 */
public List<List<LayoutToken>> getAffiliationAddresslabeledTokens() {
return affiliationAddresslabeledTokens;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -79,37 +79,109 @@ protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizatio
return affiliationBlocks;
}

/**
 * Builds the list of pre-labelled token lines from segments of layout tokens.
 * <p>
 * Each non-space token yields one line of the form {@code <token> <affiliation>},
 * a token whose text is a line break yields {@code @newline}, and a {@code "\n"}
 * entry is inserted between segments whenever the offset of the first token of a
 * segment is more than 2 beyond the offset of the last token seen so far.
 *
 * @param tokenizations the token segments; null or empty segments are skipped
 * @return the block lines, never null (empty when the input is null or has no usable token)
 */
protected static List<String> getAffiliationBlocksFromSegments(List<List<LayoutToken>> tokenizations) {
    List<String> affiliationBlocks = new ArrayList<>();
    if (tokenizations == null) {
        return affiliationBlocks;
    }

    // offset of the last non-empty token processed so far
    int end = 0;
    for (List<LayoutToken> tokenizationSegment : tokenizations) {
        if (tokenizationSegment == null || tokenizationSegment.isEmpty()) {
            continue;
        }

        // if we have an offset shift, we introduce a segmentation of the affiliation block
        // NOTE(review): end is still 0 for the first segment, so a separator is emitted
        // up front whenever the first token offset exceeds 2 -- confirm this is intended.
        int start = tokenizationSegment.get(0).getOffset();
        if (start - end > 2) {
            affiliationBlocks.add("\n");
        }

        for (LayoutToken tok : tokenizationSegment) {
            String text = tok.getText();
            // tokens without text carry nothing to label and do not move the end offset
            if (text == null || text.isEmpty()) {
                continue;
            }

            if (text.equals("\n")) {
                affiliationBlocks.add("@newline");
            } else if (!text.equals(" ")) {
                affiliationBlocks.add(tok + " <affiliation>");
            }
            end = tok.getOffset();
        }
    }
    return affiliationBlocks;
}

/**
 * Runs the affiliation/address labelling on segments of layout tokens and
 * extracts the resulting affiliations.
 * <p>
 * The segments are flattened into a single token list used for the lexicon
 * look-ups, the feature generation and the result extraction, while the
 * segment boundaries are used to build the block lines passed to the
 * feature generation.
 *
 * @param tokenizations the token segments to process; null segments are ignored
 * @return the extracted affiliations, or null when the input is null or empty
 * @throws GrobidException wrapping any exception raised during processing
 */
public List<Affiliation> processingLayoutTokens(List<List<LayoutToken>> tokenizations) {
    if (tokenizations == null || tokenizations.isEmpty()) {
        return null;
    }

    try {
        // Flatten the segments; null segments are skipped, matching what
        // getAffiliationBlocksFromSegments does, instead of failing on addAll(null).
        List<LayoutToken> tokenizationsAffiliation = new ArrayList<>();
        for (List<LayoutToken> tokenization : tokenizations) {
            if (tokenization != null) {
                tokenizationsAffiliation.addAll(tokenization);
            }
        }

        List<String> affiliationBlocks = getAffiliationBlocksFromSegments(tokenizations);

        List<List<OffsetPosition>> placesPositions = new ArrayList<>();
        List<List<OffsetPosition>> countriesPositions = new ArrayList<>();
        placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation));
        countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation));

        List<List<LayoutToken>> allTokens = new ArrayList<>();
        allTokens.add(tokenizationsAffiliation);

        String affiliationSequenceWithFeatures =
            FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(
                affiliationBlocks, allTokens, placesPositions, countriesPositions);

        String res = label(affiliationSequenceWithFeatures);
        return resultExtractionLayoutTokens(res, tokenizationsAffiliation);
    } catch (Exception e) {
        throw new GrobidException("An exception occurred while running Grobid.", e);
    }
}

/**
* Post processing of extracted field affiliation and address.
* Here the input string to be processed comes from a previous parser: the segmentation
* can be kept and we filter in all tokens labelled <address> or <affiliation>.
* We also need to keep the original tokenization information to recreate the exact
* initial string.
*/
public List<Affiliation> processReflow(String result, List<LayoutToken> tokenizations) {
/*public List<Affiliation> processReflow(String result, List<LayoutToken> tokenizations) {
if ((result == null) || (result.length() == 0)) {
return null;
}
List<String> affiliationBlocks = new ArrayList<String>();
List<String> affiliationFeatureLines = new ArrayList<String>();
List<LayoutToken> subTokenizations = new ArrayList<LayoutToken>();
filterAffiliationAddress(result, tokenizations, affiliationBlocks, subTokenizations);
filterAffiliationAddress(result, tokenizations, affiliationFeatureLines, subTokenizations);
return processingReflow(affiliationBlocks, subTokenizations);
}
System.out.println(affiliationFeatureLines.toString());
System.out.println(subTokenizations.toString());
return processingReflow(affiliationFeatureLines, subTokenizations);
}*/

private void filterAffiliationAddress(String result,
/*private void filterAffiliationAddress(String result,
List<LayoutToken> tokenizations,
List<String> affiliationBlocks,
List<String> affiliationFeatureLines,
List<LayoutToken> subTokenizations) {
// result is the header feature matrix with labels
// tokenizations is the layout tokens of the full header
// affiliationFeatureLines is where to put the lines with header labels affiliation or address
// subTokenizations is where to put the layout tokens corresponding to what is labeled with header labels affiliation or address
StringTokenizer st = new StringTokenizer(result, "\n");
String lastLabel = null;
int p = 0;
int p = 0; // index in the tokenizations list
List<LayoutToken> tokenizationsBuffer = null;
while (st.hasMoreTokens() && (p < tokenizations.size())) {
String line = st.nextToken();
if (line.trim().length() == 0) {
affiliationBlocks.add("\n");
affiliationFeatureLines.add("\n");
lastLabel = null;
}
else {
Expand Down Expand Up @@ -144,23 +216,29 @@ private void filterAffiliationAddress(String result,
p = p0;
continue;
}
}
}
int ll = s.length;
String label = s[ll-1];
if ((label.indexOf("affiliation") == -1) && (label.indexOf("address") == -1)) {
// not affiliation/address input
if (lastLabel != null) {
affiliationFeatureLines.add("\n");
}
lastLabel = null;
continue;
}
if ((tokOriginal != null) && ( ((label.indexOf("affiliation") != -1) || (label.indexOf("address") != -1)) )) {
affiliationBlocks.add(tokOriginal + " " + label);
affiliationFeatureLines.add(tokOriginal + " " + label);
// add the content of tokenizationsBuffer
for(LayoutToken tokk : tokenizationsBuffer) {
subTokenizations.add(tokk);
}
if (tokenizationsBuffer.size() > 0 && isEndLine) {
affiliationBlocks.add("@newline");
affiliationFeatureLines.add("@newline");
}
}
else if (lastLabel != null) {
affiliationBlocks.add("\n");
}
}
if ((label.indexOf("affiliation") != -1) || (label.indexOf("address") != -1)) {
lastLabel = label;
Expand All @@ -172,15 +250,15 @@ else if (lastLabel != null) {
//System.out.println(subTokenizations.toString());
//System.out.println(affiliationBlocks.toString());
}
}*/

private List<Affiliation> processingReflow(List<String> affiliationBlocks, List<LayoutToken> tokenizations) {
String res = runReflow(affiliationBlocks, tokenizations);
/*private List<Affiliation> processingReflow(List<String> affiliationFeatureLines, List<LayoutToken> tokenizations) {
String res = runReflow(affiliationFeatureLines, tokenizations);
//return resultBuilder(res, tokenizations, false); // normally use pre-label because it is a reflow
return resultExtractionLayoutTokens(res, tokenizations);
}
}*/

private String runReflow(List<String> affiliationBlocks,
/*private String runReflow(List<String> affiliationFeatureLines,
List<LayoutToken> tokenizations) {
try {
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
Expand All @@ -190,7 +268,7 @@ private String runReflow(List<String> affiliationBlocks,
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
allTokens.add(tokenizations);
String affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationFeatureLines, allTokens, placesPositions, countriesPositions);
if ((affiliationSequenceWithFeatures == null) || (affiliationSequenceWithFeatures.trim().length() == 0)) {
return null;
Expand All @@ -202,7 +280,7 @@ private String runReflow(List<String> affiliationBlocks,
} catch (Exception e) {
throw new GrobidException("An exception occured while running Grobid at the affiliation-address labeling task.", e);
}
}
}*/

/**
* Extract results from a labeled sequence.
Expand All @@ -211,20 +289,20 @@ private String runReflow(List<String> affiliationBlocks,
* @param tokenizations list of tokens
* @return list of Affiliation objects
*/
protected List<Affiliation> resultExtractionLayoutTokens(String result,
List<LayoutToken> tokenizations) {
protected List<Affiliation> resultExtractionLayoutTokens(String result, List<LayoutToken> tokenizations) {
List<Affiliation> affiliations = new ArrayList<>();
if (result == null)
return affiliations;

Affiliation affiliation = new Affiliation();

System.out.println(result);
//System.out.println(result);

TaggingLabel lastClusterLabel = null;
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.AFFILIATION_ADDRESS, result, tokenizations);

String tokenLabel = null;
boolean newline = true;
List<TaggingTokenCluster> clusters = clusteror.cluster();
for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
Expand All @@ -242,12 +320,15 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result,
List<LayoutToken> tokens = cluster.concatTokens();

if (clusterLabel.equals(TaggingLabels.AFFILIATION_MARKER)) {
if (affiliation.getMarker() != null) {
// if an affiliation already has a marker, or if a marker starts a line,
// we introduce a new affiliation
if (affiliation.getMarker() != null || newline) {
if (affiliation.isNotNull()) {
affiliations.add(affiliation);
}
affiliation = new Affiliation();
}

affiliation.setMarker(clusterContent);
affiliation.addLabeledResult(TaggingLabels.AFFILIATION_MARKER, tokens);
} else if (clusterLabel.equals(TaggingLabels.AFFILIATION_INSTITUTION)) {
Expand Down Expand Up @@ -331,6 +412,21 @@ protected List<Affiliation> resultExtractionLayoutTokens(String result,
if (!clusterLabel.equals(TaggingLabels.OTHER) && affiliation.isNotNull()) {
affiliation.appendLayoutTokens(tokens);
}

if (!clusterLabel.equals(TaggingLabels.AFFILIATION_MARKER)) {
if (affiliation.getRawAffiliationString() == null) {
affiliation.setRawAffiliationString(clusterContent);
} else {
affiliation.setRawAffiliationString(affiliation.getRawAffiliationString() + " " + clusterContent);
}
}

newline = false;
if (tokens.size() > 0) {
LayoutToken lastToken = tokens.get(tokens.size()-1);
if (lastToken.getText() != null && lastToken.getText().equals("\n"))
newline = true;
}
}

// last affiliation
Expand Down Expand Up @@ -885,17 +981,47 @@ protected ArrayList<Affiliation> resultBuilder(String result,
/**
* Extract results from a labelled header in the training format without any string modification.
*/
public StringBuilder trainingExtraction(String result,
List<LayoutToken> tokenizations) {
if ((result == null) || (result.length() == 0)) {
public StringBuilder trainingExtraction(List<LayoutToken> tokenizationsAffiliation) {
/*if ((result == null) || (result.length() == 0)) {
return null;
}*/

if (tokenizationsAffiliation == null || tokenizationsAffiliation.size() == 0)
return null;

List<String> affiliationBlocks = getAffiliationBlocks(tokenizationsAffiliation);
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
List<List<OffsetPosition>> countriesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsLocationNames(tokenizationsAffiliation));
countriesPositions.add(lexicon.tokenPositionsCountryNames(tokenizationsAffiliation));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
allTokens.add(tokenizationsAffiliation);

String affiliationSequenceWithFeatures = null;
try {
affiliationSequenceWithFeatures =
FeaturesVectorAffiliationAddress.addFeaturesAffiliationAddress(affiliationBlocks, allTokens, placesPositions, countriesPositions);
} catch(Exception e) {
throw new GrobidException("An exception occurred while running Grobid.", e);
}

List<String> affiliationBlocks = new ArrayList<String>();
List<LayoutToken> tokenizationsAffiliation = new ArrayList<LayoutToken>();
if (affiliationSequenceWithFeatures == null) {
return null;
}

String resultAffiliation = label(affiliationSequenceWithFeatures);
//return resultBuilder(res, tokenizations, false); // don't use pre-labels

//results = resultExtractionLayoutTokens(res, tokenizations);




//List<String> affiliationBlocks = new ArrayList<String>();
//List<LayoutToken> tokenizationsAffiliation = new ArrayList<LayoutToken>();

filterAffiliationAddress(result, tokenizations, affiliationBlocks, tokenizationsAffiliation);
String resultAffiliation = runReflow(affiliationBlocks, tokenizationsAffiliation);
//filterAffiliationAddress(result, tokenizations, affiliationBlocks, tokenizationsAffiliation);
//String resultAffiliation = runReflow(affiliationBlocks, tokenizationsAffiliation);

StringBuilder bufferAffiliation = new StringBuilder();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1353,6 +1353,8 @@ public Document createTraining(File inputFile,
writer.close();

String rese = parsers.getHeaderParser().label(header);
BiblioItem resHeader = new BiblioItem();
resHeader = parsers.getHeaderParser().resultExtraction(rese, headerTokenizations, resHeader);

// buffer for the header block
StringBuilder bufferHeader = parsers.getHeaderParser().trainingExtraction(rese, headerTokenizations);
Expand All @@ -1362,8 +1364,9 @@ public Document createTraining(File inputFile,
}

// buffer for the affiliation+address block
List<LayoutToken> tokenizationsAffiliation = resHeader.getLayoutTokens(TaggingLabels.HEADER_AFFILIATION);
StringBuilder bufferAffiliation =
parsers.getAffiliationAddressParser().trainingExtraction(rese, headerTokenizations);
parsers.getAffiliationAddressParser().trainingExtraction(tokenizationsAffiliation);

// buffer for the date block
StringBuilder bufferDate = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,12 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
// remove invalid authors (no last name, noise, etc.)
resHeader.setFullAuthors(Person.sanityCheck(resHeader.getFullAuthors()));

//List<LayoutToken> tokenizationsAffiliation = resHeader.getLayoutTokens(TaggingLabels.HEADER_AFFILIATION);
List<List<LayoutToken>> tokenizationsAffiliation = resHeader.getAffiliationAddresslabeledTokens();
//resHeader.setFullAffiliations(
// parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
resHeader.setFullAffiliations(
parsers.getAffiliationAddressParser().processReflow(res, tokenizations));
parsers.getAffiliationAddressParser().processingLayoutTokens(tokenizationsAffiliation));
resHeader.attachEmails();
boolean attached = false;
if (fragmentedAuthors && !hasMarker) {
Expand Down
Loading

0 comments on commit fb621d6

Please sign in to comment.