Merge pull request #549 from elifesciences/extract-raw-affiliation-string-fork

optionally extract raw affiliation string
kermitt2 authored Apr 24, 2020
2 parents 9caa622 + 61926aa commit 639e0f5
Showing 16 changed files with 800 additions and 289 deletions.
@@ -4,6 +4,7 @@
import org.grobid.core.layout.LayoutToken;

import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.wipo.nlp.textboundaries.ReTokenizer;
import org.wipo.nlp.textboundaries.ReTokenizerFactory;

@@ -158,22 +159,8 @@ public List<LayoutToken> tokenizeWithLayoutToken(String text) {
}

public List<LayoutToken> tokenizeWithLayoutToken(String text, Language lang) {
List<LayoutToken> result = new ArrayList<>();
text = UnicodeUtil.normaliseText(text);
List<String> tokens = tokenize(text, lang);
int pos = 0;
for (int i = 0; i < tokens.size(); i++) {
String tok = tokens.get(i);
LayoutToken layoutToken = new LayoutToken();
layoutToken.setText(tok);
layoutToken.setOffset(pos);
result.add(layoutToken);
pos += tok.length();
if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) {
layoutToken.setNewLineAfter(true);
}
}

return result;
return LayoutTokensUtil.getLayoutTokensForTokenizedText(tokens);
}
}
11 changes: 11 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Affiliation.java
@@ -30,6 +30,7 @@ public class Affiliation {

private String addressString = null; // unspecified address field
private String affiliationString = null; // unspecified affiliation field
private String rawAffiliationString = null; // raw affiliation text (excluding marker)

private boolean failAffiliation = true; // tag for unresolved affiliation attachment

@@ -55,6 +56,7 @@ public Affiliation(org.grobid.core.data.Affiliation aff) {
settlement = aff.getSettlement();
addrLine = aff.getAddrLine();
affiliationString = aff.getAffiliationString();
rawAffiliationString = aff.getRawAffiliationString();
}

public String getAcronym() {
@@ -105,6 +107,10 @@ public String getAffiliationString() {
return affiliationString;
}

public String getRawAffiliationString() {
return rawAffiliationString;
}

public List<String> getInstitutions() {
return institutions;
}
@@ -169,6 +175,10 @@ public void setAffiliationString(String s) {
affiliationString = s;
}

public void setRawAffiliationString(String s) {
rawAffiliationString = s;
}

public void setInstitutions(List<String> affs) {
institutions = affs;
}
@@ -481,6 +491,7 @@ public String toString() {
", marker='" + marker + '\'' +
", addressString='" + addressString + '\'' +
", affiliationString='" + affiliationString + '\'' +
", rawAffiliationString='" + rawAffiliationString + '\'' +
", failAffiliation=" + failAffiliation +
'}';
}
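For orientation, a minimal sketch of reading the new field from a parsed result. The wrapper class and the getFullAffiliations() accessor on BiblioItem are outside this diff and used here as assumptions:

import org.grobid.core.data.Affiliation;
import org.grobid.core.data.BiblioItem;

// Illustrative only: print the raw affiliation string stored alongside the
// structured affiliation fields (may be null, e.g. for marker-only entries).
public class RawAffiliationSketch {
    static void printRawAffiliations(BiblioItem biblio) {
        if (biblio == null || biblio.getFullAffiliations() == null)
            return;
        for (Affiliation aff : biblio.getFullAffiliations()) {
            System.out.println(aff.getRawAffiliationString());
        }
    }
}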
374 changes: 150 additions & 224 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java

Large diffs are not rendered by default.

@@ -307,9 +307,9 @@ else if (biblio.getE_Year().length() == 4)
//biblio.attachAffiliations();

if ( (config.getGenerateTeiCoordinates() != null) && (config.getGenerateTeiCoordinates().contains("persName")) )
tei.append(biblio.toTEIAuthorBlock(6, true));
tei.append(biblio.toTEIAuthorBlock(6, true, config));
else
tei.append(biblio.toTEIAuthorBlock(6, false));
tei.append(biblio.toTEIAuthorBlock(6, false, config));

// title
String title = biblio.getTitle();
@@ -32,7 +32,6 @@ public ArrayList<Affiliation> processing(String input) {
return null;
}

ArrayList<String> affiliationBlocks = new ArrayList<String>();
input = UnicodeUtil.normaliseText(input);
input = input.trim();

@@ -44,21 +43,8 @@ public ArrayList<Affiliation> processing(String input) {
//while (st.hasMoreTokens()) {
// String tok = st.nextToken();
//int p = 0;
for(LayoutToken tok : tokenizations) {
if (tok.getText().length() == 0) continue;
if (tok.getText().equals("\n")) {
//tokenizations.set(p, new LayoutToken(" "));
tok.setText(" ");
}
if (!tok.getText().equals(" ")) {
if (tok.getText().equals("\n")) {
affiliationBlocks.add("@newline");
} else
affiliationBlocks.add(tok + " <affiliation>");
}
//p++;
}

List<String> affiliationBlocks = getAffiliationBlocks(tokenizations);
List<List<OffsetPosition>> placesPositions = new ArrayList<List<OffsetPosition>>();
placesPositions.add(lexicon.tokenPositionsCityNames(tokenizations));
List<List<LayoutToken>> allTokens = new ArrayList<List<LayoutToken>>();
@@ -72,6 +58,27 @@
}
}

protected static List<String> getAffiliationBlocks(List<LayoutToken> tokenizations) {
ArrayList<String> affiliationBlocks = new ArrayList<String>();
for(LayoutToken tok : tokenizations) {
if (tok.getText().length() == 0) continue;

// is this necessary?
if (tok.getText().equals("\n")) {
//tokenizations.set(p, new LayoutToken(" "));
tok.setText(" ");
}
if (!tok.getText().equals(" ")) {
if (tok.getText().equals("\n")) {
affiliationBlocks.add("@newline");
} else
affiliationBlocks.add(tok + " <affiliation>");
}
//p++;
}
return affiliationBlocks;
}

/**
* Post processing of extracted field affiliation and address.
* Here the input string to be processed comes from a previous parser: the segmentation
@@ -196,7 +203,7 @@ private String runReflow(List<String> affiliationBlocks,
}


private ArrayList<Affiliation> resultBuilder(String result,
protected ArrayList<Affiliation> resultBuilder(String result,
List<LayoutToken> tokenizations,
boolean usePreLabel) {
ArrayList<Affiliation> fullAffiliations = null;
@@ -698,6 +705,16 @@ private ArrayList<Affiliation> resultBuilder(String result,
}
}

if (!s1.endsWith("<marker>")) {
if (aff.getRawAffiliationString() == null) {
aff.setRawAffiliationString(s2);
} else if (addSpace) {
aff.setRawAffiliationString(aff.getRawAffiliationString() + " " + s2);
} else {
aff.setRawAffiliationString(aff.getRawAffiliationString() + s2);
}
}

lastTag = s1;
lineCount++;
newMarker = false;
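To make the extracted helper concrete, a small illustrative sketch of its expected behaviour follows. The enclosing class is hypothetical; it is placed in the parser's package because getAffiliationBlocks is protected, and the parser class is assumed to be AffiliationAddressParser (the file name is not shown in this view):

package org.grobid.core.engines;

import java.util.Arrays;
import java.util.List;

import org.grobid.core.layout.LayoutToken;

// Illustrative only: feed a tiny token sequence to the extracted helper.
public class AffiliationBlocksSketch {
    public static void main(String[] args) {
        List<LayoutToken> tokens = Arrays.asList(
                new LayoutToken("INRIA"),
                new LayoutToken(" "),
                new LayoutToken("Paris"));

        // Space tokens (including normalised newlines) are skipped; every other
        // token is emitted as a labelling candidate.
        List<String> blocks = AffiliationAddressParser.getAffiliationBlocks(tokens);
        System.out.println(blocks); // expected: [INRIA <affiliation>, Paris <affiliation>]
    }
}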
12 changes: 11 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/engines/Engine.java
@@ -345,11 +345,17 @@ public Language runLanguageId(String filePath) {
* @return the TEI representation of the extracted bibliographical
* information
*/
public String processHeader(String inputFile, int consolidate, BiblioItem result) {
public String processHeader(
String inputFile,
int consolidate,
boolean includeRawAffiliations,
BiblioItem result
) {
GrobidAnalysisConfig config = new GrobidAnalysisConfig.GrobidAnalysisConfigBuilder()
.startPage(0)
.endPage(2)
.consolidateHeader(consolidate)
.includeRawAffiliations(includeRawAffiliations)
.build();
return processHeader(inputFile, config, result);
}
@@ -364,6 +370,10 @@ public String processHeader(String inputFile, int consolidate, BiblioItem result
* @return the TEI representation of the extracted bibliographical
* information
*/
public String processHeader(String inputFile, int consolidate, BiblioItem result) {
return processHeader(inputFile, GrobidAnalysisConfig.defaultInstance(), result);
}

public String processHeader(String inputFile, GrobidAnalysisConfig config, BiblioItem result) {
// normally the BiblioItem reference must not be null, but if it is the
// case, we still continue
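A minimal usage sketch of the new overload, assuming a standard Engine instance; the factory call, the GROBID home setup (omitted here) and the input path are illustrative, not part of this diff:

import org.grobid.core.data.BiblioItem;
import org.grobid.core.engines.Engine;
import org.grobid.core.factory.GrobidFactory;

// Illustrative only: header extraction with raw affiliation strings enabled.
public class HeaderWithRawAffiliations {
    public static void main(String[] args) {
        Engine engine = GrobidFactory.getInstance().createEngine();
        BiblioItem result = new BiblioItem();
        // consolidate = 0 (no consolidation), includeRawAffiliations = true
        String tei = engine.processHeader("/path/to/paper.pdf", 0, true, result);
        System.out.println(tei);
    }
}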
@@ -140,18 +140,23 @@ public Document processing(DocumentSource documentSource,
if (GrobidProperties.isHeaderUseHeuristics()) {
// heuristics for identifying the header zone, this is the old version of the header block identification,
// still used because more robust than the pure machine learning approach (lack of training data)
parsers.getHeaderParser().processingHeaderBlock(config.getConsolidateHeader(), doc, resHeader);
parsers.getHeaderParser().processingHeaderBlock(config, doc, resHeader);
}

if (isBlank(resHeader.getTitle()) || isBlank(resHeader.getAuthors()) || CollectionUtils.isEmpty(resHeader.getFullAuthors())) {
resHeader = new BiblioItem();
// using the segmentation model to identify the header zones
parsers.getHeaderParser().processingHeaderSection(config.getConsolidateHeader(), doc, resHeader);
parsers.getHeaderParser().processingHeaderSection(config, doc, resHeader);
} else {
// if the heuristics method was initially used, we still take the abstract derived from the segmentation
// model, because this structure is significantly more reliable with this approach
BiblioItem resHeader2 = new BiblioItem();
parsers.getHeaderParser().processingHeaderSection(0, doc, resHeader2);
// we have already consolidated
GrobidAnalysisConfig configWithoutConsolidate = (
GrobidAnalysisConfig.builder(config)
.consolidateHeader(0)
).build();
parsers.getHeaderParser().processingHeaderSection(configWithoutConsolidate, doc, resHeader2);
if (isNotBlank(resHeader2.getAbstract())) {
resHeader.setAbstract(resHeader2.getAbstract());
resHeader.setLayoutTokensForLabel(resHeader2.getLayoutTokens(TaggingLabels.HEADER_ABSTRACT), TaggingLabels.HEADER_ABSTRACT);
@@ -78,7 +78,7 @@ public Pair<String, Document> processing(File input, BiblioItem resHeader, Grobi
documentSource = DocumentSource.fromPdf(input, config.getStartPage(), config.getEndPage());
Document doc = parsers.getSegmentationParser().processing(documentSource, config);

String tei = processingHeaderSection(config.getConsolidateHeader(), doc, resHeader);
String tei = processingHeaderSection(config, doc, resHeader);
return new ImmutablePair<String, Document>(tei, doc);
} finally {
if (documentSource != null) {
@@ -102,7 +102,7 @@ public Pair<String, Document> processing2(String pdfInput, BiblioItem resHeader,
throw new GrobidException("PDF parsing resulted in empty content");
}

String tei = processingHeaderBlock(config.getConsolidateHeader(), doc, resHeader);
String tei = processingHeaderBlock(config, doc, resHeader);
return Pair.of(tei, doc);
} catch (Exception e) {
throw new GrobidException(e, GrobidExceptionStatus.GENERAL);
@@ -116,7 +116,7 @@
/**
* Header processing after identification of the header blocks with heuristics (old approach)
*/
public String processingHeaderBlock(int consolidate, Document doc, BiblioItem resHeader) throws Exception {
public String processingHeaderBlock(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader) throws Exception {
String header;
//if (doc.getBlockDocumentHeaders() == null) {
header = doc.getHeaderFeatured(true, true);
@@ -264,7 +264,7 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
}
}

resHeader = consolidateHeader(resHeader, consolidate);
resHeader = consolidateHeader(resHeader, config.getConsolidateHeader());

// normalization of dates
if (resHeader != null) {
@@ -296,7 +296,7 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
doc.setResHeader(resHeader);

TEIFormatter teiFormatter = new TEIFormatter(doc, null);
StringBuilder tei = teiFormatter.toTEIHeader(resHeader, null, null, GrobidAnalysisConfig.builder().consolidateHeader(consolidate).build());
StringBuilder tei = teiFormatter.toTEIHeader(resHeader, null, null, config);
tei.append("\t</text>\n");
tei.append("</TEI>\n");
//LOGGER.debug(tei.toString());
@@ -306,7 +306,7 @@ public String processingHeaderBlock(int consolidate, Document doc, BiblioItem re
/**
* Header processing after application of the segmentation model (new approach)
*/
public String processingHeaderSection(int consolidate, Document doc, BiblioItem resHeader) {
public String processingHeaderSection(GrobidAnalysisConfig config, Document doc, BiblioItem resHeader) {
try {
SortedSet<DocumentPiece> documentHeaderParts = doc.getDocumentPart(SegmentationLabels.HEADER);
List<LayoutToken> tokenizations = doc.getTokenizations();
@@ -478,7 +478,7 @@ public String processingHeaderSection(int consolidate, Document doc, BiblioItem
}
}

resHeader = consolidateHeader(resHeader, consolidate);
resHeader = consolidateHeader(resHeader, config.getConsolidateHeader());

// normalization of dates
if (resHeader != null) {
@@ -504,7 +504,7 @@ public String processingHeaderSection(int consolidate, Document doc, BiblioItem
}

TEIFormatter teiFormatter = new TEIFormatter(doc, null);
StringBuilder tei = teiFormatter.toTEIHeader(resHeader, null, null, GrobidAnalysisConfig.defaultInstance());
StringBuilder tei = teiFormatter.toTEIHeader(resHeader, null, null, config);
tei.append("\t</text>\n");
tei.append("</TEI>\n");
return tei.toString();
@@ -46,6 +46,9 @@ private GrobidAnalysisConfig() {
// if consolidate header
private int consolidateHeader = 0;

// if the raw affiliation string should be included in the parsed results
private boolean includeRawAffiliations = false;

// if the raw bibliographical string should be included in the parsed results
private boolean includeRawCitations = false;

@@ -82,6 +85,15 @@ private GrobidAnalysisConfig() {
public static class GrobidAnalysisConfigBuilder {
GrobidAnalysisConfig config = new GrobidAnalysisConfig();

public GrobidAnalysisConfigBuilder() {
}

public GrobidAnalysisConfigBuilder(GrobidAnalysisConfig config) {
// TODO add more properties
this.config.includeRawAffiliations = config.getIncludeRawAffiliations();
this.config.includeRawCitations = config.getIncludeRawCitations();
}

public GrobidAnalysisConfigBuilder consolidateHeader(int consolidate) {
config.consolidateHeader = consolidate;
return this;
@@ -97,6 +109,11 @@ public GrobidAnalysisConfigBuilder consolidateCitations(int consolidate) {
return this;
}

public GrobidAnalysisConfigBuilder includeRawAffiliations(boolean rawAffiliations) {
config.includeRawAffiliations = rawAffiliations;
return this;
}

public GrobidAnalysisConfigBuilder includeRawCitations(boolean rawCitations) {
config.includeRawCitations = rawCitations;
return this;
@@ -168,6 +185,10 @@ public static GrobidAnalysisConfigBuilder builder() {
return new GrobidAnalysisConfigBuilder();
}

public static GrobidAnalysisConfigBuilder builder(GrobidAnalysisConfig config) {
return new GrobidAnalysisConfigBuilder(config);
}

public static GrobidAnalysisConfig defaultInstance() {
return new GrobidAnalysisConfig();
}
@@ -188,6 +209,10 @@ public int getConsolidateHeader() {
return consolidateHeader;
}

public boolean getIncludeRawAffiliations() {
return includeRawAffiliations;
}

public boolean getIncludeRawCitations() {
return includeRawCitations;
}
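For reference, a short sketch of the configuration options introduced above; the wrapper class is illustrative, the import path follows GROBID's usual package layout, and only the behaviour shown in this diff is assumed (the copying builder currently carries over just the raw-affiliation and raw-citation flags):

import org.grobid.core.engines.config.GrobidAnalysisConfig;

// Illustrative only: build a config with raw affiliations enabled, then copy it
// with header consolidation switched off, as done in the processing path above.
public class AnalysisConfigSketch {
    public static void main(String[] args) {
        GrobidAnalysisConfig config = GrobidAnalysisConfig.builder()
                .consolidateHeader(1)
                .includeRawAffiliations(true)
                .build();

        GrobidAnalysisConfig withoutConsolidation = GrobidAnalysisConfig.builder(config)
                .consolidateHeader(0)
                .build();

        System.out.println(withoutConsolidation.getIncludeRawAffiliations()); // true
    }
}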
@@ -317,4 +317,22 @@ public static List<LayoutToken> subListByOffset(List<LayoutToken> token, int sta
.collect(Collectors.toList());
}

public static List<LayoutToken> getLayoutTokensForTokenizedText(List<String> tokens) {
List<LayoutToken> result = new ArrayList<>();
int pos = 0;
for (int i = 0; i < tokens.size(); i++) {
String tok = tokens.get(i);
LayoutToken layoutToken = new LayoutToken();
layoutToken.setText(tok);
layoutToken.setOffset(pos);
result.add(layoutToken);
pos += tok.length();
if (i < tokens.size() - 1 && tokens.get(i + 1).equals("\n")) {
layoutToken.setNewLineAfter(true);
}
}

return result;
}

}
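Finally, a small illustrative sketch of the relocated helper; the wrapper class, the sample tokens, and the isNewLineAfter() getter name are assumptions outside this diff:

import java.util.Arrays;
import java.util.List;

import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.LayoutTokensUtil;

// Illustrative only: offsets are cumulative character positions, and a token
// immediately followed by "\n" is flagged with newLineAfter.
public class LayoutTokensSketch {
    public static void main(String[] args) {
        List<String> tokens = Arrays.asList("Hello", " ", "world", "\n", "again");
        List<LayoutToken> layoutTokens =
                LayoutTokensUtil.getLayoutTokensForTokenizedText(tokens);

        for (LayoutToken t : layoutTokens) {
            System.out.println(t.getText() + " offset=" + t.getOffset()
                    + " newLineAfter=" + t.isNewLineAfter());
        }
    }
}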