Skip to content

Commit

Permalink
Merge pull request #1075 from kermitt2/fix-note-page
Browse files Browse the repository at this point in the history
Fix OOBE when processing large quantities of notes
  • Loading branch information
kermitt2 authored Jan 14, 2024
2 parents 46913da + 69fcecb commit cbc77d5
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 16 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/ci-build-unstable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,25 @@ jobs:
format: jacoco
file: build/reports/jacoco/codeCoverageReport/codeCoverageReport.xml
debug: true

docker-build:
needs: [ build ]
runs-on: ubuntu-latest

steps:
- name: Create more disk space
run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- uses: actions/checkout@v2
- name: Build and push
id: docker_build
uses: mr-smithers-excellent/docker-build-push@v5
with:
username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }}
password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }}
image: lfoppiano/grobid
registry: docker.io
pushImage: ${{ github.event_name != 'pull_request' }}
tags: latest-develop
dockerfile: Dockerfile.crf
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
7 changes: 6 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Note.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.grobid.core.data;

import org.apache.commons.collections4.CollectionUtils;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.Page;
import org.grobid.core.utilities.*;
Expand Down Expand Up @@ -74,7 +75,11 @@ public void setOffsetStartInPage(int offsetStartInPage) {
}

public int getPageNumber() {
return tokens.get(0).getPage();
if (CollectionUtils.isNotEmpty(tokens))
return tokens.get(0).getPage();
else {
return -1;
}
}

public String getText() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,6 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
}

String footText = doc.getDocumentPieceText(docPiece);
footText = TextUtilities.dehyphenize(footText);
footText = footText.replace("\n", " ");
//footText = footText.replace(" ", " ").trim();
if (footText.length() < 6)
Expand All @@ -1071,6 +1070,9 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
if (localNotes != null)
notes.addAll(localNotes);
}

notes.stream()
.forEach(n -> n.setText(TextUtilities.dehyphenize(n.getText())));

return notes;
}
Expand All @@ -1089,7 +1091,7 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
String groupStr = ma.group(1);
footText = ma.group(2);
try {
if (groupStr.indexOf(".") != -1)
if (groupStr.contains("."))
sugarText = ".";
String groupStrNormalized = groupStr.replace(".", "");
groupStrNormalized = groupStrNormalized.trim();
Expand All @@ -1109,11 +1111,12 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
} else
break;

if (toConsume.length() == 0)
if (StringUtils.isEmpty(toConsume))
break;
}
if (start != 0)
if (start != 0) {
noteTokens = noteTokens.subList(start, noteTokens.size());
}
}
} catch (NumberFormatException e) {
currentNumber = -1;
Expand All @@ -1130,38 +1133,38 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No

// add possible subsequent notes concatenated in the same note sequence (this is a common error,
// which is addressed here by heuristics, it may not be necessary in the future with a better
// segmentation model using more foot notes training data)
// segmentation model using more footnotes training data)
if (currentNumber != -1) {
String nextLabel = " " + (currentNumber+1);
// suger characters after note number must be consistent with the previous ones to avoid false match
// sugar characters after note number must be consistent with the previous ones to avoid false match
if (sugarText != null)
nextLabel += sugarText;

int ind = footText.indexOf(nextLabel);
if (ind != -1) {
int nextFootnoteLabelIndex = footText.indexOf(nextLabel);
if (nextFootnoteLabelIndex != -1) {
// optionally we could restrict here to superscript numbers
// review local note
localNote.setText(footText.substring(0, ind));
localNote.setText(footText.substring(0, nextFootnoteLabelIndex));
int pos = 0;
List<LayoutToken> previousNoteTokens = new ArrayList<>();
List<LayoutToken> nextNoteTokens = new ArrayList<>();
for(LayoutToken localToken : noteTokens) {
if (localToken.getText() == null || localToken.getText().length() == 0)
if (StringUtils.isEmpty(localToken.getText()))
continue;
pos += localToken.getText().length();
if (pos <= ind+1) {
if (pos <= nextFootnoteLabelIndex+1) {
previousNoteTokens.add(localToken);
} else {
nextNoteTokens.add(localToken);
}
}
localNote.setTokens(previousNoteTokens);
String nextFootText = footText.substring(ind+1, footText.length());
String nextFootText = footText.substring(nextFootnoteLabelIndex+1);

// process the concatenated note
if (nextNoteTokens.size() >0 && nextFootText.length()>0) {
if (CollectionUtils.isNotEmpty(nextNoteTokens) && StringUtils.isNotEmpty(nextFootText)) {
List<Note> nextNotes = makeNotes(nextNoteTokens, nextFootText, noteType, notes.size());
if (nextNotes != null && nextNotes.size()>0)
if (CollectionUtils.isNotEmpty(nextNotes))
notes.addAll(nextNotes);
}
}
Expand Down Expand Up @@ -1460,7 +1463,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
int clusterPage = Iterables.getLast(clusterTokens).getPage();

List<Note> notesSamePage = null;
if (notes != null && notes.size() > 0) {
if (CollectionUtils.isNotEmpty(notes)) {
notesSamePage = notes.stream()
.filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage)
.collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import java.util.List;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;

public class TEIFormatterTest {
Expand All @@ -35,6 +37,30 @@ public void testMakeFootNote() throws Exception {
assertThat(LayoutTokensUtil.toText(footnote.getTokens()), is("This is a footnote"));
assertThat(footnote.getLabel(), is("1"));
}


@Test
public void testMakeNotes() throws Exception {
String text = "198 U.S. Const. art. I, § §9 & 10. \n199 To be sure, there are revisionist arguments that the Ex Post Facto clause itself extends to retroactive civil laws too. See Eastern Enterprises v. Apfel, 524 U.S. 498, 538-39 (1998) (Thomas, J., concurring). And as with bills of attainder, in the wake of the Civil War the Supreme Court held that Ironclad Oath requirements were ex post facto laws as well. Cummings, 71 U.S. at 326-332; Garland, 71 U.S. at 377-368. But as discussed in the text, even these principles do not ensnare Section Three going forward, on a non-ex-post-facto basis \n200 3 U.S. at 378-80 (arguments of counsel). \n201 Id. \n202 Id. at 382. See Baude & Sachs, Eleventh Amendment, supra note 9, at 626-627. Electronic copy available at: https://ssrn.com/abstract=4532751";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
text = text.replace("\n", " ");
tokens.stream().forEach(t-> t.setOffset(t.getOffset() + 403));
List<Note> footnotes = new TEIFormatter(null, null)
.makeNotes(tokens, text, Note.NoteType.FOOT, 37);

assertThat(footnotes, hasSize(5));
assertThat(footnotes.get(0).getLabel(), is("198"));
assertThat(footnotes.get(0).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(1).getLabel(), is("199"));
assertThat(footnotes.get(1).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(2).getLabel(), is("200"));
assertThat(footnotes.get(2).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(3).getLabel(), is("201"));
assertThat(footnotes.get(3).getText(), is("Id. "));
assertThat(footnotes.get(3).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(4).getLabel(), is("202"));
assertThat(footnotes.get(4).getTokens(), hasSize(greaterThan(0)));
}



Expand Down

0 comments on commit cbc77d5

Please sign in to comment.