diff --git a/.github/workflows/ci-build-unstable.yml b/.github/workflows/ci-build-unstable.yml index aa3adcc561..cdfa981049 100644 --- a/.github/workflows/ci-build-unstable.yml +++ b/.github/workflows/ci-build-unstable.yml @@ -30,3 +30,25 @@ jobs: format: jacoco file: build/reports/jacoco/codeCoverageReport/codeCoverageReport.xml debug: true + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - uses: actions/checkout@v2 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v5 + with: + username: ${{ secrets.DOCKERHUB_USERNAME_LFOPPIANO }} + password: ${{ secrets.DOCKERHUB_TOKEN_LFOPPIANO }} + image: lfoppiano/grobid + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: latest-develop + dockerfile: Dockerfile.crf + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/data/Note.java b/grobid-core/src/main/java/org/grobid/core/data/Note.java index caf9acf52c..d11beafc00 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Note.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Note.java @@ -1,5 +1,6 @@ package org.grobid.core.data; +import org.apache.commons.collections4.CollectionUtils; import org.grobid.core.layout.LayoutToken; import org.grobid.core.layout.Page; import org.grobid.core.utilities.*; @@ -74,7 +75,11 @@ public void setOffsetStartInPage(int offsetStartInPage) { } public int getPageNumber() { - return tokens.get(0).getPage(); + if (CollectionUtils.isNotEmpty(tokens)) + return tokens.get(0).getPage(); + else { + return -1; + } } public String getText() { diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 79025c6d6e..26148cf132 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1053,7 +1053,6 @@ protected List getTeiNotes(Document doc, SortedSet document } String footText = doc.getDocumentPieceText(docPiece); - footText = TextUtilities.dehyphenize(footText); footText = footText.replace("\n", " "); //footText = footText.replace(" ", " ").trim(); if (footText.length() < 6) @@ -1071,6 +1070,9 @@ protected List getTeiNotes(Document doc, SortedSet document if (localNotes != null) notes.addAll(localNotes); } + + notes.stream() + .forEach(n -> n.setText(TextUtilities.dehyphenize(n.getText()))); return notes; } @@ -1089,7 +1091,7 @@ protected List makeNotes(List noteTokens, String footText, No String groupStr = ma.group(1); footText = ma.group(2); try { - if (groupStr.indexOf(".") != -1) + if (groupStr.contains(".")) sugarText = "."; String groupStrNormalized = groupStr.replace(".", ""); groupStrNormalized = groupStrNormalized.trim(); @@ -1109,11 +1111,12 @@ protected List makeNotes(List noteTokens, String footText, No } else break; - if (toConsume.length() == 0) + if (StringUtils.isEmpty(toConsume)) break; } - if (start != 0) + if (start != 0) { noteTokens = noteTokens.subList(start, noteTokens.size()); + } } } catch (NumberFormatException e) { currentNumber = -1; @@ -1130,38 +1133,38 @@ protected List makeNotes(List noteTokens, String footText, No // add possible subsequent notes concatenated in the same note sequence (this is a common error, // which is addressed here by heuristics, it may not be necessary in the future with a better - // segmentation model using more foot notes training data) + // segmentation model using more footnotes training data) if (currentNumber != -1) { String nextLabel = " " + (currentNumber+1); - // suger characters after note number must be consistent with the previous ones to avoid false match + // sugar characters after note number must be consistent with the previous ones to avoid false match if (sugarText != null) nextLabel += sugarText; - int ind = footText.indexOf(nextLabel); - if (ind != -1) { + int nextFootnoteLabelIndex = footText.indexOf(nextLabel); + if (nextFootnoteLabelIndex != -1) { // optionally we could restrict here to superscript numbers // review local note - localNote.setText(footText.substring(0, ind)); + localNote.setText(footText.substring(0, nextFootnoteLabelIndex)); int pos = 0; List previousNoteTokens = new ArrayList<>(); List nextNoteTokens = new ArrayList<>(); for(LayoutToken localToken : noteTokens) { - if (localToken.getText() == null || localToken.getText().length() == 0) + if (StringUtils.isEmpty(localToken.getText())) continue; pos += localToken.getText().length(); - if (pos <= ind+1) { + if (pos <= nextFootnoteLabelIndex+1) { previousNoteTokens.add(localToken); } else { nextNoteTokens.add(localToken); } } localNote.setTokens(previousNoteTokens); - String nextFootText = footText.substring(ind+1, footText.length()); + String nextFootText = footText.substring(nextFootnoteLabelIndex+1); // process the concatenated note - if (nextNoteTokens.size() >0 && nextFootText.length()>0) { + if (CollectionUtils.isNotEmpty(nextNoteTokens) && StringUtils.isNotEmpty(nextFootText)) { List nextNotes = makeNotes(nextNoteTokens, nextFootText, noteType, notes.size()); - if (nextNotes != null && nextNotes.size()>0) + if (CollectionUtils.isNotEmpty(nextNotes)) notes.addAll(nextNotes); } } @@ -1460,7 +1463,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, int clusterPage = Iterables.getLast(clusterTokens).getPage(); List notesSamePage = null; - if (notes != null && notes.size() > 0) { + if (CollectionUtils.isNotEmpty(notes)) { notesSamePage = notes.stream() .filter(f -> !f.isIgnored() && f.getPageNumber() == clusterPage) .collect(Collectors.toList()); diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index b9e32083d4..1e794f8765 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -11,6 +11,8 @@ import java.util.List; import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; public class TEIFormatterTest { @@ -35,6 +37,30 @@ public void testMakeFootNote() throws Exception { assertThat(LayoutTokensUtil.toText(footnote.getTokens()), is("This is a footnote")); assertThat(footnote.getLabel(), is("1")); } + + + @Test + public void testMakeNotes() throws Exception { + String text = "198 U.S. Const. art. I, § §9 & 10. \n199 To be sure, there are revisionist arguments that the Ex Post Facto clause itself extends to retroactive civil laws too. See Eastern Enterprises v. Apfel, 524 U.S. 498, 538-39 (1998) (Thomas, J., concurring). And as with bills of attainder, in the wake of the Civil War the Supreme Court held that Ironclad Oath requirements were ex post facto laws as well. Cummings, 71 U.S. at 326-332; Garland, 71 U.S. at 377-368. But as discussed in the text, even these principles do not ensnare Section Three going forward, on a non-ex-post-facto basis \n200 3 U.S. at 378-80 (arguments of counsel). \n201 Id. \n202 Id. at 382. See Baude & Sachs, Eleventh Amendment, supra note 9, at 626-627. Electronic copy available at: https://ssrn.com/abstract=4532751"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text); + text = text.replace("\n", " "); + tokens.stream().forEach(t-> t.setOffset(t.getOffset() + 403)); + List footnotes = new TEIFormatter(null, null) + .makeNotes(tokens, text, Note.NoteType.FOOT, 37); + + assertThat(footnotes, hasSize(5)); + assertThat(footnotes.get(0).getLabel(), is("198")); + assertThat(footnotes.get(0).getTokens(), hasSize(greaterThan(0))); + assertThat(footnotes.get(1).getLabel(), is("199")); + assertThat(footnotes.get(1).getTokens(), hasSize(greaterThan(0))); + assertThat(footnotes.get(2).getLabel(), is("200")); + assertThat(footnotes.get(2).getTokens(), hasSize(greaterThan(0))); + assertThat(footnotes.get(3).getLabel(), is("201")); + assertThat(footnotes.get(3).getText(), is("Id. ")); + assertThat(footnotes.get(3).getTokens(), hasSize(greaterThan(0))); + assertThat(footnotes.get(4).getLabel(), is("202")); + assertThat(footnotes.get(4).getTokens(), hasSize(greaterThan(0))); + }