Skip to content

Commit

Permalink
move dephypenisation at the end of the note process
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jan 11, 2024
1 parent 6a3a0d5 commit 30780ef
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,6 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
}

String footText = doc.getDocumentPieceText(docPiece);
footText = TextUtilities.dehyphenize(footText);
footText = footText.replace("\n", " ");
//footText = footText.replace(" ", " ").trim();
if (footText.length() < 6)
Expand All @@ -1071,6 +1070,9 @@ protected List<Note> getTeiNotes(Document doc, SortedSet<DocumentPiece> document
if (localNotes != null)
notes.addAll(localNotes);
}

notes.stream()
.forEach(n -> n.setText(TextUtilities.dehyphenize(n.getText())));

return notes;
}
Expand All @@ -1089,7 +1091,7 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
String groupStr = ma.group(1);
footText = ma.group(2);
try {
if (groupStr.indexOf(".") != -1)
if (groupStr.contains("."))
sugarText = ".";
String groupStrNormalized = groupStr.replace(".", "");
groupStrNormalized = groupStrNormalized.trim();
Expand All @@ -1109,11 +1111,12 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No
} else
break;

if (toConsume.length() == 0)
if (StringUtils.isEmpty(toConsume))
break;
}
if (start != 0)
if (start != 0) {
noteTokens = noteTokens.subList(start, noteTokens.size());
}
}
} catch (NumberFormatException e) {
currentNumber = -1;
Expand All @@ -1130,38 +1133,38 @@ protected List<Note> makeNotes(List<LayoutToken> noteTokens, String footText, No

// add possible subsequent notes concatenated in the same note sequence (this is a common error,
// which is addressed here by heuristics, it may not be necessary in the future with a better
// segmentation model using more foot notes training data)
// segmentation model using more footnotes training data)
if (currentNumber != -1) {
String nextLabel = " " + (currentNumber+1);
// suger characters after note number must be consistent with the previous ones to avoid false match
// sugar characters after note number must be consistent with the previous ones to avoid false match
if (sugarText != null)
nextLabel += sugarText;

int ind = footText.indexOf(nextLabel);
if (ind != -1) {
int nextFootnoteLabelIndex = footText.indexOf(nextLabel);
if (nextFootnoteLabelIndex != -1) {
// optionally we could restrict here to superscript numbers
// review local note
localNote.setText(footText.substring(0, ind));
localNote.setText(footText.substring(0, nextFootnoteLabelIndex));
int pos = 0;
List<LayoutToken> previousNoteTokens = new ArrayList<>();
List<LayoutToken> nextNoteTokens = new ArrayList<>();
for(LayoutToken localToken : noteTokens) {
if (localToken.getText() == null || localToken.getText().length() == 0)
if (StringUtils.isEmpty(localToken.getText()))
continue;
pos += localToken.getText().length();
if (pos <= ind+1) {
if (pos <= nextFootnoteLabelIndex+1) {
previousNoteTokens.add(localToken);
} else {
nextNoteTokens.add(localToken);
}
}
localNote.setTokens(previousNoteTokens);
String nextFootText = footText.substring(ind+1, footText.length());
String nextFootText = footText.substring(nextFootnoteLabelIndex+1);

// process the concatenated note
if (nextNoteTokens.size() >0 && nextFootText.length()>0) {
if (CollectionUtils.isNotEmpty(nextNoteTokens) && StringUtils.isNotEmpty(nextFootText)) {
List<Note> nextNotes = makeNotes(nextNoteTokens, nextFootText, noteType, notes.size());
if (nextNotes != null && nextNotes.size()>0)
if (CollectionUtils.isNotEmpty(nextNotes))
notes.addAll(nextNotes);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import java.util.List;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;

public class TEIFormatterTest {
Expand All @@ -35,6 +37,30 @@ public void testMakeFootNote() throws Exception {
assertThat(LayoutTokensUtil.toText(footnote.getTokens()), is("This is a footnote"));
assertThat(footnote.getLabel(), is("1"));
}


@Test
public void testMakeNotes() throws Exception {
String text = "198 U.S. Const. art. I, § §9 & 10. \n199 To be sure, there are revisionist arguments that the Ex Post Facto clause itself extends to retroactive civil laws too. See Eastern Enterprises v. Apfel, 524 U.S. 498, 538-39 (1998) (Thomas, J., concurring). And as with bills of attainder, in the wake of the Civil War the Supreme Court held that Ironclad Oath requirements were ex post facto laws as well. Cummings, 71 U.S. at 326-332; Garland, 71 U.S. at 377-368. But as discussed in the text, even these principles do not ensnare Section Three going forward, on a non-ex-post-facto basis \n200 3 U.S. at 378-80 (arguments of counsel). \n201 Id. \n202 Id. at 382. See Baude & Sachs, Eleventh Amendment, supra note 9, at 626-627. Electronic copy available at: https://ssrn.com/abstract=4532751";
List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);
text = text.replace("\n", " ");
tokens.stream().forEach(t-> t.setOffset(t.getOffset() + 403));
List<Note> footnotes = new TEIFormatter(null, null)
.makeNotes(tokens, text, Note.NoteType.FOOT, 37);

assertThat(footnotes, hasSize(5));
assertThat(footnotes.get(0).getLabel(), is("198"));
assertThat(footnotes.get(0).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(1).getLabel(), is("199"));
assertThat(footnotes.get(1).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(2).getLabel(), is("200"));
assertThat(footnotes.get(2).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(3).getLabel(), is("201"));
assertThat(footnotes.get(3).getText(), is("Id. "));
assertThat(footnotes.get(3).getTokens(), hasSize(greaterThan(0)));
assertThat(footnotes.get(4).getLabel(), is("202"));
assertThat(footnotes.get(4).getTokens(), hasSize(greaterThan(0)));
}



Expand Down

0 comments on commit 30780ef

Please sign in to comment.