Skip to content

Commit

Permalink
review sequence segmentation following max sequence length
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Feb 5, 2024
1 parent 017bc28 commit 08c0405
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.analyzers.GrobidDefaultAnalyzer;
import org.grobid.core.lang.Language;
import org.grobid.core.sax.ST36SaxParser;
import org.grobid.core.layout.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -319,6 +320,35 @@ public String extractAllReferencesString(List<String> texts,
// list of references by index of tokenized text segments
Map<Integer, List<PatentItem>> patentsBySegment = new HashMap<>();
Map<Integer, List<BibDataSet>> articlesBySegment = new HashMap<>();

// Sub-segment the input texts when the "patent-citation" model is configured to run with the
// "delft" engine. Any text whose token count exceeds the configured max sequence length is
// cut into smaller pieces: first on blank lines ("\n\n"), then, for pieces that are still
// too long, on a full stop followed by a line break. The delimiters themselves are not kept
// in the resulting pieces. Pieces that remain longer than the limit after the second cut are
// passed on unchanged.
// Constant-first comparison avoids a NullPointerException should no engine name be returned.
if ("delft".equals(GrobidProperties.getGrobidCRFEngineName("patent-citation"))) {
    List<String> newTexts = new ArrayList<>();
    int maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation");
    GrobidDefaultAnalyzer analyzer = GrobidDefaultAnalyzer.getInstance();
    for (String text : texts) {
        if (analyzer.tokenize(text).size() <= maxSequence) {
            // short enough: keep the text as a single segment
            newTexts.add(text);
            continue;
        }
        // first level: cut on blank lines
        for (String subtext : text.split("\n\n")) {
            if (analyzer.tokenize(subtext).size() <= maxSequence) {
                newTexts.add(subtext);
                continue;
            }
            // second level: cut on ".\n". String.split takes a regular expression, so the
            // dot must be escaped; an unescaped "." matches ANY character and would silently
            // delete the last character of every line that does not end with a full stop.
            for (String subsubtext : subtext.split("\\.\n")) {
                newTexts.add(subsubtext);
            }
        }
    }
    texts = newTexts;
}

try {
// if parameters are null, these lists will only be valid in the method
if (patents == null) {
Expand Down
15 changes: 7 additions & 8 deletions grobid-core/src/main/java/org/grobid/core/sax/ST36SaxParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -566,17 +566,17 @@ public void startElement(String namespaceURI,
}
}

public List<List<String>> segment(List<String> tokens, int maxSize) {
public static List<List<String>> segment(List<String> tokens, int maxSize) {
List<List<String>> allTokenizations = new ArrayList<>();
List<String> currentTokenization = new ArrayList<>();

// segment based on double "\n\n" pattern
int pos = 0;
for(String token : tokens) {
if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) {
/*if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) {
pos++;
continue;
}
}*/

if (!token.equals("\n")) {
currentTokenization.add(token);
Expand Down Expand Up @@ -612,10 +612,10 @@ public List<List<String>> segment(List<String> tokens, int maxSize) {
currentTokenization = new ArrayList<>();
pos = 0;
for(String token : tokenization) {
if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) {
/*if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) {
pos++;
continue;
}
}*/

if (!token.equals(".")) {
currentTokenization.add(token);
Expand Down Expand Up @@ -656,10 +656,10 @@ public List<List<String>> segment(List<String> tokens, int maxSize) {
currentTokenization = new ArrayList<>();
pos = 0;
for(String token : tokenization) {
if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) {
/*if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) {
pos++;
continue;
}
}*/

if (!token.equals(".")) {
currentTokenization.add(token);
Expand Down Expand Up @@ -689,6 +689,5 @@ public List<List<String>> segment(List<String> tokens, int maxSize) {
}

return allTokenizations3;

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ public void addFilter(String filt) {

/**
 * Returns the string form of {@code accumulator}, trimmed and with its whitespace normalised.
 *
 * NOTE(review): this listing comes from a rendered diff whose +/- markers were stripped.
 * The active {@code replace("\n", " ")} and {@code replaceAll("\\p{Space}+", " ")} statements
 * each sit directly above an identical commented-out copy, so they are presumably the
 * removed lines of the change - confirm against the real file before relying on whether
 * line breaks are preserved or flattened here.
 *
 * @return the cleaned-up text
 */
public String getText() {
String text = accumulator.toString().trim();
// NOTE(review): likely the pre-change line (see method comment); as written it turns every
// line break into a space.
text = text.replace("\n", " ");
//text = text.replace("\n", " ");
// tabs become plain spaces
text = text.replace("\t", " ");
// NOTE(review): likely the pre-change line; as written it collapses any run of whitespace
// characters into one space.
text = text.replaceAll("\\p{Space}+", " ");
//text = text.replaceAll("\\p{Space}+", " ");
// collapse runs of the space character only into a single space
text = text.replaceAll("( )+", " ");
return text;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ public static File getPdfaltoPath() {
return pathToPdfalto;
}

private static String getGrobidCRFEngineName(final String modelName) {
public static String getGrobidCRFEngineName(final String modelName) {
ModelParameters param = modelMap.get(modelName);
if (param == null) {
LOGGER.debug("No configuration parameter defined for model " + modelName);
Expand Down

0 comments on commit 08c0405

Please sign in to comment.