From 08c0405e4ddc3360a6579dcc43c81d2aa3fe9d1f Mon Sep 17 00:00:00 2001 From: Patrice Lopez Date: Mon, 5 Feb 2024 23:53:32 +0100 Subject: [PATCH] review sequence segmentation following max sequence length --- .../engines/patent/ReferenceExtractor.java | 30 +++++++++++++++++++ .../org/grobid/core/sax/ST36SaxParser.java | 15 +++++----- .../org/grobid/core/sax/TextSaxParser.java | 5 ++-- .../core/utilities/GrobidProperties.java | 2 +- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java index 7de2a02981..7fc51ead3d 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/patent/ReferenceExtractor.java @@ -31,6 +31,7 @@ import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.analyzers.GrobidDefaultAnalyzer; import org.grobid.core.lang.Language; +import org.grobid.core.sax.ST36SaxParser; import org.grobid.core.layout.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -319,6 +320,35 @@ public String extractAllReferencesString(List texts, // list of references by index of tokenized text segments Map> patentsBySegment = new HashMap<>(); Map> articlesBySegment = new HashMap<>(); + + // sub-segment texts if a DL model will be applied. Use the max sequence length for size limit + if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) { + List newTexts = new ArrayList<>(); + int maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation"); + for(String text : texts) { + List tokenizations = GrobidDefaultAnalyzer.getInstance().tokenize(text); + if (tokenizations.size() > maxSequence) { +//System.out.println(maxSequence + " vs " + tokenizations.size()); + String[] subtexts = text.split("\n\n"); + for(int i=0; i subtokenizations = GrobidDefaultAnalyzer.getInstance().tokenize(subtexts[i]); + if (subtokenizations.size() > maxSequence) { + String[] subsubtexts = subtexts[i].split(".\n"); + for(int j=0; j subsubtokenizations = GrobidDefaultAnalyzer.getInstance().tokenize(subsubtexts[j]); + newTexts.add(subsubtexts[j]); + } + } else { + newTexts.add(subtexts[i]); + } + } + } else { + newTexts.add(text); + } + } + texts = newTexts; + } + try { // if parameters are null, these lists will only be valid in the method if (patents == null) { diff --git a/grobid-core/src/main/java/org/grobid/core/sax/ST36SaxParser.java b/grobid-core/src/main/java/org/grobid/core/sax/ST36SaxParser.java index c7dbab525b..381f7df9be 100755 --- a/grobid-core/src/main/java/org/grobid/core/sax/ST36SaxParser.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/ST36SaxParser.java @@ -566,17 +566,17 @@ public void startElement(String namespaceURI, } } - public List> segment(List tokens, int maxSize) { + public static List> segment(List tokens, int maxSize) { List> allTokenizations = new ArrayList<>(); List currentTokenization = new ArrayList<>(); // segment based on double "\n\n" pattern int pos = 0; for(String token : tokens) { - if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) { + /*if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) { pos++; continue; - } + }*/ if (!token.equals("\n")) { currentTokenization.add(token); @@ -612,10 +612,10 @@ public List> segment(List tokens, int maxSize) { currentTokenization = new ArrayList<>(); pos = 0; for(String token : tokenization) { - if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) { + /*if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) { pos++; continue; - } + }*/ if (!token.equals(".")) { currentTokenization.add(token); @@ -656,10 +656,10 @@ public List> segment(List tokens, int maxSize) { currentTokenization = new ArrayList<>(); pos = 0; for(String token : tokenization) { - if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) { + /*if (currentTokenization.size() == 0 && (token.equals("\n") || token.equals(" ") || token.equals("\t"))) { pos++; continue; - } + }*/ if (!token.equals(".")) { currentTokenization.add(token); @@ -689,6 +689,5 @@ public List> segment(List tokens, int maxSize) { } return allTokenizations3; - } } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java b/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java index d79bc437a2..2d8b3d3b3b 100755 --- a/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java +++ b/grobid-core/src/main/java/org/grobid/core/sax/TextSaxParser.java @@ -53,9 +53,10 @@ public void addFilter(String filt) { public String getText() { String text = accumulator.toString().trim(); - text = text.replace("\n", " "); + //text = text.replace("\n", " "); text = text.replace("\t", " "); - text = text.replaceAll("\\p{Space}+", " "); + //text = text.replaceAll("\\p{Space}+", " "); + text = text.replaceAll("( )+", " "); return text; } diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java index 2778839b94..05048224f1 100644 --- a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java @@ -588,7 +588,7 @@ public static File getPdfaltoPath() { return pathToPdfalto; } - private static String getGrobidCRFEngineName(final String modelName) { + public static String getGrobidCRFEngineName(final String modelName) { ModelParameters param = modelMap.get(modelName); if (param == null) { LOGGER.debug("No configuration parameter defined for model " + modelName);