diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index 67d55330fd..8ae002fb15 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -566,7 +566,7 @@ else if (code.equals("it")) return "Italian"; else if (code.equals("jp")) return "Japanese"; - else if (code.equals("kr")) + else if (code.equals("kr") || code.equals("ko")) return "Korean"; else if (code.equals("nl")) return "Deutch"; diff --git a/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java b/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java index 76ca5cbf7f..b71569dc52 100755 --- a/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java +++ b/grobid-trainer/src/main/java/org/grobid/trainer/PatentParserTrainer.java @@ -182,7 +182,13 @@ public boolean accept(File dir, String name) { List> segmentedAccumulatedTokens = new ArrayList<>(); List> segmentedAccumulatedLabels = new ArrayList<>(); - if (accumulatedTokens.size() > 1000) { + int maxSequence = 1000; + if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) { + List newTexts = new ArrayList<>(); + maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation"); + } + + if (accumulatedTokens.size() > maxSequence) { // we have a problem of sequence length for Deep Learning algorithms // we need to segment further. We ensure here that we don't segment // near or inside patent or NPL references @@ -190,8 +196,8 @@ public boolean accept(File dir, String name) { while(k") || accumulatedLabels.get(k-1).endsWith("refPatent>")) { k--; @@ -355,7 +361,13 @@ public void createDataSet(String setName, String corpusPath, String outputPath, List> segmentedAccumulatedTokens = new ArrayList<>(); List> segmentedAccumulatedLabels = new ArrayList<>(); - if (accumulatedTokens.size() > 1000) { + int maxSequence = 1000; + if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) { + List newTexts = new ArrayList<>(); + maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation"); + } + + if (accumulatedTokens.size() > maxSequence) { // we have a problem of sequence length for Deep Learning algorithms // we need to segment further. We ensure here that we don't segment // near or inside patent or NPL references @@ -363,8 +375,8 @@ public void createDataSet(String setName, String corpusPath, String outputPath, while(k") || accumulatedLabels.get(k-1).endsWith("refPatent>")) { k--;