From 23d4f25438455300062006d56c276f53eeb4c6be Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 28 Nov 2024 17:22:32 +0000
Subject: [PATCH] correctly rename the training data when the original PDF filename contains the word `.pdf` #776

---
 .../grobid/core/engines/FullTextParser.java   | 41 +++++++++----------
 .../org/grobid/core/engines/Segmentation.java | 12 +++---
 2 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
index dfc623a7c2..b102b20795 100755
--- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -1112,8 +1112,7 @@ public Document createTraining(File inputFile,
         List tokenizations = doc.getTokenizations();

         // we write first the full text untagged (but featurized with segmentation features)
-        String outPathFulltext = pathFullText + File.separator +
-            pdfFileName.replace(".pdf", ".training.segmentation");
+        String outPathFulltext = pathFullText + File.separator + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation");
         Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), StandardCharsets.UTF_8);
         writer.write(fulltext + "\n");
         writer.close();
@@ -1124,7 +1123,7 @@ public Document createTraining(File inputFile,
             rawtxt.append(txtline.getText());
         }
         String outPathRawtext = pathFullText + File.separator +
-            pdfFileName.replace(".pdf", ".training.segmentation.rawtxt");
+            pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation.rawtxt");
         FileUtils.writeStringToFile(new File(outPathRawtext), rawtxt.toString(), StandardCharsets.UTF_8);

         if (isNotBlank(fulltext)) {
@@ -1134,7 +1133,7 @@ public Document createTraining(File inputFile,
            // write the TEI file to reflect the extact layout of the text as extracted from the pdf
            writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator +
-               pdfFileName.replace(".pdf", ".training.segmentation.tei.xml")), false), StandardCharsets.UTF_8);
+               pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation.tei.xml")), false), StandardCharsets.UTF_8);
            writer.write("\n\n\t\n\t\t\n\t\n\t\n");
@@ -1156,13 +1155,13 @@ public Document createTraining(File inputFile,
            String raw = result.getRight();
            if (tei != null) {
                String outPath = pathTEI + "/" +
-                   pdfFileName.replace(".pdf", ".training.references.referenceSegmenter.tei.xml");
+                   pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.referenceSegmenter.tei.xml");
                writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), StandardCharsets.UTF_8);
                writer.write(tei + "\n");
                writer.close();

                // generate also the raw vector file with the features
-               outPath = pathTEI + "/" + pdfFileName.replace(".pdf", ".training.references.referenceSegmenter");
+               outPath = pathTEI + "/" + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.referenceSegmenter");
                writer = new OutputStreamWriter(new FileOutputStream(new File(outPath), false), StandardCharsets.UTF_8);
                writer.write(raw + "\n");
                writer.close();
@@ -1203,7 +1202,7 @@ public Document createTraining(File inputFile,
                Writer writerReference = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator +
-                   pdfFileName.replace(".pdf", ".training.references.tei.xml")), false), StandardCharsets.UTF_8);
+                   pdfFileName.replaceAll("(?i)\\.pdf$", ".training.references.tei.xml")), false), StandardCharsets.UTF_8);
                writerReference.write("\n\n\n\n\t\n\t\n");
            }
@@ -1301,13 +1300,13 @@ public Document createTraining(File inputFile,
            Pair trainingFigure = processTrainingDataFigures(rese, tokenizationsBody, inputFile.getName());
            if (trainingFigure.getLeft().trim().length() > 0) {
                String outPathFigures = pathFullText + File.separator
-                   + pdfFileName.replace(".pdf", ".training.figure");
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.figure");
                writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFigures), false), StandardCharsets.UTF_8);
                writer.write(trainingFigure.getRight() + "\n\n");
                writer.close();

                String outPathFiguresTEI = pathTEI + File.separator
-                   + pdfFileName.replace(".pdf", ".training.figure.tei.xml");
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.figure.tei.xml");
                writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFiguresTEI), false), StandardCharsets.UTF_8);
                writer.write(trainingFigure.getLeft() + "\n");
                writer.close();
@@ -1317,13 +1316,13 @@ public Document createTraining(File inputFile,
            Pair trainingTable = processTrainingDataTables(rese, tokenizationsBody, inputFile.getName());
            if (trainingTable.getLeft().trim().length() > 0) {
                String outPathTables = pathFullText + File.separator
-                   + pdfFileName.replace(".pdf", ".training.table");
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.table");
                writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTables), false), StandardCharsets.UTF_8);
                writer.write(trainingTable.getRight() + "\n\n");
                writer.close();

                String outPathTablesTEI = pathTEI + File.separator
-                   + pdfFileName.replace(".pdf", ".training.table.tei.xml");
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.table.tei.xml");
                writer = new OutputStreamWriter(new FileOutputStream(new File(outPathTablesTEI), false), StandardCharsets.UTF_8);
                writer.write(trainingTable.getLeft() + "\n");
                writer.close();
@@ -1352,7 +1351,7 @@ public Document createTraining(File inputFile,

        if ((header != null) && (header.trim().length() > 0)) {
            // we write the header untagged
-           String outPathHeader = pathTEI + File.separator + pdfFileName.replace(".pdf", ".training.header");
+           String outPathHeader = pathTEI + File.separator + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header");
            writer = new OutputStreamWriter(new FileOutputStream(new File(outPathHeader), false), StandardCharsets.UTF_8);
            writer.write(header + "\n");
            writer.close();
@@ -1467,9 +1466,9 @@ public Document createTraining(File inputFile,
            // write the training TEI file for header which reflects the extract layout of the text as
            // extracted from the pdf
            writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator
-               + pdfFileName.replace(".pdf", ".training.header.tei.xml")), false), StandardCharsets.UTF_8);
+               + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.tei.xml")), false), StandardCharsets.UTF_8);
            writer.write("\n\n\t\n\t\t\n\t\n\t 0) {
                Writer writerAffiliation = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator
-                   + pdfFileName.replace(".pdf", ".training.header.affiliation.tei.xml")), false), StandardCharsets.UTF_8);
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.affiliation.tei.xml")), false), StandardCharsets.UTF_8);
                writerAffiliation.write("");
                writerAffiliation.write("\n");
@@ -1507,7 +1506,7 @@ public Document createTraining(File inputFile,
            if (bufferDate.length() > 0) {
                Writer writerDate = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator
-                   + pdfFileName.replace(".pdf", ".training.header.date.xml")), false), StandardCharsets.UTF_8);
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.date.xml")), false), StandardCharsets.UTF_8);
                writerDate.write("\n");
                writerDate.write("\n");
@@ -1523,7 +1522,7 @@ public Document createTraining(File inputFile,
            if (bufferName.length() > 0) {
                Writer writerName = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator
-                   + pdfFileName.replace(".pdf", ".training.header.authors.tei.xml")), false), StandardCharsets.UTF_8);
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.authors.tei.xml")), false), StandardCharsets.UTF_8);
                writerName.write("");
                writerName.write("\n");
@@ -1546,7 +1545,7 @@ public Document createTraining(File inputFile,
            if (bufferReference.length() > 0) {
                Writer writerReference = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator
-                   + pdfFileName.replace(".pdf", ".training.header.reference.xml")), false), StandardCharsets.UTF_8);
+                   + pdfFileName.replaceAll("(?i)\\.pdf$", ".training.header.reference.xml")), false), StandardCharsets.UTF_8);
                writerReference.write("\n");
                writerReference.write("\n");
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/Segmentation.java b/grobid-core/src/main/java/org/grobid/core/engines/Segmentation.java
index 9f66865576..d1323e215f 100644
--- a/grobid-core/src/main/java/org/grobid/core/engines/Segmentation.java
+++ b/grobid-core/src/main/java/org/grobid/core/engines/Segmentation.java
@@ -1,6 +1,7 @@
 package org.grobid.core.engines;

 import eugfc.imageio.plugins.PNMRegistry;
+import org.apache.commons.collections4.CollectionUtils;
 import org.apache.commons.io.FileUtils;
 import org.grobid.core.GrobidModels;
 import org.grobid.core.document.BasicStructureBuilder;
@@ -230,7 +231,7 @@ private void dealWithImages(DocumentSource documentSource, Document doc, File as
      * Addition of the features at line level for the complete document.
      *
      * This is an alternative to the token level, where the unit for labeling is the line - so allowing faster
-     * processing and involving less features.
+     * processing and involving fewer features.
      * Lexical features becomes line prefix and suffix, the feature text unit is the first 10 characters of the
      * line without space.
      * The dictionary flags are at line level (i.e. the line contains a name mention, a place mention, a year, etc.)
@@ -319,8 +320,9 @@ private String getFeatureVectorsAsString(Document doc, Map patt
            mm = 0;
            //endPage = true;

-           if ((page.getBlocks() == null) || (page.getBlocks().size() == 0))
+           if (CollectionUtils.isEmpty(page.getBlocks())) {
                continue;
+           }

            for(int blockIndex=0; blockIndex < page.getBlocks().size(); blockIndex++) {
                Block block = page.getBlocks().get(blockIndex);
@@ -444,7 +446,7 @@ private String getFeatureVectorsAsString(Document doc, Map patt
                if (text == null)
                    continue;

-               // final sanitisation and filtering
+               // final sanitization and filtering
                text = text.replaceAll("[ \n\r]", "");
                text = text.trim();
@@ -754,7 +756,7 @@ public void createBlankTrainingData(File file,

        // we write the full text untagged (but featurized)
        String outPathFulltext = pathFullText + File.separator +
-           PDFFileName.replace(".pdf", ".training.blank");
+           PDFFileName.replaceAll("(?i)\\.pdf$", ".training.blank");
        Writer writer = new OutputStreamWriter(new FileOutputStream(new File(outPathFulltext), false), "UTF-8");
        writer.write(fulltext + "\n");
        writer.close();
@@ -770,7 +772,7 @@ public void createBlankTrainingData(File file,
        // write the TEI file to reflect the extact layout of the text as extracted from the pdf
        writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI + File.separator +
-           PDFFileName.replace(".pdf", ".training.blank.tei.xml")), false), "UTF-8");
+           PDFFileName.replaceAll("(?i)\\.pdf$", ".training.blank.tei.xml")), false), "UTF-8");
        writer.write("\n\n\t\n\t\t\n\t\n\t\n");
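
A minimal sketch of the behaviour difference the patch addresses; the file name and class name below are hypothetical examples, not taken from the GROBID sources. String.replace substitutes every literal ".pdf" occurrence anywhere in the name, while the anchored, case-insensitive replaceAll("(?i)\\.pdf$", ...) only rewrites the trailing extension:

    public class RenameSketch {
        public static void main(String[] args) {
            // hypothetical PDF whose base name itself contains ".pdf" (issue #776)
            String pdfFileName = "study.pdf.v2.PDF";

            // previous behaviour: every literal ".pdf" is replaced, so the base name
            // is mangled and the upper-case ".PDF" extension is not touched at all
            String oldName = pdfFileName.replace(".pdf", ".training.segmentation");
            System.out.println(oldName);  // study.training.segmentation.v2.PDF

            // patched behaviour: only the trailing extension is rewritten, case-insensitively
            String newName = pdfFileName.replaceAll("(?i)\\.pdf$", ".training.segmentation");
            System.out.println(newName);  // study.pdf.v2.training.segmentation
        }
    }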