Skip to content

Commit

Permalink
TIKA-4354 -- make incremental update metadata and parsing default in tika-cli (#2059)
Browse files Browse the repository at this point in the history

* TIKA-4354 -- make incremental update metadata and parsing default in tika-cli
  • Loading branch information
tballison authored Nov 19, 2024
1 parent 347d58c commit ff9d722
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 7 deletions.
12 changes: 6 additions & 6 deletions tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@ public void process(InputStream stream, OutputStream output, Metadata metadata)
*/
private String password = System.getenv("TIKA_PASSWORD");
private DigestingParser.Digester digester = null;
private boolean asyncMode = false;
private boolean pipeMode = true;
private boolean fork = false;
private boolean prettyPrint;
Expand Down Expand Up @@ -340,9 +339,12 @@ private void configurePDFExtractSettings() {
if (configFilePath == null && context.get(PDFParserConfig.class) == null) {
PDFParserConfig pdfParserConfig = new PDFParserConfig();
pdfParserConfig.setExtractInlineImages(true);
pdfParserConfig.setExtractIncrementalUpdateInfo(true);
pdfParserConfig.setParseIncrementalUpdates(true);
String warn = "As a convenience, TikaCLI has turned on extraction of\n" + "inline images and incremental updates for the PDFParser (TIKA-2374 and " + "TIKA-4017).\n" +
"Aside from the -z option, this is not the default behavior\n" + "in Tika generally or in tika-server.";
String warn = "As a convenience, TikaCLI has turned on extraction of\n" +
"inline images and incremental updates for the PDFParser (TIKA-2374, " +
"TIKA-4017 and TIKA-4354).\n" +
"This is not the default behavior in Tika generally or in tika-server.";
LOG.info(warn);
context.set(PDFParserConfig.class, pdfParserConfig);
}
Expand Down Expand Up @@ -401,8 +403,6 @@ public void process(String arg) throws Exception {
// ignore, as container-aware detectors are now always used
} else if (arg.equals("-f") || arg.equals("--fork")) {
fork = true;
} else if (arg.equals("-a") || arg.equals("--async")) {
asyncMode = true;
} else if (arg.startsWith("--config=")) {
configFilePath = arg.substring("--config=".length());
} else if (arg.startsWith("--digest=")) {
Expand Down Expand Up @@ -446,7 +446,6 @@ public void process(String arg) throws Exception {
}
extractDir = new File(dirPath);
} else if (arg.equals("-z") || arg.equals("--extract")) {
configurePDFExtractSettings();
type = NO_OUTPUT;
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
} else if (arg.equals("-r") || arg.equals("--pretty-print")) {
Expand Down Expand Up @@ -475,6 +474,7 @@ public void process(String arg) throws Exception {
} else {
url = new URL(arg);
}
configurePDFExtractSettings();
if (recursiveJSON) {
handleRecursiveJson(url, System.out);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public void testAsync() throws Exception {
json++;
}
}
assertEquals(17, json);
assertEquals(18, json);
}

private void checkForPrettyPrint(File f) throws IOException {
Expand Down
8 changes: 8 additions & 0 deletions tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,14 @@ public void testJsonMetadataPrettyPrintOutput() throws Exception {
assertTrue(fb > -1 && title > -1 && fb > title);
}

/**
 * Passes the incremental-updates PDF fixture to {@code getParamOutContent}
 * with only the {@code -J} flag (no PDF-specific options) and checks that the
 * returned text reports an incremental update count of 2 and contains an
 * embedded resource typed {@code VERSION}.
 *
 * @throws Exception if producing the output fails
 */
@Test
public void testDefaultPDFIncrementalUpdateSettings() throws Exception {
    final String pdfFixture = resourcePrefix + "testPDF_incrementalUpdates.pdf";
    final String output = getParamOutContent("-J", pdfFixture);

    // Fragments are matched without their leading quote, exactly as serialized.
    final String updateCountFragment = "pdf:incrementalUpdateCount\":\"2\"";
    assertTrue(output.contains(updateCountFragment));

    final String versionResourceFragment = "embeddedResourceType\":\"VERSION\"";
    assertTrue(output.contains(versionResourceFragment));
}

/**
* Tests -l option of the cli
*
Expand Down
Binary file not shown.

0 comments on commit ff9d722

Please sign in to comment.