Skip to content

Commit

Permalink
TIKA-4016 -- add length in InputStreamDigester (#1303)
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison authored Aug 24, 2023
1 parent 97a1b19 commit a2f1a19
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 8 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Release 2.9.1 - ??

* The InputStreamDigester now calculates stream length (TIKA-4016).

Release 2.9.0 - 8/23/2023

* With user configuration, the PDFParser can now throw an EncryptedDocumentException
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.StringUtils;

public class InputStreamDigester implements DigestingParser.Digester {

Expand Down Expand Up @@ -72,17 +73,25 @@ public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyN
/**
* Copied from commons-codec
*/
private static MessageDigest updateDigest(MessageDigest digest, InputStream data)
private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata)
        throws IOException {
    // Drain the stream in chunks of up to 1024 bytes, feeding each chunk to the
    // digest and keeping a running count of the bytes consumed.
    final byte[] chunk = new byte[1024];
    long bytesSeen = 0;
    int n;
    while ((n = data.read(chunk, 0, 1024)) > -1) {
        digest.update(chunk, 0, n);
        bytesSeen += n;
    }

    // Record the byte count as the content length; setContentLength leaves an
    // already-populated value untouched.
    // NOTE(review): the count is recorded before the caller inspects whether
    // `data` was a BoundedInputStream -- confirm a length-limited read cannot
    // leave a truncated length in the metadata.
    setContentLength(bytesSeen, metadata);
    return digest;
}

private static void setContentLength(long length, Metadata metadata) {
    // Leave any non-blank, previously recorded length alone.
    final String existing = metadata.get(Metadata.CONTENT_LENGTH);
    if (!StringUtils.isBlank(existing)) {
        return;
    }
    metadata.set(Metadata.CONTENT_LENGTH, String.valueOf(length));
}

private MessageDigest newMessageDigest() {
try {
Provider provider = getProvider();
Expand Down Expand Up @@ -128,7 +137,7 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext)
//and its size is greater than its mark limit,
//just digest the underlying file.
if (sz > markLimit) {
digestFile(tis.getFile(), metadata);
digestFile(tis.getFile(), sz, metadata);
return;
}
}
Expand All @@ -148,12 +157,12 @@ public void digest(InputStream is, Metadata metadata, ParseContext parseContext)
//if the stream wasn't finished -- if the stream was longer than the mark limit --
//spool to File and digest that.
if (tis != null) {
digestFile(tis.getFile(), metadata);
digestFile(tis.getFile(), -1, metadata);
} else {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp, metadata);
digestFile(tmpTikaInputStream.getFile(), metadata);
digestFile(tmpTikaInputStream.getFile(), -1, metadata);
} finally {
try {
tmp.dispose();
Expand All @@ -169,7 +178,14 @@ private String getMetadataKey() {
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
}

private void digestFile(File f, Metadata m) throws IOException {
private void digestFile(File f, long sz, Metadata m) throws IOException {
//only add it if it hasn't been populated already
if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) {
if (sz < 0) {
sz = f.length();
}
setContentLength(sz, m);
}
try (InputStream is = new FileInputStream(f)) {
digestStream(is, m);
}
Expand All @@ -185,7 +201,7 @@ private boolean digestStream(InputStream is, Metadata metadata) throws IOExcepti
byte[] digestBytes;
MessageDigest messageDigest = newMessageDigest();

updateDigest(messageDigest, is);
updateDigest(messageDigest, is, metadata);
digestBytes = messageDigest.digest();

if (is instanceof BoundedInputStream) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,15 @@
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;

import org.apache.commons.codec.digest.DigestUtils;
import org.junit.jupiter.api.Test;

import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
Expand Down Expand Up @@ -115,6 +117,21 @@ public void testCommaSeparated() throws Exception {
assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
}

@Test
public void testLengthsCalculated() throws Exception {
    // Regression check for TIKA-4016: every metadata object produced with the
    // digest configuration must carry a content length.
    // Before TIKA-4016, lengths were missing from 0, 1 and 11
    // (presumably indices into the metadata list).
    final TikaConfig digestConfig;
    try (InputStream configStream = getResourceAsStream("/configs/tika-config-digests.xml")) {
        digestConfig = new TikaConfig(configStream);
    }

    Parser parser = new AutoDetectParser(digestConfig);
    List<Metadata> allMetadata = getRecursiveMetadata("test_recursive_embedded.docx", parser);
    for (Metadata entry : allMetadata) {
        String contentLength = entry.get(Metadata.CONTENT_LENGTH);
        assertNotNull(contentLength);
    }
}

@Test
public void testReset() throws Exception {
String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
Expand Down

0 comments on commit a2f1a19

Please sign in to comment.