Skip to content

Commit

Permalink
TIKA-4220 -- temporary workaround for tar detection regression (#1687)…
Browse files Browse the repository at this point in the history
… -- main branch
  • Loading branch information
tballison committed May 7, 2024
1 parent 48c934f commit ebdb6fc
Show file tree
Hide file tree
Showing 3 changed files with 673 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.zip.TikaArchiveStreamFactory;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
Expand Down Expand Up @@ -253,6 +254,16 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
}

TemporaryResources tmp = new TemporaryResources();
try {
_parse(stream, handler, metadata, context, tmp);
} finally {
tmp.close();
}
}

private void _parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context, TemporaryResources tmp)
throws TikaException, IOException, SAXException {
ArchiveInputStream ais = null;
String encoding = null;
try {
Expand All @@ -262,9 +273,23 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
// At the end we want to close the archive stream to release
// any associated resources, but the underlying document stream
// should not be closed

ais = factory.createArchiveInputStream(CloseShieldInputStream.wrap(stream));

//TODO -- fix this when we next upgrade commons-compress
//TODO -- we've probably already detected the stream by here. We should
//rely on that detection and not re-detect.
if (factory != null) {
encoding = factory.getEntryEncoding();
// At the end we want to close the archive stream to release
// any associated resources, but the underlying document stream
// should not be closed
ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
} else {
TikaArchiveStreamFactory tikaFactory = new TikaArchiveStreamFactory();
encoding = tikaFactory.getEntryEncoding();
// At the end we want to close the archive stream to release
// any associated resources, but the underlying document stream
// should not be closed
ais = tikaFactory.createArchiveInputStream(new CloseShieldInputStream(stream));
}
} catch (StreamingNotSupportedException sne) {
// Most archive formats work on streams, but a few need files
if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
Expand Down Expand Up @@ -294,11 +319,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
// Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
ais = new SevenZWrapper(sevenz);
} else {
tmp.close();
throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
}
} catch (ArchiveException e) {
tmp.close();
throw new TikaException("Unable to unpack document stream", e);
}

Expand Down Expand Up @@ -330,7 +353,6 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
}
} finally {
ais.close();
tmp.close();
xhtml.endDocument();
}
}
Expand Down Expand Up @@ -518,7 +540,7 @@ public ArchiveEntry getNextEntry() throws IOException {
/**
 * Closes this object by delegating to {@code file.close()}.
 *
 * @throws IOException if {@code file.close()} throws
 */
@Override
public void close() throws IOException {
file.close();
}
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.List;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
Expand Down Expand Up @@ -121,7 +120,9 @@ static MediaType detectArchiveFormat(byte[] prefix, int length) {
return TIFF;
}
try {
String name = ArchiveStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix, 0, length));
String name =
TikaArchiveStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix,
0, length));
return PackageConstants.getMediaType(name);
} catch (ArchiveException e) {
return MediaType.OCTET_STREAM;
Expand Down
Loading

0 comments on commit ebdb6fc

Please sign in to comment.