Skip to content

Commit

Permalink
TIKA-4220 -- temporary workaround for tar detection regression (#1687)
Browse files Browse the repository at this point in the history
* TIKA-4220 on branch_2x -- temporary commons-compress workaround
  • Loading branch information
tballison authored Mar 25, 2024
1 parent 9dc037d commit 6dd4ac7
Show file tree
Hide file tree
Showing 3 changed files with 673 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.detect.zip.TikaArchiveStreamFactory;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
Expand Down Expand Up @@ -252,18 +253,38 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
}

TemporaryResources tmp = new TemporaryResources();
try {
_parse(stream, handler, metadata, context, tmp);
} finally {
tmp.close();
}
}

private void _parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context, TemporaryResources tmp)
throws TikaException, IOException, SAXException {
ArchiveInputStream ais = null;
String encoding = null;
try {
ArchiveStreamFactory factory =
context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
encoding = factory.getEntryEncoding();
// At the end we want to close the archive stream to release
// any associated resources, but the underlying document stream
// should not be closed

ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));

context.get(ArchiveStreamFactory.class);
//TODO -- remove this workaround (TIKA-4220) when we next upgrade commons-compress
//TODO -- we've probably already detected the stream's format by this point. We should
//rely on that earlier detection rather than re-detecting here.
if (factory != null) {
encoding = factory.getEntryEncoding();
// At the end we want to close the archive stream to release
// any associated resources, but the underlying document stream
// should not be closed
ais = factory.createArchiveInputStream(new CloseShieldInputStream(stream));
} else {
TikaArchiveStreamFactory tikaFactory = new TikaArchiveStreamFactory();
encoding = tikaFactory.getEntryEncoding();
// At the end we want to close the archive stream to release
// any associated resources, but the underlying document stream
// should not be closed
ais = tikaFactory.createArchiveInputStream(new CloseShieldInputStream(stream));
}
} catch (StreamingNotSupportedException sne) {
// Most archive formats work on streams, but a few need files
if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
Expand Down Expand Up @@ -293,11 +314,9 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
// Pending a fix for COMPRESS-269 / TIKA-1525, this bit is a little nasty
ais = new SevenZWrapper(sevenz);
} else {
tmp.close();
throw new TikaException("Unknown non-streaming format " + sne.getFormat(), sne);
}
} catch (ArchiveException e) {
tmp.close();
throw new TikaException("Unable to unpack document stream", e);
}

Expand Down Expand Up @@ -329,7 +348,6 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
}
} finally {
ais.close();
tmp.close();
xhtml.endDocument();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import java.util.List;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
Expand Down Expand Up @@ -121,7 +120,9 @@ static MediaType detectArchiveFormat(byte[] prefix, int length) {
return TIFF;
}
try {
String name = ArchiveStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix, 0, length));
String name =
TikaArchiveStreamFactory.detect(new UnsynchronizedByteArrayInputStream(prefix,
0, length));
return PackageConstants.getMediaType(name);
} catch (ArchiveException e) {
return MediaType.OCTET_STREAM;
Expand Down
Loading

0 comments on commit 6dd4ac7

Please sign in to comment.