diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java new file mode 100644 index 0000000000..83220f0d1b --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.extractor; + +import org.xml.sax.ContentHandler; + +/** + * Simple holder class that lets a parser pass the parent document's {@link ContentHandler} through + * to the embedded document's parse via the parse context. + */ +public class ParentContentHandler { + + private final ContentHandler contentHandler; + + public ParentContentHandler(ContentHandler contentHandler) { + this.contentHandler = contentHandler; + } + + public ContentHandler getContentHandler() { + return contentHandler; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 3cb78d5207..629b289aea 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -19,7 +19,9 @@ import java.io.IOException; import java.io.InputStream; import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import org.apache.commons.io.input.CloseShieldInputStream; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -29,6 +31,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.extractor.ParentContentHandler; import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -82,6 +85,7 @@ public class RecursiveParserWrapper extends ParserDecorator { private final boolean catchEmbeddedExceptions; + private final boolean inlineContent = false; /** * Initialize the wrapper with {@link #catchEmbeddedExceptions} set * to true as default. 
@@ -158,7 +162,7 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl try { TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata); RecursivelySecureContentHandler secureContentHandler = - new RecursivelySecureContentHandler(localHandler, tis, writeLimit, + new RecursivelySecureContentHandler(localHandler, tis, new SecureHandlerCounter(writeLimit), throwOnWriteLimitReached, context); context.set(RecursivelySecureContentHandler.class, secureContentHandler); getWrappedParser().parse(tis, secureContentHandler, metadata, context); @@ -179,6 +183,7 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata); parserState.recursiveParserWrapperHandler.endDocument(); + context.set(RecursivelySecureContentHandler.class, null); } } @@ -250,12 +255,21 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, new EmbeddedParserDecorator(getWrappedParser(), objectLocation, idPath, parserState)); long started = System.currentTimeMillis(); - RecursivelySecureContentHandler secureContentHandler = - context.get(RecursivelySecureContentHandler.class); //store the handler that was used before this parse //so that you can return it back to its state at the end of this parse - ContentHandler preContextHandler = secureContentHandler.handler; - secureContentHandler.updateContentHandler(localHandler); + RecursivelySecureContentHandler preParseHandler = context.get(RecursivelySecureContentHandler.class); + + ParentContentHandler preParseParentHandler = context.get(ParentContentHandler.class); + context.set(ParentContentHandler.class, new ParentContentHandler(preParseHandler)); + TemporaryResources tmp = null; + TikaInputStream tis = TikaInputStream.cast(stream); + if (tis == null) { + tmp = new TemporaryResources(); + tis = 
TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata); + } + ContentHandler secureContentHandler = + new RecursivelySecureContentHandler(localHandler, tis, preParseHandler.handlerCounter, + preParseHandler.throwOnWriteLimitReached, context); try { super.parse(stream, secureContentHandler, metadata, context); @@ -286,11 +300,15 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata, } } finally { context.set(Parser.class, preContextParser); - secureContentHandler.updateContentHandler(preContextHandler); + context.set(RecursivelySecureContentHandler.class, preParseHandler); + context.set(ParentContentHandler.class, preParseParentHandler); long elapsedMillis = System.currentTimeMillis() - started; metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); parserState.recursiveParserWrapperHandler .endEmbeddedDocument(localHandler, metadata); + if (tmp != null) { + tis.close(); + } } } } @@ -308,35 +326,51 @@ private ParserState(AbstractRecursiveParserWrapperHandler handler) { } } - static class RecursivelySecureContentHandler extends SecureContentHandler { - private ContentHandler handler; - - //total allowable chars across all handlers + static class SecureHandlerCounter { private final int totalWriteLimit; + private boolean writeLimitReached = false; + //total chars written to all handlers + private int totalChars = 0; + + private SecureHandlerCounter(int totalWriteLimit) { + this.totalWriteLimit = totalWriteLimit; + } + /** + * Given the requested length, how many characters are actually available + * @param length number of characters the caller wants to write + * @return the smaller of {@code length} and the characters remaining under the total write limit + */ + int getAvailable(int length) { + return Math.min(totalWriteLimit - totalChars, length); + } + void addChars(int numChars) { + totalChars += numChars; + } + + } + + //content handler that shares one SecureHandlerCounter between the parent handler and the embedded handlers + static class RecursivelySecureContentHandler extends SecureContentHandler { + private static AtomicInteger COUNTER = new AtomicInteger(); + private final ContentHandler handler; + private final 
SecureHandlerCounter handlerCounter; private final boolean throwOnWriteLimitReached; private final ParseContext parseContext; - private boolean writeLimitReached = false; + private final int id = COUNTER.getAndIncrement(); - //total chars written to all handlers - private int totalChars = 0; public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream, - int totalWriteLimit, + SecureHandlerCounter handlerCounter, boolean throwOnWriteLimitReached, ParseContext parseContext) { super(handler, stream); this.handler = handler; - this.totalWriteLimit = totalWriteLimit; + this.handlerCounter = handlerCounter; this.throwOnWriteLimitReached = throwOnWriteLimitReached; this.parseContext = parseContext; } - public void updateContentHandler(ContentHandler handler) { - setContentHandler(handler); - this.handler = handler; - } - /** * Bypass the SecureContentHandler... *

@@ -364,17 +398,17 @@ public void endElement(String uri, String localName, String name) throws SAXExce @Override public void characters(char[] ch, int start, int length) throws SAXException { - if (writeLimitReached) { + if (handlerCounter.writeLimitReached) { return; } - if (totalWriteLimit < 0) { + if (handlerCounter.totalWriteLimit < 0) { super.characters(ch, start, length); return; } - int availableLength = Math.min(totalWriteLimit - totalChars, length); + int availableLength = handlerCounter.getAvailable(length); super.characters(ch, start, availableLength); - totalChars += availableLength; + handlerCounter.addChars(availableLength); if (availableLength < length) { handleWriteLimitReached(); } @@ -382,27 +416,27 @@ public void characters(char[] ch, int start, int length) throws SAXException { @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { - if (writeLimitReached) { + if (handlerCounter.writeLimitReached) { return; } - if (totalWriteLimit < 0) { + if (handlerCounter.totalWriteLimit < 0) { super.ignorableWhitespace(ch, start, length); return; } - int availableLength = Math.min(totalWriteLimit - totalChars, length); + int availableLength = handlerCounter.getAvailable(length); super.ignorableWhitespace(ch, start, availableLength); - totalChars += availableLength; + handlerCounter.addChars(availableLength); if (availableLength < length) { handleWriteLimitReached(); } } private void handleWriteLimitReached() throws WriteLimitReachedException { - writeLimitReached = true; + handlerCounter.writeLimitReached = true; if (throwOnWriteLimitReached) { - throw new WriteLimitReachedException(totalWriteLimit); + throw new WriteLimitReachedException(handlerCounter.totalWriteLimit); } else { ParseRecord parseRecord = parseContext.get(ParseRecord.class); if (parseRecord != null) { diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java index 18359735ff..ec021643cf 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java @@ -96,6 +96,7 @@ public class TesseractOCRConfig implements Serializable { // See addOtherTesseractConfig. private Map otherTesseractConfig = new HashMap<>(); private Set userConfigured = new HashSet<>(); + private boolean inlineContent = false; /** * This takes a language string, parses it and then bins individual langs into @@ -477,6 +478,15 @@ public boolean isApplyRotation() { return this.applyRotation; } + public void setInlineContent(boolean inlineContent) { + this.inlineContent = inlineContent; + userConfigured.add("inlineContent"); + } + + public boolean isInlineContent() { + return inlineContent; + } + /** * Sets whether or not a rotation value should be calculated and passed to ImageMagick. 
* diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index a28ae8951f..8012a00f98 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -65,14 +65,19 @@ import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.ParentContentHandler; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractExternalProcessParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; @@ -265,13 +270,33 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, //this is the text output file name specified on the tesseract //commandline. The actual output file name will have a suffix added. 
File tmpOCROutputFile = tmp.createTemporaryFile(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + ContentHandler baseHandler = getContentHandler(config.isInlineContent(), handler, metadata, parseContext); + XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); xhtml.startDocument(); parse(tikaStream, tmpOCROutputFile, xhtml, metadata, parseContext, config); xhtml.endDocument(); } } + private ContentHandler getContentHandler(boolean isInlineContent, + ContentHandler handler, Metadata metadata, ParseContext parseContext) { + if (! isInlineContent) { + return handler; + } + //check for inlining of the parent content handler + //if there's no parent, skip + ParentContentHandler parentContentHandler = parseContext.get(ParentContentHandler.class); + if (parentContentHandler == null) { + return handler; + } + String embeddedType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (! TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(embeddedType)) { + return handler; + } + //TODO: check whether the parent handler and this handler are literally the same or wrapped? + return new TeeContentHandler(new EmbeddedContentHandler(new BodyContentHandler(parentContentHandler.getContentHandler())), handler); + } + private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ContentHandler xhtml, Metadata metadata, ParseContext parseContext, TesseractOCRConfig config) @@ -824,6 +849,15 @@ public void setApplyRotation(boolean applyRotation) { public boolean isApplyRotation() { return defaultConfig.isApplyRotation(); } + + @Field + public void setInlineContent(boolean inlineContent) { + defaultConfig.setInlineContent(inlineContent); + } + + public boolean isInlineContent() { + return defaultConfig.isInlineContent(); + } /** * If set to true and if tesseract is found, this will load the * langs that result from --list-langs. 
At parse time, the diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java index 6ce19e3dd2..764930672d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java @@ -248,6 +248,24 @@ public void getNormalMetadataToo() throws Exception { assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution")); } + @Test + public void testInlining() throws Exception { + assumeTrue(canRun(), "can run OCR"); + TesseractOCRConfig config = new TesseractOCRConfig(); + config.setInlineContent(true); + ParseContext context = new ParseContext(); + context.set(TesseractOCRConfig.class, config); + List metadataList = getRecursiveMetadata("testOCR.pptx", context); + debug(metadataList); + //metadataList index 0 is the main doc, 1 is the embedded image, 2 is the thumbnail + assertEquals(3, metadataList.size()); + assertContains("This is some text", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertNotContained("This is some text", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + assertNotContained("This is some text", metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT)); + + assertContains("Happy New Year 2003", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); + assertContains("Happy New Year 2003", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + } //TODO: add unit tests for jp2/jpx/ppm TIKA-2174 }