diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
new file mode 100644
index 0000000000..83220f0d1b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.xml.sax.ContentHandler;
+
+/**
+ * Simple pointer class that lets a parser pass its parent's content handler
+ * through to the parse of the embedded document.
+ */
+public class ParentContentHandler {
+ // Handler of the parent document; assigned once in the constructor.
+ private final ContentHandler contentHandler;
+ /** @param contentHandler the parent's handler; stored as given, no null check */
+ public ParentContentHandler(ContentHandler contentHandler) {
+ this.contentHandler = contentHandler;
+ }
+ /** @return the handler that was passed to the constructor, unchanged */
+ public ContentHandler getContentHandler() {
+ return contentHandler;
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3cb78d5207..629b289aea 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -19,7 +19,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -29,6 +31,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -82,6 +85,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
private final boolean catchEmbeddedExceptions;
+ private final boolean inlineContent = false;
/**
* Initialize the wrapper with {@link #catchEmbeddedExceptions} set
* to <code>true</code> as default.
@@ -158,7 +162,7 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
RecursivelySecureContentHandler secureContentHandler =
- new RecursivelySecureContentHandler(localHandler, tis, writeLimit,
+ new RecursivelySecureContentHandler(localHandler, tis, new SecureHandlerCounter(writeLimit),
throwOnWriteLimitReached, context);
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
@@ -179,6 +183,7 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
parserState.recursiveParserWrapperHandler.endDocument();
+ context.set(RecursivelySecureContentHandler.class, null);
}
}
@@ -250,12 +255,21 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata,
new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
idPath, parserState));
long started = System.currentTimeMillis();
- RecursivelySecureContentHandler secureContentHandler =
- context.get(RecursivelySecureContentHandler.class);
//store the handler that was used before this parse
//so that you can return it back to its state at the end of this parse
- ContentHandler preContextHandler = secureContentHandler.handler;
- secureContentHandler.updateContentHandler(localHandler);
+ RecursivelySecureContentHandler preParseHandler = context.get(RecursivelySecureContentHandler.class);
+
+ ParentContentHandler preParseParentHandler = context.get(ParentContentHandler.class);
+ context.set(ParentContentHandler.class, new ParentContentHandler(preParseHandler));
+ TemporaryResources tmp = null;
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ if (tis == null) {
+ tmp = new TemporaryResources();
+ tis = TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata);
+ }
+ ContentHandler secureContentHandler =
+ new RecursivelySecureContentHandler(localHandler, tis, preParseHandler.handlerCounter,
+ preParseHandler.throwOnWriteLimitReached, context);
try {
super.parse(stream, secureContentHandler, metadata, context);
@@ -286,11 +300,15 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata,
}
} finally {
context.set(Parser.class, preContextParser);
- secureContentHandler.updateContentHandler(preContextHandler);
+ context.set(RecursivelySecureContentHandler.class, preParseHandler);
+ context.set(ParentContentHandler.class, preParseParentHandler);
long elapsedMillis = System.currentTimeMillis() - started;
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler
.endEmbeddedDocument(localHandler, metadata);
+ if (tmp != null) {
+ tis.close();
+ }
}
}
}
@@ -308,35 +326,51 @@ private ParserState(AbstractRecursiveParserWrapperHandler handler) {
}
}
- static class RecursivelySecureContentHandler extends SecureContentHandler {
- private ContentHandler handler;
-
- //total allowable chars across all handlers
+ static class SecureHandlerCounter {
private final int totalWriteLimit;
+ private boolean writeLimitReached = false;
+ //running total of chars written through every handler that shares this counter
+ private int totalChars = 0;
+
+ private SecureHandlerCounter(int totalWriteLimit) {
+ this.totalWriteLimit = totalWriteLimit;
+ }
+ /**
+ * Caps a requested write at whatever is left of the total write limit.
+ * @param length number of characters the caller wants to write
+ * @return the smaller of {@code length} and {@code totalWriteLimit - totalChars}
+ */
+ int getAvailable(int length) {
+ return Math.min(totalWriteLimit - totalChars, length);
+ }
+ void addChars(int numChars) {
+ totalChars += numChars;
+ }
+
+ }
+
+ //content handler whose write limit is tracked by a SecureHandlerCounter
+ static class RecursivelySecureContentHandler extends SecureContentHandler {
+ private static AtomicInteger COUNTER = new AtomicInteger();
+ private final ContentHandler handler;
+ private final SecureHandlerCounter handlerCounter;
private final boolean throwOnWriteLimitReached;
private final ParseContext parseContext;
- private boolean writeLimitReached = false;
+ private final int id = COUNTER.getAndIncrement();
- //total chars written to all handlers
- private int totalChars = 0;
public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream,
- int totalWriteLimit,
+ SecureHandlerCounter handlerCounter,
boolean throwOnWriteLimitReached, ParseContext parseContext) {
super(handler, stream);
this.handler = handler;
- this.totalWriteLimit = totalWriteLimit;
+ this.handlerCounter = handlerCounter;
this.throwOnWriteLimitReached = throwOnWriteLimitReached;
this.parseContext = parseContext;
}
- public void updateContentHandler(ContentHandler handler) {
- setContentHandler(handler);
- this.handler = handler;
- }
-
/**
* Bypass the SecureContentHandler...
*
@@ -364,17 +398,17 @@ public void endElement(String uri, String localName, String name) throws SAXExce
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
- if (writeLimitReached) {
+ if (handlerCounter.writeLimitReached) {
return;
}
- if (totalWriteLimit < 0) {
+ if (handlerCounter.totalWriteLimit < 0) {
super.characters(ch, start, length);
return;
}
- int availableLength = Math.min(totalWriteLimit - totalChars, length);
+ int availableLength = handlerCounter.getAvailable(length);
super.characters(ch, start, availableLength);
- totalChars += availableLength;
+ handlerCounter.addChars(availableLength);
if (availableLength < length) {
handleWriteLimitReached();
}
@@ -382,27 +416,27 @@ public void characters(char[] ch, int start, int length) throws SAXException {
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- if (writeLimitReached) {
+ if (handlerCounter.writeLimitReached) {
return;
}
- if (totalWriteLimit < 0) {
+ if (handlerCounter.totalWriteLimit < 0) {
super.ignorableWhitespace(ch, start, length);
return;
}
- int availableLength = Math.min(totalWriteLimit - totalChars, length);
+ int availableLength = handlerCounter.getAvailable(length);
super.ignorableWhitespace(ch, start, availableLength);
- totalChars += availableLength;
+ handlerCounter.addChars(availableLength);
if (availableLength < length) {
handleWriteLimitReached();
}
}
private void handleWriteLimitReached() throws WriteLimitReachedException {
- writeLimitReached = true;
+ handlerCounter.writeLimitReached = true;
if (throwOnWriteLimitReached) {
- throw new WriteLimitReachedException(totalWriteLimit);
+ throw new WriteLimitReachedException(handlerCounter.totalWriteLimit);
} else {
ParseRecord parseRecord = parseContext.get(ParseRecord.class);
if (parseRecord != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 18359735ff..ec021643cf 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -96,6 +96,7 @@ public class TesseractOCRConfig implements Serializable {
// See addOtherTesseractConfig.
private Maptrue
and if tesseract is found, this will load the
* langs that result from --list-langs. At parse time, the
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 6ce19e3dd2..764930672d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -248,6 +248,24 @@ public void getNormalMetadataToo() throws Exception {
assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
}
+ @Test
+ public void testInlining() throws Exception {
+ assumeTrue(canRun(), "can run OCR");
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ config.setInlineContent(true);
+ ParseContext context = new ParseContext();
+ context.set(TesseractOCRConfig.class, config);
+ List