Skip to content

Commit

Permalink
TIKA-4256 -- allow inlining of ocr'd content in the RecursiveParserWr…
Browse files Browse the repository at this point in the history
…apper (#1762)

* TIKA-4256 -- allow inlining of ocr'd content
  • Loading branch information
tballison authored May 20, 2024
1 parent 429fb9a commit 7a03331
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 31 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.extractor;

import org.xml.sax.ContentHandler;

/**
 * Minimal holder class that lets a parser pass the parent document's
 * {@link ContentHandler} through to the parse of an embedded document.
 */
public class ParentContentHandler {

    /** The handler supplied at construction; stored as-is and never reassigned. */
    private final ContentHandler contentHandler;

    /**
     * Wraps the given handler.
     *
     * @param contentHandler the parent content handler to carry; no validation is performed
     */
    public ParentContentHandler(ContentHandler contentHandler) {
        this.contentHandler = contentHandler;
    }

    /**
     * @return the same handler instance that was passed to the constructor
     */
    public ContentHandler getContentHandler() {
        return this.contentHandler;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
Expand All @@ -29,6 +31,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
Expand Down Expand Up @@ -82,6 +85,7 @@ public class RecursiveParserWrapper extends ParserDecorator {

private final boolean catchEmbeddedExceptions;

private final boolean inlineContent = false;
/**
* Initialize the wrapper with {@link #catchEmbeddedExceptions} set
* to <code>true</code> as default.
Expand Down Expand Up @@ -158,7 +162,7 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
RecursivelySecureContentHandler secureContentHandler =
new RecursivelySecureContentHandler(localHandler, tis, writeLimit,
new RecursivelySecureContentHandler(localHandler, tis, new SecureHandlerCounter(writeLimit),
throwOnWriteLimitReached, context);
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
Expand All @@ -179,6 +183,7 @@ public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandl
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
parserState.recursiveParserWrapperHandler.endDocument();
context.set(RecursivelySecureContentHandler.class, null);
}
}

Expand Down Expand Up @@ -250,12 +255,21 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata,
new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
idPath, parserState));
long started = System.currentTimeMillis();
RecursivelySecureContentHandler secureContentHandler =
context.get(RecursivelySecureContentHandler.class);
//store the handler that was used before this parse
//so that you can return it back to its state at the end of this parse
ContentHandler preContextHandler = secureContentHandler.handler;
secureContentHandler.updateContentHandler(localHandler);
RecursivelySecureContentHandler preParseHandler = context.get(RecursivelySecureContentHandler.class);

ParentContentHandler preParseParentHandler = context.get(ParentContentHandler.class);
context.set(ParentContentHandler.class, new ParentContentHandler(preParseHandler));
TemporaryResources tmp = null;
TikaInputStream tis = TikaInputStream.cast(stream);
if (tis == null) {
tmp = new TemporaryResources();
tis = TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata);
}
ContentHandler secureContentHandler =
new RecursivelySecureContentHandler(localHandler, tis, preParseHandler.handlerCounter,
preParseHandler.throwOnWriteLimitReached, context);

try {
super.parse(stream, secureContentHandler, metadata, context);
Expand Down Expand Up @@ -286,11 +300,15 @@ public void parse(InputStream stream, ContentHandler ignore, Metadata metadata,
}
} finally {
context.set(Parser.class, preContextParser);
secureContentHandler.updateContentHandler(preContextHandler);
context.set(RecursivelySecureContentHandler.class, preParseHandler);
context.set(ParentContentHandler.class, preParseParentHandler);
long elapsedMillis = System.currentTimeMillis() - started;
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler
.endEmbeddedDocument(localHandler, metadata);
if (tmp != null) {
tis.close();
}
}
}
}
Expand All @@ -308,35 +326,51 @@ private ParserState(AbstractRecursiveParserWrapperHandler handler) {
}
}

static class RecursivelySecureContentHandler extends SecureContentHandler {
private ContentHandler handler;

//total allowable chars across all handlers
static class SecureHandlerCounter {
    //total allowable chars across all handlers
    private final int totalWriteLimit;
    //starts out false; flipped by the content handler once the limit has been hit
    private boolean writeLimitReached = false;
    //total chars written to all handlers
    private int totalChars = 0;

    /**
     * @param totalWriteLimit the overall character budget tracked by this counter
     */
    private SecureHandlerCounter(int totalWriteLimit) {
        this.totalWriteLimit = totalWriteLimit;
    }

    /**
     * Given the requested length, how many characters are actually available.
     *
     * @param length the number of characters the caller would like to write
     * @return the smaller of <code>length</code> and the remaining budget
     *         (<code>totalWriteLimit - totalChars</code>)
     */
    int getAvailable(int length) {
        int remaining = totalWriteLimit - totalChars;
        return remaining <= length ? remaining : length;
    }

    /**
     * Adds to the running total of characters written.
     *
     * @param numChars the number of characters to add to the total
     */
    void addChars(int numChars) {
        totalChars = totalChars + numChars;
    }

}

//
static class RecursivelySecureContentHandler extends SecureContentHandler {
private static AtomicInteger COUNTER = new AtomicInteger();
private final ContentHandler handler;
private final SecureHandlerCounter handlerCounter;

private final boolean throwOnWriteLimitReached;

private final ParseContext parseContext;

private boolean writeLimitReached = false;
private final int id = COUNTER.getAndIncrement();

//total chars written to all handlers
private int totalChars = 0;
public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream,
int totalWriteLimit,
SecureHandlerCounter handlerCounter,
boolean throwOnWriteLimitReached, ParseContext parseContext) {
super(handler, stream);
this.handler = handler;
this.totalWriteLimit = totalWriteLimit;
this.handlerCounter = handlerCounter;
this.throwOnWriteLimitReached = throwOnWriteLimitReached;
this.parseContext = parseContext;
}

public void updateContentHandler(ContentHandler handler) {
setContentHandler(handler);
this.handler = handler;
}

/**
* Bypass the SecureContentHandler...
* <p>
Expand Down Expand Up @@ -364,45 +398,45 @@ public void endElement(String uri, String localName, String name) throws SAXExce

@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (writeLimitReached) {
if (handlerCounter.writeLimitReached) {
return;
}

if (totalWriteLimit < 0) {
if (handlerCounter.totalWriteLimit < 0) {
super.characters(ch, start, length);
return;
}
int availableLength = Math.min(totalWriteLimit - totalChars, length);
int availableLength = handlerCounter.getAvailable(length);
super.characters(ch, start, availableLength);
totalChars += availableLength;
handlerCounter.addChars(availableLength);
if (availableLength < length) {
handleWriteLimitReached();
}
}

@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
if (writeLimitReached) {
if (handlerCounter.writeLimitReached) {
return;
}

if (totalWriteLimit < 0) {
if (handlerCounter.totalWriteLimit < 0) {
super.ignorableWhitespace(ch, start, length);
return;
}
int availableLength = Math.min(totalWriteLimit - totalChars, length);
int availableLength = handlerCounter.getAvailable(length);
super.ignorableWhitespace(ch, start, availableLength);
totalChars += availableLength;
handlerCounter.addChars(availableLength);
if (availableLength < length) {
handleWriteLimitReached();
}
}

private void handleWriteLimitReached() throws WriteLimitReachedException {
writeLimitReached = true;
handlerCounter.writeLimitReached = true;

if (throwOnWriteLimitReached) {
throw new WriteLimitReachedException(totalWriteLimit);
throw new WriteLimitReachedException(handlerCounter.totalWriteLimit);
} else {
ParseRecord parseRecord = parseContext.get(ParseRecord.class);
if (parseRecord != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ public class TesseractOCRConfig implements Serializable {
// See addOtherTesseractConfig.
private Map<String, String> otherTesseractConfig = new HashMap<>();
private Set<String> userConfigured = new HashSet<>();
private boolean inlineContent = false;

/**
* This takes a language string, parses it and then bins individual langs into
Expand Down Expand Up @@ -477,6 +478,15 @@ public boolean isApplyRotation() {
return this.applyRotation;
}

/**
 * Sets the inline-content flag and records <code>"inlineContent"</code>
 * in the set of user-configured settings.
 *
 * @param inlineContent the new value of the flag
 */
public void setInlineContent(boolean inlineContent) {
    userConfigured.add("inlineContent");
    this.inlineContent = inlineContent;
}

/**
 * @return the current value of the inline-content flag
 */
public boolean isInlineContent() {
    return this.inlineContent;
}

/**
* Sets whether or not a rotation value should be calculated and passed to ImageMagick.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,19 @@
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
Expand Down Expand Up @@ -265,13 +270,33 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
//this is the text output file name specified on the tesseract
//commandline. The actual output file name will have a suffix added.
File tmpOCROutputFile = tmp.createTemporaryFile();
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
ContentHandler baseHandler = getContentHandler(config.isInlineContent(), handler, metadata, parseContext);
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
xhtml.startDocument();
parse(tikaStream, tmpOCROutputFile, xhtml, metadata, parseContext, config);
xhtml.endDocument();
}
}

/**
 * Chooses the handler that the OCR output should be written to.
 * <p>
 * The supplied <code>handler</code> is returned unchanged unless all of the following hold:
 * inlining is requested, a {@link ParentContentHandler} is present in the parse context,
 * and the metadata's embedded resource type is <code>INLINE</code>. In that case the
 * result is a {@link TeeContentHandler} built from the parent's handler (wrapped in a
 * {@link BodyContentHandler} and then an {@link EmbeddedContentHandler}) and the
 * supplied <code>handler</code>.
 *
 * @param isInlineContent whether inlining into the parent handler was requested
 * @param handler the handler passed to this parse
 * @param metadata metadata of the current document; read for the embedded resource type
 * @param parseContext context that is checked for a {@link ParentContentHandler}
 * @return <code>handler</code> itself, or a tee over the wrapped parent handler and <code>handler</code>
 */
private ContentHandler getContentHandler(boolean isInlineContent,
        ContentHandler handler, Metadata metadata, ParseContext parseContext) {
    //only look for a parent handler when inlining was requested;
    //if there's no parent, skip
    ParentContentHandler parent =
            isInlineContent ? parseContext.get(ParentContentHandler.class) : null;
    if (parent == null) {
        return handler;
    }

    String inlineTypeName = TikaCoreProperties.EmbeddedResourceType.INLINE.name();
    String actualTypeName = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
    boolean isInlineResource = inlineTypeName.equals(actualTypeName);
    if (!isInlineResource) {
        return handler;
    }

    //check for literally the same or wrapped parent and handler?
    ContentHandler parentBody = new BodyContentHandler(parent.getContentHandler());
    ContentHandler parentEmbedded = new EmbeddedContentHandler(parentBody);
    return new TeeContentHandler(parentEmbedded, handler);
}

private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile,
ContentHandler xhtml,
Metadata metadata, ParseContext parseContext, TesseractOCRConfig config)
Expand Down Expand Up @@ -824,6 +849,15 @@ public void setApplyRotation(boolean applyRotation) {
public boolean isApplyRotation() {
return defaultConfig.isApplyRotation();
}

/**
 * Forwards the inline-content setting to this parser's default config.
 *
 * @param inlineContent the value handed to the default config's <code>setInlineContent</code>
 */
@Field
public void setInlineContent(boolean inlineContent) {
    this.defaultConfig.setInlineContent(inlineContent);
}

/**
 * @return the inline-content setting as reported by this parser's default config
 */
public boolean isInlineContent() {
    return this.defaultConfig.isInlineContent();
}
/**
* If set to <code>true</code> and if tesseract is found, this will load the
* langs that result from --list-langs. At parse time, the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,24 @@ public void getNormalMetadataToo() throws Exception {
assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
}

/**
 * Parses <code>testOCR.pptx</code> recursively with inline content enabled on the
 * {@link TesseractOCRConfig} and checks which of the resulting documents contain
 * each of two expected strings.
 */
@Test
public void testInlining() throws Exception {
    assumeTrue(canRun(), "can run OCR");

    TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
    ocrConfig.setInlineContent(true);
    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, ocrConfig);

    List<Metadata> metadataList = getRecursiveMetadata("testOCR.pptx", parseContext);
    debug(metadataList);
    //0 is main doc, 1 is embedded image, 2 is thumbnail
    assertEquals(3, metadataList.size());

    //fetch content only after the size check so a short list fails on the assertion above
    String mainContent = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
    String imageContent = metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT);
    String thumbnailContent = metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT);

    //expected in the main doc only
    String mainOnlyText = "This is some text";
    assertContains(mainOnlyText, mainContent);
    assertNotContained(mainOnlyText, imageContent);
    assertNotContained(mainOnlyText, thumbnailContent);

    //expected in both the main doc and the embedded image
    //presumably the OCR'd text that gets inlined into the parent -- confirm against the test file
    String sharedText = "Happy New Year 2003";
    assertContains(sharedText, mainContent);
    assertContains(sharedText, imageContent);
}
//TODO: add unit tests for jp2/jpx/ppm TIKA-2174

}

0 comments on commit 7a03331

Please sign in to comment.