Skip to content

Commit

Permalink
TIKA-4154 - parameterize max string length for jackson (#1402)
Browse files Browse the repository at this point in the history
* TIKA-4154 -- parameterize max string length for serialization with jackson

(cherry picked from commit e726aea)
  • Loading branch information
tballison committed Oct 14, 2023
1 parent eae931f commit 4f922cc
Show file tree
Hide file tree
Showing 13 changed files with 153 additions and 96 deletions.
32 changes: 32 additions & 0 deletions tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@
*/
public class TikaConfig {

// Fallback maximum length for a single JSON string field; used as the initial value of
// MAX_JSON_STRING_FIELD_LENGTH when the config does not override it.
// NOTE(review): this field and the element name below are used as constants but are not
// declared final, so any caller can reassign them -- confirm whether that is intended.
public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000;//jackson's default
// Name of the child element of the config root whose text overrides the default above.
public static String MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME = "maxJsonStringFieldLength";

//use this to look for unneeded instantiations of TikaConfig
protected static final AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();

Expand All @@ -103,6 +106,8 @@ public class TikaConfig {
private final MetadataFilter metadataFilter;
private final AutoDetectParserConfig autoDetectParserConfig;

// Current maximum JSON string field length. This is static, so it is shared by every
// TikaConfig instance: it is overwritten by setMaxJsonStringFieldLength(Element), which is
// invoked from the constructors, meaning the most recently loaded config that contains the
// element wins.
// NOTE(review): written from instance constructors without volatile or synchronization --
// confirm visibility is acceptable if configs are built on multiple threads.
private static int MAX_JSON_STRING_FIELD_LENGTH = DEFAULT_MAX_JSON_STRING_FIELD_LENGTH;

/**
 * Creates a configuration from the file at the given path string.
 * <p>
 * The string is converted with {@code Paths.get(file)} and all work is delegated to the
 * {@code Path}-based constructor; the declared exceptions are propagated from there.
 *
 * @param file path to the configuration file, as a string
 */
public TikaConfig(String file) throws TikaException, IOException, SAXException {
this(Paths.get(file));
}
Expand Down Expand Up @@ -174,6 +179,7 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException,
this.metadataFilter = MetadataFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
this.serviceLoader = loader;
setMaxJsonStringFieldLength(element);
TIMES_INSTANTIATED.incrementAndGet();
}

Expand Down Expand Up @@ -273,6 +279,7 @@ public TikaConfig() throws TikaException, IOException {
executorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.metadataFilter = MetadataFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
setMaxJsonStringFieldLength(element);
} catch (SAXException e) {
throw new TikaException("Specified Tika configuration has syntax errors: " + config,
e);
Expand All @@ -281,6 +288,31 @@ public TikaConfig() throws TikaException, IOException {
TIMES_INSTANTIATED.incrementAndGet();
}

/**
 * Returns the currently configured maximum length for a single JSON string field.
 * <p>
 * The value starts at {@link #DEFAULT_MAX_JSON_STRING_FIELD_LENGTH} and is replaced when a
 * {@code TikaConfig} is constructed from a config whose root contains a
 * {@code maxJsonStringFieldLength} element. The backing field is static, so the value is
 * shared by all {@code TikaConfig} instances rather than being tied to one of them.
 *
 * @return maximum field length when serializing String fields in Tika's metadata or metadata
 * list into JSON
 */
public static int getMaxJsonStringFieldLength() {
return MAX_JSON_STRING_FIELD_LENGTH;
}

/**
 * Reads the optional {@code maxJsonStringFieldLength} child element of the config root and,
 * when present, stores its value in the static {@code MAX_JSON_STRING_FIELD_LENGTH}.
 * <p>
 * Only the first matching child is considered. If no such child exists the current value is
 * left untouched. Surrounding whitespace in the element text (for example from pretty-printed
 * XML) is ignored before parsing.
 *
 * @param properties root element of the Tika config whose direct children are scanned
 * @throws TikaConfigException if the element text is not an integer, or if it is negative
 *                             (Jackson's {@code maxStringLength} rejects negative limits, so
 *                             failing here reports the problem at config load time instead of
 *                             on the first JSON call)
 */
private void setMaxJsonStringFieldLength(Element properties) throws TikaConfigException {
    NodeList nodeList = properties.getChildNodes();
    for (int i = 0; i < nodeList.getLength(); i++) {
        Node n = nodeList.item(i);
        if (!MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME.equals(n.getNodeName())) {
            continue;
        }
        String raw = n.getTextContent();
        int parsed;
        try {
            // trim so that "<x> 123 </x>" or a value on its own line is accepted
            parsed = Integer.parseInt(raw == null ? "" : raw.trim());
        } catch (NumberFormatException e) {
            throw new TikaConfigException(MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " +
                    "is not an integer", e);
        }
        if (parsed < 0) {
            throw new TikaConfigException(MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " +
                    "must not be negative: " + parsed);
        }
        MAX_JSON_STRING_FIELD_LENGTH = parsed;
        return;
    }
}

/**
 * Convenience wrapper that delegates directly to
 * {@code MimeTypes.getDefaultMimeTypes(ClassLoader)}.
 *
 * @param loader class loader passed through unchanged to {@code MimeTypes}
 * @return whatever {@code MimeTypes.getDefaultMimeTypes(loader)} returns
 */
private static MimeTypes getDefaultMimeTypes(ClassLoader loader) {
return MimeTypes.getDefaultMimeTypes(loader);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,20 @@

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.StreamReadConstraints;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.EmitKey;

public class JsonEmitData {

public static void toJson(EmitData emitData, Writer writer) throws IOException {
try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
try (JsonGenerator jsonGenerator = new JsonFactory()
.setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength())
.build()).createGenerator(writer)) {
jsonGenerator.writeStartObject();
EmitKey key = emitData.getEmitKey();
jsonGenerator.writeStringField(JsonFetchEmitTuple.EMITTER, key.getEmitterName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.StreamReadConstraints;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
Expand Down Expand Up @@ -54,7 +56,8 @@ public class JsonFetchEmitTuple {


public static FetchEmitTuple fromJson(Reader reader) throws IOException {
try (JsonParser jParser = new JsonFactory().createParser(reader)) {
try (JsonParser jParser = new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build()).createParser(reader)) {
JsonToken token = jParser.nextToken();
if (token != JsonToken.START_OBJECT) {
throw new IOException("require start object, but see: " + token.name());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,17 @@
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.StreamReadConstraints;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.pipes.FetchEmitTuple;

public class JsonFetchEmitTupleList {

public static List<FetchEmitTuple> fromJson(Reader reader) throws IOException {
List<FetchEmitTuple> list;
try (JsonParser jParser = new JsonFactory().createParser(reader)) {
try (JsonParser jParser = new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build()).createParser(reader)) {
JsonToken token = jParser.nextToken();
if (token != JsonToken.START_ARRAY) {
throw new IOException("require start array, but see: " + token.name());
Expand All @@ -56,7 +59,8 @@ public static String toJson(List<FetchEmitTuple> list) throws IOException {

public static void toJson(List<FetchEmitTuple> list, Writer writer) throws IOException {

try (JsonGenerator jsonGenerator = new JsonFactory().createGenerator(writer)) {
try (JsonGenerator jsonGenerator = new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build()).createGenerator(writer)) {
jsonGenerator.writeStartArray();
for (FetchEmitTuple t : list) {
JsonFetchEmitTuple.writeTuple(t, jsonGenerator);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.StreamReadConstraints;
import org.apache.commons.io.input.CloseShieldReader;
import org.apache.commons.io.output.CloseShieldWriter;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;

Expand All @@ -48,7 +50,10 @@ public static void toJson(Metadata metadata, Writer writer) throws IOException {
writer.write("null");
return;
}
try (JsonGenerator jsonGenerator = new JsonFactory()
long max = TikaConfig.getMaxJsonStringFieldLength();
try (JsonGenerator jsonGenerator = new JsonFactory().setStreamReadConstraints(
StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build())
.createGenerator(new CloseShieldWriter(writer))) {
if (PRETTY_PRINT) {
jsonGenerator.useDefaultPrettyPrinter();
Expand Down Expand Up @@ -92,7 +97,9 @@ static void writeMetadataObject(Metadata metadata, JsonGenerator jsonGenerator,
*/
public static Metadata fromJson(Reader reader) throws IOException {
Metadata m = null;
try (JsonParser jParser = new JsonFactory().createParser(new CloseShieldReader(reader))) {
try (JsonParser jParser = new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build())
.createParser(new CloseShieldReader(reader))) {
m = readMetadataObject(jParser);
}
return m;
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.StreamReadConstraints;
import org.apache.commons.io.input.CloseShieldReader;
import org.apache.commons.io.output.CloseShieldWriter;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

Expand All @@ -40,15 +42,18 @@ public class JsonMetadataList {
*
* @param metadataList list of metadata to write
* @param writer writer
* @param prettyPrint whether or not to pretty print the output
* @param prettyPrint whether or not to pretty print the output
* @throws IOException if writing to the writer fails
*/
public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint) throws IOException {
public static void toJson(List<Metadata> metadataList, Writer writer, boolean prettyPrint)
throws IOException {
if (metadataList == null) {
writer.write("null");
return;
}
try (JsonGenerator jsonGenerator = new JsonFactory()
try (JsonGenerator jsonGenerator = new JsonFactory().setStreamReadConstraints(
StreamReadConstraints.builder().maxStringLength(
TikaConfig.getMaxJsonStringFieldLength()).build())
.createGenerator(new CloseShieldWriter(writer))) {
if (prettyPrint) {
jsonGenerator.useDefaultPrettyPrinter();
Expand Down Expand Up @@ -85,7 +90,9 @@ public static List<Metadata> fromJson(Reader reader) throws IOException {
return ms;
}
ms = new ArrayList<>();
try (JsonParser jParser = new JsonFactory().createParser(new CloseShieldReader(reader))) {
try (JsonParser jParser = new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build())
.createParser(new CloseShieldReader(reader))) {

JsonToken token = jParser.nextToken();
if (token != JsonToken.START_ARRAY) {
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.StreamReadConstraints;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;


Expand All @@ -39,7 +41,10 @@ public JsonStreamingSerializer(Writer writer) {

public void add(Metadata metadata) throws IOException {
if (!hasStartedArray) {
jsonGenerator = new JsonFactory().createGenerator(writer);
jsonGenerator =
new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
.maxStringLength(TikaConfig.getMaxJsonStringFieldLength()).build())
.createGenerator(writer);
jsonGenerator.writeStartArray();
hasStartedArray = true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;

import org.junit.jupiter.api.Test;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;

public class JsonMetadataTest {
Expand Down Expand Up @@ -109,13 +111,18 @@ public void testLargeNumberOfKeys() throws Exception {

@Test
public void testLargeValues() throws Exception {
//TIKA-4154
TikaConfig tikaConfig = null;
try (InputStream is =
JsonMetadata.class.getResourceAsStream("/config/tika-config-json.xml")) {
tikaConfig = new TikaConfig(is);
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 1000000; i++) {
for (int i = 0; i < 30000000; i++) {
sb.append("v");
}
Metadata m = new Metadata();
m.add("large_value1", sb.toString());
m.add("large_value2", sb.toString());
m.add("large_value", sb.toString());
StringWriter writer = new StringWriter();
JsonMetadata.toJson(m, writer);
Metadata deserialized = JsonMetadata.fromJson(new StringReader(writer.toString()));
Expand Down
20 changes: 20 additions & 0 deletions tika-serialization/src/test/resources/config/tika-config-json.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Test configuration for JsonMetadataTest#testLargeValues: raises the maximum JSON string
field length to 50,000,000 so that a 30,000,000 character metadata value, which is larger
than the 20,000,000 default, can be written and read back.
-->
<properties>
<maxJsonStringFieldLength>50000000</maxJsonStringFieldLength>
</properties>
Loading

0 comments on commit 4f922cc

Please sign in to comment.