Skip to content

Commit

Permalink
TIKA-4352 -- add an exclusion list in the StandardWriteFilter (#2046)
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison authored Nov 14, 2024
1 parent 3a8990d commit 5a3a7d2
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {


private final Set<String> includeFields;
private final Set<String> excludeFields;

private Map<String, Integer> fieldSizes = new HashMap<>();

Expand All @@ -125,19 +126,22 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
* @param maxEstimatedSize
* @param includeFields if null or empty, all fields are included; otherwise, which fields
* to add to the metadata object.
* @param excludeFields these fields will not be included (unless they're in {@link StandardWriteFilter#ALWAYS_SET_FIELDS})
* @param includeEmpty if <code>true</code>, this will set or add an empty value to the
* metadata object.
*/
protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
        int maxValuesPerField,
        Set<String> includeFields,
        Set<String> excludeFields,
        boolean includeEmpty) {

    this.maxKeySize = maxKeySize;
    this.maxFieldSize = maxFieldSize;
    this.maxTotalEstimatedSize = maxEstimatedSize;
    this.maxValuesPerField = maxValuesPerField;
    // The documented contract allows a null includeFields ("if null or empty, all
    // fields are included"), but includeField() calls isEmpty()/contains() on both
    // sets without a null check. Normalize null to an empty set here so that
    // callers passing null keep working instead of hitting a NullPointerException.
    this.includeFields = includeFields == null
            ? java.util.Collections.<String>emptySet() : includeFields;
    this.excludeFields = excludeFields == null
            ? java.util.Collections.<String>emptySet() : excludeFields;
    this.includeEmpty = includeEmpty;
}

Expand Down Expand Up @@ -176,6 +180,7 @@ public void set(String field, String value, Map<String, String[]> data) {
setAlwaysInclude(field, value, data);
return;
}

StringSizePair filterKey = filterKey(field, value, data);
setFilterKey(filterKey, value, data);
}
Expand Down Expand Up @@ -433,11 +438,10 @@ private boolean includeField(String name) {
if (ALWAYS_SET_FIELDS.contains(name)) {
return true;
}
if (includeFields == null ||
includeFields.contains(name)) {
return true;
if (excludeFields.contains(name)) {
return false;
}
return false;
return includeFields.isEmpty() || includeFields.contains(name);
}

private static int estimateSize(String s) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;

private Set<String> includeFields = null;
private Set<String> includeFields = Collections.EMPTY_SET;
private Set<String> excludeFields = Collections.EMPTY_SET;
private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
Expand All @@ -55,7 +56,8 @@ public MetadataWriteFilter newInstance() {
}

return new StandardWriteFilter(maxKeySize, maxFieldSize,
maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty);
maxTotalEstimatedBytes, maxValuesPerField, includeFields,
excludeFields, includeEmpty);
}

public void setIncludeFields(List<String> includeFields) {
Expand All @@ -64,6 +66,12 @@ public void setIncludeFields(List<String> includeFields) {
this.includeFields = Collections.unmodifiableSet(keys);
}

/**
 * Replaces the current exclusion list with an unmodifiable snapshot of the
 * given field names. Later changes to the supplied list do not affect the
 * stored set.
 *
 * @param excludeFields names of the metadata fields to exclude; must not be
 *                      null and must not contain null elements
 */
public void setExcludeFields(List<String> excludeFields) {
    final int expectedSize = excludeFields.size();
    Set<String> excluded = ConcurrentHashMap.newKeySet(expectedSize);
    for (String fieldName : excludeFields) {
        excluded.add(fieldName);
    }
    this.excludeFields = Collections.unmodifiableSet(excluded);
}

public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) {
this.maxTotalEstimatedBytes = maxTotalEstimatedBytes;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Set;

Expand Down Expand Up @@ -116,7 +117,7 @@ public void testMetadataFactoryFieldsConfig() throws Exception {
@Test
public void testKeySizeFilter() throws Exception {
Metadata metadata = filter(10, 1000, 10000, 100,
null, true);
Collections.EMPTY_SET, Collections.EMPTY_SET, true);
//test that must add keys are not truncated
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
Expand All @@ -138,13 +139,13 @@ public void testAfterMaxHit() throws Exception {
String k = "dc:creator";//20 bytes
//key is > maxTotalBytes, so the value isn't even added
Metadata metadata = filter(100, 10000, 10,
100, null, false);
100, Collections.EMPTY_SET, Collections.EMPTY_SET, false);
metadata.set(k, "ab");
assertEquals(1, metadata.names().length);
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));

metadata = filter(100, 10000, 50, 100,
null, false);
Collections.EMPTY_SET, Collections.EMPTY_SET, false);
for (int i = 0; i < 10; i++) {
metadata.set(k, "abcde");
}
Expand Down Expand Up @@ -178,7 +179,8 @@ public void testAfterMaxHit() throws Exception {
@Test
public void testMinSizeForAlwaysInclude() throws Exception {
//test that mimes don't get truncated
Metadata metadata = filter(100, 10, 10000, 100, null, true);
Metadata metadata = filter(100, 10, 10000, 100,
Collections.EMPTY_SET, Collections.EMPTY_SET, true);

String mime = getLongestMime().toString();
metadata.set(Metadata.CONTENT_TYPE, mime);
Expand All @@ -192,21 +194,47 @@ public void testMinSizeForAlwaysInclude() throws Exception {

@Test
public void testMaxFieldValues() throws Exception {
Metadata metadata = filter(100, 10000, 10000, 3, null, true);
Metadata metadata = filter(100, 10000, 10000, 3,
Collections.EMPTY_SET, Collections.EMPTY_SET, true);
for (int i = 0; i < 10; i++) {
metadata.add(TikaCoreProperties.SUBJECT, "ab");
}
assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length);
}

/**
 * Parses a mock document that adds the metadata fields "dc:creator",
 * "subject" and "subjectB" using a parser built from the
 * "TIKA-3695-exclude.xml" config, and verifies that "subject" is absent from
 * the resulting metadata while the other two fields are kept.
 */
@Test
public void testExclude() throws Exception {
    TikaConfig tikaConfig;
    // Close the classpath stream once the config has been built; the original
    // code left it open for the garbage collector to reclaim.
    try (InputStream configStream =
                 TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
            "<mock>";
    mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
    mock += "<metadata action=\"add\" name=\"subject\">01234567890123456789</metadata>";
    mock += "<metadata action=\"add\" name=\"subjectB\">01234567890123456789</metadata>";
    mock += "<write element=\"p\" times=\"1\"> hello </write>\n";
    mock += "</mock>";
    Metadata metadata = new Metadata();
    List<Metadata> metadataList =
            getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
                    parser, metadata, new ParseContext(), true);
    assertEquals(1, metadataList.size());
    metadata = metadataList.get(0);
    // Total number of keys on the parsed document; presumably this includes
    // keys added by the parser itself in addition to the two kept mock fields.
    assertEquals(9, metadata.names().length);
    assertEquals("01234567890123456789", metadata.get("dc:creator"));
    // Exclusion is by exact field name, so "subjectB" must survive...
    assertEquals("01234567890123456789", metadata.get("subjectB"));
    // ...while "subject" itself must have been filtered out.
    assertNull(metadata.get("subject"));
}


private void assertTruncated(Metadata metadata) {
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
}
private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes,
int maxValuesPerField,
Set<String> includeFields, boolean includeEmpty) {
Set<String> includeFields, Set<String> excludeFields, boolean includeEmpty) {
MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize,
maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
maxTotalBytes, maxValuesPerField, includeFields, excludeFields, includeEmpty);
Metadata metadata = new Metadata();
metadata.setMetadataWriteFilter(filter);
return metadata;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
Test configuration: uses the DefaultParser and configures the
StandardWriteFilterFactory with an excludeFields list containing the single
field name "subject".
-->
<properties>
<parsers>
<parser class="org.apache.tika.parser.DefaultParser"/>
</parsers>
<autoDetectParserConfig>
<params>
<spoolToDisk>12345</spoolToDisk>
<outputThreshold>6789</outputThreshold>
</params>
<metadataWriteFilterFactory class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
<params>
<excludeFields>
<field>subject</field>
</excludeFields>
</params>
</metadataWriteFilterFactory>
</autoDetectParserConfig>
</properties>

0 comments on commit 5a3a7d2

Please sign in to comment.