Skip to content

Commit

Permalink
Extract all fields from filter:includeRegex and filter:excludeRegex … (
Browse files Browse the repository at this point in the history
…#2450)

* Extract all fields from filter:includeRegex and filter:excludeRegex when an index-only field is present
Correct RegexFunctionVisitor incorrectly calling index-only fields non-event fields
Correctly apply De Morgans law with ANDed exclusions

* comments

---------

Co-authored-by: hlgp <[email protected]>
Co-authored-by: hgklohr <[email protected]>
  • Loading branch information
3 people committed Jul 3, 2024
1 parent 905c389 commit 5040fd8
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 80 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.util.List;
import java.util.Set;

import org.apache.commons.jexl3.parser.ASTAndNode;
import org.apache.commons.jexl3.parser.ASTFunctionNode;
import org.apache.commons.jexl3.parser.ASTIdentifier;
import org.apache.commons.jexl3.parser.ASTOrNode;
Expand All @@ -21,8 +20,6 @@
import datawave.query.jexl.JexlASTHelper;
import datawave.query.jexl.JexlNodeFactory;
import datawave.query.jexl.functions.FunctionJexlNodeVisitor;
import datawave.query.language.functions.jexl.Jexl;
import datawave.query.language.tree.FunctionNode;
import datawave.query.parser.JavaRegexAnalyzer;
import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException;
import datawave.query.util.MetadataHelper;
Expand All @@ -41,16 +38,16 @@ public class RegexFunctionVisitor extends FunctionIndexQueryExpansionVisitor {

private static final Logger log = ThreadConfigurableLogger.getLogger(RegexFunctionVisitor.class);

protected Set<String> nonEventFields;
protected Set<String> indexOnlyFields;

public RegexFunctionVisitor(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> nonEventFields) {
public RegexFunctionVisitor(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> indexOnlyFields) {
super(config, metadataHelper, null);
this.nonEventFields = nonEventFields;
this.indexOnlyFields = indexOnlyFields;
}

@SuppressWarnings("unchecked")
public static <T extends JexlNode> T expandRegex(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> nonEventFields, T script) {
RegexFunctionVisitor visitor = new RegexFunctionVisitor(config, metadataHelper, nonEventFields);
public static <T extends JexlNode> T expandRegex(ShardQueryConfiguration config, MetadataHelper metadataHelper, Set<String> indexOnlyFields, T script) {
RegexFunctionVisitor visitor = new RegexFunctionVisitor(config, metadataHelper, indexOnlyFields);
JexlNode root = (T) script.jjtAccept(visitor, null);
root = TreeFlatteningRebuildingVisitor.flatten(root);
return (T) root;
Expand All @@ -67,66 +64,59 @@ public Object visit(ASTFunctionNode node, Object data) {

List<ASTIdentifier> identifiers = JexlASTHelper.getIdentifiers(arguments.get(0));
List<JexlNode> extractChildren = new ArrayList<>(identifiers.size());
List<JexlNode> keepChildren = new ArrayList<>(identifiers.size());

boolean extractFields = false;
for (ASTIdentifier identifier : identifiers) {
JexlNode regexNode = buildRegexNode(identifier, functionMetadata.name(), JexlNodes.getIdentifierOrLiteralAsString(arguments.get(1)));
if (regexNode != null) {
extractChildren.add(regexNode);
} else {
keepChildren.add(JexlNodeFactory.buildIdentifier(identifier.getName()));
String field = JexlASTHelper.deconstructIdentifier(identifier.getName());
// if the function contains any index-only fields, extract them all from the function
if (indexOnlyFields.contains(field.toUpperCase())) {
extractFields = true;
break;
}

}

if (extractChildren.size() == 0) {
// nothing to rewrite
if (!extractFields) {
return returnNode;
} else if (keepChildren.size() == 0) {
// rewrite all nodes
if (identifiers.size() == 1) {
// we've already rewritten our one node
returnNode = extractChildren.get(0);
} else {
if (functionMetadata.name().equals(INCLUDE_REGEX) && arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createOrNode(extractChildren);
} else {
// build an AND node because of how DeMorgan's law works with expanding negations
returnNode = JexlNodeFactory.createAndNode(extractChildren);
}
}
} else {
// construct each and put it all together
JexlNode extractNode;
List<JexlNode> joint = new ArrayList<>();
List<JexlNode> newArgs = new ArrayList<>();

if (functionMetadata.name().equals(INCLUDE_REGEX) && arguments.get(0) instanceof ASTOrNode) {
newArgs.add(JexlNodeFactory.createOrNode(keepChildren));
extractNode = JexlNodeFactory.createOrNode(extractChildren);
} else {
newArgs.add(JexlNodeFactory.createAndNode(keepChildren));
extractNode = JexlNodeFactory.createAndNode(extractChildren);
}

newArgs.add(arguments.get(1));
JexlNode newFunc = FunctionJexlNodeVisitor.makeFunctionFrom(functionMetadata.namespace(), functionMetadata.name(),
newArgs.toArray(new JexlNode[0]));

joint.add(extractNode);
joint.add(newFunc);
for (ASTIdentifier identifier : identifiers) {
JexlNode regexNode = buildRegexNode(identifier, functionMetadata.name(), JexlNodes.getIdentifierOrLiteralAsString(arguments.get(1)));
if (regexNode != null) {
extractChildren.add(regexNode);
}
}

if (functionMetadata.name().equals(INCLUDE_REGEX) && arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createOrNode(joint);
if (extractChildren.size() == 0) {
// nothing to rewrite
return returnNode;
} else {
returnNode = JexlNodeFactory.createAndNode(joint);
// rewrite all nodes
if (identifiers.size() == 1) {
// we've already rewritten our one node
returnNode = extractChildren.get(0);
} else {
if (functionMetadata.name().equals(INCLUDE_REGEX)) {
if (arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createOrNode(extractChildren);
} else {
returnNode = JexlNodeFactory.createAndNode(extractChildren);
}
} else {
// Follow DeMorgan's law when expanding negations
if (arguments.get(0) instanceof ASTOrNode) {
returnNode = JexlNodeFactory.createAndNode(extractChildren);
} else {
returnNode = JexlNodeFactory.createOrNode(extractChildren);
}
}
}
}
}
}

if (log.isTraceEnabled()) {
log.trace("Rewrote \"" + JexlStringBuildingVisitor.buildQueryWithoutParse(node) + "\" into \""
+ JexlStringBuildingVisitor.buildQueryWithoutParse(returnNode) + "\"");
if (log.isTraceEnabled()) {
log.trace("Rewrote \"" + JexlStringBuildingVisitor.buildQueryWithoutParse(node) + "\" into \""
+ JexlStringBuildingVisitor.buildQueryWithoutParse(returnNode) + "\"");
}
}
return returnNode;
}
Expand All @@ -144,21 +134,20 @@ public Object visit(ASTFunctionNode node, Object data) {
*/
private JexlNode buildRegexNode(ASTIdentifier identifier, String functionName, String regex) {
String field = JexlASTHelper.deconstructIdentifier(identifier.getName());
if (nonEventFields.contains(field.toUpperCase())) {
try {
JavaRegexAnalyzer jra = new JavaRegexAnalyzer(regex);
if (!jra.isNgram()) {
if (functionName.equals(INCLUDE_REGEX)) {
return JexlNodeFactory.buildERNode(field, regex);
} else {
return JexlNodeFactory.buildNRNode(field, regex);
}
try {
JavaRegexAnalyzer jra = new JavaRegexAnalyzer(regex);
if (!jra.isNgram()) {
if (functionName.equals(INCLUDE_REGEX)) {
return JexlNodeFactory.buildERNode(field, regex);
} else {
return JexlNodeFactory.buildNRNode(field, regex);
}
} catch (JavaRegexParseException e) {
QueryException qe = new QueryException(DatawaveErrorCode.INVALID_REGEX);
throw new DatawaveFatalQueryException(qe);
}
} catch (JavaRegexParseException e) {
QueryException qe = new QueryException(DatawaveErrorCode.INVALID_REGEX);
throw new DatawaveFatalQueryException(qe);
}

return null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,22 @@ public void testFilterFunctionsInvalidatedByIndexOnlyFields() throws ParseExcept
}
}

@Test
public void testMultiFieldInclude() throws Exception {
eventQueryLogic.setParser(new LuceneToJexlQueryParser());
Map<String,String> extraParameters = new HashMap<>();
// @formatter:off
String[] queryStrings = {
"UUID:SOPRANO AND #INCLUDE(LOCATION || POSIZIONE || NAME, 'newjersey')"
};
@SuppressWarnings("unchecked")
List<String>[] expectedLists = new List[] {Collections.singletonList("SOPRANO")};
for (int i = 0; i < queryStrings.length; i++) {
runTestQuery(expectedLists[i], queryStrings[i], format.parse("20091231"), format.parse("20150101"), extraParameters);
}

}

@Test
public void testDelayedExceededValueThresholdRegexTFField() throws Exception {
Map<String,String> extraParameters = new HashMap();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,59 +70,59 @@ public void testRewriteMultiFieldedIncludeRegex() throws ParseException {
public void testRewriteMultiHeteroIndexOnlyFieldsIncludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDB");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDB =~ 'ba.*' || filter:includeRegex(FIELDA || FIELDC, 'ba.*'))";
String expected = "FOO == 'bar' && (FIELDA =~ 'ba.*' || FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDB =~ 'ba.*' || filter:includeRegex(FIELDA || FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA =~ 'ba.*' || FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testRewriteMultiHeteroIndexOnlyFieldsIncludeRegex2() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDB", "FIELDC");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || filter:includeRegex(FIELDA, 'ba.*'))";
String expected = "FOO == 'bar' && (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || FIELDA =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:includeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || filter:includeRegex(FIELDA, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDB =~ 'ba.*' || FIELDC =~ 'ba.*' || FIELDA =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testRewriteHeteroIndexOnlyFieldsExcludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA", "FIELDB");
String query = "FOO == 'bar' && filter:excludeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && FIELDC !~ 'ba.*'";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:excludeRegex(FIELDA||FIELDB||FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && FIELDC !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testANDIncludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA", "FIELDB");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && filter:includeRegex(FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && FIELDC =~ 'ba.*'";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:includeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && filter:includeRegex(FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA =~ 'ba.*' && FIELDB =~ 'ba.*' && FIELDC =~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

@Test
public void testANDExcludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA", "FIELDB");
String query = "FOO == 'bar' && filter:excludeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDA !~ 'ba.*' || FIELDB !~ 'ba.*' || FIELDC !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:excludeRegex(FIELDA&&FIELDB&&FIELDC, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*' && filter:excludeRegex(FIELDC, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' || FIELDB !~ 'ba.*' || FIELDC !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

Expand All @@ -142,11 +142,11 @@ public void testRewriteMultiFieldedExcludeRegex() throws ParseException {
public void testRewriteMultiHeteroFieldedExcludeRegex() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA");
String query = "FOO == 'bar' && filter:excludeRegex(FIELDA||FIELDB, 'ba.*')";
String expected = "FOO == 'bar' && (FIELDA !~ 'ba.*' && filter:excludeRegex(FIELDB, 'ba.*'))";
String expected = "FOO == 'bar' && (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);

query = "FOO == 'bar' || filter:excludeRegex(FIELDA||FIELDB, 'ba.*')";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && filter:excludeRegex(FIELDB, 'ba.*'))";
expected = "FOO == 'bar' || (FIELDA !~ 'ba.*' && FIELDB !~ 'ba.*')";
assertVisitorResult(query, expected, indexOnlyFields);
}

Expand All @@ -166,7 +166,7 @@ public void testEndWildCardNotIndexOnly() throws ParseException {
public void testMixedEventNonEvent() throws ParseException {
Set<String> indexOnlyFields = Sets.newHashSet("NON_EVENT_FIELD");
String query = "filter:includeRegex(EVENT_FIELD || NON_EVENT_FIELD,'all_.*?')";
String expected = " NON_EVENT_FIELD =~ 'all_.*?' || filter:includeRegex(EVENT_FIELD, 'all_.*?')";
String expected = "EVENT_FIELD =~ 'all_.*?' || NON_EVENT_FIELD =~ 'all_.*?'";
assertVisitorResult(query, expected, indexOnlyFields);
}

Expand All @@ -186,6 +186,15 @@ public void testDoubleEndedWildCard2() throws ParseException {
assertVisitorResult(query, query, indexOnlyFields);
}

@Test
public void testMultiFieldDoubleEndedWildCard() throws ParseException {
// we cannot extract the index only field because we don't support double ended wildcards against the index. Queries like this will fail in a later
// visitor
Set<String> indexOnlyFields = Sets.newHashSet("FIELDA");
String query = "FOO == 'bar' && filter:includeRegex(FIELDA||FIELDB,'.*all_.*')";
assertVisitorResult(query, query, indexOnlyFields);
}

@Test
public void testBadRegex() throws ParseException {
exception.expect(DatawaveFatalQueryException.class);
Expand Down

0 comments on commit 5040fd8

Please sign in to comment.