Skip to content

Commit

Permalink
Fix ScoredPhrase Bug (#2440)
Browse files Browse the repository at this point in the history
* Build out test suite for ContentFunctionsDescriptor, fix scored phrase bug

* update code comments with examples

---------

Co-authored-by: Ivan Bella <[email protected]>
Co-authored-by: Keith Ratcliffe <[email protected]>
  • Loading branch information
3 people committed Jul 9, 2024
1 parent 2d1451f commit f5ac4ef
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -300,36 +300,34 @@ public Set<String>[] fieldsAndTerms(Set<String> termFrequencyFields, Set<String>
}
}
} else if (CONTENT_SCORED_PHRASE_FUNCTION_NAME.equals(funcName)) {
JexlNode firstArg = args.next();

if (firstArg instanceof ASTNumberLiteral || firstArg instanceof ASTUnaryMinusNode) {
// firstArg is max score value, skip
firstArg = args.next();
}

// we override the zones if the first argument is a string
if (firstArg instanceof ASTStringLiteral) {
fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral());

if (args.peek() instanceof ASTNumberLiteral || args.peek() instanceof ASTUnaryMinusNode) {
args.next(); // max score not needed for fields and terms
}
JexlNode arg = args.next();

if (arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode) {
// if the first argument is a number, then no field exists
// for example, content:scoredPhrase(-1.5, termOffsetMap, 'value')
termOffsetMap = args.next();
} else {
JexlNode nextArg = args.peek();

// The zones may (more likely) be specified as an identifier
if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) {
if (oredFields != null && firstArg instanceof ASTAndNode) {
oredFields.setValue(false);
}

fields = JexlASTHelper.getIdentifierNames(firstArg);
termOffsetMap = args.next();
if (arg instanceof ASTIdentifier) {
// single field case
// for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value')
fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(arg)));
} else {
termOffsetMap = firstArg;
// multi field case
// for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value')
Set<String> identifiers = JexlASTHelper.getIdentifierNames(arg);
if (!identifiers.isEmpty()) {
fields = identifiers;

if (oredFields != null && arg instanceof ASTAndNode) {
oredFields.setValue(false);
}
}
}

// skip score because it is not needed when gathering just the fields and values from a function
args.next();

termOffsetMap = args.next();
}
} else if (CONTENT_WITHIN_FUNCTION_NAME.equals(funcName)) {
JexlNode arg = args.next();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package datawave.query.jexl.functions;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.fail;

import java.util.Set;

import org.apache.commons.jexl3.parser.ASTFunctionNode;
import org.apache.commons.jexl3.parser.ASTJexlScript;
import org.apache.commons.jexl3.parser.JexlNode;
import org.apache.commons.jexl3.parser.ParseException;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.junit.jupiter.api.Test;

import datawave.query.jexl.JexlASTHelper;
import datawave.query.jexl.functions.ContentFunctionsDescriptor.ContentJexlArgumentDescriptor;
import datawave.query.util.MockMetadataHelper;

class ContentFunctionsDescriptorTest {

private final String unfieldedPhrase = "content:phrase(termOffsetMap, 'foo', 'bar')";
private final String fieldedPhrase = "content:phrase(FIELD, termOffsetMap, 'foo', 'bar')";
private final String multiFieldedPhrase = "content:phrase((FIELD_A || FIELD_B), termOffsetMap, 'foo', 'bar')";

private final String unfieldedScoredPhrase = "content:scoredPhrase(-1.5, termOffsetMap, 'foo', 'bar')";
private final String fieldedScoredPhrase = "content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'foo', 'bar')";
private final String multiFieldedScoredPhrase = "content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'foo', 'bar')";

private final String unfieldedAdjacent = "content:adjacent(termOffsetMap, 'foo', 'bar')";
private final String fieldedAdjacent = "content:adjacent(FIELD, termOffsetMap, 'foo', 'bar')";
private final String multiFieldedAdjacent = "content:adjacent((FIELD_A || FIELD_B), termOffsetMap, 'foo', 'bar')";

private final String unfieldedWithin = "content:within(1, termOffsetMap, 'foo', 'bar')";
private final String fieldedWithin = "content:within(FIELD, 1, termOffsetMap, 'foo', 'bar')";
private final String multiFieldedWithin = "content:within((FIELD_A || FIELD_B), 1, termOffsetMap, 'foo', 'bar')";

private final ContentFunctionsDescriptor descriptor = new ContentFunctionsDescriptor();

@Test
void testGetHitTermValue() {
String expected = "foo bar";
assertHitTermValue(getDescriptor(unfieldedPhrase), expected);
assertHitTermValue(getDescriptor(fieldedPhrase), expected);
assertHitTermValue(getDescriptor(multiFieldedPhrase), expected);

assertHitTermValue(getDescriptor(unfieldedScoredPhrase), expected);
assertHitTermValue(getDescriptor(fieldedScoredPhrase), expected);
assertHitTermValue(getDescriptor(multiFieldedScoredPhrase), expected);

assertHitTermValue(getDescriptor(unfieldedAdjacent), expected);
assertHitTermValue(getDescriptor(fieldedAdjacent), expected);
assertHitTermValue(getDescriptor(multiFieldedAdjacent), expected);

assertHitTermValue(getDescriptor(unfieldedWithin), expected);
assertHitTermValue(getDescriptor(fieldedWithin), expected);
assertHitTermValue(getDescriptor(multiFieldedWithin), expected);
}

private void assertHitTermValue(ContentJexlArgumentDescriptor jexlDescriptor, String expected) {
assertEquals(expected, jexlDescriptor.getHitTermValue());
}

@Test
void testGetHitTermValues() {
Set<String> expectedHitTermValues = Set.of("foo", "bar");
assertHitTermValues(getDescriptor(unfieldedPhrase), expectedHitTermValues);
assertHitTermValues(getDescriptor(fieldedPhrase), expectedHitTermValues);
assertHitTermValues(getDescriptor(multiFieldedPhrase), expectedHitTermValues);

assertHitTermValues(getDescriptor(unfieldedScoredPhrase), expectedHitTermValues);
assertHitTermValues(getDescriptor(fieldedScoredPhrase), expectedHitTermValues);
assertHitTermValues(getDescriptor(multiFieldedScoredPhrase), expectedHitTermValues);

assertHitTermValues(getDescriptor(unfieldedAdjacent), expectedHitTermValues);
assertHitTermValues(getDescriptor(fieldedAdjacent), expectedHitTermValues);
assertHitTermValues(getDescriptor(multiFieldedAdjacent), expectedHitTermValues);

assertHitTermValues(getDescriptor(unfieldedWithin), expectedHitTermValues);
assertHitTermValues(getDescriptor(fieldedWithin), expectedHitTermValues);
assertHitTermValues(getDescriptor(multiFieldedWithin), expectedHitTermValues);
}

private void assertHitTermValues(ContentJexlArgumentDescriptor jexlDescriptor, Set<String> expected) {
assertEquals(expected, jexlDescriptor.getHitTermValues());
}

@Test
@SuppressWarnings("unchecked")
void testFieldsAndTerms() {
assertFieldsAndTerms(getDescriptor(unfieldedPhrase), new Set[] {Set.of(), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(fieldedPhrase), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(multiFieldedPhrase), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")});

assertFieldsAndTerms(getDescriptor(unfieldedScoredPhrase), new Set[] {Set.of(), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(fieldedScoredPhrase), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(multiFieldedScoredPhrase), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")});

assertFieldsAndTerms(getDescriptor(unfieldedAdjacent), new Set[] {Set.of(), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(fieldedAdjacent), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(multiFieldedAdjacent), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")});

assertFieldsAndTerms(getDescriptor(unfieldedWithin), new Set[] {Set.of(), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(fieldedWithin), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")});
assertFieldsAndTerms(getDescriptor(multiFieldedWithin), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")});
}

private void assertFieldsAndTerms(ContentJexlArgumentDescriptor jexlDescriptor, Set<String>[] expected) {
Set<String>[] fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true));
assertArrayEquals(expected, fieldsAndTerms);

fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true), false);
assertArrayEquals(expected, fieldsAndTerms);
}

@Test
void testFieldSets() {
assertFieldSets(getDescriptor(unfieldedPhrase), Set.of(Set.of("FIELD"), Set.of("FIELD_A"), Set.of("FIELD_B")));
assertFieldSets(getDescriptor(fieldedPhrase), Set.of(Set.of("FIELD")));
assertFieldSets(getDescriptor(multiFieldedPhrase), Set.of(Set.of("FIELD_A"), Set.of("FIELD_B")));

assertFieldSets(getDescriptor(unfieldedScoredPhrase), Set.of(Set.of("FIELD"), Set.of("FIELD_A"), Set.of("FIELD_B")));
assertFieldSets(getDescriptor(fieldedScoredPhrase), Set.of(Set.of("FIELD")));
assertFieldSets(getDescriptor(multiFieldedScoredPhrase), Set.of(Set.of("FIELD_A"), Set.of("FIELD_B")));

assertFieldSets(getDescriptor(unfieldedAdjacent), Set.of(Set.of("FIELD"), Set.of("FIELD_A"), Set.of("FIELD_B")));
assertFieldSets(getDescriptor(fieldedAdjacent), Set.of(Set.of("FIELD")));
assertFieldSets(getDescriptor(multiFieldedAdjacent), Set.of(Set.of("FIELD_A"), Set.of("FIELD_B")));

assertFieldSets(getDescriptor(unfieldedWithin), Set.of(Set.of("FIELD"), Set.of("FIELD_A"), Set.of("FIELD_B")));
assertFieldSets(getDescriptor(fieldedWithin), Set.of(Set.of("FIELD")));
assertFieldSets(getDescriptor(multiFieldedWithin), Set.of(Set.of("FIELD_A"), Set.of("FIELD_B")));

}

private void assertFieldSets(ContentJexlArgumentDescriptor jexlDescriptor, Set<Set<String>> expected) {
MockMetadataHelper helper = new MockMetadataHelper();
helper.addTermFrequencyFields(Set.of("FIELD", "FIELD_A", "FIELD_B"));
helper.setIndexedFields(Set.of("FIELD", "FIELD_A", "FIELD_B"));

Set<Set<String>> fieldSet = jexlDescriptor.fieldSets(helper, Set.of());
assertEquals(expected, fieldSet);
}

@Test
void testFields() {
assertFields(getDescriptor(unfieldedPhrase), Set.of("FIELD", "FIELD_A", "FIELD_B"));
assertFields(getDescriptor(fieldedPhrase), Set.of("FIELD"));
assertFields(getDescriptor(multiFieldedPhrase), Set.of("FIELD_A", "FIELD_B"));

assertFields(getDescriptor(unfieldedScoredPhrase), Set.of("FIELD", "FIELD_A", "FIELD_B"));
assertFields(getDescriptor(fieldedScoredPhrase), Set.of("FIELD"));
assertFields(getDescriptor(multiFieldedScoredPhrase), Set.of("FIELD_A", "FIELD_B"));

assertFields(getDescriptor(unfieldedAdjacent), Set.of("FIELD", "FIELD_A", "FIELD_B"));
assertFields(getDescriptor(fieldedAdjacent), Set.of("FIELD"));
assertFields(getDescriptor(multiFieldedAdjacent), Set.of("FIELD_A", "FIELD_B"));

assertFields(getDescriptor(unfieldedWithin), Set.of("FIELD", "FIELD_A", "FIELD_B"));
assertFields(getDescriptor(fieldedWithin), Set.of("FIELD"));
assertFields(getDescriptor(multiFieldedWithin), Set.of("FIELD_A", "FIELD_B"));
}

private void assertFields(ContentJexlArgumentDescriptor jexlDescriptor, Set<String> expected) {
MockMetadataHelper helper = new MockMetadataHelper();
helper.addTermFrequencyFields(Set.of("FIELD", "FIELD_A", "FIELD_B"));
helper.setIndexedFields(Set.of("FIELD", "FIELD_A", "FIELD_B"));

Set<String> fieldSet = jexlDescriptor.fields(helper, Set.of());
assertEquals(expected, fieldSet);
}

private ContentJexlArgumentDescriptor getDescriptor(String query) {
return descriptor.getArgumentDescriptor(getFunction(query));
}

private ASTFunctionNode getFunction(String query) {
ASTJexlScript script = getQueryTree(query);
JexlNode node = script.jjtGetChild(0);
if (node instanceof ASTFunctionNode) {
return (ASTFunctionNode) node;
}
fail("Node was not an ASTFunctionNode");
throw new RuntimeException("Node was not an ASTFunctionNode");
}

private ASTJexlScript getQueryTree(String query) {
try {
return JexlASTHelper.parseAndFlattenJexlQuery(query);
} catch (ParseException e) {
fail("Failed to parse query: " + query);
throw new RuntimeException(e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,7 @@ public void expandContentScoredPhraseFunctionIntoMultipleFields() throws ParseEx
runTest(original, expected);
}

// scored phrase does not support 'scoredPhrase(Iterable, ...)'
@Test(expected = IllegalArgumentException.class)
@Test
public void expandContentScoredPhraseFunctionIntoMultipleFields_exception() throws ParseException {
Set<String> fields = Sets.newHashSet("FOO", "BAR");
Set<String> tfFields = Sets.newHashSet("FOO", "BAR");
Expand Down

0 comments on commit f5ac4ef

Please sign in to comment.