Skip to content

Commit

Permalink
Update hit term extractor to use scores when considering hit terms
Browse files Browse the repository at this point in the history
  • Loading branch information
austin007008 authored and drewfarris committed Mar 13, 2024
1 parent b087ac9 commit 7327e3d
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@

import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
Expand Down Expand Up @@ -66,6 +63,9 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
// the top value
protected Value tv;

private int leftSkippedWords;
private int rightSkippedWords;

@Override
public IteratorOptions describeOptions() {
IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(),
Expand Down Expand Up @@ -135,6 +135,8 @@ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> op
this.startOffset = Integer.parseInt(options.get(START_OFFSET));
this.endOffset = Integer.parseInt(options.get(END_OFFSET));
this.fieldName = options.get(FIELD_NAME);
leftSkippedWords = 0;
rightSkippedWords = 0;
}

@Override
Expand Down Expand Up @@ -264,7 +266,7 @@ public void next() throws IOException {
Text cv = top.getColumnVisibility();
long ts = top.getTimestamp();
Text row = top.getRow();
List<String>[] terms = new List[endOffset - startOffset];
WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];

// while we have term frequencies for the same document
while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
Expand All @@ -278,6 +280,7 @@ public void next() throws IOException {
try {
// parse the offsets from the value
TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get());
boolean useScores = info.getScoreCount() == info.getTermOffsetCount();

// for each offset, gather all the terms in our range
for (int i = 0; i < info.getTermOffsetCount(); i++) {
Expand All @@ -288,10 +291,14 @@ public void next() throws IOException {
int index = offset - startOffset;
// if the value is larger than the value for this offset thus far
if (terms[index] == null) {
terms[index] = new ArrayList<>();
terms[index] = new WordsAndScores();
}
// use this value
terms[index].add(fieldAndValue[1]);
if (useScores) {
terms[index].addTerm(fieldAndValue[1], info.getScore(i));
} else {
terms[index].addTerm(fieldAndValue[1]);
}
}
}
} catch (InvalidProtocolBufferException e) {
Expand All @@ -315,28 +322,25 @@ public void next() throws IOException {
* the terms to create a phrase from
* @return the phrase
*/
protected String generatePhrase(List<String>[] terms) {
String[] largestTerms = new String[terms.length];
protected String generatePhrase(WordsAndScores[] terms) {
String[] termsToOutput = new String[terms.length];
for (int i = 0; i < terms.length; i++) {
largestTerms[i] = getLongestTerm(terms[i]);
}

return joiner.join(largestTerms);
}

/**
* Get the longest term from a list of terms;
*
* @param terms
* the terms to create a phrase
* @return the longest term (null if empty or null list)
*/
protected String getLongestTerm(List<String> terms) {
if (terms == null || terms.isEmpty()) {
return null;
} else {
return terms.stream().max(Comparator.comparingInt(String::length)).get();
if (terms[i] == null) {
termsToOutput[i] = null;
} else {
if (WordsAndScores.STOP_WORD_LIST.contains(terms[i].getWordToOutput())) {
if (i <= (terms.length / 2)) {
leftSkippedWords++;
} else {
rightSkippedWords++;
}
termsToOutput[i] = null;
} else {
termsToOutput[i] = terms[i].getWordToOutput();
}
}
}
return joiner.join(termsToOutput);
}

/**
Expand Down Expand Up @@ -471,17 +475,18 @@ private String getDtUid(String str) {
}
}

/**
 * Reports how many stop words have been dropped from the left side of generated phrases.
 * <p>
 * The counter is reset to zero by {@code init} and incremented by {@code generatePhrase} for every stop word found at a position {@code i} where
 * {@code i <= terms.length / 2}; it is not reset between phrases, so it accumulates until the next {@code init}.
 *
 * @return the number of left-side stop words skipped since the last {@code init}
 */
public int getLeftSkippedWords() {
    return this.leftSkippedWords;
}

/**
 * Reports how many stop words have been dropped from the right side of generated phrases.
 * <p>
 * The counter is reset to zero by {@code init} and incremented by {@code generatePhrase} for every stop word found at a position {@code i} where
 * {@code i > terms.length / 2}; it is not reset between phrases, so it accumulates until the next {@code init}.
 *
 * @return the number of right-side stop words skipped since the last {@code init}
 */
public int getRightSkippedWords() {
    return this.rightSkippedWords;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("TermFrequencyExcerptIterator: ");
sb.append(this.fieldName);
sb.append(", ");
sb.append(this.startOffset);
sb.append(", ");
sb.append(this.endOffset);

return sb.toString();

return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package datawave.query.iterator.logic;

import java.util.ArrayList;
import java.util.Set;

/**
 * Collects the candidate words seen at a single term offset, together with an optional score for each, and picks the one word that should be output.
 * <p>
 * Selection rules:
 * <ul>
 * <li>If at least one word was added with a non-negative score, the word with the smallest such score wins; ties are broken in favour of the longer word,
 * and then in favour of the word added first.</li>
 * <li>Otherwise the longest word wins, with ties going to the word added first.</li>
 * </ul>
 * NOTE(review): "smallest score wins" mirrors the comparison direction of the previous implementation; presumably scores are cost-like (lower is better) —
 * confirm against the producer of the scores.
 * <p>
 * This class is not synchronized.
 */
public class WordsAndScores {
    /** Words that callers are expected to leave out of generated phrases; currently only the {@code <eps>} marker. */
    public static final Set<String> STOP_WORD_LIST = Set.of("<eps>");

    /** Marks an index that has not been assigned yet. */
    private static final int NO_INDEX = -1;
    /** Score recorded for words that were added without one. */
    private static final int NO_SCORE = -1;

    private final ArrayList<String> words;
    private final ArrayList<Integer> scores;

    /** Index of the longest word added so far, or {@link #NO_INDEX} when empty. */
    private int longestWordIndex;
    /** Index of the word holding the best (smallest non-negative) score, or {@link #NO_INDEX} when no usable score was added. */
    private int bestScoreIndex;

    /** Creates an empty holder with no words and no scores. */
    public WordsAndScores() {
        words = new ArrayList<>();
        scores = new ArrayList<>();
        longestWordIndex = NO_INDEX;
        bestScoreIndex = NO_INDEX;
    }

    /**
     * Adds a word along with its score.
     * <p>
     * Negative scores are stored but treated as "no score" for selection purposes, exactly like words added through {@link #addTerm(String)}.
     *
     * @param word
     *            the word to add
     * @param score
     *            the score of the word; smaller non-negative values are preferred
     */
    public void addTerm(String word, int score) {
        final int index = append(word, score);
        if (score < 0) {
            return;
        }
        if (bestScoreIndex == NO_INDEX) {
            bestScoreIndex = index;
            return;
        }
        // Compare against the best score seen so far, not merely the previously added one, so the result does not depend on insertion order.
        final int bestScore = scores.get(bestScoreIndex);
        final boolean strictlyBetter = score < bestScore;
        final boolean longerOnTie = score == bestScore && word.length() > words.get(bestScoreIndex).length();
        if (strictlyBetter || longerOnTie) {
            bestScoreIndex = index;
        }
    }

    /**
     * Adds a word that has no score.
     *
     * @param word
     *            the word to add
     */
    public void addTerm(String word) {
        append(word, NO_SCORE);
    }

    /**
     * Returns the word that should represent this offset.
     *
     * @return the best-scored word when any usable score exists, otherwise the longest word
     * @throws IndexOutOfBoundsException
     *             if no word has been added yet
     */
    public String getWordToOutput() {
        if (words.isEmpty()) {
            throw new IndexOutOfBoundsException("No terms have been added");
        }
        return words.get(bestScoreIndex != NO_INDEX ? bestScoreIndex : longestWordIndex);
    }

    /**
     * Stores the word and score and keeps the longest-word index up to date.
     *
     * @return the index at which the word was stored
     */
    private int append(String word, int score) {
        final int index = words.size();
        words.add(word);
        scores.add(score);
        // Track the longest word overall; a later short word must not displace an earlier, longer one.
        if (longestWordIndex == NO_INDEX || word.length() > words.get(longestWordIndex).length()) {
            longestWordIndex = index;
        }
        return index;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ public class ExcerptTransform extends DocumentTransform.DefaultDocumentTransform
public static final String HIT_EXCERPT = "HIT_EXCERPT";

private final Map<String,String> excerptIteratorOptions = new HashMap<>();
private final SortedKeyValueIterator<Key,Value> excerptIterator;
private final TermFrequencyExcerptIterator excerptIterator;
private final ExcerptFields excerptFields;
private final IteratorEnvironment env;
private final SortedKeyValueIterator<Key,Value> source;
Expand All @@ -66,7 +66,7 @@ public ExcerptTransform(ExcerptFields excerptFields, IteratorEnvironment env, So
this.excerptFields = excerptFields;
this.env = env;
this.source = source;
this.excerptIterator = excerptIterator;
this.excerptIterator = (TermFrequencyExcerptIterator) excerptIterator;
}

@Nullable
Expand Down Expand Up @@ -314,29 +314,62 @@ private Set<Excerpt> getExcerpts(PhraseIndexes phraseIndexes) {
* @return the excerpt
*/
private String getExcerpt(String field, int start, int end, Range range, ArrayList<String> hitTermValues) {
int prevLeftWordsSkipped = 0;
int currLeftWordsSkipped = 0;
boolean leftLock = false;
int prevRightWordsSkipped = 0;
int currRightWordsSkipped = 0;
boolean rightLock = false;
int timesToTry = 100;
excerptIteratorOptions.put(TermFrequencyExcerptIterator.FIELD_NAME, field);
excerptIteratorOptions.put(TermFrequencyExcerptIterator.START_OFFSET, String.valueOf(start));
excerptIteratorOptions.put(TermFrequencyExcerptIterator.END_OFFSET, String.valueOf(end));
try {
excerptIterator.init(source, excerptIteratorOptions, env);
excerptIterator.seek(range, Collections.emptyList(), false);
if (excerptIterator.hasTop()) {
Key topKey = excerptIterator.getTopKey();
String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL);
// The column qualifier is expected to be field\0phrase.
if (parts.length == 2) {
return getHitPhrase(hitTermValues, parts);
for (int i = 0; i <= timesToTry; i++) {
if (!leftLock) {
prevLeftWordsSkipped = currLeftWordsSkipped;
excerptIteratorOptions.put(TermFrequencyExcerptIterator.START_OFFSET, String.valueOf(start - prevLeftWordsSkipped));
}
if (!rightLock) {
prevRightWordsSkipped = currRightWordsSkipped;
excerptIteratorOptions.put(TermFrequencyExcerptIterator.END_OFFSET, String.valueOf(end + prevRightWordsSkipped));
}
try {
excerptIterator.init(source, excerptIteratorOptions, env);
excerptIterator.seek(range, Collections.emptyList(), false);
if (excerptIterator.hasTop()) {
if (!leftLock) {
currLeftWordsSkipped = excerptIterator.getLeftSkippedWords();
if (currLeftWordsSkipped == prevLeftWordsSkipped) {
leftLock = true;
}
}
if (!rightLock) {
currRightWordsSkipped = excerptIterator.getRightSkippedWords();
if (currRightWordsSkipped == prevRightWordsSkipped) {
rightLock = true;
}
}
if (i == timesToTry || (leftLock && rightLock)) {
Key topKey = excerptIterator.getTopKey();
String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL);
// The column qualifier is expected to be field\0phrase.
if (parts.length == 2) {
return getHitPhrase(hitTermValues, parts);
} else {
log.warn(TermFrequencyExcerptIterator.class.getSimpleName()
+ " returned top key with incorrectly-formatted column qualifier in key: " + topKey + " when scanning for excerpt ["
+ (start - prevLeftWordsSkipped) + "," + (end + prevRightWordsSkipped) + "] for field " + field + " within range "
+ range);
return null;
}
}
} else {
log.warn(TermFrequencyExcerptIterator.class.getSimpleName() + " returned top key with incorrectly-formatted column qualifier in key: "
+ topKey + " when scanning for excerpt [" + start + "," + end + "] for field " + field + " within range " + range);
return null;
}
} else {
return null;
} catch (IOException e) {
throw new RuntimeException("Failed to scan for excerpt [" + (start - prevLeftWordsSkipped) + "," + (end + prevRightWordsSkipped)
+ "] for field " + field + " within range " + range, e);
}
} catch (IOException e) {
throw new RuntimeException("Failed to scan for excerpt [" + start + "," + end + "] for field " + field + " within range " + range, e);
}
throw new RuntimeException("This should never be reached. Something went wrong!");
}

private static String getHitPhrase(ArrayList<String> hitTermValues, String[] phraseParts) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ public class TermFrequencyExcerptIteratorTest extends EasyMockSupport {
private final TermFrequencyExcerptIterator iterator = new TermFrequencyExcerptIterator();

@BeforeClass
public static void beforeClass() throws Exception {
public static void beforeClass() {
givenData("email", "123.456.789", "BODY", "the quick brown fox jumped over the lazy dog ");
givenData("email", "123.456.789", "CONTENT", "there is no greater divide in fandoms than that between star wars and star trek fans");
givenData("scan", "987.654.321", "TITLE", "document scan 12345");
givenData("scan", "987.654.321", "CONTENT", "we've been trying to reach you about your car warranty");
givenData("email", "111.222.333", "BODY", "the coldest tale <eps> ever told");
givenData("email", "111.222.333", "CONTENT", "somewhere far along <eps> the street they <eps> lost their soul <eps> to a person so mean");
}

private static void givenData(String datatype, String uid, String fieldName, String phrase) {
Expand Down Expand Up @@ -211,4 +213,41 @@ private void initIterator() throws IOException {
// noinspection unchecked
iterator.init(new SortedListKeyValueIterator(source), options, env);
}

/**
 * Verifies that a stop-list word inside the requested window is left out of the returned phrase: scanning the BODY field of document 111.222.333 with
 * offsets 1 to 5 must yield "coldest tale ever", without the {@code <eps>} token present in the stored text.
 */
@Test
public void testMatchFoundWithRemovedStoplistWord() throws IOException {
    givenOptions("BODY", 1, 5);
    initIterator();

    // datatype\0uid column family identifying the document under test
    final String documentId = "email" + Constants.NULL + "111.222.333";
    final Key firstKey = new Key(row, new Text(documentId));
    final Key endKey = firstKey.followingKey(PartialKey.ROW_COLFAM);

    iterator.seek(new Range(firstKey, true, endKey, false), Collections.emptyList(), false);

    assertTrue(iterator.hasTop());

    final Key result = iterator.getTopKey();
    assertEquals(row, result.getRow());
    assertEquals(new Text(documentId), result.getColumnFamily());
    assertEquals(new Text("BODY" + Constants.NULL + "coldest tale ever"), result.getColumnQualifier());
}

/**
 * Verifies that stop-list words are dropped when the requested window (-10 to 21) is wider than the stored CONTENT phrase of document 111.222.333: the
 * whole phrase must come back with every {@code <eps>} token removed.
 */
@Test
public void testMatchFoundWithStoplistWordAndOutOfBoundsRange() throws IOException {
    givenOptions("CONTENT", -10, 21);
    initIterator();

    // datatype\0uid column family identifying the document under test
    final String documentId = "email" + Constants.NULL + "111.222.333";
    final Key firstKey = new Key(row, new Text(documentId));
    final Key endKey = firstKey.followingKey(PartialKey.ROW_COLFAM);

    iterator.seek(new Range(firstKey, true, endKey, false), Collections.emptyList(), false);

    assertTrue(iterator.hasTop());

    final Key result = iterator.getTopKey();
    assertEquals(row, result.getRow());
    assertEquals(new Text(documentId), result.getColumnFamily());

    final Text expectedQualifier = new Text("CONTENT" + Constants.NULL + "somewhere far along the street they lost their soul to a person so mean");
    assertEquals(expectedQualifier, result.getColumnQualifier());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,8 @@ private void givenMatchingPhrase(String field, int start, int end, String phrase
if (phrase != null) {
expect(iterator.hasTop()).andReturn(true);
Key key = new Key(new Text("row"), new Text("cf"), new Text(field + Constants.NULL + phrase));
expect(iterator.getLeftSkippedWords()).andReturn(0);
expect(iterator.getRightSkippedWords()).andReturn(0);
expect(iterator.getTopKey()).andReturn(key);
} else {
expect(iterator.hasTop()).andReturn(false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ public enum WhatKindaRange {
protected static final String shard = date + "_0";
protected static final ColumnVisibility columnVisibility = new ColumnVisibility("ALL");
protected static final Value emptyValue = new Value(new byte[0]);
protected static final long timeStamp = 1356998400000l;
protected static final long timeStamp = 1356998400000L;

public static final String corleoneUID = UID.builder().newId("Corleone".getBytes(), (Date) null).toString();
public static final String corleoneChildUID = UID.builder().newId("Corleone".getBytes(), (Date) null, "1").toString();
public static final String sopranoUID = UID.builder().newId("Soprano".toString().getBytes(), (Date) null).toString();
public static final String caponeUID = UID.builder().newId("Capone".toString().getBytes(), (Date) null).toString();
public static final String sopranoUID = UID.builder().newId("Soprano".getBytes(), (Date) null).toString();
public static final String caponeUID = UID.builder().newId("Capone".getBytes(), (Date) null).toString();

protected static String normalizeColVal(Map.Entry<String,String> colVal) throws Exception {
if ("FROM_ADDRESS".equals(colVal.getKey()) || "TO_ADDRESS".equals(colVal.getKey())) {
Expand Down

0 comments on commit 7327e3d

Please sign in to comment.