Skip to content

Commit

Permalink
scores, no scores, onebest excerpt
Browse files Browse the repository at this point in the history
  • Loading branch information
austin007008 committed May 22, 2024
1 parent d7f9cee commit b165000
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,8 @@ public void next() throws IOException {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
}
if (stopFound && !trim) {
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX"), cv, ts);
tk = new Key(row, new Text(dtUid),
new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX" + Constants.NULL + Constants.NULL), cv, ts);
tv = new Value();
return;
}
Expand All @@ -331,7 +332,35 @@ public void next() throws IOException {
source.next();
}
// generate the return key and value
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
String phraseWithScores = generatePhrase(terms);
boolean usedScores = false;
for (WordsAndScores term : terms) {
if (term == null) {
continue;
}
if (term.getUseScores()) {
usedScores = true;
term.setOutputScores(false);
}
}
String phraseWithoutScores = generatePhrase(terms);
String oneBestExcerpt;
if (!usedScores && startOffset < endOffset && !phraseWithScores.isEmpty()) {
phraseWithScores = "XXXNOTSCOREDXXX";
oneBestExcerpt = "XXXNOTSCOREDXXX";
} else {
for (WordsAndScores term : terms) {
if (term == null) {
continue;
}
term.setOneBestExcerpt(true);
}
oneBestExcerpt = generatePhrase(terms);
}

tk = new Key(row, new Text(dtUid),
new Text(fieldName + Constants.NULL + phraseWithScores + Constants.NULL + phraseWithoutScores + Constants.NULL + oneBestExcerpt), cv,
ts);
tv = new Value();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ public class WordsAndScores {

private boolean useScores;
private boolean hasHitTerm;
private boolean outputScores;
private boolean oneBestExcerpt;

private static final String OPENP = "(";
private static final String CLOSEP = ")";
Expand Down Expand Up @@ -61,6 +63,8 @@ public WordsAndScores() {
overrideIsBeginning = false;
overrideIsMiddle = false;
overrideIsEnd = false;
outputScores = false;
oneBestExcerpt = false;
}

public void reset() {
Expand All @@ -76,6 +80,8 @@ public void reset() {
overrideIsBeginning = false;
overrideIsMiddle = false;
overrideIsEnd = false;
outputScores = false;
oneBestExcerpt = false;
}

/**
Expand Down Expand Up @@ -123,11 +129,13 @@ public boolean addTerm(String word, int score, List<String> hitTermsList) {
} else { // if we do not already have a valid smallestScore...
smallestScoreIndex = arrSize - 1; // set this index as the smallest score
useScores = true;
outputScores = true;
}
} else { // if this is the first word/score we are adding, set index 0 (the only one) as the longest/smallest
longestWordIndex = 0;
smallestScoreIndex = 0;
useScores = true;
outputScores = true;
if ((hitTermsList != null) && hitTermsList.contains(word)) {
hitTermIndex = 0;
hasHitTerm = true;
Expand Down Expand Up @@ -181,24 +189,31 @@ public String getWordToOutput() {
log.warn("Trying to get token to output when none have been added: Will output \"REPORTMETODATAWAVE\".");
return "REPORTMETODATAWAVE";
} else {
if (oneBestExcerpt) {
if (hitTermIndex == smallestScoreIndex) {
return OPENB + words.get(hitTermIndex) + CLOSEB;
} else {
return words.get(smallestScoreIndex);
}
}
if (overrideIndex >= 0 && overrideIndex < arrSize) {
if (STOP_WORD_LIST.contains(words.get(overrideIndex))) { // if the hit term is on the stop list for some reason...
return null;
}
if (overrideIsBeginning) {
if (useScores && (scores.get(overrideIndex) != -1)) {
if (useScores && outputScores && (scores.get(overrideIndex) != -1)) {
return OPENB + words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return OPENB + words.get(overrideIndex);
}
} else if (overrideIsEnd) {
if (useScores && (scores.get(overrideIndex) != -1)) {
if (useScores && outputScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP + CLOSEB;
} else {
return words.get(overrideIndex) + CLOSEB;
}
} else if (overrideIsMiddle) {
if (useScores && (scores.get(overrideIndex) != -1)) {
if (useScores && outputScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return words.get(overrideIndex);
Expand All @@ -213,7 +228,7 @@ public String getWordToOutput() {
if (STOP_WORD_LIST.contains(words.get(hitTermIndex))) { // if the hit term is on the stop list for some reason...
return null;
}
if (useScores && (scores.get(hitTermIndex) != -1)) { // if we have a valid score for the hit term...
if (useScores && outputScores && (scores.get(hitTermIndex) != -1)) { // if we have a valid score for the hit term...
return OPENB + words.get(hitTermIndex) + OPENP + userReadable(scores.get(hitTermIndex)) + CLOSEP + CLOSEB;
} else {
return OPENB + words.get(hitTermIndex) + CLOSEB;
Expand All @@ -223,7 +238,12 @@ public String getWordToOutput() {
if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
return null;
}
return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest score
if (outputScores) {
return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest
// score
} else {
return words.get(smallestScoreIndex);
}
} else { // default to returning the longest word if the scores don't exist/aren't valid
if (STOP_WORD_LIST.contains(words.get(longestWordIndex))) { // if the selected term is in the stop list...
return null;
Expand Down Expand Up @@ -320,6 +340,14 @@ public void setOverride(int overrideIndex, int overridePosition) {
}
}

/**
 * Sets whether scores may be appended to the word returned by {@link #getWordToOutput()}.
 * <p>
 * The score-appending branches visible in {@code getWordToOutput()} all require this flag to be {@code true}; when it is {@code false} they fall through to
 * the variant that returns the word without its parenthesised score. {@code addTerm} turns the flag on wherever it turns on {@code useScores}, and the
 * constructor and {@code reset()} turn it off.
 *
 * @param outputScores
 *            {@code true} to allow scores in the output, {@code false} to suppress them
 */
public void setOutputScores(boolean outputScores) {
this.outputScores = outputScores;
}

/**
 * Sets whether {@link #getWordToOutput()} should produce its "one best" form.
 * <p>
 * When the flag is {@code true} and at least one word has been added, {@code getWordToOutput()} returns the word at {@code smallestScoreIndex} (wrapped in
 * {@code OPENB}/{@code CLOSEB} when that index equals {@code hitTermIndex}) before any of the override, stop-word or score-suffix handling is reached. The
 * constructor and {@code reset()} set the flag back to {@code false}.
 * <p>
 * NOTE(review): that branch reads {@code words.get(smallestScoreIndex)} without validating the index -- confirm {@code smallestScoreIndex} is always valid
 * for every term this flag is enabled on.
 *
 * @param oneBestExcerpt
 *            {@code true} to enable the one-best output, {@code false} for the normal output
 */
public void setOneBestExcerpt(boolean oneBestExcerpt) {
this.oneBestExcerpt = oneBestExcerpt;
}

/**
* Returns a boolean that is true if there is a valid score in the scores list and false otherwise.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ public class ExcerptTransform extends DocumentTransform.DefaultDocumentTransform

public static final String PHRASE_INDEXES_ATTRIBUTE = "PHRASE_INDEXES_ATTRIBUTE";
public static final String HIT_EXCERPT = "HIT_EXCERPT";
public static final String HIT_EXCERPT_WITH_SCORES = "HIT_EXCERPT_WITH_SCORES";
public static final String HIT_EXCERPT_ONE_BEST = "HIT_EXCERPT_ONE_BEST";

private final Map<String,String> excerptIteratorOptions = new HashMap<>();
private final TermFrequencyExcerptIterator excerptIterator;
Expand Down Expand Up @@ -213,12 +215,27 @@ private ValueTuple attributeToHitTuple(Attribute<?> source) {
* the document
*/
private void addExcerptsToDocument(Set<Excerpt> excerpts, Document document) {
Attributes attributes = new Attributes(true);
Attributes attributesWithoutScores = new Attributes(true);
Attributes attributesWithScores = new Attributes(true);
Attributes attributesOneBest = new Attributes(true);
boolean hasScores = false;
for (Excerpt excerpt : excerpts) {
Content content = new Content(excerpt.getExcerpt(), excerpt.getSource(), true);
attributes.add(content);
Content contentWithoutScores = new Content(excerpt.getExcerptWithoutScores(), excerpt.getSource(), true);
attributesWithoutScores.add(contentWithoutScores);
String excerptWithScores = excerpt.getExcerptWithScores();
if (!excerptWithScores.equals("XXXNOTSCOREDXXX") && !excerptWithScores.isBlank()) {
Content contentWithScores = new Content(excerptWithScores, excerpt.getSource(), true);
attributesWithScores.add(contentWithScores);
hasScores = true;
Content contentOneBest = new Content(excerpt.getExcerptOneBest(), excerpt.getSource(), true);
attributesOneBest.add(contentOneBest);
}
}
document.put(HIT_EXCERPT, attributesWithoutScores);
if (hasScores) {
document.put(HIT_EXCERPT_WITH_SCORES, attributesWithScores);
document.put(HIT_EXCERPT_ONE_BEST, attributesOneBest);
}
document.put(HIT_EXCERPT, attributes);
}

/**
Expand Down Expand Up @@ -286,7 +303,8 @@ private Set<Excerpt> getExcerpts(PhraseIndexes phraseIndexes) {
String excerpt = getExcerpt(field, start, end, range, hitTermValues);
// Only retain non-blank excerpts.
if (excerpt != null && !excerpt.isEmpty()) {
excerpts.add(new Excerpt(startKey, excerpt));
String[] parts = excerpt.split(Constants.NULL);
excerpts.add(new Excerpt(startKey, parts[0], parts[1], parts[2]));
} else {
if (log.isTraceEnabled()) {
log.trace("Failed to find excerpt [" + start + "," + end + "] for field " + field + "for document " + eventId.replace('\u0000', '/'));
Expand Down Expand Up @@ -351,15 +369,15 @@ private String getExcerpt(String field, int start, int end, Range range, ArrayLi
Key topKey = excerptIterator.getTopKey();
String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL);
// The column qualifier is expected to be field\0phraseWithScores\0phraseWithoutScores\0oneBestExcerpt (four NULL-separated parts).
if (parts.length != 2) {
if (parts.length != 4) {
log.warn(TermFrequencyExcerptIterator.class.getSimpleName() + " returned top key with incorrectly-formatted column qualifier in key: "
+ topKey + " when scanning for excerpt [" + start + "," + end + "] for field " + field + " within range " + range);
return null;
}

// if we have reached the limit of times to try, or we have no stop words removed
if (i == 1 || !parts[1].equals("XXXWESKIPPEDAWORDXXX")) {
return parts[1];
return parts[1] + Constants.NULL + parts[2] + Constants.NULL + parts[3];
}
} else {
return null;
Expand Down Expand Up @@ -415,16 +433,28 @@ public Iterator<Entry<Key,Document>> getIterator(final Iterator<Entry<Key,Docume
* A class that holds the info for one excerpt.
*/
private static class Excerpt {
private final String excerpt;
private final String excerptWithScores;
private final String excerptWithoutScores;
private final String excerptOneBest;
private final Key source;

public Excerpt(Key source, String excerpt) {
public Excerpt(Key source, String excerptWithScores, String excerptWithoutScores, String excerptOneBest) {
this.source = source;
this.excerpt = excerpt;
this.excerptWithScores = excerptWithScores;
this.excerptWithoutScores = excerptWithoutScores;
this.excerptOneBest = excerptOneBest;
}

public String getExcerptWithScores() {
return excerptWithScores;
}

public String getExcerptWithoutScores() {
return excerptWithoutScores;
}

public String getExcerpt() {
return excerpt;
public String getExcerptOneBest() {
return excerptOneBest;
}

public Key getSource() {
Expand All @@ -438,12 +468,13 @@ public boolean equals(Object o) {
if (o == null || getClass() != o.getClass())
return false;
Excerpt excerpt1 = (Excerpt) o;
return excerpt.equals(excerpt1.excerpt) && source.equals(excerpt1.source);
return (excerptWithScores.equals(excerpt1.excerptWithScores) && source.equals(excerpt1.source))
&& (excerptWithoutScores.equals(excerpt1.excerptWithoutScores)) && (excerptOneBest.equals(excerpt1.excerptOneBest));
}

@Override
public int hashCode() {
return Objects.hash(excerpt, source);
return Objects.hash(excerptWithScores, excerptWithoutScores, excerptOneBest, source);
}
}

Expand Down
Loading

0 comments on commit b165000

Please sign in to comment.