Skip to content

Commit

Permalink
scores, no scores, onebest excerpt
Browse files Browse the repository at this point in the history
clean and test

ln for score and fix skippedword return

one best eps fix

remove google.sets from excerpt test
  • Loading branch information
austin007008 committed Jun 24, 2024
1 parent ab96d5f commit 8da2efd
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 109 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
private static final String BEFORE = "BEFORE";
private static final String AFTER = "AFTER";
private static final String BOTH = "BOTH";
private static final String XXXWESKIPPEDAWORDXXX = "XXXWESKIPPEDAWORDXXX";

@Override
public IteratorOptions describeOptions() {
Expand Down Expand Up @@ -316,7 +317,8 @@ public void next() throws IOException {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
}
if (stopFound && !trim) {
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX"), cv, ts);
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + XXXWESKIPPEDAWORDXXX + Constants.NULL
+ XXXWESKIPPEDAWORDXXX + Constants.NULL + XXXWESKIPPEDAWORDXXX), cv, ts);
tv = new Value();
return;
}
Expand All @@ -331,7 +333,37 @@ public void next() throws IOException {
source.next();
}
// generate the return key and value
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
String phraseWithScores = generatePhrase(terms);
boolean usedScores = false;
// check to see if we have something scored and if so, turn off outputting the scores
for (WordsAndScores term : terms) {
if (term == null) {
continue;
}
if (term.getUseScores()) {
usedScores = true;
term.setOutputScores(false);
}
}
String phraseWithoutScores = generatePhrase(terms);
String oneBestExcerpt;
// if not scored, we won't output anything for these two parts
if (!usedScores && startOffset < endOffset && !phraseWithScores.isEmpty()) {
phraseWithScores = "XXXNOTSCOREDXXX";
oneBestExcerpt = "XXXNOTSCOREDXXX";
} else {
for (WordsAndScores term : terms) {
if (term == null) {
continue;
}
term.setOneBestExcerpt(true);
}
oneBestExcerpt = generatePhrase(terms);
}

tk = new Key(row, new Text(dtUid),
new Text(fieldName + Constants.NULL + phraseWithScores + Constants.NULL + phraseWithoutScores + Constants.NULL + oneBestExcerpt), cv,
ts);
tv = new Value();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import org.apache.log4j.Logger;

import datawave.ingest.protobuf.TermWeightPosition;

/**
* An object used to save terms and their respective scores (if a score exists and is valid).
*/
Expand All @@ -28,12 +30,13 @@ public class WordsAndScores {

/** the index of the override to output */
private int overrideIndex;
private boolean overrideIsBeginning;
private boolean overrideIsMiddle;
private boolean overrideIsEnd;
/** 1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase, anything else for none of those. */
private int overrideValue;

private boolean useScores;
private boolean hasHitTerm;
private boolean outputScores;
private boolean oneBestExcerpt;

private static final String OPENP = "(";
private static final String CLOSEP = ")";
Expand All @@ -58,9 +61,9 @@ public WordsAndScores() {
useScores = false;
hasHitTerm = false;
overrideIndex = -1;
overrideIsBeginning = false;
overrideIsMiddle = false;
overrideIsEnd = false;
overrideValue = -1;
outputScores = false;
oneBestExcerpt = false;
}

public void reset() {
Expand All @@ -73,9 +76,9 @@ public void reset() {
useScores = false;
hasHitTerm = false;
overrideIndex = -1;
overrideIsBeginning = false;
overrideIsMiddle = false;
overrideIsEnd = false;
overrideValue = -1;
outputScores = false;
oneBestExcerpt = false;
}

/**
Expand Down Expand Up @@ -123,11 +126,13 @@ public boolean addTerm(String word, int score, List<String> hitTermsList) {
} else { // if we do not already have a valid smallestScore...
smallestScoreIndex = arrSize - 1; // set this index as the smallest score
useScores = true;
outputScores = true;
}
} else { // if this is the first word/score we are adding, set index 0 (the only one) as the longest/smallest
longestWordIndex = 0;
smallestScoreIndex = 0;
useScores = true;
outputScores = true;
if ((hitTermsList != null) && hitTermsList.contains(word)) {
hitTermIndex = 0;
hasHitTerm = true;
Expand Down Expand Up @@ -181,33 +186,43 @@ public String getWordToOutput() {
log.warn("Trying to get token to output when none have been added: Will output \"REPORTMETODATAWAVE\".");
return "REPORTMETODATAWAVE";
} else {
if (oneBestExcerpt) {
if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
return null;
}
if (hitTermIndex == smallestScoreIndex) {
return OPENB + words.get(hitTermIndex) + CLOSEB;
} else {
return words.get(smallestScoreIndex);
}
}
if (overrideIndex >= 0 && overrideIndex < arrSize) {
if (STOP_WORD_LIST.contains(words.get(overrideIndex))) { // if the hit term is on the stop list for some reason...
return null;
}
if (overrideIsBeginning) {
if (useScores && (scores.get(overrideIndex) != -1)) {
return OPENB + words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return OPENB + words.get(overrideIndex);
}
} else if (overrideIsEnd) {
if (useScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP + CLOSEB;
} else {
return words.get(overrideIndex) + CLOSEB;
}
} else if (overrideIsMiddle) {
if (useScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return words.get(overrideIndex);
}
} else {
log.warn("Invalid override value: Will output \"REPORTMETODATAWAVE\".");
return "REPORTMETODATAWAVE";
switch (overrideValue) {
case 1:
if (useScores && (scores.get(overrideIndex) != -1)) {
return OPENB + words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return OPENB + words.get(overrideIndex);
}
case 3:
if (useScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP + CLOSEB;
} else {
return words.get(overrideIndex) + CLOSEB;
}
case 2:
if (useScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return words.get(overrideIndex);
}
default:
log.warn("Invalid override value: Will output \"REPORTMETODATAWAVE\".");
return "REPORTMETODATAWAVE";
}

}
if (hasHitTerm) { // if we have a hit term...
if (STOP_WORD_LIST.contains(words.get(hitTermIndex))) { // if the hit term is on the stop list for some reason...
Expand All @@ -223,7 +238,12 @@ public String getWordToOutput() {
if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
return null;
}
return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest score
if (outputScores) {
return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest
// score
} else {
return words.get(smallestScoreIndex);
}
} else { // default to returning the longest word if the scores don't exist/aren't valid
if (STOP_WORD_LIST.contains(words.get(longestWordIndex))) { // if the selected term is in the stop list...
return null;
Expand All @@ -234,18 +254,11 @@ public String getWordToOutput() {
}

/**
* Converts the score into a number from 0-1000 (higher is better) so that it is easier for the user to understand.
* Converts the score into a number from 0-100 (higher is better) so that it is easier for the user to understand.
*/
private String userReadable(int score) {
int x = Math.max(1000 - (score / 50000), 0);
if (x == 0) {
return String.valueOf(0);
} else if (x == 1000) {
return String.valueOf(1);
} else {
String y = String.format("%.3f", (float) x / 1000);
return y.substring(1);
}
private int userReadable(int score) {
// the original probability got put through ln(x) so we do e^x to put it back to the original probability
return (int) Math.round((Math.exp(TermWeightPosition.termWeightScoreToPositionScore(score))) * 100);
}

/**
Expand Down Expand Up @@ -308,16 +321,17 @@ public void setWordsAndScoresList(List<String> words, List<Integer> scores, List
}
}

public void setOverride(int overrideIndex, int overridePosition) {
public void setOverride(int overrideIndex, int overrideValue) {
this.overrideIndex = overrideIndex;
// 1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase, anything else for none of those.
if (overridePosition == 1) {
overrideIsBeginning = true;
} else if (overridePosition == 2) {
overrideIsMiddle = true;
} else if (overridePosition == 3) {
overrideIsEnd = true;
}
this.overrideValue = overrideValue;
}

/**
 * Sets the flag that controls whether a score is appended to the selected word.
 * <p>
 * In the visible portion of {@code getWordToOutput()}, the smallest-score branch returns the word followed by its user-readable score in parentheses when
 * this flag is true, and the bare word when it is false. The override branches shown in this file consult {@code useScores} rather than this flag.
 *
 * @param outputScores
 *            the new value of the {@code outputScores} field
 */
public void setOutputScores(boolean outputScores) {
this.outputScores = outputScores;
}

/**
 * Sets the flag that switches {@code getWordToOutput()} into "one best" mode.
 * <p>
 * When the flag is true, {@code getWordToOutput()} looks only at the word stored at {@code smallestScoreIndex}: it returns null if that word is in the stop
 * word list, the word wrapped in the hit-term brackets if it is also the hit term, and the plain word otherwise. No score is appended in this mode.
 *
 * @param oneBestExcerpt
 *            the new value of the {@code oneBestExcerpt} field
 */
public void setOneBestExcerpt(boolean oneBestExcerpt) {
this.oneBestExcerpt = oneBestExcerpt;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ public class ExcerptTransform extends DocumentTransform.DefaultDocumentTransform

public static final String PHRASE_INDEXES_ATTRIBUTE = "PHRASE_INDEXES_ATTRIBUTE";
public static final String HIT_EXCERPT = "HIT_EXCERPT";
public static final String HIT_EXCERPT_WITH_SCORES = "HIT_EXCERPT_WITH_SCORES";
public static final String HIT_EXCERPT_ONE_BEST = "HIT_EXCERPT_ONE_BEST";

private final Map<String,String> excerptIteratorOptions = new HashMap<>();
private final TermFrequencyExcerptIterator excerptIterator;
Expand Down Expand Up @@ -213,12 +215,27 @@ private ValueTuple attributeToHitTuple(Attribute<?> source) {
* the document
*/
private void addExcerptsToDocument(Set<Excerpt> excerpts, Document document) {
Attributes attributes = new Attributes(true);
Attributes attributesWithoutScores = new Attributes(true);
Attributes attributesWithScores = new Attributes(true);
Attributes attributesOneBest = new Attributes(true);
boolean hasScores = false;
for (Excerpt excerpt : excerpts) {
Content content = new Content(excerpt.getExcerpt(), excerpt.getSource(), true);
attributes.add(content);
Content contentWithoutScores = new Content(excerpt.getExcerptWithoutScores(), excerpt.getSource(), true);
attributesWithoutScores.add(contentWithoutScores);
String excerptWithScores = excerpt.getExcerptWithScores();
if (!excerptWithScores.equals("XXXNOTSCOREDXXX") && !excerptWithScores.isBlank()) {
Content contentWithScores = new Content(excerptWithScores, excerpt.getSource(), true);
attributesWithScores.add(contentWithScores);
hasScores = true;
Content contentOneBest = new Content(excerpt.getExcerptOneBest(), excerpt.getSource(), true);
attributesOneBest.add(contentOneBest);
}
}
document.put(HIT_EXCERPT, attributesWithoutScores);
if (hasScores) {
document.put(HIT_EXCERPT_WITH_SCORES, attributesWithScores);
document.put(HIT_EXCERPT_ONE_BEST, attributesOneBest);
}
document.put(HIT_EXCERPT, attributes);
}

/**
Expand Down Expand Up @@ -286,7 +303,8 @@ private Set<Excerpt> getExcerpts(PhraseIndexes phraseIndexes) {
String excerpt = getExcerpt(field, start, end, range, hitTermValues);
// Only retain non-blank excerpts.
if (excerpt != null && !excerpt.isEmpty()) {
excerpts.add(new Excerpt(startKey, excerpt));
String[] parts = excerpt.split(Constants.NULL);
excerpts.add(new Excerpt(startKey, parts[0], parts[1], parts[2]));
} else {
if (log.isTraceEnabled()) {
log.trace("Failed to find excerpt [" + start + "," + end + "] for field " + field + "for document " + eventId.replace('\u0000', '/'));
Expand Down Expand Up @@ -351,15 +369,15 @@ private String getExcerpt(String field, int start, int end, Range range, ArrayLi
Key topKey = excerptIterator.getTopKey();
String[] parts = topKey.getColumnQualifier().toString().split(Constants.NULL);
// The column qualifier is expected to be field\0phraseWithScores\0phraseWithoutScores\0oneBestExcerpt.
if (parts.length != 2) {
if (parts.length != 4) {
log.warn(TermFrequencyExcerptIterator.class.getSimpleName() + " returned top key with incorrectly-formatted column qualifier in key: "
+ topKey + " when scanning for excerpt [" + start + "," + end + "] for field " + field + " within range " + range);
return null;
}

// if we have reached the limit of times to try, or we have no stop words removed
if (i == 1 || !parts[1].equals("XXXWESKIPPEDAWORDXXX")) {
return parts[1];
return parts[1] + Constants.NULL + parts[2] + Constants.NULL + parts[3];
}
} else {
return null;
Expand Down Expand Up @@ -415,16 +433,28 @@ public Iterator<Entry<Key,Document>> getIterator(final Iterator<Entry<Key,Docume
* A class that holds the info for one excerpt.
*/
private static class Excerpt {
private final String excerpt;
private final String excerptWithScores;
private final String excerptWithoutScores;
private final String excerptOneBest;
private final Key source;

public Excerpt(Key source, String excerpt) {
public Excerpt(Key source, String excerptWithScores, String excerptWithoutScores, String excerptOneBest) {
this.source = source;
this.excerpt = excerpt;
this.excerptWithScores = excerptWithScores;
this.excerptWithoutScores = excerptWithoutScores;
this.excerptOneBest = excerptOneBest;
}

/**
 * Returns the scored form of the excerpt exactly as it was passed to the constructor.
 * <p>
 * Callers in this file compare the value against the {@code "XXXNOTSCOREDXXX"} placeholder and against blank before using it.
 *
 * @return the excerpt text with scores
 */
public String getExcerptWithScores() {
return excerptWithScores;
}

/**
 * Returns the unscored form of the excerpt exactly as it was passed to the constructor.
 *
 * @return the excerpt text without scores
 */
public String getExcerptWithoutScores() {
return excerptWithoutScores;
}

public String getExcerpt() {
return excerpt;
public String getExcerptOneBest() {
return excerptOneBest;
}

public Key getSource() {
Expand All @@ -438,12 +468,13 @@ public boolean equals(Object o) {
if (o == null || getClass() != o.getClass())
return false;
Excerpt excerpt1 = (Excerpt) o;
return excerpt.equals(excerpt1.excerpt) && source.equals(excerpt1.source);
return (excerptWithScores.equals(excerpt1.excerptWithScores) && source.equals(excerpt1.source))
&& (excerptWithoutScores.equals(excerpt1.excerptWithoutScores)) && (excerptOneBest.equals(excerpt1.excerptOneBest));
}

@Override
public int hashCode() {
return Objects.hash(excerpt, source);
return Objects.hash(excerptWithScores, excerptWithoutScores, excerptOneBest, source);
}
}

Expand Down
Loading

0 comments on commit 8da2efd

Please sign in to comment.