Skip to content

Commit

Permalink
brackets around whole phrase and add override
Browse files Browse the repository at this point in the history
scores, no scores, onebest excerpt

clean and test

ln for score and fix skippedword return

one best eps fix

remove google.sets from excerpt test

fix brackets around all phrases in excerpts
  • Loading branch information
austin007008 committed Jun 25, 2024
1 parent 246e09d commit 592611a
Show file tree
Hide file tree
Showing 7 changed files with 408 additions and 85 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
private static final String BEFORE = "BEFORE";
private static final String AFTER = "AFTER";
private static final String BOTH = "BOTH";
private static final String XXXWESKIPPEDAWORDXXX = "XXXWESKIPPEDAWORDXXX";

@Override
public IteratorOptions describeOptions() {
Expand Down Expand Up @@ -283,6 +284,10 @@ public void next() throws IOException {
WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];
boolean stopFound;

if (dtUid == null) {
return;
}

// while we have term frequencies for the same document
while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
top = source.getTopKey();
Expand Down Expand Up @@ -316,7 +321,8 @@ public void next() throws IOException {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
}
if (stopFound && !trim) {
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX"), cv, ts);
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + XXXWESKIPPEDAWORDXXX + Constants.NULL
+ XXXWESKIPPEDAWORDXXX + Constants.NULL + XXXWESKIPPEDAWORDXXX), cv, ts);
tv = new Value();
return;
}
Expand All @@ -331,7 +337,37 @@ public void next() throws IOException {
source.next();
}
// generate the return key and value
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
String phraseWithScores = generatePhrase(terms);
boolean usedScores = false;
// check to see if we have something scored and if so, turn off outputting the scores
for (WordsAndScores term : terms) {
if (term == null) {
continue;
}
if (term.getUseScores()) {
usedScores = true;
term.setOutputScores(false);
}
}
String phraseWithoutScores = generatePhrase(terms);
String oneBestExcerpt;
// if not scored, we won't output anything for these two parts
if (!usedScores && startOffset < endOffset && !phraseWithScores.isEmpty()) {
phraseWithScores = "XXXNOTSCOREDXXX";
oneBestExcerpt = "XXXNOTSCOREDXXX";
} else {
for (WordsAndScores term : terms) {
if (term == null) {
continue;
}
term.setOneBestExcerpt(true);
}
oneBestExcerpt = generatePhrase(terms);
}

tk = new Key(row, new Text(dtUid),
new Text(fieldName + Constants.NULL + phraseWithScores + Constants.NULL + phraseWithoutScores + Constants.NULL + oneBestExcerpt), cv,
ts);
tv = new Value();
}

Expand All @@ -343,6 +379,7 @@ public void next() throws IOException {
* @return the phrase
*/
protected String generatePhrase(WordsAndScores[] terms) {
checkForHitPhrase(terms);
// create an array with the same length as the one we just passed in
String[] termsToOutput = new String[terms.length];
boolean bef = direction.equals(BEFORE);
Expand Down Expand Up @@ -437,6 +474,13 @@ protected String generatePhrase(WordsAndScores[] terms) {
}
}

/**
* Trim down both sides of the excerpt to the size that we want
*
* @param termsToOutput
* the terms to create a phrase from
* @return the trimmed array
*/
private String[] bothTrim(String[] termsToOutput) {
int expandedMid = (endOffset - startOffset) / 2;
int start = (int) (expandedMid - origHalfSize);
Expand Down Expand Up @@ -470,6 +514,87 @@ private String[] bothTrim(String[] termsToOutput) {
return termsToOutput;
}

/**
 * Looks for hit phrases (hit-list entries that contain a space, as opposed to single hit terms) and marks every offset that is part of a matched phrase
 * so that the whole phrase, rather than each individual word, ends up in brackets.
 * <p>
 * For every place a phrase is found in consecutive offsets, the matching {@link WordsAndScores} objects are given an override: the index of the phrase
 * word inside that offset's word list, plus a position marker (1 = beginning of phrase, 2 = middle of phrase, 3 = end of phrase).
 * <p>
 * Nothing is done when there is no hit list, when the list has no phrases, or when a phrase is almost as long as the whole excerpt.
 *
 * @param terms
 *            the terms to create a phrase from; modified in place via {@code setOverride}
 */
private void checkForHitPhrase(WordsAndScores[] terms) {
    // without a hit list there is nothing that could be a hit phrase
    if (hitTermsList == null) {
        return;
    }
    for (String hitTerm : hitTermsList) {
        // only entries containing a space are phrases; plain hit terms are bracketed elsewhere
        if (hitTerm == null || !hitTerm.contains(" ")) {
            continue;
        }
        // split the phrase on the spaces into the separate terms
        String[] individualHitTerms = hitTerm.split(" ");
        // split() drops trailing empty strings, so something like "word " yields a single token; treating that as a
        // phrase would mark it as "beginning of phrase" only and emit an opening bracket that is never closed
        if (individualHitTerms.length < 2) {
            continue;
        }
        // if the phrase is almost the same size as the whole excerpt, skip it
        if ((terms.length - 2) < individualHitTerms.length) {
            continue;
        }
        // last starting offset at which the whole phrase still fits inside the array
        int lastStart = terms.length - individualHitTerms.length;
        for (int j = 0; j <= lastStart; j++) {
            if (!isPhraseFound(individualHitTerms, terms, j)) {
                continue;
            }
            // the phrase starts at offset j: tag each offset with its position inside the phrase
            for (int k = 0; k < individualHitTerms.length; k++) {
                int overridePosition;
                if (k == 0) {
                    overridePosition = 1; // beginning of phrase
                } else if (k == individualHitTerms.length - 1) {
                    overridePosition = 3; // end of phrase
                } else {
                    overridePosition = 2; // middle of phrase
                }
                // override with the index of this phrase word inside the current offset's word list
                WordsAndScores current = terms[j + k];
                current.setOverride(current.getWordsList().indexOf(individualHitTerms[k]), overridePosition);
            }
        }
    }
}

/**
 * Check to see if the whole hit phrase is found in consecutive offsets starting at the passed in j value.
 * <p>
 * The phrase is considered found only when, for every position k of the phrase, the {@link WordsAndScores} at offset {@code j + k} exists and its word
 * list contains the k-th phrase term.
 *
 * @param individualHitTerms
 *            the array of the hit phrase split into individual terms
 * @param terms
 *            the terms to create a phrase from
 * @param j
 *            the current starting offset in the WordsAndScores array
 * @return true if every term of the phrase is present at its expected offset, false otherwise (including when the phrase would not fit inside
 *         {@code terms} starting at {@code j})
 */
private boolean isPhraseFound(String[] individualHitTerms, WordsAndScores[] terms, int j) {
    // a phrase that would run off either end of the array cannot be present
    if (j < 0 || j + individualHitTerms.length > terms.length) {
        return false;
    }
    // k represents what position we are in of the individual hit terms array
    for (int k = 0; k < individualHitTerms.length; k++) {
        WordsAndScores candidate = terms[j + k];
        // an empty offset means the phrase is broken at this position
        if (candidate == null) {
            return false;
        }
        // query the words list through its declared type; no downcast to a concrete list class is needed for contains()
        if (!candidate.getWordsList().contains(individualHitTerms[k])) {
            return false;
        }
    }
    // every position matched, so the whole phrase was found
    return true;
}

/**
* Determine if this dt and uid are in the accepted column families
*
Expand Down Expand Up @@ -557,7 +682,7 @@ private String getDtUidFromTfKey(Key tfKey) {
* @return the start or end document (cq) for our tf scan range. Null if dt,uid does not exist in the event key
*/
private String getDtUidFromEventKey(Key eventKey, boolean startKey, boolean inclusive) {
// if an infinite end range, or unspecified end document, then no cdocument to specify
// if an infinite end range, or unspecified end document, then no document to specify
if (eventKey == null || eventKey.getColumnFamily() == null || eventKey.getColumnFamily().getLength() == 0) {
return null;
}
Expand Down Expand Up @@ -620,7 +745,6 @@ public void setTrim(boolean trim) {

@Override
public String toString() {
    // iterator name followed by the configured field name and the start/end offsets, comma separated
    StringBuilder description = new StringBuilder("TermFrequencyExcerptIterator: ");
    description.append(this.fieldName);
    description.append(", ").append(this.startOffset);
    description.append(", ").append(this.endOffset);
    return description.toString();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import org.apache.log4j.Logger;

import datawave.ingest.protobuf.TermWeightPosition;

/**
* An object used to save terms and their respective scores (if a score exists and is valid).
*/
Expand All @@ -26,8 +28,15 @@ public class WordsAndScores {
/** the size of the words and scores arrays */
private int arrSize;

/** the index of the override to output */
private int overrideIndex;
/** 1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase, anything else for none of those. */
private int overrideValue;

private boolean useScores;
private boolean hasHitTerm;
private boolean outputScores;
private boolean oneBestExcerpt;

private static final String OPENP = "(";
private static final String CLOSEP = ")";
Expand All @@ -51,6 +60,10 @@ public WordsAndScores() {
arrSize = 0;
useScores = false;
hasHitTerm = false;
overrideIndex = -1;
overrideValue = -1;
outputScores = false;
oneBestExcerpt = false;
}

public void reset() {
Expand All @@ -62,6 +75,10 @@ public void reset() {
arrSize = 0;
useScores = false;
hasHitTerm = false;
overrideIndex = -1;
overrideValue = -1;
outputScores = false;
oneBestExcerpt = false;
}

/**
Expand Down Expand Up @@ -109,11 +126,13 @@ public boolean addTerm(String word, int score, List<String> hitTermsList) {
} else { // if we do not already have a valid smallestScore...
smallestScoreIndex = arrSize - 1; // set this index as the smallest score
useScores = true;
outputScores = true;
}
} else { // if this is the first word/score we are adding, set index 0 (the only one) as the longest/smallest
longestWordIndex = 0;
smallestScoreIndex = 0;
useScores = true;
outputScores = true;
if ((hitTermsList != null) && hitTermsList.contains(word)) {
hitTermIndex = 0;
hasHitTerm = true;
Expand Down Expand Up @@ -164,15 +183,53 @@ public boolean addTerm(String word, List<String> hitTermsList) {
*/
public String getWordToOutput() {
if (smallestScoreIndex == -1 && longestWordIndex == -1) { // if we try and get the word from an object with nothing added to it (should never happen)...
log.warn("Trying to get token to output when none have been added: Will output \"reportmetodatawave\".");
return "reportmetodatawave";
log.warn("Trying to get token to output when none have been added: Will output \"REPORTMETODATAWAVE\".");
return "REPORTMETODATAWAVE";
} else {
if (oneBestExcerpt) {
if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
return null;
}
if (hitTermIndex == smallestScoreIndex) {
return OPENB + words.get(hitTermIndex) + CLOSEB;
} else {
return words.get(smallestScoreIndex);
}
}
if (overrideIndex >= 0 && overrideIndex < arrSize) {
if (STOP_WORD_LIST.contains(words.get(overrideIndex))) { // if the hit term is on the stop list for some reason...
return null;
}
switch (overrideValue) {
case 1:
if (useScores && (scores.get(overrideIndex) != -1)) {
return OPENB + words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return OPENB + words.get(overrideIndex);
}
case 3:
if (useScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP + CLOSEB;
} else {
return words.get(overrideIndex) + CLOSEB;
}
case 2:
if (useScores && (scores.get(overrideIndex) != -1)) {
return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
} else {
return words.get(overrideIndex);
}
default:
log.warn("Invalid override value: Will output \"REPORTMETODATAWAVE\".");
return "REPORTMETODATAWAVE";
}
}
if (hasHitTerm) { // if we have a hit term...
if (STOP_WORD_LIST.contains(words.get(hitTermIndex))) { // if the hit term is on the stop list for some reason...
return null;
}
if (useScores && (scores.get(hitTermIndex) != -1)) { // if we have a valid score for the hit term...
return OPENB + words.get(hitTermIndex) + CLOSEB + OPENP + userReadable(scores.get(hitTermIndex)) + CLOSEP;
return OPENB + words.get(hitTermIndex) + OPENP + userReadable(scores.get(hitTermIndex)) + CLOSEP + CLOSEB;
} else {
return OPENB + words.get(hitTermIndex) + CLOSEB;
}
Expand All @@ -181,7 +238,12 @@ public String getWordToOutput() {
if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
return null;
}
return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest score
if (outputScores) {
return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest
// score
} else {
return words.get(smallestScoreIndex);
}
} else { // default to returning the longest word if the scores don't exist/aren't valid
if (STOP_WORD_LIST.contains(words.get(longestWordIndex))) { // if the selected term is in the stop list...
return null;
Expand All @@ -192,18 +254,11 @@ public String getWordToOutput() {
}

/**
* Converts the score into a number from 0-1000 (higher is better) so that it is easier for the user to understand.
* Converts the score into a number from 0-100 (higher is better) so that it is easier for the user to understand.
*/
private String userReadable(int score) {
int x = Math.max(1000 - (score / 50000), 0);
if (x == 0) {
return String.valueOf(0);
} else if (x == 1000) {
return String.valueOf(1);
} else {
String y = String.format("%.3f", (float) x / 1000);
return y.substring(1);
}
private int userReadable(int score) {
// the original probability got put through ln(x) so we do e^x to put it back to the original probability
return (int) Math.round((Math.exp(TermWeightPosition.termWeightScoreToPositionScore(score))) * 100);
}

/**
Expand Down Expand Up @@ -266,6 +321,19 @@ public void setWordsAndScoresList(List<String> words, List<Integer> scores, List
}
}

/**
* Sets the override used by {@link #getWordToOutput()} to force a specific word to be output as part of a hit phrase.
* <p>
* The override is only applied by {@link #getWordToOutput()} when the index is at least 0 and less than the number of words added; otherwise it is ignored.
*
* @param overrideIndex
*            the index in the words list of the word to output
* @param overrideValue
*            1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase
*/
public void setOverride(int overrideIndex, int overrideValue) {
this.overrideIndex = overrideIndex;
this.overrideValue = overrideValue;
}

/**
* Sets whether the score is appended (in parentheses) when {@link #getWordToOutput()} returns the word with the smallest score.
*
* @param outputScores
*            true to output the score with the word, false to output the word alone
*/
public void setOutputScores(boolean outputScores) {
this.outputScores = outputScores;
}

/**
* Sets whether {@link #getWordToOutput()} should produce the "one best" output: the word with the smallest score, without its score, in brackets if it is
* the hit term.
*
* @param oneBestExcerpt
*            true to enable the one-best output, false for the normal output
*/
public void setOneBestExcerpt(boolean oneBestExcerpt) {
this.oneBestExcerpt = oneBestExcerpt;
}

/**
* Returns a boolean that is true if there is a valid score in the scores list and false otherwise.
*
Expand Down
Loading

0 comments on commit 592611a

Please sign in to comment.