brackets around whole phrase and add override

scores, no scores, onebest excerpt; clean and test; ln for score and fix skippedword return; one best eps fix; remove google.sets from excerpt test; fix brackets around all phrases in excerpts
NationalSecurityAgency · Jun 25, 2024 · 592611a · 592611a
1 parent 246e09d
commit 592611a
Show file tree

Hide file tree

Showing 7 changed files with 408 additions and 85 deletions.
diff --git a/.../query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java b/.../query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java
@@ -76,6 +76,7 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
     private static final String BEFORE = "BEFORE";
     private static final String AFTER = "AFTER";
     private static final String BOTH = "BOTH";
+    private static final String XXXWESKIPPEDAWORDXXX = "XXXWESKIPPEDAWORDXXX";
 
     @Override
     public IteratorOptions describeOptions() {
@@ -283,6 +284,10 @@ public void next() throws IOException {
         WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];
         boolean stopFound;
 
+        if (dtUid == null) {
+            return;
+        }
+
         // while we have term frequencies for the same document
         while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
             top = source.getTopKey();
@@ -316,7 +321,8 @@ public void next() throws IOException {
                                 stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
                             }
                             if (stopFound && !trim) {
-                                tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX"), cv, ts);
+                                tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + XXXWESKIPPEDAWORDXXX + Constants.NULL
+                                                + XXXWESKIPPEDAWORDXXX + Constants.NULL + XXXWESKIPPEDAWORDXXX), cv, ts);
                                 tv = new Value();
                                 return;
                             }
@@ -331,7 +337,37 @@ public void next() throws IOException {
             source.next();
         }
         // generate the return key and value
-        tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
+        String phraseWithScores = generatePhrase(terms);
+        boolean usedScores = false;
+        // check to see if we have something scored and if so, turn off outputting the scores
+        for (WordsAndScores term : terms) {
+            if (term == null) {
+                continue;
+            }
+            if (term.getUseScores()) {
+                usedScores = true;
+                term.setOutputScores(false);
+            }
+        }
+        String phraseWithoutScores = generatePhrase(terms);
+        String oneBestExcerpt;
+        // if not scored, we won't output anything for these two parts
+        if (!usedScores && startOffset < endOffset && !phraseWithScores.isEmpty()) {
+            phraseWithScores = "XXXNOTSCOREDXXX";
+            oneBestExcerpt = "XXXNOTSCOREDXXX";
+        } else {
+            for (WordsAndScores term : terms) {
+                if (term == null) {
+                    continue;
+                }
+                term.setOneBestExcerpt(true);
+            }
+            oneBestExcerpt = generatePhrase(terms);
+        }
+
+        tk = new Key(row, new Text(dtUid),
+                        new Text(fieldName + Constants.NULL + phraseWithScores + Constants.NULL + phraseWithoutScores + Constants.NULL + oneBestExcerpt), cv,
+                        ts);
         tv = new Value();
     }
 
@@ -343,6 +379,7 @@ public void next() throws IOException {
      * @return the phrase
      */
     protected String generatePhrase(WordsAndScores[] terms) {
+        checkForHitPhrase(terms);
         // create an array with the same length as the one we just passed in
         String[] termsToOutput = new String[terms.length];
         boolean bef = direction.equals(BEFORE);
@@ -437,6 +474,13 @@ protected String generatePhrase(WordsAndScores[] terms) {
         }
     }
 
+    /**
+     * Trim down both sides of the excerpt to the size that we want
+     *
+     * @param termsToOutput
+     *            the terms to create a phrase from
+     * @return the trimmed array
+     */
     private String[] bothTrim(String[] termsToOutput) {
         int expandedMid = (endOffset - startOffset) / 2;
         int start = (int) (expandedMid - origHalfSize);
@@ -470,6 +514,87 @@ private String[] bothTrim(String[] termsToOutput) {
         return termsToOutput;
     }
 
+    /**
+     * Looks for hit phrases (multi-word entries in the hit terms list, as opposed to separate hit terms) and marks every offset that takes part
+     * in a match so that the whole phrase, rather than each word, ends up in brackets when it is output.
+     * <p>
+     * Each matching position receives an override made of the index of the matching word in that position's words list plus a position code: 1
+     * for the first word of the phrase, 2 for a middle word and 3 for the last word.
+     *
+     * @param terms
+     *            the terms to create a phrase from; individual entries may be null
+     */
+    private void checkForHitPhrase(WordsAndScores[] terms) {
+        // without a hit list there is nothing to mark
+        if (hitTermsList == null) {
+            return;
+        }
+        for (String hitPhrase : hitTermsList) {
+            // only phrases (anything in the hit list with a space in it) are handled here; single hit terms are bracketed elsewhere
+            if (hitPhrase == null || !hitPhrase.contains(" ")) {
+                continue;
+            }
+            // split the phrase on the spaces into the separate terms
+            String[] individualHitTerms = hitPhrase.split(" ");
+            // if the phrase is almost the same size as the whole excerpt, skip it
+            if ((terms.length - 2) < individualHitTerms.length) {
+                continue;
+            }
+            int lastTerm = individualHitTerms.length - 1;
+            // slide the phrase across the WordsAndScores until the end of the phrase reaches the last offset
+            for (int j = 0; j < terms.length - individualHitTerms.length + 1; j++) {
+                if (!isPhraseFound(individualHitTerms, terms, j)) {
+                    continue;
+                }
+                // record, for every offset covered by the match, which part of the phrase it holds
+                for (int k = 0; k <= lastTerm; k++) {
+                    // 1 = beginning of phrase, 3 = end of phrase, 2 = middle of phrase (beginning wins for a one-word split)
+                    int overridePosition = (k == 0) ? 1 : ((k == lastTerm) ? 3 : 2);
+                    // the override index is where the hit term sits in this position's words list
+                    terms[j + k].setOverride(terms[j + k].getWordsList().indexOf(individualHitTerms[k]), overridePosition);
+                }
+            }
+        }
+    }
+
+    /**
+     * Check to see if the whole hit phrase is found in the offsets starting at the passed in j value. The phrase matches only when, for every
+     * position k of the phrase, the WordsAndScores at offset j + k exists and its words list contains the k-th term of the phrase.
+     *
+     * @param individualHitTerms
+     *            the array of the hit phrase split into individual terms
+     *
+     * @param terms
+     *            the terms to create a phrase from
+     *
+     * @param j
+     *            the current starting offset in the WordsAndScores array
+     * @return true if every term of the phrase is found at consecutive offsets starting at j, false otherwise (including when the phrase would
+     *         not fit inside the array from that offset)
+     */
+    private boolean isPhraseFound(String[] individualHitTerms, WordsAndScores[] terms, int j) {
+        // a phrase that would start before or run past the array cannot be there
+        if (j < 0 || j + individualHitTerms.length > terms.length) {
+            return false;
+        }
+        // k represents what position we are in of the individual hit terms array
+        for (int k = 0; k < individualHitTerms.length; k++) {
+            WordsAndScores current = terms[j + k];
+            // a missing WordsAndScores, or one without the term for this position, means the phrase wasn't found;
+            // the words list is queried as returned (no cast to a concrete list class) so any List implementation works
+            if (current == null || !current.getWordsList().contains(individualHitTerms[k])) {
+                return false;
+            }
+        }
+        // every position matched
+        return true;
+    }
+
     /**
      * Determine if this dt and uid are in the accepted column families
      *
@@ -557,7 +682,7 @@ private String getDtUidFromTfKey(Key tfKey) {
      * @return the start or end document (cq) for our tf scan range. Null if dt,uid does not exist in the event key
      */
     private String getDtUidFromEventKey(Key eventKey, boolean startKey, boolean inclusive) {
-        // if an infinite end range, or unspecified end document, then no cdocument to specify
+        // if an infinite end range, or unspecified end document, then no document to specify
         if (eventKey == null || eventKey.getColumnFamily() == null || eventKey.getColumnFamily().getLength() == 0) {
             return null;
         }
@@ -620,7 +745,6 @@ public void setTrim(boolean trim) {
 
     @Override
     public String toString() {
-
         return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
     }
 

diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java
@@ -6,6 +6,8 @@
 
 import org.apache.log4j.Logger;
 
+import datawave.ingest.protobuf.TermWeightPosition;
+
 /**
  * An object used to save terms and their respective scores (if a score exists and is valid).
  */
@@ -26,8 +28,15 @@ public class WordsAndScores {
     /** the size of the words and scores arrays */
     private int arrSize;
 
+    /** the index of the override to output */
+    private int overrideIndex;
+    /** 1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase, anything else for none of those. */
+    private int overrideValue;
+
     private boolean useScores;
     private boolean hasHitTerm;
+    private boolean outputScores;
+    private boolean oneBestExcerpt;
 
     private static final String OPENP = "(";
     private static final String CLOSEP = ")";
@@ -51,6 +60,10 @@ public WordsAndScores() {
         arrSize = 0;
         useScores = false;
         hasHitTerm = false;
+        overrideIndex = -1;
+        overrideValue = -1;
+        outputScores = false;
+        oneBestExcerpt = false;
     }
 
     public void reset() {
@@ -62,6 +75,10 @@ public void reset() {
         arrSize = 0;
         useScores = false;
         hasHitTerm = false;
+        overrideIndex = -1;
+        overrideValue = -1;
+        outputScores = false;
+        oneBestExcerpt = false;
     }
 
     /**
@@ -109,11 +126,13 @@ public boolean addTerm(String word, int score, List<String> hitTermsList) {
                 } else { // if we do not already have a valid smallestScore...
                     smallestScoreIndex = arrSize - 1; // set this index as the smallest score
                     useScores = true;
+                    outputScores = true;
                 }
             } else { // if this is the first word/score we are adding, set index 0 (the only one) as the longest/smallest
                 longestWordIndex = 0;
                 smallestScoreIndex = 0;
                 useScores = true;
+                outputScores = true;
                 if ((hitTermsList != null) && hitTermsList.contains(word)) {
                     hitTermIndex = 0;
                     hasHitTerm = true;
@@ -164,15 +183,53 @@ public boolean addTerm(String word, List<String> hitTermsList) {
      */
     public String getWordToOutput() {
         if (smallestScoreIndex == -1 && longestWordIndex == -1) { // if we try and get the word from an object with nothing added to it (should never happen)...
-            log.warn("Trying to get token to output when none have been added: Will output \"reportmetodatawave\".");
-            return "reportmetodatawave";
+            log.warn("Trying to get token to output when none have been added: Will output \"REPORTMETODATAWAVE\".");
+            return "REPORTMETODATAWAVE";
         } else {
+            if (oneBestExcerpt) {
+                if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
+                    return null;
+                }
+                if (hitTermIndex == smallestScoreIndex) {
+                    return OPENB + words.get(hitTermIndex) + CLOSEB;
+                } else {
+                    return words.get(smallestScoreIndex);
+                }
+            }
+            if (overrideIndex >= 0 && overrideIndex < arrSize) {
+                if (STOP_WORD_LIST.contains(words.get(overrideIndex))) { // if the hit term is on the stop list for some reason...
+                    return null;
+                }
+                switch (overrideValue) {
+                    case 1:
+                        if (useScores && (scores.get(overrideIndex) != -1)) {
+                            return OPENB + words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
+                        } else {
+                            return OPENB + words.get(overrideIndex);
+                        }
+                    case 3:
+                        if (useScores && (scores.get(overrideIndex) != -1)) {
+                            return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP + CLOSEB;
+                        } else {
+                            return words.get(overrideIndex) + CLOSEB;
+                        }
+                    case 2:
+                        if (useScores && (scores.get(overrideIndex) != -1)) {
+                            return words.get(overrideIndex) + OPENP + userReadable(scores.get(overrideIndex)) + CLOSEP;
+                        } else {
+                            return words.get(overrideIndex);
+                        }
+                    default:
+                        log.warn("Invalid override value: Will output \"REPORTMETODATAWAVE\".");
+                        return "REPORTMETODATAWAVE";
+                }
+            }
             if (hasHitTerm) { // if we have a hit term...
                 if (STOP_WORD_LIST.contains(words.get(hitTermIndex))) { // if the hit term is on the stop list for some reason...
                     return null;
                 }
                 if (useScores && (scores.get(hitTermIndex) != -1)) { // if we have a valid score for the hit term...
-                    return OPENB + words.get(hitTermIndex) + CLOSEB + OPENP + userReadable(scores.get(hitTermIndex)) + CLOSEP;
+                    return OPENB + words.get(hitTermIndex) + OPENP + userReadable(scores.get(hitTermIndex)) + CLOSEP + CLOSEB;
                 } else {
                     return OPENB + words.get(hitTermIndex) + CLOSEB;
                 }
@@ -181,7 +238,12 @@ public String getWordToOutput() {
                 if (STOP_WORD_LIST.contains(words.get(smallestScoreIndex))) { // if the selected term is in the stop list...
                     return null;
                 }
-                return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest score
+                if (outputScores) {
+                    return words.get(smallestScoreIndex) + OPENP + userReadable(scores.get(smallestScoreIndex)) + CLOSEP; // return the word with the smallest
+                                                                                                                          // score
+                } else {
+                    return words.get(smallestScoreIndex);
+                }
             } else { // default to returning the longest word if the scores don't exist/aren't valid
                 if (STOP_WORD_LIST.contains(words.get(longestWordIndex))) { // if the selected term is in the stop list...
                     return null;
@@ -192,18 +254,11 @@ public String getWordToOutput() {
     }
 
     /**
-     * Converts the score into a number from 0-1000 (higher is better) so that it is easier for the user to understand.
+     * Converts the score into a number from 0-100 (higher is better) so that it is easier for the user to understand.
      */
-    private String userReadable(int score) {
-        int x = Math.max(1000 - (score / 50000), 0);
-        if (x == 0) {
-            return String.valueOf(0);
-        } else if (x == 1000) {
-            return String.valueOf(1);
-        } else {
-            String y = String.format("%.3f", (float) x / 1000);
-            return y.substring(1);
-        }
+    private int userReadable(int score) {
+        // the original probability got put through ln(x), so we apply e^x to the converted position score to get back to that probability,
+        // then scale it by 100 and round to the nearest whole number
+        return (int) Math.round((Math.exp(TermWeightPosition.termWeightScoreToPositionScore(score))) * 100);
+    }
 
     /**
@@ -266,6 +321,19 @@ public void setWordsAndScoresList(List<String> words, List<Integer> scores, List
         }
     }
 
+    /**
+     * Sets the override that {@link #getWordToOutput()} uses instead of its normal word selection when the index is within the words list.
+     *
+     * @param overrideIndex
+     *            the index of the override to output
+     * @param overrideValue
+     *            1 for beginning of phrase, 2 for middle of phrase, 3 for end of phrase, anything else for none of those
+     */
+    public void setOverride(int overrideIndex, int overrideValue) {
+        this.overrideIndex = overrideIndex;
+        this.overrideValue = overrideValue;
+    }
+
+    /**
+     * Sets whether the score is appended in parentheses when the word with the smallest score is output.
+     *
+     * @param outputScores
+     *            true to output the word followed by its score, false to output the word alone
+     */
+    public void setOutputScores(boolean outputScores) {
+        this.outputScores = outputScores;
+    }
+
+    /**
+     * Sets whether this object is being output as part of the one-best excerpt, in which case the word with the smallest score is output
+     * without its score (wrapped in brackets if it is the hit term).
+     *
+     * @param oneBestExcerpt
+     *            true to output in one-best excerpt form, false for the regular output
+     */
+    public void setOneBestExcerpt(boolean oneBestExcerpt) {
+        this.oneBestExcerpt = oneBestExcerpt;
+    }
+
     /**
      * Returns a boolean that is true if there is a valid score in the scores list and false otherwise.
      *