Performance: optimize MatchHashesAndScoreQuery for case where a hash occurs once (96% recall at 190 qps) (#612)
alexklibisz authored Nov 30, 2023
1 parent a74973a commit 504589b
Showing 8 changed files with 42 additions and 14 deletions.
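
Before the per-file diffs, here is a minimal, self-contained sketch of the idea behind the change: when a query hash occurs exactly once, every matching document contributes exactly 1 to its hit count, so the per-document frequency read and the min() comparison can be skipped for that hash. The sketch simulates postings with plain in-memory maps and a bare short[] counter; the class, record, and map layout are illustrative assumptions, and only the branch on the hash frequency mirrors the actual change shown below.

```java
import java.util.List;
import java.util.Map;

// Simplified, standalone illustration of the fast path added in this commit.
// Postings are simulated with in-memory maps instead of Lucene's TermsEnum/PostingsEnum.
public class FastPathSketch {

    // A query hash and how many times it occurs among the query vector's hashes.
    record HashAndFreq(String hash, int freq) {}

    // index: hash -> (docId -> term frequency), a stand-in for a postings list.
    static void countHits(List<HashAndFreq> queryHashes,
                          Map<String, Map<Integer, Integer>> index,
                          short[] counts) {
        for (HashAndFreq hf : queryHashes) {
            Map<Integer, Integer> postings = index.getOrDefault(hf.hash(), Map.of());
            if (hf.freq() == 1) {
                // Fast path: the hash occurs once in the query, so each matching doc
                // contributes exactly 1 and its term frequency never needs to be read.
                for (int docId : postings.keySet()) counts[docId]++;
            } else {
                // General path: contribute min(query frequency, document frequency).
                for (Map.Entry<Integer, Integer> e : postings.entrySet()) {
                    counts[e.getKey()] += (short) Math.min(hf.freq(), e.getValue());
                }
            }
        }
    }

    public static void main(String[] args) {
        short[] counts = new short[4];
        countHits(
                List.of(new HashAndFreq("h1", 1), new HashAndFreq("h2", 2)),
                Map.of("h1", Map.of(0, 3, 2, 1), "h2", Map.of(0, 1, 3, 2)),
                counts);
        System.out.println(java.util.Arrays.toString(counts)); // [2, 0, 1, 2]
    }
}
```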
2 changes: 1 addition & 1 deletion Taskfile.yaml
@@ -56,7 +56,7 @@ tasks:
- venv/bin/python run.py --dataset fashion-mnist-784-euclidean --algorithm elastiknn-l2lsh --runs 3 --count 100 --parallelism 1 --force --local
- mkdir -p $RESULTS_DIR
- venv/bin/python plot.py --dataset fashion-mnist-784-euclidean --count 100 --output $RESULTS_DIR/plot.png | venv/bin/python ../parse_results.py > $RESULTS_DIR/results.md
-  - base64 -b 0 -i $RESULTS_DIR/plot.png > $RESULTS_DIR/plot.b64
+  - base64 -w 0 -i $RESULTS_DIR/plot.png > $RESULTS_DIR/plot.b64
- cat $RESULTS_DIR/results.md

annbRunOfficialFashionMnist:
2 changes: 1 addition & 1 deletion docs/pages/performance/fashion-mnist/plot.b64

Large diffs are not rendered by default.

Binary file modified docs/pages/performance/fashion-mnist/plot.png
16 changes: 8 additions & 8 deletions docs/pages/performance/fashion-mnist/results.md
@@ -1,10 +1,10 @@
|Model|Parameters|Recall|Queries per Second|
|---|---|---|---|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=0|0.378|346.083|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=0|0.446|289.601|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=3|0.634|276.494|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=3|0.716|242.322|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=0|0.767|301.259|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=0|0.847|259.373|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=3|0.922|201.386|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=3|0.960|180.637|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=0|0.378|337.457|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=0|0.446|281.828|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=3|0.634|272.814|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=3|0.716|232.698|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=0|0.767|303.686|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=0|0.846|254.121|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=3|0.922|215.233|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=3|0.960|190.689|
ArrayHitCounter.java
@@ -21,6 +21,15 @@ public ArrayHitCounter(int capacity) {
        maxKey = 0;
    }

+    @Override
+    public void increment(int key) {
+        if (counts[key]++ == 0) {
+            numHits++;
+            minKey = Math.min(key, minKey);
+            maxKey = Math.max(key, maxKey);
+        }
+    }
+
    @Override
    public void increment(int key, short count) {
        if ((counts[key] += count) == count) {
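
The hunk above adds the single-key increment to the array-backed counter: the count array is bumped unconditionally, and the bookkeeping fields (numHits, minKey, maxKey) are only touched when a document's count transitions from 0 to 1. Below is a standalone sketch of such a counter, assuming the field names and increment logic shown in the hunk; the constructor's minKey initialization, the body of the two-argument overload below the fold, and the main method are illustrative guesses, not code from the repository.

```java
// Standalone sketch in the spirit of ArrayHitCounter; only what the hunk shows is
// reproduced faithfully, the rest is an assumption for the sake of a runnable example.
public class ArrayHitCounterSketch {

    private final short[] counts;  // per-doc hit counts, indexed by doc id
    private int numHits = 0;       // number of distinct docs with a non-zero count
    private int minKey;            // smallest doc id counted so far
    private int maxKey;            // largest doc id counted so far

    public ArrayHitCounterSketch(int capacity) {
        counts = new short[capacity];
        minKey = capacity;         // assumption: start above any valid doc id
        maxKey = 0;
    }

    // Fast path: the caller knows each match contributes exactly 1, so no
    // frequency has to be computed before calling.
    public void increment(int key) {
        if (counts[key]++ == 0) {  // bookkeeping only on the 0 -> 1 transition
            numHits++;
            minKey = Math.min(key, minKey);
            maxKey = Math.max(key, maxKey);
        }
    }

    // General path: add a precomputed count; the slot was previously zero
    // exactly when the new value equals the amount just added.
    public void increment(int key, short count) {
        if ((counts[key] += count) == count) {
            numHits++;
            minKey = Math.min(key, minKey);
            maxKey = Math.max(key, maxKey);
        }
    }

    public static void main(String[] args) {
        ArrayHitCounterSketch counter = new ArrayHitCounterSketch(10);
        counter.increment(3);            // a hash with query frequency 1
        counter.increment(3, (short) 2); // a hash with query frequency > 1
        counter.increment(7);
        System.out.printf("numHits=%d minKey=%d maxKey=%d counts[3]=%d%n",
                counter.numHits, counter.minKey, counter.maxKey, counter.counts[3]);
        // Prints: numHits=2 minKey=3 maxKey=7 counts[3]=3
    }
}
```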
EmptyHitCounter.java
@@ -3,6 +3,10 @@
import org.apache.lucene.search.KthGreatest;

public final class EmptyHitCounter implements HitCounter {
+
+    @Override
+    public void increment(int key) {}
+
    @Override
    public void increment(int key, short count) {}

HitCounter.java
@@ -7,6 +7,8 @@
 */
public interface HitCounter {

+    void increment(int key);
+
    void increment(int key, short count);

    boolean isEmpty();
MatchHashesAndScoreQuery.java
@@ -67,10 +67,23 @@ private HitCounter countHits(LeafReader reader) throws IOException {
        // TODO: Is this the right place to use the live docs bitset to check for deleted docs?
        // Bits liveDocs = reader.getLiveDocs();
        for (HashAndFreq hf : hashAndFrequencies) {
-            if (termsEnum.seekExact(new BytesRef(hf.hash))) {
-                docs = termsEnum.postings(docs, PostingsEnum.NONE);
-                while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-                    counter.increment(docs.docID(), (short) min(hf.freq, docs.freq()));
+            // We take two different paths here, depending on the frequency of the current hash.
+            // If the frequency is one, we avoid checking the frequency of matching docs when
+            // incrementing the counter. This yields a ~5% to ~10% speedup.
+            // See https://github.com/alexklibisz/elastiknn/pull/612 for details.
+            if (hf.freq == 1) {
+                if (termsEnum.seekExact(new BytesRef(hf.hash))) {
+                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        counter.increment(docs.docID());
+                    }
+                }
+            } else {
+                if (termsEnum.seekExact(new BytesRef(hf.hash))) {
+                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        counter.increment(docs.docID(), (short) min(hf.freq, docs.freq()));
+                    }
+                }
            }
        }
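
The in-code comment above attributes a ~5% to ~10% speedup to skipping the frequency check when a hash occurs once. Below is a rough, hypothetical JMH sketch of how that kind of difference could be measured in isolation; it is not part of this commit or repository, it exercises only the counter-update arithmetic (not the Lucene postings access), and all names and sizes in it are made up for illustration.

```java
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

// Hypothetical microbenchmark comparing the two counter-update paths on a short[]
// counter. Counts accumulate across invocations, which is harmless for timing.
@State(Scope.Thread)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public class IncrementPathBenchmark {

    private short[] counts;
    private int[] matchingDocs; // doc ids produced by iterating one postings list
    private short[] docFreqs;   // per-doc term frequencies used by the general path

    @Setup
    public void setup() {
        Random rng = new Random(42);
        counts = new short[60_000];
        matchingDocs = new int[5_000];
        docFreqs = new short[5_000];
        for (int i = 0; i < matchingDocs.length; i++) {
            matchingDocs[i] = rng.nextInt(counts.length);
            docFreqs[i] = (short) (1 + rng.nextInt(3));
        }
    }

    @Benchmark
    public short[] freqOneFastPath() {
        // Query hash occurs once: unconditional +1, no frequency comparison.
        for (int doc : matchingDocs) counts[doc]++;
        return counts;
    }

    @Benchmark
    public short[] generalPath() {
        // Query hash occurs more than once (here: 2): add min(query freq, doc freq).
        for (int i = 0; i < matchingDocs.length; i++) {
            counts[matchingDocs[i]] += (short) Math.min(2, docFreqs[i]);
        }
        return counts;
    }
}
```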
