Performance: optimize MatchHashesAndScoreQuery for case where a hash occurs once (96% recall at 190 qps) (#612)
alexklibisz authored Nov 30, 2023
1 parent a74973a commit 504589b
Showing 8 changed files with 42 additions and 14 deletions.
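
Before the per-file diffs, here is a minimal, self-contained sketch of the idea behind the change: when a query hash occurs exactly once, every matching document contributes exactly 1 to its hit count, so the per-document frequency read and the min() comparison can be skipped for that hash. The sketch simulates postings with plain in-memory maps and a bare short[] counter; the class, record, and map layout are illustrative assumptions, and only the branch on the hash frequency mirrors the actual change shown below.

```java
import java.util.List;
import java.util.Map;

// Simplified, standalone illustration of the fast path added in this commit.
// Postings are simulated with in-memory maps instead of Lucene's TermsEnum/PostingsEnum.
public class FastPathSketch {

    // A query hash and how many times it occurs among the query vector's hashes.
    record HashAndFreq(String hash, int freq) {}

    // index: hash -> (docId -> term frequency), a stand-in for a postings list.
    static void countHits(List<HashAndFreq> queryHashes,
                          Map<String, Map<Integer, Integer>> index,
                          short[] counts) {
        for (HashAndFreq hf : queryHashes) {
            Map<Integer, Integer> postings = index.getOrDefault(hf.hash(), Map.of());
            if (hf.freq() == 1) {
                // Fast path: the hash occurs once in the query, so each matching doc
                // contributes exactly 1 and its term frequency never needs to be read.
                for (int docId : postings.keySet()) counts[docId]++;
            } else {
                // General path: contribute min(query frequency, document frequency).
                for (Map.Entry<Integer, Integer> e : postings.entrySet()) {
                    counts[e.getKey()] += (short) Math.min(hf.freq(), e.getValue());
                }
            }
        }
    }

    public static void main(String[] args) {
        short[] counts = new short[4];
        countHits(
                List.of(new HashAndFreq("h1", 1), new HashAndFreq("h2", 2)),
                Map.of("h1", Map.of(0, 3, 2, 1), "h2", Map.of(0, 1, 3, 2)),
                counts);
        System.out.println(java.util.Arrays.toString(counts)); // [2, 0, 1, 2]
    }
}
```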
2 changes: 1 addition & 1 deletion Taskfile.yaml
@@ -56,7 +56,7 @@ tasks:
- venv/bin/python run.py --dataset fashion-mnist-784-euclidean --algorithm elastiknn-l2lsh --runs 3 --count 100 --parallelism 1 --force --local
- mkdir -p $RESULTS_DIR
- venv/bin/python plot.py --dataset fashion-mnist-784-euclidean --count 100 --output $RESULTS_DIR/plot.png | venv/bin/python ../parse_results.py > $RESULTS_DIR/results.md
-  - base64 -b 0 -i $RESULTS_DIR/plot.png > $RESULTS_DIR/plot.b64
+  - base64 -w 0 -i $RESULTS_DIR/plot.png > $RESULTS_DIR/plot.b64
- cat $RESULTS_DIR/results.md

annbRunOfficialFashionMnist:
2 changes: 1 addition & 1 deletion docs/pages/performance/fashion-mnist/plot.b64

Large diffs are not rendered by default.

Binary file modified docs/pages/performance/fashion-mnist/plot.png
16 changes: 8 additions & 8 deletions docs/pages/performance/fashion-mnist/results.md
@@ -1,10 +1,10 @@
|Model|Parameters|Recall|Queries per Second|
|---|---|---|---|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=0|0.378|346.083|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=0|0.446|289.601|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=3|0.634|276.494|
-|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=3|0.716|242.322|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=0|0.767|301.259|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=0|0.847|259.373|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=3|0.922|201.386|
-|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=3|0.960|180.637|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=0|0.378|337.457|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=0|0.446|281.828|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=500 probes=3|0.634|272.814|
+|eknn-l2lsh|L=100 k=4 w=1024 candidates=1000 probes=3|0.716|232.698|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=0|0.767|303.686|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=0|0.846|254.121|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=500 probes=3|0.922|215.233|
+|eknn-l2lsh|L=100 k=4 w=2048 candidates=1000 probes=3|0.960|190.689|
ArrayHitCounter.java
@@ -21,6 +21,15 @@ public ArrayHitCounter(int capacity) {
        maxKey = 0;
    }

+    @Override
+    public void increment(int key) {
+        if (counts[key]++ == 0) {
+            numHits++;
+            minKey = Math.min(key, minKey);
+            maxKey = Math.max(key, maxKey);
+        }
+    }
+
    @Override
    public void increment(int key, short count) {
        if ((counts[key] += count) == count) {
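
The hunk above adds the single-key increment to the array-backed counter: the count array is bumped unconditionally, and the bookkeeping fields (numHits, minKey, maxKey) are only touched when a document's count transitions from 0 to 1. Below is a standalone sketch of such a counter, assuming the field names and increment logic shown in the hunk; the constructor's minKey initialization, the body of the two-argument overload below the fold, and the main method are illustrative guesses, not code from the repository.

```java
// Standalone sketch in the spirit of ArrayHitCounter; only what the hunk shows is
// reproduced faithfully, the rest is an assumption for the sake of a runnable example.
public class ArrayHitCounterSketch {

    private final short[] counts;  // per-doc hit counts, indexed by doc id
    private int numHits = 0;       // number of distinct docs with a non-zero count
    private int minKey;            // smallest doc id counted so far
    private int maxKey;            // largest doc id counted so far

    public ArrayHitCounterSketch(int capacity) {
        counts = new short[capacity];
        minKey = capacity;         // assumption: start above any valid doc id
        maxKey = 0;
    }

    // Fast path: the caller knows each match contributes exactly 1, so no
    // frequency has to be computed before calling.
    public void increment(int key) {
        if (counts[key]++ == 0) {  // bookkeeping only on the 0 -> 1 transition
            numHits++;
            minKey = Math.min(key, minKey);
            maxKey = Math.max(key, maxKey);
        }
    }

    // General path: add a precomputed count; the slot was previously zero
    // exactly when the new value equals the amount just added.
    public void increment(int key, short count) {
        if ((counts[key] += count) == count) {
            numHits++;
            minKey = Math.min(key, minKey);
            maxKey = Math.max(key, maxKey);
        }
    }

    public static void main(String[] args) {
        ArrayHitCounterSketch counter = new ArrayHitCounterSketch(10);
        counter.increment(3);            // a hash with query frequency 1
        counter.increment(3, (short) 2); // a hash with query frequency > 1
        counter.increment(7);
        System.out.printf("numHits=%d minKey=%d maxKey=%d counts[3]=%d%n",
                counter.numHits, counter.minKey, counter.maxKey, counter.counts[3]);
        // Prints: numHits=2 minKey=3 maxKey=7 counts[3]=3
    }
}
```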
EmptyHitCounter.java
@@ -3,6 +3,10 @@
import org.apache.lucene.search.KthGreatest;

public final class EmptyHitCounter implements HitCounter {
+
+    @Override
+    public void increment(int key) {}
+
    @Override
    public void increment(int key, short count) {}

HitCounter.java
@@ -7,6 +7,8 @@
 */
public interface HitCounter {

+    void increment(int key);
+
    void increment(int key, short count);

    boolean isEmpty();
MatchHashesAndScoreQuery.java
@@ -67,10 +67,23 @@ private HitCounter countHits(LeafReader reader) throws IOException {
        // TODO: Is this the right place to use the live docs bitset to check for deleted docs?
        // Bits liveDocs = reader.getLiveDocs();
        for (HashAndFreq hf : hashAndFrequencies) {
-            if (termsEnum.seekExact(new BytesRef(hf.hash))) {
-                docs = termsEnum.postings(docs, PostingsEnum.NONE);
-                while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
-                    counter.increment(docs.docID(), (short) min(hf.freq, docs.freq()));
+            // We take two different paths here, depending on the frequency of the current hash.
+            // If the frequency is one, we avoid checking the frequency of matching docs when
+            // incrementing the counter. This yields a ~5% to ~10% speedup.
+            // See https://github.com/alexklibisz/elastiknn/pull/612 for details.
+            if (hf.freq == 1) {
+                if (termsEnum.seekExact(new BytesRef(hf.hash))) {
+                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        counter.increment(docs.docID());
+                    }
+                }
+            } else {
+                if (termsEnum.seekExact(new BytesRef(hf.hash))) {
+                    docs = termsEnum.postings(docs, PostingsEnum.NONE);
+                    while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                        counter.increment(docs.docID(), (short) min(hf.freq, docs.freq()));
+                    }
+                }
            }
        }
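
The in-code comment above attributes a ~5% to ~10% speedup to skipping the frequency check when a hash occurs once. Below is a rough, hypothetical JMH sketch of how that kind of difference could be measured in isolation; it is not part of this commit or repository, it exercises only the counter-update arithmetic (not the Lucene postings access), and all names and sizes in it are made up for illustration.

```java
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

// Hypothetical microbenchmark comparing the two counter-update paths on a short[]
// counter. Counts accumulate across invocations, which is harmless for timing.
@State(Scope.Thread)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public class IncrementPathBenchmark {

    private short[] counts;
    private int[] matchingDocs; // doc ids produced by iterating one postings list
    private short[] docFreqs;   // per-doc term frequencies used by the general path

    @Setup
    public void setup() {
        Random rng = new Random(42);
        counts = new short[60_000];
        matchingDocs = new int[5_000];
        docFreqs = new short[5_000];
        for (int i = 0; i < matchingDocs.length; i++) {
            matchingDocs[i] = rng.nextInt(counts.length);
            docFreqs[i] = (short) (1 + rng.nextInt(3));
        }
    }

    @Benchmark
    public short[] freqOneFastPath() {
        // Query hash occurs once: unconditional +1, no frequency comparison.
        for (int doc : matchingDocs) counts[doc]++;
        return counts;
    }

    @Benchmark
    public short[] generalPath() {
        // Query hash occurs more than once (here: 2): add min(query freq, doc freq).
        for (int i = 0; i < matchingDocs.length; i++) {
            counts[matchingDocs[i]] += (short) Math.min(2, docFreqs[i]);
        }
        return counts;
    }
}
```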
