Skip to content

Commit

Permalink
Support filter threshold setting per field in a query.
Browse files Browse the repository at this point in the history
This threshold is currently used to determine whether to use bitvector posting list when searching in a disk index.
This optimization is triggered when the relative hit estimate of the query term is larger than the filter threshold.

This relates to #33139 and #33165 (schema syntax) and #33156 (config wiring).
  • Loading branch information
geirst committed Jan 23, 2025
1 parent e020972 commit 18e07a6
Show file tree
Hide file tree
Showing 11 changed files with 51 additions and 41 deletions.
6 changes: 3 additions & 3 deletions searchcore/src/tests/proton/matching/querynodes_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,11 +533,11 @@ TEST(QueryNodesTest, requireThatSimpleIntermediatesGetProperBlending)

TEST(QueryNodesTest, control_query_nodes_size)
{
EXPECT_EQ(64u + sizeof(std::string), sizeof(ProtonTermData));
EXPECT_EQ(72u + sizeof(std::string), sizeof(ProtonTermData));
EXPECT_EQ(32u + 2 * sizeof(std::string), sizeof(search::query::NumberTerm));
EXPECT_EQ(96u + 3 * sizeof(std::string), sizeof(ProtonNodeTypes::NumberTerm));
EXPECT_EQ(104u + 3 * sizeof(std::string), sizeof(ProtonNodeTypes::NumberTerm));
EXPECT_EQ(32u + 2 * sizeof(std::string), sizeof(search::query::StringTerm));
EXPECT_EQ(96u + 3 * sizeof(std::string), sizeof(ProtonNodeTypes::StringTerm));
EXPECT_EQ(104u + 3 * sizeof(std::string), sizeof(ProtonNodeTypes::StringTerm));
}

} // namespace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ ProtonTermData::resolve(const ViewResolver &resolver, const IIndexEnvironment &i
for (size_t i = 0; i < fields.size(); ++i) {
const FieldInfo *info = idxEnv.getFieldByName(fields[i]);
if (info != nullptr) {
_fields.emplace_back(fields[i], info->id(), forceFilter || info->isFilter());
_fields.emplace_back(fields[i], info->id(),
(forceFilter ? search::fef::FilterThreshold(true) : info->get_filter_threshold()));
FieldEntry & field = _fields.back();
field.attribute_field = is_attribute(info->type());
} else {
Expand Down
5 changes: 4 additions & 1 deletion searchcore/src/vespa/searchcore/proton/matching/querynodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@ class ProtonTermData : public search::fef::ITermData
bool attribute_field;

FieldEntry(const std::string &name, uint32_t fieldId, bool is_filter) noexcept
: FieldEntry(name, fieldId, search::fef::FilterThreshold(is_filter))
{}
FieldEntry(const std::string &name, uint32_t fieldId, search::fef::FilterThreshold threshold) noexcept
: ITermFieldData(fieldId),
_field_spec(name, fieldId, search::fef::IllegalHandle, is_filter),
_field_spec(name, fieldId, search::fef::IllegalHandle, threshold),
attribute_field(false)
{}

Expand Down
15 changes: 7 additions & 8 deletions searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ using search::diskindex::DiskTermBlueprint;
using search::diskindex::FieldIndex;
using search::diskindex::TestDiskIndex;
using search::diskindex::ZcRareWordPosOccIterator;
using search::fef::FilterThreshold;
using search::fef::TermFieldMatchDataArray;
using search::index::DictionaryLookupResult;
using search::index::DummyFileHeaderContext;
Expand Down Expand Up @@ -408,22 +409,20 @@ DiskIndexTest::requireThatBlueprintCanCreateSearchIterators()
EXPECT_EQ(result_f1_w1, SimpleResult().search(*s));
EXPECT_EQ(result_f1_w1, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // bitvector used due to bitvector_limit set.
{ // bitvector used due to filter threshold set.
// The term 'w2' hits 17 docs in field 'f2' (bitvector for term exists).
double bitvector_limit = 16.0 / 100.0;
_requestContext.get_create_blueprint_params().disk_index_bitvector_limit = bitvector_limit;
b = create_blueprint(FieldSpec("f2", 0, 0, false), makeTerm("w2"), 100);
double threshold = 16.0 / 100.0;
b = create_blueprint(FieldSpec("f2", 0, 0, FilterThreshold(threshold)), makeTerm("w2"), 100);
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_TRUE(dynamic_cast<BitVectorIterator *>(s.get()) != nullptr);
EXPECT_EQ(result_f2_w2, SimpleResult().search(*s));
EXPECT_EQ(result_f2_w2, SimpleResult().search(*leaf_b.createFilterSearch(upper_bound)));
}
{ // fake bitvector (wrapping posocc iterator) used due to bitvector_limit set.
{ // fake bitvector (wrapping posocc iterator) used due to filter threshold set.
// The term 'w1' hits 2 docs in field 'f1' (bitvector for term doesn't exist).
double bitvector_limit = 1.0 / 100.0;
_requestContext.get_create_blueprint_params().disk_index_bitvector_limit = bitvector_limit;
b = create_blueprint(FieldSpec("f1", 0, 0, false), makeTerm("w1"), 100);
double threshold = 1.0 / 100.0;
b = create_blueprint(FieldSpec("f1", 0, 0, FilterThreshold(threshold)), makeTerm("w1"), 100);
auto& leaf_b = dynamic_cast<LeafBlueprint&>(*b);
s = leaf_b.createLeafSearch(mda);
EXPECT_TRUE((dynamic_cast<BooleanMatchIteratorWrapper *>(s.get()) != nullptr));
Expand Down
2 changes: 1 addition & 1 deletion searchlib/src/tests/queryeval/queryeval_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ TEST(QueryEvalTest, test_dump)

TEST(QueryEvalTest, test_field_spec) {
EXPECT_EQ(8u, sizeof(FieldSpecBase));
EXPECT_EQ(8u + sizeof(std::string), sizeof(FieldSpec));
EXPECT_EQ(16u + sizeof(std::string), sizeof(FieldSpec));
}


Expand Down
4 changes: 1 addition & 3 deletions searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,7 @@ class CreateBlueprintVisitor : public CreateBlueprintVisitorHelper {
const std::string termStr = termAsString(n);
auto lookup_result = _field_index.lookup(termStr);
if (lookup_result.valid()) {
double bitvector_limit = getRequestContext().get_create_blueprint_params().disk_index_bitvector_limit;
setResult(std::make_unique<DiskTermBlueprint>
(_field, _field_index, termStr, lookup_result, _field.isFilter(), bitvector_limit));
setResult(std::make_unique<DiskTermBlueprint>(_field, _field_index, termStr, lookup_result));
} else {
setResult(std::make_unique<EmptyBlueprint>(_field));
}
Expand Down
11 changes: 4 additions & 7 deletions searchlib/src/vespa/searchlib/diskindex/disktermblueprint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,14 @@ getName(uint32_t indexId)
DiskTermBlueprint::DiskTermBlueprint(const FieldSpec & field,
const FieldIndex& field_index,
const std::string& query_term,
DictionaryLookupResult lookupRes,
bool is_filter_field,
double bitvector_limit)
DictionaryLookupResult lookupRes)
: SimpleLeafBlueprint(field),
_field(field),
_field_index(field_index),
_query_term(query_term),
_lookupRes(std::move(lookupRes)),
_bitvector_lookup_result(_field_index.lookup_bit_vector(_lookupRes)),
_is_filter_field(is_filter_field),
_bitvector_limit(bitvector_limit),
_is_filter_field(_field.isFilter()),
_fetchPostingsDone(false),
_postingHandle(),
_bitVector(),
Expand Down Expand Up @@ -118,8 +115,8 @@ DiskTermBlueprint::calculate_flow_stats(uint32_t docid_limit) const
bool
DiskTermBlueprint::use_bitvector() const
{
return _is_filter_field ||
((get_docid_limit() > 0) && ((double)_lookupRes.counts._numDocs / (double)get_docid_limit()) > _bitvector_limit);
return _is_filter_field ||
((get_docid_limit() > 0) && _field.get_filter_threshold().is_filter((double)_lookupRes.counts._numDocs / (double)get_docid_limit()));
}

const BitVector *
Expand Down
16 changes: 8 additions & 8 deletions searchlib/src/vespa/searchlib/diskindex/disktermblueprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ class DiskTermBlueprint : public queryeval::SimpleLeafBlueprint
index::DictionaryLookupResult _lookupRes;
index::BitVectorDictionaryLookupResult _bitvector_lookup_result;
bool _is_filter_field;
double _bitvector_limit;
bool _fetchPostingsDone;
index::PostingListHandle _postingHandle;
std::shared_ptr<BitVector> _bitVector;
Expand All @@ -34,20 +33,21 @@ class DiskTermBlueprint : public queryeval::SimpleLeafBlueprint
/**
* Create a new blueprint.
*
* The filter threshold setting for the field determines whether bitvector is used for searching.
* If the field is a filter: force use of bitvector.
* Otherwise the filter threshold is compared against the hit estimate of the query term after dictionary lookup.
* If the hit estimate is above the filter threshold: force use of bitvector.
* If no bitvector exists for the term, a fake bitvector wrapping the posocc iterator is used.
*
* @param field The field to search in.
* @param field_index The field index used to read the bit vector or posting list.
* @param query_term The query term to search for.
* @param lookupRes The result after disk dictionary lookup.
* @param is_filter_field Whether this field is filter and we should force use of bit vector.
* @param bitvector_limit The hit estimate limit for whether bitvector should be used for searching this term.
This can be used to tune performance at the cost of quality.
If no bitvector exists for the term, a fake bitvector wrapping the posocc iterator is used.
**/
DiskTermBlueprint(const queryeval::FieldSpec & field,
const FieldIndex& field_index,
const std::string& query_term,
index::DictionaryLookupResult lookupRes,
bool is_filter_field,
double bitvector_limit);
index::DictionaryLookupResult lookupRes);

queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override;

Expand Down
8 changes: 4 additions & 4 deletions searchlib/src/vespa/searchlib/fef/filter_threshold.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ class FilterThreshold {
float _threshold;

public:
FilterThreshold() noexcept : _threshold(1.0) { }
FilterThreshold(bool is_filter_in) noexcept : _threshold(is_filter_in ? 0.0 : 1.0) { }
FilterThreshold(float threshold) noexcept : _threshold(threshold) { }
FilterThreshold(double threshold) noexcept : _threshold(threshold) { }
explicit FilterThreshold() noexcept : _threshold(1.0) { }
explicit FilterThreshold(bool is_filter_in) noexcept : _threshold(is_filter_in ? 0.0 : 1.0) { }
explicit FilterThreshold(float threshold) noexcept : _threshold(threshold) { }
explicit FilterThreshold(double threshold) noexcept : _threshold(threshold) { }
float threshold() const noexcept { return _threshold; }
bool is_filter() const noexcept { return _threshold == 0.0; }

Expand Down
13 changes: 11 additions & 2 deletions searchlib/src/vespa/searchlib/queryeval/field_spec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,18 @@ FieldSpec::FieldSpec(const std::string & name, uint32_t fieldId, fef::TermFieldH

FieldSpec::FieldSpec(const std::string & name, uint32_t fieldId,
fef::TermFieldHandle handle, bool isFilter_) noexcept
: FieldSpecBase(fieldId, handle, isFilter_),
_name(name)
: FieldSpec(name, fieldId, handle, fef::FilterThreshold(isFilter_))
{}

FieldSpec::FieldSpec(const std::string & name, uint32_t fieldId,
fef::TermFieldHandle handle, fef::FilterThreshold threshold) noexcept
: FieldSpecBase(fieldId, handle, threshold.is_filter()),
_name(name),
_threshold(threshold)
{
// NOTE: Whether the field is a filter is still tracked in FieldSpecBase
// to ensure this information is available in code where only the base class is used.
// This also ensures that the size of FieldSpecBase is not changed.
assert(fieldId < 0x1000000); // Can be represented by 24 bits
}

Expand Down
9 changes: 6 additions & 3 deletions searchlib/src/vespa/searchlib/queryeval/field_spec.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#pragma once

#include <vespa/searchlib/fef/filter_threshold.h>
#include <vespa/searchlib/fef/handle.h>
#include <vespa/vespalib/util/small_vector.h>
#include <string>
Expand Down Expand Up @@ -47,16 +48,18 @@ class FieldSpec : public FieldSpecBase
{
public:
FieldSpec(const std::string & name, uint32_t fieldId, fef::TermFieldHandle handle) noexcept;
FieldSpec(const std::string & name, uint32_t fieldId,
fef::TermFieldHandle handle, bool isFilter_) noexcept;
FieldSpec(const std::string & name, uint32_t fieldId, fef::TermFieldHandle handle, bool isFilter_) noexcept;
FieldSpec(const std::string & name, uint32_t fieldId, fef::TermFieldHandle handle, fef::FilterThreshold threshold) noexcept;
~FieldSpec();

void setBase(FieldSpecBase base) noexcept {
FieldSpecBase::operator =(base);
}
const std::string & getName() const noexcept { return _name; }
fef::FilterThreshold get_filter_threshold() const noexcept { return _threshold; }
private:
std::string _name; // field name
std::string _name; // field name
fef::FilterThreshold _threshold;
};

/**
Expand Down

0 comments on commit 18e07a6

Please sign in to comment.