From 5343849f1d1b6a4fe2180350832244fc633aa198 Mon Sep 17 00:00:00 2001
From: Kumar Shivendu
Date: Wed, 17 Apr 2024 14:14:12 +0530
Subject: [PATCH] feat: Add sparse vectors benchmark support for Qdrant (#114)

* feat: Add sparse vectors benchmark support in Qdrant
* fix: Self review
* feat: Add sparse dataset for CI benchmarks
* feat: Introduce SparseVector class
* feat: Disallow sparse vector dataset being run with non sparse vector engine configs
* feat: use different engine config to run sparse vector benchmarks
* fix: use different engine config to run sparse vector benchmarks
* feat: Optimize CI benchmarks workflow
* feat: Add 1M sparse dataset
* fix: remove scipy, read csr matrix manually (#117)
* fix: remove scipy, read csr matrix manually
* fix: Dataset query reader should have sparse_vector=None by default
* refactor: Changes based on feedback
* refactoring: refactor sparse vector support (#118)
* refactoring: refactor sparse vector support
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* feat: Use pydantic construct
* refactor: Update all engines to use Query and Record dataclasses (#116)
* refactor: Update all engines to use Query and Record dataclasses
* feat: Add ruff in pre-commit hooks
* fix: Type mismatches
* fix: Redis search client types and var names
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix: Type issues detected by linter
* fix: iter_batches func type
* refactor: knn_conditions should be class level constant
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix: Type issue
* fix: Allow python 3.8 since scipy is now removed
* fix: Add missing redis-m-16-ef-128 config
* fix: redis container port
* fix linter
---------
Co-authored-by: George
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: generall
---
 .github/workflows/continuous-benchmark.yaml | 33 ++++--
 .pre-commit-config.yaml | 7 ++
 benchmark/dataset.py | 14 ++-
 dataset_reader/ann_compound_reader.py | 1 +
 dataset_reader/ann_h5_reader.py | 5 +-
 dataset_reader/base_reader.py | 12 ++-
 dataset_reader/json_reader.py | 9 +-
 dataset_reader/sparse_reader.py | 100 ++++++++++++++++++
 datasets/datasets.json | 12 +++
 engine/base_client/__init__.py | 9 ++
 engine/base_client/client.py | 5 +-
 engine/base_client/configure.py | 1 +
 engine/base_client/search.py | 8 +-
 engine/base_client/upload.py | 13 +--
 engine/base_client/utils.py | 22 ++--
 engine/clients/elasticsearch/__init__.py | 6 ++
 engine/clients/elasticsearch/configure.py | 2 +-
 engine/clients/elasticsearch/search.py | 7 +-
 engine/clients/elasticsearch/upload.py | 18 ++--
 engine/clients/milvus/__init__.py | 6 ++
 engine/clients/milvus/search.py | 7 +-
 engine/clients/milvus/upload.py | 21 ++--
 engine/clients/opensearch/__init__.py | 6 ++
 engine/clients/opensearch/search.py | 15 +--
 engine/clients/opensearch/upload.py | 18 ++--
 engine/clients/pgvector/configure.py | 1 -
 engine/clients/pgvector/search.py | 13 ++-
 engine/clients/pgvector/upload.py | 14 ++-
 engine/clients/qdrant/__init__.py | 6 ++
 engine/clients/qdrant/configure.py | 27 ++++-
 engine/clients/qdrant/search.py | 23 +++-
 engine/clients/qdrant/upload.py | 36 +++++--
engine/clients/redis/__init__.py | 6 ++ engine/clients/redis/configure.py | 14 +-- engine/clients/redis/search.py | 35 +++--- engine/clients/redis/upload.py | 15 ++- engine/clients/weaviate/__init__.py | 6 ++ engine/clients/weaviate/search.py | 10 +- engine/clients/weaviate/upload.py | 17 +-- .../redis-single-node/docker-compose.yaml | 2 +- .../configurations/qdrant-single-node.json | 19 ++++ .../configurations/redis-single-node.json | 13 +++ poetry.lock | 94 +++++++++------- pyproject.toml | 3 +- run.py | 20 ++-- tools/run_ci.sh | 21 ---- tools/run_experiment.sh | 4 +- tools/run_server_container.sh | 12 +-- tools/setup_ci.sh | 22 ++++ 49 files changed, 556 insertions(+), 234 deletions(-) create mode 100644 dataset_reader/sparse_reader.py create mode 100755 tools/setup_ci.sh diff --git a/.github/workflows/continuous-benchmark.yaml b/.github/workflows/continuous-benchmark.yaml index 822a0c52..7fca1cdc 100644 --- a/.github/workflows/continuous-benchmark.yaml +++ b/.github/workflows/continuous-benchmark.yaml @@ -15,18 +15,29 @@ jobs: - uses: webfactory/ssh-agent@v0.8.0 with: ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + - name: Setup CI + run: bash -x tools/setup_ci.sh - name: Benches run: | - export HCLOUD_TOKEN=${{ secrets.HCLOUD_TOKEN }} - export GCS_KEY=${{ secrets.GCS_KEY }} - export GCS_SECRET=${{ secrets.GCS_SECRET }} - export POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }} - export POSTGRES_HOST=${{ secrets.POSTGRES_HOST }} + export HCLOUD_TOKEN=${{ secrets.HCLOUD_TOKEN }} + export GCS_KEY=${{ secrets.GCS_KEY }} + export GCS_SECRET=${{ secrets.GCS_SECRET }} + export POSTGRES_PASSWORD=${{ secrets.POSTGRES_PASSWORD }} + export POSTGRES_HOST=${{ secrets.POSTGRES_HOST }} - # Benchmark the dev branch: - export QDRANT_VERSION=ghcr/dev - bash -x tools/run_ci.sh + declare -A DATASET_TO_ENGINE + DATASET_TO_ENGINE["laion-small-clip"]="qdrant-continuous-benchmark" + DATASET_TO_ENGINE["msmarco-sparse-1M"]="qdrant-sparse-vector" - # Benchmark the master branch: - export QDRANT_VERSION=docker/master - bash -x tools/run_ci.sh + for dataset in "${!DATASET_TO_ENGINE[@]}"; do + export ENGINE_NAME=${DATASET_TO_ENGINE[$dataset]} + export DATASETS=$dataset + + # Benchmark the dev branch: + export QDRANT_VERSION=ghcr/dev + bash -x tools/run_ci.sh + + # Benchmark the master branch: + export QDRANT_VERSION=docker/master + bash -x tools/run_ci.sh + done diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6817ea9d..690bcada 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,3 +28,10 @@ repos: - id: isort name: "Sort Imports" args: ["--profile", "black"] + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.5 + hooks: + # Run the linter. 
+ - id: ruff + args: [ --fix ] diff --git a/benchmark/dataset.py b/benchmark/dataset.py index d2793f04..ef006955 100644 --- a/benchmark/dataset.py +++ b/benchmark/dataset.py @@ -10,20 +10,28 @@ from dataset_reader.ann_h5_reader import AnnH5Reader from dataset_reader.base_reader import BaseReader from dataset_reader.json_reader import JSONReader +from dataset_reader.sparse_reader import SparseReader @dataclass class DatasetConfig: - vector_size: int - distance: str name: str type: str path: str + link: Optional[str] = None schema: Optional[Dict[str, str]] = field(default_factory=dict) + # None in case of sparse vectors: + vector_size: Optional[int] = None + distance: Optional[str] = None -READER_TYPE = {"h5": AnnH5Reader, "jsonl": JSONReader, "tar": AnnCompoundReader} +READER_TYPE = { + "h5": AnnH5Reader, + "jsonl": JSONReader, + "tar": AnnCompoundReader, + "sparse": SparseReader, +} class Dataset: diff --git a/dataset_reader/ann_compound_reader.py b/dataset_reader/ann_compound_reader.py index a2b03301..9d862811 100644 --- a/dataset_reader/ann_compound_reader.py +++ b/dataset_reader/ann_compound_reader.py @@ -33,6 +33,7 @@ def read_queries(self) -> Iterator[Query]: vector /= np.linalg.norm(vector) yield Query( vector=vector.tolist(), + sparse_vector=None, meta_conditions=row_json["conditions"], expected_result=row_json["closest_ids"], expected_scores=row_json["closest_scores"], diff --git a/dataset_reader/ann_h5_reader.py b/dataset_reader/ann_h5_reader.py index 1d47bdd3..f87271a1 100644 --- a/dataset_reader/ann_h5_reader.py +++ b/dataset_reader/ann_h5_reader.py @@ -22,6 +22,7 @@ def read_queries(self) -> Iterator[Query]: vector /= np.linalg.norm(vector) yield Query( vector=vector.tolist(), + sparse_vector=None, meta_conditions=None, expected_result=expected_result.tolist(), expected_scores=expected_scores.tolist(), @@ -33,7 +34,9 @@ def read_data(self) -> Iterator[Record]: for idx, vector in enumerate(data["train"]): if self.normalize: vector /= np.linalg.norm(vector) - yield Record(id=idx, vector=vector.tolist(), metadata=None) + yield Record( + id=idx, vector=vector.tolist(), sparse_vector=None, metadata=None + ) if __name__ == "__main__": diff --git a/dataset_reader/base_reader.py b/dataset_reader/base_reader.py index 0a2617e3..3861c0f3 100644 --- a/dataset_reader/base_reader.py +++ b/dataset_reader/base_reader.py @@ -2,16 +2,24 @@ from typing import Iterator, List, Optional +@dataclass +class SparseVector: + indices: List[int] + values: List[float] + + @dataclass class Record: id: int - vector: List[float] + vector: Optional[List[float]] + sparse_vector: Optional[SparseVector] metadata: Optional[dict] @dataclass class Query: - vector: List[float] + vector: Optional[List[float]] + sparse_vector: Optional[SparseVector] meta_conditions: Optional[dict] expected_result: Optional[List[int]] expected_scores: Optional[List[float]] = None diff --git a/dataset_reader/json_reader.py b/dataset_reader/json_reader.py index 77a2eeb9..3df49a8f 100644 --- a/dataset_reader/json_reader.py +++ b/dataset_reader/json_reader.py @@ -58,13 +58,18 @@ def read_queries(self) -> Iterator[Query]: ): # ToDo: add meta_conditions - yield Query(vector=vector, meta_conditions=None, expected_result=neighbours) + yield Query( + vector=vector, + sparse_vector=None, + meta_conditions=None, + expected_result=neighbours, + ) def read_data(self) -> Iterator[Record]: for idx, (vector, payload) in enumerate( zip(self.read_vectors(), self.read_payloads()) ): - yield Record(id=idx, vector=vector, metadata=payload) + yield 
Record(id=idx, vector=vector, sparse_vector=None, metadata=payload)
 if __name__ == "__main__":
diff --git a/dataset_reader/sparse_reader.py b/dataset_reader/sparse_reader.py
new file mode 100644
index 00000000..fb2af5d9
--- /dev/null
+++ b/dataset_reader/sparse_reader.py
@@ -0,0 +1,100 @@
+import os
+from pathlib import Path
+from typing import Iterator, List, Tuple, Union
+
+import numpy as np
+
+from dataset_reader.base_reader import BaseReader, Query, Record, SparseVector
+
+
+def read_sparse_matrix_fields(
+    filename: Union[Path, str]
+) -> Tuple[np.array, np.array, np.array]:
+    """Read the fields of a CSR matrix without instantiating it"""
+
+    with open(filename, "rb") as f:
+        sizes = np.fromfile(f, dtype="int64", count=3)
+        n_row, n_col, n_non_zero = sizes
+        index_pointer = np.fromfile(f, dtype="int64", count=n_row + 1)
+        assert n_non_zero == index_pointer[-1]
+        columns = np.fromfile(f, dtype="int32", count=n_non_zero)
+        assert np.all(columns >= 0) and np.all(columns < n_col)
+        values = np.fromfile(f, dtype="float32", count=n_non_zero)
+        return values, columns, index_pointer
+
+
+def csr_to_sparse_vectors(
+    values: List[float], columns: List[int], index_pointer: List[int]
+) -> Iterator[SparseVector]:
+    num_rows = len(index_pointer) - 1
+
+    for i in range(num_rows):
+        start = index_pointer[i]
+        end = index_pointer[i + 1]
+        row_values, row_indices = [], []
+        for j in range(start, end):
+            row_values.append(values[j])
+            row_indices.append(columns[j])
+        yield SparseVector(indices=row_indices, values=row_values)
+
+
+def read_csr_matrix(filename: Union[Path, str]) -> Iterator[SparseVector]:
+    """Read a CSR matrix in spmat format"""
+    values, columns, index_pointer = read_sparse_matrix_fields(filename)
+    values = values.tolist()
+    columns = columns.tolist()
+    index_pointer = index_pointer.tolist()
+
+    yield from csr_to_sparse_vectors(values, columns, index_pointer)
+
+
+def knn_result_read(
+    filename: Union[Path, str]
+) -> Tuple[List[List[int]], List[List[float]]]:
+    n, d = map(int, np.fromfile(filename, dtype="uint32", count=2))
+    assert os.stat(filename).st_size == 8 + n * d * (4 + 4)
+    with open(filename, "rb") as f:
+        f.seek(4 + 4)
+        ids = np.fromfile(f, dtype="int32", count=n * d).reshape(n, d).tolist()
+        scores = np.fromfile(f, dtype="float32", count=n * d).reshape(n, d).tolist()
+        return ids, scores
+
+
+class SparseReader(BaseReader):
+    def __init__(self, path, normalize=False):
+        self.path = path
+        self.normalize = normalize
+
+    def read_queries(self) -> Iterator[Query]:
+        queries_path = self.path / "queries.csr"
+        X = read_csr_matrix(queries_path)
+
+        gt_path = self.path / "results.gt"
+        gt_indices, _ = knn_result_read(gt_path)
+
+        for i, sparse_vector in enumerate(X):
+            yield Query(
+                vector=None,
+                sparse_vector=sparse_vector,
+                meta_conditions=None,
+                expected_result=gt_indices[i],
+            )
+
+    def read_data(self) -> Iterator[Record]:
+        data_path = self.path / "data.csr"
+        X = read_csr_matrix(data_path)
+
+        for i, sparse_vector in enumerate(X):
+            yield Record(id=i, vector=None, sparse_vector=sparse_vector, metadata=None)
+
+
+if __name__ == "__main__":
+    vals = [1, 3, 2, 3, 6, 4, 5]
+    cols = [0, 2, 2, 1, 3, 0, 2]
+    pointers = [0, 2, 3, 5, 7]
+    vecs = [vec for vec in csr_to_sparse_vectors(vals, cols, pointers)]
+
+    assert vecs[0] == SparseVector(indices=[0, 2], values=[1, 3])
+    assert vecs[1] == SparseVector(indices=[2], values=[2])
+    assert vecs[2] == SparseVector(indices=[1, 3], values=[3, 6])
+    assert vecs[3] == SparseVector(indices=[0, 2], values=[4, 5])
diff --git
a/datasets/datasets.json b/datasets/datasets.json index 7a165eb6..f2e646de 100644 --- a/datasets/datasets.json +++ b/datasets/datasets.json @@ -66,6 +66,18 @@ "path": "dbpedia-openai-1M-1536-angular/dbpedia_openai_1M", "link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz" }, + { + "name": "msmarco-sparse-100K", + "type": "sparse", + "path": "msmarco-sparse/100K", + "link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/msmacro-sparse-100K.tar.gz" + }, + { + "name": "msmarco-sparse-1M", + "type": "sparse", + "path": "msmarco-sparse/1M", + "link": "https://storage.googleapis.com/ann-filtered-benchmark/datasets/msmacro-sparse-1M.tar.gz" + }, { "name": "h-and-m-2048-angular-filters", "vector_size": 2048, diff --git a/engine/base_client/__init__.py b/engine/base_client/__init__.py index a5495554..5528bb47 100644 --- a/engine/base_client/__init__.py +++ b/engine/base_client/__init__.py @@ -6,3 +6,12 @@ class IncompatibilityError(Exception): pass + + +__all__ = [ + "BaseClient", + "BaseConfigurator", + "BaseSearcher", + "BaseUploader", + "IncompatibilityError", +] diff --git a/engine/base_client/client.py b/engine/base_client/client.py index 0f262d34..def2f53b 100644 --- a/engine/base_client/client.py +++ b/engine/base_client/client.py @@ -1,7 +1,6 @@ import json import os from datetime import datetime -from pathlib import Path from typing import List from benchmark import ROOT_DIR @@ -31,6 +30,10 @@ def __init__( self.searchers = searchers self.engine = engine + @property + def sparse_vector_support(self): + return self.configurator.SPARSE_VECTOR_SUPPORT + def save_search_results( self, dataset_name: str, results: dict, search_id: int, search_params: dict ): diff --git a/engine/base_client/configure.py b/engine/base_client/configure.py index 1a4aaae8..6702f49e 100644 --- a/engine/base_client/configure.py +++ b/engine/base_client/configure.py @@ -4,6 +4,7 @@ class BaseConfigurator: + SPARSE_VECTOR_SUPPORT: bool = False DISTANCE_MAPPING = {} def __init__(self, host, collection_params: dict, connection_params: dict): diff --git a/engine/base_client/search.py b/engine/base_client/search.py index 93368a3f..3626191e 100644 --- a/engine/base_client/search.py +++ b/engine/base_client/search.py @@ -30,13 +30,11 @@ def get_mp_start_method(cls): return None @classmethod - def search_one( - cls, vector: List[float], meta_conditions, top: Optional[int] - ) -> List[Tuple[int, float]]: + def search_one(cls, query: Query, top: Optional[int]) -> List[Tuple[int, float]]: raise NotImplementedError() @classmethod - def _search_one(cls, query, top: Optional[int] = None): + def _search_one(cls, query: Query, top: Optional[int] = None): if top is None: top = ( len(query.expected_result) @@ -45,7 +43,7 @@ def _search_one(cls, query, top: Optional[int] = None): ) start = time.perf_counter() - search_res = cls.search_one(query.vector, query.meta_conditions, top) + search_res = cls.search_one(query, top) end = time.perf_counter() precision = 1.0 diff --git a/engine/base_client/upload.py b/engine/base_client/upload.py index d9d53d94..55ee4055 100644 --- a/engine/base_client/upload.py +++ b/engine/base_client/upload.py @@ -1,6 +1,6 @@ import time from multiprocessing import get_context -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List import tqdm @@ -80,12 +80,9 @@ def upload( } @classmethod - def _upload_batch( - cls, batch: Tuple[List[int], List[list], List[Optional[dict]]] - ) -> float: - ids, vectors, metadata = batch + def 
_upload_batch(cls, batch: List[Record]) -> float: start = time.perf_counter() - cls.upload_batch(ids, vectors, metadata) + cls.upload_batch(batch) return time.perf_counter() - start @classmethod @@ -93,9 +90,7 @@ def post_upload(cls, distance): return {} @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: List[Optional[dict]] - ): + def upload_batch(cls, batch: List[Record]): raise NotImplementedError() @classmethod diff --git a/engine/base_client/utils.py b/engine/base_client/utils.py index 4b6b8ad5..1b0da967 100644 --- a/engine/base_client/utils.py +++ b/engine/base_client/utils.py @@ -1,20 +1,16 @@ -from typing import Any, Iterable +from typing import Iterable, List from dataset_reader.base_reader import Record -def iter_batches(records: Iterable[Record], n: int) -> Iterable[Any]: - ids = [] - vectors = [] - metadata = [] +def iter_batches(records: Iterable[Record], n: int) -> Iterable[List[Record]]: + batch = [] for record in records: - ids.append(record.id) - vectors.append(record.vector) - metadata.append(record.metadata) + batch.append(record) - if len(vectors) >= n: - yield [ids, vectors, metadata] - ids, vectors, metadata = [], [], [] - if len(ids) > 0: - yield [ids, vectors, metadata] + if len(batch) >= n: + yield batch + batch = [] + if len(batch) > 0: + yield batch diff --git a/engine/clients/elasticsearch/__init__.py b/engine/clients/elasticsearch/__init__.py index 24288e97..c1802087 100644 --- a/engine/clients/elasticsearch/__init__.py +++ b/engine/clients/elasticsearch/__init__.py @@ -1,3 +1,9 @@ from engine.clients.elasticsearch.configure import ElasticConfigurator from engine.clients.elasticsearch.search import ElasticSearcher from engine.clients.elasticsearch.upload import ElasticUploader + +__all__ = [ + "ElasticConfigurator", + "ElasticSearcher", + "ElasticUploader", +] diff --git a/engine/clients/elasticsearch/configure.py b/engine/clients/elasticsearch/configure.py index d46166a7..446fe7c1 100644 --- a/engine/clients/elasticsearch/configure.py +++ b/engine/clients/elasticsearch/configure.py @@ -1,4 +1,4 @@ -from elasticsearch import Elasticsearch, NotFoundError +from elasticsearch import NotFoundError from benchmark.dataset import Dataset from engine.base_client import IncompatibilityError diff --git a/engine/clients/elasticsearch/search.py b/engine/clients/elasticsearch/search.py index 3b4fb3f4..4a1ee981 100644 --- a/engine/clients/elasticsearch/search.py +++ b/engine/clients/elasticsearch/search.py @@ -4,6 +4,7 @@ from elasticsearch import Elasticsearch +from dataset_reader.base_reader import Query from engine.base_client.search import BaseSearcher from engine.clients.elasticsearch.config import ELASTIC_INDEX, get_es_client from engine.clients.elasticsearch.parser import ElasticConditionParser @@ -29,15 +30,15 @@ def init_client(cls, host, _distance, connection_params: dict, search_params: di cls.search_params = search_params @classmethod - def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: + def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]: knn = { "field": "vector", - "query_vector": vector, + "query_vector": query.vector, "k": top, **cls.search_params["config"], } - meta_conditions = cls.parser.parse(meta_conditions) + meta_conditions = cls.parser.parse(query.meta_conditions) if meta_conditions: knn["filter"] = meta_conditions diff --git a/engine/clients/elasticsearch/upload.py b/engine/clients/elasticsearch/upload.py index 96a18d22..efe249d6 100644 --- 
a/engine/clients/elasticsearch/upload.py +++ b/engine/clients/elasticsearch/upload.py @@ -1,9 +1,10 @@ import multiprocessing as mp import uuid -from typing import List, Optional +from typing import List from elasticsearch import Elasticsearch +from dataset_reader.base_reader import Record from engine.base_client.upload import BaseUploader from engine.clients.elasticsearch.config import ELASTIC_INDEX, get_es_client @@ -27,19 +28,12 @@ def init_client(cls, host, _distance, connection_params, upload_params): cls.upload_params = upload_params @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: Optional[List[dict]] - ): - if metadata is None: - metadata = [{}] * len(vectors) + def upload_batch(cls, batch: List[Record]): operations = [] - for idx, vector, payload in zip(ids, vectors, metadata): - vector_id = uuid.UUID(int=idx).hex + for record in batch: + vector_id = uuid.UUID(int=record.id).hex operations.append({"index": {"_id": vector_id}}) - if payload: - operations.append({"vector": vector, **payload}) - else: - operations.append({"vector": vector}) + operations.append({"vector": record.vector, **(record.metadata or {})}) cls.client.bulk( index=ELASTIC_INDEX, diff --git a/engine/clients/milvus/__init__.py b/engine/clients/milvus/__init__.py index ca400c86..31abe17b 100644 --- a/engine/clients/milvus/__init__.py +++ b/engine/clients/milvus/__init__.py @@ -1,3 +1,9 @@ from engine.clients.milvus.configure import MilvusConfigurator from engine.clients.milvus.search import MilvusSearcher from engine.clients.milvus.upload import MilvusUploader + +__all__ = [ + "MilvusConfigurator", + "MilvusSearcher", + "MilvusUploader", +] diff --git a/engine/clients/milvus/search.py b/engine/clients/milvus/search.py index f2e56d6f..1694fc37 100644 --- a/engine/clients/milvus/search.py +++ b/engine/clients/milvus/search.py @@ -3,6 +3,7 @@ from pymilvus import Collection, connections +from dataset_reader.base_reader import Query from engine.base_client.search import BaseSearcher from engine.clients.milvus.config import ( DISTANCE_MAPPING, @@ -37,15 +38,15 @@ def get_mp_start_method(cls): return "forkserver" if "forkserver" in mp.get_all_start_methods() else "spawn" @classmethod - def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: + def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]: param = {"metric_type": cls.distance, "params": cls.search_params["config"]} try: res = cls.collection.search( - data=[vector], + data=[query.vector], anns_field="vector", param=param, limit=top, - expr=cls.parser.parse(meta_conditions), + expr=cls.parser.parse(query.meta_conditions), ) except Exception as e: import ipdb diff --git a/engine/clients/milvus/upload.py b/engine/clients/milvus/upload.py index 8f897a45..8c3768e1 100644 --- a/engine/clients/milvus/upload.py +++ b/engine/clients/milvus/upload.py @@ -1,5 +1,5 @@ import multiprocessing as mp -from typing import List, Optional +from typing import List from pymilvus import ( Collection, @@ -8,6 +8,7 @@ wait_for_index_building_complete, ) +from dataset_reader.base_reader import Record from engine.base_client.upload import BaseUploader from engine.clients.milvus.config import ( DISTANCE_MAPPING, @@ -41,20 +42,26 @@ def init_client(cls, host, distance, connection_params, upload_params): cls.distance = DISTANCE_MAPPING[distance] @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: Optional[List[dict]] - ): - if metadata is not None: + def upload_batch(cls, batch: 
List[Record]): + has_metadata = any(record.metadata for record in batch) + if has_metadata: field_values = [ [ - payload.get(field_schema.name) or DTYPE_DEFAULT[field_schema.dtype] - for payload in metadata + record.metadata.get(field_schema.name) + or DTYPE_DEFAULT[field_schema.dtype] + for record in batch ] for field_schema in cls.collection.schema.fields if field_schema.name not in ["id", "vector"] ] else: field_values = [] + + ids, vectors = [], [] + for record in batch: + ids.append(record.id) + vectors.append(record.vector) + cls.collection.insert([ids, vectors] + field_values) @classmethod diff --git a/engine/clients/opensearch/__init__.py b/engine/clients/opensearch/__init__.py index 686bfcde..e4c6c59a 100644 --- a/engine/clients/opensearch/__init__.py +++ b/engine/clients/opensearch/__init__.py @@ -1,3 +1,9 @@ from engine.clients.opensearch.configure import OpenSearchConfigurator from engine.clients.opensearch.search import OpenSearchSearcher from engine.clients.opensearch.upload import OpenSearchUploader + +__all__ = [ + "OpenSearchConfigurator", + "OpenSearchSearcher", + "OpenSearchUploader", +] diff --git a/engine/clients/opensearch/search.py b/engine/clients/opensearch/search.py index 364fc0cc..a3e36058 100644 --- a/engine/clients/opensearch/search.py +++ b/engine/clients/opensearch/search.py @@ -4,6 +4,7 @@ from opensearchpy import OpenSearch +from dataset_reader.base_reader import Query from engine.base_client.search import BaseSearcher from engine.clients.opensearch.config import ( OPENSEARCH_INDEX, @@ -46,21 +47,21 @@ def init_client(cls, host, distance, connection_params: dict, search_params: dic cls.search_params = search_params @classmethod - def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: - query = { + def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]: + opensearch_query = { "knn": { "vector": { - "vector": vector, + "vector": query.vector, "k": top, } } } - meta_conditions = cls.parser.parse(meta_conditions) + meta_conditions = cls.parser.parse(query.meta_conditions) if meta_conditions: - query = { + opensearch_query = { "bool": { - "must": [query], + "must": [opensearch_query], "filter": meta_conditions, } } @@ -68,7 +69,7 @@ def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: res = cls.client.search( index=OPENSEARCH_INDEX, body={ - "query": query, + "query": opensearch_query, "size": top, }, params={ diff --git a/engine/clients/opensearch/upload.py b/engine/clients/opensearch/upload.py index 46a7151d..0bc2427e 100644 --- a/engine/clients/opensearch/upload.py +++ b/engine/clients/opensearch/upload.py @@ -1,9 +1,10 @@ import multiprocessing as mp import uuid -from typing import List, Optional +from typing import List from opensearchpy import OpenSearch +from dataset_reader.base_reader import Record from engine.base_client.upload import BaseUploader from engine.clients.opensearch.config import ( OPENSEARCH_INDEX, @@ -44,19 +45,12 @@ def init_client(cls, host, distance, connection_params, upload_params): cls.upload_params = upload_params @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: Optional[List[dict]] - ): - if metadata is None: - metadata = [{}] * len(vectors) + def upload_batch(cls, batch: List[Record]): operations = [] - for idx, vector, payload in zip(ids, vectors, metadata): - vector_id = uuid.UUID(int=idx).hex + for record in batch: + vector_id = uuid.UUID(int=record.id).hex operations.append({"index": {"_id": vector_id}}) - if payload: - 
operations.append({"vector": vector, **payload}) - else: - operations.append({"vector": vector}) + operations.append({"vector": record.vector, **(record.metadata or {})}) cls.client.bulk( index=OPENSEARCH_INDEX, diff --git a/engine/clients/pgvector/configure.py b/engine/clients/pgvector/configure.py index 0da692b2..9f09899d 100644 --- a/engine/clients/pgvector/configure.py +++ b/engine/clients/pgvector/configure.py @@ -32,7 +32,6 @@ def recreate(self, dataset: Dataset, collection_params): );""" ) self.conn.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN") - self.conn.close() def delete_client(self): diff --git a/engine/clients/pgvector/search.py b/engine/clients/pgvector/search.py index 91bcd5c9..1e26dfc5 100644 --- a/engine/clients/pgvector/search.py +++ b/engine/clients/pgvector/search.py @@ -1,10 +1,10 @@ -import multiprocessing as mp from typing import List, Tuple import numpy as np import psycopg from pgvector.psycopg import register_vector +from dataset_reader.base_reader import Query from engine.base_client.distances import Distance from engine.base_client.search import BaseSearcher from engine.clients.pgvector.config import get_db_config @@ -25,15 +25,18 @@ def init_client(cls, host, distance, connection_params: dict, search_params: dic cls.cur = cls.conn.cursor() cls.cur.execute(f"SET hnsw.ef_search = {search_params['config']['hnsw_ef']}") if distance == Distance.COSINE: - cls.query = f"SELECT id, embedding <=> %s AS _score FROM items ORDER BY _score LIMIT %s" + cls.query = "SELECT id, embedding <=> %s AS _score FROM items ORDER BY _score LIMIT %s" elif distance == Distance.L2: - cls.query = f"SELECT id, embedding <-> %s AS _score FROM items ORDER BY _score LIMIT %s" + cls.query = "SELECT id, embedding <-> %s AS _score FROM items ORDER BY _score LIMIT %s" else: raise NotImplementedError(f"Unsupported distance metric {cls.distance}") @classmethod - def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: - cls.cur.execute(cls.query, (np.array(vector), top), binary=True, prepare=True) + def search_one(cls, query: Query, top) -> List[Tuple[int, float]]: + # TODO: Use query.metaconditions for datasets with filtering + cls.cur.execute( + cls.query, (np.array(query.vector), top), binary=True, prepare=True + ) return cls.cur.fetchall() @classmethod diff --git a/engine/clients/pgvector/upload.py b/engine/clients/pgvector/upload.py index c3921e95..fe0027c5 100644 --- a/engine/clients/pgvector/upload.py +++ b/engine/clients/pgvector/upload.py @@ -1,9 +1,11 @@ -from typing import List, Optional +from typing import List import numpy as np import psycopg from pgvector.psycopg import register_vector +from dataset_reader.base_reader import Record +from engine.base_client import IncompatibilityError from engine.base_client.distances import Distance from engine.base_client.upload import BaseUploader from engine.clients.pgvector.config import get_db_config @@ -26,11 +28,13 @@ def init_client(cls, host, distance, connection_params, upload_params): cls.upload_params = upload_params @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: Optional[List[dict]] - ): - vectors = np.array(vectors) + def upload_batch(cls, batch: List[Record]): + ids, vectors = [], [] + for record in batch: + ids.append(record.id) + vectors.append(record.vector) + vectors = np.array(vectors) # Copy is faster than insert with cls.cur.copy( "COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)" diff --git a/engine/clients/qdrant/__init__.py 
b/engine/clients/qdrant/__init__.py index 03642803..2c95ffc8 100644 --- a/engine/clients/qdrant/__init__.py +++ b/engine/clients/qdrant/__init__.py @@ -1,3 +1,9 @@ from engine.clients.qdrant.configure import QdrantConfigurator from engine.clients.qdrant.search import QdrantSearcher from engine.clients.qdrant.upload import QdrantUploader + +__all__ = [ + "QdrantConfigurator", + "QdrantSearcher", + "QdrantUploader", +] diff --git a/engine/clients/qdrant/configure.py b/engine/clients/qdrant/configure.py index 36e4f007..668914b8 100644 --- a/engine/clients/qdrant/configure.py +++ b/engine/clients/qdrant/configure.py @@ -8,6 +8,7 @@ class QdrantConfigurator(BaseConfigurator): + SPARSE_VECTOR_SUPPORT = True DISTANCE_MAPPING = { Distance.L2: rest.Distance.EUCLID, Distance.COSINE: rest.Distance.COSINE, @@ -30,12 +31,30 @@ def clean(self): self.client.delete_collection(collection_name=QDRANT_COLLECTION_NAME) def recreate(self, dataset: Dataset, collection_params): + if dataset.config.type == "sparse": + vectors_config = { + "vectors_config": {}, + "sparse_vectors_config": { + "sparse": rest.SparseVectorParams( + index=rest.SparseIndexParams( + on_disk=False, + ) + ) + }, + } + else: + vectors_config = { + "vectors_config": ( + rest.VectorParams( + size=dataset.config.vector_size, + distance=self.DISTANCE_MAPPING.get(dataset.config.distance), + ) + ) + } + self.client.recreate_collection( collection_name=QDRANT_COLLECTION_NAME, - vectors_config=rest.VectorParams( - size=dataset.config.vector_size, - distance=self.DISTANCE_MAPPING.get(dataset.config.distance), - ), + **vectors_config, **self.collection_params ) self.client.update_collection( diff --git a/engine/clients/qdrant/search.py b/engine/clients/qdrant/search.py index 411f889b..1c1d2a84 100644 --- a/engine/clients/qdrant/search.py +++ b/engine/clients/qdrant/search.py @@ -1,11 +1,12 @@ -import multiprocessing as mp import os from typing import List, Tuple import httpx from qdrant_client import QdrantClient +from qdrant_client._pydantic_compat import construct from qdrant_client.http import models as rest +from dataset_reader.base_reader import Query from engine.base_client.search import BaseSearcher from engine.clients.qdrant.config import QDRANT_COLLECTION_NAME from engine.clients.qdrant.parser import QdrantConditionParser @@ -34,11 +35,25 @@ def init_client(cls, host, distance, connection_params: dict, search_params: dic # return "forkserver" if "forkserver" in mp.get_all_start_methods() else "spawn" @classmethod - def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: + def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]: + # Can query only one till we introduce re-ranking in the benchmarks + if query.sparse_vector is None: + query_vector = query.vector + else: + query_vector = construct( + rest.NamedSparseVector, + name="sparse", + vector=construct( + rest.SparseVector, + indices=query.sparse_vector.indices, + values=query.sparse_vector.values, + ), + ) + res = cls.client.search( collection_name=QDRANT_COLLECTION_NAME, - query_vector=vector, - query_filter=cls.parser.parse(meta_conditions), + query_vector=query_vector, + query_filter=cls.parser.parse(query.meta_conditions), limit=top, search_params=rest.SearchParams(**cls.search_params.get("config", {})), ) diff --git a/engine/clients/qdrant/upload.py b/engine/clients/qdrant/upload.py index 32b1a26f..a5c2dbbe 100644 --- a/engine/clients/qdrant/upload.py +++ b/engine/clients/qdrant/upload.py @@ -1,10 +1,17 @@ import os import time -from typing 
import List, Optional +from typing import List from qdrant_client import QdrantClient -from qdrant_client.http.models import Batch, CollectionStatus, OptimizersConfigDiff +from qdrant_client._pydantic_compat import construct +from qdrant_client.http.models import ( + Batch, + CollectionStatus, + OptimizersConfigDiff, + SparseVector, +) +from dataset_reader.base_reader import Record from engine.base_client.upload import BaseUploader from engine.clients.qdrant.config import QDRANT_COLLECTION_NAME @@ -21,15 +28,30 @@ def init_client(cls, host, distance, connection_params, upload_params): cls.upload_params = upload_params @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: Optional[List[dict]] - ): - cls.client.upsert( + def upload_batch(cls, batch: List[Record]): + ids, vectors, payloads = [], [], [] + for point in batch: + if point.sparse_vector is None: + vector = point.vector + else: + vector = { + "sparse": construct( + SparseVector, + indices=point.sparse_vector.indices, + values=point.sparse_vector.values, + ) + } + + ids.append(point.id) + vectors.append(vector) + payloads.append(point.metadata or {}) + + _ = cls.client.upsert( collection_name=QDRANT_COLLECTION_NAME, points=Batch.model_construct( ids=ids, vectors=vectors, - payloads=[payload or {} for payload in metadata], + payloads=payloads, ), wait=False, ) diff --git a/engine/clients/redis/__init__.py b/engine/clients/redis/__init__.py index a1437747..75f3b150 100644 --- a/engine/clients/redis/__init__.py +++ b/engine/clients/redis/__init__.py @@ -1,3 +1,9 @@ from engine.clients.redis.configure import RedisConfigurator from engine.clients.redis.search import RedisSearcher from engine.clients.redis.upload import RedisUploader + +__all__ = [ + "RedisConfigurator", + "RedisSearcher", + "RedisUploader", +] diff --git a/engine/clients/redis/configure.py b/engine/clients/redis/configure.py index ccf3776c..a5e6fe82 100644 --- a/engine/clients/redis/configure.py +++ b/engine/clients/redis/configure.py @@ -36,24 +36,24 @@ class RedisConfigurator(BaseConfigurator): def __init__(self, host, collection_params: dict, connection_params: dict): super().__init__(host, collection_params, connection_params) redis_constructor = RedisCluster if REDIS_CLUSTER else Redis - self._is_cluster = True if REDIS_CLUSTER else False + self.is_cluster = REDIS_CLUSTER self.client = redis_constructor( host=host, port=REDIS_PORT, password=REDIS_AUTH, username=REDIS_USER ) def clean(self): conns = [self.client] - if self._is_cluster: + if self.is_cluster: conns = [ self.client.get_redis_connection(node) for node in self.client.get_primaries() ] for conn in conns: - index = conn.ft() + search_namespace = conn.ft() try: - index.dropindex(delete_documents=True) + search_namespace.dropindex(delete_documents=True) except redis.ResponseError as e: - if "Unknown Index name" not in e.__str__(): + if "Unknown Index name" not in str(e): print(e) def recreate(self, dataset: Dataset, collection_params): @@ -90,7 +90,7 @@ def recreate(self, dataset: Dataset, collection_params): ] + payload_fields conns = [self.client] - if self._is_cluster: + if self.is_cluster: conns = [ self.client.get_redis_connection(node) for node in self.client.get_primaries() @@ -100,7 +100,7 @@ def recreate(self, dataset: Dataset, collection_params): try: search_namespace.create_index(fields=index_fields) except redis.ResponseError as e: - if "Index already exists" not in e.__str__(): + if "Index already exists" not in str(e): raise e diff --git 
a/engine/clients/redis/search.py b/engine/clients/redis/search.py index 5d5858df..1dbb4c66 100644 --- a/engine/clients/redis/search.py +++ b/engine/clients/redis/search.py @@ -1,10 +1,12 @@ import random -from typing import List, Tuple +from typing import List, Tuple, Union import numpy as np from redis import Redis, RedisCluster -from redis.commands.search.query import Query +from redis.commands.search import Search as RedisSearchIndex +from redis.commands.search.query import Query as RedisQuery +from dataset_reader.base_reader import Query as DatasetQuery from engine.base_client.search import BaseSearcher from engine.clients.redis.config import ( REDIS_AUTH, @@ -18,8 +20,13 @@ class RedisSearcher(BaseSearcher): search_params = {} - client = None + client: Union[RedisCluster, Redis] = None parser = RedisConditionParser() + knn_conditions = "EF_RUNTIME $EF" + + is_cluster: bool + conns: List[Union[RedisCluster, Redis]] + search_namespace: RedisSearchIndex @classmethod def init_client(cls, host, distance, connection_params: dict, search_params: dict): @@ -28,21 +35,23 @@ def init_client(cls, host, distance, connection_params: dict, search_params: dic host=host, port=REDIS_PORT, password=REDIS_AUTH, username=REDIS_USER ) cls.search_params = search_params - cls.knn_conditions = "EF_RUNTIME $EF" - cls._is_cluster = True if REDIS_CLUSTER else False + # In the case of CLUSTER API enabled we randomly select the starting primary shard # when doing the client initialization to evenly distribute the load among the cluster - cls.conns = [cls.client] - if cls._is_cluster: + if REDIS_CLUSTER: cls.conns = [ cls.client.get_redis_connection(node) for node in cls.client.get_primaries() ] - cls._ft = cls.conns[random.randint(0, len(cls.conns)) - 1].ft() + else: + cls.conns = [cls.client] + + cls.is_cluster = REDIS_CLUSTER + cls.search_namespace = random.choice(cls.conns).ft() @classmethod - def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: - conditions = cls.parser.parse(meta_conditions) + def search_one(cls, query: DatasetQuery, top: int) -> List[Tuple[int, float]]: + conditions = cls.parser.parse(query.meta_conditions) if conditions is None: prefilter_condition = "*" params = {} @@ -50,7 +59,7 @@ def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: prefilter_condition, params = conditions q = ( - Query( + RedisQuery( f"{prefilter_condition}=>[KNN $K @vector $vec_param {cls.knn_conditions} AS vector_score]" ) .sort_by("vector_score", asc=True) @@ -62,11 +71,11 @@ def search_one(cls, vector, meta_conditions, top) -> List[Tuple[int, float]]: .timeout(REDIS_QUERY_TIMEOUT) ) params_dict = { - "vec_param": np.array(vector).astype(np.float32).tobytes(), + "vec_param": np.array(query.vector).astype(np.float32).tobytes(), "K": top, **cls.search_params["config"], **params, } - results = cls._ft.search(q, query_params=params_dict) + results = cls.search_namespace.search(q, query_params=params_dict) return [(int(result.id), float(result.vector_score)) for result in results.docs] diff --git a/engine/clients/redis/upload.py b/engine/clients/redis/upload.py index 89bc0a3b..cd4b888b 100644 --- a/engine/clients/redis/upload.py +++ b/engine/clients/redis/upload.py @@ -1,8 +1,9 @@ -from typing import List, Optional +from typing import List import numpy as np from redis import Redis, RedisCluster +from dataset_reader.base_reader import Record from engine.base_client.upload import BaseUploader from engine.clients.redis.config import ( REDIS_AUTH, @@ -26,14 +27,12 @@ def 
init_client(cls, host, distance, connection_params, upload_params): cls.upload_params = upload_params @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: Optional[List[dict]] - ): + def upload_batch(cls, batch: List[Record]): p = cls.client.pipeline(transaction=False) - for i in range(len(ids)): - idx = ids[i] - vec = vectors[i] - meta = metadata[i] if metadata else {} + for record in batch: + idx = record.id + vec = record.vector + meta = record.metadata or {} geopoints = {} payload = {} if meta is not None: diff --git a/engine/clients/weaviate/__init__.py b/engine/clients/weaviate/__init__.py index d8d90121..2e8abba5 100644 --- a/engine/clients/weaviate/__init__.py +++ b/engine/clients/weaviate/__init__.py @@ -1,3 +1,9 @@ from engine.clients.weaviate.configure import WeaviateConfigurator from engine.clients.weaviate.search import WeaviateSearcher from engine.clients.weaviate.upload import WeaviateUploader + +__all__ = [ + "WeaviateConfigurator", + "WeaviateSearcher", + "WeaviateUploader", +] diff --git a/engine/clients/weaviate/search.py b/engine/clients/weaviate/search.py index 74fa926e..aa3ac9ce 100644 --- a/engine/clients/weaviate/search.py +++ b/engine/clients/weaviate/search.py @@ -1,4 +1,3 @@ -import uuid from typing import List, Tuple from weaviate import WeaviateClient @@ -7,6 +6,7 @@ from weaviate.collections import Collection from weaviate.connect import ConnectionParams +from dataset_reader.base_reader import Query from engine.base_client.search import BaseSearcher from engine.clients.weaviate.config import WEAVIATE_CLASS_NAME, WEAVIATE_DEFAULT_PORT from engine.clients.weaviate.parser import WeaviateConditionParser @@ -32,10 +32,10 @@ def init_client(cls, host, distance, connection_params: dict, search_params: dic cls.client = client @classmethod - def search_one(self, vector, meta_conditions, top) -> List[Tuple[int, float]]: - res = self.collection.query.near_vector( - near_vector=vector, - filters=self.parser.parse(meta_conditions), + def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]: + res = cls.collection.query.near_vector( + near_vector=query.vector, + filters=cls.parser.parse(query.meta_conditions), limit=top, return_metadata=MetadataQuery(distance=True), return_properties=[], diff --git a/engine/clients/weaviate/upload.py b/engine/clients/weaviate/upload.py index ad52f64f..9715ad1d 100644 --- a/engine/clients/weaviate/upload.py +++ b/engine/clients/weaviate/upload.py @@ -1,10 +1,11 @@ import uuid -from typing import List, Optional +from typing import List from weaviate import WeaviateClient from weaviate.classes.data import DataObject from weaviate.connect import ConnectionParams +from dataset_reader.base_reader import Record from engine.base_client.upload import BaseUploader from engine.clients.weaviate.config import WEAVIATE_CLASS_NAME, WEAVIATE_DEFAULT_PORT @@ -28,14 +29,14 @@ def init_client(cls, host, distance, connection_params, upload_params): ) @classmethod - def upload_batch( - cls, ids: List[int], vectors: List[list], metadata: List[Optional[dict]] - ): + def upload_batch(cls, batch: List[Record]): objects = [] - for i in range(len(ids)): - id = uuid.UUID(int=ids[i]) - property = metadata[i] or {} - objects.append(DataObject(properties=property, vector=vectors[i], uuid=id)) + for record in batch: + _id = uuid.UUID(int=record.id) + _property = record.metadata or {} + objects.append( + DataObject(properties=_property, vector=record.vector, uuid=_id) + ) if len(objects) > 0: 
cls.collection.data.insert_many(objects) diff --git a/engine/servers/redis-single-node/docker-compose.yaml b/engine/servers/redis-single-node/docker-compose.yaml index 040182e7..5604bb6b 100644 --- a/engine/servers/redis-single-node/docker-compose.yaml +++ b/engine/servers/redis-single-node/docker-compose.yaml @@ -4,7 +4,7 @@ services: redis: image: redislabs/redisearch:2.8.8 ports: - - '6380:6379' + - '6379:6379' logging: driver: "json-file" options: diff --git a/experiments/configurations/qdrant-single-node.json b/experiments/configurations/qdrant-single-node.json index ec75a30e..b30bc0e5 100644 --- a/experiments/configurations/qdrant-single-node.json +++ b/experiments/configurations/qdrant-single-node.json @@ -45,6 +45,25 @@ ], "upload_params": { "parallel": 16, "batch_size": 1024 } }, + { + "name": "qdrant-sparse-vector", + "engine": "qdrant", + "connection_params": { "timeout": 30 }, + "collection_params": { + "optimizers_config": { + "max_segment_size": 1000000, + "default_segment_number": 3, + "memmap_threshold": 10000000 + } + }, + "search_params": [ + { + "parallel": 8, + "search_params": {} + } + ], + "upload_params": { "parallel": 16, "batch_size": 1024 } + }, { "name": "qdrant-parallel", "engine": "qdrant", diff --git a/experiments/configurations/redis-single-node.json b/experiments/configurations/redis-single-node.json index e6e58bf5..2807e288 100644 --- a/experiments/configurations/redis-single-node.json +++ b/experiments/configurations/redis-single-node.json @@ -11,6 +11,19 @@ ], "upload_params": { "parallel": 16, "batch_size": 1024 } }, + { + "name": "redis-m-16-ef-128", + "engine": "redis", + "connection_params": {}, + "collection_params": { + "hnsw_config": { "M": 16, "EF_CONSTRUCTION": 128 } + }, + "search_params": [ + { "parallel": 1, "config": { "EF": 64 } }, { "parallel": 1, "config": { "EF": 128 } }, { "parallel": 1, "config": { "EF": 256 } }, { "parallel": 1, "config": { "EF": 512 } }, + { "parallel": 100, "config": { "EF": 64 } }, { "parallel": 100, "config": { "EF": 128 } }, { "parallel": 100, "config": { "EF": 256 } }, { "parallel": 100, "config": { "EF": 512 } } + ], + "upload_params": { "parallel": 16 } + }, { "name": "redis-m-32-ef-128", "engine": "redis", diff --git a/poetry.lock b/poetry.lock index a16f1a2c..7ca44280 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -11,9 +11,6 @@ files = [ {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} - [[package]] name = "anyio" version = "4.3.0" @@ -197,34 +194,6 @@ files = [ {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, ] -[[package]] -name = "backports-zoneinfo" -version = "0.2.1" -description = "Backport of the standard library zoneinfo module" -optional = false -python-versions = ">=3.6" -files = [ - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-win32.whl", hash = "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-win32.whl", hash = "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-win32.whl", hash = "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6"}, - {file = "backports.zoneinfo-0.2.1.tar.gz", hash = "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"}, -] - -[package.extras] -tzdata = ["tzdata"] - [[package]] name = "certifi" version = "2024.2.2" @@ -1207,8 +1176,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ 
-1403,7 +1372,6 @@ files = [ ] [package.dependencies] -"backports.zoneinfo" = {version = ">=0.2.0", markers = "python_version < \"3.9\""} psycopg-binary = {version = "3.1.18", optional = true, markers = "implementation_name != \"pypy\" and extra == \"binary\""} typing-extensions = ">=4.1" tzdata = {version = "*", markers = "sys_platform == \"win32\""} @@ -1756,7 +1724,6 @@ azure-storage-blob = "*" environs = "<=9.5.0" grpcio = ">=1.49.1,<=1.60.0" minio = ">=7.0.0" -numpy = {version = "<1.25.0", markers = "python_version <= \"3.8\""} pandas = ">=1.2.4" protobuf = ">=3.20.0" pyarrow = ">=12.0.0" @@ -1860,6 +1827,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -1867,8 +1835,16 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = 
"PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -1885,6 +1861,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -1892,6 +1869,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -1959,6 +1937,48 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "scipy" +version = "1.12.0" +description = "Fundamental algorithms for scientific computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "scipy-1.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:78e4402e140879387187f7f25d91cc592b3501a2e51dfb320f48dfb73565f10b"}, + {file = "scipy-1.12.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:f5f00ebaf8de24d14b8449981a2842d404152774c1a1d880c901bf454cb8e2a1"}, + {file = "scipy-1.12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e53958531a7c695ff66c2e7bb7b79560ffdc562e2051644c5576c39ff8efb563"}, + {file = "scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:5e32847e08da8d895ce09d108a494d9eb78974cf6de23063f93306a3e419960c"}, + {file = "scipy-1.12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4c1020cad92772bf44b8e4cdabc1df5d87376cb219742549ef69fc9fd86282dd"}, + {file = "scipy-1.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:75ea2a144096b5e39402e2ff53a36fecfd3b960d786b7efd3c180e29c39e53f2"}, + {file = "scipy-1.12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:408c68423f9de16cb9e602528be4ce0d6312b05001f3de61fe9ec8b1263cad08"}, + {file = "scipy-1.12.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5adfad5dbf0163397beb4aca679187d24aec085343755fcdbdeb32b3679f254c"}, + {file = "scipy-1.12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3003652496f6e7c387b1cf63f4bb720951cfa18907e998ea551e6de51a04467"}, + {file = "scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b8066bce124ee5531d12a74b617d9ac0ea59245246410e19bca549656d9a40a"}, + {file = "scipy-1.12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8bee4993817e204d761dba10dbab0774ba5a8612e57e81319ea04d84945375ba"}, + {file = "scipy-1.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:a24024d45ce9a675c1fb8494e8e5244efea1c7a09c60beb1eeb80373d0fecc70"}, + {file = "scipy-1.12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e7e76cc48638228212c747ada851ef355c2bb5e7f939e10952bc504c11f4e372"}, + {file = "scipy-1.12.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f7ce148dffcd64ade37b2df9315541f9adad6efcaa86866ee7dd5db0c8f041c3"}, + {file = "scipy-1.12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c39f92041f490422924dfdb782527a4abddf4707616e07b021de33467f917bc"}, + {file = "scipy-1.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7ebda398f86e56178c2fa94cad15bf457a218a54a35c2a7b4490b9f9cb2676c"}, + {file = "scipy-1.12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:95e5c750d55cf518c398a8240571b0e0782c2d5a703250872f36eaf737751338"}, + {file = "scipy-1.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:e646d8571804a304e1da01040d21577685ce8e2db08ac58e543eaca063453e1c"}, + {file = "scipy-1.12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:913d6e7956c3a671de3b05ccb66b11bc293f56bfdef040583a7221d9e22a2e35"}, + {file = "scipy-1.12.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba1b0c7256ad75401c73e4b3cf09d1f176e9bd4248f0d3112170fb2ec4db067"}, + {file = "scipy-1.12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:730badef9b827b368f351eacae2e82da414e13cf8bd5051b4bdfd720271a5371"}, + {file = "scipy-1.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6546dc2c11a9df6926afcbdd8a3edec28566e4e785b915e849348c6dd9f3f490"}, + {file = "scipy-1.12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:196ebad3a4882081f62a5bf4aeb7326aa34b110e533aab23e4374fcccb0890dc"}, + {file = "scipy-1.12.0-cp39-cp39-win_amd64.whl", hash = "sha256:b360f1b6b2f742781299514e99ff560d1fe9bd1bff2712894b52abe528d1fd1e"}, + {file = "scipy-1.12.0.tar.gz", hash = "sha256:4bf5abab8a36d20193c698b0f1fc282c1d083c94723902c447e5d2f1780936a3"}, +] + +[package.dependencies] +numpy = ">=1.22.4,<1.29.0" + +[package.extras] +dev = ["click", "cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupytext", "matplotlib (>2)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", 
"sphinx-design (>=0.2.0)"] +test = ["asv", "gmpy2", "hypothesis", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] + [[package]] name = "setuptools" version = "69.2.0" @@ -2294,5 +2314,5 @@ validators = "0.22.0" [metadata] lock-version = "2.0" -python-versions = ">=3.8,<3.12" -content-hash = "66b915f6915c79f83165dc5fb39f363ca53c493668ff87bb5b4953fb712cd4cc" +python-versions = ">=3.9,<3.12" +content-hash = "badb7b46af420d7b474a7b6e2aa8dc926d45ab8631342eee8ab8ab808d86c90c" diff --git a/pyproject.toml b/pyproject.toml index 129a5828..92c7c527 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "vector-db-benchmark" version = "0.1.0" description = "" -authors = ["Kacper Ɓukawski "] +authors = ["Qdrant Team "] [tool.poetry.dependencies] python = ">=3.8,<3.12" @@ -21,7 +21,6 @@ tqdm = "^4.66.1" psycopg = {extras = ["binary"], version = "^3.1.17"} pgvector = "^0.2.4" - [tool.poetry.dev-dependencies] pre-commit = "^2.20.0" pytest = "^7.1" diff --git a/run.py b/run.py index 5e947a8d..4446a7e5 100644 --- a/run.py +++ b/run.py @@ -20,13 +20,13 @@ def run( host: str = "localhost", skip_upload: bool = False, skip_search: bool = False, - skip_if_exists: bool = True, + skip_if_exists: bool = False, exit_on_error: bool = True, timeout: float = 86400.0, ): """ Example: - python3 run.py --engines *-m-16-* --engines qdrant-* --datasets glove-* + python3 run.py --engines "*-m-16-*" --engines "qdrant-*" --datasets "glove-*" """ all_engines = read_engine_configs() all_datasets = read_dataset_config() @@ -46,9 +46,15 @@ def run( for dataset_name, dataset_config in selected_datasets.items(): print(f"Running experiment: {engine_name} - {dataset_name}") client = ClientFactory(host).build_client(engine_config) - dataset = Dataset(dataset_config) - dataset.download() try: + + dataset = Dataset(dataset_config) + if dataset.config.type == "sparse" and not client.sparse_vector_support: + raise IncompatibilityError( + f"{client.name} engine does not support sparse vectors" + ) + dataset.download() + with stopit.ThreadingTimeout(timeout) as tt: client.run_experiment( dataset, skip_upload, skip_search, skip_if_exists @@ -66,9 +72,11 @@ def run( ) exit(2) except IncompatibilityError as e: - print(f"Skipping {engine_name} - {dataset_name}, incompatible params") + print( + f"Skipping {engine_name} - {dataset_name}, incompatible params:", e + ) continue - except KeyboardInterrupt as e: + except KeyboardInterrupt: traceback.print_exc() exit(1) except Exception as e: diff --git a/tools/run_ci.sh b/tools/run_ci.sh index 039a6fb4..4bac3bb2 100644 --- a/tools/run_ci.sh +++ b/tools/run_ci.sh @@ -7,27 +7,6 @@ set -e SCRIPT=$(realpath "$0") SCRIPT_PATH=$(dirname "$SCRIPT") -# Set up dependencies - -sudo apt update -sudo apt install -y jq - -# Download and install hcloud - -HCVERSION=v1.36.0 - -wget https://github.com/hetznercloud/cli/releases/download/${HCVERSION}/hcloud-linux-amd64.tar.gz - -tar xzf hcloud-linux-amd64.tar.gz - -sudo mv hcloud /usr/local/bin - -# Install mc - -wget https://dl.min.io/client/mc/release/linux-amd64/mc -chmod +x mc -./mc alias set qdrant https://storage.googleapis.com "${GCS_KEY}" "${GCS_SECRET}" - bash -x "${SCRIPT_PATH}/run_remote_benchmark.sh" ./mc cp results/* qdrant/vector-search-engines-benchmark/results/ci/qdrant/ diff --git a/tools/run_experiment.sh b/tools/run_experiment.sh index a1b23d52..6bd13e69 100644 --- a/tools/run_experiment.sh +++ b/tools/run_experiment.sh @@ -1,6 +1,8 @@ #!/bin/bash 
-ENGINE_NAME=${ENGINE_NAME:-"qdrant-default"} +set -e + +ENGINE_NAME=${ENGINE_NAME:-"qdrant-continuous-benchmark"} DATASETS=${DATASETS:-""} diff --git a/tools/run_server_container.sh b/tools/run_server_container.sh index 98d6e73f..4875e83e 100644 --- a/tools/run_server_container.sh +++ b/tools/run_server_container.sh @@ -2,7 +2,7 @@ set -e -# Examples: qdrant-single-node +# Examples: qdrant-continuous-benchmarks CONTAINER_NAME=$1 CLOUD_NAME=${CLOUD_NAME:-"hetzner"} @@ -20,7 +20,7 @@ IP_OF_THE_SERVER=$(bash "${SCRIPT_PATH}/${CLOUD_NAME}/get_public_ip.sh" "$BENCH_ bash -x "${SCRIPT_PATH}/sync_servers.sh" "root@$IP_OF_THE_SERVER" -# if version is dev or if starts with "docker" or "ghcr", use container +# if version is starts with "docker" or "ghcr", use container if [[ ${QDRANT_VERSION} == docker/* ]] || [[ ${QDRANT_VERSION} == ghcr/* ]]; then if [[ ${QDRANT_VERSION} == docker/* ]]; then @@ -33,11 +33,9 @@ if [[ ${QDRANT_VERSION} == docker/* ]] || [[ ${QDRANT_VERSION} == ghcr/* ]]; the CONTAINER_REGISTRY='ghcr.io' fi - DOCKER_COMPOSE="export QDRANT_VERSION=${QDRANT_VERSION}; export CONTAINER_REGISTRY=${CONTAINER_REGISTRY}; docker compose down ; pkill qdrant ; docker rmi ${CONTAINER_REGISTRY}/qdrant/qdrant:${QDRANT_VERSION} || true ; docker compose up -d" + DOCKER_COMPOSE="export QDRANT_VERSION=${QDRANT_VERSION}; export CONTAINER_REGISTRY=${CONTAINER_REGISTRY}; docker compose down; pkill qdrant ; docker rmi ${CONTAINER_REGISTRY}/qdrant/qdrant:${QDRANT_VERSION} || true ; docker compose up -d; docker container ls" ssh -t "${SERVER_USERNAME}@${IP_OF_THE_SERVER}" "cd ./projects/vector-db-benchmark/engine/servers/${CONTAINER_NAME} ; $DOCKER_COMPOSE" else - # else run natively in the server - DOCKER_QDRANT_STOP="docker stop qdrant-continuous || true" - QDRANT_BUILD="source ~/.cargo/env; git fetch --tags; git checkout ${QDRANT_VERSION}; git pull; mold -run cargo run --bin qdrant --release" - ssh -t "${SERVER_USERNAME}@${IP_OF_THE_SERVER}" "cd ./projects/qdrant; ${DOCKER_QDRANT_STOP}; $QDRANT_BUILD" + echo "Error: unknown version ${QDRANT_VERSION}. Version name should start with 'docker/' or 'ghcr/'" + exit 1 fi diff --git a/tools/setup_ci.sh b/tools/setup_ci.sh new file mode 100755 index 00000000..f24321d5 --- /dev/null +++ b/tools/setup_ci.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Set up dependencies + +sudo apt update +sudo apt install -y jq + +# Download and install hcloud + +HCVERSION=v1.36.0 + +wget https://github.com/hetznercloud/cli/releases/download/${HCVERSION}/hcloud-linux-amd64.tar.gz + +tar xzf hcloud-linux-amd64.tar.gz + +sudo mv hcloud /usr/local/bin + +# Install mc + +wget https://dl.min.io/client/mc/release/linux-amd64/mc +chmod +x mc +./mc alias set qdrant https://storage.googleapis.com "${GCS_KEY}" "${GCS_SECRET}"