Skip to content

add chroma to benchmark #205

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions engine/clients/chroma/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from engine.clients.chroma.configure import ChromaConfigurator
from engine.clients.chroma.search import ChromaSearcher
from engine.clients.chroma.upload import ChromaUploader

# Public API of the Chroma client package: the three engine classes imported
# above (configurator, searcher, uploader).
__all__ = [
"ChromaConfigurator",
"ChromaSearcher",
"ChromaUploader",
]
7 changes: 7 additions & 0 deletions engine/clients/chroma/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os

# Name of the Chroma collection the benchmark works on. Read once at import
# time from the CHROMA_COLLECTION_NAME environment variable; falls back to
# "benchmark" when the variable is not set.
CHROMA_COLLECTION_NAME = os.getenv("CHROMA_COLLECTION_NAME", "benchmark")


def chroma_fix_host(host: str) -> str:
    """Return *host* with the ``localhost`` alias replaced by ``127.0.0.1``.

    Host names are case-insensitive, so ``"LOCALHOST"`` or ``"Localhost"`` are
    mapped as well. Every other value (including a non-string) is returned
    unchanged.

    NOTE(review): presumably this forces an IPv4 connection to the Chroma
    server instead of whatever ``localhost`` resolves to -- confirm.
    """
    if isinstance(host, str) and host.lower() == "localhost":
        return "127.0.0.1"
    return host
45 changes: 45 additions & 0 deletions engine/clients/chroma/configure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from chromadb import HttpClient, Settings

from benchmark.dataset import Dataset
from engine.base_client.configure import BaseConfigurator
from engine.base_client.distances import Distance
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host


class ChromaConfigurator(BaseConfigurator):
    """Drop and (re)create the benchmark collection on a Chroma server."""

    # Benchmark distance -> value stored under Chroma's "hnsw:space" key.
    DISTANCE_MAPPING = {
        Distance.L2: "l2",
        Distance.COSINE: "cosine",
        Distance.DOT: "ip",
    }

    def __init__(self, host, collection_params: dict, connection_params: dict):
        """Open an HTTP client to the Chroma server.

        Args:
            host: Server host; ``localhost`` is rewritten by ``chroma_fix_host``.
            collection_params: Collection settings; forwarded to the base class.
            connection_params: Extra keyword arguments passed to ``HttpClient``.
        """
        super().__init__(host, collection_params, connection_params)
        self.client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )

    def clean(self):
        """
        Delete a collection and all associated embeddings, documents, and metadata.

        This is destructive and not reversible.
        """
        try:
            self.client.delete_collection(name=CHROMA_COLLECTION_NAME)
        except Exception:
            # Deliberately best-effort: any failure to delete is ignored so
            # the run can go on to recreate the collection.
            # (ValueError is already an Exception, so it needs no own entry.)
            pass

    def recreate(self, dataset: Dataset, collection_params):
        """Create the benchmark collection for *dataset*.

        The optional ``"config"`` entry of ``self.collection_params`` is merged
        into the collection metadata next to ``"hnsw:space"``, which is derived
        from the dataset distance. Entries in ``"config"`` take precedence, and
        any ``"metadata"`` entry in the parameters is replaced. The
        ``collection_params`` argument is accepted for interface compatibility
        but not read; ``self.collection_params`` is used instead.

        Args:
            dataset: Dataset whose ``config.distance`` selects the HNSW space.
            collection_params: Unused, see above.
        """
        # Work on a copy: popping "config" from the shared dict would make a
        # second recreate() on the same instance lose the HNSW settings.
        params = dict(self.collection_params)
        hnsw_config = params.pop("config", {})
        params["metadata"] = {
            # .get() yields None for a distance missing from DISTANCE_MAPPING.
            "hnsw:space": self.DISTANCE_MAPPING.get(dataset.config.distance),
            **hnsw_config,
        }
        self.client.create_collection(
            name=CHROMA_COLLECTION_NAME,
            **params,
        )
56 changes: 56 additions & 0 deletions engine/clients/chroma/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import List, Optional

from chromadb import Where
from chromadb.types import OperatorExpression

from engine.base_client import IncompatibilityError
from engine.base_client.parser import BaseConditionParser, FieldValue


class ChromaConditionParser(BaseConditionParser):
    """Translate benchmark meta-conditions into Chroma ``where`` filters.

    Chroma's ``where`` validation (chromadb 0.5.x) accepts a dict with exactly
    one key, an operator expression with exactly one operator, and ``$and`` /
    ``$or`` lists with at least two entries. Every builder below therefore
    returns a single-key dict and combines several clauses through ``$and`` /
    ``$or``.
    """

    def build_condition(
        self,
        and_subfilters: Optional[List[Where]],
        or_subfilters: Optional[List[Where]],
    ) -> Where:
        """Combine sub-filters into one ``where`` clause.

        The result means "all of ``and_subfilters`` AND at least one of
        ``or_subfilters``".

        Args:
            and_subfilters: Clauses that must all match, or ``None``.
            or_subfilters: Clauses of which one must match, or ``None``.

        Returns:
            A single-key ``where`` dict, or ``{}`` when there is nothing to
            filter on.
        """
        clauses: List[Where] = list(and_subfilters or [])

        if or_subfilters:
            if len(or_subfilters) >= 2:
                clauses.append({"$or": or_subfilters})
            else:
                # "$or" needs at least two entries; one clause stands alone.
                clauses.append(or_subfilters[0])

        if len(clauses) >= 2:
            # Wrapping in "$and" keeps a single top-level key and avoids
            # merging dicts, where equal field names would overwrite each other.
            return {"$and": clauses}
        if clauses:
            return clauses[0]
        return {}

    def build_exact_match_filter(self, field_name: str, value: FieldValue) -> Where:
        """Return a clause matching records whose *field_name* equals *value*."""
        return {field_name: value}

    def build_range_filter(
        self,
        field_name: str,
        lt: Optional[FieldValue],
        gt: Optional[FieldValue],
        lte: Optional[FieldValue],
        gte: Optional[FieldValue],
    ) -> Where:
        """Return a clause restricting *field_name* to the given bounds.

        Bounds that are ``None`` are ignored. One bound gives
        ``{field: {op: value}}``; several bounds are joined with ``$and``
        because an operator expression may hold only one operator.
        """
        bounds = (("$lt", lt), ("$gt", gt), ("$lte", lte), ("$gte", gte))
        clauses: List[Where] = [
            {field_name: {operator: bound}}
            for operator, bound in bounds
            if bound is not None
        ]
        if len(clauses) >= 2:
            return {"$and": clauses}
        if clauses:
            return clauses[0]
        # No bound supplied: keep the previous result (an empty expression).
        # NOTE(review): Chroma will most likely reject this -- confirm whether
        # callers can ever request a range without bounds.
        return {field_name: {}}

    def build_geo_filter(
        self, field_name: str, lat: float, lon: float, radius: float
    ) -> Where:
        """Geo filters are not implemented for Chroma; always raises.

        Raises:
            IncompatibilityError: Always.
        """
        raise IncompatibilityError
44 changes: 44 additions & 0 deletions engine/clients/chroma/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import List, Tuple

from chromadb import ClientAPI, HttpClient, Settings
from chromadb.api.types import IncludeEnum

from dataset_reader.base_reader import Query
from engine.base_client.search import BaseSearcher
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host
from engine.clients.chroma.parser import ChromaConditionParser


class ChromaSearcher(BaseSearcher):
    """Run vector queries against the benchmark collection on Chroma."""

    client: ClientAPI = None
    parser = ChromaConditionParser()

    @classmethod
    def init_client(cls, host, distance, connection_params: dict, search_params: dict):
        """Connect to the server and remember the collection and search params.

        ``distance`` is part of the signature but is not read here.
        """
        address = chroma_fix_host(host)
        client_settings = Settings(allow_reset=True, anonymized_telemetry=False)
        cls.client = HttpClient(
            host=address,
            settings=client_settings,
            **connection_params,
        )
        cls.collection = cls.client.get_collection(name=CHROMA_COLLECTION_NAME)
        cls.search_params = search_params

    @classmethod
    def search_one(cls, query: Query, top: int) -> List[Tuple[int, float]]:
        """Return ``(id, distance)`` pairs for the *top* hits of *query*.

        Ids come back from Chroma as strings and are converted to ``int``.
        """
        response = cls.collection.query(
            query_embeddings=[query.vector],
            n_results=top,
            where=cls.parser.parse(query.meta_conditions),
            include=[IncludeEnum.distances],
        )

        # One query embedding was sent, so only the first result row is used.
        hit_ids = response["ids"][0]
        hit_distances = response["distances"][0]
        return [
            (int(raw_id), float(distance))
            for raw_id, distance in zip(hit_ids, hit_distances)
        ]

    def setup_search(self):
        """Write the ``config`` entries of the search params into the collection metadata."""
        stored = self.collection.metadata
        # "hnsw:space" is dropped: not allowed in the collection.modify method.
        merged = {key: val for key, val in stored.items() if key != "hnsw:space"}
        merged.update(self.search_params.get("config", {}))
        self.collection.modify(metadata=merged)
36 changes: 36 additions & 0 deletions engine/clients/chroma/upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import List

from chromadb import ClientAPI, HttpClient, Settings

from dataset_reader.base_reader import Record
from engine.base_client.upload import BaseUploader
from engine.clients.chroma.config import CHROMA_COLLECTION_NAME, chroma_fix_host


class ChromaUploader(BaseUploader):
    """Upload benchmark records into the Chroma collection."""

    client: ClientAPI = None
    upload_params = {}

    @classmethod
    def init_client(cls, host, distance, connection_params, upload_params):
        """Connect to the server and look up the benchmark collection.

        Args:
            host: Server host; ``localhost`` is rewritten by ``chroma_fix_host``.
            distance: Part of the signature; not read here.
            connection_params: Extra keyword arguments passed to ``HttpClient``.
            upload_params: Stored on the class as ``upload_params``.
        """
        cls.client = HttpClient(
            host=chroma_fix_host(host),
            settings=Settings(allow_reset=True, anonymized_telemetry=False),
            **connection_params,
        )
        cls.collection = cls.client.get_collection(name=CHROMA_COLLECTION_NAME)
        # Previously the argument was dropped and the class attribute stayed
        # at its empty default.
        cls.upload_params = upload_params

    @classmethod
    def upload_batch(cls, batch: List[Record]):
        """Add one batch of records (ids, vectors, metadata) to the collection."""
        # NOTE: the batch size is not checked against
        # cls.client.get_max_batch_size(); the check was left out for
        # performance reasons.
        ids, vectors, payloads = [], [], []
        for point in batch:
            ids.append(str(point.id))  # ids are sent to Chroma as strings
            vectors.append(point.vector)
            # Falsy metadata (None or an empty dict) is sent as None.
            payloads.append(point.metadata or None)

        cls.collection.add(
            embeddings=vectors,
            metadatas=payloads or None,  # only None for an empty batch
            ids=ids,
        )
4 changes: 4 additions & 0 deletions engine/clients/client_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
BaseSearcher,
BaseUploader,
)
from engine.clients.chroma import ChromaConfigurator, ChromaSearcher, ChromaUploader
from engine.clients.elasticsearch import (
ElasticConfigurator,
ElasticSearcher,
Expand Down Expand Up @@ -39,6 +40,7 @@
"opensearch": OpenSearchConfigurator,
"redis": RedisConfigurator,
"pgvector": PgVectorConfigurator,
"chroma": ChromaConfigurator,
}

ENGINE_UPLOADERS = {
Expand All @@ -49,6 +51,7 @@
"opensearch": OpenSearchUploader,
"redis": RedisUploader,
"pgvector": PgVectorUploader,
"chroma": ChromaUploader,
}

ENGINE_SEARCHERS = {
Expand All @@ -59,6 +62,7 @@
"opensearch": OpenSearchSearcher,
"redis": RedisSearcher,
"pgvector": PgVectorSearcher,
"chroma": ChromaSearcher,
}


Expand Down
20 changes: 20 additions & 0 deletions engine/servers/chroma-single-node/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Single-node Chroma server used by the benchmark.
services:
  chromadb_bench:
    image: ${CONTAINER_REGISTRY:-docker.io}/chromadb/chroma:0.5.7
    volumes:
      - ./chromadb:/chroma/chroma
    ports:
      - "8000:8000"
    logging:
      driver: "json-file"
      options:
        max-file: 1
        max-size: 10m
    environment:
      # Quoted so YAML does not parse them as booleans; the container then
      # receives exactly these strings.
      IS_PERSISTENT: "TRUE"
      ANONYMIZED_TELEMETRY: "False"
      CHROMA_WORKERS: 1
    deploy:
      resources:
        limits:
          memory: 25Gb
105 changes: 105 additions & 0 deletions experiments/configurations/chroma-single-node.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
[
{
"name": "chroma-default",
"engine": "chroma",
"connection_params": {},
"collection_params": {},
"search_params": [
{ "parallel": 8, "config": {} }
],
"upload_params": {
"parallel": 16,
"batch_size": 1024
}
},
{
"name": "chroma-m-16-ef-128",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"config": {
"hnsw:M": 16,
"hnsw:construction_ef": 128
}
},
"search_params": [
{ "parallel": 1, "config": {"hnsw:search_ef": 128 } }, { "parallel": 1, "config": {"hnsw:search_ef": 256 } }, { "parallel": 1, "config": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-32-ef-128",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"config": {
"hnsw:M": 32,
"hnsw:construction_ef": 128
}
},
"search_params": [
{ "parallel": 1, "config": {"hnsw:search_ef": 128 } }, { "parallel": 1, "config": {"hnsw:search_ef": 256 } }, { "parallel": 1, "config": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-32-ef-256",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"config": {
"hnsw:M": 32,
"hnsw:construction_ef": 256
}
},
"search_params": [
{ "parallel": 1, "config": {"hnsw:search_ef": 128 } }, { "parallel": 1, "config": {"hnsw:search_ef": 256 } }, { "parallel": 1, "config": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-32-ef-512",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"config": {
"hnsw:M": 32,
"hnsw:construction_ef": 512
}
},
"search_params": [
{ "parallel": 1, "config": {"hnsw:search_ef": 128 } }, { "parallel": 1, "config": {"hnsw:search_ef": 256 } }, { "parallel": 1, "config": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-64-ef-256",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"config": {
"hnsw:M": 64,
"hnsw:construction_ef": 256
}
},
"search_params": [
{ "parallel": 1, "config": {"hnsw:search_ef": 128 } }, { "parallel": 1, "config": {"hnsw:search_ef": 256 } }, { "parallel": 1, "config": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
},
{
"name": "chroma-m-64-ef-512",
"engine": "chroma",
"connection_params": {},
"collection_params": {
"config": {
"hnsw:M": 64,
"hnsw:construction_ef": 512
}
},
"search_params": [
{ "parallel": 1, "config": {"hnsw:search_ef": 128 } }, { "parallel": 1, "config": {"hnsw:search_ef": 256 } }, { "parallel": 1, "config": {"hnsw:search_ef": 512 } }
],
"upload_params": { "parallel": 16 }
}
]
29 changes: 29 additions & 0 deletions experiments/configurations/chroma-single-node.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Chroma Parameters

See https://cookbook.chromadb.dev/core/configuration/#hnsw-configuration

- `hnsw:M` cannot be changed after index creation.
- `hnsw:construction_ef` cannot be changed after index creation.
- `hnsw:search_ef` can be changed after index creation.

Parallel > 1 for searching is currently not supported because Chroma is not process-safe (see https://github.com/qdrant/vector-db-benchmark/pull/205#discussion_r1781471419).

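## collection_params
The `config` entries are merged into the Chroma collection metadata when the collection is created.

```
"config": {
  "hnsw:M": 16,32,64,
  "hnsw:construction_ef": 128,256,512
}
```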

## search_params
The `config` entries are written into the collection metadata before searching.

```
"parallel": 1 # implemented in base_client
"top": / # implemented in base_client
"config": {
  "hnsw:search_ef": 128,256,512
}
```

## upload_params
No Chroma-specific upload parameters are used; only the generic ones below apply.

"parallel": 16 # implemented in base_client
"batch_size": 1024 # implemented in base_client

Loading