diff --git a/vectordb_bench/backend/cases.py b/vectordb_bench/backend/cases.py
index 617af5208..c456c96a8 100644
--- a/vectordb_bench/backend/cases.py
+++ b/vectordb_bench/backend/cases.py
@@ -4,7 +4,7 @@
 
 from vectordb_bench import config
 from vectordb_bench.backend.clients.api import MetricType
-from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NonFilter, non_filter
+from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NewIntFilter, NonFilter, non_filter
 from vectordb_bench.base import BaseModel
 from vectordb_bench.frontend.components.custom.getCustomConfig import CustomDatasetConfig
 
@@ -52,6 +52,8 @@ class CaseType(Enum):
 
     StreamingPerformanceCase = 200
 
+    NewIntFilterPerformanceCase = 250
+
     LabelFilterPerformanceCase = 300
 
     def case_cls(self, custom_configs: dict | None = None) -> type["Case"]:
@@ -130,6 +132,7 @@ class PerformanceCase(Case):
     filter_rate: float | None = None
     load_timeout: float | int = config.LOAD_TIMEOUT_DEFAULT
    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_DEFAULT
+    int_value: float | None = None
 
 
 class CapacityDim960(CapacityCase):
@@ -471,6 +474,46 @@ def __init__(
         )
 
 
+class NewIntFilterPerformanceCase(PerformanceCase):
+    case_id: CaseType = CaseType.NewIntFilterPerformanceCase
+    dataset_with_size_type: DatasetWithSizeType
+    filter_rate: float
+
+    def __init__(
+        self,
+        dataset_with_size_type: DatasetWithSizeType | str,
+        filter_rate: float,
+        int_value: float | None = 0,
+        **kwargs,
+    ):
+        if not isinstance(dataset_with_size_type, DatasetWithSizeType):
+            dataset_with_size_type = DatasetWithSizeType(dataset_with_size_type)
+        name = f"Int-Filter-{filter_rate*100:.1f}% - {dataset_with_size_type.value}"
+        description = f"Int-Filter-{filter_rate*100:.1f}% Performance Test ({dataset_with_size_type.value})"
+        dataset = dataset_with_size_type.get_manager()
+        load_timeout = dataset_with_size_type.get_load_timeout()
+        optimize_timeout = dataset_with_size_type.get_optimize_timeout()
+        filters = IntFilter(filter_rate=filter_rate, int_value=int_value)
+        filter_rate = filters.filter_rate
+        super().__init__(
+            name=name,
+            description=description,
+            dataset=dataset,
+            load_timeout=load_timeout,
+            optimize_timeout=optimize_timeout,
+            filter_rate=filter_rate,
+            int_value=int_value,
+            dataset_with_size_type=dataset_with_size_type,
+            **kwargs,
+        )
+
+    @property
+    def filters(self) -> Filter:
+        int_field = self.dataset.data.train_id_field
+        int_value = int(self.dataset.data.size * self.filter_rate)
+        return NewIntFilter(filter_rate=self.filter_rate, int_field=int_field, int_value=int_value)
+
+
 class LabelFilterPerformanceCase(PerformanceCase):
     case_id: CaseType = CaseType.LabelFilterPerformanceCase
     dataset_with_size_type: DatasetWithSizeType
@@ -529,5 +572,6 @@ def filters(self) -> Filter:
     CaseType.Performance1536D50K: Performance1536D50K,
     CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
     CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
+    CaseType.NewIntFilterPerformanceCase: NewIntFilterPerformanceCase,
     CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
 }
diff --git a/vectordb_bench/backend/dataset.py b/vectordb_bench/backend/dataset.py
index d9746adf4..7416a275b 100644
--- a/vectordb_bench/backend/dataset.py
+++ b/vectordb_bench/backend/dataset.py
@@ -48,6 +48,7 @@ class BaseDataset(BaseModel):
     scalar_labels_file_separated: bool = True
     scalar_labels_file: str = "scalar_labels.parquet"
     scalar_label_percentages: list[float] = []
+    scalar_int_rates: list[float] = []
     train_id_field: str = "id"
     train_vector_field: str = "emb"
     test_file: str = "test.parquet"
@@ -164,6 +165,7 @@ class Cohere(BaseDataset):
     }
     with_scalar_labels: bool = True
     scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
+    scalar_int_rates: list[float] = [0.01, 0.99]
 
 
 class Bioasq(BaseDataset):
@@ -178,6 +180,7 @@ class Bioasq(BaseDataset):
     }
     with_scalar_labels: bool = True
     scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
+    scalar_int_rates: list[float] = [0.01, 0.99]
 
 
 class Glove(BaseDataset):
@@ -217,6 +220,7 @@ class OpenAI(BaseDataset):
     }
     with_scalar_labels: bool = True
     scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
+    scalar_int_rates: list[float] = [0.01, 0.99]
 
 
 class DatasetManager(BaseModel):
diff --git a/vectordb_bench/backend/filter.py b/vectordb_bench/backend/filter.py
index ceff53f4d..f20d266e5 100644
--- a/vectordb_bench/backend/filter.py
+++ b/vectordb_bench/backend/filter.py
@@ -51,6 +51,23 @@ def groundtruth_file(self) -> str:
         raise RuntimeError(msg)
 
 
+class NewIntFilter(Filter):
+    type: FilterOp = FilterOp.NumGE
+    int_field: str = "id"
+    int_value: int
+
+    @property
+    def int_rate(self) -> str:
+        r = self.filter_rate * 100
+        if r >= 1:
+            return f"int_{int(r)}p"
+        return f"int_{r:.1f}p"
+
+    @property
+    def groundtruth_file(self) -> str:
+        return f"neighbors_{self.int_rate}.parquet"
+
+
 class LabelFilter(Filter):
     """
     filter expr: label_field == label_value, like `color == "red"`
diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py
index 896352a8e..785fee8de 100644
--- a/vectordb_bench/frontend/config/dbCaseConfigs.py
+++ b/vectordb_bench/frontend/config/dbCaseConfigs.py
@@ -223,6 +223,17 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
     ]
 
 
+def generate_int_filter_cases(dataset_with_size_type: DatasetWithSizeType) -> list[CaseConfig]:
+    filter_rates = dataset_with_size_type.get_manager().data.scalar_int_rates
+    return [
+        CaseConfig(
+            case_id=CaseType.NewIntFilterPerformanceCase,
+            custom_case=dict(dataset_with_size_type=dataset_with_size_type, filter_rate=filter_rate),
+        )
+        for filter_rate in filter_rates
+    ]
+
+
 UI_CASE_CLUSTERS: list[UICaseItemCluster] = [
     UICaseItemCluster(
         label="Search Performance Test",
@@ -253,6 +264,27 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
             UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D500K99P)),
         ],
     ),
+    UICaseItemCluster(
+        label="New-Int-Filter Search Performance Test",
+        uiCaseItems=[
+            UICaseItem(
+                label=f"Int-Filter Search Performance Test - {dataset_with_size_type.value}",
+                description=(
+                    f"[Batch Cases]These cases test the search performance of a vector database "
+                    f"with dataset {dataset_with_size_type.value}"
+                    f"under filtering rates of {dataset_with_size_type.get_manager().data.scalar_int_rates}, at varying parallel levels."
+                    f"Results will show index building time, recall, and maximum QPS."
+                ),
+                cases=generate_int_filter_cases(dataset_with_size_type),
+            )
+            for dataset_with_size_type in [
+                DatasetWithSizeType.CohereMedium,
+                DatasetWithSizeType.CohereLarge,
+                DatasetWithSizeType.OpenAIMedium,
+                DatasetWithSizeType.OpenAILarge,
+            ]
+        ],
+    ),
     UICaseItemCluster(
         label="Label-Filter Search Performance Test",
         uiCaseItems=[
diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py
index db6deae76..0b3c150cc 100644
--- a/vectordb_bench/models.py
+++ b/vectordb_bench/models.py
@@ -6,6 +6,8 @@
 
 import ujson
 
+from vectordb_bench.backend.dataset import DatasetWithSizeMap, DatasetWithSizeType
+
 from . import config
 from .backend.cases import Case, CaseType
 from .backend.clients import (
@@ -14,6 +16,7 @@
     DBConfig,
 )
 from .base import BaseModel
+from vectordb_bench.backend.cases import type2case
 from .metric import Metric
 
 log = logging.getLogger(__name__)
@@ -246,6 +249,26 @@ def write_db_file(self, result_dir: pathlib.Path, partial: Self, db: str):
             b = partial.json(exclude={"db_config": {"password", "api_key"}})
             f.write(b)
 
+    def get_case_config(case_config: CaseConfig) -> dict[CaseConfig]:
+        if int(case_config["case_id"]) in {6, 7, 8, 9, 12, 13, 14, 15}:
+            for key, value in CaseType.__members__.items():
+                if value.value == case_config["case_id"]:
+                    matching_key = key
+                    break
+            case_list = type2case[CaseType[matching_key]]
+            case_instance = case_list()
+            custom_case = case_config["custom_case"]
+            if custom_case is None:
+                custom_case = {}
+            custom_case["filter_rate"] = case_instance.filter_rate
+            for dataset, size_type in DatasetWithSizeMap.items():
+                if case_instance.dataset == size_type:
+                    custom_case["dataset_with_size_type"] = dataset
+                    break
+            case_config["case_id"] = CaseType.NewIntFilterPerformanceCase
+            case_config["custom_case"] = custom_case
+        return case_config
+
     @classmethod
     def read_file(cls, full_path: pathlib.Path, trans_unit: bool = False) -> Self:
         if not full_path.exists():
@@ -256,9 +279,9 @@ def read_file(cls, full_path: pathlib.Path, trans_unit: bool = False) -> Self:
             test_result = ujson.loads(f.read())
             if "task_label" not in test_result:
                 test_result["task_label"] = test_result["run_id"]
-
             for case_result in test_result["results"]:
                 task_config = case_result.get("task_config")
+                case_config = task_config.get("case_config")
 
                 db = DB(task_config.get("db"))
                 task_config["db_config"] = db.config_cls(**task_config["db_config"])
@@ -266,6 +289,7 @@
                     index_type=task_config["db_case_config"].get("index", None),
                 )(**task_config["db_case_config"])
 
+                task_config["case_config"] = cls.get_case_config(case_config=case_config)
                 case_result["task_config"] = task_config
 
                 if trans_unit: