diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index 2d41835e1..ce9d62c1b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -236,6 +236,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache, diagnostics.startMonitoring("testDirectory", workDirectory); diagnostics.startMonitoring("indexCache", Paths.get(indexCacheDir)); diagnostics.capturePrePhaseSnapshot("Graph Build"); + System.out.printf("%s: Dataset similarity function is %s%n", ds.getName(), ds.getSimilarityFunction()); // Resolve build compressor (and label quant type) so we can record compute time VectorCompressor buildCompressorObj = null; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java index aed5d99e7..3c218c85f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java @@ -16,7 +16,6 @@ package io.github.jbellis.jvector.example.benchmarks.datasets; -import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import io.github.jbellis.jvector.vector.VectorizationProvider; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; @@ -41,10 +40,9 @@ /** * This dataset loader will get and load hdf5 files from ann-benchmarks. * - *

<p>The vector similarity function is first inferred from the filename (e.g. {@code -angular}, - * {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls - * back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. - * If neither source provides a similarity function, an error is thrown. + *

<p>For curated benchmark datasets, properties are provided by + * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata + * does not provide a similarity function, an error is thrown. */ public class DataSetLoaderHDF5 implements DataSetLoader { public static final Path HDF5_DIR = Path.of("hdf5"); @@ -57,19 +55,17 @@ public class DataSetLoaderHDF5 implements DataSetLoader { */ public Optional loadDataSet(String datasetName) { return maybeDownloadHdf5(datasetName).map(path -> { - var props = getProperties(datasetName, path); - var similarity = props.similarityFunction() + var props = getProperties(datasetName); + props.similarityFunction() .orElseThrow(() -> new IllegalArgumentException( - "No similarity function found for HDF5 dataset: " + datasetName - + ". Either include -angular, -dot, or -euclidean in the filename," - + " or add an entry in dataset_metadata.yml")); - return new DataSetInfo(props, () -> readHdf5Data(path, similarity)); + "No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName)); + return new DataSetInfo(props, () -> readHdf5Data(path, props)); }); } /// Reads base vectors, query vectors, and ground truth from an HDF5 file - /// and returns a scrubbed {@link DataSet}. - private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) { + /// and returns a {@link DataSet} using the configured dataset properties. 
+ private DataSet readHdf5Data(Path path, DataSetProperties props) { VectorFloat[] baseVectors; VectorFloat[] queryVectors; var gtSets = new ArrayList>(); @@ -103,37 +99,19 @@ private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunct } } - return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets); + return DataSetUtils.processDataSet( + path.getFileName().toString(), + props, + Arrays.asList(baseVectors), + Arrays.asList(queryVectors), + gtSets); } - /// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}. - /// - /// The filename is checked first for known suffixes ({@code -angular}, {@code -dot}, - /// {@code -euclidean}) to infer the similarity function. If none match, the dataset name - /// is looked up in {@code dataset_metadata.yml}. If neither source provides properties, - /// a minimal {@link DataSetProperties} with an empty similarity function is returned - /// so that the caller can produce a clear error. + /// Looks up dataset properties in {@code dataset_metadata.yml}. 
/// /// @param datasetName the logical dataset name (without {@code .hdf5} extension) - /// @param filename the resolved file path including the {@code .hdf5} extension - /// @return the dataset properties - private static DataSetProperties getProperties(String datasetName, Path filename) { - String filenameStr = filename.toString(); - VectorSimilarityFunction inferred = null; - if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) { - inferred = VectorSimilarityFunction.COSINE; - } else if (filenameStr.contains("-euclidean")) { - inferred = VectorSimilarityFunction.EUCLIDEAN; - } - - // If filename inference succeeded, build properties with just the SF - if (inferred != null) { - return new DataSetProperties.PropertyMap(Map.of( - DataSetProperties.KEY_NAME, datasetName, - DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred)); - } - - // Fall back to metadata YAML + /// @return the dataset properties, or a minimal name-only property set if no entry exists + private static DataSetProperties getProperties(String datasetName) { return metadata.getProperties(datasetName) .orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName))); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java index 16dedbb82..b38d2daf1 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java @@ -17,7 +17,6 @@ package io.github.jbellis.jvector.example.benchmarks.datasets; import io.github.jbellis.jvector.example.util.SiftLoader; -import io.github.jbellis.jvector.vector.VectorSimilarityFunction; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import 
software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider; @@ -67,10 +66,10 @@ public Optional loadDataSet(String fileName) { var props = metadata.getProperties(mfd.name) .orElseThrow(() -> new IllegalArgumentException( "No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name)); - var vsf = props.similarityFunction() + props.similarityFunction() .orElseThrow(() -> new IllegalArgumentException( "No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name)); - return new DataSetInfo(props, () -> mfd.load(vsf)); + return new DataSetInfo(props, () -> mfd.load(props)); }); } @@ -204,15 +203,16 @@ public Iterable paths() { return List.of(basePath, queriesPath, groundTruthPath); } - /// Reads the fvec/ivec files from disk and returns a scrubbed {@link DataSet}. + /// Reads the fvec/ivec files from disk and processes the dataset using the + /// configured dataset properties. /// - /// @param similarityFunction the similarity function to associate with the dataset - /// @return the loaded and scrubbed dataset - public DataSet load(VectorSimilarityFunction similarityFunction) { + /// @param props the dataset properties controlling similarity and load behavior + /// @return the loaded dataset + public DataSet load(DataSetProperties props) { var baseVectors = SiftLoader.readFvecs("fvec/" + basePath); var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath); var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath); - return DataSetUtils.getScrubbedDataSet(name, similarityFunction, baseVectors, queryVectors, gtVectors); + return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors); } public static Map byName = new HashMap<>() {{ diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java index 
e8305a3ce..93ace9249 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java @@ -53,7 +53,7 @@ public class DataSetMetadataReader { private final Map> metadata; private DataSetMetadataReader(Map> metadata) { - this.metadata = metadata; + this.metadata = metadata != null ? metadata : Map.of(); } /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}). @@ -72,8 +72,7 @@ public static DataSetMetadataReader load() { @SuppressWarnings("unchecked") public static DataSetMetadataReader load(String file) { try (InputStream inputStream = new FileInputStream(file)) { - Yaml yaml = new Yaml(); - Map> data = yaml.load(inputStream); + Map> data = new Yaml().load(inputStream); return new DataSetMetadataReader(data); } catch (IOException e) { throw new RuntimeException("Failed to load dataset metadata from " + file, e); @@ -82,22 +81,34 @@ public static DataSetMetadataReader load(String file) { /// Looks up the {@link DataSetProperties} for a dataset by key. /// - /// The lookup tries the exact key first, then the key with {@code .hdf5} appended. - /// The YAML entry is wrapped in a {@link DataSetProperties.PropertyMap} with the dataset - /// name injected. Properties not present in the YAML default to empty/false/zero. + /// The lookup first tries the exact key. If that is not found, it also tries the + /// corresponding key with or without the {@code .hdf5} suffix so that callers may + /// use either form. + /// + /// The matched YAML entry is wrapped in a {@link DataSetProperties.PropertyMap} + /// with the requested dataset key injected as the dataset name when no explicit + /// name is present. Properties not present in the YAML default to empty/false/zero. 
/// /// @param datasetKey the dataset name or filename to look up /// @return the dataset properties if an entry exists, or empty if no entry is found public Optional getProperties(String datasetKey) { + return findEntry(datasetKey).map(entry -> { + var props = new HashMap<>(entry); + props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey); + return new DataSetProperties.PropertyMap(props); + }); + } + + private Optional> findEntry(String datasetKey) { Map entry = metadata.get(datasetKey); - if (entry == null) { - entry = metadata.get(datasetKey + ".hdf5"); + if (entry != null) { + return Optional.of(entry); } - if (entry == null) { - return Optional.empty(); + + if (datasetKey.endsWith(".hdf5")) { + return Optional.ofNullable(metadata.get(datasetKey.substring(0, datasetKey.length() - ".hdf5".length()))); } - var props = new HashMap<>(entry); - props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey); - return Optional.of(new DataSetProperties.PropertyMap(props)); + + return Optional.ofNullable(metadata.get(datasetKey + ".hdf5")); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java index 5f02ba790..5ae1cf2e6 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java @@ -51,6 +51,21 @@ public interface DataSetProperties { /// Canonical key for whether the dataset is free of duplicate vectors ({@link Boolean}). String KEY_IS_DUPLICATE_VECTOR_FREE = "is_duplicate_vector_free"; + /// Canonical key for how benchmark loaders should treat the dataset at load time. + String KEY_LOAD_BEHAVIOR = "load_behavior"; + + /** + * Controls benchmark-loader behavior for this dataset. + * + *

<p>LEGACY_SCRUB preserves current behavior (to be deprecated). + * NO_SCRUB loads the dataset exactly as provided, without load-time scrubbing + * or ground-truth remapping. + */ + enum LoadBehavior { + LEGACY_SCRUB, + NO_SCRUB + } + /** * Returns the similarity function for this dataset. * @@ -97,6 +112,18 @@ public interface DataSetProperties { */ public boolean isDuplicateVectorFree(); + /** + * Returns how benchmark loaders should treat this dataset at load time. + * + *

<p>This is a loader policy, not a statement of dataset quality. + * The default preserves legacy behavior. + * + * @return the benchmark loader behavior for this dataset + */ + default LoadBehavior loadBehavior() { + return LoadBehavior.LEGACY_SCRUB; + } + /** * A convenience method to capture the notion of a valid dataset. * As any additional qualifiers are added to this data carrier, this method should be updated accordingly. @@ -222,5 +249,17 @@ public boolean isZeroVectorFree() { public boolean isDuplicateVectorFree() { return Boolean.TRUE.equals(properties.get(KEY_IS_DUPLICATE_VECTOR_FREE)); } + + @Override + public LoadBehavior loadBehavior() { + var value = properties.get(KEY_LOAD_BEHAVIOR); + if (value instanceof LoadBehavior) { + return (LoadBehavior) value; + } + if (value instanceof String) { + return LoadBehavior.valueOf((String) value); + } + return LoadBehavior.LEGACY_SCRUB; + } } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java index 653e71b0c..61dc64652 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java @@ -27,69 +27,111 @@ public class DataSetUtils { * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length. * Note: This only scrubs and normalizes for dot product similarity. */ + private static final Comparator> VECTOR_COMPARATOR = (a, b) -> { + assert a.length() == b.length(); + for (int i = 0; i < a.length(); i++) { + if (a.get(i) < b.get(i)) { + return -1; + } + if (a.get(i) > b.get(i)) { + return 1; + } + } + return 0; + }; + + /** + * Processes a dataset using the configured load behavior from the dataset metadata. 
+ */ + public static DataSet processDataSet(String pathStr, + DataSetProperties props, + List> baseVectors, + List> queryVectors, + List> groundTruth) { + var vsf = props.similarityFunction() + .orElseThrow(() -> new IllegalArgumentException( + "No similarity function configured for dataset: " + props.getName())); + + switch (props.loadBehavior()) { + case NO_SCRUB: + return new SimpleDataSet(pathStr, vsf, baseVectors, queryVectors, groundTruth); + case LEGACY_SCRUB: + return legacyScrubDataSet(pathStr, vsf, baseVectors, queryVectors, groundTruth); + default: + throw new IllegalArgumentException("Unsupported load behavior: " + props.loadBehavior()); + } + } + + /** + * @deprecated Benchmark loaders should use + * {@link #processDataSet(String, DataSetProperties, List, List, List)} + * so that load behavior is controlled explicitly by dataset metadata. + */ + @Deprecated(forRemoval = true) public static DataSet getScrubbedDataSet(String pathStr, VectorSimilarityFunction vsf, List> baseVectors, List> queryVectors, List> groundTruth) { - // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers - List> scrubbedBaseVectors; - List> scrubbedQueryVectors; - List> gtSet; - scrubbedBaseVectors = new ArrayList<>(baseVectors.size()); - scrubbedQueryVectors = new ArrayList<>(queryVectors.size()); - gtSet = new ArrayList<>(groundTruth.size()); - var uniqueVectors = new TreeSet>((a, b) -> { - assert a.length() == b.length(); - for (int i = 0; i < a.length(); i++) { - if (a.get(i) < b.get(i)) { - return -1; - } - if (a.get(i) > b.get(i)) { - return 1; - } - } - return 0; - }); + return legacyScrubDataSet(pathStr, vsf, baseVectors, queryVectors, groundTruth); + } + + private static DataSet legacyScrubDataSet(String pathStr, + VectorSimilarityFunction vsf, + List> baseVectors, + List> queryVectors, + List> groundTruth) { + List> scrubbedBaseVectors = new ArrayList<>(baseVectors.size()); + List> scrubbedQueryVectors = new 
ArrayList<>(queryVectors.size()); + List> gtSet = new ArrayList<>(groundTruth.size()); + + var uniqueVectors = new TreeSet>(VECTOR_COMPARATOR); Map rawToScrubbed = new HashMap<>(); - { - int j = 0; - for (int i = 0; i < baseVectors.size(); i++) { - VectorFloat v = baseVectors.get(i); - var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; - if (valid && uniqueVectors.add(v)) { - scrubbedBaseVectors.add(v); - rawToScrubbed.put(i, j++); - } + + int nextOrdinal = 0; + for (int i = 0; i < baseVectors.size(); i++) { + VectorFloat v = baseVectors.get(i); + boolean valid = isValidLegacyVector(v, vsf); + if (valid && uniqueVectors.add(v)) { + scrubbedBaseVectors.add(v); + rawToScrubbed.put(i, nextOrdinal++); } } - // also remove zero query vectors and query vectors that are present in the base set + + // Also remove zero query vectors and query vectors that are present in the base set. for (int i = 0; i < queryVectors.size(); i++) { VectorFloat v = queryVectors.get(i); - var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; - var dupe = uniqueVectors.contains(v); + boolean valid = isValidLegacyVector(v, vsf); + boolean dupe = uniqueVectors.contains(v); if (valid && !dupe) { scrubbedQueryVectors.add(v); - var gt = new ArrayList(); - for (int j : groundTruth.get(i)) { - gt.add(rawToScrubbed.get(j)); + var gt = new ArrayList(groundTruth.get(i).size()); + for (int ordinal : groundTruth.get(i)) { + gt.add(rawToScrubbed.get(ordinal)); } gtSet.add(gt); } } - // now that the zero vectors are removed, we can normalize if it looks like they aren't already - if (vsf == VectorSimilarityFunction.DOT_PRODUCT) { - if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) { - normalizeAll(scrubbedBaseVectors); - normalizeAll(scrubbedQueryVectors); - } + if (shouldNormalizeLegacy(vsf, baseVectors)) { + normalizeAll(scrubbedBaseVectors); + normalizeAll(scrubbedQueryVectors); } assert scrubbedQueryVectors.size() == gtSet.size(); 
return new SimpleDataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet); } + private static boolean isValidLegacyVector(VectorFloat vector, VectorSimilarityFunction vsf) { + return vsf == VectorSimilarityFunction.EUCLIDEAN || Math.abs(normOf(vector)) > 1e-5; + } + + private static boolean shouldNormalizeLegacy(VectorSimilarityFunction vsf, List> baseVectors) { + return vsf == VectorSimilarityFunction.DOT_PRODUCT + && !baseVectors.isEmpty() + && Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5; + } + public static void normalizeAll(Iterable> vectors) { for (VectorFloat v : vectors) { VectorUtil.l2normalize(v); diff --git a/jvector-examples/yaml-configs/ada002-100k.yml b/jvector-examples/yaml-configs/ada002-100k.yml deleted file mode 100644 index 1cf7e1fbb..000000000 --- a/jvector-examples/yaml-configs/ada002-100k.yml +++ /dev/null @@ -1,38 +0,0 @@ -yamlSchemaVersion: 1 -onDiskIndexVersion: 6 - -dataset: ada002-100k - -construction: - outDegree: [32] - efConstruction: [100] - neighborOverflow: [1.2f] - addHierarchy: [Yes] - refineFinalGraph: [Yes] - fusedGraph: [Yes, No] - compression: - - type: PQ - parameters: - m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor - # mFactor: 8 - # k: 256 # optional parameter. By default, k=256 - centerData: No - anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) - reranking: - - NVQ - useSavedIndexIfExists: No - -search: - topKOverquery: - 10: [1.0, 2.0, 5.0, 10.0] - 100: [1.0, 2.0] - useSearchPruning: [Yes] - compression: - - type: PQ - parameters: - m: 192 - # k: 256 # optional parameter. By default, k=256 - centerData: No - anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) - -# Run-level controls, such as benchmarks, console, and logging, are in run.yml. 
\ No newline at end of file diff --git a/jvector-examples/yaml-configs/colbert-1M.yml b/jvector-examples/yaml-configs/colbert-1M.yml index 48f32b0c9..b9e6c72b7 100644 --- a/jvector-examples/yaml-configs/colbert-1M.yml +++ b/jvector-examples/yaml-configs/colbert-1M.yml @@ -11,13 +11,7 @@ construction: refineFinalGraph: [Yes] fusedGraph: [No] compression: - - type: PQ - parameters: - m: 32 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor - # mFactor: 8 - # k: 256 # optional parameter. By default, k=256 - centerData: No - anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) + - type: None reranking: - NVQ useSavedIndexIfExists: No @@ -28,11 +22,6 @@ search: 100: [1.0, 2.0] useSearchPruning: [Yes] compression: - - type: PQ - parameters: - m: 32 - # k: 256 # optional parameter. By default, k=256 - centerData: No - anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) + - type: None # Run-level controls, such as benchmarks, console, and logging, are in run.yml. \ No newline at end of file diff --git a/jvector-examples/yaml-configs/dataset_metadata.yml b/jvector-examples/yaml-configs/dataset_metadata.yml index 6aade63cc..21e5e69f9 100644 --- a/jvector-examples/yaml-configs/dataset_metadata.yml +++ b/jvector-examples/yaml-configs/dataset_metadata.yml @@ -1,39 +1,99 @@ -# This file contains the metadata for the datasets (formats) which do not have a dedicated -# metadata facility. The MFD and hdf5 loaders use this file to determine the similarity function, among other things. -# (HDF5 metadata support is moot for us since the runtime support fall short in other ways) +# This file contains authoritative metadata for curated benchmark datasets whose +# raw formats do not carry the properties we need at runtime. 
# -# Ideally, this metadata is part of the format and access layer for a given dataset format. This file exists because -# the dataset names herein are in a form which does _not_ support proper bundled configuration data with the raw data. -# When possible, these dataset should be provided with another mechanism which fully handles this aspect of dataset -# management so that we don't have to maintain separate parts in different places. +# Both the MFD and HDF5 loaders use this file to determine dataset properties such +# as similarity_function and load_behavior. # -# You can put additional metadata here, but it will not be type-safe and reified properly unless there is an accompanying -# change in the DataSetProperties interface and associated implementations. +# load_behavior controls benchmark-loader processing: +# LEGACY_SCRUB - preserve the current load-time scrubbing behavior +# NO_SCRUB - load vectors and ground truth exactly as stored +# +# During the transition, existing deployed datasets should generally remain on +# LEGACY_SCRUB until their prescrubbed replacements and matching offline ground +# truth are ready. New prescrubbed datasets should use NO_SCRUB. +# +# Additional metadata requires corresponding support in DataSetProperties and the +# relevant loader code. 
+ada002-100k: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +ada002-1M: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +cap-1M: + similarity_function: DOT_PRODUCT + load_behavior: LEGACY_SCRUB +cap-6M: + similarity_function: DOT_PRODUCT + load_behavior: LEGACY_SCRUB cohere-english-v3-100k: similarity_function: COSINE - # examples of supported properties - # If not present, presumed to be false - # is_normalized: false - # is_zero_vector_free: false - # is duplicate_vector_free: false -ada002-100k: + load_behavior: LEGACY_SCRUB +cohere-english-v3-1M: similarity_function: COSINE -openai-v3-small-100k: + load_behavior: LEGACY_SCRUB +cohere-english-v3-10M: similarity_function: COSINE -gecko-100k: + load_behavior: LEGACY_SCRUB +colbert-1M: similarity_function: COSINE -openai-v3-large-3072-100k: + load_behavior: LEGACY_SCRUB +colbert-10M: similarity_function: COSINE -openai-v3-large-1536-100k: + load_behavior: LEGACY_SCRUB +degen-200k: similarity_function: COSINE + load_behavior: LEGACY_SCRUB +dpr-1M: + similarity_function: DOT_PRODUCT + load_behavior: LEGACY_SCRUB +dpr-10M: + similarity_function: DOT_PRODUCT + load_behavior: LEGACY_SCRUB e5-small-v2-100k: similarity_function: COSINE + load_behavior: LEGACY_SCRUB e5-base-v2-100k: similarity_function: COSINE + load_behavior: LEGACY_SCRUB e5-large-v2-100k: similarity_function: COSINE -ada002-1M: + load_behavior: LEGACY_SCRUB +gecko-100k: similarity_function: COSINE -colbert-1M: + load_behavior: LEGACY_SCRUB +nv-qa-v4-100k: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +openai-v3-small-100k: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +openai-v3-large-3072-100k: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +openai-v3-large-1536-100k: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +# ann-benchmarks +glove-25-angular.hdf5: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +glove-50-angular.hdf5: + similarity_function: COSINE + 
load_behavior: LEGACY_SCRUB +lastfm-64-dot.hdf5: + similarity_function: DOT_PRODUCT + load_behavior: LEGACY_SCRUB +glove-100-angular.hdf5: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +glove-200-angular.hdf5: + similarity_function: COSINE + load_behavior: LEGACY_SCRUB +nytimes-256-angular.hdf5: similarity_function: COSINE + load_behavior: LEGACY_SCRUB +sift-128-euclidean.hdf5: + similarity_function: EUCLIDEAN + load_behavior: LEGACY_SCRUB \ No newline at end of file diff --git a/jvector-examples/yaml-configs/default.yml b/jvector-examples/yaml-configs/default.yml index 2b37e61a0..346a701e4 100644 --- a/jvector-examples/yaml-configs/default.yml +++ b/jvector-examples/yaml-configs/default.yml @@ -13,8 +13,8 @@ construction: compression: - type: PQ parameters: - m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor - # mFactor: 8 + # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor + mFactor: 8 # k: 256 # optional parameter. By default, k=256 centerData: No anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy) @@ -30,7 +30,8 @@ search: compression: - type: PQ parameters: - m: 192 + # m: 192 + mFactor: 8 # k: 256 # optional parameter. By default, k=256 centerData: No anisotropicThreshold: -1.0 # optional parameter. 
By default, anisotropicThreshold=-1 (i.e., no anisotropy) diff --git a/jvector-examples/yaml-configs/glove-100-angular.yml b/jvector-examples/yaml-configs/glove-100-angular.yml new file mode 100644 index 000000000..5f80f4ae2 --- /dev/null +++ b/jvector-examples/yaml-configs/glove-100-angular.yml @@ -0,0 +1,26 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: glove-100-angular + +construction: + outDegree: [32] + efConstruction: [100] + neighborOverflow: [1.2f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: None + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: + 10: [1.0, 2.0, 5.0, 10.0] + 100: [1.0, 2.0] + useSearchPruning: [Yes] + compression: + - type: None diff --git a/jvector-examples/yaml-configs/glove-200-angular.yml b/jvector-examples/yaml-configs/glove-200-angular.yml new file mode 100644 index 000000000..faf5bf3fa --- /dev/null +++ b/jvector-examples/yaml-configs/glove-200-angular.yml @@ -0,0 +1,26 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: glove-200-angular + +construction: + outDegree: [32] + efConstruction: [100] + neighborOverflow: [1.2f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: None + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: + 10: [1.0, 2.0, 5.0, 10.0] + 100: [1.0, 2.0] + useSearchPruning: [Yes] + compression: + - type: None diff --git a/jvector-examples/yaml-configs/glove-25-angular.yml b/jvector-examples/yaml-configs/glove-25-angular.yml index e982302b8..953f454e2 100644 --- a/jvector-examples/yaml-configs/glove-25-angular.yml +++ b/jvector-examples/yaml-configs/glove-25-angular.yml @@ -12,14 +12,10 @@ construction: refineFinalGraph: [Yes] fusedGraph: [No] compression: - - type: PQ - 
parameters: - mFactor: 4 - centerData: No - anisotropicThreshold: -1.0 + - type: None reranking: - NVQ - useSavedIndexIfExists: Yes + useSavedIndexIfExists: No search: topKOverquery: @@ -27,8 +23,4 @@ search: 100: [1.0, 2.0] useSearchPruning: [Yes] compression: - - type: PQ - parameters: - mFactor: 4 - centerData: No - anisotropicThreshold: -1.0 + - type: None diff --git a/jvector-examples/yaml-configs/glove-50-angular.yml b/jvector-examples/yaml-configs/glove-50-angular.yml new file mode 100644 index 000000000..a62eede12 --- /dev/null +++ b/jvector-examples/yaml-configs/glove-50-angular.yml @@ -0,0 +1,26 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: glove-50-angular + +construction: + outDegree: [32] + efConstruction: [100] + neighborOverflow: [1.2f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: None + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: + 10: [1.0, 2.0, 5.0, 10.0] + 100: [1.0, 2.0] + useSearchPruning: [Yes] + compression: + - type: None diff --git a/jvector-examples/yaml-configs/lastfm-64-dot.yml b/jvector-examples/yaml-configs/lastfm-64-dot.yml new file mode 100644 index 000000000..045963849 --- /dev/null +++ b/jvector-examples/yaml-configs/lastfm-64-dot.yml @@ -0,0 +1,26 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: lastfm-64-dot + +construction: + outDegree: [32] + efConstruction: [100] + neighborOverflow: [1.2f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: None + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: + 10: [1.0, 2.0, 5.0, 10.0] + 100: [1.0, 2.0] + useSearchPruning: [Yes] + compression: + - type: None diff --git a/jvector-examples/yaml-configs/nytimes-256-angular.yml 
b/jvector-examples/yaml-configs/nytimes-256-angular.yml new file mode 100644 index 000000000..755e99017 --- /dev/null +++ b/jvector-examples/yaml-configs/nytimes-256-angular.yml @@ -0,0 +1,26 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: nytimes-256-angular + +construction: + outDegree: [32] + efConstruction: [100] + neighborOverflow: [1.2f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: None + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: + 10: [1.0, 2.0, 5.0, 10.0] + 100: [1.0, 2.0] + useSearchPruning: [Yes] + compression: + - type: None diff --git a/jvector-examples/yaml-configs/sift-128-euclidean.yml b/jvector-examples/yaml-configs/sift-128-euclidean.yml new file mode 100644 index 000000000..04d4b6fd8 --- /dev/null +++ b/jvector-examples/yaml-configs/sift-128-euclidean.yml @@ -0,0 +1,26 @@ +# This is here for loader testing purposes only, use other datasets for realistic test data +yamlSchemaVersion: 1 +onDiskIndexVersion: 6 + +dataset: sift-128-euclidean + +construction: + outDegree: [32] + efConstruction: [100] + neighborOverflow: [1.2f] + addHierarchy: [Yes] + refineFinalGraph: [Yes] + fusedGraph: [No] + compression: + - type: None + reranking: + - NVQ + useSavedIndexIfExists: No + +search: + topKOverquery: + 10: [1.0, 2.0, 5.0, 10.0] + 100: [1.0, 2.0] + useSearchPruning: [Yes] + compression: + - type: None