diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
index 2d41835e1..ce9d62c1b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
@@ -236,6 +236,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache,
diagnostics.startMonitoring("testDirectory", workDirectory);
diagnostics.startMonitoring("indexCache", Paths.get(indexCacheDir));
diagnostics.capturePrePhaseSnapshot("Graph Build");
+ System.out.printf("%s: Dataset similarity function is %s%n", ds.getName(), ds.getSimilarityFunction());
// Resolve build compressor (and label quant type) so we can record compute time
VectorCompressor> buildCompressorObj = null;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
index aed5d99e7..3c218c85f 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
@@ -16,7 +16,6 @@
package io.github.jbellis.jvector.example.benchmarks.datasets;
-import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import io.github.jbellis.jvector.vector.VectorizationProvider;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
@@ -41,10 +40,9 @@
/**
* This dataset loader will get and load hdf5 files from ann-benchmarks.
*
- * <p>The vector similarity function is first inferred from the filename (e.g. {@code -angular},
- * {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls
- * back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}.
- * If neither source provides a similarity function, an error is thrown.
+ * <p>For curated benchmark datasets, properties are provided by
+ * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata
+ * does not provide a similarity function, an error is thrown.
*/
public class DataSetLoaderHDF5 implements DataSetLoader {
public static final Path HDF5_DIR = Path.of("hdf5");
@@ -57,19 +55,17 @@ public class DataSetLoaderHDF5 implements DataSetLoader {
*/
public Optional loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(path -> {
- var props = getProperties(datasetName, path);
- var similarity = props.similarityFunction()
+ var props = getProperties(datasetName);
+ props.similarityFunction()
.orElseThrow(() -> new IllegalArgumentException(
- "No similarity function found for HDF5 dataset: " + datasetName
- + ". Either include -angular, -dot, or -euclidean in the filename,"
- + " or add an entry in dataset_metadata.yml"));
- return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
+ "No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName));
+ return new DataSetInfo(props, () -> readHdf5Data(path, props));
});
}
/// Reads base vectors, query vectors, and ground truth from an HDF5 file
- /// and returns a scrubbed {@link DataSet}.
- private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
+ /// and returns a {@link DataSet} using the configured dataset properties.
+ private DataSet readHdf5Data(Path path, DataSetProperties props) {
VectorFloat>[] baseVectors;
VectorFloat>[] queryVectors;
var gtSets = new ArrayList>();
@@ -103,37 +99,19 @@ private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunct
}
}
- return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
+ return DataSetUtils.processDataSet(
+ path.getFileName().toString(),
+ props,
+ Arrays.asList(baseVectors),
+ Arrays.asList(queryVectors),
+ gtSets);
}
- /// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
- ///
- /// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
- /// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
- /// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
- /// a minimal {@link DataSetProperties} with an empty similarity function is returned
- /// so that the caller can produce a clear error.
+ /// Looks up dataset properties in {@code dataset_metadata.yml}.
///
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
- /// @param filename the resolved file path including the {@code .hdf5} extension
- /// @return the dataset properties
- private static DataSetProperties getProperties(String datasetName, Path filename) {
- String filenameStr = filename.toString();
- VectorSimilarityFunction inferred = null;
- if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
- inferred = VectorSimilarityFunction.COSINE;
- } else if (filenameStr.contains("-euclidean")) {
- inferred = VectorSimilarityFunction.EUCLIDEAN;
- }
-
- // If filename inference succeeded, build properties with just the SF
- if (inferred != null) {
- return new DataSetProperties.PropertyMap(Map.of(
- DataSetProperties.KEY_NAME, datasetName,
- DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
- }
-
- // Fall back to metadata YAML
+ /// @return the dataset properties, or a minimal name-only property set if no entry exists
+ private static DataSetProperties getProperties(String datasetName) {
return metadata.getProperties(datasetName)
.orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
index 16dedbb82..b38d2daf1 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
@@ -17,7 +17,6 @@
package io.github.jbellis.jvector.example.benchmarks.datasets;
import io.github.jbellis.jvector.example.util.SiftLoader;
-import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
@@ -67,10 +66,10 @@ public Optional loadDataSet(String fileName) {
var props = metadata.getProperties(mfd.name)
.orElseThrow(() -> new IllegalArgumentException(
"No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
- var vsf = props.similarityFunction()
+ props.similarityFunction()
.orElseThrow(() -> new IllegalArgumentException(
"No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
- return new DataSetInfo(props, () -> mfd.load(vsf));
+ return new DataSetInfo(props, () -> mfd.load(props));
});
}
@@ -204,15 +203,16 @@ public Iterable paths() {
return List.of(basePath, queriesPath, groundTruthPath);
}
- /// Reads the fvec/ivec files from disk and returns a scrubbed {@link DataSet}.
+ /// Reads the fvec/ivec files from disk and processes the dataset using the
+ /// configured dataset properties.
///
- /// @param similarityFunction the similarity function to associate with the dataset
- /// @return the loaded and scrubbed dataset
- public DataSet load(VectorSimilarityFunction similarityFunction) {
+ /// @param props the dataset properties controlling similarity and load behavior
+ /// @return the loaded dataset
+ public DataSet load(DataSetProperties props) {
var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
- return DataSetUtils.getScrubbedDataSet(name, similarityFunction, baseVectors, queryVectors, gtVectors);
+ return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors);
}
public static Map byName = new HashMap<>() {{
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
index e8305a3ce..93ace9249 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
@@ -53,7 +53,7 @@ public class DataSetMetadataReader {
private final Map> metadata;
private DataSetMetadataReader(Map> metadata) {
- this.metadata = metadata;
+ this.metadata = metadata != null ? metadata : Map.of();
}
/// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}).
@@ -72,8 +72,7 @@ public static DataSetMetadataReader load() {
@SuppressWarnings("unchecked")
public static DataSetMetadataReader load(String file) {
try (InputStream inputStream = new FileInputStream(file)) {
- Yaml yaml = new Yaml();
- Map> data = yaml.load(inputStream);
+ Map> data = new Yaml().load(inputStream);
return new DataSetMetadataReader(data);
} catch (IOException e) {
throw new RuntimeException("Failed to load dataset metadata from " + file, e);
@@ -82,22 +81,34 @@ public static DataSetMetadataReader load(String file) {
/// Looks up the {@link DataSetProperties} for a dataset by key.
///
- /// The lookup tries the exact key first, then the key with {@code .hdf5} appended.
- /// The YAML entry is wrapped in a {@link DataSetProperties.PropertyMap} with the dataset
- /// name injected. Properties not present in the YAML default to empty/false/zero.
+ /// The lookup first tries the exact key. If that is not found, it also tries the
+ /// corresponding key with or without the {@code .hdf5} suffix so that callers may
+ /// use either form.
+ ///
+ /// The matched YAML entry is wrapped in a {@link DataSetProperties.PropertyMap}
+ /// with the requested dataset key injected as the dataset name when no explicit
+ /// name is present. Properties not present in the YAML default to empty/false/zero.
///
/// @param datasetKey the dataset name or filename to look up
/// @return the dataset properties if an entry exists, or empty if no entry is found
public Optional getProperties(String datasetKey) {
+ return findEntry(datasetKey).map(entry -> {
+ var props = new HashMap<>(entry);
+ props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
+ return new DataSetProperties.PropertyMap(props);
+ });
+ }
+
+ private Optional