Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache,
diagnostics.startMonitoring("testDirectory", workDirectory);
diagnostics.startMonitoring("indexCache", Paths.get(indexCacheDir));
diagnostics.capturePrePhaseSnapshot("Graph Build");
System.out.printf("%s: Dataset similarity function is %s%n", ds.getName(), ds.getSimilarityFunction());

// Resolve build compressor (and label quant type) so we can record compute time
VectorCompressor<?> buildCompressorObj = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import io.github.jbellis.jvector.vector.VectorizationProvider;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
Expand All @@ -41,10 +40,9 @@
/**
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
*
 * <p>For curated benchmark datasets, properties are provided by
 * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata
 * does not provide a similarity function, an error is thrown.
*/
public class DataSetLoaderHDF5 implements DataSetLoader {
public static final Path HDF5_DIR = Path.of("hdf5");
Expand All @@ -57,19 +55,17 @@ public class DataSetLoaderHDF5 implements DataSetLoader {
*/
public Optional<DataSetInfo> loadDataSet(String datasetName) {
return maybeDownloadHdf5(datasetName).map(path -> {
var props = getProperties(datasetName, path);
var similarity = props.similarityFunction()
var props = getProperties(datasetName);
props.similarityFunction()
.orElseThrow(() -> new IllegalArgumentException(
"No similarity function found for HDF5 dataset: " + datasetName
+ ". Either include -angular, -dot, or -euclidean in the filename,"
+ " or add an entry in dataset_metadata.yml"));
return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
"No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName));
return new DataSetInfo(props, () -> readHdf5Data(path, props));
});
}

/// Reads base vectors, query vectors, and ground truth from an HDF5 file
/// and returns a scrubbed {@link DataSet}.
private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
/// and returns a {@link DataSet} using the configured dataset properties.
private DataSet readHdf5Data(Path path, DataSetProperties props) {
VectorFloat<?>[] baseVectors;
VectorFloat<?>[] queryVectors;
var gtSets = new ArrayList<List<Integer>>();
Expand Down Expand Up @@ -103,37 +99,19 @@ private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunct
}
}

return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
return DataSetUtils.processDataSet(
path.getFileName().toString(),
props,
Arrays.asList(baseVectors),
Arrays.asList(queryVectors),
gtSets);
}

/// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
///
/// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
/// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
/// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
/// a minimal {@link DataSetProperties} with an empty similarity function is returned
/// so that the caller can produce a clear error.
/// Looks up dataset properties in {@code dataset_metadata.yml}.
///
/// @param datasetName the logical dataset name (without {@code .hdf5} extension)
/// @param filename the resolved file path including the {@code .hdf5} extension
/// @return the dataset properties
private static DataSetProperties getProperties(String datasetName, Path filename) {
String filenameStr = filename.toString();
VectorSimilarityFunction inferred = null;
if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
inferred = VectorSimilarityFunction.COSINE;
} else if (filenameStr.contains("-euclidean")) {
inferred = VectorSimilarityFunction.EUCLIDEAN;
}

// If filename inference succeeded, build properties with just the SF
if (inferred != null) {
return new DataSetProperties.PropertyMap(Map.of(
DataSetProperties.KEY_NAME, datasetName,
DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
}

// Fall back to metadata YAML
/// @return the dataset properties, or a minimal name-only property set if no entry exists
private static DataSetProperties getProperties(String datasetName) {
return metadata.getProperties(datasetName)
.orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package io.github.jbellis.jvector.example.benchmarks.datasets;

import io.github.jbellis.jvector.example.util.SiftLoader;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
Expand Down Expand Up @@ -67,10 +66,10 @@ public Optional<DataSetInfo> loadDataSet(String fileName) {
var props = metadata.getProperties(mfd.name)
.orElseThrow(() -> new IllegalArgumentException(
"No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
var vsf = props.similarityFunction()
props.similarityFunction()
.orElseThrow(() -> new IllegalArgumentException(
"No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
return new DataSetInfo(props, () -> mfd.load(vsf));
return new DataSetInfo(props, () -> mfd.load(props));
});
}

Expand Down Expand Up @@ -204,15 +203,16 @@ public Iterable<Path> paths() {
return List.of(basePath, queriesPath, groundTruthPath);
}

/// Reads the fvec/ivec files from disk and returns a scrubbed {@link DataSet}.
/// Reads the fvec/ivec files from disk and processes the dataset using the
/// configured dataset properties.
///
/// @param similarityFunction the similarity function to associate with the dataset
/// @return the loaded and scrubbed dataset
public DataSet load(VectorSimilarityFunction similarityFunction) {
/// @param props the dataset properties controlling similarity and load behavior
/// @return the loaded dataset
public DataSet load(DataSetProperties props) {
var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
return DataSetUtils.getScrubbedDataSet(name, similarityFunction, baseVectors, queryVectors, gtVectors);
return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors);
}

public static Map<String, MultiFileDatasource> byName = new HashMap<>() {{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public class DataSetMetadataReader {
private final Map<String, Map<String, Object>> metadata;

private DataSetMetadataReader(Map<String, Map<String, Object>> metadata) {
    // An empty/missing YAML document can yield a null map from the parser;
    // normalize to an empty map so lookups never need a null check.
    this.metadata = (metadata == null) ? Map.of() : metadata;
}

/// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}).
Expand All @@ -72,8 +72,7 @@ public static DataSetMetadataReader load() {
@SuppressWarnings("unchecked")
public static DataSetMetadataReader load(String file) {
try (InputStream inputStream = new FileInputStream(file)) {
Yaml yaml = new Yaml();
Map<String, Map<String, Object>> data = yaml.load(inputStream);
Map<String, Map<String, Object>> data = new Yaml().load(inputStream);
return new DataSetMetadataReader(data);
} catch (IOException e) {
throw new RuntimeException("Failed to load dataset metadata from " + file, e);
Expand All @@ -82,22 +81,34 @@ public static DataSetMetadataReader load(String file) {

/// Looks up the {@link DataSetProperties} for a dataset by key.
///
/// The lookup first tries the exact key. If that is not found, it also tries the
/// corresponding key with or without the {@code .hdf5} suffix so that callers may
/// use either form.
///
/// The matched YAML entry is wrapped in a {@link DataSetProperties.PropertyMap}
/// with the requested dataset key injected as the dataset name when no explicit
/// name is present. Properties not present in the YAML default to empty/false/zero.
///
/// @param datasetKey the dataset name or filename to look up
/// @return the dataset properties if an entry exists, or empty if no entry is found
public Optional<DataSetProperties> getProperties(String datasetKey) {
    var match = findEntry(datasetKey);
    if (match.isEmpty()) {
        return Optional.empty();
    }
    // Copy the YAML entry so we can inject the requested key as the name
    // without mutating the shared metadata map.
    var merged = new HashMap<String, Object>(match.get());
    merged.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
    return Optional.of(new DataSetProperties.PropertyMap(merged));
}

private Optional<Map<String, Object>> findEntry(String datasetKey) {
Map<String, Object> entry = metadata.get(datasetKey);
if (entry == null) {
entry = metadata.get(datasetKey + ".hdf5");
if (entry != null) {
return Optional.of(entry);
}
if (entry == null) {
return Optional.empty();

if (datasetKey.endsWith(".hdf5")) {
return Optional.ofNullable(metadata.get(datasetKey.substring(0, datasetKey.length() - ".hdf5".length())));
}
var props = new HashMap<>(entry);
props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
return Optional.of(new DataSetProperties.PropertyMap(props));

return Optional.ofNullable(metadata.get(datasetKey + ".hdf5"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,21 @@ public interface DataSetProperties {
/// Canonical key for whether the dataset is free of duplicate vectors ({@link Boolean}).
String KEY_IS_DUPLICATE_VECTOR_FREE = "is_duplicate_vector_free";

/// Canonical key for how benchmark loaders should treat the dataset at load time.
String KEY_LOAD_BEHAVIOR = "load_behavior";

/**
* Controls benchmark-loader behavior for this dataset.
*
* <p>LEGACY_SCRUB preserves current behavior (to be deprecated).
* NO_SCRUB loads the dataset exactly as provided, without load-time scrubbing
* or ground-truth remapping.
*/
enum LoadBehavior {
    /** Preserve the current scrub-and-remap loading behavior (to be deprecated). */
    LEGACY_SCRUB,
    /** Load the dataset exactly as provided: no load-time scrubbing or ground-truth remapping. */
    NO_SCRUB
}

/**
* Returns the similarity function for this dataset.
*
Expand Down Expand Up @@ -97,6 +112,18 @@ public interface DataSetProperties {
*/
public boolean isDuplicateVectorFree();

/**
* Returns how benchmark loaders should treat this dataset at load time.
*
* <p>This is a loader policy, not a statement of dataset quality.
* The default preserves legacy behavior.
*
* @return the benchmark loader behavior for this dataset
*/
default LoadBehavior loadBehavior() {
    // Default keeps legacy scrubbing for any dataset that does not set load_behavior.
    return LoadBehavior.LEGACY_SCRUB;
}

/**
* A convenience method to capture the notion of a valid dataset.
* As any additional qualifiers are added to this data carrier, this method should be updated accordingly.
Expand Down Expand Up @@ -222,5 +249,17 @@ public boolean isZeroVectorFree() {
public boolean isDuplicateVectorFree() {
return Boolean.TRUE.equals(properties.get(KEY_IS_DUPLICATE_VECTOR_FREE));
}

/**
 * Returns the configured load behavior, accepting either a {@link LoadBehavior}
 * value or its name (case-insensitive, as typically written in the metadata YAML).
 *
 * @return the configured behavior, or {@link LoadBehavior#LEGACY_SCRUB} when unset
 * @throws IllegalArgumentException if a string value does not name a known behavior
 */
@Override
public LoadBehavior loadBehavior() {
    var value = properties.get(KEY_LOAD_BEHAVIOR);
    if (value instanceof LoadBehavior) {
        return (LoadBehavior) value;
    }
    if (value instanceof String) {
        // YAML values are conventionally lower-case; Enum.valueOf is case-sensitive
        // and would reject "no_scrub", so match names case-insensitively.
        var name = ((String) value).trim();
        for (var behavior : LoadBehavior.values()) {
            if (behavior.name().equalsIgnoreCase(name)) {
                return behavior;
            }
        }
        throw new IllegalArgumentException(
                "Unrecognized " + KEY_LOAD_BEHAVIOR + " value: " + value);
    }
    return LoadBehavior.LEGACY_SCRUB;
}
}
}
Loading
Loading