diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
index 2d41835e1..ce9d62c1b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
@@ -236,6 +236,7 @@ static void runOneGraph(OnDiskGraphIndexCache cache,
             diagnostics.startMonitoring("testDirectory", workDirectory);
             diagnostics.startMonitoring("indexCache", Paths.get(indexCacheDir));
             diagnostics.capturePrePhaseSnapshot("Graph Build");
+            System.out.printf("%s: Dataset similarity function is %s%n", ds.getName(), ds.getSimilarityFunction());
 
             // Resolve build compressor (and label quant type) so we can record compute time
             VectorCompressor<?> buildCompressorObj = null;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
index aed5d99e7..3c218c85f 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderHDF5.java
@@ -16,7 +16,6 @@
 
 package io.github.jbellis.jvector.example.benchmarks.datasets;
 
-import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
 import io.github.jbellis.jvector.vector.VectorizationProvider;
 import io.github.jbellis.jvector.vector.types.VectorFloat;
 import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
@@ -41,10 +40,9 @@
 /**
  * This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
  *
- * <p>The vector similarity function is first inferred from the filename (e.g. {@code -angular},
- * {@code -euclidean}). If the filename does not contain a recognized suffix, the loader falls
- * back to looking up the dataset in {@code dataset_metadata.yml} via {@link DataSetMetadataReader}.
- * If neither source provides a similarity function, an error is thrown.
+ * <p>For curated benchmark datasets, properties are provided by
+ * {@code dataset_metadata.yml} via {@link DataSetMetadataReader}. If the metadata
+ * does not provide a similarity function, an error is thrown.
  */
 public class DataSetLoaderHDF5 implements DataSetLoader {
     public static final Path HDF5_DIR = Path.of("hdf5");
@@ -57,19 +55,17 @@ public class DataSetLoaderHDF5 implements DataSetLoader {
      */
     public Optional<DataSetInfo> loadDataSet(String datasetName) {
         return maybeDownloadHdf5(datasetName).map(path -> {
-            var props = getProperties(datasetName, path);
-            var similarity = props.similarityFunction()
+            var props = getProperties(datasetName);
+            props.similarityFunction()
                     .orElseThrow(() -> new IllegalArgumentException(
-                            "No similarity function found for HDF5 dataset: " + datasetName
-                            + ". Either include -angular, -dot, or -euclidean in the filename,"
-                            + " or add an entry in dataset_metadata.yml"));
-            return new DataSetInfo(props, () -> readHdf5Data(path, similarity));
+                            "No similarity function configured in dataset_metadata.yml for HDF5 dataset: " + datasetName));
+            return new DataSetInfo(props, () -> readHdf5Data(path, props));
         });
     }
 
     /// Reads base vectors, query vectors, and ground truth from an HDF5 file
-    /// and returns a scrubbed {@link DataSet}.
-    private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunction) {
+    /// and returns a {@link DataSet} using the configured dataset properties.
+    private DataSet readHdf5Data(Path path, DataSetProperties props) {
         VectorFloat<?>[] baseVectors;
         VectorFloat<?>[] queryVectors;
         var gtSets = new ArrayList<List<Integer>>();
@@ -103,37 +99,19 @@ private DataSet readHdf5Data(Path path, VectorSimilarityFunction similarityFunct
             }
         }
 
-        return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
+        return DataSetUtils.processDataSet(
+                path.getFileName().toString(),
+                props,
+                Arrays.asList(baseVectors),
+                Arrays.asList(queryVectors),
+                gtSets);
     }
 
-    /// Derives dataset properties from the filename, falling back to {@link DataSetMetadataReader}.
-    ///
-    /// The filename is checked first for known suffixes ({@code -angular}, {@code -dot},
-    /// {@code -euclidean}) to infer the similarity function. If none match, the dataset name
-    /// is looked up in {@code dataset_metadata.yml}. If neither source provides properties,
-    /// a minimal {@link DataSetProperties} with an empty similarity function is returned
-    /// so that the caller can produce a clear error.
+    /// Looks up dataset properties in {@code dataset_metadata.yml}.
     ///
     /// @param datasetName the logical dataset name (without {@code .hdf5} extension)
-    /// @param filename    the resolved file path including the {@code .hdf5} extension
-    /// @return the dataset properties
-    private static DataSetProperties getProperties(String datasetName, Path filename) {
-        String filenameStr = filename.toString();
-        VectorSimilarityFunction inferred = null;
-        if (filenameStr.contains("-angular") || filenameStr.contains("-dot")) {
-            inferred = VectorSimilarityFunction.COSINE;
-        } else if (filenameStr.contains("-euclidean")) {
-            inferred = VectorSimilarityFunction.EUCLIDEAN;
-        }
-
-        // If filename inference succeeded, build properties with just the SF
-        if (inferred != null) {
-            return new DataSetProperties.PropertyMap(Map.of(
-                    DataSetProperties.KEY_NAME, datasetName,
-                    DataSetProperties.KEY_SIMILARITY_FUNCTION, inferred));
-        }
-
-        // Fall back to metadata YAML
+    /// @return the dataset properties, or a minimal name-only property set if no entry exists
+    private static DataSetProperties getProperties(String datasetName) {
         return metadata.getProperties(datasetName)
                 .orElse(new DataSetProperties.PropertyMap(Map.of(DataSetProperties.KEY_NAME, datasetName)));
     }
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
index 16dedbb82..b38d2daf1 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetLoaderMFD.java
@@ -17,7 +17,6 @@
 package io.github.jbellis.jvector.example.benchmarks.datasets;
 
 import io.github.jbellis.jvector.example.util.SiftLoader;
-import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import software.amazon.awssdk.auth.credentials.AnonymousCredentialsProvider;
@@ -67,10 +66,10 @@ public Optional<DataSetInfo> loadDataSet(String fileName) {
             var props = metadata.getProperties(mfd.name)
                     .orElseThrow(() -> new IllegalArgumentException(
                             "No metadata configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
-            var vsf = props.similarityFunction()
+            props.similarityFunction()
                     .orElseThrow(() -> new IllegalArgumentException(
                             "No similarity_function configured in dataset_metadata.yml for MFD dataset: " + mfd.name));
-            return new DataSetInfo(props, () -> mfd.load(vsf));
+            return new DataSetInfo(props, () -> mfd.load(props));
         });
     }
 
@@ -204,15 +203,16 @@ public Iterable<Path> paths() {
             return List.of(basePath, queriesPath, groundTruthPath);
         }
 
-        /// Reads the fvec/ivec files from disk and returns a scrubbed {@link DataSet}.
+        /// Reads the fvec/ivec files from disk and processes the dataset using the
+        /// configured dataset properties.
         ///
-        /// @param similarityFunction the similarity function to associate with the dataset
-        /// @return the loaded and scrubbed dataset
-        public DataSet load(VectorSimilarityFunction similarityFunction) {
+        /// @param props the dataset properties controlling similarity and load behavior
+        /// @return the loaded dataset
+        public DataSet load(DataSetProperties props) {
             var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
             var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
             var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
-            return DataSetUtils.getScrubbedDataSet(name, similarityFunction, baseVectors, queryVectors, gtVectors);
+            return DataSetUtils.processDataSet(name, props, baseVectors, queryVectors, gtVectors);
         }
 
         public static Map<String, MultiFileDatasource> byName = new HashMap<>() {{
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
index e8305a3ce..93ace9249 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetMetadataReader.java
@@ -53,7 +53,7 @@ public class DataSetMetadataReader {
     private final Map<String, Map<String, Object>> metadata;
 
     private DataSetMetadataReader(Map<String, Map<String, Object>> metadata) {
-        this.metadata = metadata;
+        this.metadata = metadata != null ? metadata : Map.of();
     }
 
     /// Loads dataset metadata from the default file ({@code jvector-examples/yaml-configs/dataset_metadata.yml}).
@@ -72,8 +72,7 @@ public static DataSetMetadataReader load() {
     @SuppressWarnings("unchecked")
     public static DataSetMetadataReader load(String file) {
         try (InputStream inputStream = new FileInputStream(file)) {
-            Yaml yaml = new Yaml();
-            Map<String, Map<String, Object>> data = yaml.load(inputStream);
+            Map<String, Map<String, Object>> data = new Yaml().load(inputStream);
             return new DataSetMetadataReader(data);
         } catch (IOException e) {
             throw new RuntimeException("Failed to load dataset metadata from " + file, e);
@@ -82,22 +81,34 @@ public static DataSetMetadataReader load(String file) {
 
     /// Looks up the {@link DataSetProperties} for a dataset by key.
     ///
-    /// The lookup tries the exact key first, then the key with {@code .hdf5} appended.
-    /// The YAML entry is wrapped in a {@link DataSetProperties.PropertyMap} with the dataset
-    /// name injected. Properties not present in the YAML default to empty/false/zero.
+    /// The lookup first tries the exact key. If that is not found, it also tries the
+    /// corresponding key with or without the {@code .hdf5} suffix so that callers may
+    /// use either form.
+    ///
+    /// The matched YAML entry is wrapped in a {@link DataSetProperties.PropertyMap}
+    /// with the requested dataset key injected as the dataset name when no explicit
+    /// name is present. Properties not present in the YAML default to empty/false/zero.
     ///
     /// @param datasetKey the dataset name or filename to look up
     /// @return the dataset properties if an entry exists, or empty if no entry is found
     public Optional<DataSetProperties> getProperties(String datasetKey) {
+        return findEntry(datasetKey).map(entry -> {
+            var props = new HashMap<>(entry);
+            props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
+            return new DataSetProperties.PropertyMap(props);
+        });
+    }
+
+    private Optional<Map<String, Object>> findEntry(String datasetKey) {
         Map<String, Object> entry = metadata.get(datasetKey);
-        if (entry == null) {
-            entry = metadata.get(datasetKey + ".hdf5");
+        if (entry != null) {
+            return Optional.of(entry);
         }
-        if (entry == null) {
-            return Optional.empty();
+
+        if (datasetKey.endsWith(".hdf5")) {
+            return Optional.ofNullable(metadata.get(datasetKey.substring(0, datasetKey.length() - ".hdf5".length())));
         }
-        var props = new HashMap<>(entry);
-        props.putIfAbsent(DataSetProperties.KEY_NAME, datasetKey);
-        return Optional.of(new DataSetProperties.PropertyMap(props));
+
+        return Optional.ofNullable(metadata.get(datasetKey + ".hdf5"));
     }
 }
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java
index 5f02ba790..5ae1cf2e6 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetProperties.java
@@ -51,6 +51,21 @@ public interface DataSetProperties {
     /// Canonical key for whether the dataset is free of duplicate vectors ({@link Boolean}).
     String KEY_IS_DUPLICATE_VECTOR_FREE = "is_duplicate_vector_free";
 
+    /// Canonical key for how benchmark loaders should treat the dataset at load time.
+    String KEY_LOAD_BEHAVIOR = "load_behavior";
+
+    /**
+     * Load-time processing policy that benchmark loaders apply to a dataset.
+     *
+     * <p>{@code LEGACY_SCRUB} keeps the historical load-time scrubbing and ground-truth
+     * remapping (to be deprecated). {@code NO_SCRUB} hands base vectors, query vectors,
+     * and ground truth to the benchmark exactly as provided.
+     */
+    enum LoadBehavior {
+        LEGACY_SCRUB,
+        NO_SCRUB
+    }
+
     /**
      * Returns the similarity function for this dataset.
      *
@@ -97,6 +112,18 @@ public interface DataSetProperties {
      */
     public boolean isDuplicateVectorFree();
 
+    /**
+     * Returns the load-time policy that benchmark loaders should apply to this dataset.
+     *
+     * <p>This describes loader policy only; it says nothing about dataset quality.
+     * Implementations that do not override it report {@link LoadBehavior#LEGACY_SCRUB}.
+     *
+     * @return the benchmark loader behavior for this dataset
+     */
+    default LoadBehavior loadBehavior() {
+        return LoadBehavior.LEGACY_SCRUB;
+    }
+
     /**
      * A convenience method to capture the notion of a valid dataset.
      * As any additional qualifiers are added to this data carrier, this method should be updated accordingly.
@@ -222,5 +249,36 @@ public boolean isZeroVectorFree() {
         public boolean isDuplicateVectorFree() {
             return Boolean.TRUE.equals(properties.get(KEY_IS_DUPLICATE_VECTOR_FREE));
         }
+
+        /**
+         * Resolves the load behavior from the {@code load_behavior} property.
+         *
+         * <p>An existing {@link LoadBehavior} value is returned as-is. A string value is
+         * trimmed and matched case-insensitively against the enum constant names, so
+         * {@code no_scrub} and {@code NO_SCRUB} are equivalent. A missing or non-string
+         * value yields {@link LoadBehavior#LEGACY_SCRUB}.
+         *
+         * @return the configured load behavior, or {@code LEGACY_SCRUB} when none is set
+         * @throws IllegalArgumentException if a string value names no known behavior
+         */
+        @Override
+        public LoadBehavior loadBehavior() {
+            var value = properties.get(KEY_LOAD_BEHAVIOR);
+            if (value instanceof LoadBehavior) {
+                return (LoadBehavior) value;
+            }
+            if (!(value instanceof String)) {
+                return LoadBehavior.LEGACY_SCRUB;
+            }
+            var text = ((String) value).trim();
+            try {
+                // Locale.ROOT keeps the upper-casing independent of the JVM's default locale.
+                return LoadBehavior.valueOf(text.toUpperCase(java.util.Locale.ROOT));
+            } catch (IllegalArgumentException e) {
+                throw new IllegalArgumentException(
+                        "Invalid " + KEY_LOAD_BEHAVIOR + " '" + text + "'; expected one of "
+                                + java.util.Arrays.toString(LoadBehavior.values()), e);
+            }
+        }
     }
 }
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java
index 653e71b0c..61dc64652 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/datasets/DataSetUtils.java
@@ -27,69 +27,111 @@ public class DataSetUtils {
      * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length.
      * Note: This only scrubs and normalizes for dot product similarity.
      */
+    // Orders vectors lexicographically by component so a TreeSet can detect exact duplicates.
+    private static final Comparator<VectorFloat<?>> VECTOR_COMPARATOR = (left, right) -> {
+        assert left.length() == right.length();
+        for (int dim = 0; dim < left.length(); dim++) {
+            float l = left.get(dim);
+            float r = right.get(dim);
+            if (l < r || l > r) {
+                return l < r ? -1 : 1;
+            }
+        }
+        return 0;
+    };
+
+    /**
+     * Builds a {@link DataSet} from raw vectors according to the load behavior in {@code props}:
+     * {@code NO_SCRUB} wraps the inputs unchanged, {@code LEGACY_SCRUB} applies legacy scrubbing.
+     * @throws IllegalArgumentException if {@code props} carries no similarity function
+     */
+    public static DataSet processDataSet(String pathStr,
+                                         DataSetProperties props,
+                                         List<VectorFloat<?>> baseVectors,
+                                         List<VectorFloat<?>> queryVectors,
+                                         List<List<Integer>> groundTruth) {
+        VectorSimilarityFunction vsf = props.similarityFunction().orElseThrow(() ->
+                new IllegalArgumentException("No similarity function configured for dataset: " + props.getName()));
+        switch (props.loadBehavior()) {
+            case LEGACY_SCRUB:
+                return legacyScrubDataSet(pathStr, vsf, baseVectors, queryVectors, groundTruth);
+            case NO_SCRUB:
+                return new SimpleDataSet(pathStr, vsf, baseVectors, queryVectors, groundTruth);
+            default:
+                throw new IllegalArgumentException("Unsupported load behavior: " + props.loadBehavior());
+        }
+    }
+
+    /**
+     * @deprecated Benchmark loaders should use
+     * {@link #processDataSet(String, DataSetProperties, List, List, List)}
+     * so that load behavior is controlled explicitly by dataset metadata.
+     */
+    @Deprecated(forRemoval = true)
     public static DataSet getScrubbedDataSet(String pathStr,
                                              VectorSimilarityFunction vsf,
                                              List<VectorFloat<?>> baseVectors,
                                              List<VectorFloat<?>> queryVectors,
                                              List<List<Integer>> groundTruth) {
-        // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers
-        List<VectorFloat<?>> scrubbedBaseVectors;
-        List<VectorFloat<?>> scrubbedQueryVectors;
-        List<ArrayList<Integer>> gtSet;
-        scrubbedBaseVectors = new ArrayList<>(baseVectors.size());
-        scrubbedQueryVectors = new ArrayList<>(queryVectors.size());
-        gtSet = new ArrayList<>(groundTruth.size());
-        var uniqueVectors = new TreeSet<VectorFloat<?>>((a, b) -> {
-            assert a.length() == b.length();
-            for (int i = 0; i < a.length(); i++) {
-                if (a.get(i) < b.get(i)) {
-                    return -1;
-                }
-                if (a.get(i) > b.get(i)) {
-                    return 1;
-                }
-            }
-            return 0;
-        });
+        return legacyScrubDataSet(pathStr, vsf, baseVectors, queryVectors, groundTruth);
+    }
+
+    private static DataSet legacyScrubDataSet(String pathStr,
+                                              VectorSimilarityFunction vsf,
+                                              List<VectorFloat<?>> baseVectors,
+                                              List<VectorFloat<?>> queryVectors,
+                                              List<List<Integer>> groundTruth) {
+        List<VectorFloat<?>> scrubbedBaseVectors = new ArrayList<>(baseVectors.size());
+        List<VectorFloat<?>> scrubbedQueryVectors = new ArrayList<>(queryVectors.size());
+        List<ArrayList<Integer>> gtSet = new ArrayList<>(groundTruth.size());
+
+        var uniqueVectors = new TreeSet<VectorFloat<?>>(VECTOR_COMPARATOR);
         Map<Integer, Integer> rawToScrubbed = new HashMap<>();
-        {
-            int j = 0;
-            for (int i = 0; i < baseVectors.size(); i++) {
-                VectorFloat<?> v = baseVectors.get(i);
-                var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5;
-                if (valid && uniqueVectors.add(v)) {
-                    scrubbedBaseVectors.add(v);
-                    rawToScrubbed.put(i, j++);
-                }
+
+        int nextOrdinal = 0;
+        for (int i = 0; i < baseVectors.size(); i++) {
+            VectorFloat<?> v = baseVectors.get(i);
+            boolean valid = isValidLegacyVector(v, vsf);
+            if (valid && uniqueVectors.add(v)) {
+                scrubbedBaseVectors.add(v);
+                rawToScrubbed.put(i, nextOrdinal++);
             }
         }
-        // also remove zero query vectors and query vectors that are present in the base set
+
+        // Also remove zero query vectors and query vectors that are present in the base set.
         for (int i = 0; i < queryVectors.size(); i++) {
             VectorFloat<?> v = queryVectors.get(i);
-            var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5;
-            var dupe = uniqueVectors.contains(v);
+            boolean valid = isValidLegacyVector(v, vsf);
+            boolean dupe = uniqueVectors.contains(v);
             if (valid && !dupe) {
                 scrubbedQueryVectors.add(v);
-                var gt = new ArrayList<Integer>();
-                for (int j : groundTruth.get(i)) {
-                    gt.add(rawToScrubbed.get(j));
+                var gt = new ArrayList<Integer>(groundTruth.get(i).size());
+                for (int ordinal : groundTruth.get(i)) {
+                    gt.add(rawToScrubbed.get(ordinal));
                 }
                 gtSet.add(gt);
             }
         }
 
-        // now that the zero vectors are removed, we can normalize if it looks like they aren't already
-        if (vsf == VectorSimilarityFunction.DOT_PRODUCT) {
-            if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) {
-                normalizeAll(scrubbedBaseVectors);
-                normalizeAll(scrubbedQueryVectors);
-            }
+        if (shouldNormalizeLegacy(vsf, baseVectors)) {
+            normalizeAll(scrubbedBaseVectors);
+            normalizeAll(scrubbedQueryVectors);
         }
 
         assert scrubbedQueryVectors.size() == gtSet.size();
         return new SimpleDataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet);
     }
 
+    /** Euclidean datasets keep every vector; otherwise only vectors with norm above 1e-5 are kept. */
+    private static boolean isValidLegacyVector(VectorFloat<?> candidate, VectorSimilarityFunction similarity) {
+        return similarity == VectorSimilarityFunction.EUCLIDEAN || Math.abs(normOf(candidate)) > 1e-5;
+    }
+
+    /** True for dot-product data whose first raw base vector is not already unit length (within 1e-5). */
+    private static boolean shouldNormalizeLegacy(VectorSimilarityFunction vsf, List<VectorFloat<?>> baseVectors) {
+        return vsf == VectorSimilarityFunction.DOT_PRODUCT && !baseVectors.isEmpty() && Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5;
+    }
+
     public static void normalizeAll(Iterable<VectorFloat<?>> vectors) {
         for (VectorFloat<?> v : vectors) {
             VectorUtil.l2normalize(v);
diff --git a/jvector-examples/yaml-configs/ada002-100k.yml b/jvector-examples/yaml-configs/ada002-100k.yml
deleted file mode 100644
index 1cf7e1fbb..000000000
--- a/jvector-examples/yaml-configs/ada002-100k.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-yamlSchemaVersion: 1
-onDiskIndexVersion: 6
-
-dataset: ada002-100k
-
-construction:
-  outDegree: [32]
-  efConstruction: [100]
-  neighborOverflow: [1.2f]
-  addHierarchy: [Yes]
-  refineFinalGraph: [Yes]
-  fusedGraph: [Yes, No]
-  compression:
-    - type: PQ
-      parameters:
-        m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
-        # mFactor: 8
-        # k: 256 # optional parameter. By default, k=256
-        centerData: No
-        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
-  reranking:
-    - NVQ
-  useSavedIndexIfExists: No
-
-search:
-  topKOverquery:
-    10: [1.0, 2.0, 5.0, 10.0]
-    100: [1.0, 2.0]
-  useSearchPruning: [Yes]
-  compression:
-    - type: PQ
-      parameters:
-        m: 192
-        # k: 256 # optional parameter. By default, k=256
-        centerData: No
-        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
-
-# Run-level controls, such as benchmarks, console, and logging, are in run.yml.
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/colbert-1M.yml b/jvector-examples/yaml-configs/colbert-1M.yml
index 48f32b0c9..b9e6c72b7 100644
--- a/jvector-examples/yaml-configs/colbert-1M.yml
+++ b/jvector-examples/yaml-configs/colbert-1M.yml
@@ -11,13 +11,7 @@ construction:
   refineFinalGraph: [Yes]
   fusedGraph: [No]
   compression:
-    - type: PQ
-      parameters:
-        m: 32 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
-        # mFactor: 8
-        # k: 256 # optional parameter. By default, k=256
-        centerData: No
-        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+    - type: None
   reranking:
     - NVQ
   useSavedIndexIfExists: No
@@ -28,11 +22,6 @@ search:
     100: [1.0, 2.0]
   useSearchPruning: [Yes]
   compression:
-    - type: PQ
-      parameters:
-        m: 32
-        # k: 256 # optional parameter. By default, k=256
-        centerData: No
-        anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
+    - type: None
 
 # Run-level controls, such as benchmarks, console, and logging, are in run.yml.
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/dataset_metadata.yml b/jvector-examples/yaml-configs/dataset_metadata.yml
index 6aade63cc..21e5e69f9 100644
--- a/jvector-examples/yaml-configs/dataset_metadata.yml
+++ b/jvector-examples/yaml-configs/dataset_metadata.yml
@@ -1,39 +1,99 @@
-# This file contains the metadata for the datasets (formats) which do not have a dedicated
-# metadata facility. The MFD and hdf5 loaders use this file to determine the similarity function, among other things.
-# (HDF5 metadata support is moot for us since the runtime support fall short in other ways)
+# This file contains authoritative metadata for curated benchmark datasets whose
+# raw formats do not carry the properties we need at runtime.
 #
-# Ideally, this metadata is part of the format and access layer for a given dataset format. This file exists because
-# the dataset names herein are in a form which does _not_ support proper bundled configuration data with the raw data.
-# When possible, these dataset should be provided with another mechanism which fully handles this aspect of dataset
-# management so that we don't have to maintain separate parts in different places.
+# Both the MFD and HDF5 loaders use this file to determine dataset properties such
+# as similarity_function and load_behavior.
 #
-# You can put additional metadata here, but it will not be type-safe and reified properly unless there is an accompanying
-# change in the DataSetProperties interface and associated implementations.
+# load_behavior controls benchmark-loader processing:
+#   LEGACY_SCRUB - preserve the current load-time scrubbing behavior
+#   NO_SCRUB     - load vectors and ground truth exactly as stored
+#
+# During the transition, existing deployed datasets should generally remain on
+# LEGACY_SCRUB until their prescrubbed replacements and matching offline ground
+# truth are ready. New prescrubbed datasets should use NO_SCRUB.
+#
+# Additional metadata requires corresponding support in DataSetProperties and the
+# relevant loader code.
 
+ada002-100k:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+ada002-1M:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+cap-1M:
+  similarity_function: DOT_PRODUCT
+  load_behavior: LEGACY_SCRUB
+cap-6M:
+  similarity_function: DOT_PRODUCT
+  load_behavior: LEGACY_SCRUB
 cohere-english-v3-100k:
   similarity_function: COSINE
-  # examples of supported properties
-  # If not present, presumed to be false
-  # is_normalized: false
-  # is_zero_vector_free: false
-  # is duplicate_vector_free: false
-ada002-100k:
+  load_behavior: LEGACY_SCRUB
+cohere-english-v3-1M:
   similarity_function: COSINE
-openai-v3-small-100k:
+  load_behavior: LEGACY_SCRUB
+cohere-english-v3-10M:
   similarity_function: COSINE
-gecko-100k:
+  load_behavior: LEGACY_SCRUB
+colbert-1M:
   similarity_function: COSINE
-openai-v3-large-3072-100k:
+  load_behavior: LEGACY_SCRUB
+colbert-10M:
   similarity_function: COSINE
-openai-v3-large-1536-100k:
+  load_behavior: LEGACY_SCRUB
+degen-200k:
   similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+dpr-1M:
+  similarity_function: DOT_PRODUCT
+  load_behavior: LEGACY_SCRUB
+dpr-10M:
+  similarity_function: DOT_PRODUCT
+  load_behavior: LEGACY_SCRUB
 e5-small-v2-100k:
   similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
 e5-base-v2-100k:
   similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
 e5-large-v2-100k:
   similarity_function: COSINE
-ada002-1M:
+  load_behavior: LEGACY_SCRUB
+gecko-100k:
   similarity_function: COSINE
-colbert-1M:
+  load_behavior: LEGACY_SCRUB
+nv-qa-v4-100k:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+openai-v3-small-100k:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+openai-v3-large-3072-100k:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+openai-v3-large-1536-100k:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+# ann-benchmarks
+glove-25-angular.hdf5:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+glove-50-angular.hdf5:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+lastfm-64-dot.hdf5:
+  similarity_function: DOT_PRODUCT
+  load_behavior: LEGACY_SCRUB
+glove-100-angular.hdf5:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+glove-200-angular.hdf5:
+  similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+nytimes-256-angular.hdf5:
   similarity_function: COSINE
+  load_behavior: LEGACY_SCRUB
+sift-128-euclidean.hdf5:
+  similarity_function: EUCLIDEAN
+  load_behavior: LEGACY_SCRUB
\ No newline at end of file
diff --git a/jvector-examples/yaml-configs/default.yml b/jvector-examples/yaml-configs/default.yml
index 2b37e61a0..346a701e4 100644
--- a/jvector-examples/yaml-configs/default.yml
+++ b/jvector-examples/yaml-configs/default.yml
@@ -13,8 +13,8 @@ construction:
   compression:
     - type: PQ
       parameters:
-        m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
-        # mFactor: 8
+        # m: 192 # we can either specify the integer m or the integer mFactor. In this case, m will be set to the data dimensionality divided by mFactor
+        mFactor: 8
         # k: 256 # optional parameter. By default, k=256
         centerData: No
         anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
@@ -30,7 +30,8 @@ search:
   compression:
     - type: PQ
       parameters:
-        m: 192
+        # m: 192
+        mFactor: 8
         # k: 256 # optional parameter. By default, k=256
         centerData: No
         anisotropicThreshold: -1.0 # optional parameter. By default, anisotropicThreshold=-1 (i.e., no anisotropy)
diff --git a/jvector-examples/yaml-configs/glove-100-angular.yml b/jvector-examples/yaml-configs/glove-100-angular.yml
new file mode 100644
index 000000000..5f80f4ae2
--- /dev/null
+++ b/jvector-examples/yaml-configs/glove-100-angular.yml
@@ -0,0 +1,26 @@
+# This config is here for loader testing purposes only; use other datasets for realistic test data
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: glove-100-angular
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [No]
+  compression:
+    - type: None
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0, 2.0, 5.0, 10.0]
+    100: [1.0, 2.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: None
diff --git a/jvector-examples/yaml-configs/glove-200-angular.yml b/jvector-examples/yaml-configs/glove-200-angular.yml
new file mode 100644
index 000000000..faf5bf3fa
--- /dev/null
+++ b/jvector-examples/yaml-configs/glove-200-angular.yml
@@ -0,0 +1,26 @@
+# This config is here for loader testing purposes only; use other datasets for realistic test data
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: glove-200-angular
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [No]
+  compression:
+    - type: None
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0, 2.0, 5.0, 10.0]
+    100: [1.0, 2.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: None
diff --git a/jvector-examples/yaml-configs/glove-25-angular.yml b/jvector-examples/yaml-configs/glove-25-angular.yml
index e982302b8..953f454e2 100644
--- a/jvector-examples/yaml-configs/glove-25-angular.yml
+++ b/jvector-examples/yaml-configs/glove-25-angular.yml
@@ -12,14 +12,10 @@ construction:
   refineFinalGraph: [Yes]
   fusedGraph: [No]
   compression:
-    - type: PQ
-      parameters:
-        mFactor: 4
-        centerData: No
-        anisotropicThreshold: -1.0
+    - type: None
   reranking:
     - NVQ
-  useSavedIndexIfExists: Yes
+  useSavedIndexIfExists: No
 
 search:
   topKOverquery:
@@ -27,8 +23,4 @@ search:
     100: [1.0, 2.0]
   useSearchPruning: [Yes]
   compression:
-    - type: PQ
-      parameters:
-        mFactor: 4
-        centerData: No
-        anisotropicThreshold: -1.0
+    - type: None
diff --git a/jvector-examples/yaml-configs/glove-50-angular.yml b/jvector-examples/yaml-configs/glove-50-angular.yml
new file mode 100644
index 000000000..a62eede12
--- /dev/null
+++ b/jvector-examples/yaml-configs/glove-50-angular.yml
@@ -0,0 +1,26 @@
+# This config is here for loader testing purposes only; use other datasets for realistic test data
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: glove-50-angular
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [No]
+  compression:
+    - type: None
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0, 2.0, 5.0, 10.0]
+    100: [1.0, 2.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: None
diff --git a/jvector-examples/yaml-configs/lastfm-64-dot.yml b/jvector-examples/yaml-configs/lastfm-64-dot.yml
new file mode 100644
index 000000000..045963849
--- /dev/null
+++ b/jvector-examples/yaml-configs/lastfm-64-dot.yml
@@ -0,0 +1,26 @@
+# This config is here for loader testing purposes only; use other datasets for realistic test data
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: lastfm-64-dot
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [No]
+  compression:
+    - type: None
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0, 2.0, 5.0, 10.0]
+    100: [1.0, 2.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: None
diff --git a/jvector-examples/yaml-configs/nytimes-256-angular.yml b/jvector-examples/yaml-configs/nytimes-256-angular.yml
new file mode 100644
index 000000000..755e99017
--- /dev/null
+++ b/jvector-examples/yaml-configs/nytimes-256-angular.yml
@@ -0,0 +1,26 @@
+# This config is here for loader testing purposes only; use other datasets for realistic test data
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: nytimes-256-angular
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [No]
+  compression:
+    - type: None
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0, 2.0, 5.0, 10.0]
+    100: [1.0, 2.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: None
diff --git a/jvector-examples/yaml-configs/sift-128-euclidean.yml b/jvector-examples/yaml-configs/sift-128-euclidean.yml
new file mode 100644
index 000000000..04d4b6fd8
--- /dev/null
+++ b/jvector-examples/yaml-configs/sift-128-euclidean.yml
@@ -0,0 +1,26 @@
+# This config is here for loader testing purposes only; use other datasets for realistic test data
+yamlSchemaVersion: 1
+onDiskIndexVersion: 6
+
+dataset: sift-128-euclidean
+
+construction:
+  outDegree: [32]
+  efConstruction: [100]
+  neighborOverflow: [1.2f]
+  addHierarchy: [Yes]
+  refineFinalGraph: [Yes]
+  fusedGraph: [No]
+  compression:
+    - type: None
+  reranking:
+    - NVQ
+  useSavedIndexIfExists: No
+
+search:
+  topKOverquery:
+    10: [1.0, 2.0, 5.0, 10.0]
+    100: [1.0, 2.0]
+  useSearchPruning: [Yes]
+  compression:
+    - type: None