from delphi.__main__ import run
from delphi.config import CacheConfig, ConstructorConfig, RunConfig, SamplerConfig
- from delphi.log.result_analysis import build_scores_df, latent_balanced_score_metrics
+ from delphi.log.result_analysis import get_metrics, load_data


async def test():
@@ -58,23 +58,16 @@ async def test():
    end_time = time.time()
    print(f"Time taken: {end_time - start_time} seconds")

-    # Performs better than random guessing
    scores_path = Path.cwd() / "results" / run_cfg.name / "scores"
-    hookpoint_firing_counts = torch.load(
-        Path.cwd() / "results" / run_cfg.name / "log" / "hookpoint_firing_counts.pt",
-        weights_only=True,
-    )
-    df = build_scores_df(scores_path, run_cfg.hookpoints, hookpoint_firing_counts)
-    for score_type in df["score_type"].unique():
-        score_df = df.query(f"score_type == '{score_type}'")

-        weighted_mean_metrics = latent_balanced_score_metrics(
-            score_df, score_type, verbose=False
-        )
+    latent_df, _ = load_data(scores_path, run_cfg.hookpoints)
+    processed_df = get_metrics(latent_df)

-        accuracy = weighted_mean_metrics["accuracy"]
+    # Performs better than random guessing
+    for score_type, df in processed_df.groupby("score_type"):
+        accuracy = df["accuracy"].mean()

        assert accuracy > 0.55, f"Score type {score_type} has an accuracy of {accuracy}"

if __name__ == "__main__":
-    asyncio.run(test())
+    asyncio.run(test())
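For reference, a minimal sketch of the new `load_data`/`get_metrics` flow in isolation, assuming a completed run whose scores were written under `results/<run_name>/scores` as in the test above; the run name and hookpoint list are placeholders, and only the calls visible in this diff are assumed.

```python
from pathlib import Path

from delphi.log.result_analysis import get_metrics, load_data

# Hypothetical run name and hookpoint list; substitute your own run config values.
scores_path = Path.cwd() / "results" / "my_run" / "scores"
hookpoints = ["layers.5"]

# load_data returns a tuple; the test only uses the first element (the latent df).
latent_df, _ = load_data(scores_path, hookpoints)
processed_df = get_metrics(latent_df)

# Report mean accuracy per score type, mirroring the assertion in the test.
for score_type, df in processed_df.groupby("score_type"):
    print(f"{score_type}: mean accuracy = {df['accuracy'].mean():.3f}")
```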