
Update PMPP eval #46


Open
wants to merge 1 commit into base: main
251 changes: 179 additions & 72 deletions problems/pmpp/eval.py
@@ -1,11 +1,13 @@
import base64
import dataclasses
import multiprocessing
import re
import time
import os
import sys
import math
from pathlib import Path
from typing import Any
from typing import Any, Optional

import torch.cuda

@@ -15,16 +17,13 @@
except ImportError:
TestSpec = dict

from submission import custom_kernel
from reference import check_implementation, generate_input

WARMUP_RUNS = 10
TIMED_RUNS = 100


class PopcornOutput:
def __init__(self, fd: int):
self.file = os.fdopen(fd, 'w')
os.set_inheritable(fd, False)

def __enter__(self):
return self
@@ -45,7 +44,18 @@ class TestCase:
spec: str


def get_test_cases(file_name: str) -> list[TestCase]:
def _combine(a: int, b: int) -> int:
# Combine two integers into one.
# We need this to derive a secret per-test seed from the test-level seed and
# the global secret seed. The test-level seeds are public knowledge and
# typically small, so they must not leak any useful information about the
# full seed. This Cantor-pairing construction ensures that if the secret seed
# is a large number, then so is the combined seed.
return int(a + (a+b)*(a+b+1)//2)
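
# A minimal sketch (illustration only; the numbers below are invented) of how
# the pairing above keeps the combined seed large whenever the secret seed is
# large, even though the public per-test seed is small.
def _sketch_combine_seed():
    public_seed = 7                # known from the test definition file
    secret_seed = 1_000_003        # hypothetical value of the secret seed
    combined = _combine(public_seed, secret_seed)
    # (a + b)(a + b + 1) / 2 grows quadratically in the secret seed, so the
    # public seed alone reveals essentially nothing about the combined value.
    assert combined > secret_seed * secret_seed // 2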


def get_test_cases(file_name: str, seed: Optional[int]) -> list[TestCase]:
try:
content = Path(file_name).read_text()
except Exception as E:
@@ -73,15 +83,12 @@ def get_test_cases(file_name: str) -> list[TestCase]:
case[key] = val
tests.append(TestCase(spec=line, args=case))

return tests

if seed is not None:
for test in tests:
if "seed" in test.args:
test.args["seed"] = _combine(test.args["seed"], seed)

def warm_up(test: TestCase):
data = generate_input(**test.args)
start = time.perf_counter()
while time.perf_counter() - start < 0.2:
custom_kernel(data)
torch.cuda.synchronize()
return tests


@dataclasses.dataclass
@@ -115,7 +122,53 @@ def calculate_stats(durations: list[int]):
worst=float(worst))


def run_testing(logger: PopcornOutput, tests: list[TestCase]):
def _clone_data(data):
"""
Recursively goes through data and clones all tensors.
"""
if isinstance(data, tuple):
return tuple(_clone_data(x) for x in data)
elif isinstance(data, list):
return [_clone_data(x) for x in data]
elif isinstance(data, dict):
return {k: _clone_data(v) for k, v in data.items()}
elif isinstance(data, torch.Tensor):
return data.clone()
else:
return data
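
# A brief sketch (hypothetical kernel, illustration only) of why the reference
# data is cloned before being handed to the submission: a kernel that mutates
# its input in place would otherwise corrupt the data the checker compares against.
def _sketch_clone_protects_reference():
    def inplace_kernel(t: torch.Tensor) -> torch.Tensor:
        return t.mul_(2)  # overwrites the input buffer

    reference_input = torch.ones(4)
    _ = inplace_kernel(_clone_data(reference_input))
    assert torch.equal(reference_input, torch.ones(4))  # original left untouched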


def wrap_check_implementation(data, submission_output):
# Old version returned just a single string, new version
# returns (bool, str); this function ensures compatibility with old
# problem definitions.
result = check_implementation(data, submission_output)
if isinstance(result, tuple):
return result
else:
return not bool(result), result
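
# Illustration only: the two checker conventions the wrapper above reconciles,
# shown with invented return values rather than a real check_implementation.
def _sketch_wrap_conventions():
    old_style_pass = ""                     # empty string means success
    old_style_fail = "mismatch at index 3"  # any non-empty string is an error
    new_style = (True, "all close")         # explicit (passed, message) pair
    for raw in (old_style_pass, old_style_fail, new_style):
        passed, message = raw if isinstance(raw, tuple) else (not bool(raw), raw)
        assert isinstance(passed, bool) and isinstance(message, str)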


def _run_single_test(test: TestCase):
"""
Runs a single test case. Do not call directly; use run_single_test, which
dispatches this function to the worker process.
"""
from submission import custom_kernel
data = generate_input(**test.args)
torch.cuda.synchronize()
submission_output = custom_kernel(_clone_data(data))
torch.cuda.synchronize()
return wrap_check_implementation(data, submission_output)


def run_single_test(pool: multiprocessing.Pool, test: TestCase):
"""
Runs a single test in another process.
"""
return pool.apply(_run_single_test, (test,))


def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
"""
Executes the actual test case code and checks for correctness.

@@ -127,18 +180,15 @@ def run_testing(logger: PopcornOutput, tests: list[TestCase]):
logger.log("test-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"test.{idx}.spec", test.spec)

data = generate_input(**test.args)
torch.cuda.synchronize()
submission_output = custom_kernel(data)
torch.cuda.synchronize()
error = check_implementation(data, submission_output)
if error:
good, message = run_single_test(pool, test)
if not good:
logger.log(f"test.{idx}.status", "fail")
logger.log(f"test.{idx}.error", error)
logger.log(f"test.{idx}.error", message)
passed = False
else:
logger.log(f"test.{idx}.status", "pass")
if message:
logger.log(f"test.{idx}.message", message)

if passed:
logger.log("check", "pass")
@@ -148,69 +198,95 @@ def run_testing(logger: PopcornOutput, tests: list[TestCase]):
return 112


def benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_time_ns: float) -> Stats | Any:
"""
For a particular test case, check correctness (if applicable) and grab runtime results.

@param test: TestCase object.
@param recheck: Flag for whether to explicitly check functional correctness.
@param max_repeats: Number of trials to repeat.
@param max_time_ns: Timeout time in nanoseconds.
@return: A Stats object for this particular benchmark case or an error if the test fails.
Runs one benchmark. Do not call directly.
"""
from submission import custom_kernel

durations = []
# generate input data once
data = generate_input(**test.args)
# first, one obligatory correctness check; also triggers triton compile for the given shape
check_copy = _clone_data(data)
# first, one obligatory correctness check
output = custom_kernel(data)
error = check_implementation(data, output)
if error:
return error
good, message = wrap_check_implementation(check_copy, output)
if not good:
return message

# now, do multiple timing runs without further correctness testing
# there is an upper bound of 100 runs, and a lower bound of 3 runs;
# otherwise, we repeat until we either measure at least 10 full seconds,
# or the relative error of the mean is below 0.1%.

bm_start_time = time.perf_counter_ns()
for i in range(max_repeats):
if recheck:
# ensure we use a different seed for every benchmark
if "seed" in test.args:
test.args["seed"] += 13

data = generate_input(**test.args)
check_copy = _clone_data(data)
torch.cuda.synchronize()
start = time.perf_counter_ns()
output = custom_kernel(data)
torch.cuda.synchronize()
end = time.perf_counter_ns()

if recheck:
error = check_implementation(data, output)
if error:
return error
good, message = wrap_check_implementation(check_copy, output)
if not good:
return message

del output
durations.append(end-start)
durations.append(end - start)

if i > 1:
total_bm_duration = time.perf_counter_ns() - bm_start_time
stats = calculate_stats(durations)
if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
# stop if either
# a) relative error dips below 0.1%
# b) we exceed the total time limit for benchmarking the kernel
# c) we exceed 2 minutes of total wallclock time.
if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns or total_bm_duration > 120e9:
break

return calculate_stats(durations)
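
# A rough sketch of the early-stopping check above, assuming Stats.err is the
# standard error of the mean (std / sqrt(runs)); the durations are invented.
def _sketch_stopping_rule(durations_ns, max_time_ns=10e9):
    n = len(durations_ns)
    mean = sum(durations_ns) / n
    std = (sum((d - mean) ** 2 for d in durations_ns) / (n - 1)) ** 0.5
    err = std / math.sqrt(n)
    # stop once the mean is known to about 0.1%, or once further runs would
    # exceed the time budget for this kernel
    return err / mean < 0.001 or mean * n > max_time_ns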


def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
def run_single_benchmark(pool: multiprocessing.Pool, test: TestCase, recheck: bool, max_repeats: int,
max_time_ns: float):
"""
For a particular test case, check correctness (if applicable) and grab runtime results.

@param pool: Process pool on which the benchmark will be launched.
@param test: TestCase object.
@param recheck: Flag for whether to explicitly check functional correctness.
@param max_repeats: Number of trials to repeat.
@param max_time_ns: Timeout time in nanoseconds.
@return: A Stats object for this particular benchmark case or an error if the test fails.
"""
return pool.apply(_run_single_benchmark, (test, recheck, max_repeats, max_time_ns))


def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[TestCase]):
"""
Executes benchmarking code for a CUDA Kernel and logs runtimes.

@param logger: A PopcornOutput object used for logging benchmark results.
@param pool: Process pool on which the benchmarks will be launched.
@param tests: A list of TestCase objects representing the test cases to be benchmarked.
@return: An integer representing the exit status: 0 if all benchmarks pass, otherwise 112.
"""
warm_up(tests[0])
# warm up
run_single_benchmark(pool, tests[0], False, 100, 10e7)

passed = True
logger.log("benchmark-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"benchmark.{idx}.spec", test.spec)
result = benchmark(test, False, 100, 10e9)
result = run_single_benchmark(pool, test, False, 100, 10e9)
if isinstance(result, Stats):
for field in dataclasses.fields(Stats):
logger.log(f"benchmark.{idx}.{field.name}", getattr(result, field.name))
@@ -227,6 +303,31 @@ def run_benchmarking(logger: PopcornOutput, tests: list[TestCase]):
return 112


def run_single_profile(test: TestCase) -> str:
"""
Profiles a single test case under torch.profiler and returns the formatted
summary table.
"""
from submission import custom_kernel
from torch.profiler import profile, record_function, ProfilerActivity
data = generate_input(**test.args)
torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
submission_output = custom_kernel(_clone_data(data))
torch.cuda.synchronize()
return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)


def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
logger.log("benchmark-count", len(tests))
for idx, test in enumerate(tests):
logger.log(f"benchmark.{idx}.spec", test.spec)
report = run_single_profile(test)
logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
logger.log("check", "pass")
return 0
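
# A small sketch of how a consumer of the log line above might recover the
# profiler table; the b"+*" altchars keep "+" but substitute "*" for "/" in
# the base64 alphabet.
def _sketch_decode_report(encoded: str) -> str:
    return base64.b64decode(encoded, b"+*").decode("utf-8")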


def main():
fd = os.getenv("POPCORN_FD")
if not fd:
@@ -236,38 +337,44 @@ def main():
return 2

mode = sys.argv[1]
tests = get_test_cases(sys.argv[2])
seed = os.getenv("POPCORN_SEED")
os.unsetenv("POPCORN_SEED")
seed = int(seed) if seed else None
set_seed(seed or 42)
tests = get_test_cases(sys.argv[2], seed)

with PopcornOutput(int(fd)) as logger:
seed = os.getenv("POPCORN_SEED")
seed = int(seed) if seed else 42
set_seed(seed)

if mode == "test":
return run_testing(logger, tests)

if mode == "benchmark":
return run_benchmarking(logger, tests)

if mode == "leaderboard":
warm_up(tests[0])
result = benchmark(tests[-1], True, 100, 30e9)
if isinstance(result, Stats):
logger.log("benchmark-count", 1)
logger.log(f"benchmark.0.spec", tests[-1].spec)
logger.log(f"benchmark.0.runs", result.runs)
logger.log(f"benchmark.0.mean", result.mean)
logger.log(f"benchmark.0.std", result.std)
logger.log(f"benchmark.0.err", result.err)
logger.log("check", "pass")
import multiprocessing
mp_context = multiprocessing.get_context('spawn')
with mp_context.Pool(1) as pool:
if mode == "test":
return run_testing(logger, pool, tests)
if mode == "benchmark":
return run_benchmarking(logger, pool, tests)

if mode == "leaderboard":
# warmup
run_single_benchmark(pool, tests[0], False, 100, 1e7)
logger.log("benchmark-count", len(tests))
passed = True
for i in range(len(tests)):
result = run_single_benchmark(pool, tests[i], True, 100, 30e9)
logger.log(f"benchmark.{i}.spec", tests[i].spec)
if isinstance(result, Stats):
for field in dataclasses.fields(Stats):
logger.log(f"benchmark.{i}.{field.name}", getattr(result, field.name))
else:
passed = False
logger.log(f"benchmark.{i}.status", "fail")
logger.log(f"benchmark.{i}.error", str(result)) # TODO: Make sure result implements __str__?
break

logger.log("check", "pass" if passed else "fail")
elif mode == "profile":
run_profiling(logger, tests)
else:
logger.log("test-count", 1)
logger.log("test.0.status", "fail")
logger.log("test.0.error", str(result)) #TODO: Make sure result implements __str__?

else:
# TODO: Implement script and profile mode
return 2
# TODO: Implement script mode
return 2


if __name__ == "__main__":