diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py
index f7fb3cec3e..bda6053eec 100644
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/srt/test_eval_accuracy_large.py
@@ -68,6 +68,17 @@ def test_mgsm_en(self):
         metrics = run_eval(args)
         self.assertGreater(metrics["score"], 0.835)
 
+    def test_math(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="math",
+            num_examples=5000,
+            num_threads=1024
+        )
+
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.519 - 0.01)  # -1% to account for sampling variance
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py
index a008c3869e..74741aba57 100644
--- a/test/srt/test_eval_accuracy_mini.py
+++ b/test/srt/test_eval_accuracy_mini.py
@@ -37,6 +37,18 @@ def test_mmlu(self):
         metrics = run_eval(args)
         self.assertGreaterEqual(metrics["score"], 0.65)
 
+    def test_math(self):
+        args = SimpleNamespace(
+            base_url=self.base_url,
+            model=self.model,
+            eval_name="math",
+            num_examples=64,
+            num_threads=32,
+            temperature=0.1,
+        )
+        metrics = run_eval(args)
+        self.assertGreaterEqual(metrics["score"], 0.519 - 0.03)  # -3% to account for sampling variance
+
 if __name__ == "__main__":
     unittest.main()