From 48cda85c29ad4b0b9dfc5aeaa4b5be38a70e2eb7 Mon Sep 17 00:00:00 2001 From: study8677 <867762462f@gmail.com> Date: Sun, 7 Dec 2025 11:12:30 +0800 Subject: [PATCH 1/2] fix: remove extra bracket in std calc and specify tensor device --- agentevolver/module/trainer/ae_ray_trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/agentevolver/module/trainer/ae_ray_trainer.py b/agentevolver/module/trainer/ae_ray_trainer.py index 5ba5082..29fffea 100644 --- a/agentevolver/module/trainer/ae_ray_trainer.py +++ b/agentevolver/module/trainer/ae_ray_trainer.py @@ -197,11 +197,11 @@ def compute_grpo_outcome_advantage( id2score[index[i]].append(scores[i]) for idx in id2score: if len(id2score[idx]) == 1: - id2mean[idx] = torch.tensor(0.0) - id2std[idx] = torch.tensor(1.0) + id2mean[idx] = torch.tensor(0.0, device=scores.device) + id2std[idx] = torch.tensor(1.0, device=scores.device) elif len(id2score[idx]) > 1: - id2mean[idx] = torch.mean(torch.tensor(id2score[idx])) - id2std[idx] = torch.std(torch.tensor([id2score[idx]])) + id2mean[idx] = torch.mean(torch.tensor(id2score[idx], device=scores.device)) + id2std[idx] = torch.std(torch.tensor(id2score[idx], device=scores.device)) else: raise ValueError(f"no score in prompt index: {idx}") for i in range(bsz): From 5ac55110ecb474e1d3c852be940bd7b7ef8d97b1 Mon Sep 17 00:00:00 2001 From: study8677 <867762462f@gmail.com> Date: Sun, 7 Dec 2025 11:31:53 +0800 Subject: [PATCH 2/2] refactor: apply PR feedback for N=1 logic and torch.stack optimization --- agentevolver/module/trainer/ae_ray_trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/agentevolver/module/trainer/ae_ray_trainer.py b/agentevolver/module/trainer/ae_ray_trainer.py index 29fffea..f26d9ee 100644 --- a/agentevolver/module/trainer/ae_ray_trainer.py +++ b/agentevolver/module/trainer/ae_ray_trainer.py @@ -197,11 +197,12 @@ def compute_grpo_outcome_advantage( id2score[index[i]].append(scores[i]) for idx in id2score: if len(id2score[idx]) == 1: - id2mean[idx] = torch.tensor(0.0, device=scores.device) + id2mean[idx] = id2score[idx][0] id2std[idx] = torch.tensor(1.0, device=scores.device) elif len(id2score[idx]) > 1: - id2mean[idx] = torch.mean(torch.tensor(id2score[idx], device=scores.device)) - id2std[idx] = torch.std(torch.tensor(id2score[idx], device=scores.device)) + group_scores = torch.stack(id2score[idx]) + id2mean[idx] = group_scores.mean() + id2std[idx] = group_scores.std() else: raise ValueError(f"no score in prompt index: {idx}") for i in range(bsz):