
Commit 4f089a5

[docs]: upload PPO performance
1 parent c0871ca commit 4f089a5

5 files changed: +63 −34 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 data/*
 !data/plot_*.py
-
+!data/happo_learning_curve_simple_tag_v3_s23.png
 models/*
Binary image file (676 KB)

HAPPO-MAPPO_Continous_Heterogeneous/data/plot_reward.py

Lines changed: 9 additions & 16 deletions

@@ -3,6 +3,7 @@
 import os
 import glob
 import argparse
+import re
 
 def plot_rewards(file_path=None, data_dir=None, show=True, save=True):
     """
@@ -32,7 +33,7 @@ def plot_rewards(file_path=None, data_dir=None, show=True, save=True):
     # If file not specified, find the latest CSV file
     if file_path is None:
         # Changed: support auto-discovery of the latest HAPPO reward file
-        csv_files = glob.glob(os.path.join(data_dir, "happo_rewards_*.csv"))
+        csv_files = glob.glob(os.path.join(data_dir, "happo_rewards_simple_tag_v3_n1_s23_2025-09-24_20-12.csv"))
         if not csv_files:
             print(f"Error: No HAPPO reward CSV files found in directory {data_dir}")
             return
@@ -54,20 +55,12 @@ def plot_rewards(file_path=None, data_dir=None, show=True, save=True):
     filename = os.path.basename(file_path)
     parts = filename.split('_')
 
-    # Changed: better filename parsing logic
     algorithm = "HAPPO"  # default algorithm name
-    env_name = "unknown"
-    agents = "?"
-    seed = "?"
-
-    if filename.startswith("happo_rewards_"):
-        # Format: happo_rewards_{env_name}_n{number}_s{seed}_{timestamp}.csv
-        try:
-            env_name = parts[2]  # simple_tag_v3
-            agents = parts[3][1:] if parts[3].startswith('n') else parts[3]  # strip the 'n' prefix
-            seed = parts[4][1:] if parts[4].startswith('s') else parts[4]  # strip the 's' prefix
-        except IndexError:
-            pass  # use the defaults
+    env_name = "simple_tag_v3"
+    # Extract the seed value from the filename
+    seed_match = re.search(r"_s(\d+)_", filename)
+    if seed_match:
+        seed = seed_match.group(1)
 
     # Create chart with better styling
     plt.figure(figsize=(12, 8))
@@ -76,7 +69,7 @@ def plot_rewards(file_path=None, data_dir=None, show=True, save=True):
     plt.ylabel('Evaluation Reward', fontsize=12)
 
     # Changed: English title with a clearer format
-    plt.title(f'{algorithm} Learning Curve | Env: {env_name} | Agents: {agents} | Seed: {seed}',
+    plt.title(f'{algorithm} Learning Curve | Env: {env_name} | Seed: {seed}',
               fontsize=14, fontweight='bold')
     plt.grid(True, linestyle='--', alpha=0.3)
 
@@ -118,7 +111,7 @@ def plot_rewards(file_path=None, data_dir=None, show=True, save=True):
     plots_dir = os.path.join(current_dir)
     os.makedirs(plots_dir, exist_ok=True)
     # Changed: clearer output filename
-    plt_filename = os.path.join(plots_dir, f"{algorithm.lower()}_learning_curve_{env_name}_n{agents}_s{seed}.png")
+    plt_filename = os.path.join(plots_dir, f"{algorithm.lower()}_learning_curve_{env_name}_s{seed}.png")
     plt.savefig(plt_filename, dpi=300, bbox_inches='tight')
     print(f"Chart saved to: {plt_filename}")
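In short, the reworked parsing no longer splits the filename positionally: the environment name is hardcoded and the seed is pulled out with a regex. A minimal sketch of that logic, using the same sample filename that is now hardcoded in the glob pattern above; the resulting plot name matches the PNG whitelisted in the first file of this commit:

```python
import re

# Sample CSV name: the one now hardcoded in the glob pattern above.
filename = "happo_rewards_simple_tag_v3_n1_s23_2025-09-24_20-12.csv"

algorithm = "HAPPO"
env_name = "simple_tag_v3"   # hardcoded in the new version of the script

seed = "?"                   # fallback; the committed code omits this default
seed_match = re.search(r"_s(\d+)_", filename)  # matches "_s23_"
if seed_match:
    seed = seed_match.group(1)                 # -> "23"

print(f"{algorithm.lower()}_learning_curve_{env_name}_s{seed}.png")
# happo_learning_curve_simple_tag_v3_s23.png
```

Note the fallback default in the sketch: since the commit deletes the old `seed = "?"` line and only assigns `seed` inside the `if seed_match:` branch, a filename without an `_s<digits>_` segment would leave `seed` undefined when the title string is built.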

README.md

Lines changed: 24 additions & 6 deletions

@@ -11,7 +11,7 @@
 | [动手学强化学习](./动手学强化学习/) | ![状态](https://img.shields.io/badge/状态-参考实现-informational) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-DQN到DDPG-blue) | [README](./动手学强化学习/README.md) |
 | [MADDPG_Continous](./MADDPG_Continous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-连续MADDPG-blue) | [中文文档](./MADDPG_Continous/README.md#项目特色) |
 | [MATD3_Continous](./MATD3_Continous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-100%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-连续MATD3-blue) | [中文文档](./MATD3_Continous/readme.md) |
-
+| [HAPPO-MAPPO_Continous_Heterogeneous](./HAPPO-MAPPO_Continous_Heterogeneous/) | ![状态](https://img.shields.io/badge/状态-已完成-success) | ![完成度](https://img.shields.io/badge/完成度-95%25-brightgreen) | ![技术](https://img.shields.io/badge/技术-PPO异构智能体-blue) | [中文文档](./HAPPO-MAPPO_Continous_Heterogeneous/Readme.md) |
 
 ## 学习路径与项目关联
 本仓库中的项目构成了一条从基础强化学习到多智能体强化学习的完整学习路径:
@@ -49,7 +49,7 @@
 #### 参考资源
 - [赵老师强化学习课程](https://www.bilibili.com/video/BV1sd4y167NS)
 - [强化学习的数学原理](https://github.com/MathFoundationRL/Book-Mathematical-Foundation-of-Reinforcement-Learning)
-#### 代码位置 [`赵老师强化学习代码仓库: ./RL_Learning-main`](./RL_Learning-main/scripts)
+#### 代码位置 [`赵老师强化学习代码仓库: ./RL_Learning-main`](./RL_Learning-main/scripts)
 
 #### 更新日志
 
@@ -98,13 +98,13 @@
 </div>
 
 
-#### 实现进度
+##### 实现进度
 | 算法 | 状态 | 位置 | 核心组件 |
 |----------------|--------|----------------------|----------------------------------|
 | MADDPG | ✅ 1.0 | `agents/maddpg/` | MADDPG_agent, DDPG_agent, buffer |
 | Independent RL | ⏳ 待完成 | `agents/independent/`| IndependentRL (计划中) |
 | Centralized RL | ⏳ 待完成 | `agents/centralized/`| CentralizedRL (计划中) |
-#### 代码位置 [`./MADDPG_Continous`](./MADDPG_Continous)
+##### 代码位置 [`./MADDPG_Continous`](./MADDPG_Continous)
 
 
 #### 3.2 MATD3_Continous:多智能体双延迟深度确定性策略梯度算法
@@ -123,17 +123,35 @@
 <p><strong>MATD3算法在simple_tag_env环境中的奖励收敛曲线</strong></p>
 </div>
 
-#### MATD3 vs MADDPG
+##### MATD3 vs MADDPG
 MATD3对标准MADDPG进行了以下关键增强:
 
 1. **双Q网络设计**: 减少对动作值的过估计
 2. **延迟策略更新**: 提高训练稳定性
 3. **目标策略平滑**: 通过在目标动作中加入噪声防止过拟合
 4. **自适应噪声调整**: 根据训练进度动态调整探索噪声
 
-#### 代码位置 [`./MATD3_Continous`](./MATD3_Continous)
+##### 代码位置 [`./MATD3_Continous`](./MATD3_Continous)
+
+#### 3.3 MAPPO-HAPPO算法:支持同构/异构智能体的多智能体近端策略优化
+
+实现了两种基于PPO的多智能体算法:MAPPO(多智能体近端策略优化)和HAPPO(异构智能体近端策略优化),为连续动作空间和异构智能体环境提供了解决方案。
+
+<div align="center">
+<img src="./HAPPO-MAPPO_Continous_Heterogeneous/data/happo_learning_curve_simple_tag_v3_s23.png" alt="HAPPO算法表现" width="80%"/>
+<p><strong>HAPPO算法特点:支持异构智能体协作与竞争,每个智能体可以有不同的观察维度</strong></p>
+</div>
+
+##### HAPPO/MAPPO的优势
 
+1. **无需采用确定性策略**:基于PPO,使用随机策略,减轻过拟合
+2. **异构智能体支持**:HAPPO特别支持不同观察维度和能力的异构智能体
+3. **训练稳定性**:PPO的截断机制提供更稳定的训练过程
+4. **采样效率**:通过多回合更新提高样本利用效率
+5. **超参数鲁棒性**:对超参数选择不那么敏感
 
+##### 代码位置 [`./MAPPO_Continous_Homogeneous`](./MAPPO_Continous_Homogeneous)
+##### 代码位置 [`./HAPPO-MAPPO_Continous_Heterogeneous`](./HAPPO-MAPPO_Continous_Heterogeneous)
 
 ## 进行中的项目
 - **MARL**: 基于深度强化学习的多智能体协作与协调
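The caption added above highlights that HAPPO handles agents with different observation dimensions. In practice this usually just means one actor network per agent, each sized to its own observation space. A minimal, illustrative PyTorch sketch; the agent names and dimensions are hypothetical and not taken from this project's code:

```python
import torch
import torch.nn as nn

# Hypothetical per-agent observation sizes (simple_tag_v3-style names).
obs_dims = {"adversary_0": 12, "adversary_1": 12, "agent_0": 10}
act_dim = 5

# One actor per agent, each with its own input width.
actors = nn.ModuleDict({
    name: nn.Sequential(
        nn.Linear(obs_dim, 64), nn.Tanh(),
        nn.Linear(64, 64), nn.Tanh(),
        nn.Linear(64, act_dim), nn.Tanh(),  # e.g. mean of a Gaussian policy
    )
    for name, obs_dim in obs_dims.items()
})

# Each agent forwards only its own observation through its own network.
obs = {name: torch.randn(1, dim) for name, dim in obs_dims.items()}
actions = {name: actors[name](obs[name]) for name in obs_dims}
for name, a in actions.items():
    print(name, a.shape)  # torch.Size([1, 5]) for every agent
```

A centralized critic can still consume the concatenated observations of all agents, which is the usual centralized-training, decentralized-execution arrangement for MAPPO/HAPPO-style methods.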

README_en.md

Lines changed: 29 additions & 11 deletions

@@ -12,7 +12,7 @@ This repository contains multiple projects related to Reinforcement Learning (RL
 | [Hands-on RL](./动手学强化学习/) | ![Status](https://img.shields.io/badge/status-reference-informational) | ![Completion](https://img.shields.io/badge/completion-100%25-brightgreen) | ![Tech](https://img.shields.io/badge/tech-DQN%20to%20DDPG-blue) | [README](./动手学强化学习/README.md) |
 | [MADDPG_Continous](./MADDPG_Continous/) | ![Status](https://img.shields.io/badge/status-completed-success) | ![Completion](https://img.shields.io/badge/completion-100%25-brightgreen) | ![Tech](https://img.shields.io/badge/tech-continuous%20MADDPG-blue) | [README](./MADDPG_Continous/README_EN.md) |
 | [MATD3_Continous](./MATD3_Continous/) | ![Status](https://img.shields.io/badge/status-completed-success) | ![Completion](https://img.shields.io/badge/completion-100%25-brightgreen) | ![Tech](https://img.shields.io/badge/tech-continuous%20MATD3-blue) | [README](./MATD3_Continous/readme_en.md) |
-
+| [HAPPO-MAPPO_Continous_Heterogeneous](./HAPPO-MAPPO_Continous_Heterogeneous/) | ![Status](https://img.shields.io/badge/status-completed-success) | ![Completion](https://img.shields.io/badge/completion-95%25-brightgreen) | ![Tech](https://img.shields.io/badge/tech-PPO%20Heterogeneous-blue) | [Documentation](./HAPPO-MAPPO_Continous_Heterogeneous/Readme_en.md) |
 ## Learning Path and Project Connections
 
 The projects in this repository form a complete learning path from basic reinforcement learning to multi-agent reinforcement learning:
@@ -52,8 +52,7 @@ Reproduction of Professor Shiyu Zhao's reinforcement learning course code from W
 - [Professor Zhao's Reinforcement Learning Course](https://www.bilibili.com/video/BV1sd4y167NS)
 - [Mathematical Foundation of Reinforcement Learning](https://github.com/MathFoundationRL/Book-Mathematical-Foundation-of-Reinforcement-Learning)
 
-#### Code Location
-[Professor Zhao's RL Code Repository: ./RL_Learning-main](./RL_Learning-main/scripts)
+#### Code Location [Professor Zhao's RL Code Repository: ./RL_Learning-main](./RL_Learning-main/scripts)
 
 #### Update Log
 **2024.6.7**
@@ -78,8 +77,7 @@ Reproduction and expansion of the code from the book "Hands-on Reinforcement Lea
 #### Learning Path
 This section demonstrates the learning path from basic DQN to DDPG, and then to MADDPG, laying the foundation for understanding multi-agent reinforcement learning.
 
-#### Code Location
-[./动手学强化学习](./动手学强化学习/)
+#### Code Location [./动手学强化学习](./动手学强化学习/)
 
 #### References
 - [Hands-on Reinforcement Learning](https://hrl.boyuai.com/chapter/2/dqn%E7%AE%97%E6%B3%95)
@@ -107,15 +105,14 @@ Personal implementation of the MADDPG algorithm based on the latest version of t
 <p><strong>Reward convergence curve of MADDPG algorithm in simple_tag_v3 environment</strong></p>
 </div>
 
-#### Implementation Progress
+##### Implementation Progress
 | Algorithm | Status | Location | Core Components |
 |----------------|--------|----------------------|----------------------------------|
 | MADDPG | ✅ 1.0 | `agents/maddpg/` | MADDPG_agent, DDPG_agent, buffer |
 | Independent RL | ⏳ Planned | `agents/independent/`| IndependentRL (planned) |
 | Centralized RL | ⏳ Planned | `agents/centralized/`| CentralizedRL (planned) |
 
-#### Code Location
-[./MADDPG_Continous](./MADDPG_Continous)
+##### Code Location [./MADDPG_Continous](./MADDPG_Continous)
 
 #### 3.2 MATD3_Continous: Multi-Agent Twin Delayed Deep Deterministic Policy Gradient Algorithm
 
@@ -134,16 +131,37 @@ Multi-agent extension version of the TD3 algorithm (MATD3: Twin Delayed Deep Det
 <p><strong>Reward convergence curve of MATD3 algorithm in simple_tag_v3 environment</strong></p>
 </div>
 
-#### MATD3 vs MADDPG
+##### MATD3 vs MADDPG
 MATD3 enhances standard MADDPG with these key improvements:
 
 1. **Double Q-Network Design**: Reduces overestimation of action values
 2. **Delayed Policy Updates**: Improves training stability
 3. **Target Policy Smoothing**: Prevents overfitting by adding noise to target actions
 4. **Adaptive Noise Adjustment**: Dynamically adjusts exploration noise based on training progress
 
-#### Code Location
-[./MATD3_Continous](./MATD3_Continous)
+##### Code Location [./MATD3_Continous](./MATD3_Continous)
+
+
+#### 3.3 HAPPO-MAPPO: Supporting Heterogeneous Agents in Multi-Agent Proximal Policy Optimization
+
+Implementation of two PPO-based multi-agent algorithms: MAPPO (Multi-Agent Proximal Policy Optimization) and HAPPO (Heterogeneous-Agent Proximal Policy Optimization), providing solutions for continuous action spaces and heterogeneous-agent environments.
+
+<div align="center">
+<img src="./HAPPO-MAPPO_Continous_Heterogeneous/data/happo_learning_curve_simple_tag_v3_s23.png" alt="HAPPO Algorithm Performance" width="45%"/>
+<p><strong>HAPPO Algorithm Features: Supports heterogeneous agent cooperation and competition, where each agent can have different observation dimensions</strong></p>
+</div>
+
+##### Advantages of HAPPO/MAPPO
+
+1. **No Need for Deterministic Policies**: Based on PPO with stochastic policies, reducing overfitting
+2. **Heterogeneous Agent Support**: HAPPO specifically supports heterogeneous agents with different observation dimensions and capabilities
+3. **Training Stability**: PPO's clipping mechanism provides a more stable training process
+4. **Sample Efficiency**: Improves sample utilization through multi-epoch updates
+5. **Hyperparameter Robustness**: Less sensitive to hyperparameter selection
+
+##### Code Location [`./MAPPO_Continous_Homogeneous`](./MAPPO_Continous_Homogeneous)
+##### Code Location [`./HAPPO-MAPPO_Continous_Heterogeneous`](./HAPPO-MAPPO_Continous_Heterogeneous)
+
 
 ## Ongoing Projects
 - **MARL**: Multi-agent cooperation and coordination based on deep reinforcement learning
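Both READMEs attribute the training stability to PPO's clipping mechanism, i.e. the standard clipped surrogate objective. The sketch below is illustrative only; the function name, parameter names, and the 0.2 clip coefficient are assumptions, not taken from this repository's training code.

```python
import torch

def ppo_clip_loss(log_prob_new, log_prob_old, advantage, clip_eps=0.2):
    """Standard PPO clipped surrogate loss (to be minimized).

    Illustrative only: names and the clip coefficient are assumptions,
    not values read from this repository.
    """
    ratio = torch.exp(log_prob_new - log_prob_old)           # pi_new / pi_old
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    return -torch.min(unclipped, clipped).mean()

# Toy usage: ratios pushed beyond 1 +/- eps stop contributing extra gradient,
# which is the stability property the README refers to.
lp_old = torch.zeros(4)
lp_new = torch.tensor([0.5, 0.1, -0.3, 0.0])
adv = torch.tensor([1.0, -0.5, 2.0, 0.3])
print(ppo_clip_loss(lp_new, lp_old, adv))
```

Clipping the probability ratio keeps any single update from moving the policy too far from the one that collected the data, which is also what makes the multi-epoch sample reuse mentioned in advantage 4 workable.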
