InternLM
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
Lines changed: 1 addition & 0 deletions
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
Lines changed: 1 addition & 0 deletions
diff --git a/lmdeploy/turbomind/deploy/converter.py b/lmdeploy/turbomind/deploy/converter.py
Lines changed: 3 additions & 0 deletions
diff --git a/lmdeploy/turbomind/deploy/target_model/base.py b/lmdeploy/turbomind/deploy/target_model/base.py
Lines changed: 9 additions & 2 deletions
diff --git a/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h b/src/turbomind/comm/cuda_ipc/cuda_ipc_comm.h
Lines changed: 1 addition & 0 deletions
diff --git a/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu b/src/turbomind/comm/cuda_ipc/fused_allreduce_ex.cu
Lines changed: 2 additions & 0 deletions
diff --git a/src/turbomind/comm/device_comm.h b/src/turbomind/comm/device_comm.h
Lines changed: 1 addition & 0 deletions
diff --git a/src/turbomind/comm/nccl/nccl.cu b/src/turbomind/comm/nccl/nccl.cu
Lines changed: 15 additions & 4 deletions
diff --git a/src/turbomind/comm/test_comm.cu b/src/turbomind/comm/test_comm.cu
Lines changed: 1 addition & 0 deletions
diff --git a/src/turbomind/kernels/attention/attention_params.h b/src/turbomind/kernels/attention/attention_params.h
Lines changed: 7 additions & 0 deletions
@@ -233,6 +233,7 @@ class TurbomindEngineConfig:
     dp: int = 1
     device_num: int = None
     attn_tp_size: int = None
+    attn_cp_size: int = None
     attn_dp_size: int = None
     mlp_tp_size: int = None
     mlp_dp_size: int = None
 
@@ -67,6 +67,7 @@ class ModelConfig:
     weight_type: str = None
     session_len: int = None
     attn_tp_size: int = 1
+    attn_cp_size: int = 1
     mlp_tp_size: int = 1
     model_format: str = 'hf'
     expert_num: List[int] = ()
 
@@ -173,4 +173,7 @@ def get_tm_model(model_path,
                                                         model_cls=Transformer,
                                                         out_dir=out_dir)
 
+    engine_config.attn_tp_size = output_model.tm_config.model_config.attn_tp_size
+    engine_config.attn_cp_size = output_model.tm_config.model_config.attn_cp_size
+
     return output_model
@@ -52,6 +52,7 @@ def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, model
         self.attention_config = cfg.attention_config
         self.lora_config = cfg.lora_config
         self.attn_tp_size = self.model_config.attn_tp_size
+        self.attn_cp_size = self.model_config.attn_cp_size
         self.mlp_tp_size = self.model_config.mlp_tp_size
         self.out_dir = out_dir
         self.to_file = True if out_dir else False
@@ -74,8 +75,14 @@ def __init__(self, input_model: BaseInputModel, cfg: TurbomindModelConfig, model
         self.repeat_kv = 0
         if (self.attn_tp_size > self.model_config.kv_head_num
                 and self.attn_tp_size % self.model_config.kv_head_num == 0):
-            self.repeat_kv = (self.attn_tp_size // self.model_config.kv_head_num)
-            self.model_config.kv_head_num = self.attn_tp_size
+            self.attn_cp_size = self.attn_tp_size // self.model_config.kv_head_num
+            self.attn_tp_size //= self.attn_cp_size
+            self.model_config.attn_tp_size = self.attn_tp_size
+            self.model_config.attn_cp_size = self.attn_cp_size
+        # if (self.attn_tp_size > self.model_config.kv_head_num
+        #         and self.attn_tp_size % self.model_config.kv_head_num == 0):
+        #     self.repeat_kv = (self.attn_tp_size // self.model_config.kv_head_num)
+        #     self.model_config.kv_head_num = self.attn_tp_size
 
         self.model_config.verify()
         assert self.model_config.kv_head_num % self.attn_tp_size == 0
 
@@ -77,6 +77,7 @@ class CudaIpcCommImpl: public DeviceCommImpl {
                                         DataType     type,
                                         int          group0,
                                         int          group1,
+                                        int          cp_size,
                                         const int*   local_token_nums,
                                         cudaStream_t stream) override;
 
 
@@ -189,9 +189,11 @@ void CudaIpcCommImpl::AllreduceResidualBiasRMSnormEx(void*        hidden,
                                                      DataType     dtype,
                                                      int          group0,
                                                      int          group1,
+                                                     int          cp_size,
                                                      const int*   local_token_nums,
                                                      cudaStream_t stream)
 {
+    FT_CHECK(cp_size == 1);
     FT_CHECK(group0 * group1 == 0);
 
     const auto& g0 = groups_.at(group0);
 
@@ -87,6 +87,7 @@ class DeviceCommImpl {
                                                 DataType     type,
                                                 int          group0,
                                                 int          group1,
+                                                int          cp_size,
                                                 const int*   local_token_nums,
                                                 cudaStream_t stream)
     {
 
@@ -237,6 +237,7 @@ public:
                                         DataType     type,
                                         int          group0,
                                         int          group1,
+                                        int          cp_size,
                                         const int*   local_token_nums,
                                         cudaStream_t stream) override
     {
@@ -252,9 +253,8 @@ public:
         NCCLCHECK(ncclCommCount(comm0, &tp0));
         NCCLCHECK(ncclCommCount(comm1, &tp1));
 
-        const int inner_tp = std::min(tp0, tp1);
-
-        FT_CHECK(tp0 % inner_tp == 0 && tp1 % inner_tp == 0);
+        FT_CHECK(std::max(tp0, tp1) % std::min(tp0, tp1) == 0);
+        const int inner_tp = std::min(tp0, tp1) * cp_size;
 
         std::vector<std::tuple<int, int, int>> tasks;
         tasks.reserve(global_n_ranks_);
@@ -289,7 +289,18 @@ public:
             sync_check_cuda_error();
         }
 
-        if (tp1 > 1) {
+        if (cp_size > 1 && tp0 > tp1) {
+            NCCLCHECK(ncclGroupStart());
+            for (int i = 0; i < global_n_ranks_; ++i) {
+                if (auto& [offset, first, num] = tasks[i]; num > 0) {
+                    char* buff = (char*)hidden + elem_size * (offset + first) * dim;
+                    NCCLCHECK(ncclBroadcast(buff, buff, (size_t)num * dim, nccl_type, i % tp0, comm0, stream));
+                }
+            }
+            NCCLCHECK(ncclGroupEnd());
+            sync_check_cuda_error();
+        }
+        else if (tp1 > 1) {
             NCCLCHECK(ncclGroupStart());
             for (int i = 0; i < global_n_ranks_; ++i) {
                 if (auto& [offset, first, num] = tasks[i]; num > 0) {
 
@@ -796,6 +796,7 @@ struct TestComm {
                                                                dtype,
                                                                group0,
                                                                group1,
+                                                               1,
                                                                local_token_nums.data(),
                                                                stream);
                     });
 
@@ -75,6 +75,13 @@ struct AttentionParams {
     float* partial_L;
     int*   locks;
 
+    // cp
+    int    cp_rank{0};
+    int    cp_size{1};
+    float* cp_O{nullptr};
+    float* cp_M{nullptr};
+    float* cp_L{nullptr};
+
     int          arch;
     cudaStream_t stream;
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -87,6 +87,7 @@ class DeviceCommImpl {` |
| `87` | `87` | `DataType type,` |
| `88` | `88` | `int group0,` |
| `89` | `89` | `int group1,` |
|  | `90` | `+ int cp_size,` |
| `90` | `91` | `const int* local_token_nums,` |
| `91` | `92` | `cudaStream_t stream)` |
| `92` | `93` | `{` |