Skip to content

Commit 5d7ab5c

Browse files
committed
[Metax] adapt cutlass moe for ernie-vl
1 parent 1f3ce65 commit 5d7ab5c

File tree

6 files changed

+174
-101
lines changed

6 files changed

+174
-101
lines changed

custom_ops/metax_ops/fused_moe.cu

Lines changed: 4 additions & 0 deletions
@@ -101,6 +101,10 @@ std::vector<paddle::Tensor> FusedExpertMoe(
   const auto input_type = input.dtype();
   auto output = paddle::empty_like(input);

+  if (output.dims()[0] == 0) {
+    return {output};
+  }
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
       FusedMoeKernel<paddle::DataType::BFLOAT16,

custom_ops/metax_ops/moe_dispatch.cu

Lines changed: 8 additions & 0 deletions
@@ -178,6 +178,14 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
   auto permute_indices_per_token =
       GetEmptyTensor({moe_topk, num_rows}, paddle::DataType::INT32, place);

+  if (token_rows == 0) {
+    return {permute_input,
+            tokens_expert_prefix_sum,
+            permute_indices_per_token,
+            top_k_weight,
+            top_k_indices};
+  }
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
       MoeDispatchKernel<paddle::DataType::BFLOAT16>(input,

custom_ops/metax_ops/moe_ffn.cu

Lines changed: 4 additions & 0 deletions
@@ -114,6 +114,10 @@ std::vector<paddle::Tensor> MoeExpertFFN(
   const auto input_type = permute_input.dtype();
   auto ffn_out = paddle::empty_like(permute_input);

+  if (permute_input.numel() == 0) {
+    return {ffn_out};
+  }
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
       McMoeFFNKernel<paddle::DataType::BFLOAT16,

custom_ops/setup_ops.py

Lines changed: 2 additions & 0 deletions
@@ -612,6 +612,8 @@ def find_end_files(directory, end_str):
     "gpu_ops/text_image_gather_scatter.cu",
     "gpu_ops/text_image_index_out.cu",
     "gpu_ops/get_position_ids_and_mask_encoder_batch.cu",
+    "gpu_ops/limit_thinking_content_length_v1.cu",
+    "gpu_ops/limit_thinking_content_length_v2.cu",
     "gpu_ops/append_attn/mla_cache_kernel.cu",
     "gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu",
     "gpu_ops/moe/tritonmoe_preprocess.cu",

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 6 additions & 0 deletions
@@ -50,8 +50,12 @@
 elif current_platform.is_maca():
     from fastdeploy.model_executor.ops.gpu import (
         get_padding_offset,
+        limit_thinking_content_length_v1,
+        limit_thinking_content_length_v2,
         save_output,
         set_stop_value_multi_ends,
+        speculate_limit_thinking_content_length_v1,
+        speculate_limit_thinking_content_length_v2,
         step_paddle,
         update_inputs,
         update_inputs_v1,
@@ -770,7 +774,9 @@ def rebuild_padding(
         seq_lens_decoder,
         seq_lens_encoder,
         output_padding_offset,
+        first_token_out,
         max_input_length,
+        enable_logprob,
     )
 else:
     raise RuntimeError("Not supported platform")

0 commit comments

Comments (0)