PaddlePaddle · ZhangX-21 · Jun 2, 2026 · Jun 9, 2026 · Jun 23, 2026 · PaddlePaddle-bot
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
@@ -1211,6 +1211,13 @@ def _set_cudagraph_sizes(
         # Shape [256, 288, ... 992, 1024]
         draft_capture_sizes += [32 * i for i in range(9, 33)]
 
+        # Shape [1088, 1152, ... 2048] step=64 (1024 is already covered by the step=32 range)
+        draft_capture_sizes += [64 * i for i in range(17, 33)]
+        # Shape [2176, 2304, ... 4096] step=128 (2048 is already covered by the step=64 range)
+        draft_capture_sizes += [128 * i for i in range(17, 33)]
+        # Shape [4352, 4608, ... 8192] step=256 (4096 is already covered by the step=128 range)
+        draft_capture_sizes += [256 * i for i in range(17, 33)]
+
         draft_capture_sizes_prefill = draft_capture_sizes.copy()
         draft_capture_sizes.append(max_capture_size)
         self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
@@ -2223,7 +2230,13 @@ def postprocess(self):
 
         # Adjustment GraphOptConfig
         if self.scheduler_config is not None and self.scheduler_config.splitwise_role == "prefill":
-            self.graph_opt_config.use_cudagraph = self.graph_opt_config.cudagraph_only_prefill
+            # Piecewise CUDAGraph for prefill worker: if graph_opt_level >= 1 and not full_cuda_graph,
+            # reuse the mixed piecewise path (capture_model_prefill_and_mixed) for the prefill worker.
+            # Otherwise fall back to cudagraph_only_prefill flag (legacy path).
+            if self.graph_opt_config.graph_opt_level >= 1 and not self.graph_opt_config.full_cuda_graph:
+                self.graph_opt_config.use_cudagraph = True
+            else:
+                self.graph_opt_config.use_cudagraph = self.graph_opt_config.cudagraph_only_prefill
         if self.load_config is not None and self.load_config.dynamic_load_weight is True:
             self.graph_opt_config.graph_opt_level = 0
             logger.info(

diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
@@ -263,16 +263,6 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST": lambda: bool(
         int(os.getenv("FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST", "1"))
     ),
-    # Whether to enable block-wise CUDA Graph capture/replay.
-    # When enabled, individual layer forward methods decorated with @block_wise_cuda_graph_wrap
-    # will be captured and replayed as CUDA Graphs for improved performance.
-    # Set to 1 to enable; defaults to 0 (disabled).
-    "FD_USE_BLOCK_WISE_CUDA_GRAPH": lambda: bool(int(os.getenv("FD_USE_BLOCK_WISE_CUDA_GRAPH", "0"))),
-    # Comma-separated list of token counts to pre-capture for block-wise CUDA Graphs.
-    # Used during the warmup phase to pre-capture graphs for these specific sizes.
-    # At runtime, token counts not in this list fall back to eager execution.
-    # Example: "1,2,4,8,16,32,64,128,256,512"
-    "FD_BLOCK_WISE_CUDA_GRAPH_SIZES": lambda: os.getenv("FD_BLOCK_WISE_CUDA_GRAPH_SIZES", "128,256,512,1024,2048"),
     # Suspend rollouting routing replay
     "FD_SUSPEND_ROUTING_REPLAY": lambda: bool(int(os.getenv("FD_SUSPEND_ROUTING_REPLAY", "0"))),
     # train-infer consistency, used in RL

diff --git a/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py b/fastdeploy/model_executor/graph_optimization/cuda_graph_op.py
diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -193,19 +193,14 @@ def run_static_model(self, entry: ConcreteSizeEntry, is_decode: bool = False, **
 
     def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
         # Get real shape (total num tokens)
-        if (
-            self.speculative_decoding
-            and self.real_bsz_to_captured_size
-            and all(self.real_bsz_to_captured_size.values())
-        ):
-            seq_lens_this_time: paddle.Tensor = kwargs["forward_meta"].seq_lens_this_time
-            real_bsz = kwargs["forward_meta"].real_bsz
-            num_running_requests = real_bsz if real_bsz > 0 else int((seq_lens_this_time.flatten() > 0).sum().item())
-            num_running_requests = max(1, num_running_requests)
-            real_shape = self.real_bsz_to_captured_size[num_running_requests]
-        else:
-            ids_remove_padding: paddle.Tensor = kwargs["forward_meta"].ids_remove_padding
-            real_shape = ids_remove_padding.shape[0]
+        # For both MTP speculative decoding and regular decode, use ids_remove_padding.shape[0]
+        # directly as the real_shape key into real_shape_to_captured_size.
+        # In MTP, cudagraph_capture_sizes are already scaled by (num_speculative_tokens+1),
+        # so ids_remove_padding.shape[0] == capture_size and the lookup is correct.
+        # This avoids using real_bsz (a concrete Python int) which causes SOT to specialize
+        # on each distinct batch size, generating multiple code objects and breaking warmup_impl.
+        ids_remove_padding: paddle.Tensor = kwargs["forward_meta"].ids_remove_padding
+        real_shape = ids_remove_padding.shape[0]
         exist_prefill = kwargs["forward_meta"].exist_prefill
         # Static split graph mode: use Static + CUDAGraph for prefill/mixed phase
         static_cudagraph_for_prefill = exist_prefill and not self.full_cuda_graph and self.dy2st