Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1211,6 +1211,13 @@ def _set_cudagraph_sizes(
# Shape [256, 288, ... 992, 1024]
draft_capture_sizes += [32 * i for i in range(9, 33)]

# Shape [1024, 1088, ... 2048] step=64
draft_capture_sizes += [64 * i for i in range(17, 33)]
# Shape [2048, 2176, ... 4096] step=128
draft_capture_sizes += [128 * i for i in range(17, 33)]
# Shape [4096, 4352, ... 8192] step=256
draft_capture_sizes += [256 * i for i in range(17, 33)]

draft_capture_sizes_prefill = draft_capture_sizes.copy()
draft_capture_sizes.append(max_capture_size)
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
Expand Down Expand Up @@ -2223,7 +2230,13 @@ def postprocess(self):

# Adjustment GraphOptConfig
if self.scheduler_config is not None and self.scheduler_config.splitwise_role == "prefill":
self.graph_opt_config.use_cudagraph = self.graph_opt_config.cudagraph_only_prefill
# Piecewise CUDAGraph for prefill worker: if graph_opt_level >= 1 and not full_cuda_graph,
# reuse the mixed piecewise path (capture_model_prefill_and_mixed) for the prefill worker.
# Otherwise fall back to cudagraph_only_prefill flag (legacy path).
if self.graph_opt_config.graph_opt_level >= 1 and not self.graph_opt_config.full_cuda_graph:
self.graph_opt_config.use_cudagraph = True

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Bug 这里在 PD prefill piecewise 模式下无条件把 `use_cudagraph` 置为 `True`,但后面的 CPU cache 兼容性逻辑只检查 `cudagraph_only_prefill`。

当 `cache_config.num_cpu_blocks` 非零时,下面的注释已经说明 layer-by-layer swap/H2D 与 CUDA Graph prefill capture 不兼容;但本分支默认 `cudagraph_only_prefill=False`,所以会绕过第 2246-2259 行的禁用逻辑。随后 `gpu_worker.graph_optimize_and_warm_up_model()` 在 `use_piecewise and not is_pd_decode` 条件下仍会调用 `capture_model_prefill_and_mixed()`,启用本应被禁用的 prefill CUDAGraph。

建议修复方式:把 piecewise prefill 也纳入同一兼容性判断;例如在 `num_cpu_blocks` 非零且 `splitwise_role == "prefill" and graph_opt_level >= 1 and not full_cuda_graph` 时不要打开 `use_cudagraph`,或在后面的 CPU cache 兼容性块里同时将 `use_cudagraph` 复位为 `False` 并跳过 prefill capture。

else:
self.graph_opt_config.use_cudagraph = self.graph_opt_config.cudagraph_only_prefill
if self.load_config is not None and self.load_config.dynamic_load_weight is True:
self.graph_opt_config.graph_opt_level = 0
logger.info(
Expand Down
10 changes: 0 additions & 10 deletions fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,16 +263,6 @@ def _validate_split_kv_size(value: int) -> int:
"FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST": lambda: bool(
int(os.getenv("FD_SAVE_OUTPUT_CACHE_FOR_PREEMPTED_REQUEST", "1"))
),
# Whether to enable block-wise CUDA Graph capture/replay.
# When enabled, individual layer forward methods decorated with @block_wise_cuda_graph_wrap
# will be captured and replayed as CUDA Graphs for improved performance.
# Set to 1 to enable; defaults to 0 (disabled).
"FD_USE_BLOCK_WISE_CUDA_GRAPH": lambda: bool(int(os.getenv("FD_USE_BLOCK_WISE_CUDA_GRAPH", "0"))),
# Comma-separated list of token counts to pre-capture for block-wise CUDA Graphs.
# Used during the warmup phase to pre-capture graphs for these specific sizes.
# At runtime, token counts not in this list fall back to eager execution.
# Example: "1,2,4,8,16,32,64,128,256,512"
"FD_BLOCK_WISE_CUDA_GRAPH_SIZES": lambda: os.getenv("FD_BLOCK_WISE_CUDA_GRAPH_SIZES", "128,256,512,1024,2048"),
# Suspend rollouting routing replay
"FD_SUSPEND_ROUTING_REPLAY": lambda: bool(int(os.getenv("FD_SUSPEND_ROUTING_REPLAY", "0"))),
# train-infer consistency, used in RL
Expand Down
320 changes: 0 additions & 320 deletions fastdeploy/model_executor/graph_optimization/cuda_graph_op.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -193,19 +193,14 @@ def run_static_model(self, entry: ConcreteSizeEntry, is_decode: bool = False, **

def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
# Get real shape (total num tokens)
if (
self.speculative_decoding
and self.real_bsz_to_captured_size
and all(self.real_bsz_to_captured_size.values())
):
seq_lens_this_time: paddle.Tensor = kwargs["forward_meta"].seq_lens_this_time
real_bsz = kwargs["forward_meta"].real_bsz
num_running_requests = real_bsz if real_bsz > 0 else int((seq_lens_this_time.flatten() > 0).sum().item())
num_running_requests = max(1, num_running_requests)
real_shape = self.real_bsz_to_captured_size[num_running_requests]
else:
ids_remove_padding: paddle.Tensor = kwargs["forward_meta"].ids_remove_padding
real_shape = ids_remove_padding.shape[0]
# For both MTP speculative decoding and regular decode, use ids_remove_padding.shape[0]
# directly as the real_shape key into real_shape_to_captured_size.
# In MTP, cudagraph_capture_sizes are already scaled by (num_speculative_tokens+1),
# so ids_remove_padding.shape[0] == capture_size and the lookup is correct.
# This avoids using real_bsz (a concrete Python int) which causes SOT to specialize
# on each distinct batch size, generating multiple code objects and breaking warmup_impl.
ids_remove_padding: paddle.Tensor = kwargs["forward_meta"].ids_remove_padding
real_shape = ids_remove_padding.shape[0]
exist_prefill = kwargs["forward_meta"].exist_prefill
# Static split graph mode: use Static + CUDAGraph for prefill/mixed phase
static_cudagraph_for_prefill = exist_prefill and not self.full_cuda_graph and self.dy2st
Expand Down
Loading