27 commits
7716866
support qwen3-embedding
lizexu123 Sep 22, 2025
5fde033
merge develop
lizexu123 Oct 10, 2025
001f23d
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
lizexu123 Oct 10, 2025
73141d4
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
lizexu123 Oct 24, 2025
6a2ddaf
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
lizexu123 Oct 24, 2025
85d14ba
support qwen3-embedding-0.6b
lizexu123 Oct 24, 2025
8200040
fix
lizexu123 Oct 24, 2025
5832cc4
update
lizexu123 Oct 27, 2025
58616e4
fix bug
lizexu123 Oct 27, 2025
ad2f7b6
fix test_return_token_ids.py and update enable_thinking
lizexu123 Oct 27, 2025
aeddcac
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
lizexu123 Oct 27, 2025
955fac1
fix mtp dummy_run
lizexu123 Oct 27, 2025
21c20a7
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
lizexu123 Oct 28, 2025
0206d42
merge develop
lizexu123 Oct 28, 2025
30795d2
fix np.float32
lizexu123 Oct 28, 2025
6bc1ed2
delete FD_DISABLE_CHUNKED_PREFILL and FD_USE_GET_SAVE_OUTPUT_V1
lizexu123 Oct 28, 2025
f439ca2
delete and build_stream_transfer_data
lizexu123 Oct 28, 2025
27d686b
fix test_update_v1:
lizexu123 Oct 28, 2025
a6a9483
update develop
lizexu123 Oct 28, 2025
15a0df8
Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy …
lizexu123 Oct 29, 2025
57e76be
fix
lizexu123 Oct 29, 2025
eae6db6
fix
lizexu123 Oct 29, 2025
90d5ee1
update dummy_run post_process
lizexu123 Oct 29, 2025
1a35691
delete test_update_v1
lizexu123 Oct 29, 2025
2fa8733
fix
lizexu123 Oct 29, 2025
e95b3f9
fix model_path
lizexu123 Oct 30, 2025
35cff6e
support Qwen2.5-Math-PRM-7B
lizexu123 Oct 30, 2025
59 changes: 59 additions & 0 deletions fastdeploy/config.py
@@ -168,6 +168,38 @@ def is_ernie_arch(cls, architecture):
"moe_layer_end_index": None,
}

_STR_DTYPE_TO_PADDLE_DTYPE = {
"half": paddle.float16,
"float16": paddle.float16,
"float": paddle.float32,
"float32": paddle.float32,
"bfloat16": paddle.bfloat16,
}


def _get_head_dtype(
config: PretrainedConfig,
dtype: str,
runner_type: str,
) -> paddle.dtype:
head_dtype: str | paddle.dtype | None = getattr(config, "head_dtype", None)

if head_dtype == "model":
return dtype
elif isinstance(head_dtype, str):
head_dtype = head_dtype.lower()
if head_dtype not in _STR_DTYPE_TO_PADDLE_DTYPE:
raise ValueError(f"Unknown dtype: {head_dtype!r}")
return _STR_DTYPE_TO_PADDLE_DTYPE[head_dtype]
elif isinstance(head_dtype, paddle.dtype):
return head_dtype
elif head_dtype is None:
if runner_type == "pooling":
return paddle.float32
return dtype
else:
raise ValueError(f"Unknown dtype: {head_dtype!r}")


class ModelConfig:
"""
@@ -207,6 +239,7 @@ def __init__(
assert self.model != ""
pretrained_config, _ = PretrainedConfig.get_config_dict(self.model)
self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)
print("self.pretrained_config", self.pretrained_config)

# set attribute from pretrained_config
for key, value in pretrained_config.items():
@@ -242,6 +275,9 @@ def _post_init(self):

self.enable_mm = is_multimodal_model

if self.runner_type == "pooling":
os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"

if self.runner_type == "generate" and not is_generative_model:
if is_multimodal_model:
pass
@@ -509,6 +545,29 @@ def print(self):
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")

@property
def head_dtype(self) -> paddle.dtype:
"""
"head" refers to the last Linear layer(s) of an LLM,
such as the lm_head in a generation model,
or the score or classifier in a classification model.

`head_dtype` currently only supports pooling models.\n
- The pooling model defaults to using fp32 head,
you can use --hf-overrides '{"head_dtype": "model"}' to disable it.
"""
print("self.dtype", self.dtype)
head_dtype = _get_head_dtype(config=self.pretrained_config, dtype=self.dtype, runner_type=self.runner_type)
if self.runner_type != "pooling" and head_dtype != self.dtype:
logger.warning(
"`head_dtype` currently only supports pooling models." "fallback to model dtype [%s].",
self.dtype,
)
return self.dtype

logger.info("head dtype: %s", head_dtype)
return head_dtype


class ParallelConfig:
"""Configuration for the distributed execution."""
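To make the head-dtype resolution above concrete, here is a minimal sketch of how the new `_get_head_dtype` helper behaves for a pooling model. This is illustrative only and not part of the diff; the `_Cfg` stand-in class is a hypothetical substitute for `PretrainedConfig`.

# Minimal sketch (assumption: illustration only, not part of this PR).
import paddle

class _Cfg:  # hypothetical stand-in for PretrainedConfig
    def __init__(self, head_dtype=None):
        if head_dtype is not None:
            self.head_dtype = head_dtype

# No head_dtype set on a pooling model -> fp32 head by default.
assert _get_head_dtype(_Cfg(), paddle.bfloat16, "pooling") == paddle.float32
# Opt out via --hf-overrides '{"head_dtype": "model"}' -> follow the model dtype.
assert _get_head_dtype(_Cfg("model"), paddle.bfloat16, "pooling") == paddle.bfloat16
# Explicit string dtype -> mapped through _STR_DTYPE_TO_PADDLE_DTYPE.
assert _get_head_dtype(_Cfg("float16"), paddle.bfloat16, "pooling") == paddle.float16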
1 change: 1 addition & 0 deletions fastdeploy/engine/common_engine.py
@@ -737,6 +737,7 @@ def _fetch_request():
raise
# 2. Schedule requests
tasks = self.resource_manager.schedule()

# 3. Send to engine
if tasks:
if self.cfg.scheduler_config.splitwise_role == "decode":
43 changes: 19 additions & 24 deletions fastdeploy/model_executor/layers/pooler.py
@@ -79,7 +79,6 @@ def get_pooling_params(pooling_metadata: PoolingMetadata) -> list[PoolingParams]

def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]:
pooling_params = get_pooling_params(pooling_metadata)

tasks: list[PoolingTask] = [task for pooling_param in pooling_params if (task := pooling_param.task) is not None]
assert len(pooling_params) == len(tasks)

@@ -109,7 +108,7 @@ class Pooler(nn.Layer, ABC):
@staticmethod
def for_encode(pooler_config: PoolerConfig, model_config: Optional["ModelConfig"] = None):
if pooler_config.pooling_type == "STEP":
return StepPooler()
return StepPooler(model_config)

resolved_config = ResolvedPoolingConfig(task="encode", pooling_type=PoolingType.ALL)
return SimplePooler.from_config(resolved_config, model_config)
@@ -290,11 +289,19 @@ class RewardPoolerHead(PoolerHead):
def __init__(self, model_config: Optional["ModelConfig"] = None) -> None:
super().__init__(activation=PoolerClassify(static_num_labels=False))
self.model_config = model_config
self.head_dtype = model_config.head_dtype

def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], pooling_metadata: PoolingMetadata):
pooling_params = get_pooling_params(pooling_metadata)
def forward(
self,
pooled_data: list[paddle.Tensor] | paddle.Tensor,
pooling_metadata: PoolingMetadata,
):
if isinstance(pooled_data, list):
pooled_data = [p.to(self.head_dtype) for p in pooled_data]
else:
pooled_data = pooled_data.to(self.head_dtype)

# for softmax
pooling_params = get_pooling_params(pooling_metadata)
flags = [p.softmax for p in pooling_params]
if len(set(flags)) == 1:
if flags[0]:
@@ -305,19 +312,6 @@ def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], poolin
return pooled_data


def build_output(
all_data: Union[paddle.Tensor, list[paddle.Tensor]],
) -> PoolerOutput:
# Pooling models D2H & synchronize occurs here
if isinstance(all_data, list):
all_data = [d.cpu() for d in all_data]
else:
all_data = all_data.cpu()

all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
return PoolerOutput(outputs=all_outputs)


class PoolingMethod(nn.Layer, ABC):

@staticmethod
@@ -380,8 +374,8 @@ def forward_all(
) -> Union[list[paddle.Tensor], paddle.Tensor]:

assert not pooling_cursor.is_partial_prefill(), "partial prefill not supported with ALL pooling"

hidden_states_lst = list(hidden_states.split(pooling_cursor.num_scheduled_tokens_cpu.tolist()))

return [hidden_states_lst[i] for i in pooling_cursor.index]


@@ -430,11 +424,12 @@ def forward_all(
class StepPooler(Pooler):
def __init__(
self,
model_config: ModelConfig,
) -> None:
super().__init__()

self.pooling = AllPool()
self.head = RewardPoolerHead()
self.head = RewardPoolerHead(model_config)

def extract_states(
self,
@@ -469,12 +464,12 @@ def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:

def forward(
self,
hidden_states: Union[paddle.Tensor, list[paddle.Tensor]],
hidden_states: paddle.Tensor | list[paddle.Tensor],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
pooled_data = self.extract_states(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
return pooled_data


class SimplePooler(Pooler):
@@ -520,7 +515,7 @@ def forward(
) -> PoolerOutput:
pooled_data = self.pooling(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
return pooled_data


class PoolerNormalize(PoolerActivation):
@@ -567,7 +562,7 @@ def forward(
hidden_states,
pooling_metadata[offset : offset + num_items],
)
outputs.extend(group_output.outputs)
outputs.extend(group_output)
offset += num_items

return PoolerOutput(outputs)
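For context, the RewardPoolerHead change above now casts pooled hidden states to `head_dtype` before optionally applying softmax. A tiny hypothetical illustration of that path follows; the tensor shape (4 reasoning steps, 2 labels) and values are assumptions for demonstration, not taken from the PR.

# Hypothetical illustration of the cast-then-softmax path in RewardPoolerHead.forward.
import paddle
import paddle.nn.functional as F

head_dtype = paddle.float32                      # pooling-model default (see config.py above)
pooled = paddle.randn([4, 2]).astype("float16")  # pooled per-step states in the model dtype
pooled = pooled.to(head_dtype)                   # cast added in this PR
probs = F.softmax(pooled, axis=-1)               # per-step label probabilities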
8 changes: 4 additions & 4 deletions fastdeploy/model_executor/models/qwen2_rm.py
@@ -44,7 +44,7 @@ class Qwen2RewardBaseModel(nn.Layer):
def __init__(self, fd_config: FDConfig):
super().__init__()
self.model = Qwen2Model(fd_config=fd_config)
self.head_dtype = paddle.float32
self.head_dtype = paddle.bfloat16

self.score = nn.Sequential(
ColumnParallelLinear(
@@ -80,8 +80,8 @@ def forward(
@ModelRegistry.register_model_class(
architecture="Qwen2ForProcessRewardModel",
module_name="qwen2_rm",
category=[ModelCategory.REWARD],
primary_use=ModelCategory.REWARD,
category=ModelCategory.EMBEDDING,
primary_use=ModelCategory.EMBEDDING,
)
@default_pooling_type("STEP")
class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
@@ -94,7 +94,7 @@ def __init__(self, fd_config: FDConfig):
pooler_config = fd_config.model_config.pooler_config
assert pooler_config is not None

self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)})
self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config, model_config=fd_config.model_config)})

self.process_weights_before_loading_fn = process_weights_before_loading(skip_prefixes=["lm_head"])
