diff --git a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
index 058bf02d45..7756d1a26c 100644
--- a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
+++ b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
@@ -77,22 +77,8 @@ def extract_reasoning_content_streaming(
                 # reasoning content continues
                 return DeltaMessage(reasoning_content=delta_text)
         else:
-            # No <think> in previous or delta, also need to check for </think>.
-            # Because the model may have generated </think> without <think>
-            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
-            if self.think_end_token in delta_text:
-                # </think> in delta with more tokens,
-                # extract reasoning content and content
-                end_index = delta_text.find(self.think_end_token)
-                reasoning_content = delta_text[:end_index]
-                content = delta_text[end_index + len(self.think_end_token):]
-                return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
-            elif self.think_end_token in previous_text:
-                # </think> in previous, thinking content ends
-                return DeltaMessage(content=delta_text)
-            else:
-                # no </think> in previous or delta, reasoning content continues
-                return DeltaMessage(reasoning_content=delta_text)
+            # no <think> in previous or delta, all content
+            return DeltaMessage(content=delta_text)
 
     def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
                                   **kwargs) -> Tuple[Optional[str], Optional[str]]:
@@ -109,26 +95,35 @@ def extract_reasoning_content(self, model_output: str, request: ChatCompletionRe
             reasoning_content (str | None): The reasoning content.
             final_output (str | None): The content.
         """
-        # DeepSeek R1 doesn't generate <think> now.
+        start_index = model_output.find(self.think_start_token)
+        end_index = model_output.find(self.think_end_token)
         # Thus we assume the reasoning content is always at the start.
-        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
-        if self.think_end_token not in model_output:
+        if end_index < 0:
             # for qwen3 model, the reasoning content is wrapped by <think></think> xml tags
-            return None, model_output
-        # Add a start token if it's missing to keep compatibility.
-        if self.think_start_token not in model_output:
-            model_output = f'{self.think_start_token}{model_output}'
-        # Use a regex to find the reasoning content
-        reasoning_content = self.reasoning_regex.findall(model_output)[0]
-
-        end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
-        final_output = model_output[end_index:]
-        if reasoning_content.startswith('\n'):
-            reasoning_content = reasoning_content[1:]
-        if reasoning_content.endswith('\n'):
-            reasoning_content = reasoning_content[:-1]
+            if start_index < 0:
+                return None, model_output
+            reasoning_content = model_output[start_index + len(self.think_start_token):]
+            reasoning_content = self._trim_newlines(reasoning_content)
+            return reasoning_content, None
+
+        if start_index >= 0 and start_index < end_index:
+            reasoning_content = model_output[start_index + len(self.think_start_token):end_index]
+        else:
+            reasoning_content = model_output[:end_index]
+        reasoning_content = self._trim_newlines(reasoning_content)
+
+        final_output = model_output[end_index + len(self.think_end_token):]
+        final_output = self._trim_newlines(final_output)
         if len(final_output) == 0:
             return reasoning_content, None
-
         return reasoning_content, final_output
+
+    @classmethod
+    def _trim_newlines(cls, text: str):
+        """Trim newlines from the start and end of a string."""
+        while text.startswith('\n'):
+            text = text[1:]
+        while text.endswith('\n'):
+            text = text[:-1]
+        return text
 
diff --git a/tests/test_lmdeploy/test_qwen3_parser.py b/tests/test_lmdeploy/test_qwen3_parser.py
index 3a837d73a3..16518ac960 100644
--- a/tests/test_lmdeploy/test_qwen3_parser.py
+++ b/tests/test_lmdeploy/test_qwen3_parser.py
@@ -358,3 +358,48 @@ def test_no_think_nonstream():
     first_message = resp.choices[0].message
     assert first_message.content == '你好呀!✨ 很高兴见到你!'
     assert first_message.reasoning_content is None
+
+
+THINK_START_SEQUENCE = ['<think>', '\n']
+TRUNCATED_SEQUENCE = ['OK', ', ', 'user', ' ', 'sends']
+
+
+@pytest.mark.parametrize(
+    'sequence, expected_content, expected_reasoning_content',
+    [
+        # without think start token
+        (TRUNCATED_SEQUENCE, ''.join(TRUNCATED_SEQUENCE), None),
+        # with think start token
+        (THINK_START_SEQUENCE + TRUNCATED_SEQUENCE, None, ''.join(TRUNCATED_SEQUENCE)),
+    ])
+def test_truncated_think_nonstream(sequence, expected_content, expected_reasoning_content):
+
+    tokenizer = DummyTokenizer()
+    VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer)
+    VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer)
+    req = ChatCompletionRequest(model='qwen', messages=[], stream=False)
+    resp: ChatCompletionResponse = _chat_completion_v1(req, sequence)
+
+    assert len(resp.choices) == 1
+    first_message = resp.choices[0].message
+    assert first_message.content == expected_content
+    assert first_message.reasoning_content == expected_reasoning_content
+
+
+@pytest.mark.parametrize(
+    'sequence, expected_content, expected_reasoning_content',
+    [
+        # without think start token
+        (TRUNCATED_SEQUENCE, ''.join(TRUNCATED_SEQUENCE), ''),
+        # with think start token
+        (THINK_START_SEQUENCE + TRUNCATED_SEQUENCE, '', ''.join(TRUNCATED_SEQUENCE)),
+    ])
+def test_truncated_think_stream(sequence, expected_content, expected_reasoning_content):
+    tokenizer = DummyTokenizer()
+    VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer)
+    VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer)
+    req = ChatCompletionRequest(model='qwen', messages=[], stream=True)
+    content, reasoning_content, tool_calls = _stream_parse(req, sequence)
+
+    assert content == expected_content
+    assert reasoning_content.lstrip() == expected_reasoning_content
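
Reviewer sketch (not part of the patch): a minimal standalone rendering of the precedence rules the new extract_reasoning_content implements. It assumes '<think>'/'</think>' as the values of think_start_token/think_end_token (consistent with THINK_START_SEQUENCE above); the extract and trim_newlines helpers are hypothetical stand-ins for the class method and _trim_newlines, written only to illustrate the diff's logic.

from typing import Optional, Tuple

THINK_START = '<think>'  # assumed value of self.think_start_token
THINK_END = '</think>'   # assumed value of self.think_end_token


def trim_newlines(text: str) -> str:
    # Equivalent to the patch's _trim_newlines: drop leading/trailing '\n' only.
    return text.strip('\n')


def extract(model_output: str) -> Tuple[Optional[str], Optional[str]]:
    # Returns (reasoning_content, content) following the patch's precedence.
    start = model_output.find(THINK_START)
    end = model_output.find(THINK_END)
    if end < 0:
        # No </think>: with <think>, the tail is truncated reasoning;
        # without it, the whole output is plain content.
        if start < 0:
            return None, model_output
        return trim_newlines(model_output[start + len(THINK_START):]), None
    # </think> present: reasoning sits between the tags, or runs from the
    # beginning of the output when <think> is absent (or only after </think>).
    if 0 <= start < end:
        reasoning = model_output[start + len(THINK_START):end]
    else:
        reasoning = model_output[:end]
    content = trim_newlines(model_output[end + len(THINK_END):])
    return trim_newlines(reasoning), content or None


# The truncated-think cases from the new tests, plus a complete block:
assert extract('OK, user sends') == (None, 'OK, user sends')
assert extract('<think>\nOK, user sends') == ('OK, user sends', None)
assert extract('<think>plan</think>\nanswer') == ('plan', 'answer')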