diff --git a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
index 058bf02d45..7756d1a26c 100644
--- a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
+++ b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
@@ -77,22 +77,8 @@ def extract_reasoning_content_streaming(
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
else:
-            # No <think> in previous or delta, also need to check for </think>.
-            # Because the model may have generated </think> without <think>
- # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
- if self.think_end_token in delta_text:
-                # </think> in delta with more tokens,
- # extract reasoning content and content
- end_index = delta_text.find(self.think_end_token)
- reasoning_content = delta_text[:end_index]
- content = delta_text[end_index + len(self.think_end_token):]
- return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
- elif self.think_end_token in previous_text:
-                # </think> in previous, thinking content ends
- return DeltaMessage(content=delta_text)
- else:
-                # no </think> in previous or delta, reasoning content continues
- return DeltaMessage(reasoning_content=delta_text)
+            # no <think> in previous or delta, all content
+ return DeltaMessage(content=delta_text)
def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
**kwargs) -> Tuple[Optional[str], Optional[str]]:
@@ -109,26 +95,35 @@ def extract_reasoning_content(self, model_output: str, request: ChatCompletionRe
reasoning_content (str | None): The reasoning content.
final_output (str | None): The content.
"""
-        # DeepSeek R1 doesn't generate <think> now.
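+        # Locate both tokens up front; str.find() returns -1 when a token is absent.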
+ start_index = model_output.find(self.think_start_token)
+ end_index = model_output.find(self.think_end_token)
# Thus we assume the reasoning content is always at the start.
- # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
- if self.think_end_token not in model_output:
+ if end_index < 0:
            # for qwen3 model, the reasoning content is wrapped by <think> </think> xml tags
- return None, model_output
- # Add a start token if it's missing to keep compatibility.
- if self.think_start_token not in model_output:
- model_output = f'{self.think_start_token}{model_output}'
- # Use a regex to find the reasoning content
- reasoning_content = self.reasoning_regex.findall(model_output)[0]
-
- end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
- final_output = model_output[end_index:]
- if reasoning_content.startswith('\n'):
- reasoning_content = reasoning_content[1:]
- if reasoning_content.endswith('\n'):
- reasoning_content = reasoning_content[:-1]
+ if start_index < 0:
+ return None, model_output
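+            # <think> was emitted but never closed (e.g. generation was truncated),
+            # so everything after it is reasoning.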
+ reasoning_content = model_output[start_index + len(self.think_start_token):]
+ reasoning_content = self._trim_newlines(reasoning_content)
+ return reasoning_content, None
+
+ if start_index >= 0 and start_index < end_index:
+ reasoning_content = model_output[start_index + len(self.think_start_token):end_index]
+ else:
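+            # </think> without a preceding <think> (DeepSeek-R1 style output),
+            # so everything before </think> is reasoning.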
+ reasoning_content = model_output[:end_index]
+ reasoning_content = self._trim_newlines(reasoning_content)
+
+ final_output = model_output[end_index + len(self.think_end_token):]
+ final_output = self._trim_newlines(final_output)
if len(final_output) == 0:
return reasoning_content, None
-
return reasoning_content, final_output
+
+ @classmethod
+ def _trim_newlines(cls, text: str):
+ """Trim newlines from the start and end of a string."""
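+        # Behaviorally equivalent to text.strip('\n'); spelled out for readability.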
+ while text.startswith('\n'):
+ text = text[1:]
+ while text.endswith('\n'):
+ text = text[:-1]
+ return text
diff --git a/tests/test_lmdeploy/test_qwen3_parser.py b/tests/test_lmdeploy/test_qwen3_parser.py
index 3a837d73a3..16518ac960 100644
--- a/tests/test_lmdeploy/test_qwen3_parser.py
+++ b/tests/test_lmdeploy/test_qwen3_parser.py
@@ -358,3 +358,48 @@ def test_no_think_nonstream():
first_message = resp.choices[0].message
assert first_message.content == '你好呀!✨ 很高兴见到你!'
assert first_message.reasoning_content is None
+
+
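+# THINK_START_SEQUENCE mimics a thinking preamble; TRUNCATED_SEQUENCE never emits
+# </think>, as if generation stopped mid-thought.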
+THINK_START_SEQUENCE = ['<think>', '\n']
+TRUNCATED_SEQUENCE = ['OK', ', ', 'user', ' ', 'sends']
+
+
+@pytest.mark.parametrize(
+ 'sequence, expected_content, expected_reasoning_content',
+ [
+ # without think start token
+ (TRUNCATED_SEQUENCE, ''.join(TRUNCATED_SEQUENCE), None),
+ # with think start token
+ (THINK_START_SEQUENCE + TRUNCATED_SEQUENCE, None, ''.join(TRUNCATED_SEQUENCE)),
+ ])
+def test_truncated_think_nonstream(sequence, expected_content, expected_reasoning_content):
+
+ tokenizer = DummyTokenizer()
+ VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer)
+ VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer)
+ req = ChatCompletionRequest(model='qwen', messages=[], stream=False)
+ resp: ChatCompletionResponse = _chat_completion_v1(req, sequence)
+
+ assert len(resp.choices) == 1
+ first_message = resp.choices[0].message
+ assert first_message.content == expected_content
+ assert first_message.reasoning_content == expected_reasoning_content
+
+
+@pytest.mark.parametrize(
+ 'sequence, expected_content, expected_reasoning_content',
+ [
+ # without think start token
+ (TRUNCATED_SEQUENCE, ''.join(TRUNCATED_SEQUENCE), ''),
+ # with think start token
+ (THINK_START_SEQUENCE + TRUNCATED_SEQUENCE, '', ''.join(TRUNCATED_SEQUENCE)),
+ ])
+def test_truncated_think_stream(sequence, expected_content, expected_reasoning_content):
+ tokenizer = DummyTokenizer()
+ VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer)
+ VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer)
+ req = ChatCompletionRequest(model='qwen', messages=[], stream=True)
+ content, reasoning_content, tool_calls = _stream_parse(req, sequence)
+
+ assert content == expected_content
+ assert reasoning_content.lstrip() == expected_reasoning_content
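
For reference, a minimal sketch of the three token layouts the reworked extract_reasoning_content() handles, using the parser and helpers from the tests above (the literal token values '<think>'/'</think>' are an assumption about the tokenizer config, not something the patch pins down). The streaming path changes in the same spirit: once no <think> has been seen, deltas surface as plain content instead of reasoning.

    # Assumes the same imports as tests/test_lmdeploy/test_qwen3_parser.py
    parser = QwenQwQReasoningParser(tokenizer=DummyTokenizer())
    req = ChatCompletionRequest(model='qwen', messages=[])

    # 1. Well-formed pair: reasoning between the tags, answer after them.
    parser.extract_reasoning_content('<think>plan</think>answer', req)
    # -> ('plan', 'answer')

    # 2. Missing <think> (DeepSeek-R1 style): text before </think> is reasoning.
    parser.extract_reasoning_content('plan</think>answer', req)
    # -> ('plan', 'answer')

    # 3. Truncated: <think> is never closed, so the remainder is reasoning only.
    parser.extract_reasoning_content('<think>plan', req)
    # -> ('plan', None)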