35 changes: 10 additions & 25 deletions lmdeploy/serve/openai/api_server.py
@@ -41,7 +41,8 @@
GenerateReqInput, GenerateReqMetaOutput, GenerateReqOutput, LogProbs,
ModelCard, ModelList, ModelPermission, PoolingRequest, PoolingResponse,
TopLogprob, UpdateParamsRequest, UsageInfo)
from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import ReasoningParser, ReasoningParserManager
from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import (ReasoningParser, ReasoningParserManager,
get_streaming_state)
from lmdeploy.serve.openai.tool_parser.tool_parser import ToolParser, ToolParserManager
from lmdeploy.serve.utils.server_utils import validate_json_request
from lmdeploy.tokenizer import DetokenizeState, Tokenizer
@@ -505,13 +506,10 @@ def create_stream_response_json(index: int,
return response_json

async def completion_stream_generator() -> AsyncGenerator[str, None]:
previous_text = ''
current_text = ''
previous_token_ids = []
current_token_ids = []
delta_token_ids = []
has_parser = VariableInterface.tool_parser is not None or VariableInterface.reasoning_parser is not None
streaming_tools = False
# Shared state for streaming parsers (previous/current text & token ids)
parser_state = get_streaming_state(request) if has_parser else None
async for res in result_generator:
logprobs, usage = None, None
if gen_logprobs and res.logprobs:
@@ -533,20 +531,13 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
res.finish_reason = 'tool_calls'
else:
delta_message = DeltaMessage(role='assistant', content=res.response)
if has_parser:
current_text = current_text + res.response
current_token_ids = current_token_ids + delta_token_ids
if parser_state is not None:
parser_state.update(res.response, delta_token_ids)
if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
if res.finish_reason == 'stop' and streaming_tools is True:
res.finish_reason = 'tool_calls'
tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=delta_message.content,
previous_token_ids=previous_token_ids,
current_token_ids=current_token_ids,
delta_token_ids=delta_token_ids,
request=request)
delta_text=delta_message.content, delta_token_ids=delta_token_ids, request=request)
if tool_delta is not None:
delta_message.tool_calls = tool_delta.tool_calls
delta_message.content = tool_delta.content
@@ -557,18 +548,12 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
logger.error('Please launch the api_server with --tool-call-parser if you want to use tool.')
if VariableInterface.reasoning_parser is not None and enable_thinking is not False:
reasoning_delta = VariableInterface.reasoning_parser.extract_reasoning_content_streaming(
previous_text=previous_text,
current_text=current_text,
delta_text=delta_message.content or '',
previous_token_ids=previous_token_ids,
current_token_ids=current_token_ids,
delta_token_ids=delta_token_ids)
delta_text=delta_message.content or '', delta_token_ids=delta_token_ids, request=request)
if reasoning_delta is not None:
delta_message.reasoning_content = reasoning_delta.reasoning_content
delta_message.content = reasoning_delta.content
if has_parser:
previous_text = current_text
previous_token_ids = current_token_ids
if parser_state is not None:
parser_state.step()
if request.return_token_ids:
delta_message.gen_tokens = delta_token_ids
response_json = create_stream_response_json(index=0,
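The refactor above replaces the five hand-threaded bookkeeping variables with a single shared state object obtained from `get_streaming_state(request)` and advanced via `update(...)` and `step()`. A minimal sketch of the shape those calls imply — only the three function/method names and the previous/current text and token-id fields are taken from the diff; everything else (the dataclass layout, the per-request caching) is an assumption:

```python
# Sketch of the streaming-state helper implied by the diff above; the
# attribute names follow the old local variables, and the request-keyed
# caching in get_streaming_state is an assumption.
from dataclasses import dataclass, field
from typing import List


@dataclass
class StreamingParserState:
    previous_text: str = ''
    current_text: str = ''
    previous_token_ids: List[int] = field(default_factory=list)
    current_token_ids: List[int] = field(default_factory=list)

    def update(self, delta_text: str, delta_token_ids: List[int]) -> None:
        # Accumulate the newly generated chunk into the current snapshot.
        self.current_text += delta_text
        self.current_token_ids = self.current_token_ids + delta_token_ids

    def step(self) -> None:
        # Commit the current snapshot so the next delta diffs against it.
        self.previous_text = self.current_text
        self.previous_token_ids = self.current_token_ids


def get_streaming_state(request) -> StreamingParserState:
    # Illustrative only: attach one state object per request instance so
    # tool and reasoning parsers share the same previous/current view.
    if not hasattr(request, '_parser_state'):
        request._parser_state = StreamingParserState()
    return request._parser_state
```

This also explains why the parser call sites shrink to `delta_text=..., delta_token_ids=..., request=request`: the parser can recover the previous/current context from the shared state instead of receiving it as six keyword arguments.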
15 changes: 12 additions & 3 deletions lmdeploy/serve/openai/reasoning_parser/__init__.py
@@ -1,6 +1,15 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
from .qwen_qwq_reasoning_parser import QwenQwQReasoningParser
from .reasoning_parser import ReasoningParser, ReasoningParserManager
from .qwen_reasoning_parser import QwenQwQReasoningParser
from .reasoning_parser import (ReasoningParser, ReasoningParserManager, StreamingParserState, ThinkingReasoningParser,
get_streaming_state)

__all__ = ['ReasoningParser', 'ReasoningParserManager', 'DeepSeekR1ReasoningParser', 'QwenQwQReasoningParser']
__all__ = [
'ReasoningParser',
'ReasoningParserManager',
'StreamingParserState',
'ThinkingReasoningParser',
'get_streaming_state',
'DeepSeekR1ReasoningParser',
'QwenQwQReasoningParser',
]
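For context, parsers register themselves by name (the diff below shows `@ReasoningParserManager.register_module(name='deepseek-r1')`) and are looked up when the server starts with the corresponding CLI flag. A hedged sketch of the register-by-name pattern — `register_module` appears in the diff, but the internal dict and the lookup method here are illustrative assumptions:

```python
# Illustrative registry in the style of ReasoningParserManager; only
# register_module is confirmed by the diff, the rest is an assumption.
from typing import Callable, Dict, Type


class ParserRegistry:
    _modules: Dict[str, Type] = {}

    @classmethod
    def register_module(cls, name: str) -> Callable[[Type], Type]:
        # Class decorator: store the parser class under `name`.
        def _register(parser_cls: Type) -> Type:
            cls._modules[name] = parser_cls
            return parser_cls
        return _register

    @classmethod
    def module_named(cls, name: str) -> Type:
        return cls._modules[name]


@ParserRegistry.register_module(name='demo')
class DemoParser:
    pass


assert ParserRegistry.module_named('demo') is DemoParser
```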
145 changes: 15 additions & 130 deletions lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py
@@ -1,140 +1,25 @@
# Copyright (c) OpenMMLab. All rights reserved.
# modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers
import re
from typing import Optional, Sequence, Tuple, Union

from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage

from .reasoning_parser import ReasoningParser, ReasoningParserManager
from .reasoning_parser import ReasoningParserManager, ThinkingReasoningParser


@ReasoningParserManager.register_module(name='deepseek-r1')
class DeepSeekR1ReasoningParser(ReasoningParser):
class DeepSeekR1ReasoningParser(ThinkingReasoningParser):
"""Reasoning parser for DeepSeek R1 model.

The DeepSeek R1 model uses <think>...</think> tokens to denote reasoning text. This parser extracts the reasoning
content from the model output.
Uses <think>...</think> tokens. When the end tag is missing in
non-streaming mode, the entire output is treated as reasoning content
(DeepSeek R1 may omit the start tag).

Ref: https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
"""

start_token = '<think>'
end_token = '</think>'
strip_newlines = False
on_missing_end_tag = 'reasoning'

def __init__(self, tokenizer: object):
super().__init__(tokenizer)
self.think_start_token = '<think>'
self.think_end_token = '</think>'

self.reasoning_regex = re.compile(rf'{self.think_start_token}(.*?){self.think_end_token}', re.DOTALL)

if not self.model_tokenizer:
raise ValueError('The model tokenizer must be passed to the ReasoningParser '
'constructor during construction.')

self.think_start_token_id = self.vocab.get(self.think_start_token)
self.think_end_token_id = self.vocab.get(self.think_end_token)
if (self.think_start_token_id is None or self.think_end_token_id is None):
raise RuntimeError('DeepSeek R1 reasoning parser could not locate think start/end '
'tokens in the tokenizer!')

def extract_reasoning_content_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
**kwargs,
) -> Union[DeltaMessage, None]:
"""Instance method that should be implemented for extracting reasoning
from an incomplete response; for use when handling reasoning calls and
streaming.

Has to be an instance method because it requires state - the current tokens/diffs, but also the information
about what has previously been parsed and extracted (see constructor)
"""
# Skip single special tokens
if len(delta_token_ids) == 1:
if delta_token_ids[0] == self.think_end_token_id:
return DeltaMessage(content='')
elif delta_token_ids[0] == self.think_start_token_id:
return None

# Check if <think> is present in previous or delta.
# Keep compatibility with models that don't generate <think> tokens.
if self.think_start_token_id in previous_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in previous, </think> in delta,
# extract reasoning content
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token):]
return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
elif self.think_end_token_id in previous_token_ids:
# <think> in previous, </think> in previous,
return DeltaMessage(content=delta_text)
else:
# <think> in previous, no </think> in previous or delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
elif self.think_start_token_id in delta_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in delta, </think> in delta, extract reasoning content
start_index = delta_text.find(self.think_start_token)
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[start_index + len(self.think_start_token):end_index]
content = delta_text[end_index + len(self.think_end_token):]
return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
else:
# <think> in delta, no </think> in delta,
# reasoning content continues
return DeltaMessage(reasoning_content=delta_text)
else:
# No <think> in previous or delta, also need to check for </think>.
# Because the model may have generated </think> without <think>
# Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
if self.think_end_token_id in delta_token_ids:
# </think> in delta with more tokens,
# extract reasoning content and content
end_index = delta_text.find(self.think_end_token)
reasoning_content = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token):]
return DeltaMessage(reasoning_content=reasoning_content, content=content if content else None)
elif self.think_end_token_id in previous_token_ids:
# </think> in previous, thinking content ends
return DeltaMessage(content=delta_text)
else:
# no </think> in previous or delta, reasoning content continues
return DeltaMessage(reasoning_content=delta_text)

def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
**kwargs) -> Tuple[Optional[str], Optional[str]]:
"""Extract reasoning content from a complete model-generated string.

Used for non-streaming responses where we have the entire model response
available before sending to the client.

Args:
model_output (str): The model-generated string to extract reasoning content from.
request (ChatCompletionRequest): The request object that was used to generate the model_output.

Returns:
reasoning_content (str | None): The reasoning content.
final_output (str | None): The content.
"""
# DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start.
# Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
if self.think_end_token not in model_output:
return model_output, None
else:
# Add a start token if it's missing to keep compatibility.
if self.think_start_token not in model_output:
model_output = f'{self.think_start_token}{model_output}'
# Use a regex to find the reasoning content
reasoning_content = self.reasoning_regex.findall(model_output)[0]

end_index = len(f'{self.think_start_token}{reasoning_content}{self.think_end_token}')
final_output = model_output[end_index:]

if len(final_output) == 0:
return reasoning_content, None

return reasoning_content, final_output
if self.start_token_id is None or self.end_token_id is None:
raise RuntimeError('DeepSeek R1 reasoning parser could not locate '
'think start/end tokens in the tokenizer!')
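The net effect of this file's diff is that the shared `<think>`-style tag handling moves into `ThinkingReasoningParser`, so a concrete parser now only declares its tags and policy knobs plus a sanity check. A sketch of what another tag-based parser would look like under this scheme — the tag values and registry name are hypothetical, and it assumes the base class resolves `start_token_id`/`end_token_id` from the tokenizer vocabulary as the check above implies:

```python
# Hypothetical parser built on the new base class; <reason> tags and the
# registry name 'my-tagged-model' are made up for illustration.
from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import (
    ReasoningParserManager, ThinkingReasoningParser)


@ReasoningParserManager.register_module(name='my-tagged-model')
class MyTaggedReasoningParser(ThinkingReasoningParser):
    """Parses <reason>...</reason> blocks from a hypothetical model."""

    start_token = '<reason>'
    end_token = '</reason>'
    # Keep leading/trailing newlines around the reasoning block.
    strip_newlines = False
    # If generation stops before </reason>, treat everything seen so far
    # as reasoning content (the same policy DeepSeek R1 uses above).
    on_missing_end_tag = 'reasoning'
```

Compared with the roughly 130 deleted lines of per-model case analysis, the subclass is reduced to declarative configuration, which is what the `145 changes: 15 additions & 130 deletions` header reflects.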