Skip to content
Closed
16 changes: 12 additions & 4 deletions vllm/entrypoints/serve/render/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from http import HTTPStatus
from typing import Any
from typing import Any, cast

from openai_harmony import Message as OpenAIMessage

Expand Down Expand Up @@ -566,8 +566,16 @@ async def preprocess_chat(
)
raise NotImplementedError(msg)
tokenizer = renderer.get_tokenizer()
request = tool_parser(tokenizer, request.tools).adjust_request(
request=request
)
request = tool_parser(
tokenizer, cast(Any, request.tools)
).adjust_request(request=request)

if self.reasoning_parser is not None:
tokenizer = renderer.get_tokenizer()
request_chat_kwargs = getattr(request, "chat_template_kwargs", None) or {}
parser_instance = self.reasoning_parser(
tokenizer, chat_template_kwargs=request_chat_kwargs
)
request = parser_instance.adjust_request(request)

return conversation, [engine_input]
19 changes: 13 additions & 6 deletions vllm/reasoning/abs_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@ def vocab(self) -> dict[str, int]:
# whereas all tokenizers have .get_vocab()
return self.model_tokenizer.get_vocab()

def adjust_request(
    self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
    """Hook for mutating a request before inference runs.

    The base implementation returns *request* untouched.  Parser
    subclasses may override this to tweak decoding settings — for
    example, forcing ``skip_special_tokens=False`` so that the
    special-token delimiters the parser depends on survive
    detokenization.
    """
    return request

@abstractmethod
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
"""
Expand Down Expand Up @@ -150,12 +163,6 @@ def extract_reasoning_streaming(
previously been parsed and extracted (see constructor)
"""

def adjust_request(
self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
"""Adjust request parameters; override in subclasses as needed."""
return request

def prepare_structured_tag(
self,
original_tag: str | None,
Expand Down
47 changes: 32 additions & 15 deletions vllm/reasoning/gemma4_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,6 @@ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
self.tool_call_token_id = self.vocab["<|tool_call>"]
self.tool_response_token_id = self.vocab["<|tool_response>"]

def adjust_request(
self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
"""Disable special-token stripping to preserve boundary tokens."""
request.skip_special_tokens = False
return request

@property
def start_token(self) -> str:
"""The token that starts reasoning content."""
Expand Down Expand Up @@ -96,6 +89,30 @@ def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return True
return False

# ------------------------------------------------------------------
# Request adjustment
# ------------------------------------------------------------------

def adjust_request(
    self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
    """Disable special-token stripping when thinking mode is requested.

    If ``chat_template_kwargs`` carries ``enable_thinking=True``, the
    reasoning delimiter tokens must remain visible after detokenization,
    so ``skip_special_tokens`` is forced to ``False``.  Requests that are
    not chat/responses requests, or that do not enable thinking, pass
    through unchanged.
    """
    from vllm.entrypoints.openai.chat_completion.protocol import (
        ChatCompletionRequest,
    )
    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest

    if not isinstance(request, (ChatCompletionRequest, ResponsesRequest)):
        return request

    template_kwargs = getattr(request, "chat_template_kwargs", None) or {}
    if template_kwargs.get("enable_thinking", False):
        request.skip_special_tokens = False
    return request

# ------------------------------------------------------------------
# Non-streaming path
# ------------------------------------------------------------------
Expand Down Expand Up @@ -138,14 +155,14 @@ def extract_reasoning_streaming(
prefix is present, then emit the buffered content minus the
prefix.

Unlike the previous implementation which reconstructed accumulated
reasoning from ``current_text``, this uses instance state
(``_reasoning_text``) to track only the reasoning content returned
by the base parser. This is necessary because
``skip_special_tokens=True`` (the vLLM default) causes the
``<|channel>`` delimiter to be invisible in ``current_text``,
making it impossible to separate pre-reasoning content from
reasoning content via string matching.
This method uses instance state (``_reasoning_text``) to track
only the reasoning content returned by the base parser, rather
than reconstructing it from ``current_text``. Although
``adjust_request`` now forces ``skip_special_tokens=False``
(making ``<|channel>`` visible in ``current_text``), the
instance-state approach remains more robust against edge cases
where pre-reasoning content could interfere with prefix
stripping.
"""
result = super().extract_reasoning_streaming(
previous_text,
Expand Down
8 changes: 8 additions & 0 deletions vllm/reasoning/gemma4_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ def parse_thinking_output(text: str) -> dict[str, str | None]:

return {"thinking": thinking, "answer": answer}

# Handle truncated thinking: start tag present but end tag missing
# (model hit max_tokens before completing the chain-of-thought).
if _THINKING_START_TAG in text:
thinking = text.split(_THINKING_START_TAG, 1)[1]
thinking = _strip_thought_label(thinking.strip())
thinking = thinking.strip()
return {"thinking": thinking + "\n[truncated]", "answer": ""}

# No thinking delimiters found.
# Strip spurious "thought\n" role label that some Gemma4 models sometimes
# emit even without thinking mode enabled, then clean trailing tokens.
Expand Down
Loading