Skip to content
Closed
16 changes: 12 additions & 4 deletions vllm/entrypoints/serve/render/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from http import HTTPStatus
from typing import Any
from typing import Any, cast

from openai_harmony import Message as OpenAIMessage

Expand Down Expand Up @@ -566,8 +566,16 @@ async def preprocess_chat(
)
raise NotImplementedError(msg)
tokenizer = renderer.get_tokenizer()
request = tool_parser(tokenizer, request.tools).adjust_request(
request=request
)
request = tool_parser(
tokenizer, cast(Any, request.tools)
).adjust_request(request=request)

if self.reasoning_parser is not None:
tokenizer = renderer.get_tokenizer()
request_chat_kwargs = getattr(request, "chat_template_kwargs", None) or {}
parser_instance = self.reasoning_parser(
tokenizer, chat_template_kwargs=request_chat_kwargs
)
request = parser_instance.adjust_request(request)

return conversation, [engine_input]
19 changes: 13 additions & 6 deletions vllm/reasoning/abs_reasoning_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,19 @@ def vocab(self) -> dict[str, int]:
# whereas all tokenizers have .get_vocab()
return self.model_tokenizer.get_vocab()

def adjust_request(
    self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
    """Hook for mutating a request before inference runs.

    The base implementation returns *request* untouched.  Parser
    subclasses may override this to tweak decoding settings — for
    example, forcing ``skip_special_tokens=False`` so that the
    special-token delimiters the parser depends on survive
    detokenization.
    """
    return request

@abstractmethod
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
"""
Expand Down Expand Up @@ -150,12 +163,6 @@ def extract_reasoning_streaming(
previously been parsed and extracted (see constructor)
"""

def adjust_request(
self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
"""Adjust request parameters; override in subclasses as needed."""
return request

def prepare_structured_tag(
self,
original_tag: str | None,
Expand Down
47 changes: 32 additions & 15 deletions vllm/reasoning/gemma4_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,6 @@ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
self.tool_call_token_id = self.vocab["<|tool_call>"]
self.tool_response_token_id = self.vocab["<|tool_response>"]

def adjust_request(
self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
"""Disable special-token stripping to preserve boundary tokens."""
request.skip_special_tokens = False
return request

@property
def start_token(self) -> str:
"""The token that starts reasoning content."""
Expand Down Expand Up @@ -96,6 +89,30 @@ def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
return True
return False

# ------------------------------------------------------------------
# Request adjustment
# ------------------------------------------------------------------

def adjust_request(
    self, request: "ChatCompletionRequest | ResponsesRequest"
) -> "ChatCompletionRequest | ResponsesRequest":
    """Disable special-token stripping when thinking mode is requested.

    If ``chat_template_kwargs`` carries ``enable_thinking=True``, the
    reasoning delimiter tokens must remain visible after detokenization,
    so ``skip_special_tokens`` is forced to ``False``.  Requests that are
    not chat/responses requests, or that do not enable thinking, pass
    through unchanged.
    """
    from vllm.entrypoints.openai.chat_completion.protocol import (
        ChatCompletionRequest,
    )
    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest

    if not isinstance(request, (ChatCompletionRequest, ResponsesRequest)):
        return request

    template_kwargs = getattr(request, "chat_template_kwargs", None) or {}
    if template_kwargs.get("enable_thinking", False):
        request.skip_special_tokens = False
    return request

# ------------------------------------------------------------------
# Non-streaming path
# ------------------------------------------------------------------
Expand Down Expand Up @@ -138,14 +155,14 @@ def extract_reasoning_streaming(
prefix is present, then emit the buffered content minus the
prefix.

Unlike the previous implementation which reconstructed accumulated
reasoning from ``current_text``, this uses instance state
(``_reasoning_text``) to track only the reasoning content returned
by the base parser. This is necessary because
``skip_special_tokens=True`` (the vLLM default) causes the
``<|channel>`` delimiter to be invisible in ``current_text``,
making it impossible to separate pre-reasoning content from
reasoning content via string matching.
This method uses instance state (``_reasoning_text``) to track
only the reasoning content returned by the base parser, rather
than reconstructing it from ``current_text``. Although
``adjust_request`` now forces ``skip_special_tokens=False``
(making ``<|channel>`` visible in ``current_text``), the
instance-state approach remains more robust against edge cases
where pre-reasoning content could interfere with prefix
stripping.
"""
result = super().extract_reasoning_streaming(
previous_text,
Expand Down
8 changes: 8 additions & 0 deletions vllm/reasoning/gemma4_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ def parse_thinking_output(text: str) -> dict[str, str | None]:

return {"thinking": thinking, "answer": answer}

# Handle truncated thinking: start tag present but end tag missing
# (model hit max_tokens before completing the chain-of-thought).
if _THINKING_START_TAG in text:
thinking = text.split(_THINKING_START_TAG, 1)[1]
thinking = _strip_thought_label(thinking.strip())
thinking = thinking.strip()
return {"thinking": thinking + "\n[truncated]", "answer": ""}

# No thinking delimiters found.
# Strip spurious "thought\n" role label that some Gemma4 models sometimes
# emit even without thinking mode enabled, then clean trailing tokens.
Expand Down
Loading