From 41bdeedfdfe2ed6c8272b2050e6f4176be91a222 Mon Sep 17 00:00:00 2001 From: Luciano Martins Date: Wed, 8 Apr 2026 21:26:41 +0000 Subject: [PATCH] [Frontend] Preserve structured output special tokens in offline LLM.chat - addresses data loss in offline path where default 'skip_special_tokens=True' strips reasoning and tool-calling delimiters - implements '_adjust_params_for_parsing' to inspect tokenizer vocabulary and detect active special tokens (e.g., <|channel|>, <|tool_call|>, <|"|>) - dynamically enforces 'skip_special_tokens=False' in SamplingParams when 'enable_thinking=True' or 'tools' are present. - restricts override to tokenizers that actually register these strings as special tokens, maintaining no-op transparency for other models Signed-off-by: Luciano Martins --- vllm/entrypoints/llm.py | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index d296e84d0411..0a3baffb5ea4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1638,6 +1638,17 @@ def _run_chat( seq_params = self._params_to_seq(params, len(seq_convs)) seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs)) + # When thinking is enabled or tools are provided, and the model + # uses special tokens for structured output (e.g. Gemma4's + # <|channel>, <|tool_call>, <|"|>), automatically set + # skip_special_tokens=False so these tokens are preserved in + # output.text for downstream parsing. 
+ needs_parsing = ( + chat_template_kwargs and chat_template_kwargs.get("enable_thinking") + ) or tools + if needs_parsing: + self._adjust_params_for_parsing(seq_params) + return self._render_and_run_requests( prompts=( self._preprocess_chat_one( @@ -1663,6 +1674,53 @@ def _run_chat( use_tqdm=use_tqdm, ) + def _adjust_params_for_parsing( + self, params: Sequence[SamplingParams | PoolingParams] + ) -> None: + """Set ``skip_special_tokens=False`` when the model encodes + structured output syntax as special tokens. + + Models like Gemma4 register thinking delimiters + (``<|channel>``/``</channel>``) and tool call tokens + (``<|tool_call>``/``</tool_call>``/``<|"|>``) as special tokens. + The default ``skip_special_tokens=True`` strips them from + ``output.text``, breaking parsing of both reasoning blocks and + tool calls. + + This is a no-op for models whose structured tokens are regular + text tokens (e.g. DeepSeek's ``<think>``/``</think>``). + """ + # The offline API currently lacks a unified rendering pipeline. + # Until the planned Renderer refactor is complete, we hardcode + # this token preservation logic specifically for Gemma4 models + # to avoid regressions on other models. + hf_config = getattr(self.model_config, "hf_config", None) + architectures = getattr(hf_config, "architectures", []) + + if any("Gemma4" in arch for arch in architectures): + tokenizer = self.renderer.get_tokenizer() + vocab = tokenizer.get_vocab() + special_ids = set(getattr(tokenizer, "all_special_ids", [])) + + # Tokens used for thinking delimiters and tool call syntax + # that some models (Gemma4) register as special tokens.
+ structured_tokens = ( + "<|channel>", + "</channel>", # thinking delimiters + "<|tool_call>", + "</tool_call>", # tool call delimiters + '<|"|>', # string quoting in tool args + ) + needs_special = any( + vocab.get(tok) in special_ids + for tok in structured_tokens + if tok in vocab + ) + if needs_special: + for sp in params: + if isinstance(sp, SamplingParams) and sp.skip_special_tokens: + sp.skip_special_tokens = False + def _render_and_run_requests( self, prompts: Iterable[EngineInput],