From 41bdeedfdfe2ed6c8272b2050e6f4176be91a222 Mon Sep 17 00:00:00 2001 From: Luciano Martins Date: Wed, 8 Apr 2026 21:26:41 +0000 Subject: [PATCH] [Frontend] Preserve structured output special tokens in offline LLM.chat - addresses data loss in offline path where default 'skip_special_tokens=True' strips reasoning and tool-calling delimiters - implements '_adjust_params_for_parsing' to inspect tokenizer vocabulary and detect active special tokens (e.g., <|channel|>, <|tool_call|>, <|"|>) - dynamically enforces 'skip_special_tokens=False' in SamplingParams when 'enable_thinking=True' or 'tools' are present. - restricts override to tokenizers that actually register these strings as special tokens, maintaining no-op transparency for other models Signed-off-by: Luciano Martins --- vllm/entrypoints/llm.py | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index d296e84d0411..0a3baffb5ea4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1638,6 +1638,17 @@ def _run_chat( seq_params = self._params_to_seq(params, len(seq_convs)) seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs)) + # When thinking is enabled or tools are provided, and the model + # uses special tokens for structured output (e.g. Gemma4's + # <|channel>, <|tool_call>, <|"|>), automatically set + # skip_special_tokens=False so these tokens are preserved in + # output.text for downstream parsing. 
+ needs_parsing = ( + chat_template_kwargs and chat_template_kwargs.get("enable_thinking") + ) or tools + if needs_parsing: + self._adjust_params_for_parsing(seq_params) + return self._render_and_run_requests( prompts=( self._preprocess_chat_one( @@ -1663,6 +1674,53 @@ def _run_chat( use_tqdm=use_tqdm, ) + def _adjust_params_for_parsing( + self, params: Sequence[SamplingParams | PoolingParams] + ) -> None: + """Set ``skip_special_tokens=False`` when the model encodes + structured output syntax as special tokens. + + Models like Gemma4 register thinking delimiters + (``<|channel>``/``</channel>``) and tool call tokens + (``<|tool_call>``/``</tool_call>``/``<|"|>``) as special tokens. + The default ``skip_special_tokens=True`` strips them from + ``output.text``, breaking parsing of both reasoning blocks and + tool calls. + + This is a no-op for models whose structured tokens are regular + text tokens (e.g. DeepSeek's ``<think>``/``</think>``). + """ + # The offline API currently lacks a unified rendering pipeline. + # Until the planned Renderer refactor is complete, we hardcode + # this token preservation logic specifically for Gemma4 models + # to avoid regressions on other models. + hf_config = getattr(self.model_config, "hf_config", None) + architectures = getattr(hf_config, "architectures", []) + + if any("Gemma4" in arch for arch in architectures): + tokenizer = self.renderer.get_tokenizer() + vocab = tokenizer.get_vocab() + special_ids = set(getattr(tokenizer, "all_special_ids", [])) + + # Tokens used for thinking delimiters and tool call syntax + # that some models (Gemma4) register as special tokens.
+ structured_tokens = ( + "<|channel>", + "</channel>", # thinking delimiters + "<|tool_call>", + "</tool_call>", # tool call delimiters + '<|"|>', # string quoting in tool args + ) + needs_special = any( + vocab.get(tok) in special_ids + for tok in structured_tokens + if tok in vocab + ) + if needs_special: + for sp in params: + if isinstance(sp, SamplingParams) and sp.skip_special_tokens: + sp.skip_special_tokens = False + def _render_and_run_requests( self, prompts: Iterable[EngineInput],