Skip to content

Commit fd51456

Browse files
committed
[Bugfix] [Frontend] Fix Gemma 4 reasoning parser for multi-turn and tool calls
Add is_reasoning_end override so the parser correctly detects reasoning boundaries when tool calls are present or across multi-turn conversations. Also refactor test helpers for reuse. Signed-off-by: Ben Browning <bbrownin@redhat.com>
1 parent b060a04 commit fd51456

File tree

2 files changed

+61
-8
lines changed

2 files changed

+61
-8
lines changed

tests/reasoning/test_gemma4_reasoning_parser.py

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,13 @@ def generic_tokenizer():
128128
"content": "Done",
129129
"is_reasoning_end": True,
130130
}
131+
# Once the model emits a tool-call marker it is no longer reasoning, so the
# marker itself is surfaced as content and reasoning is considered finished.
TOOL_CALL_STARTED = dict(
    output="<|tool_call>",
    reasoning=None,
    content="<|tool_call>",
    is_reasoning_end=True,
)
131138

132139
TEST_CASES = [
133140
pytest.param(False, INVALID_SIMPLE_NONSTREAMING, id="invalid_simple"),
@@ -159,17 +166,12 @@ def generic_tokenizer():
159166
),
160167
pytest.param(False, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge"),
161168
pytest.param(True, THOUGHT_PREFIX_DIVERGE, id="thought_prefix_diverge_streaming"),
169+
pytest.param(False, TOOL_CALL_STARTED, id="tool_call_started"),
170+
pytest.param(True, TOOL_CALL_STARTED, id="tool_call_started_streaming"),
162171
]
163172

164173

165-
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
166-
def test_gemma4_reasoning(
167-
streaming: bool,
168-
param_dict: dict,
169-
generic_tokenizer,
170-
):
171-
output = param_dict["output"]
172-
174+
def gemma4_encode_output(generic_tokenizer, output: str) -> list[int]:
173175
# Resolve token IDs dynamically from the real tokenizer
174176
vocab = generic_tokenizer.get_vocab()
175177
start_token_id = vocab["<|channel>"]
@@ -215,6 +217,18 @@ def _encode(text: str) -> list[int]:
215217
else:
216218
output_tokens += _encode(output)
217219

220+
return output_tokens
221+
222+
223+
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
224+
def test_gemma4_reasoning(
225+
streaming: bool,
226+
param_dict: dict,
227+
generic_tokenizer,
228+
):
229+
output = param_dict["output"]
230+
output_tokens = gemma4_encode_output(generic_tokenizer, output)
231+
218232
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
219233
generic_tokenizer
220234
)
@@ -246,3 +260,16 @@ def test_gemma4_adjust_request(generic_tokenizer):
246260
result = parser.adjust_request(request)
247261
assert result.skip_special_tokens is False
248262
assert result is request
263+
264+
265+
def test_gemma4_previous_turn_reasoning_is_reasoning_end(generic_tokenizer):
    """Reasoning that ended in a previous turn must not count as ended.

    After a completed reasoning block, a user turn and a fresh model turn
    follow; the parser should report that reasoning has not (yet) ended for
    the new turn.
    """
    previous_turn = "<|channel>thought\n1st thought<channel|>1st content<turn|>\n"
    next_turns = "<|turn>user\nThanks<|turn>model\n"
    token_ids = gemma4_encode_output(generic_tokenizer, previous_turn + next_turns)

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(generic_tokenizer)

    assert not parser.is_reasoning_end(token_ids)

vllm/reasoning/gemma4_reasoning_parser.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
5252
# skip_special_tokens=True).
5353
self._reasoning_text: str = ""
5454
self._prefix_stripped: bool = False
55+
self.new_turn_token_id = self.vocab["<|turn>"]
56+
self.tool_call_token_id = self.vocab["<|tool_call>"]
57+
self.tool_response_token_id = self.vocab["<|tool_response>"]
5558

5659
def adjust_request(
5760
self, request: "ChatCompletionRequest | ResponsesRequest"
@@ -70,6 +73,29 @@ def end_token(self) -> str:
7073
"""The token that ends reasoning content."""
7174
return "<channel|>"
7275

76+
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
    """Return True if reasoning has ended in the generated token stream.

    Walks the tokens from newest to oldest and decides based on the most
    recent boundary marker found:

    * reasoning-start token -> reasoning is still in progress (False)
    * tool-call token       -> a tool call is being generated, so any
      reasoning must already be over (True)
    * new-turn / tool-response token -> the model begins fresh reasoning
      after these, so reasoning is not considered ended (False)
    * reasoning-end token   -> reasoning finished normally (True)

    With no marker present at all, reasoning has not ended.
    """
    for token_id in reversed(input_ids):
        if token_id == self.start_token_id:
            return False
        if token_id == self.tool_call_token_id:
            return True
        if token_id in (self.new_turn_token_id, self.tool_response_token_id):
            return False
        if token_id == self.end_token_id:
            return True
    return False
98+
7399
# ------------------------------------------------------------------
74100
# Non-streaming path
75101
# ------------------------------------------------------------------

0 commit comments

Comments
 (0)