Merge branch 'main' into gold_vlm_support

Strongich · web-flow · commit 537cc137eef0 · 2026-04-06T18:14:39.000+02:00
diff --git a/docs/source/grpo_trainer.md b/docs/source/grpo_trainer.md
@@ -716,6 +716,26 @@ trainer.train()
 
 `reset` can return either `None` or a string. In GRPO, when it returns a string, that string is appended to the last user message before generation.
 
+### Multimodal Tool Responses
+
+Tools can return images alongside text by returning a list of content blocks. This is useful for VLM agent training where the tool provides visual feedback (e.g., screenshots, plots, camera captures).
+
+```python
+from PIL import Image
+
+def take_screenshot() -> list:
+    """
+    Takes a screenshot of the current screen.
+
+    Returns:
+        The screenshot image with a description.
+    """
+    img = Image.open("screenshot.png")
+    return [{"type": "image", "image": img}, {"type": "text", "text": "Here is the screenshot."}]
+```
+
+The returned images are automatically injected into the conversation and passed to the VLM for subsequent generation turns.
+
 ### Supported Models
 
 Tested with:
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
@@ -2446,6 +2446,90 @@ def fake_generate(input_ids, **kwargs):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    @pytest.mark.xfail(
+        condition=Version(transformers.__version__) < Version("5.2.0"),
+        reason="Qwen3.5 models were introduced in transformers-5.2.0",
+        strict=True,
+    )
+    @require_jmespath
+    @require_vision
+    def test_training_with_tools_multimodal_response(self):
+        # Test that tools returning images (multimodal responses) work correctly with a VLM.
+        # The tool returns a list of content blocks including an image.
+        from PIL import Image as PILImage
+
+        def screenshot_tool() -> list:
+            """
+            Takes a screenshot and returns it.
+
+            Returns:
+                A list of content blocks with the screenshot image.
+            """
+            img = PILImage.new("RGB", (64, 64), color="red")
+            return [{"type": "image", "image": img}, {"type": "text", "text": "Here is the screenshot"}]
+
+        dataset = load_dataset("trl-internal-testing/zen", "conversational_prompt_only", split="train")
+
+        training_args = GRPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
+            per_device_train_batch_size=3,  # reduce the batch size to reduce memory usage
+            num_generations=3,  # reduce the number of generations to reduce memory usage
+            max_completion_length=512,
+            report_to="none",
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Qwen3_5ForConditionalGeneration",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=training_args,
+            train_dataset=dataset,
+            tools=[screenshot_tool],
+        )
+
+        previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
+
+        def fake_generate(input_ids, **kwargs):
+            if input_ids.shape[0] == 3:  # first call
+                # fmt: off
+                completion_ids = torch.tensor(
+                    [
+                        # '<tool_call>\n<function=screenshot_tool>\n</function>\n</tool_call><|im_end|>'
+                        [248058, 198, 27, 1628, 13744, 30091, 22076, 29, 198, 510, 1628, 29, 198, 248059, 248046],
+                        # "I don't know any tool<|im_end|>" + padding
+                        [40, 1459, 914, 1366, 866, 5224, 248046, 248044, 248044, 248044, 248044, 248044, 248044, 248044, 248044],
+                        # '<tool_call>\n<function=screenshot_tool>\n</function>\n</tool_call><|im_end|>'
+                        [248058, 198, 27, 1628, 13744, 30091, 22076, 29, 198, 510, 1628, 29, 198, 248059, 248046],
+                    ],
+                    device=input_ids.device,
+                )
+                # fmt: on
+            else:  # second call: 2 tool calls succeeded
+                completion_ids = torch.tensor(
+                    [
+                        # 'Done!<|im_end|>'
+                        [16936, 0, 248046],
+                        # 'Done!<|im_end|>'
+                        [16936, 0, 248046],
+                    ],
+                    device=input_ids.device,
+                )
+            return torch.cat([input_ids, completion_ids], dim=-1)
+
+        with patch.object(trainer.model, "generate", side_effect=fake_generate):
+            trainer.train()
+
+        assert trainer.state.log_history[-1]["train_loss"] is not None
+        assert trainer.state.log_history[-1]["tools/call_frequency"] == pytest.approx(2 / 3)
+        assert trainer.state.log_history[-1]["tools/failure_frequency"] == pytest.approx(0.0)
+
+        # Check that the params have changed (skip vision parts, see test_training_vlm)
+        params_to_skip = ("model.visual.",)
+        for n, param in previous_trainable_params.items():
+            if n.startswith(params_to_skip):
+                continue
+            new_param = trainer.model.get_parameter(n)
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
+
     @pytest.mark.xfail(
         condition=Version(transformers.__version__) < Version("5.2.0"),
         reason="Environment factory support is not available in transformers versions below 5.2.0",
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -24,6 +24,7 @@
 import transformers
 from packaging.version import Version
 from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
+from transformers.testing_utils import torch_device
 from transformers.utils import is_peft_available
 
 from trl import ModelConfig
@@ -48,7 +49,7 @@
     use_adapter,
 )
 
-from .testing_utils import TrlTestCase, require_peft, require_rich
+from .testing_utils import TrlTestCase, require_peft, require_rich, require_torch_accelerator
 
 
 if is_peft_available():
@@ -960,6 +961,23 @@ def test_multi_images(self):
         assert torch.equal(result["image_grid_thw"][0], torch.tensor([[1, 1, 2]]))
         assert torch.equal(result["image_grid_thw"][1], torch.tensor([[1, 2, 2], [1, 2, 1]]))
 
+    def test_split_by_image_position_ids(self):
+        # Gemma-style: no image_grid_thw; pixel_values and image_position_ids are both split by num_images
+        batch = {
+            "num_images": [1, 2],
+            "pixel_values": torch.arange(3 * 4).reshape(3, 4),
+            "image_position_ids": torch.tensor([[0, 1], [2, 3], [4, 5]]),
+        }
+        result = split_pixel_values_by_grid(batch)
+        assert isinstance(result["pixel_values"], list)
+        assert len(result["pixel_values"]) == 2
+        assert torch.equal(result["pixel_values"][0], batch["pixel_values"][:1])
+        assert torch.equal(result["pixel_values"][1], batch["pixel_values"][1:])
+        assert isinstance(result["image_position_ids"], list)
+        assert len(result["image_position_ids"]) == 2
+        assert torch.equal(result["image_position_ids"][0], batch["image_position_ids"][:1])
+        assert torch.equal(result["image_position_ids"][1], batch["image_position_ids"][1:])
+
 
 class TestUnsplitPixelValuesByGrid(TrlTestCase):
     def test_unsplit_correctly(self):
@@ -975,13 +993,23 @@ def test_unsplit_correctly(self):
         assert torch.equal(result["image_grid_thw"], image_grid_thw_merged)
         assert "other_key" in result
 
+    def test_unsplit_image_position_ids(self):
+        image_position_ids = [torch.tensor([[0, 1]]), torch.tensor([[2, 3], [4, 5]])]
+        image_position_ids_merged = torch.cat(image_position_ids, dim=0)
+        pixel_values = [torch.randn(1, 4), torch.randn(2, 4)]
+        batch = {"pixel_values": pixel_values, "image_position_ids": image_position_ids}
+        result = unsplit_pixel_values_by_grid(batch)
+        assert isinstance(result["image_position_ids"], torch.Tensor)
+        assert torch.equal(result["image_position_ids"], image_position_ids_merged)
+
     def test_no_op_if_not_list(self):
         original = torch.randn(5, 3)
         batch = {"pixel_values": original}
         result = unsplit_pixel_values_by_grid(batch)
         assert torch.equal(result["pixel_values"], original)
 
 
+@require_torch_accelerator
 class TestForwardMaskedLogits:
     @pytest.mark.parametrize(
         "model_id",
@@ -1005,12 +1033,11 @@ class TestForwardMaskedLogits:
         ],
     )
     def test_llm(self, model_id):
-        device = torch.device("cuda")
-        model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map=device)
-        input_ids = torch.randint(0, model.config.vocab_size, (2, 8), device=device)
+        model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", device_map=torch_device)
+        input_ids = torch.randint(0, model.config.vocab_size, (2, 8), device=torch_device)
         logits_mask = torch.tensor(
             [[1, 1, 0, 0, 1, 0, 1, 0], [0, 1, 1, 0, 0, 1, 0, 1]],
-            device=device,
+            device=torch_device,
         )
 
         full_outputs = model(input_ids=input_ids)
@@ -1051,12 +1078,11 @@ def test_llm(self, model_id):
         ],
     )
     def test_vlm(self, model_id):
-        device = torch.device("cuda")
-        model = AutoModelForImageTextToText.from_pretrained(model_id, dtype="auto", device_map=device)
-        input_ids = torch.randint(0, model.config.text_config.vocab_size, (2, 8), device=device)
+        model = AutoModelForImageTextToText.from_pretrained(model_id, dtype="auto", device_map=torch_device)
+        input_ids = torch.randint(0, model.config.text_config.vocab_size, (2, 8), device=torch_device)
         logits_mask = torch.tensor(
             [[1, 1, 0, 0, 1, 0, 1, 0], [0, 1, 1, 0, 0, 1, 0, 1]],
-            device=device,
+            device=torch_device,
         )
 
         full_outputs = model(input_ids=input_ids)
@@ -1203,6 +1229,7 @@ def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
 ]
 
 
+@require_torch_accelerator
 class TestPatchChunkedLMHead:
     B, S = 4, 16  # batch size, sequence length (including prompt + completion)
     H, V = 32, 128
@@ -1285,15 +1312,14 @@ def test_dummy_model_chunked_forward_completion_mask_backward(self, temperature)
     @pytest.mark.parametrize("model_id", _CHUNKED_LM_HEAD_MODEL_IDS)
     @pytest.mark.parametrize("temperature", [1.0, 0.7])
     def test_forward(self, model_id, temperature):
-        device = torch.device("cuda")
-        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
         if getattr(model.config, "final_logit_softcapping", None) is not None:
             pytest.skip("model uses final_logit_softcapping, not supported by chunked LM head")
         model.eval()
 
         B, S, chunk_size = 2, 8, 32
         torch.manual_seed(42)
-        input_ids = torch.randint(0, model.config.vocab_size, (B, S), device=device)
+        input_ids = torch.randint(0, model.config.vocab_size, (B, S), device=torch_device)
         labels = input_ids.clone()
 
         # Reference: standard forward → shifted logits → logprobs & entropy
@@ -1316,15 +1342,14 @@ def test_forward(self, model_id, temperature):
     @pytest.mark.parametrize("model_id", _CHUNKED_LM_HEAD_MODEL_IDS)
     @pytest.mark.parametrize("temperature", [1.0, 0.7])
     def test_backward(self, model_id, temperature):
-        device = torch.device("cuda")
-        model_ref = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)
+        model_ref = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(torch_device)
         if getattr(model_ref.config, "final_logit_softcapping", None) is not None:
             pytest.skip("model uses final_logit_softcapping, not supported by chunked LM head")
         model_chunked = copy.deepcopy(model_ref)
 
         B, S, chunk_size = 2, 8, 32
         torch.manual_seed(42)
-        input_ids = torch.randint(0, model_ref.config.vocab_size, (B, S), device=device)
+        input_ids = torch.randint(0, model_ref.config.vocab_size, (B, S), device=torch_device)
         labels = input_ids.clone()
         shifted_labels = labels[:, 1:]
 
diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py
@@ -887,39 +887,39 @@ def identity(x):
 
 def split_pixel_values_by_grid(batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor | list[torch.Tensor]]:
     """
-    Splits `batch["pixel_values"]` into a list of tensors based on the product of each row in `batch["image_grid_thw"]`
-    and batch["num_images"] while keeping other entries unchanged. For models without `image_grid_thw` (e.g. Gemma),
-    splits by `num_images` directly.
+    Splits `batch["pixel_values"]` into a list of tensors, one per sample, based on `batch["num_images"]`.
+
+    For models with `image_grid_thw` (e.g. Qwen), the grid dimensions determine how many rows of `pixel_values` belong
+    to each image. For models with `image_position_ids` instead (e.g. Gemma), `pixel_values` and `image_position_ids`
+    are both split along dim 0 by image count. If neither key is present, the batch is returned unchanged.
     """
     if "pixel_values" not in batch or "num_images" not in batch:
         return batch
 
+    num_images = batch["num_images"]
+    pixel_values = batch["pixel_values"]  # [total, feature_dim]
+
     if "image_grid_thw" in batch:
         lengths = batch["image_grid_thw"].prod(-1).tolist()  # [num_images]
-        pixel_values = batch["pixel_values"]  # [total, feature_dim]
-
         if sum(lengths) != pixel_values.size(0):
             raise ValueError(
                 f"Mismatch: sum(lengths) = {sum(lengths)} != pixel_values.size(0) = {pixel_values.size(0)}"
             )
 
-        boundaries = [0, *accumulate(batch["num_images"])]  # [3, 4, 5] -> [0, 3, 7, 12]
-        sections = [sum(lengths[boundaries[i] : boundaries[i + 1]]) for i in range(len(batch["num_images"]))]
-        split_values = list(torch.split(batch["pixel_values"], sections, dim=0))
-        image_grid_thw = list(torch.split(batch["image_grid_thw"], batch["num_images"], dim=0))
-        return {**batch, "pixel_values": split_values, "image_grid_thw": image_grid_thw}
-    else:
-        # Models without image_grid_thw (e.g. Gemma): split pixel_values by num_images per sample
-        num_images = batch["num_images"]
-        num_images_list = num_images.tolist() if isinstance(num_images, torch.Tensor) else list(num_images)
-        split_values = list(torch.split(batch["pixel_values"], [int(n) for n in num_images_list], dim=0))
-        result = {**batch, "pixel_values": split_values}
-        # Also split image_position_ids if present (indexed by image, same as pixel_values)
-        if "image_position_ids" in batch:
-            result["image_position_ids"] = list(
-                torch.split(batch["image_position_ids"], [int(n) for n in num_images_list], dim=0)
-            )
-        return result
+        boundaries = [0, *accumulate(num_images)]
+        image_grid_thw = batch["image_grid_thw"]  # [total_images, 3]
+        sections = [sum(lengths[boundaries[i] : boundaries[i + 1]]) for i in range(len(num_images))]
+        split_pixel_values = list(torch.split(pixel_values, sections, dim=0))
+        split_image_grid_thw = list(torch.split(image_grid_thw, num_images, dim=0))
+        return {**batch, "pixel_values": split_pixel_values, "image_grid_thw": split_image_grid_thw}
+
+    if "image_position_ids" in batch:
+        image_position_ids = batch["image_position_ids"]  # [total_images, ...]
+        split_pixel_values = list(torch.split(pixel_values, num_images, dim=0))
+        split_image_position_ids = list(torch.split(image_position_ids, num_images, dim=0))
+        return {**batch, "pixel_values": split_pixel_values, "image_position_ids": split_image_position_ids}
+
+    return batch
 
 
 def unsplit_pixel_values_by_grid(batch: dict[str, torch.Tensor | list[torch.Tensor]]) -> dict[str, torch.Tensor]: