From f237d560fa9d5022210c04d9fc4b6671014f2d68 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:27:28 +0200 Subject: [PATCH 1/3] Avoid deepcopy in prepare_multimodal_messages --- trl/data_utils.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/trl/data_utils.py b/trl/data_utils.py index f72c9e5693..461138dc63 100644 --- a/trl/data_utils.py +++ b/trl/data_utils.py @@ -72,36 +72,38 @@ def prepare_multimodal_messages(messages: list[dict[str, Any]], images: list | N ``` """ images = images or [] - messages = copy.deepcopy(messages) # avoid modifying the original messages - # First, convert all messages to the structured format if needed, and insert image placeholders if needed + # First, convert all messages to the structured format if needed, and insert image placeholders if needed. + # Build new message dicts only when transforming string content to avoid modifying the originals. + new_messages = [] images_included = False for message in messages: if message["role"] == "system": if isinstance(message["content"], str): # if already prepared, the content will be a list - message["content"] = [{"type": "text", "text": message["content"]}] + message = {**message, "content": [{"type": "text", "text": message["content"]}]} elif message["role"] == "user": if isinstance(message["content"], str) and not images_included: image_entries = [{"type": "image"} for _ in range(len(images))] - message["content"] = [*image_entries, {"type": "text", "text": message["content"]}] + message = {**message, "content": [*image_entries, {"type": "text", "text": message["content"]}]} images_included = True - elif isinstance(message["content"], str) and images_included: - message["content"] = [{"type": "text", "text": message["content"]}] + elif isinstance(message["content"], str): + message = {**message, "content": [{"type": "text", "text": message["content"]}]} elif message["role"] == "assistant": if message.get("content") and isinstance(message["content"], str): - message["content"] = [{"type": "text", "text": message["content"]}] + message = {**message, "content": [{"type": "text", "text": message["content"]}]} elif message["role"] == "tool": if message.get("content") and isinstance(message["content"], str): - message["content"] = [{"type": "text", "text": message["content"]}] + message = {**message, "content": [{"type": "text", "text": message["content"]}]} else: raise ValueError( f"Invalid role in message: {message['role']}. Expected 'system', 'user', 'assistant', or 'tool'." ) + new_messages.append(message) # Then, check that the number of image placeholders matches the number of images provided num_placeholders = sum( sum(1 for part in message["content"] if part["type"] == "image") - for message in messages + for message in new_messages if message.get("content") and message["role"] != "tool" ) if num_placeholders != len(images): @@ -111,7 +113,7 @@ def prepare_multimodal_messages(messages: list[dict[str, Any]], images: list | N # Then, fill in the actual images in the placeholders img_idx = 0 - for message in messages: + for message in new_messages: if not message.get("content") or message["role"] == "tool": continue for part in message["content"]: @@ -119,7 +121,7 @@ def prepare_multimodal_messages(messages: list[dict[str, Any]], images: list | N part["image"] = images[img_idx] img_idx += 1 - return messages + return new_messages def prepare_multimodal_messages_vllm(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: From f57630f82a8fbd31ff00d058462e13a233b59731 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:27:39 +0200 Subject: [PATCH 2/3] Update docstring --- trl/data_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trl/data_utils.py b/trl/data_utils.py index 461138dc63..8231d49ebc 100644 --- a/trl/data_utils.py +++ b/trl/data_utils.py @@ -44,7 +44,7 @@ def prepare_multimodal_messages(messages: list[dict[str, Any]], images: list | N List of image objects to insert in the messages. Returns: - `list[dict[str, Any]]`: A deep-copied list of messages where every `"content"` value is a list of structured + `list[dict[str, Any]]`: A new list of messages where every `"content"` value is a list of structured content blocks, and all `"image"` placeholders are populated with the corresponding image objects. If the assistant turns contains `"tool_calls"`, then the `"content"` might be empty. From 71071043a36196a9c7bf833ac70f12e3479078e0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 8 Apr 2026 08:54:17 +0200 Subject: [PATCH 3/3] Avoid mutating in-place messages with image placeholders --- trl/data_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/trl/data_utils.py b/trl/data_utils.py index 8231d49ebc..7b49132f05 100644 --- a/trl/data_utils.py +++ b/trl/data_utils.py @@ -113,13 +113,17 @@ def prepare_multimodal_messages(messages: list[dict[str, Any]], images: list | N # Then, fill in the actual images in the placeholders img_idx = 0 - for message in new_messages: + for i, message in enumerate(new_messages): if not message.get("content") or message["role"] == "tool": continue + new_content = [] for part in message["content"]: if part["type"] == "image": - part["image"] = images[img_idx] + new_content.append({**part, "image": images[img_idx]}) img_idx += 1 + else: + new_content.append(part) + new_messages[i] = {**message, "content": new_content} return new_messages