From 529acfeb980795d5af1c8655d143214b0fb5fb10 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Tue, 7 Apr 2026 12:16:49 +0000 Subject: [PATCH 1/9] support qwen3.5 chat template --- tests/resource/qwen35_tokenize_data.jsonl | 15 + xtuner/v1/data_proto/messages/__init__.py | 3 +- xtuner/v1/data_proto/messages/chat.py | 2 +- xtuner/v1/data_proto/messages/qwen35_chat.py | 407 ++++++++++++++++++ xtuner/v1/data_proto/templates/__init__.py | 7 +- xtuner/v1/data_proto/templates/hybrid.py | 9 +- .../mllm_tokenize_fn/base_mllm_tokenize_fn.py | 1 + .../mllm_tokenize_fn/qwen3_vl_tokenize_fn.py | 205 ++++++--- 8 files changed, 592 insertions(+), 57 deletions(-) create mode 100644 tests/resource/qwen35_tokenize_data.jsonl create mode 100644 xtuner/v1/data_proto/messages/qwen35_chat.py diff --git a/tests/resource/qwen35_tokenize_data.jsonl b/tests/resource/qwen35_tokenize_data.jsonl new file mode 100644 index 000000000..24f3b7902 --- /dev/null +++ b/tests/resource/qwen35_tokenize_data.jsonl @@ -0,0 +1,15 @@ +{"id":1,"messages": [{"role": "system", "content": "这是单轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"}]} +{"id":2,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道","reasoning_content": "这是 reasoning_content 内容"}]} +{"id":3,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "\n我需要先调用一些工具才能知道","reasoning_content": "\n这是 reasoning_content 内容\n"}]} +{"id":4,"messages": [{"role": "system", "content": "这是多轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"}]} +{"id":5,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": 
"我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题", "reasoning_content": "这是 reasoning_content 内容"}]} +{"id":6,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"},{"role": "user", "content": "这是第三个问题"},{"role": "assistant", "content": "好的,我知道这是第三个问题", "reasoning_content": "这是 reasoning_content 内容 2"}]} +{"id":7,"messages": [{"role": "system", "content": "这是单轮无think+toolcall例子"},{"role": "user", "content": "北京今天的天气如何?"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls":[{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]},{"role": "tool","content": "35"},{"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}],"tools": [{"type":"function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}},{"type": "function", "function": {"name":"get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters":{"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} +{"id":8,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", 
"content": "基于我的观察,今天北京的天气是35度。", "reasoning_content": "这是 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} +{"id":9,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。","reasoning_content": "这是最后一个 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} +{"id":10,"messages": [{"role": "system", "content": "这是多轮无think+toolcall例子"}, {"role": "user", "content": 
"北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何"}, {"role": "assistant", "content": "好的,我知道这是第二个问题。我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} +{"id":11,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。只有一个用户 user 输入。只有一次真 user 输入 表示整个对话过程中只有 user message。此时中间的所有 think 过程都会保留"}, {"role": "user", "content": "北京和上海今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "我现在知道北京的天气了,我需要继续知道上海的天气", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": 
"tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度,上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} +{"id":12,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。有多个用户 user 输入。一旦再次来了一个新的真 user 输入,则之前的 think 内容会全部丢掉,因为相当于是一次新的回话"}, {"role": "user", "content": "北京今天天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何?"}, {"role": "assistant", "content": "现在是第二个问题了,我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": 
"get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} +{"id":13,"messages": [{"role": "system", "content": "你是一个专业的图像分析助手,能够理解和分析多张图片。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "请描述这两张图片的内容,它们有什么相同点和不同点?"}]}, {"role": "assistant", "content": "我需要仔细对比两张图片的主体、背景、光线等要素。", "reasoning_content": "第一张图片和第二张图片的主体都是同一只猫,背景都是室内环境,光线也相似。它们的相同点是都展示了这只猫在窗台上休息的场景。不同点是第一张图片中猫的姿势是侧卧,而第二张图片中猫的姿势是仰卧。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "这张新图片和之前的图片相比,有什么新的元素出现?"}]}, {"role": "assistant", "content": "与前两张图片相比,这张新图片中出现了不同的构图角度和新的视觉元素。"}, {"role": "user", "content": [{"type": "text", "text": "综合以上三张图片,你认为它们想表达什么主题?"}]}, {"role": "assistant", "content": "需要从整体角度总结三张图片的共同叙事逻辑和情感表达。", "reasoning_content": "这三张图片共同表达了一个主题:猫在室内环境中的不同状态和情感。第一张图片展示了猫的安静和放松,第二张图片展示了猫的舒适和满足,而第三张图片则通过不同的构图和视觉元素,传达了猫在这个环境中的多样性和丰富性。整体上,这些图片共同描绘了猫在室内生活中的多样化表现,表达了对猫的喜爱和对其生活状态的关注。"}]} +{"id":14,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析视频内容。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/demo.mp4"}, {"type": "text", "text": "请描述这个视频的主要内容,并分析其中的关键事件。"}]}, {"role": "assistant", "content": "让我仔细观察这个视频的每一帧内容。", "reasoning_content": "视频开始时展示了一个城市街道的场景,有行人和车辆在移动。随后镜头切换到一家咖啡店的内部,可以看到顾客在排队点单。接着视频展示了咖啡制作的过程,包括研磨咖啡豆、萃取咖啡液和打奶泡。最后视频以一杯精美的拉花咖啡特写结束。整个视频的节奏流畅,画面清晰。"}]} +{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析多个视频内容。"}, {"role": "user", "content": 
[{"type": "video", "video": "https://example.com/video/cooking_tutorial.mp4"}, {"type": "video", "video": "https://example.com/video/cooking_result.mp4"}, {"type": "text", "text": "请分析这两个视频,第一个视频是烹饪教程,第二个是最终成品。它们之间有什么联系?"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频:视频展示了准备食材的过程,包括切菜、腌制肉类等步骤。然后是烹饪过程,展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于:第一个视频是制作过程,第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/failed_attempt.mp4"}, {"type": "text", "text": "这里还有一个失败尝试的视频,和前两个相比有什么问题?"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": "通过对比可以看出几个关键问题:首先,火候控制不当,视频中可以看到食材有些焦糊。其次,调味顺序有问题,盐放得太早导致食材出水过多。第三,翻炒的频率不够,导致受热不均匀。相比之下,第一个成功视频中火候掌握得当,调味时机准确,翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频,总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", "reasoning_content": "综合三个视频的分析,成功烹饪这道菜的关键要点包括:第一,火候控制是核心,需要保持中火避免焦糊;第二,调味顺序很重要,盐应在出锅前加入;第三,翻炒要频繁均匀,确保食材受热一致;第四,食材预处理要到位,切块的均匀度影响受热;第五,要有耐心,每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]} \ No newline at end of file diff --git a/xtuner/v1/data_proto/messages/__init__.py b/xtuner/v1/data_proto/messages/__init__.py index b0e841f70..9b95ad829 100644 --- a/xtuner/v1/data_proto/messages/__init__.py +++ b/xtuner/v1/data_proto/messages/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .base import BaseMessages from .chat import ChatMessages +from .qwen35_chat import Qwen35ChatMessages -__all__ = ["BaseMessages", "ChatMessages"] +__all__ = ["BaseMessages", "ChatMessages", "Qwen35ChatMessages"] diff --git a/xtuner/v1/data_proto/messages/chat.py b/xtuner/v1/data_proto/messages/chat.py index fcc4adc64..55e1dd1f8 100644 --- a/xtuner/v1/data_proto/messages/chat.py +++ b/xtuner/v1/data_proto/messages/chat.py @@ -227,7 +227,7 @@ def get_prompt(self, chat_template: ChatTemplate) -> str: prompt += chat_template.sep return prompt - def tokenize(self, tokenizer: PreTrainedTokenizer, chat_template: ChatTemplate) -> Dict: + def tokenize(self, tokenizer: PreTrainedTokenizer, chat_template: ChatTemplate, **kwargs) -> Dict: input_ids = tokenizer.encode("", add_special_tokens=False) labels = [IGNORE_INDEX for _ in input_ids] diff --git a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py new file mode 100644 index 000000000..a8604b4a5 --- /dev/null +++ b/xtuner/v1/data_proto/messages/qwen35_chat.py @@ -0,0 +1,407 @@ +from typing import List, Dict, Optional +import json +from xtuner.v1.data_proto.messages.base import BaseMessages +from xtuner.v1.data_proto.templates import HybridChatTemplate +from transformers import PreTrainedTokenizer +import copy + + +def get_offset_mapping(tokenizer, text: str): + encoding = tokenizer(text, add_special_tokens=False) + input_ids = encoding["input_ids"] + tokens = tokenizer.convert_ids_to_tokens(input_ids) + offset_mapping = [] + pos = 0 + for token_id, token in zip(input_ids, tokens): + decoded = tokenizer.decode([token_id], skip_special_tokens=False) + if not decoded: + offset_mapping.append((pos, pos)) + continue + idx = text.find(decoded, pos) + if idx == -1: + offset_mapping.append((pos, pos)) + else: + end = idx + len(decoded) + offset_mapping.append((idx, end)) + pos = end + return input_ids, offset_mapping + + +def render_content(content, do_vision_count, image_count, 
video_count, add_vision_id=False): + if isinstance(content, str): + return content, image_count, video_count + result = "" + for item in content: + if "image" in item or "image_url" in item or item.get("type") == "image": + if do_vision_count: + image_count += 1 + if add_vision_id: + result += f"Picture {image_count}: " + result += "<|vision_start|><|image_pad|><|vision_end|>" + elif "video" in item or item.get("type") == "video": + if do_vision_count: + video_count += 1 + if add_vision_id: + result += f"Video {video_count}: " + + video_content = item.get("video", {}) + timestamps = video_content.get("timestamps", []) + if len(timestamps) > 0: + video_placeholder = "" + for timestamp in timestamps: + video_placeholder += f"<{timestamp:.1f} seconds><|vision_start|><|video_pad|><|vision_end|>" + result += video_placeholder + + conversation_timestamp = video_content.get("conversation_timestamp", []) + if len(conversation_timestamp) > 0: + start_time = conversation_timestamp[0] + end_time = conversation_timestamp[1] + timestamps = f"<{start_time:.1f}-{end_time:.1f} seconds>" + result += timestamps + elif "text" in item: + result += item["text"] + return result, image_count, video_count + + +# Qwen3.5 工具系统提示(与 Qwen3 不同的 XML 格式) +_QWEN35_TOOL_SYSTEM = ( + "# Tools\n\n" + "You have access to the following functions:\n\n" + "" +) +_QWEN35_TOOL_INSTRUCTIONS = ( + "\n\n\n" + "If you choose to call a function ONLY reply in the following format with NO suffix:\n\n" + "\n" + "\n" + "\n" + "value_1\n" + "\n" + "\n" + "This is the value for the second parameter\n" + "that can span\n" + "multiple lines\n" + "\n" + "\n" + "\n\n" + "\n" + "Reminder:\n" + "- Function calls MUST follow the specified format: an inner " + "block must be nested within XML tags\n" + "- Required parameters MUST be specified\n" + "- You may provide optional reasoning for your function call in natural language BEFORE " + "the function call, but NOT after\n" + "- If there is no function call available, answer 
the question like normal with your " + "current knowledge and do not tell the user about function calls\n" + "" +) + + +def _render_tool_call_args(arguments: dict) -> str: + """将 tool_call arguments dict 渲染为 Qwen3.5 XML 参数格式。""" + parts = "" + for k, v in arguments.items(): + parts += f"\n" + if isinstance(v, (dict, list)): + parts += json.dumps(v, ensure_ascii=False) + else: + parts += str(v) + parts += "\n\n" + return parts + + +def qwen35_tokenize_fn_fastspeed( + messages, + tokenizer=None, + tools=None, + add_generation_prompt=False, + add_vision_id=False, + return_labels=True, +): + + enable_thinking = any("reasoning_content" in msg for msg in messages) + + image_count = 0 + video_count = 0 + result = "" + loss_mask: list[bool] = [] + + def _render(content, do_vision_count: bool) -> str: + nonlocal image_count, video_count + out, image_count, video_count = render_content( + content, do_vision_count, image_count, video_count, add_vision_id + ) + return out + + def _append(text: str, is_loss: bool) -> None: + nonlocal result + result += text + loss_mask.extend([is_loss] * len(text)) + + # ── system / tools 块 ───────────────────────────────────────────────── + if tools: + _append("<|im_start|>system\n", False) + _append(_QWEN35_TOOL_SYSTEM, False) + for tool in tools: + _append("\n" + json.dumps(tool, ensure_ascii=False), False) + _append(_QWEN35_TOOL_INSTRUCTIONS, False) + if messages[0]["role"] == "system": + sys_content = _render(messages[0]["content"], False).strip() + if sys_content: + _append("\n\n" + sys_content, False) + _append("<|im_end|>\n", False) + else: + if messages[0]["role"] == "system": + sys_content = _render(messages[0]["content"], False).strip() + _append(f"<|im_start|>system\n{sys_content}<|im_end|>\n", False) + + # ── 计算 last_query_index ───────────────────────────────────────────── + multi_step_tool = True + last_query_index = len(messages) - 1 + for i in range(len(messages) - 1, -1, -1): + msg = messages[i] + if multi_step_tool and 
msg["role"] == "user": + content_str = _render(msg["content"], False).strip() + if not ( + content_str.startswith("") + and content_str.endswith("") + ): + multi_step_tool = False + last_query_index = i + + # ── 主循环 ──────────────────────────────────────────────────────────── + for idx, message in enumerate(messages): + is_first = idx == 0 + is_last = idx == len(messages) - 1 + content = _render(message["content"], True).strip() + role = message["role"] + + if role == "user" or (role == "system" and not is_first): + _append(f"<|im_start|>{role}\n{content}<|im_end|>\n", False) + + elif role == "assistant": + reasoning_content = "" + if isinstance(message.get("reasoning_content"), str): + reasoning_content = message["reasoning_content"] + else: + if "" in content: + reasoning_content = ( + content.split("")[0].rstrip("\n").split("")[-1].lstrip("\n") + ) + content = content.split("")[-1].lstrip("\n") + # Qwen3.5 模板对 reasoning_content 做 |trim + reasoning_content = reasoning_content.strip() + + is_loss = message.get("loss", True) + + _append(f"<|im_start|>{role}\n", False) + + if idx > last_query_index: + # 最后查询之后的轮次:渲染 块,并计算 loss + _append("\n", False) + if reasoning_content: + # 有 reasoning:gen prompt 以 \n 结尾,content_tokens 从 reasoning 开始 + _append(reasoning_content + "\n", is_loss) + _append("\n\n", is_loss) + elif enable_thinking: + # enable_thinking=True 但无 reasoning:gen prompt 以 \n 结尾 + # content_tokens 从 开始,所以 \n\n 算 loss + _append("\n", False) # 空内容的 \n(与 \n 合并为 \n\n token,不算 loss) + _append("\n\n", is_loss) + else: + # enable_thinking=False:gen prompt 以完整 \n\n\n\n 结尾 + # content_tokens 只包含实际回复,\n\n 不算 loss + _append("\n", False) + _append("\n\n", False) + body_is_loss = is_loss + else: + # 历史轮次: + # - enable_thinking=False:gen prompt 含完整 \n\n\n\n, + # content_tokens 只有回复内容,在 total_ids 中可以找到 → 用 is_loss + # - enable_thinking=True:content_tokens 以 开头, + # total_ids 里历史轮无 块 → NOT FOUND → 不算 loss + body_is_loss = is_loss if not enable_thinking else False + 
_append(content, body_is_loss) + + if idx > last_query_index: + _append(content, body_is_loss) + + # tool_calls(Qwen3.5 XML 格式) + if message.get("tool_calls"): + for tc_idx, tool_call in enumerate(message["tool_calls"]): + tc = tool_call.get("function", tool_call) + tc_name = tc["name"] + tc_args = tc.get("arguments", {}) + + if tc_idx == 0: + if content.strip(): + _append("\n\n", body_is_loss) + _append(f"\n\n", body_is_loss) + else: + _append(f"\n\n\n", body_is_loss) + + if isinstance(tc_args, dict): + _append(_render_tool_call_args(tc_args), body_is_loss) + _append(f"\n", body_is_loss) + + _append("<|im_end|>\n", body_is_loss) + + elif role == "tool": + prev_role = messages[idx - 1]["role"] if idx > 0 else None + if is_first or prev_role != "tool": + _append("<|im_start|>user", False) + _append("\n\n", False) + _append(content, False) + _append("\n", False) + next_role = messages[idx + 1]["role"] if not is_last else None + if is_last or next_role != "tool": + _append("<|im_end|>\n", False) + + if add_generation_prompt: + _append("<|im_start|>assistant\n", False) + if not enable_thinking: + _append("\n\n\n\n", False) + else: + _append("\n", False) + + # ── 不需要 labels ───────────────────────────────────────────────────── + if not return_labels: + return result, loss_mask + + # ── 需要 labels ─────────────────────────────────────────────────────── + assert tokenizer is not None, "return_labels=True 时必须传入 tokenizer" + + try: + encoded = tokenizer( + result, + return_offsets_mapping=True, + add_special_tokens=False, + ) + input_ids = encoded["input_ids"] + offset_mapping = encoded["offset_mapping"] + except Exception: + input_ids, offset_mapping = get_offset_mapping(tokenizer, result) + + labels = [] + for token_id, (start, end) in zip(input_ids, offset_mapping): + if start == end: + labels.append(-100) + elif any(loss_mask[i] for i in range(start, end)): + labels.append(token_id) + else: + labels.append(-100) + + return input_ids, labels + + +def 
qwen35_process_text_and_loss_mask(text: str, loss_mask: list[bool], tokenizer: PreTrainedTokenizer): + assert tokenizer is not None + assert len(text) == len(loss_mask), "text and loss_mask must have the same length. Got {len(text)} and {len(loss_mask)}." + + try: + encoded = tokenizer( + text, + return_offsets_mapping=True, + add_special_tokens=False, + ) + input_ids = encoded["input_ids"] + offset_mapping = encoded["offset_mapping"] + except Exception: + input_ids, offset_mapping = get_offset_mapping(tokenizer, text) + + labels = [] + for token_id, (start, end) in zip(input_ids, offset_mapping): + if start == end: + labels.append(-100) + elif any(loss_mask[i] for i in range(start, end)): + labels.append(token_id) + else: + labels.append(-100) + + return input_ids, labels + + +def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tools=None, add_vision_id=False, **kwargs): + """ + 终极稳定版 Tokenize:基于 Token 级别的绝对对齐 (椒盐算法升级版)。 + 逻辑: + 1. 生成全量 total_ids 作为唯一真实的参考系。 + 2. 对于每个 assistant 消息,通过历史截断渲染,提取出它“应该长什么样”的 token 序列。 + 3. 在 total_ids 中顺藤摸瓜,精确匹配这些 token 序列。 + 4. 完美解决字符偏移错位、模板历史修改、以及特殊 Token 对齐问题。 + """ + + enable_thinking = any("reasoning_content" in msg for msg in messages) + + full_text = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools,add_vision_id=add_vision_id, enable_thinking=enable_thinking, **kwargs) + total_ids = tokenizer.encode(full_text, add_special_tokens=False) + labels = [-100] * len(total_ids) + # 记录在 total_ids 中搜索的起始位置,确保不会搜到前面的轮次 + curr_ptr = 0 + for i, msg in enumerate(messages): + if msg['role'] == 'assistant' and msg.get('loss', True): + # 1. 获取包含当前消息之前所有内容的“前缀”文本 (带 generation prompt) + prompt_text = tokenizer.apply_chat_template(messages[:i], tokenize=False, add_generation_prompt=True, add_vision_id=add_vision_id, enable_thinking=enable_thinking, tools=tools if i==0 else None, **kwargs) + # 2. 
获取包含当前消息的完整“截断”文本 + # 我们通过修改当前消息的内容,强制在末尾加上一个罕见标记,来准确捕获这部分的内容 + # 为什么要加标记?因为我们想知道当前消息的结束符(如 <|im_end|>)被 tokenizer 编成了什么 + temp_msgs = [m.copy() for m in messages[:i+1]] + # 提取真实内容 + m_text = tokenizer.apply_chat_template(temp_msgs, tokenize=False,add_vision_id=add_vision_id, enable_thinking=enable_thinking, tools=tools if i==0 else None, **kwargs) + # 转换为 Token 序列 + p_ids = tokenizer.encode(prompt_text, add_special_tokens=False) + m_ids = tokenizer.encode(m_text, add_special_tokens=False) + # 3. 提取当前消息的纯内容 Tokens (包含 reasoning, content, tool_calls, 以及结尾的 im_end) + # 注意:由于 tokenizer 的特性,m_ids 的前缀可能并不完美等于 p_ids + # 所以我们要寻找 p_ids 的特征来切分 + # 为了最稳健,我们直接在 m_ids 的末尾倒推。 + # 我们知道 m_ids 是由 p_ids + current_content_ids 组成的 + # 我们直接取差集: + content_tokens = m_ids[len(p_ids):] + if not content_tokens: + continue + # 4. 在全量 total_ids 中搜索这段 content_tokens + found = False + # 从 curr_ptr 开始往后搜 + for s_ptr in range(curr_ptr, len(total_ids) - len(content_tokens) + 1): + if total_ids[s_ptr : s_ptr + len(content_tokens)] == content_tokens: + # 匹配成功! 
+ labels[s_ptr : s_ptr + len(content_tokens)] = content_tokens + curr_ptr = s_ptr + len(content_tokens) + found = True + break + if not found: + # 如果没找到,说明模板在全量渲染时,修改了这条历史消息的内容(例如删了 thinking) + # 这是允许的,只要它不是当前轮次(我们不强求历史轮次一定要匹配上,因为我们通常只对最后的 Turn 算 loss) + # 但如果是最后一条消息还没匹配上,那就一定是出大问题了 + if i == len(messages) - 1: + raise ValueError(f"严重错误:最后一条 Assistant 消息无法在全量 Token 中对齐。") + return total_ids, labels + + +class Qwen35ChatMessages(BaseMessages): + messages: List[dict] # 暂时不做校验 + tools: Optional[List[Dict]] = None + + def tokenize(self, tokenizer: PreTrainedTokenizer, chat_template: HybridChatTemplate, add_vision_id=False, **kwargs) -> Dict: + is_pretrain = False + if len(self.messages) == 1 and self.messages[0]['role'] == "pretrain": + is_pretrain = True + + if is_pretrain: + text = self.messages[0]['content'] + token_ids = tokenizer.encode(text, add_special_tokens=False) + label_ids = copy.deepcopy(token_ids) + else: + # replace system message + if chat_template.default_system is not None: + if self.messages[0]['role'] == "system": + self.messages[0]['content'] = chat_template.default_system + else: + self.messages.insert(0, {'role': 'system', 'content': chat_template.default_system}) + + token_ids, label_ids = qwen35_tokenize_fn_fastspeed(self.messages, tokenizer, self.tools, + add_vision_id=add_vision_id, + return_labels=True) + return {"input_ids": token_ids, "labels": label_ids} + diff --git a/xtuner/v1/data_proto/templates/__init__.py b/xtuner/v1/data_proto/templates/__init__.py index a016798bf..1e543d5a0 100644 --- a/xtuner/v1/data_proto/templates/__init__.py +++ b/xtuner/v1/data_proto/templates/__init__.py @@ -4,10 +4,15 @@ from .chat import ChatTemplate from .hybrid import HybridChatTemplate - current_date = datetime.now().strftime("%Y-%m-%d") CHAT_TEMPLATE_MAP = { + "qwen3.5-vl": HybridChatTemplate( + image_start_token="<|vision_start|>", + image_end_token="<|vision_end|>", + image_context_token="<|image_pad|>", + video_context_token="<|video_pad|>" + ), 
"intern-s1": HybridChatTemplate( system="<|im_start|>system\n{system}<|im_end|>\n", user="<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n", diff --git a/xtuner/v1/data_proto/templates/hybrid.py b/xtuner/v1/data_proto/templates/hybrid.py index 0ec2ddfcf..cb3899b03 100644 --- a/xtuner/v1/data_proto/templates/hybrid.py +++ b/xtuner/v1/data_proto/templates/hybrid.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from typing import Dict, List, Optional +from typing import Callable from pydantic import BaseModel, ConfigDict, field_validator @@ -12,11 +13,11 @@ class HybridChatTemplate(BaseModel): model_config = ConfigDict(extra="forbid") # Normal Chat - system: str # System message format, role + system: str | None = None # System message format, role developer: str | None = None # Developer message format, role - user: str # User message format, role - assistant: str # Assistant message format, role - stop_words: List[str] # List of stop words + user: str | None = None # User message format, role + assistant: str | None = None # Assistant message format, role + stop_words: List[str] | None = None # List of stop words sep: str = "\n" thinking: str | None = None # Thinking message format, not role default_system: Optional[str] = None diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py index 4309000ad..90d832f7a 100644 --- a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py +++ b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py @@ -270,6 +270,7 @@ class BaseMLLMTokenizeFnConfig(BaseModel): llm_pack_weight: float = 1.0 visual_pack_weight: float = 0.0 trim_memory_interval: int = 1 + chat_template: str | None = None def build( self, tokenizer, tokenizer_hash: str | None = None, anno_name: str = "", **kwargs diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py 
index bc8071d17..461f6dbb5 100644 --- a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py +++ b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py @@ -6,7 +6,7 @@ import os from itertools import chain from types import SimpleNamespace -from typing import Optional, Union +from typing import Dict, Optional, Union import numpy as np import torch @@ -17,10 +17,9 @@ import transformers from transformers import AutoProcessor, PreTrainedTokenizer from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize -from xtuner.v1.data_proto.messages import ChatMessages +from xtuner.v1.data_proto.messages import ChatMessages, Qwen35ChatMessages from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP, HybridChatTemplate from xtuner.v1.utils import get_logger - from ..data_item import CacheItem, QwenVL3DataItem from ..utils import apply_exif_orientation, generate_random_int_from_dict from .base_mllm_tokenize_fn import ( @@ -203,12 +202,72 @@ def replace_video_token( assert current_image_idx == n_image, f"VIDEO ERROR: total_image_idx: {current_image_idx} != {n_image}" +def replace_video_timestamps(messages: list[Dict], timestamps_list: list[list[float]]): + video_cnt = 0 + for msg in messages: + if msg['role'] == "user": + content = msg['content'] + if isinstance(content, list): + for item in content: + if 'video' in item: + video_content = item['video'] + timestamps = timestamps_list[video_cnt] + video_content['timestamps'] = timestamps + video_cnt += 1 + + +def replace_qwen35_media_token(tokenized:dict, context_token_id: int, num_media_token_list: list[int] | list[list[int]], total_media_count: int): + input_ids = tokenized["input_ids"] + labels = tokenized.get("labels") + + if isinstance(num_media_token_list, list): + # video + num_media_token_list = [item for sublist in num_media_token_list for item in sublist] + else: + # image + num_media_token_list = [num_media_token_list] + + context_token_count = input_ids.count(context_token_id) + 
assert context_token_count == len(num_media_token_list), ( + f"context_token_count and num_media_token_list length must be the same, " + f"but got {context_token_count} and {len(num_media_token_list)}" + ) + new_input_ids: list[int] = [] + new_labels: list[int] | None = [] if labels is not None else None + media_idx = 0 + for i, tid in enumerate(input_ids): + if tid == context_token_id: + n = int(num_media_token_list[media_idx]) + new_input_ids.extend([context_token_id] * n) + if new_labels is not None and labels is not None: + lbl = labels[i] + new_labels.extend([lbl] * n) + media_idx += 1 + else: + new_input_ids.append(tid) + if new_labels is not None and labels is not None: + new_labels.append(labels[i]) + + if new_labels is not None: + assert len(new_input_ids) == len(new_labels), "new_input_ids and new_labels length must be the same" + assert new_input_ids.count(context_token_id) == total_media_count, ( + f"new_input_ids and total_media_count must be the same, " + f"but got {new_input_ids.count(context_token_id)} and {total_media_count}" + ) + + tokenized["input_ids"] = new_input_ids + if new_labels is not None: + tokenized["labels"] = new_labels + return tokenized + + class Qwen3VLTokenizeFunction(BaseMLLMTokenizeFunction): def __init__( self, tokenizer: PreTrainedTokenizer, processor_path: str, anno_name: str, + chat_template: str = 'qwen3-vl', # qwen3.5-vl or qwen3-vl min_pixels: int | None = None, # Max image pixels (H*W) for image max_pixels: int | None = None, # Min image pixels (H*W) for image video_min_frames: int | None = None, # Min frames per video @@ -294,8 +353,8 @@ def __init__( f"video_max_frames: {self.video_processor.max_frames}, fps: {self.video_processor.fps}, " f"rand_video_max_frames: {self.rand_video_max_frames}" ) - - self.chat_template = CHAT_TEMPLATE_MAP["qwen3-vl"] + self.chat_template_name = chat_template + self.chat_template = CHAT_TEMPLATE_MAP[chat_template] if system_message is not None: self.chat_template.default_system = 
system_message @@ -339,6 +398,17 @@ def __init__( trim_memory_interval=trim_memory_interval, ) + def calc_num_tokens_pure_text_get_item(self, data_item) -> CacheItem: + if self.chat_template_name == "qwen3.5-vl": + messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + else: + messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) + input_ids = tokenized["input_ids"] + labels = tokenized["labels"] + input_ids, _ = self._truncated_input_and_labels(input_ids, labels) + return {"num_tokens": len(input_ids), "num_img_tokens": [0]} + def _truncated_data_item( self, input_ids: list[int], labels: list[int] | None = None, position_ids: torch.Tensor | None = None ): @@ -359,14 +429,18 @@ def _truncated_data_item( return input_ids, labels, position_ids def pure_text_get_item(self, data_item: dict) -> QwenVL3DataItem: - messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - is_pretrain = False - if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": - is_pretrain = True + if self.chat_template_name == "qwen3.5-vl": + messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + is_pretrain = True + else: + messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": + is_pretrain = True assert is_pretrain is False, "Text pretrain data should not be processed by this function" - tokenized = messages.tokenize(self.tokenizer, self.chat_template) + tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) input_ids = tokenized["input_ids"] labels: list[int] = tokenized["labels"] @@ -412,14 +486,22 
@@ def calc_num_tokens_multi_modal_get_item(self, data_item: dict) -> CacheItem: print(f"ERROR of {self._image_wh_list}: {e}, data_name: {self.data_name}") return {"num_tokens": 0, "num_img_tokens": [0]} # type: ignore - messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - replace_image_token(messages, self.chat_template, sum_media_grid_thw, add_vision_id=self.add_vision_id) - tokenized = messages.tokenize(self.tokenizer, self.chat_template) + is_pretrain = False + if self.chat_template_name == "qwen3.5-vl": + messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + is_pretrain = True + tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) + replace_qwen35_media_token(tokenized, self.img_context_token_id, sum_media_grid_thw, sum_media_grid_thw.sum()) + else: + messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + replace_image_token(messages, self.chat_template, sum_media_grid_thw, add_vision_id=self.add_vision_id) + if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": + is_pretrain = True + + tokenized = messages.tokenize(self.tokenizer, self.chat_template) input_ids = tokenized["input_ids"] - is_pretrain = False - if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": - is_pretrain = True if is_pretrain: if self.add_bos_token: input_ids = [self.bos_token_id] + input_ids @@ -458,15 +540,23 @@ def multi_modal_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3 image_tensor = visual_processed["pixel_values"] grid_thw = visual_processed["image_grid_thw"] # b,3 grid_thw_merged = [merged_thw.prod() // self.merge_length for merged_thw in grid_thw] # type: ignore - messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - 
replace_image_token(messages, self.chat_template, grid_thw_merged, add_vision_id=self.add_vision_id) # type: ignore - tokenized = messages.tokenize(self.tokenizer, self.chat_template) + + is_pretrain = False + if self.chat_template_name == "qwen3.5-vl": + messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + is_pretrain = True + tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) + replace_qwen35_media_token(tokenized, self.img_context_token_id, grid_thw_merged, torch.stack(grid_thw_merged, dim=0).sum()) + else: + messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": + is_pretrain = True + replace_image_token(messages, self.chat_template, grid_thw_merged, add_vision_id=self.add_vision_id) # type: ignore + tokenized = messages.tokenize(self.tokenizer, self.chat_template) input_ids = tokenized["input_ids"] labels = tokenized["labels"] - is_pretrain = False - if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": - is_pretrain = True if is_pretrain: if self.add_bos_token: input_ids = [self.bos_token_id] + input_ids @@ -689,29 +779,34 @@ def calc_num_tokens_video_get_item(self, data_item: dict) -> CacheItem: frame_seqlen = grid_h * grid_w // self.merge_length num_image_token_list.append([frame_seqlen] * grid_t) total_sum_media_grid_thw += sum_media_grid_thw - - messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - replace_video_token( - messages, - self.chat_template, - num_image_token_list, - timestamps_list=timestamps_list, - add_vision_id=self.add_vision_id, - ) - tokenized = messages.tokenize(self.tokenizer, self.chat_template) + + is_pretrain = False + if self.chat_template_name == "qwen3.5-vl": + if len(timestamps_list) > 0: + 
assert len(timestamps_list) == len(num_image_token_list), ( + "timestamps should have the same length as num_image_token_list" + ) + replace_video_timestamps(data_item["messages"], timestamps_list) + + messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + is_pretrain = True + tokenized = messages.tokenize(self.tokenizer, self.chat_template) + replace_qwen35_media_token(tokenized, self.video_context_token_id, num_image_token_list, total_sum_media_grid_thw) + else: + messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": + is_pretrain = True + replace_video_token(messages, self.chat_template, num_image_token_list, timestamps_list, add_vision_id=self.add_vision_id) + tokenized = messages.tokenize(self.tokenizer, self.chat_template) input_ids = tokenized["input_ids"] - is_pretrain = False - if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": - is_pretrain = True if is_pretrain: if self.add_bos_token: input_ids = [self.bos_token_id] + input_ids if self.add_eos_token: input_ids = input_ids + [self.eos_token_id] - input_ids, _, _ = self._truncated_data_item(input_ids) - # 如果图片被截断,则该数据丢弃 num_image_tokens_1 = (torch.tensor(input_ids) == self.video_context_token_id).sum() num_image_tokens_2 = total_sum_media_grid_thw @@ -822,22 +917,30 @@ def video_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3DataIt num_image_tokens_list.append([frame_seqlen] * grid_thw[0][0]) num_imgs_list.append(num_frames) total_sum_media_grid_thw += sum_media_grid_thw - - messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - replace_video_token( - messages, - self.chat_template, - num_image_tokens_list, - timestamps_list=timestamps_list, - add_vision_id=self.add_vision_id, - ) - tokenized = 
messages.tokenize(self.tokenizer, self.chat_template) + + is_pretrain = False + if self.chat_template_name == "qwen3.5-vl": + if len(timestamps_list) > 0: + assert len(timestamps_list) == len(num_image_tokens_list), ( + "timestamps should have the same length as num_image_token_list" + ) + replace_video_timestamps(data_item["messages"], timestamps_list) + + messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + is_pretrain = True + tokenized = messages.tokenize(self.tokenizer, self.chat_template) + replace_qwen35_media_token(tokenized, self.video_context_token_id, num_image_tokens_list, total_sum_media_grid_thw) + else: + messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) + if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": + is_pretrain = True + replace_video_token(messages, self.chat_template, num_image_tokens_list, timestamps_list, add_vision_id=self.add_vision_id) + tokenized = messages.tokenize(self.tokenizer, self.chat_template) + input_ids = tokenized["input_ids"] labels = tokenized["labels"] - is_pretrain = False - if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": - is_pretrain = True if is_pretrain: if self.add_bos_token: input_ids = [self.bos_token_id] + input_ids @@ -889,6 +992,7 @@ def video_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3DataIt class Qwen3VLTokenizeFnConfig(BaseMLLMTokenizeFnConfig): model_config = ConfigDict(title="Base dataset config for xtuner", extra="forbid") processor_path: str + chat_template: str = 'qwen3-vl' min_pixels: int | None = None max_pixels: int | None = None oss_loader_cfg: OSSLoaderConfig | None = None @@ -915,6 +1019,7 @@ def build( tokenizer, self.processor_path, anno_name, + chat_template=self.chat_template, min_pixels=self.min_pixels, max_pixels=self.max_pixels, 
oss_loader_cfg=self.oss_loader_cfg, From eeff35960517ef168f1f844e44c48c431980d46a Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 8 Apr 2026 07:41:06 +0000 Subject: [PATCH 2/9] update ci --- tests/chat_template/test_chat_template.py | 42 ++- tests/datasets/test_qwen35_vl_tokenize_fn.py | 340 ++++++++++++++++++ tests/datasets/test_qwen3_vl_tokenize_fn.py | 2 +- ...mllm_pretrain_image_example_data_new.jsonl | 12 + ...mllm_pretrain_video_example_data_new.jsonl | 5 + ...llm_sft_multi_image_example_data_new.jsonl | 11 + ...lm_sft_single_image_example_data_new.jsonl | 22 ++ .../mllm_sft_video_example_data_new.jsonl | 10 + .../mllm_sft_video_hf_example_data.jsonl | 12 +- tests/resource/qwen35_tokenize_data.jsonl | 4 +- xtuner/v1/data_proto/messages/qwen35_chat.py | 114 +++--- xtuner/v1/data_proto/templates/__init__.py | 3 +- xtuner/v1/data_proto/templates/hybrid.py | 12 +- .../mllm_tokenize_fn/base_mllm_tokenize_fn.py | 53 ++- .../mllm_tokenize_fn/qwen3_vl_tokenize_fn.py | 108 +++--- 15 files changed, 623 insertions(+), 127 deletions(-) create mode 100644 tests/datasets/test_qwen35_vl_tokenize_fn.py create mode 100644 tests/resource/mllm_pretrain_image_example_data_new.jsonl create mode 100644 tests/resource/mllm_pretrain_video_example_data_new.jsonl create mode 100644 tests/resource/mllm_sft_multi_image_example_data_new.jsonl create mode 100644 tests/resource/mllm_sft_single_image_example_data_new.jsonl create mode 100644 tests/resource/mllm_sft_video_example_data_new.jsonl diff --git a/tests/chat_template/test_chat_template.py b/tests/chat_template/test_chat_template.py index bf702560e..4569b7b64 100644 --- a/tests/chat_template/test_chat_template.py +++ b/tests/chat_template/test_chat_template.py @@ -1,11 +1,17 @@ from datetime import datetime import os +import json import parametrize from unittest import TestCase from transformers import AutoTokenizer +import torch +from packaging.version import Version +from transformers import __version__ as 
transformers_version +import unittest from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP from xtuner.v1.data_proto.messages import ChatMessages +from xtuner.v1.data_proto.messages.qwen35_chat import Qwen35ChatMessages, qwen35_tokenize_fn_slowspeed QWEN3_PATH = os.environ["QWEN3_PATH"] @@ -222,6 +228,40 @@ def test_deepseek_v3_template(self, template_type,thinking, tokenizer): input_ids = _messages.tokenize(tokenizer, chat_template)['input_ids'] self.assertTrue((input_ids == input_ids_ref)) + + @unittest.skipIf( + Version(transformers_version) < Version("5.2.0"), + f"transformers >= 5.2.0 is required, but got {transformers_version}" + ) + def test_qwen35vl_template(self): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + chat_template = CHAT_TEMPLATE_MAP["qwen3.5-vl"] + tokenizer = AutoTokenizer.from_pretrained(QWEN35_VL_PATH, trust_remote_code=True) + + jsonl_path = 'tests/resource/qwen35_tokenize_data.jsonl' + all_data= [] + with open(jsonl_path, 'r') as f: + for line in f: + all_data.append(json.loads(line)) + + for j, data in enumerate(all_data): + if j in [13,14]: # video 肯定和 hf 对不上 + continue + gt_token_ids, gt_labels = qwen35_tokenize_fn_slowspeed(tokenizer, data['messages'], tools=data.get('tools'), add_vision_id=True) + _messages = Qwen35ChatMessages(messages=data["messages"], tools=data.get("tools")) + tokenized = _messages.tokenize(tokenizer, chat_template, add_vision_id=True) + self.assertEqual(tokenized['input_ids'], gt_token_ids) + self.assertEqual(tokenized['labels'], gt_labels) - \ No newline at end of file + enable_thinking = any("reasoning_content" in msg for msg in data['messages']) + decode_str = tokenizer.decode(tokenized['input_ids'], skip_special_tokens=False) + hf_text = tokenizer.apply_chat_template(data['messages'], + tools=data.get('tools'), + add_vision_id=True, + tokenize=False, + enable_thinking=enable_thinking, + add_generation_prompt=False) + self.assertEqual(decode_str, hf_text) + + \ No newline at end of file diff --git 
a/tests/datasets/test_qwen35_vl_tokenize_fn.py b/tests/datasets/test_qwen35_vl_tokenize_fn.py new file mode 100644 index 000000000..291f19fe4 --- /dev/null +++ b/tests/datasets/test_qwen35_vl_tokenize_fn.py @@ -0,0 +1,340 @@ +import os +from unittest import TestCase +from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig, PretrainTokenizeFunction +from transformers import AutoTokenizer, AutoProcessor,Qwen3VLProcessor +import json +import torch +import parametrize +from xtuner.v1.utils.test_utils import add_video_root +from packaging.version import Version +from transformers import __version__ as transformers_version +import unittest +from xtuner.v1.data_proto.messages.qwen35_chat import qwen35_tokenize_fn_slowspeed + +VIDEO_ROOT = os.environ["VIDEO_ROOT"] + + +@unittest.skipIf( + Version(transformers_version) < Version("5.2.0"), + f"transformers >= 5.2.0 is required, but got {transformers_version}" +) +class TestMLLMTokenizeFn(TestCase): + def setUp(self): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + self.tokenizer = AutoTokenizer.from_pretrained(QWEN35_VL_PATH) + self.tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, + chat_template="qwen3.5-vl", + rand_video_max_frames=14, + add_vision_id=False).build( + self.tokenizer) + self.processor = AutoProcessor.from_pretrained(QWEN35_VL_PATH) + + def test_qwen35vl_text(self): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, chat_template="qwen3.5-vl", add_vision_id=True).build(self.tokenizer) + + data_path = 'tests/resource/qwen35_tokenize_data.jsonl' + all_data= [] + with open(data_path, 'r') as f: + for line in f: + all_data.append(json.loads(line)) + + for j, data in enumerate(all_data): + print(f"Processing data {j+1} of {len(all_data)}") + if j>=12: + break + gt_token_ids, gt_labels = qwen35_tokenize_fn_slowspeed(self.tokenizer, data['messages'], tools=data.get('tools'), add_vision_id=True) + ret = tokenize_fn(data) + 
input_ids_xtuner = ret['input_ids'] + labels_xtuner = ret['labels'] + self.assertEqual(input_ids_xtuner, gt_token_ids) + self.assertEqual(labels_xtuner, gt_labels) + + enable_thinking = any("reasoning_content" in msg for msg in data['messages']) + decode_str = self.tokenizer.decode(input_ids_xtuner, skip_special_tokens=False) + hf_text = self.tokenizer.apply_chat_template(data['messages'], + tools=data.get('tools'), + add_vision_id=True, + tokenize=False, + enable_thinking=enable_thinking, + add_generation_prompt=False) + self.assertEqual(decode_str, hf_text) + + @parametrize.parametrize("add_vision_id", [(True,), (False,)]) + def test_qwen35_vl_sft_single_image(self, add_vision_id): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, chat_template="qwen3.5-vl", + add_vision_id=add_vision_id).build(self.tokenizer) + data_path = 'tests/resource/mllm_sft_single_image_example_data_new.jsonl' + total_step = 50 + with open(data_path) as f: + for i, line in enumerate(f): + if i >=total_step: + break + raw_data = json.loads(line) + + ret = tokenize_fn(raw_data, media_root='tests/') + input_ids_xtuner = ret['input_ids'] + pixel_values_xtuner: torch.Tensor = ret['pixel_values'] + image_grid_thw_xtuner: torch.Tensor = ret['image_grid_thw'] + + # to hf openai format + messages = raw_data['messages'] + messages[0]['content'][0]['type'] = 'image' + messages[0]['content'][0]['path'] = 'tests/' + messages[0]['content'][0]['image']['url'] + del messages[0]['content'][0]['image'] + + for msg in messages: + if not isinstance(msg['content'], list): + msg['content'] = [{"type": "text", "text": msg['content']}] + + ret = self.processor.apply_chat_template(messages, + add_generation_prompt=False, + tokenize=True, + add_vision_id=add_vision_id, + return_dict=True) + input_ids_hf = ret['input_ids'][0] + pixel_values_hf = ret['pixel_values'] + image_grid_thw_hf = ret['image_grid_thw'] + self.assertEqual(input_ids_xtuner, 
input_ids_hf) + self.assertTrue(torch.allclose(pixel_values_xtuner, pixel_values_hf)) + self.assertTrue(torch.allclose(image_grid_thw_xtuner, image_grid_thw_hf)) + + @parametrize.parametrize("add_vision_id", [(True,), (False,)]) + def test_qwen3_vl_sft_multi_image(self, add_vision_id): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, + chat_template="qwen3.5-vl", + add_vision_id=add_vision_id).build(self.tokenizer) + data_path = 'tests/resource/mllm_sft_multi_image_example_data_new.jsonl' + total_index = [0, 1, 2, 3, 4, 10] + with open(data_path) as f: + for i, line in enumerate(f): + if i not in total_index: + continue + raw_data = json.loads(line) + + ret = tokenize_fn(raw_data, media_root='tests/') + input_ids_xtuner = ret['input_ids'] + pixel_values_xtuner: torch.Tensor = ret['pixel_values'] + image_grid_thw_xtuner: torch.Tensor = ret['image_grid_thw'] + + # to hf openai format + messages = raw_data['messages'] + if i != 10: + messages[0]['content'][0]['type'] = 'image' + messages[0]['content'][0]['path'] = 'tests/' + messages[0]['content'][0]['image']['url'] + messages[0]['content'][1]['type'] = 'image' + messages[0]['content'][1]['path'] = 'tests/' + messages[0]['content'][1]['image']['url'] + del messages[0]['content'][0]['image'] + del messages[0]['content'][1]['image'] + else: + messages[0]['content'][0]['type'] = 'image' + messages[0]['content'][0]['path'] = 'tests/' + messages[0]['content'][0]['image']['url'] + del messages[0]['content'][0]['image'] + + messages[4]['content'][0]['type'] = 'image' + messages[4]['content'][0]['path'] = 'tests/' + messages[4]['content'][0]['image']['url'] + del messages[4]['content'][0]['image'] + + for msg in messages: + if not isinstance(msg['content'], list): + msg['content'] = [{"type": "text", "text": msg['content']}] + + ret = self.processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=True, + return_dict=True, 
add_vision_id=add_vision_id) + input_ids_hf = ret['input_ids'][0] + pixel_values_hf = ret['pixel_values'] + image_grid_thw_hf = ret['image_grid_thw'] + + self.assertEqual(input_ids_xtuner, input_ids_hf) + self.assertTrue(torch.allclose(pixel_values_xtuner, pixel_values_hf)) + self.assertTrue(torch.allclose(image_grid_thw_xtuner, image_grid_thw_hf)) + + def test_calc_frame_info(self): + self.tokenize_fn.state = "cache" + data_path = 'tests/resource/mllm_video_frame_test_data.jsonl' + with open(data_path) as f: + for i, line in enumerate(f): + raw_data = json.loads(line) + self.tokenize_fn(raw_data) + frames_indices_list, origin_fps_list, timestamps_list = self.tokenize_fn.calc_frame_info(raw_data) + num_frames_list = [] + for frames_indices in frames_indices_list: + if isinstance(frames_indices, int): + num_frames_list.append(frames_indices) + else: + num_frames_list.append(len(frames_indices)) + if i == 0: + # case: 如果不存在 origin_fps ,则会基于预设的 rand_video_max_frames 参数随机采样 + assert len(origin_fps_list) == len(timestamps_list) == 0 + assert self.tokenize_fn.video_processor.min_frames <= num_frames_list[ + 0] <= self.tokenize_fn.rand_video_max_frames + assert self.tokenize_fn.video_processor.min_frames <= num_frames_list[ + 1] <= self.tokenize_fn.rand_video_max_frames + elif i == 1: + # case: 如果存在 origin_fps ,则会基于 origin_fps 计算 timestamps + self.assertEqual(num_frames_list, [20, 4]) + self.assertEqual(origin_fps_list, [10, 8]) + self.assertEqual(timestamps_list, + [[0.25, 1.3, 2.35, 3.35, 4.45, 5.45, 6.55, 7.55, 8.600000000000001, 9.65], + [0.25, 1.125]]) + elif i == 2: + # case: 测试 origin_fps 为 1 且长度小于 4 时是否正常 + self.assertEqual(num_frames_list, [20, 4]) + self.assertEqual(origin_fps_list, [10, 1]) + self.assertEqual(timestamps_list, + [[0.25, 1.3, 2.35, 3.35, 4.45, 5.45, 6.55, 7.55, 8.600000000000001, 9.65], + [0.0, 0.0]]) + elif i == 3: + # case: 测试存在 processed_fps 且一个能被 fps 整除,一个不能且视频长度大于 rand_video_max_frames + self.assertEqual(num_frames_list, [10, 14]) + 
self.assertEqual(origin_fps_list, [20, 10]) + self.assertEqual(timestamps_list, [[0.25, 1.35, 2.45, 3.55, 4.65], + [0.3, 1.3, 2.4000000000000004, 3.5, 4.6, 5.7, 6.7]]) + elif i == 4: + # case: 测试存在 processed_fps 且一个能被 fps 整除,一个不能且视频长度小于 rand_video_max_frames + self.assertEqual(num_frames_list, [10, 12]) + self.assertEqual(origin_fps_list, [20, 10]) + self.assertEqual(timestamps_list, [[0.25, 1.35, 2.45, 3.55, 4.65], + [0.1, 0.5, 0.9, 1.2999999999999998, 1.7000000000000002, 2.1]]) + elif i == 5: + # case: 测试存在 frames_timestamp,且一个能被 fps 整除,一个不能且视频长度小于 rand_video_max_frames + self.assertEqual(num_frames_list, [4, 14]) + self.assertEqual(origin_fps_list, [20, 10]) + self.assertEqual(timestamps_list, [[0.25, 1.5], + [0.1, 0.5, 1.1, 1.5, 1.9, 2.5, 2.9]]) + elif i == 6: + # case: 测试存在 frames_timestamp,且一个能被 fps 整除,一个不能且视频长度小于 rand_video_max_frames + self.assertEqual(num_frames_list, [4, 12]) + self.assertEqual(origin_fps_list, [20, 10]) + self.assertEqual(timestamps_list, [[0.25, 1.5], + [0.1, 0.5, 0.9, 1.2999999999999998, 1.7000000000000002, 2.1]]) + elif i == 7: + # case: 测试单视频 + self.assertEqual(num_frames_list, [4]) + self.assertEqual(origin_fps_list, [20]) + self.assertEqual(timestamps_list, [[0.25, 1.5]]) + + @parametrize.parametrize("add_vision_id", [(True,), (False,)]) + def test_qwen3_vl_sft_video(self, add_vision_id): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, rand_video_max_frames=14, + chat_template="qwen3.5-vl", + add_vision_id=add_vision_id).build( + self.tokenizer) + data_path = 'tests/resource/mllm_sft_video_example_data_new.jsonl' + hf_data_path = 'tests/resource/mllm_sft_video_hf_example_data.jsonl' + hf_raw_datas = [] + with open(hf_data_path) as f: + for line in f: + hf_raw_datas.append(json.loads(line)) + + total_index = [1, 4, 5, 6, 7, 8, 9] + with open(data_path) as f: + for i, line in enumerate(f): + if i not in total_index: + continue + raw_data = json.loads(line) + 
hf_raw_data = hf_raw_datas[i] + + if i in [7]: + # transformers 当输入视频文件夹时候,无法支持采样,有多少视频就读多少视频 + do_sample_frames = False + tokenize_fn.video_processor.fps = 3 + tokenize_fn.rand_video_max_frames = 24 # 设置为大于采样后视频,防止进行采样 + else: + do_sample_frames = True + tokenize_fn.video_processor.fps = 2 + tokenize_fn.rand_video_max_frames = 14 + + ret = tokenize_fn(raw_data, media_root=VIDEO_ROOT) + input_ids_xtuner = ret['input_ids'] + pixel_values_xtuner: torch.Tensor = ret['pixel_values'] + image_grid_thw_xtuner: torch.Tensor = ret['image_grid_thw'] + + # to hf openai format + messages = hf_raw_data['messages'] + add_video_root(messages, VIDEO_ROOT) + + if i not in [8, 9]: + ret = self.processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=True, + do_sample_frames=do_sample_frames, + return_dict=True, add_vision_id=add_vision_id, + return_tensors="pt") + input_ids_hf = ret['input_ids'][0] + pixel_values_hf = ret['pixel_values_videos'] + image_grid_thw_hf = ret['video_grid_thw'] + + text = self.tokenize_fn.tokenizer.decode(input_ids_xtuner) + + if i == 1: + # 不应该包括 seconds> 内容 + self.assertTrue('seconds>' not in text) + else: + if i == 8: + # 测试能整除下均匀采样 + self.assertEqual(pixel_values_xtuner.size(), (45760, 1536)) + self.assertEqual(text.count('seconds>'), 13) + elif i == 9: + # 测试无法整除且超过最大帧数情况下,均匀采样 + self.assertEqual(pixel_values_xtuner.size(), (24640, 1536)) + self.assertEqual(text.count('seconds>'), 7) + print(pixel_values_xtuner.size(), image_grid_thw_xtuner, text.count('seconds>'), 'xxx') + else: + if i == 7: + self.assertEqual(len(input_ids_xtuner), len(input_ids_hf)) + else: + self.assertEqual(input_ids_xtuner, input_ids_hf.tolist()) + self.assertTrue('seconds>' in text) + self.assertTrue(torch.allclose(pixel_values_xtuner, pixel_values_hf)) + self.assertTrue(torch.allclose(image_grid_thw_xtuner, image_grid_thw_hf)) + + @parametrize.parametrize("add_vision_id", [(True,), (False,)]) + def test_qwen3_vl_pretrain_image(self, add_vision_id): + 
QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, + chat_template="qwen3.5-vl", + add_vision_id=add_vision_id).build(self.tokenizer) + data_path = 'tests/resource/mllm_pretrain_image_example_data_new.jsonl' + total_step = 60 + with open(data_path, encoding='utf-8') as f: + for i, line in enumerate(f): + if i >= total_step: + break + raw_data = json.loads(line) + ret = tokenize_fn(raw_data, media_root='tests/') + input_ids_xtuner = ret['input_ids'] + labels_xtuner = torch.tensor(ret['labels']) + input_str = tokenize_fn.tokenizer.decode(input_ids_xtuner, skip_special_tokens=False) + input_str = input_str.replace('<|image_pad|>', '') + input_xtuner_str = input_str.replace('<|vision_start|><|vision_end|>', '<|vision_start|><|image_pad|><|vision_end|>') + + messages = raw_data['messages'] + messages[0]['role'] = 'user' + hf_text = self.tokenizer.apply_chat_template(raw_data['messages'], + add_vision_id=add_vision_id, + tokenize=False, + enable_thinking=False, + add_generation_prompt=False) + hf_text = hf_text.replace('<|im_start|>user\n', '') + hf_text = hf_text[:-1] # remove \n + self.assertEqual(input_xtuner_str, hf_text) + self.assertTrue((labels_xtuner == self.tokenize_fn.img_context_token_id).sum() == 0) + + @parametrize.parametrize("add_vision_id", [(True,), (False,)]) + def test_qwen3_vl_pretrain_video(self, add_vision_id): + QWEN35_VL_PATH = os.environ["QWEN3_5_MOE_PATH"] + tokenize_fn = Qwen3VLTokenizeFnConfig(processor_path=QWEN35_VL_PATH, + chat_template="qwen3.5-vl", + add_vision_id=add_vision_id).build(self.tokenizer) + data_path = 'tests/resource/mllm_pretrain_video_example_data_new.jsonl' + total_step = 60 + with open(data_path, encoding='utf-8') as f: + for i, line in enumerate(f): + if i >= total_step: + break + raw_data = json.loads(line) + ret = tokenize_fn(raw_data, media_root=VIDEO_ROOT) + labels_xtuner = torch.tensor(ret['labels']) + self.assertTrue((labels_xtuner == 
tokenize_fn.video_context_token_id).sum() == 0) diff --git a/tests/datasets/test_qwen3_vl_tokenize_fn.py b/tests/datasets/test_qwen3_vl_tokenize_fn.py index b421758aa..6fb19af8f 100644 --- a/tests/datasets/test_qwen3_vl_tokenize_fn.py +++ b/tests/datasets/test_qwen3_vl_tokenize_fn.py @@ -115,7 +115,7 @@ def test_qwen3_vl_sft_single_image(self, add_vision_id): for msg in messages: if not isinstance(msg['content'], list): msg['content'] = [{"type": "text", "text": msg['content']}] - + ret = self.processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=True, diff --git a/tests/resource/mllm_pretrain_image_example_data_new.jsonl b/tests/resource/mllm_pretrain_image_example_data_new.jsonl new file mode 100644 index 000000000..ebb3512a6 --- /dev/null +++ b/tests/resource/mllm_pretrain_image_example_data_new.jsonl @@ -0,0 +1,12 @@ +{"id": 1, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}},{"type":"text", "text": "图片中的狗是什么颜色?"}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}},{"type": "text", "text": "图中有几只猫?"}]}]} +{"id": 2, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面,太棒了!!!"}]}]} +{"id": 3, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "当狗凝视窗外的时候,它看起来是在渴望着什么东西。 这也许暗示了它想要到外面去或者与某人或某物进行互动。"}]}]} +{"id": 4, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": 
"resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面,太棒了!!!"}]}]} +{"id": 5, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面,太棒了!!!"}]}]} +{"id": 6, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "这两张图片都有包括动物相关的内容,都包括了非常温馨的画面,太棒了!!!"}]}]} +{"id": 7, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What color is the dog in the picture?"}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the picture?"}]}]} +{"id": 8, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]} +{"id": 9, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "When the dog gazes out the window, it seems to be longing for something. 
This may imply that it wants to go outside or interact with someone or something."}]}]} +{"id": 10, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]} +{"id": 11, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]} +{"id": 12, "messages": [{"role": "pretrain", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Both of these pictures include animal-related content and very warm scenes, awesome!!!"}]}]} diff --git a/tests/resource/mllm_pretrain_video_example_data_new.jsonl b/tests/resource/mllm_pretrain_video_example_data_new.jsonl new file mode 100644 index 000000000..15d42a52a --- /dev/null +++ b/tests/resource/mllm_pretrain_video_example_data_new.jsonl @@ -0,0 +1,5 @@ +{"id": 1, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "请描述下视频内容?一男一女在打网球。请简要解释下网球,网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]}]} +{"id": 2, "messages": [{"role": "pretrain", "content": [{"type": "video", 
"video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "视频中在做什么?打网球"}]}]} +{"id": 3, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "Please describe the video content? A man and a woman are playing tennis. Please briefly explain tennis. Tennis is a sport where players use rackets to hit the ball over the net into the opponent's court. The goal is to score points by making the ball land in the opponent's court and the opponent fails to return it. Tennis can be played as singles or doubles."}]}]} +{"id": 4, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "text", "text": "What is happening in the video? Playing tennis"}]}]} +{"id": 5, "messages": [{"role": "pretrain", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}}, {"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182,"origin_fps": 30.0}},{"type": "text", "text": "What is happening in the video? 
Playing tennis"}]}]} \ No newline at end of file diff --git a/tests/resource/mllm_sft_multi_image_example_data_new.jsonl b/tests/resource/mllm_sft_multi_image_example_data_new.jsonl new file mode 100644 index 000000000..1e17b88c1 --- /dev/null +++ b/tests/resource/mllm_sft_multi_image_example_data_new.jsonl @@ -0,0 +1,11 @@ +{"id": 1, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "两张图片有啥相同之处?"}]}, {"role": "assistant", "content": "两幅图片中都存在动物。"}, {"role": "user", "content": "都有些什么动物?"}, {"role": "assistant", "content": "第一幅图片中有一只狗,第二副图片中有两只猫。"}]} +{"id": 2, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "两张图片有啥相同之处?"}]}, {"role": "assistant", "content": "两幅图片中都存在动物。"}, {"role": "user", "content": "都有些什么动物?"}, {"role": "assistant", "content": "第一幅图片中有两只猫,第二副图片中有一只狗。"}]} +{"id": 3, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "请描述下第二幅图片中的狗是什么颜色?"}]}, {"role": "assistant", "content": "图片中的狗是棕色的。"}]} +{"id": 4, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "请描述下第一幅图片中有几只猫?"}]}, {"role": "assistant", "content": "图片中有2只猫。"}, {"role": "user", "content": "第一幅图中还有啥东西?"}, {"role": 
"assistant", "content": "第一幅图片中还有2个电视遥控器"}, {"role": "user", "content": "两只猫在做什么?"}, {"role": "assistant", "content": "它们悠闲的躺在沙发上"}, {"role": "user", "content": "请描述下第一幅图片?"}, {"role": "assistant", "content": "图片中有两只猫,悠闲的躺在沙发上,旁边还有2个电视遥控器。"}]} +{"id": 5, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "请描述下第二幅图片中这只狗有什么类型的项圈?"}]}, {"role": "assistant", "content": "这只狗有一条红色的项圈。"}]} +{"id": 6, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [375, 500]}}, {"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "What are the similarities between the two images?"}]}, {"role": "assistant", "content": "Both images contain animals."}, {"role": "user", "content": "What animals are there?"}, {"role": "assistant", "content": "The first image contains a dog, and the second image contains two cats."}]} +{"id": 7, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What are the similarities between the two images?"}]}, {"role": "assistant", "content": "Both images contain animals."}, {"role": "user", "content": "What animals are there?"}, {"role": "assistant", "content": "The first image contains two cats, and the second image contains a dog."}]} +{"id": 8, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": 
[375, 500]}}, {"type": "text", "text": "Can you describe the color of the dog in the second image?"}]}, {"role": "assistant", "content": "The dog in the image is brown."}]} +{"id": 9, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "How many cats are in the first image?"}]}, {"role": "assistant", "content": "There are 2 cats in the image."}, {"role": "user", "content": "What else is in the first image?"}, {"role": "assistant", "content": "There are also 2 TV remotes in the first image."}, {"role": "user", "content": "What are the two cats doing?"}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}, {"role": "user", "content": "Can you describe the first image?"}, {"role": "assistant", "content": "The image shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]} +{"id": 10, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Can you describe the type of collar the dog in the second image has?"}]}, {"role": "assistant", "content": "The dog has a red collar."}]} +{"id": 11, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the first image?"}]}, {"role": "assistant", "content": "There are 2 cats in the image."}, {"role": "user", "content": "What else is in the first image?"}, {"role": "assistant", "content": "There are also 2 TV remotes in the first image."}, {"role": "user", "content": [{"type": "image", "image": {"url": 
"resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What are the two cats doing?"}]}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}, {"role": "user", "content": "Can you describe the first image?"}, {"role": "assistant", "content": "The image shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]} diff --git a/tests/resource/mllm_sft_single_image_example_data_new.jsonl b/tests/resource/mllm_sft_single_image_example_data_new.jsonl new file mode 100644 index 000000000..7837da245 --- /dev/null +++ b/tests/resource/mllm_sft_single_image_example_data_new.jsonl @@ -0,0 +1,22 @@ +{"id": 1, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "图片中的狗是什么颜色?"}]}, {"role": "assistant", "content": "图片中的狗是棕色的。"}, {"role": "user", "content": "这只狗有什么类型的项圈?"}, {"role": "assistant", "content": "这只狗有一条红色的项圈。"}, {"role": "user", "content": "那只狗在窗后的位置是什么?"}, {"role": "assistant", "content": "那只狗正坐在窗户后面。"}, {"role": "user", "content": "那只狗望着窗外时,它的表情是怎样的?"}, {"role": "assistant", "content": "当狗凝视窗外的时候,它看起来是在渴望着什么东西。 这也许暗示了它想要到外面去或者与某人或某物进行互动。"}, {"role": "user", "content": "你能描述一下狗的毛吗?"}, {"role": "assistant", "content": "这只狗有蓬松、毛茸茸的棕色皮毛,使它看起来既可爱又温顺。"}]} +{"id": 2, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "图片中的狗是什么颜色?"}]}, {"role": "assistant", "content": "图片中的狗是棕色的。"}]} +{"id": 3, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "这只狗有什么类型的项圈?"}]}, {"role": "assistant", "content": "这只狗有一条红色的项圈。"}]} +{"id": 4, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 
500]}}, {"type": "text", "text": "那只狗在窗后的位置是什么?"}]}, {"role": "assistant", "content": "那只狗正坐在窗户后面。"}]} +{"id": 5, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "那只狗望着窗外时,它的表情是怎样的?"}]}, {"role": "assistant", "content": "当狗凝视窗外的时候,它看起来是在渴望着什么东西。 这也许暗示了它想要到外面去或者与某人或某物进行互动。"}]} +{"id": 6, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "你能描述一下狗的毛吗?"}]}, {"role": "assistant", "content": "这只狗有蓬松、毛茸茸的棕色皮毛,使它看起来既可爱又温顺。"}]} +{"id": 7, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What color is the dog in the picture?"}]}, {"role": "assistant", "content": "The dog in the picture is brown."}, {"role": "user", "content": "What type of collar does this dog have?"}, {"role": "assistant", "content": "This dog has a red collar."}, {"role": "user", "content": "What is the position of the dog behind the window?"}, {"role": "assistant", "content": "The dog is sitting behind the window."}, {"role": "user", "content": "What is the dog's expression when it looks out the window?"}, {"role": "assistant", "content": "When the dog gazes out the window, it seems to be longing for something. 
This might suggest that it wants to go outside or interact with someone or something."}, {"role": "user", "content": "Can you describe the dog's fur?"}, {"role": "assistant", "content": "The dog has fluffy, furry brown fur, which makes it look both cute and gentle."}]} +{"id": 8, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What color is the dog in the picture?"}]}, {"role": "assistant", "content": "The dog in the picture is brown."}]} +{"id": 9, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What type of collar does this dog have?"}]}, {"role": "assistant", "content": "This dog has a red collar."}]} +{"id": 10, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What is the position of the dog behind the window?"}]}, {"role": "assistant", "content": "The dog is sitting behind the window."}]} +{"id": 11, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "What is the dog's expression when it looks out the window?"}]}, {"role": "assistant", "content": "When the dog gazes out the window, it seems to be longing for something. 
This might suggest that it wants to go outside or interact with someone or something."}]} +{"id": 12, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_dog_000000319154.jpg", "image_wh": [375, 500]}}, {"type": "text", "text": "Can you describe the dog's fur?"}]}, {"role": "assistant", "content": "The dog has fluffy, furry brown fur, which makes it look both cute and gentle."}]} +{"id": 13, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "图片中有几只猫?"}]}, {"role": "assistant", "content": "图片中有2只猫。"}, {"role": "user", "content": "图中还有啥东西?"}, {"role": "assistant", "content": "图片中还有2个电视遥控器"}, {"role": "user", "content": "两只猫在做什么?"}, {"role": "assistant", "content": "它们悠闲的躺在沙发上"}, {"role": "user", "content": "请描述下这个图片?"}, {"role": "assistant", "content": "图片中有两只猫,悠闲的躺在沙发上,旁边还有2个电视遥控器。"}]} +{"id": 14, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "图片中有几只猫?"}]}, {"role": "assistant", "content": "图片中有2只猫。"}]} +{"id": 15, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "图中除了猫还有啥东西?"}]}, {"role": "assistant", "content": "图片中还有2个电视遥控器"}]} +{"id": 16, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "两只猫在做什么?"}]}, {"role": "assistant", "content": "它们悠闲的躺在沙发上"}]} +{"id": 17, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "请描述下这个图片?"}]}, {"role": "assistant", "content": "图片中有两只猫,悠闲的躺在沙发上,旁边还有2个电视遥控器。"}]} +{"id": 18, "messages": 
[{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the picture?"}]}, {"role": "assistant", "content": "There are 2 cats in the picture."}, {"role": "user", "content": "What else is in the picture?"}, {"role": "assistant", "content": "There are also 2 TV remotes in the picture."}, {"role": "user", "content": "What are the two cats doing?"}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}, {"role": "user", "content": "Can you describe the picture?"}, {"role": "assistant", "content": "The picture shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]} +{"id": 19, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "How many cats are in the picture?"}]}, {"role": "assistant", "content": "There are 2 cats in the picture."}]} +{"id": 20, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "What else is in the picture besides the cats?"}]}, {"role": "assistant", "content": "There are also 2 TV remotes in the picture."}]} +{"id": 21, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "What are the two cats doing?"}]}, {"role": "assistant", "content": "They are leisurely lying on the sofa."}]} +{"id": 22, "messages": [{"role": "user", "content": [{"type": "image", "image": {"url": "resource/mscoco_twocat_000000039769.jpg", "image_wh": [640, 480]}}, {"type": "text", "text": "Can you describe the picture?"}]}, {"role": "assistant", "content": "The picture shows two cats leisurely lying on the sofa, with 2 TV remotes next to them."}]} diff --git 
a/tests/resource/mllm_sft_video_example_data_new.jsonl b/tests/resource/mllm_sft_video_example_data_new.jsonl new file mode 100644 index 000000000..023eb7a80 --- /dev/null +++ b/tests/resource/mllm_sft_video_example_data_new.jsonl @@ -0,0 +1,10 @@ +{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "请描述下视频内容?"}]}, {"role": "assistant", "content": "一男一女在打网球"}, {"role": "user", "content": "请简要解释下网球"}, {"role": "assistant", "content": "网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]} +{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "视频中在做什么?"}]}, {"role": "assistant", "content": "打网球"}]} +{"id": 3, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": "A man and a woman are playing tennis."}, {"role": "user", "content": "Can you briefly explain tennis?"}, {"role": "assistant", "content": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. 
It can be played individually (singles) or in pairs (doubles)."}]} +{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": "Playing tennis."}]} +{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": "A man and a woman are playing tennis."}, {"role": "user", "content": "Can you briefly explain tennis?"}, {"role": "assistant", "content": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]} +{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": "Playing tennis."}]} +{"id": 7, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "video", "video": {"url": "tennis.mp4", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0}}, {"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": "打网球"}]} +{"id": 8, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis_frames_4fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 23, "processed_fps": 4}}, {"type": "video", "video": {"url": "tennis_frames_2fps/", "image_wh": 
[1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 13, "processed_fps": 2}}, {"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": "打网球"}]} +{"id": 9, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis_frames_4fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 23, "processed_fps": 4}}, {"type": "video", "video": {"url": "tennis_frames_2fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 13, "processed_fps": 2}}, {"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": "打网球"}]} +{"id": 10, "messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "tennis_frames_15fps/", "image_wh": [1280, 720], "origin_video_length": 182, "origin_fps": 30.0, "processed_video_length": 91, "processed_fps": 15}}, {"type": "text", "text": "视频中在做什么?"}]}, {"role": "assistant", "content": "打网球"}]} diff --git a/tests/resource/mllm_sft_video_hf_example_data.jsonl b/tests/resource/mllm_sft_video_hf_example_data.jsonl index d4125cb9c..146928d1d 100644 --- a/tests/resource/mllm_sft_video_hf_example_data.jsonl +++ b/tests/resource/mllm_sft_video_hf_example_data.jsonl @@ -1,9 +1,9 @@ -{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\n请描述下视频内容?"}]}, {"role": "assistant", "content": [{"type": "text","text":"一男一女在打网球"}]},{"role": "user", "content": [{"type": "text","text": "请简要解释下网球"}]},{"role": "assistant", "content": [{"type": "text","text": "网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]}]} -{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\n视频中在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} -{"id": 3, "messages": [{"role": "user", "content": [{"type": 
"video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "\nCan you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]}]} -{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "\nWhat is happening in the video?"}]}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} -{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\nCan you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. 
It can be played individually (singles) or in pairs (doubles)."}]}]} -{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\nWhat is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} +{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "请描述下视频内容?"}]}, {"role": "assistant", "content": [{"type": "text","text":"一男一女在打网球"}]},{"role": "user", "content": [{"type": "text","text": "请简要解释下网球"}]},{"role": "assistant", "content": [{"type": "text","text": "网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]}]} +{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "视频中在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} +{"id": 3, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. 
It can be played individually (singles) or in pairs (doubles)."}]}]} +{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} +{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]} +{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} {"id": 7, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "video", "path": "tennis.mp4"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} {"id": 8, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} {"id": 9, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", 
"content": [{"type": "text","text": "打网球"}]}]} diff --git a/tests/resource/qwen35_tokenize_data.jsonl b/tests/resource/qwen35_tokenize_data.jsonl index 24f3b7902..362bfa6ad 100644 --- a/tests/resource/qwen35_tokenize_data.jsonl +++ b/tests/resource/qwen35_tokenize_data.jsonl @@ -11,5 +11,5 @@ {"id":11,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。只有一个用户 user 输入。只有一次真 user 输入 表示整个对话过程中只有 user message。此时中间的所有 think 过程都会保留"}, {"role": "user", "content": "北京和上海今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "我现在知道北京的天气了,我需要继续知道上海的天气", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度,上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":12,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。有多个用户 user 输入。一旦再次来了一个新的真 user 输入,则之前的 think 内容会全部丢掉,因为相当于是一次新的回话"}, {"role": "user", "content": "北京今天天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", 
"reasoning_content": "这是 reasoning_content 内容 1", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何?"}, {"role": "assistant", "content": "现在是第二个问题了,我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":13,"messages": [{"role": "system", "content": "你是一个专业的图像分析助手,能够理解和分析多张图片。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "请描述这两张图片的内容,它们有什么相同点和不同点?"}]}, {"role": "assistant", "content": "我需要仔细对比两张图片的主体、背景、光线等要素。", "reasoning_content": "第一张图片和第二张图片的主体都是同一只猫,背景都是室内环境,光线也相似。它们的相同点是都展示了这只猫在窗台上休息的场景。不同点是第一张图片中猫的姿势是侧卧,而第二张图片中猫的姿势是仰卧。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, 
{"type": "text", "text": "这张新图片和之前的图片相比,有什么新的元素出现?"}]}, {"role": "assistant", "content": "与前两张图片相比,这张新图片中出现了不同的构图角度和新的视觉元素。"}, {"role": "user", "content": [{"type": "text", "text": "综合以上三张图片,你认为它们想表达什么主题?"}]}, {"role": "assistant", "content": "需要从整体角度总结三张图片的共同叙事逻辑和情感表达。", "reasoning_content": "这三张图片共同表达了一个主题:猫在室内环境中的不同状态和情感。第一张图片展示了猫的安静和放松,第二张图片展示了猫的舒适和满足,而第三张图片则通过不同的构图和视觉元素,传达了猫在这个环境中的多样性和丰富性。整体上,这些图片共同描绘了猫在室内生活中的多样化表现,表达了对猫的喜爱和对其生活状态的关注。"}]} -{"id":14,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析视频内容。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/demo.mp4"}, {"type": "text", "text": "请描述这个视频的主要内容,并分析其中的关键事件。"}]}, {"role": "assistant", "content": "让我仔细观察这个视频的每一帧内容。", "reasoning_content": "视频开始时展示了一个城市街道的场景,有行人和车辆在移动。随后镜头切换到一家咖啡店的内部,可以看到顾客在排队点单。接着视频展示了咖啡制作的过程,包括研磨咖啡豆、萃取咖啡液和打奶泡。最后视频以一杯精美的拉花咖啡特写结束。整个视频的节奏流畅,画面清晰。"}]} -{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析多个视频内容。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/cooking_tutorial.mp4"}, {"type": "video", "video": "https://example.com/video/cooking_result.mp4"}, {"type": "text", "text": "请分析这两个视频,第一个视频是烹饪教程,第二个是最终成品。它们之间有什么联系?"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频:视频展示了准备食材的过程,包括切菜、腌制肉类等步骤。然后是烹饪过程,展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于:第一个视频是制作过程,第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/failed_attempt.mp4"}, {"type": "text", "text": "这里还有一个失败尝试的视频,和前两个相比有什么问题?"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": "通过对比可以看出几个关键问题:首先,火候控制不当,视频中可以看到食材有些焦糊。其次,调味顺序有问题,盐放得太早导致食材出水过多。第三,翻炒的频率不够,导致受热不均匀。相比之下,第一个成功视频中火候掌握得当,调味时机准确,翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频,总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", 
"reasoning_content": "综合三个视频的分析,成功烹饪这道菜的关键要点包括:第一,火候控制是核心,需要保持中火避免焦糊;第二,调味顺序很重要,盐应在出锅前加入;第三,翻炒要频繁均匀,确保食材受热一致;第四,食材预处理要到位,切块的均匀度影响受热;第五,要有耐心,每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]} \ No newline at end of file +{"id":14,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/demo.mp4"}}, {"type": "text", "text": "请描述这个视频的主要内容,并分析其中的关键事件。"}]}, {"role": "assistant", "content": "让我仔细观察这个视频的每一帧内容。", "reasoning_content": "视频开始时展示了一个城市街道的场景,有行人和车辆在移动。随后镜头切换到一家咖啡店的内部,可以看到顾客在排队点单。接着视频展示了咖啡制作的过程,包括研磨咖啡豆、萃取咖啡液和打奶泡。最后视频以一杯精美的拉花咖啡特写结束。整个视频的节奏流畅,画面清晰。"}]} +{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析多个视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/cooking_tutorial.mp4"}}, {"type": "video", "video": {"url":"https://example.com/video/cooking_result.mp4"}}, {"type": "text", "text": "请分析这两个视频,第一个视频是烹饪教程,第二个是最终成品。它们之间有什么联系?"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频:视频展示了准备食材的过程,包括切菜、腌制肉类等步骤。然后是烹饪过程,展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于:第一个视频是制作过程,第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/failed_attempt.mp4"}}, {"type": "text", "text": "这里还有一个失败尝试的视频,和前两个相比有什么问题?"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": "通过对比可以看出几个关键问题:首先,火候控制不当,视频中可以看到食材有些焦糊。其次,调味顺序有问题,盐放得太早导致食材出水过多。第三,翻炒的频率不够,导致受热不均匀。相比之下,第一个成功视频中火候掌握得当,调味时机准确,翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频,总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", "reasoning_content": "综合三个视频的分析,成功烹饪这道菜的关键要点包括:第一,火候控制是核心,需要保持中火避免焦糊;第二,调味顺序很重要,盐应在出锅前加入;第三,翻炒要频繁均匀,确保食材受热一致;第四,食材预处理要到位,切块的均匀度影响受热;第五,要有耐心,每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]} \ No newline at end of file diff --git 
a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py index a8604b4a5..ef2d3e67a 100644 --- a/xtuner/v1/data_proto/messages/qwen35_chat.py +++ b/xtuner/v1/data_proto/messages/qwen35_chat.py @@ -1,9 +1,11 @@ -from typing import List, Dict, Optional +import copy import json -from xtuner.v1.data_proto.messages.base import BaseMessages -from xtuner.v1.data_proto.templates import HybridChatTemplate +from typing import Dict, List, Optional + +from pydantic import BaseModel, ConfigDict + from transformers import PreTrainedTokenizer -import copy +from xtuner.v1.data_proto.templates import HybridChatTemplate def get_offset_mapping(tokenizer, text: str): @@ -43,32 +45,35 @@ def render_content(content, do_vision_count, image_count, video_count, add_visio video_count += 1 if add_vision_id: result += f"Video {video_count}: " - + video_content = item.get("video", {}) + assert isinstance(video_content, dict), f"video_content must be a dict, but got {type(video_content)}" timestamps = video_content.get("timestamps", []) if len(timestamps) > 0: video_placeholder = "" for timestamp in timestamps: video_placeholder += f"<{timestamp:.1f} seconds><|vision_start|><|video_pad|><|vision_end|>" result += video_placeholder - + else: + # 每个视频可能有 n 帧,每一帧里面可能占据 m 个 token + assert "num_frames" in video_content, "num_frames must be in video_content" + num_frames = video_content["num_frames"] + for _ in range(len(num_frames)): + result += "<|vision_start|><|video_pad|><|vision_end|>" conversation_timestamp = video_content.get("conversation_timestamp", []) if len(conversation_timestamp) > 0: - start_time = conversation_timestamp[0] + start_time = conversation_timestamp[0] end_time = conversation_timestamp[1] timestamps = f"<{start_time:.1f}-{end_time:.1f} seconds>" result += timestamps + elif "text" in item: result += item["text"] return result, image_count, video_count # Qwen3.5 工具系统提示(与 Qwen3 不同的 XML 格式) -_QWEN35_TOOL_SYSTEM = ( - "# Tools\n\n" - "You have 
access to the following functions:\n\n" - "" -) +_QWEN35_TOOL_SYSTEM = "# Tools\n\nYou have access to the following functions:\n\n" _QWEN35_TOOL_INSTRUCTIONS = ( "\n\n\n" "If you choose to call a function ONLY reply in the following format with NO suffix:\n\n" @@ -117,8 +122,7 @@ def qwen35_tokenize_fn_fastspeed( add_generation_prompt=False, add_vision_id=False, return_labels=True, -): - +): enable_thinking = any("reasoning_content" in msg for msg in messages) image_count = 0 @@ -162,10 +166,7 @@ def _append(text: str, is_loss: bool) -> None: msg = messages[i] if multi_step_tool and msg["role"] == "user": content_str = _render(msg["content"], False).strip() - if not ( - content_str.startswith("") - and content_str.endswith("") - ): + if not (content_str.startswith("") and content_str.endswith("")): multi_step_tool = False last_query_index = i @@ -185,9 +186,7 @@ def _append(text: str, is_loss: bool) -> None: reasoning_content = message["reasoning_content"] else: if "" in content: - reasoning_content = ( - content.split("")[0].rstrip("\n").split("")[-1].lstrip("\n") - ) + reasoning_content = content.split("")[0].rstrip("\n").split("")[-1].lstrip("\n") content = content.split("")[-1].lstrip("\n") # Qwen3.5 模板对 reasoning_content 做 |trim reasoning_content = reasoning_content.strip() @@ -242,7 +241,7 @@ def _append(text: str, is_loss: bool) -> None: if isinstance(tc_args, dict): _append(_render_tool_call_args(tc_args), body_is_loss) - _append(f"\n", body_is_loss) + _append("\n", body_is_loss) _append("<|im_end|>\n", body_is_loss) @@ -296,7 +295,9 @@ def _append(text: str, is_loss: bool) -> None: def qwen35_process_text_and_loss_mask(text: str, loss_mask: list[bool], tokenizer: PreTrainedTokenizer): assert tokenizer is not None - assert len(text) == len(loss_mask), "text and loss_mask must have the same length. Got {len(text)} and {len(loss_mask)}." + assert len(text) == len(loss_mask), ( + "text and loss_mask must have the same length. 
Got {len(text)} and {len(loss_mask)}." + ) try: encoded = tokenizer( @@ -333,21 +334,38 @@ def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tool enable_thinking = any("reasoning_content" in msg for msg in messages) - full_text = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools,add_vision_id=add_vision_id, enable_thinking=enable_thinking, **kwargs) + full_text = tokenizer.apply_chat_template( + messages, tokenize=False, tools=tools, add_vision_id=add_vision_id, enable_thinking=enable_thinking, **kwargs + ) total_ids = tokenizer.encode(full_text, add_special_tokens=False) labels = [-100] * len(total_ids) # 记录在 total_ids 中搜索的起始位置,确保不会搜到前面的轮次 curr_ptr = 0 for i, msg in enumerate(messages): - if msg['role'] == 'assistant' and msg.get('loss', True): + if msg["role"] == "assistant" and msg.get("loss", True): # 1. 获取包含当前消息之前所有内容的“前缀”文本 (带 generation prompt) - prompt_text = tokenizer.apply_chat_template(messages[:i], tokenize=False, add_generation_prompt=True, add_vision_id=add_vision_id, enable_thinking=enable_thinking, tools=tools if i==0 else None, **kwargs) + prompt_text = tokenizer.apply_chat_template( + messages[:i], + tokenize=False, + add_generation_prompt=True, + add_vision_id=add_vision_id, + enable_thinking=enable_thinking, + tools=tools if i == 0 else None, + **kwargs, + ) # 2. 
获取包含当前消息的完整“截断”文本 # 我们通过修改当前消息的内容,强制在末尾加上一个罕见标记,来准确捕获这部分的内容 # 为什么要加标记?因为我们想知道当前消息的结束符(如 <|im_end|>)被 tokenizer 编成了什么 - temp_msgs = [m.copy() for m in messages[:i+1]] + temp_msgs = [m.copy() for m in messages[: i + 1]] # 提取真实内容 - m_text = tokenizer.apply_chat_template(temp_msgs, tokenize=False,add_vision_id=add_vision_id, enable_thinking=enable_thinking, tools=tools if i==0 else None, **kwargs) + m_text = tokenizer.apply_chat_template( + temp_msgs, + tokenize=False, + add_vision_id=add_vision_id, + enable_thinking=enable_thinking, + tools=tools if i == 0 else None, + **kwargs, + ) # 转换为 Token 序列 p_ids = tokenizer.encode(prompt_text, add_special_tokens=False) m_ids = tokenizer.encode(m_text, add_special_tokens=False) @@ -357,7 +375,7 @@ def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tool # 为了最稳健,我们直接在 m_ids 的末尾倒推。 # 我们知道 m_ids 是由 p_ids + current_content_ids 组成的 # 我们直接取差集: - content_tokens = m_ids[len(p_ids):] + content_tokens = m_ids[len(p_ids) :] if not content_tokens: continue # 4. 
在全量 total_ids 中搜索这段 content_tokens @@ -375,33 +393,41 @@ def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tool # 这是允许的,只要它不是当前轮次(我们不强求历史轮次一定要匹配上,因为我们通常只对最后的 Turn 算 loss) # 但如果是最后一条消息还没匹配上,那就一定是出大问题了 if i == len(messages) - 1: - raise ValueError(f"严重错误:最后一条 Assistant 消息无法在全量 Token 中对齐。") + raise ValueError("严重错误:最后一条 Assistant 消息无法在全量 Token 中对齐。") return total_ids, labels -class Qwen35ChatMessages(BaseMessages): - messages: List[dict] # 暂时不做校验 +class Qwen35ChatMessages(BaseModel): + model_config = ConfigDict(extra="forbid") + messages: List[dict] # 暂时不做校验 tools: Optional[List[Dict]] = None - - def tokenize(self, tokenizer: PreTrainedTokenizer, chat_template: HybridChatTemplate, add_vision_id=False, **kwargs) -> Dict: + + def tokenize( + self, tokenizer: PreTrainedTokenizer, chat_template: HybridChatTemplate, add_vision_id=False, **kwargs + ) -> Dict: is_pretrain = False - if len(self.messages) == 1 and self.messages[0]['role'] == "pretrain": + if len(self.messages) == 1 and self.messages[0]["role"] == "pretrain": is_pretrain = True - + if is_pretrain: - text = self.messages[0]['content'] + text, _, _ = render_content( + self.messages[0]["content"], + do_vision_count=True, + image_count=0, + video_count=0, + add_vision_id=add_vision_id, + ) token_ids = tokenizer.encode(text, add_special_tokens=False) label_ids = copy.deepcopy(token_ids) else: # replace system message if chat_template.default_system is not None: - if self.messages[0]['role'] == "system": - self.messages[0]['content'] = chat_template.default_system + if self.messages[0]["role"] == "system": + self.messages[0]["content"] = chat_template.default_system else: - self.messages.insert(0, {'role': 'system', 'content': chat_template.default_system}) + self.messages.insert(0, {"role": "system", "content": chat_template.default_system}) - token_ids, label_ids = qwen35_tokenize_fn_fastspeed(self.messages, tokenizer, self.tools, - add_vision_id=add_vision_id, - return_labels=True) + 
token_ids, label_ids = qwen35_tokenize_fn_fastspeed( + self.messages, tokenizer, self.tools, add_vision_id=add_vision_id, return_labels=True + ) return {"input_ids": token_ids, "labels": label_ids} - diff --git a/xtuner/v1/data_proto/templates/__init__.py b/xtuner/v1/data_proto/templates/__init__.py index 744c79fd6..7d7a7a78d 100644 --- a/xtuner/v1/data_proto/templates/__init__.py +++ b/xtuner/v1/data_proto/templates/__init__.py @@ -4,6 +4,7 @@ from .chat import ChatTemplate from .hybrid import HybridChatTemplate + current_date = datetime.now().strftime("%Y-%m-%d") CHAT_TEMPLATE_MAP = { @@ -11,7 +12,7 @@ image_start_token="<|vision_start|>", image_end_token="<|vision_end|>", image_context_token="<|image_pad|>", - video_context_token="<|video_pad|>" + video_context_token="<|video_pad|>", ), "intern-s1": HybridChatTemplate( system="<|im_start|>system\n{system}<|im_end|>\n", diff --git a/xtuner/v1/data_proto/templates/hybrid.py b/xtuner/v1/data_proto/templates/hybrid.py index cb3899b03..a83b553d5 100644 --- a/xtuner/v1/data_proto/templates/hybrid.py +++ b/xtuner/v1/data_proto/templates/hybrid.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Dict, List, Optional -from typing import Callable from pydantic import BaseModel, ConfigDict, field_validator @@ -13,11 +12,11 @@ class HybridChatTemplate(BaseModel): model_config = ConfigDict(extra="forbid") # Normal Chat - system: str | None = None # System message format, role + system: str | None = None # System message format, role developer: str | None = None # Developer message format, role - user: str | None = None # User message format, role - assistant: str | None = None # Assistant message format, role - stop_words: List[str] | None = None # List of stop words + user: str | None = None # User message format, role + assistant: str | None = None # Assistant message format, role + stop_words: List[str] | None = None # List of stop words sep: str = "\n" thinking: str | None = None # Thinking message format, not role default_system: Optional[str] = None @@ -62,6 +61,7 @@ def mm_token_maps(self) -> Dict[str, int]: def decorate_system(self, text: str) -> str: """Decorate text with the `system` template.""" + assert self.system is not None, "system template is not defined." return self.system.format(system=text) def decorate_developer(self, text: str) -> str: @@ -72,6 +72,7 @@ def decorate_developer(self, text: str) -> str: def decorate_assistant(self, text: str) -> str: """Decorate text with the `assistant` template.""" + assert self.assistant is not None, "assistant template is not defined." return self.assistant.format(assistant=text) def decorate_thinking(self, text: str) -> str: @@ -82,6 +83,7 @@ def decorate_thinking(self, text: str) -> str: def decorate_user(self, text: str) -> str: """Decorate text with the `user` template.""" + assert self.user is not None, "user template is not defined." 
return self.user.format(user=text) def decorate_files(self, text: str) -> str: diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py index a622bbef9..ac7772e94 100644 --- a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py +++ b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py @@ -31,10 +31,18 @@ def collect_image_video_paths_and_extra(messages: list[dict]): content = msg["content"] if isinstance(content, list): for c in content: - if c["type"] == "image_url": - image_paths.append(c["image_url"]["url"]) - if "image_wh" in c["image_url"]: - image_wh = c["image_url"]["image_wh"] + if c["type"] == "image_url" or c["type"] == "image": + if c["type"] == "image_url": + image_paths.append(c["image_url"]["url"]) + else: + image_paths.append(c["image"]["url"]) + + if "image_url" in c: + key = "image_url" + else: + key = "image" + if "image_wh" in c[key]: + image_wh = c[key]["image_wh"] if isinstance(image_wh[0], (list, tuple)): assert len(image_wh) == 1, ( f"Only one image size is supported for each image. 
but got {image_wh}" @@ -42,10 +50,14 @@ def collect_image_video_paths_and_extra(messages: list[dict]): image_wh = image_wh[0] image_wh_list.append(image_wh) assert len(image_wh) == 2, f"image_wh should be [width, height], but got {image_wh}" - if c["type"] == "video_url": - video_paths.append(c["video_url"]["url"]) + if c["type"] == "video_url" or c["type"] == "video": + if "video_url" in c: + video_paths.append(c["video_url"]["url"]) + video_wh = c["video_url"].get("image_wh") + else: + video_paths.append(c["video"]["url"]) + video_wh = c["video"].get("image_wh") - video_wh = c["video_url"].get("image_wh") if video_wh is not None: if isinstance(video_wh[0], (list, tuple)): assert len(video_wh) == 1, ( @@ -56,16 +68,22 @@ def collect_image_video_paths_and_extra(messages: list[dict]): assert len(video_wh) == 2, f"video_wh should be [width, height], but got {video_wh}" video_extra_dict = {} - if "origin_video_length" in c["video_url"]: - video_extra_dict["origin_video_length"] = c["video_url"]["origin_video_length"] - if "origin_fps" in c["video_url"]: - video_extra_dict["origin_fps"] = c["video_url"]["origin_fps"] - if "processed_video_length" in c["video_url"]: - video_extra_dict["processed_video_length"] = c["video_url"]["processed_video_length"] - if "processed_fps" in c["video_url"]: - video_extra_dict["processed_fps"] = c["video_url"]["processed_fps"] - if "frames_timestamp" in c["video_url"]: - video_extra_dict["frames_timestamp"] = c["video_url"]["frames_timestamp"] + + if "video_url" in c: + key = "video_url" + else: + key = "video" + + if "origin_video_length" in c[key]: + video_extra_dict["origin_video_length"] = c[key]["origin_video_length"] + if "origin_fps" in c[key]: + video_extra_dict["origin_fps"] = c[key]["origin_fps"] + if "processed_video_length" in c[key]: + video_extra_dict["processed_video_length"] = c[key]["processed_video_length"] + if "processed_fps" in c[key]: + video_extra_dict["processed_fps"] = c[key]["processed_fps"] + if 
"frames_timestamp" in c[key]: + video_extra_dict["frames_timestamp"] = c[key]["frames_timestamp"] if len(video_extra_dict) > 0: video_extra_info_list.append(video_extra_dict) @@ -271,7 +289,6 @@ class BaseMLLMTokenizeFnConfig(BaseModel): llm_pack_weight: float = 1.0 visual_pack_weight: float = 0.0 trim_memory_interval: int = 1 - chat_template: str | None = None def build( self, tokenizer, tokenizer_hash: str | None = None, anno_name: str = "", **kwargs diff --git a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py index e505d9c06..d9571b4a1 100644 --- a/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py +++ b/xtuner/v1/datasets/mllm_tokenize_fn/qwen3_vl_tokenize_fn.py @@ -6,7 +6,7 @@ import os from itertools import chain from types import SimpleNamespace -from typing import Literal, Dict, Optional, Union +from typing import Dict, Literal, Optional, Union import numpy as np import torch @@ -20,6 +20,7 @@ from xtuner.v1.data_proto.messages import ChatMessages, Qwen35ChatMessages from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP, HybridChatTemplate from xtuner.v1.utils import get_logger + from ..data_item import CacheItem, QwenVL3DataItem from ..utils import apply_exif_orientation, generate_random_int_from_dict from .base_mllm_tokenize_fn import ( @@ -202,30 +203,39 @@ def replace_video_token( assert current_image_idx == n_image, f"VIDEO ERROR: total_image_idx: {current_image_idx} != {n_image}" -def replace_video_timestamps(messages: list[Dict], timestamps_list: list[list[float]]): +def replace_video_timestamps_and_num_frame( + messages: list[Dict], num_image_token_list: list[list[int]], timestamps_list: list[list[float]] +): + if len(timestamps_list) > 0: + assert len(timestamps_list) == len(num_image_token_list), ( + "timestamps should have the same length as num_image_token_list" + ) video_cnt = 0 for msg in messages: - if msg['role'] == "user": - content = msg['content'] + 
if msg["role"] == "user" or msg["role"] == "pretrain": + content = msg["content"] if isinstance(content, list): for item in content: - if 'video' in item: - video_content = item['video'] - timestamps = timestamps_list[video_cnt] - video_content['timestamps'] = timestamps + if "video" in item: + video_content = item["video"] + if len(timestamps_list) > 0: + timestamps = timestamps_list[video_cnt] + video_content["timestamps"] = timestamps + if len(num_image_token_list) > 0: + num_frames = num_image_token_list[video_cnt] + video_content["num_frames"] = num_frames video_cnt += 1 -def replace_qwen35_media_token(tokenized:dict, context_token_id: int, num_media_token_list: list[int] | list[list[int]], total_media_count: int): +def replace_qwen35_media_token( + tokenized: dict, context_token_id: int, num_media_token_list: list[int] | list[list[int]], total_media_count: int +): input_ids = tokenized["input_ids"] labels = tokenized.get("labels") - if isinstance(num_media_token_list, list): - # video - num_media_token_list = [item for sublist in num_media_token_list for item in sublist] - else: - # image - num_media_token_list = [num_media_token_list] + if isinstance(num_media_token_list[0], list): + # video flatten list + num_media_token_list = [item for sublist in num_media_token_list for item in sublist] # type: ignore context_token_count = input_ids.count(context_token_id) assert context_token_count == len(num_media_token_list), ( @@ -237,7 +247,7 @@ def replace_qwen35_media_token(tokenized:dict, context_token_id: int, num_media_ media_idx = 0 for i, tid in enumerate(input_ids): if tid == context_token_id: - n = int(num_media_token_list[media_idx]) + n = int(num_media_token_list[media_idx]) # type: ignore new_input_ids.extend([context_token_id] * n) if new_labels is not None and labels is not None: lbl = labels[i] @@ -267,7 +277,7 @@ def __init__( tokenizer: PreTrainedTokenizer, processor_path: str, anno_name: str, - chat_template: str = 'qwen3-vl', # qwen3.5-vl or 
qwen3-vl + chat_template: str = "qwen3-vl", # qwen3.5-vl or qwen3-vl min_pixels: int | None = None, # Max image pixels (H*W) for image max_pixels: int | None = None, # Min image pixels (H*W) for image video_min_frames: int | None = None, # Min frames per video @@ -292,7 +302,7 @@ def __init__( hash: str | None = None, add_eos_token: bool = True, # for mllm pretrain add_bos_token: bool = False, # for mllm pretrain - trim_memory_interval: int = 1 + trim_memory_interval: int = 1, ): self.oss_loader = None self.debug = debug @@ -432,7 +442,7 @@ def pure_text_get_item(self, data_item: dict) -> QwenVL3DataItem: is_pretrain = False if self.chat_template_name == "qwen3.5-vl": messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain": is_pretrain = True else: messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) @@ -489,10 +499,12 @@ def calc_num_tokens_multi_modal_get_item(self, data_item: dict) -> CacheItem: is_pretrain = False if self.chat_template_name == "qwen3.5-vl": messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain": is_pretrain = True tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) - replace_qwen35_media_token(tokenized, self.img_context_token_id, sum_media_grid_thw, sum_media_grid_thw.sum()) + replace_qwen35_media_token( + tokenized, self.img_context_token_id, sum_media_grid_thw, sum_media_grid_thw.sum() + ) else: messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) replace_image_token(messages, self.chat_template, 
sum_media_grid_thw, add_vision_id=self.add_vision_id) @@ -539,15 +551,15 @@ def multi_modal_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3 visual_processed = self.image_processor.preprocess(image_data_list, return_tensors="pt") image_tensor = visual_processed["pixel_values"] grid_thw = visual_processed["image_grid_thw"] # b,3 - grid_thw_merged = [merged_thw.prod() // self.merge_length for merged_thw in grid_thw] # type: ignore + grid_thw_merged = [merged_thw.prod().item() // self.merge_length for merged_thw in grid_thw] # type: ignore is_pretrain = False if self.chat_template_name == "qwen3.5-vl": messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain": is_pretrain = True tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) - replace_qwen35_media_token(tokenized, self.img_context_token_id, grid_thw_merged, torch.stack(grid_thw_merged, dim=0).sum()) + replace_qwen35_media_token(tokenized, self.img_context_token_id, grid_thw_merged, sum(grid_thw_merged)) else: messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": @@ -583,7 +595,7 @@ def multi_modal_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3 # 如果图片被截断,则该数据要丢弃 num_image_tokens_1 = (torch.tensor(input_ids) == self.img_context_token_id).sum() - num_image_tokens_2 = torch.stack(grid_thw_merged, dim=0).sum() + num_image_tokens_2 = sum(grid_thw_merged) # assert 会被捕获,该数据会丢弃 assert num_image_tokens_1 == num_image_tokens_2, ( f"num_image_tokens of input_ids {num_image_tokens_1} != num_image_tokens of media_grid_thw {num_image_tokens_2}, " @@ -779,25 +791,24 @@ def calc_num_tokens_video_get_item(self, data_item: dict) -> 
CacheItem: frame_seqlen = grid_h * grid_w // self.merge_length num_image_token_list.append([frame_seqlen] * grid_t) total_sum_media_grid_thw += sum_media_grid_thw - + is_pretrain = False if self.chat_template_name == "qwen3.5-vl": - if len(timestamps_list) > 0: - assert len(timestamps_list) == len(num_image_token_list), ( - "timestamps should have the same length as num_image_token_list" - ) - replace_video_timestamps(data_item["messages"], timestamps_list) - + replace_video_timestamps_and_num_frame(data_item["messages"], num_image_token_list, timestamps_list) messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain": is_pretrain = True - tokenized = messages.tokenize(self.tokenizer, self.chat_template) - replace_qwen35_media_token(tokenized, self.video_context_token_id, num_image_token_list, total_sum_media_grid_thw) + tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) + replace_qwen35_media_token( + tokenized, self.video_context_token_id, num_image_token_list, total_sum_media_grid_thw + ) else: messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": is_pretrain = True - replace_video_token(messages, self.chat_template, num_image_token_list, timestamps_list, add_vision_id=self.add_vision_id) + replace_video_token( + messages, self.chat_template, num_image_token_list, timestamps_list, add_vision_id=self.add_vision_id + ) tokenized = messages.tokenize(self.tokenizer, self.chat_template) input_ids = tokenized["input_ids"] @@ -917,27 +928,26 @@ def video_get_item(self, data_item: dict, media_root: str = "") -> QwenVL3DataIt num_image_tokens_list.append([frame_seqlen] * grid_thw[0][0]) 
num_imgs_list.append(num_frames) total_sum_media_grid_thw += sum_media_grid_thw - + is_pretrain = False if self.chat_template_name == "qwen3.5-vl": - if len(timestamps_list) > 0: - assert len(timestamps_list) == len(num_image_tokens_list), ( - "timestamps should have the same length as num_image_token_list" - ) - replace_video_timestamps(data_item["messages"], timestamps_list) - + replace_video_timestamps_and_num_frame(data_item["messages"], num_image_tokens_list, timestamps_list) messages = Qwen35ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) - if len(data_item["messages"]) == 1 and data_item["messages"][0]['role'] == "pretrain": + if len(data_item["messages"]) == 1 and data_item["messages"][0]["role"] == "pretrain": is_pretrain = True - tokenized = messages.tokenize(self.tokenizer, self.chat_template) - replace_qwen35_media_token(tokenized, self.video_context_token_id, num_image_tokens_list, total_sum_media_grid_thw) + tokenized = messages.tokenize(self.tokenizer, self.chat_template, add_vision_id=self.add_vision_id) + replace_qwen35_media_token( + tokenized, self.video_context_token_id, num_image_tokens_list, total_sum_media_grid_thw + ) else: messages = ChatMessages(messages=data_item["messages"], tools=data_item.get("tools")) if len(messages.messages) == 1 and messages.messages[0].role == "pretrain": is_pretrain = True - replace_video_token(messages, self.chat_template, num_image_tokens_list, timestamps_list, add_vision_id=self.add_vision_id) + replace_video_token( + messages, self.chat_template, num_image_tokens_list, timestamps_list, add_vision_id=self.add_vision_id + ) tokenized = messages.tokenize(self.tokenizer, self.chat_template) - + input_ids = tokenized["input_ids"] labels = tokenized["labels"] @@ -1010,7 +1020,7 @@ class Qwen3VLTokenizeFnConfig(BaseMLLMTokenizeFnConfig): add_vision_id: bool = True trim_memory_interval: int = 1 - chat_template: Literal["qwen3-vl", "qwen3-vl-rl"] = "qwen3-vl" + chat_template: 
Literal["qwen3-vl", "qwen3-vl-rl", "qwen3.5-vl"] = "qwen3-vl" def build( self, tokenizer, tokenizer_hash: str | None = None, anno_name: str = "", **kwargs From a644c625fede0dbdfb7b5d60ec23e473586d0e43 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 8 Apr 2026 07:51:31 +0000 Subject: [PATCH 3/9] update --- tests/datasets/test_qwen35_vl_tokenize_fn.py | 2 +- tests/datasets/test_qwen3_vl_tokenize_fn.py | 2 +- tests/resource/mllm_sft_video_hf_example_data.jsonl | 12 ++++++------ .../mllm_sft_video_hf_example_data_new.jsonl | 10 ++++++++++ 4 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 tests/resource/mllm_sft_video_hf_example_data_new.jsonl diff --git a/tests/datasets/test_qwen35_vl_tokenize_fn.py b/tests/datasets/test_qwen35_vl_tokenize_fn.py index 291f19fe4..d5c6a4e55 100644 --- a/tests/datasets/test_qwen35_vl_tokenize_fn.py +++ b/tests/datasets/test_qwen35_vl_tokenize_fn.py @@ -224,7 +224,7 @@ def test_qwen3_vl_sft_video(self, add_vision_id): add_vision_id=add_vision_id).build( self.tokenizer) data_path = 'tests/resource/mllm_sft_video_example_data_new.jsonl' - hf_data_path = 'tests/resource/mllm_sft_video_hf_example_data.jsonl' + hf_data_path = 'tests/resource/mllm_sft_video_hf_example_data_new.jsonl' hf_raw_datas = [] with open(hf_data_path) as f: for line in f: diff --git a/tests/datasets/test_qwen3_vl_tokenize_fn.py b/tests/datasets/test_qwen3_vl_tokenize_fn.py index 6fb19af8f..b421758aa 100644 --- a/tests/datasets/test_qwen3_vl_tokenize_fn.py +++ b/tests/datasets/test_qwen3_vl_tokenize_fn.py @@ -115,7 +115,7 @@ def test_qwen3_vl_sft_single_image(self, add_vision_id): for msg in messages: if not isinstance(msg['content'], list): msg['content'] = [{"type": "text", "text": msg['content']}] - + ret = self.processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=True, diff --git a/tests/resource/mllm_sft_video_hf_example_data.jsonl b/tests/resource/mllm_sft_video_hf_example_data.jsonl index 146928d1d..d4125cb9c 100644 
--- a/tests/resource/mllm_sft_video_hf_example_data.jsonl +++ b/tests/resource/mllm_sft_video_hf_example_data.jsonl @@ -1,9 +1,9 @@ -{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "请描述下视频内容?"}]}, {"role": "assistant", "content": [{"type": "text","text":"一男一女在打网球"}]},{"role": "user", "content": [{"type": "text","text": "请简要解释下网球"}]},{"role": "assistant", "content": [{"type": "text","text": "网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]}]} -{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "视频中在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} -{"id": 3, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. 
It can be played individually (singles) or in pairs (doubles)."}]}]}]} -{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "What is happening in the video?"}]}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} -{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]} -{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} +{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\n请描述下视频内容?"}]}, {"role": "assistant", "content": [{"type": "text","text":"一男一女在打网球"}]},{"role": "user", "content": [{"type": "text","text": "请简要解释下网球"}]},{"role": "assistant", "content": [{"type": "text","text": "网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]}]} +{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\n视频中在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} +{"id": 3, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": 
"text", "content": [{"type": "text","text": "\nCan you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]}]} +{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "\nWhat is happening in the video?"}]}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} +{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\nCan you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. 
It can be played individually (singles) or in pairs (doubles)."}]}]} +{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "\nWhat is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} {"id": 7, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "video", "path": "tennis.mp4"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} {"id": 8, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} {"id": 9, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} diff --git a/tests/resource/mllm_sft_video_hf_example_data_new.jsonl b/tests/resource/mllm_sft_video_hf_example_data_new.jsonl new file mode 100644 index 000000000..146928d1d --- /dev/null +++ b/tests/resource/mllm_sft_video_hf_example_data_new.jsonl @@ -0,0 +1,10 @@ +{"id": 1, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "请描述下视频内容?"}]}, {"role": "assistant", "content": [{"type": "text","text":"一男一女在打网球"}]},{"role": "user", "content": [{"type": "text","text": "请简要解释下网球"}]},{"role": "assistant", "content": [{"type": "text","text": "网球是一项运动,运动员使用球拍将球击打过网进入对方场地。目标是通过让球落入对方场地且对方无法回击来得分。网球可以单人对战(单打)或双人组队对战(双打)。"}]}]} +{"id": 2, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "视频中在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} +{"id": 3, "messages": [{"role": 
"user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. It can be played individually (singles) or in pairs (doubles)."}]}]}]} +{"id": 4, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "content": [{"type": "text","text": "What is happening in the video?"}]}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} +{"id": 5, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "Can you describe the video content?"}]}, {"role": "assistant", "content": [{"type": "text","text": "A man and a woman are playing tennis."}]}, {"role": "user", "content": [{"type": "text","text": "Can you briefly explain tennis?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Tennis is a sport where players use a racket to hit a ball over a net into the opponent's court. The objective is to score points by making the ball land in the opponent's court in a way that they cannot return it. 
It can be played individually (singles) or in pairs (doubles)."}]}]} +{"id": 6, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "text", "text": "What is happening in the video?"}]}, {"role": "assistant", "content": [{"type": "text","text": "Playing tennis."}]}]} +{"id": 7, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis.mp4"}, {"type": "video", "path": "tennis.mp4"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} +{"id": 8, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} +{"id": 9, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_4fps/"}, {"type": "video", "path": "tennis_frames_2fps/"},{"type": "text", "text": "两个视频中都在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} +{"id": 10, "messages": [{"role": "user", "content": [{"type": "video", "path": "tennis_frames_15fps/"},{"type": "text", "text": "视频中在做什么?"}]}, {"role": "assistant", "content": [{"type": "text","text": "打网球"}]}]} \ No newline at end of file From 18721ca26fad20179207d40fa03668e3dcca1389 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 8 Apr 2026 09:10:25 +0000 Subject: [PATCH 4/9] fix ci --- tests/datasets/test_qwen35_vl_tokenize_fn.py | 2 +- .../mllm_video_frame_test_data_new.jsonl | 8 +++++ xtuner/v1/data_proto/messages/qwen35_chat.py | 31 ++----------------- .../mllm_tokenize_fn/base_mllm_tokenize_fn.py | 30 +++++------------- 4 files changed, 18 insertions(+), 53 deletions(-) create mode 100644 tests/resource/mllm_video_frame_test_data_new.jsonl diff --git a/tests/datasets/test_qwen35_vl_tokenize_fn.py b/tests/datasets/test_qwen35_vl_tokenize_fn.py index d5c6a4e55..baaa9988a 100644 --- 
a/tests/datasets/test_qwen35_vl_tokenize_fn.py +++ b/tests/datasets/test_qwen35_vl_tokenize_fn.py @@ -153,7 +153,7 @@ def test_qwen3_vl_sft_multi_image(self, add_vision_id): def test_calc_frame_info(self): self.tokenize_fn.state = "cache" - data_path = 'tests/resource/mllm_video_frame_test_data.jsonl' + data_path = 'tests/resource/mllm_video_frame_test_data_new.jsonl' with open(data_path) as f: for i, line in enumerate(f): raw_data = json.loads(line) diff --git a/tests/resource/mllm_video_frame_test_data_new.jsonl b/tests/resource/mllm_video_frame_test_data_new.jsonl new file mode 100644 index 000000000..7c566ff90 --- /dev/null +++ b/tests/resource/mllm_video_frame_test_data_new.jsonl @@ -0,0 +1,8 @@ +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720]}}, {"type": "text", "text": "两个视频中都在做什么?"},{"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340]}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "origin_video_length":12, "origin_fps":8}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "origin_video_length":1, "origin_fps":1}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":20,"processed_video_length":50, "processed_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", 
"image_wh": [460, 340], "processed_video_length":36, "processed_fps":5, "origin_video_length":72, "origin_fps":10}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":100, "origin_fps":20,"processed_video_length":50, "processed_fps":10}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":12, "processed_fps":5, "origin_video_length":24, "origin_fps":10}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":40, "origin_fps":20,"processed_video_length":8, "processed_fps":4, "frames_timestamp":[0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":16, "processed_fps":5, "origin_video_length":32, "origin_fps":10,"frames_timestamp":[0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8, 3.0]}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":40, "origin_fps":20,"processed_video_length":8, "processed_fps":4, "frames_timestamp":[0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]}}, {"type": "text", "text": "两个视频中都在做什么?"}, {"type": "video", "video": {"url": "yyy.mp4", "image_wh": [460, 340], "processed_video_length":12, "processed_fps":5, "origin_video_length":24, "origin_fps":10,"frames_timestamp":[0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2]}}]}, {"role": "assistant", "content": "打网球"}]} +{"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "xxx.mp4", "image_wh": [1280, 720], "origin_video_length":40, 
"origin_fps":20,"processed_video_length":8, "processed_fps":4, "frames_timestamp":[0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75]}}, {"type": "text", "text": "视频中都在做什么?"}]}, {"role": "assistant", "content": "打网球"}]} diff --git a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py index ef2d3e67a..495ee8c10 100644 --- a/xtuner/v1/data_proto/messages/qwen35_chat.py +++ b/xtuner/v1/data_proto/messages/qwen35_chat.py @@ -1,3 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. import copy import json from typing import Dict, List, Optional @@ -293,35 +294,6 @@ def _append(text: str, is_loss: bool) -> None: return input_ids, labels -def qwen35_process_text_and_loss_mask(text: str, loss_mask: list[bool], tokenizer: PreTrainedTokenizer): - assert tokenizer is not None - assert len(text) == len(loss_mask), ( - "text and loss_mask must have the same length. Got {len(text)} and {len(loss_mask)}." - ) - - try: - encoded = tokenizer( - text, - return_offsets_mapping=True, - add_special_tokens=False, - ) - input_ids = encoded["input_ids"] - offset_mapping = encoded["offset_mapping"] - except Exception: - input_ids, offset_mapping = get_offset_mapping(tokenizer, text) - - labels = [] - for token_id, (start, end) in zip(input_ids, offset_mapping): - if start == end: - labels.append(-100) - elif any(loss_mask[i] for i in range(start, end)): - labels.append(token_id) - else: - labels.append(-100) - - return input_ids, labels - - def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tools=None, add_vision_id=False, **kwargs): """ 终极稳定版 Tokenize:基于 Token 级别的绝对对齐 (椒盐算法升级版)。 @@ -397,6 +369,7 @@ def qwen35_tokenize_fn_slowspeed(tokenizer, messages: List[Dict[str, str]], tool return total_ids, labels +# 我们采用全新逻辑,因此不需要继承 BaseChatMessages,后续之前的 ChatMessages 逻辑全部删除 class Qwen35ChatMessages(BaseModel): model_config = ConfigDict(extra="forbid") messages: List[dict] # 暂时不做校验 diff --git 
a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py index ac7772e94..d21fbbe19 100644 --- a/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py +++ b/xtuner/v1/datasets/mllm_tokenize_fn/base_mllm_tokenize_fn.py @@ -31,16 +31,9 @@ def collect_image_video_paths_and_extra(messages: list[dict]): content = msg["content"] if isinstance(content, list): for c in content: - if c["type"] == "image_url" or c["type"] == "image": - if c["type"] == "image_url": - image_paths.append(c["image_url"]["url"]) - else: - image_paths.append(c["image"]["url"]) - - if "image_url" in c: - key = "image_url" - else: - key = "image" + if c["type"] in ("image_url", "image"): + key = "image_url" if "image_url" in c else "image" + image_paths.append(c[key]["url"]) if "image_wh" in c[key]: image_wh = c[key]["image_wh"] if isinstance(image_wh[0], (list, tuple)): @@ -50,14 +43,10 @@ def collect_image_video_paths_and_extra(messages: list[dict]): image_wh = image_wh[0] image_wh_list.append(image_wh) assert len(image_wh) == 2, f"image_wh should be [width, height], but got {image_wh}" - if c["type"] == "video_url" or c["type"] == "video": - if "video_url" in c: - video_paths.append(c["video_url"]["url"]) - video_wh = c["video_url"].get("image_wh") - else: - video_paths.append(c["video"]["url"]) - video_wh = c["video"].get("image_wh") - + if c["type"] in ("video_url", "video"): + key = "video_url" if "video_url" in c else "video" + video_paths.append(c[key]["url"]) + video_wh = c[key].get("image_wh") if video_wh is not None: if isinstance(video_wh[0], (list, tuple)): assert len(video_wh) == 1, ( @@ -69,11 +58,6 @@ def collect_image_video_paths_and_extra(messages: list[dict]): video_extra_dict = {} - if "video_url" in c: - key = "video_url" - else: - key = "video" - if "origin_video_length" in c[key]: video_extra_dict["origin_video_length"] = c[key]["origin_video_length"] if "origin_fps" in c[key]: From 
afd9dfe1fedd8df108a47a51fc8edce62ec031b0 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 8 Apr 2026 09:35:23 +0000 Subject: [PATCH 5/9] update data --- tests/datasets/test_qwen35_vl_tokenize_fn.py | 2 -- tests/resource/qwen35_tokenize_data.jsonl | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/datasets/test_qwen35_vl_tokenize_fn.py b/tests/datasets/test_qwen35_vl_tokenize_fn.py index baaa9988a..3f8cbf529 100644 --- a/tests/datasets/test_qwen35_vl_tokenize_fn.py +++ b/tests/datasets/test_qwen35_vl_tokenize_fn.py @@ -40,7 +40,6 @@ def test_qwen35vl_text(self): all_data.append(json.loads(line)) for j, data in enumerate(all_data): - print(f"Processing data {j+1} of {len(all_data)}") if j>=12: break gt_token_ids, gt_labels = qwen35_tokenize_fn_slowspeed(self.tokenizer, data['messages'], tools=data.get('tools'), add_vision_id=True) @@ -280,7 +279,6 @@ def test_qwen3_vl_sft_video(self, add_vision_id): # 测试无法整除且超过最大帧数情况下,均匀采样 self.assertEqual(pixel_values_xtuner.size(), (24640, 1536)) self.assertEqual(text.count('seconds>'), 7) - print(pixel_values_xtuner.size(), image_grid_thw_xtuner, text.count('seconds>'), 'xxx') else: if i == 7: self.assertEqual(len(input_ids_xtuner), len(input_ids_hf)) diff --git a/tests/resource/qwen35_tokenize_data.jsonl b/tests/resource/qwen35_tokenize_data.jsonl index 362bfa6ad..8ab909aed 100644 --- a/tests/resource/qwen35_tokenize_data.jsonl +++ b/tests/resource/qwen35_tokenize_data.jsonl @@ -1,13 +1,13 @@ {"id":1,"messages": [{"role": "system", "content": "这是单轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"}]} {"id":2,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道","reasoning_content": "这是 reasoning_content 内容"}]} {"id":3,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": 
"\n我需要先调用一些工具才能知道","reasoning_content": "\n这是 reasoning_content 内容\n"}]} -{"id":4,"messages": [{"role": "system", "content": "这是多轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"}]} +{"id":4,"messages": [{"role": "system", "content": "这是多轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "loss": false},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"}]} {"id":5,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题", "reasoning_content": "这是 reasoning_content 内容"}]} {"id":6,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"},{"role": "user", "content": "这是第三个问题"},{"role": "assistant", "content": "好的,我知道这是第三个问题", "reasoning_content": "这是 reasoning_content 内容 2"}]} {"id":7,"messages": [{"role": "system", "content": "这是单轮无think+toolcall例子"},{"role": "user", "content": "北京今天的天气如何?"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls":[{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]},{"role": "tool","content": "35"},{"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}],"tools": [{"type":"function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": 
["location"]}}},{"type": "function", "function": {"name":"get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters":{"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":8,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。", "reasoning_content": "这是 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":9,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。","reasoning_content": "这是最后一个 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": 
"get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} -{"id":10,"messages": [{"role": "system", "content": "这是多轮无think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何"}, {"role": "assistant", "content": "好的,我知道这是第二个问题。我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} 
+{"id":10,"messages": [{"role": "system", "content": "这是多轮无think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。", "loss": false}, {"role": "user", "content": "这是第二个问题。上海的天气如何"}, {"role": "assistant", "content": "好的,我知道这是第二个问题。我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":11,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。只有一个用户 user 输入。只有一次真 user 输入 表示整个对话过程中只有 user message。此时中间的所有 think 过程都会保留"}, {"role": "user", "content": "北京和上海今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "我现在知道北京的天气了,我需要继续知道上海的天气", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": 
"call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度,上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":12,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。有多个用户 user 输入。一旦再次来了一个新的真 user 输入,则之前的 think 内容会全部丢掉,因为相当于是一次新的回话"}, {"role": "user", "content": "北京今天天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何?"}, {"role": "assistant", "content": "现在是第二个问题了,我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The 
location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":13,"messages": [{"role": "system", "content": "你是一个专业的图像分析助手,能够理解和分析多张图片。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "请描述这两张图片的内容,它们有什么相同点和不同点?"}]}, {"role": "assistant", "content": "我需要仔细对比两张图片的主体、背景、光线等要素。", "reasoning_content": "第一张图片和第二张图片的主体都是同一只猫,背景都是室内环境,光线也相似。它们的相同点是都展示了这只猫在窗台上休息的场景。不同点是第一张图片中猫的姿势是侧卧,而第二张图片中猫的姿势是仰卧。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "这张新图片和之前的图片相比,有什么新的元素出现?"}]}, {"role": "assistant", "content": "与前两张图片相比,这张新图片中出现了不同的构图角度和新的视觉元素。"}, {"role": "user", "content": [{"type": "text", "text": "综合以上三张图片,你认为它们想表达什么主题?"}]}, {"role": "assistant", "content": "需要从整体角度总结三张图片的共同叙事逻辑和情感表达。", "reasoning_content": "这三张图片共同表达了一个主题:猫在室内环境中的不同状态和情感。第一张图片展示了猫的安静和放松,第二张图片展示了猫的舒适和满足,而第三张图片则通过不同的构图和视觉元素,传达了猫在这个环境中的多样性和丰富性。整体上,这些图片共同描绘了猫在室内生活中的多样化表现,表达了对猫的喜爱和对其生活状态的关注。"}]} From 3c92d035affd9cbe632c51c238cccddade49faf4 Mon Sep 17 00:00:00 2001 From: huanghaian Date: Wed, 8 Apr 2026 10:01:00 +0000 Subject: [PATCH 6/9] update --- xtuner/v1/datasets/sft_tokenize_fn/openai.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/xtuner/v1/datasets/sft_tokenize_fn/openai.py b/xtuner/v1/datasets/sft_tokenize_fn/openai.py index 1c1bbfba4..5ce9c8cfd 100644 --- 
a/xtuner/v1/datasets/sft_tokenize_fn/openai.py +++ b/xtuner/v1/datasets/sft_tokenize_fn/openai.py @@ -7,7 +7,7 @@ from pydantic import BaseModel, ConfigDict from transformers import PreTrainedTokenizer -from xtuner.v1.data_proto.messages import ChatMessages +from xtuner.v1.data_proto.messages import ChatMessages, Qwen35ChatMessages from xtuner.v1.data_proto.templates import CHAT_TEMPLATE_MAP from xtuner.v1.datasets.data_item import CacheItem, DataItem from xtuner.v1.utils import get_logger @@ -30,6 +30,7 @@ def __init__( assert chat_template in CHAT_TEMPLATE_MAP, ( f"chat_template {chat_template} not found in {CHAT_TEMPLATE_MAP.keys()}" ) + self.chat_template_name = chat_template self.chat_template = CHAT_TEMPLATE_MAP[chat_template] self._hash = hash self._tokenizer_hash = tokenizer_hash @@ -43,7 +44,11 @@ def __call__(self, item: dict | list, **kwargs) -> DataItem | CacheItem: tools = item["tools"] if isinstance(item, dict) and "messages" in item: item = item["messages"] - messages = ChatMessages(messages=item, tools=tools) + + if self.chat_template_name == "qwen3.5-vl": + messages = Qwen35ChatMessages(messages=item, tools=tools) + else: + messages = ChatMessages(messages=item, tools=tools) tokenized = messages.tokenize(self.tokenizer, self.chat_template) input_ids = tokenized["input_ids"] From 155ddd76de4066cc5870bc12153484d1133f5f6c Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 Apr 2026 08:31:31 +0000 Subject: [PATCH 7/9] fix sp token --- xtuner/v1/data_proto/messages/qwen35_chat.py | 33 ++++++++++++++------ 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py index 495ee8c10..80bca5c2c 100644 --- a/xtuner/v1/data_proto/messages/qwen35_chat.py +++ b/xtuner/v1/data_proto/messages/qwen35_chat.py @@ -12,21 +12,36 @@ def get_offset_mapping(tokenizer, text: str): encoding = tokenizer(text, add_special_tokens=False) input_ids = encoding["input_ids"] - tokens 
= tokenizer.convert_ids_to_tokens(input_ids) offset_mapping = [] pos = 0 - for token_id, token in zip(input_ids, tokens): - decoded = tokenizer.decode([token_id], skip_special_tokens=False) + pending_ids = [] # type: ignore + max_pending = 8 + + def _flush_pending(start, end): + nonlocal pending_ids, pos + offset_mapping.extend([(start, end)] * len(pending_ids)) + pos = end + pending_ids = [] + + def _flush_pending_as_empty(): + nonlocal pending_ids + offset_mapping.extend([(pos, pos)] * len(pending_ids)) + pending_ids = [] + + for token_id in input_ids: + pending_ids.append(token_id) + decoded = tokenizer.decode(pending_ids, skip_special_tokens=False) if not decoded: - offset_mapping.append((pos, pos)) continue idx = text.find(decoded, pos) - if idx == -1: - offset_mapping.append((pos, pos)) - else: + if idx != -1: end = idx + len(decoded) - offset_mapping.append((idx, end)) - pos = end + _flush_pending(idx, end) + elif "\ufffd" not in decoded or len(pending_ids) >= max_pending: + _flush_pending_as_empty() + + if pending_ids: + _flush_pending_as_empty() return input_ids, offset_mapping From 122c7c53cd114650013ef39e54e4178a4cc2dc9d Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 Apr 2026 08:36:08 +0000 Subject: [PATCH 8/9] fix lint --- xtuner/v1/data_proto/messages/qwen35_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py index 80bca5c2c..dcc820b52 100644 --- a/xtuner/v1/data_proto/messages/qwen35_chat.py +++ b/xtuner/v1/data_proto/messages/qwen35_chat.py @@ -14,7 +14,7 @@ def get_offset_mapping(tokenizer, text: str): input_ids = encoding["input_ids"] offset_mapping = [] pos = 0 - pending_ids = [] # type: ignore + pending_ids = [] # type: ignore max_pending = 8 def _flush_pending(start, end): From 56fc6826d57861b6b485b7c11356087f9553783e Mon Sep 17 00:00:00 2001 From: huanghaian Date: Fri, 10 Apr 2026 12:01:38 +0000 Subject: [PATCH 9/9] add 
video ci --- tests/chat_template/test_chat_template.py | 31 ++++++++++++-------- tests/resource/qwen35_tokenize_data.jsonl | 4 ++- xtuner/v1/data_proto/messages/qwen35_chat.py | 2 +- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/chat_template/test_chat_template.py b/tests/chat_template/test_chat_template.py index 4569b7b64..6ed85fc01 100644 --- a/tests/chat_template/test_chat_template.py +++ b/tests/chat_template/test_chat_template.py @@ -251,17 +251,22 @@ def test_qwen35vl_template(self): gt_token_ids, gt_labels = qwen35_tokenize_fn_slowspeed(tokenizer, data['messages'], tools=data.get('tools'), add_vision_id=True) _messages = Qwen35ChatMessages(messages=data["messages"], tools=data.get("tools")) tokenized = _messages.tokenize(tokenizer, chat_template, add_vision_id=True) - self.assertEqual(tokenized['input_ids'], gt_token_ids) - self.assertEqual(tokenized['labels'], gt_labels) - - enable_thinking = any("reasoning_content" in msg for msg in data['messages']) decode_str = tokenizer.decode(tokenized['input_ids'], skip_special_tokens=False) - hf_text = tokenizer.apply_chat_template(data['messages'], - tools=data.get('tools'), - add_vision_id=True, - tokenize=False, - enable_thinking=enable_thinking, - add_generation_prompt=False) - self.assertEqual(decode_str, hf_text) - - \ No newline at end of file + + if j!=15 and j!=16: + self.assertEqual(tokenized['input_ids'], gt_token_ids) + self.assertEqual(tokenized['labels'], gt_labels) + + enable_thinking = any("reasoning_content" in msg for msg in data['messages']) + hf_text = tokenizer.apply_chat_template(data['messages'], + tools=data.get('tools'), + add_vision_id=True, + tokenize=False, + enable_thinking=enable_thinking, + add_generation_prompt=False) + self.assertEqual(decode_str, hf_text) + else: + if j==15: + self.assertTrue('Video 1: <|vision_start|><|video_pad|><|vision_end|><|vision_start|><|video_pad|><|vision_end|><|vision_start|><|video_pad|><|vision_end|><0.0-10.0 seconds>Describe the 
video in detail. [NO_REASONING]<|im_end|>' in decode_str) + else: + self.assertTrue('Video 1: <0.0 seconds><|vision_start|><|video_pad|><|vision_end|><1.0 seconds><|vision_start|><|video_pad|><|vision_end|><2.0 seconds><|vision_start|><|video_pad|><|vision_end|><0.0-10.0 seconds>Describe the video in detail. [NO_REASONING]<|im_end|>' in decode_str) diff --git a/tests/resource/qwen35_tokenize_data.jsonl b/tests/resource/qwen35_tokenize_data.jsonl index 8ab909aed..b367e5fa3 100644 --- a/tests/resource/qwen35_tokenize_data.jsonl +++ b/tests/resource/qwen35_tokenize_data.jsonl @@ -12,4 +12,6 @@ {"id":12,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。有多个用户 user 输入。一旦再次来了一个新的真 user 输入,则之前的 think 内容会全部丢掉,因为相当于是一次新的回话"}, {"role": "user", "content": "北京今天天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何?"}, {"role": "assistant", "content": "现在是第二个问题了,我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": 
{"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]} {"id":13,"messages": [{"role": "system", "content": "你是一个专业的图像分析助手,能够理解和分析多张图片。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "请描述这两张图片的内容,它们有什么相同点和不同点?"}]}, {"role": "assistant", "content": "我需要仔细对比两张图片的主体、背景、光线等要素。", "reasoning_content": "第一张图片和第二张图片的主体都是同一只猫,背景都是室内环境,光线也相似。它们的相同点是都展示了这只猫在窗台上休息的场景。不同点是第一张图片中猫的姿势是侧卧,而第二张图片中猫的姿势是仰卧。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "这张新图片和之前的图片相比,有什么新的元素出现?"}]}, {"role": "assistant", "content": "与前两张图片相比,这张新图片中出现了不同的构图角度和新的视觉元素。"}, {"role": "user", "content": [{"type": "text", "text": "综合以上三张图片,你认为它们想表达什么主题?"}]}, {"role": "assistant", "content": "需要从整体角度总结三张图片的共同叙事逻辑和情感表达。", "reasoning_content": "这三张图片共同表达了一个主题:猫在室内环境中的不同状态和情感。第一张图片展示了猫的安静和放松,第二张图片展示了猫的舒适和满足,而第三张图片则通过不同的构图和视觉元素,传达了猫在这个环境中的多样性和丰富性。整体上,这些图片共同描绘了猫在室内生活中的多样化表现,表达了对猫的喜爱和对其生活状态的关注。"}]} {"id":14,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/demo.mp4"}}, {"type": "text", "text": "请描述这个视频的主要内容,并分析其中的关键事件。"}]}, {"role": "assistant", "content": "让我仔细观察这个视频的每一帧内容。", "reasoning_content": "视频开始时展示了一个城市街道的场景,有行人和车辆在移动。随后镜头切换到一家咖啡店的内部,可以看到顾客在排队点单。接着视频展示了咖啡制作的过程,包括研磨咖啡豆、萃取咖啡液和打奶泡。最后视频以一杯精美的拉花咖啡特写结束。整个视频的节奏流畅,画面清晰。"}]} -{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析多个视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/cooking_tutorial.mp4"}}, {"type": "video", "video": 
{"url":"https://example.com/video/cooking_result.mp4"}}, {"type": "text", "text": "请分析这两个视频,第一个视频是烹饪教程,第二个是最终成品。它们之间有什么联系?"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频:视频展示了准备食材的过程,包括切菜、腌制肉类等步骤。然后是烹饪过程,展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于:第一个视频是制作过程,第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/failed_attempt.mp4"}}, {"type": "text", "text": "这里还有一个失败尝试的视频,和前两个相比有什么问题?"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": "通过对比可以看出几个关键问题:首先,火候控制不当,视频中可以看到食材有些焦糊。其次,调味顺序有问题,盐放得太早导致食材出水过多。第三,翻炒的频率不够,导致受热不均匀。相比之下,第一个成功视频中火候掌握得当,调味时机准确,翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频,总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", "reasoning_content": "综合三个视频的分析,成功烹饪这道菜的关键要点包括:第一,火候控制是核心,需要保持中火避免焦糊;第二,调味顺序很重要,盐应在出锅前加入;第三,翻炒要频繁均匀,确保食材受热一致;第四,食材预处理要到位,切块的均匀度影响受热;第五,要有耐心,每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]} \ No newline at end of file +{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析多个视频内容。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/cooking_tutorial.mp4"}}, {"type": "video", "video": {"url":"https://example.com/video/cooking_result.mp4"}}, {"type": "text", "text": "请分析这两个视频,第一个视频是烹饪教程,第二个是最终成品。它们之间有什么联系?"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频:视频展示了准备食材的过程,包括切菜、腌制肉类等步骤。然后是烹饪过程,展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于:第一个视频是制作过程,第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": {"url":"https://example.com/video/failed_attempt.mp4"}}, {"type": "text", "text": "这里还有一个失败尝试的视频,和前两个相比有什么问题?"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": 
"通过对比可以看出几个关键问题:首先,火候控制不当,视频中可以看到食材有些焦糊。其次,调味顺序有问题,盐放得太早导致食材出水过多。第三,翻炒的频率不够,导致受热不均匀。相比之下,第一个成功视频中火候掌握得当,调味时机准确,翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频,总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", "reasoning_content": "综合三个视频的分析,成功烹饪这道菜的关键要点包括:第一,火候控制是核心,需要保持中火避免焦糊;第二,调味顺序很重要,盐应在出锅前加入;第三,翻炒要频繁均匀,确保食材受热一致;第四,食材预处理要到位,切块的均匀度影响受热;第五,要有耐心,每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]} +{"id":16,"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "demo", "origin_video_length": 196, "origin_fps": 29.97002997002997, "processed_video_length": 26, "processed_fps": 4.0, "image_wh": [202, 360], "num_frames":[1,1,1], "conversation_timestamps": [0.0, 10.0]}}, {"type": "text", "text": "Describe the video in detail. [NO_REASONING]"}]}, {"role": "assistant", "content": "A young boy is sitting in the passenger seat of a car, wearing a white and blue striped polo shirt with a 'UTSA' logo. He is holding a pair of sunglasses and gradually puts them on his face. The background outside the car window shows a fence, trees, and a clear sky."}]} +{"id":17,"messages": [{"role": "user", "content": [{"type": "video", "video": {"url": "demo", "origin_video_length": 196, "origin_fps": 29.97002997002997, "processed_video_length": 26, "processed_fps": 4.0, "image_wh": [202, 360], "timestamps":[0,1,2], "conversation_timestamps": [0.0, 10.0]}}, {"type": "text", "text": "Describe the video in detail. [NO_REASONING]"}]}, {"role": "assistant", "content": "A young boy is sitting in the passenger seat of a car, wearing a white and blue striped polo shirt with a 'UTSA' logo. He is holding a pair of sunglasses and gradually puts them on his face. 
The background outside the car window shows a fence, trees, and a clear sky."}]} \ No newline at end of file diff --git a/xtuner/v1/data_proto/messages/qwen35_chat.py b/xtuner/v1/data_proto/messages/qwen35_chat.py index dcc820b52..dcb09d5d5 100644 --- a/xtuner/v1/data_proto/messages/qwen35_chat.py +++ b/xtuner/v1/data_proto/messages/qwen35_chat.py @@ -76,7 +76,7 @@ def render_content(content, do_vision_count, image_count, video_count, add_visio num_frames = video_content["num_frames"] for _ in range(len(num_frames)): result += "<|vision_start|><|video_pad|><|vision_end|>" - conversation_timestamp = video_content.get("conversation_timestamp", []) + conversation_timestamp = video_content.get("conversation_timestamps", []) if len(conversation_timestamp) > 0: start_time = conversation_timestamp[0] end_time = conversation_timestamp[1]