Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion tests/datasets/test_openai_tokenize_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,65 @@
import parametrize
from unittest import TestCase
from transformers import AutoTokenizer

import unittest
from packaging.version import Version
from transformers import __version__ as transformers_version
import json
from xtuner.v1.datasets import OpenaiTokenizeFunctionConfig
import copy

QWEN3_PATH = os.environ["QWEN3_VL_DENSE_PATH"] # We need instruct model


class TestOpenaiTokenizeFunction(TestCase):

@unittest.skipIf(
Version(transformers_version) < Version("5.2.0"),
f"transformers >= 5.2.0 is required, but got {transformers_version}"
)
def test_qwen3p5_openai_tokenize_fn(self):
QWEN3P5_PATH = os.environ["QWEN3_5_MOE_PATH"]
demo_data_path = 'tests/resource/qwen35_tokenize_data.jsonl'

tokenizer = AutoTokenizer.from_pretrained(QWEN3P5_PATH, trust_remote_code=True)
tokenizer_fn_cfg = OpenaiTokenizeFunctionConfig(chat_template="qwen3.5-vl")
tokenizer_fn = tokenizer_fn_cfg.build(tokenizer)

all_data = []
with open(demo_data_path, 'r') as f:
for line in f:
all_data.append(json.loads(line))

for data in all_data:
id = data["id"]
if id == 7:
data_with_system = copy.deepcopy(data)
input_ids_ref = tokenizer.apply_chat_template(data_with_system['messages'], tools=data.get('tools'), tokenize=True, add_generation_prompt=False)['input_ids']
# 临时方案,为了和 hf 对齐
data_with_system['messages'][2]["content"] = "<think>\n\n</think>\n\n我需要先调用一些工具才能知道\n"
data_with_system['messages'][-1]["content"] = "<think>\n\n</think>\n\n基于我的观察,今天北京的天气是35度。"

input_ids = tokenizer_fn(data_with_system)['input_ids']
prompt_ref = tokenizer.decode(input_ids_ref,skip_special_tokens=False)
prompt = tokenizer.decode(input_ids,skip_special_tokens=False)
self.assertEqual(prompt_ref, prompt)
self.assertEqual(input_ids_ref, input_ids)

data_wo_system = copy.deepcopy(data)
del data_wo_system['messages'][0]
input_ids_ref = tokenizer.apply_chat_template(data_wo_system['messages'], tools=data.get('tools'), tokenize=True, add_generation_prompt=False)['input_ids']
# 临时方案,为了和 hf 对齐
data_wo_system['messages'][1]["content"] = "<think>\n\n</think>\n\n我需要先调用一些工具才能知道\n"
data_wo_system['messages'][-1]["content"] = "<think>\n\n</think>\n\n基于我的观察,今天北京的天气是35度。"

input_ids = tokenizer_fn(data_wo_system)['input_ids']
prompt_ref = tokenizer.decode(input_ids_ref,skip_special_tokens=False)
prompt = tokenizer.decode(input_ids,skip_special_tokens=False)
self.assertEqual(prompt_ref, prompt)
self.assertEqual(input_ids_ref, input_ids)


# TODO: Remove this test later
@parametrize.parametrize(
"template_type, tokenizer_path",
[
Expand Down
15 changes: 15 additions & 0 deletions tests/resource/qwen35_tokenize_data.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{"id":1,"messages": [{"role": "system", "content": "这是单轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"}]}
{"id":2,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道","reasoning_content": "这是 reasoning_content 内容"}]}
{"id":3,"messages": [{"role": "system", "content": "这是单轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "\n我需要先调用一些工具才能知道","reasoning_content": "\n这是 reasoning_content 内容\n"}]}
{"id":4,"messages": [{"role": "system", "content": "这是多轮无think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"}]}
{"id":5,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题", "reasoning_content": "这是 reasoning_content 内容"}]}
{"id":6,"messages": [{"role": "system", "content": "这是多轮有think例子"},{"role": "user", "content": "这是第一个问题"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1"},{"role": "user", "content": "这是第二个问题"},{"role": "assistant", "content": "好的,我知道这是第二个问题"},{"role": "user", "content": "这是第三个问题"},{"role": "assistant", "content": "好的,我知道这是第三个问题", "reasoning_content": "这是 reasoning_content 内容 2"}]}
{"id":7,"messages": [{"role": "system", "content": "这是单轮无think+toolcall例子"},{"role": "user", "content": "北京今天的天气如何?"},{"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls":[{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]},{"role": "tool","content": "35"},{"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}],"tools": [{"type":"function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}},{"type": "function", "function": {"name":"get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters":{"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
{"id":8,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。", "reasoning_content": "这是 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
{"id":9,"messages": [{"role": "system", "content": "这是单轮有think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。","reasoning_content": "这是最后一个 reasoning_content 内容"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
{"id":10,"messages": [{"role": "system", "content": "这是多轮无think+toolcall例子"}, {"role": "user", "content": "北京今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何"}, {"role": "assistant", "content": "好的,我知道这是第二个问题。我需要先调用一些工具才能知道", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
{"id":11,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。只有一个用户 user 输入。只有一次真 user 输入 表示整个对话过程中只有 user message。此时中间的所有 think 过程都会保留"}, {"role": "user", "content": "北京和上海今天的天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "我现在知道北京的天气了,我需要继续知道上海的天气", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度,上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
{"id":12,"messages": [{"role": "system", "content": "这是多轮有think+toolcall例子。有多个用户 user 输入。一旦再次来了一个新的真 user 输入,则之前的 think 内容会全部丢掉,因为相当于是一次新的回话"}, {"role": "user", "content": "北京今天天气如何?"}, {"role": "assistant", "content": "我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 1", "tool_calls": [{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "Boston"}}}]}, {"role": "tool", "content": "35"}, {"role": "assistant", "content": "基于我的观察,今天北京的天气是35度。"}, {"role": "user", "content": "这是第二个问题。上海的天气如何?"}, {"role": "assistant", "content": "现在是第二个问题了,我需要先调用一些工具才能知道", "reasoning_content": "这是 reasoning_content 内容 2", "tool_calls": [{"id": "call_789", "type": "function", "function": {"name": "get_weather", "arguments": {"location": "shanghai"}}}]}, {"role": "tool", "content": "25"}, {"role": "assistant", "content": "基于我的观察,今天上海的天气是25度。"}], "tools": [{"type": "function", "function": {"name": "get_current_temperature", "description": "Gets the temperature at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the temperature for"}}, "required": ["location"]}}}, {"type": "function", "function": {"name": "get_current_wind_speed", "description": "Get the current wind speed in km/h at a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The location to get the wind speed for, in the format \"City, Country\""}}, "required": ["location"]}}}]}
{"id":13,"messages": [{"role": "system", "content": "你是一个专业的图像分析助手,能够理解和分析多张图片。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "请描述这两张图片的内容,它们有什么相同点和不同点?"}]}, {"role": "assistant", "content": "我需要仔细对比两张图片的主体、背景、光线等要素。", "reasoning_content": "第一张图片和第二张图片的主体都是同一只猫,背景都是室内环境,光线也相似。它们的相同点是都展示了这只猫在窗台上休息的场景。不同点是第一张图片中猫的姿势是侧卧,而第二张图片中猫的姿势是仰卧。"}, {"role": "user", "content": [{"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}, {"type": "text", "text": "这张新图片和之前的图片相比,有什么新的元素出现?"}]}, {"role": "assistant", "content": "与前两张图片相比,这张新图片中出现了不同的构图角度和新的视觉元素。"}, {"role": "user", "content": [{"type": "text", "text": "综合以上三张图片,你认为它们想表达什么主题?"}]}, {"role": "assistant", "content": "需要从整体角度总结三张图片的共同叙事逻辑和情感表达。", "reasoning_content": "这三张图片共同表达了一个主题:猫在室内环境中的不同状态和情感。第一张图片展示了猫的安静和放松,第二张图片展示了猫的舒适和满足,而第三张图片则通过不同的构图和视觉元素,传达了猫在这个环境中的多样性和丰富性。整体上,这些图片共同描绘了猫在室内生活中的多样化表现,表达了对猫的喜爱和对其生活状态的关注。"}]}
{"id":14,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析视频内容。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/demo.mp4"}, {"type": "text", "text": "请描述这个视频的主要内容,并分析其中的关键事件。"}]}, {"role": "assistant", "content": "让我仔细观察这个视频的每一帧内容。", "reasoning_content": "视频开始时展示了一个城市街道的场景,有行人和车辆在移动。随后镜头切换到一家咖啡店的内部,可以看到顾客在排队点单。接着视频展示了咖啡制作的过程,包括研磨咖啡豆、萃取咖啡液和打奶泡。最后视频以一杯精美的拉花咖啡特写结束。整个视频的节奏流畅,画面清晰。"}]}
{"id":15,"messages": [{"role": "system", "content": "你是一个专业的视频分析助手,能够理解和分析多个视频内容。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/cooking_tutorial.mp4"}, {"type": "video", "video": "https://example.com/video/cooking_result.mp4"}, {"type": "text", "text": "请分析这两个视频,第一个视频是烹饪教程,第二个是最终成品。它们之间有什么联系?"}]}, {"role": "assistant", "content": "我需要仔细对比这两个视频的内容和逻辑关系。", "reasoning_content": "首先看第一个烹饪教程视频:视频展示了准备食材的过程,包括切菜、腌制肉类等步骤。然后是烹饪过程,展示了翻炒、调味等操作。最后视频展示了装盘。第二个成品视频展示了最终菜品的摆盘和特写镜头。两个视频的联系在于:第一个视频是制作过程,第二个视频是制作完成后的成品展示。它们共同构成了一个完整的从制作到呈现的叙事链条。"}, {"role": "user", "content": [{"type": "video", "video": "https://example.com/video/failed_attempt.mp4"}, {"type": "text", "text": "这里还有一个失败尝试的视频,和前两个相比有什么问题?"}]}, {"role": "assistant", "content": "让我对比分析这个失败案例与之前的成功案例。", "reasoning_content": "通过对比可以看出几个关键问题:首先,火候控制不当,视频中可以看到食材有些焦糊。其次,调味顺序有问题,盐放得太早导致食材出水过多。第三,翻炒的频率不够,导致受热不均匀。相比之下,第一个成功视频中火候掌握得当,调味时机准确,翻炒动作连贯。这些细节差异最终导致了截然不同的结果。"}, {"role": "user", "content": [{"type": "text", "text": "基于这三个视频,总结一下成功烹饪这道菜的关键要点。"}]}, {"role": "assistant", "content": "需要从成功和失败的对比中提炼出关键要点。", "reasoning_content": "综合三个视频的分析,成功烹饪这道菜的关键要点包括:第一,火候控制是核心,需要保持中火避免焦糊;第二,调味顺序很重要,盐应在出锅前加入;第三,翻炒要频繁均匀,确保食材受热一致;第四,食材预处理要到位,切块的均匀度影响受热;第五,要有耐心,每个步骤都不能急于求成。失败视频恰恰反证了这些要点的重要性。"}]}
Loading
Loading