From 02ff6895d29ce0a6621c299c28b9ffcdb559f859 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 6 Apr 2026 07:08:32 +0200 Subject: [PATCH 1/2] Fix SmolVLM video processor resize using wrong interpolation after image processor backend refactor The PR #43514 refactored _preprocess to pass resample=resample to resize, but resize still accepted interpolation as its parameter. The resample kwarg was silently swallowed by **kwargs, causing interpolation to default to BILINEAR instead of the intended LANCZOS->BICUBIC path, producing ~0.36 difference in pixel_values. Fix by renaming the parameter to resample and converting PIL resample integers to torchvision InterpolationMode via pil_torch_interpolation_mapping, matching the pattern used in TorchvisionBackend.resize. --- .../smolvlm/video_processing_smolvlm.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/smolvlm/video_processing_smolvlm.py b/src/transformers/models/smolvlm/video_processing_smolvlm.py index 31636795527f..301bb991ed21 100644 --- a/src/transformers/models/smolvlm/video_processing_smolvlm.py +++ b/src/transformers/models/smolvlm/video_processing_smolvlm.py @@ -12,13 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional import numpy as np import torch from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling, SizeDict +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + PILImageResampling, + SizeDict, + pil_torch_interpolation_mapping, +) from ...processing_utils import Unpack, VideosKwargs from ...utils import TensorType, is_torchvision_available, logging from ...video_processing_utils import BaseVideoProcessor @@ -127,7 +132,7 @@ def resize( self, video: "torch.Tensor", size: SizeDict, - interpolation: Optional["tvF.InterpolationMode"] = None, + resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None, antialias: bool = True, **kwargs, ) -> "torch.Tensor": @@ -138,12 +143,18 @@ def resize( Video to resize. size (`SizeDict`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video. - resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): - `InterpolationMode` filter to use when resizing the video e.g. `InterpolationMode.BICUBIC`. + resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use when resizing the video. Returns: `torch.Tensor`: The resized video. """ - interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR + if resample is not None: + if isinstance(resample, (PILImageResampling, int)): + interpolation = pil_torch_interpolation_mapping[resample] + else: + interpolation = resample + else: + interpolation = tvF.InterpolationMode.BILINEAR if interpolation == tvF.InterpolationMode.LANCZOS: logger.warning_once( "You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. 
" From 908e786e9878789184f0618d616ee515596a0fd6 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 6 Apr 2026 07:21:10 +0200 Subject: [PATCH 2/2] Update expected SmolVLM video integration test output for CUDA 8.6 --- tests/models/smolvlm/test_modeling_smolvlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/smolvlm/test_modeling_smolvlm.py b/tests/models/smolvlm/test_modeling_smolvlm.py index ba6d0d9e82c1..2969108f3d4e 100644 --- a/tests/models/smolvlm/test_modeling_smolvlm.py +++ b/tests/models/smolvlm/test_modeling_smolvlm.py @@ -571,7 +571,7 @@ def test_integration_test_video(self): { (None, None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature', ("cuda", (8, 0)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed', - ("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model, specifically a neural network model, which is designed to learn and', + ("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] 
video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature', ("rocm", (9, 4)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature', ("rocm", None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed', }