Skip to content

Commit 182f20c

Browse files
authored
Fix SmolVLM video processor resize using wrong interpolation after backend refactor (#45258)
* Fix SmolVLM video processor resize using wrong interpolation after image processor backend refactor. PR #43514 refactored `_preprocess` to pass `resample=resample` to `resize`, but `resize` still accepted `interpolation` as its parameter name. The `resample` kwarg was therefore silently swallowed by `**kwargs`, causing `interpolation` to default to BILINEAR instead of the intended LANCZOS->BICUBIC path and producing a ~0.36 difference in `pixel_values`. Fixed by renaming the parameter to `resample` and converting PIL resample integers to torchvision `InterpolationMode` via `pil_torch_interpolation_mapping`, matching the pattern used in `TorchvisionBackend.resize`. * fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
1 parent 02c38f7 commit 182f20c

File tree

2 files changed

+18
-7
lines changed

2 files changed

+18
-7
lines changed

src/transformers/models/smolvlm/video_processing_smolvlm.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,18 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from typing import Optional
1615

1716
import numpy as np
1817
import torch
1918

2019
from ...image_processing_utils import BatchFeature, get_size_dict
21-
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling, SizeDict
20+
from ...image_utils import (
21+
IMAGENET_STANDARD_MEAN,
22+
IMAGENET_STANDARD_STD,
23+
PILImageResampling,
24+
SizeDict,
25+
pil_torch_interpolation_mapping,
26+
)
2227
from ...processing_utils import Unpack, VideosKwargs
2328
from ...utils import TensorType, is_torchvision_available, logging
2429
from ...video_processing_utils import BaseVideoProcessor
@@ -127,7 +132,7 @@ def resize(
127132
self,
128133
video: "torch.Tensor",
129134
size: SizeDict,
130-
interpolation: Optional["tvF.InterpolationMode"] = None,
135+
resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
131136
antialias: bool = True,
132137
**kwargs,
133138
) -> "torch.Tensor":
@@ -138,12 +143,18 @@ def resize(
138143
Video to resize.
139144
size (`SizeDict`):
140145
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video.
141-
resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
142-
`InterpolationMode` filter to use when resizing the video e.g. `InterpolationMode.BICUBIC`.
146+
resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
147+
Resampling filter to use when resizing the video.
143148
Returns:
144149
`torch.Tensor`: The resized video.
145150
"""
146-
interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
151+
if resample is not None:
152+
if isinstance(resample, (PILImageResampling, int)):
153+
interpolation = pil_torch_interpolation_mapping[resample]
154+
else:
155+
interpolation = resample
156+
else:
157+
interpolation = tvF.InterpolationMode.BILINEAR
147158
if interpolation == tvF.InterpolationMode.LANCZOS:
148159
logger.warning_once(
149160
"You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. "

tests/models/smolvlm/test_modeling_smolvlm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ def test_integration_test_video(self):
571571
{
572572
(None, None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
573573
("cuda", (8, 0)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed',
574-
("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model, specifically a neural network model, which is designed to learn and',
574+
("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
575575
("rocm", (9, 4)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
576576
("rocm", None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed',
577577
}

0 commit comments

Comments (0)