Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 6 additions & 20 deletions src/transformers/models/smolvlm/video_processing_smolvlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
IMAGENET_STANDARD_STD,
PILImageResampling,
SizeDict,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack, VideosKwargs
from ...utils import TensorType, is_torchvision_available, logging
Expand Down Expand Up @@ -148,21 +147,6 @@ def resize(
Returns:
`torch.Tensor`: The resized video.
"""
if resample is not None:
if isinstance(resample, (PILImageResampling, int)):
interpolation = pil_torch_interpolation_mapping[resample]
else:
interpolation = resample
else:
interpolation = tvF.InterpolationMode.BILINEAR
if interpolation == tvF.InterpolationMode.LANCZOS:
logger.warning_once(
"You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. "
"BICUBIC resample will be used as an alternative. Please fall back to image processor if you "
"want full consistency with the original model."
)
interpolation = tvF.InterpolationMode.BICUBIC

if size.longest_edge:
# Resize the image so that the shortest edge or the longest edge is of the given size
# while maintaining the aspect ratio of the original image.
Expand All @@ -175,12 +159,14 @@ def resize(
else:
raise ValueError(f"Size must contain 'height' and 'width' keys, or 'longest_edge' key. Got {size}.")

video = tvF.resize(video, new_size, interpolation=interpolation, antialias=antialias)
video = super().resize(
video, SizeDict(height=new_size[0], width=new_size[1]), resample=resample, antialias=antialias
)

# Resize again to match image processor when `do_image_splitting=False`. Frames have to be squared to `max_image_size`
# NOTE: videos are always processoed without image splitting
max_size = self.max_image_size["longest_edge"], self.max_image_size["longest_edge"]
video = tvF.resize(video, max_size, interpolation=interpolation, antialias=antialias)
# NOTE: videos are always processed without image splitting
max_size = SizeDict(height=self.max_image_size["longest_edge"], width=self.max_image_size["longest_edge"])
video = super().resize(video, max_size, resample=resample, antialias=antialias)
return video

def pad(
Expand Down
Loading