Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions src/transformers/models/smolvlm/video_processing_smolvlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import numpy as np
import torch

from ...image_processing_utils import BatchFeature, get_size_dict
from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling, SizeDict
from ...image_utils import (
IMAGENET_STANDARD_MEAN,
IMAGENET_STANDARD_STD,
PILImageResampling,
SizeDict,
pil_torch_interpolation_mapping,
)
from ...processing_utils import Unpack, VideosKwargs
from ...utils import TensorType, is_torchvision_available, logging
from ...video_processing_utils import BaseVideoProcessor
Expand Down Expand Up @@ -127,7 +132,7 @@ def resize(
self,
video: "torch.Tensor",
size: SizeDict,
interpolation: Optional["tvF.InterpolationMode"] = None,
resample: "PILImageResampling | tvF.InterpolationMode | int | None" = None,
antialias: bool = True,
**kwargs,
) -> "torch.Tensor":
Expand All @@ -138,12 +143,18 @@ def resize(
Video to resize.
size (`SizeDict`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output video.
resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
`InterpolationMode` filter to use when resizing the video e.g. `InterpolationMode.BICUBIC`.
resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
Resampling filter to use when resizing the video.
Returns:
`torch.Tensor`: The resized video.
"""
interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
if resample is not None:
if isinstance(resample, (PILImageResampling, int)):
interpolation = pil_torch_interpolation_mapping[resample]
else:
interpolation = resample
else:
interpolation = tvF.InterpolationMode.BILINEAR
if interpolation == tvF.InterpolationMode.LANCZOS:
logger.warning_once(
"You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. "
Expand Down
2 changes: 1 addition & 1 deletion tests/models/smolvlm/test_modeling_smolvlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ def test_integration_test_video(self):
{
(None, None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
("cuda", (8, 0)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed',
("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model, specifically a neural network model, which is designed to learn and',
("cuda", (8, 6)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This value should have been updated a long time ago. PR #43514 further changed the actual outputs, but the fix in this PR brings the actual output back to the one that had remained the same for several months, which is the new value I provide here.

("rocm", (9, 4)): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video depicts a large language model architecture, specifically a language model with a "quick brown" feature',
("rocm", None): 'User: You are provided the following series of nine frames from a 0:00:09 [H:MM:SS] video.\n\nFrame from 00:00:\nFrame from 00:01:\nFrame from 00:02:\nFrame from 00:03:\nFrame from 00:04:\nFrame from 00:05:\nFrame from 00:06:\nFrame from 00:08:\nFrame from 00:09:\n\nDescribe this video in detail\nAssistant: The video showcases a large language model architecture, specifically a "Quick Brown" model, which is designed',
}
Expand Down
Loading