diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 0a718ba6d512..0edb00aacd6a 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -26,6 +26,8 @@ title: Exporting to production - local: modular_transformers title: Contributing a new model to Transformers + - local: add_vision_processing_components + title: Add vision processing components - local: add_new_model title: Legacy model contribution - local: auto_docstring diff --git a/docs/source/en/add_vision_processing_components.md b/docs/source/en/add_vision_processing_components.md new file mode 100644 index 000000000000..a77a19a36d1d --- /dev/null +++ b/docs/source/en/add_vision_processing_components.md @@ -0,0 +1,88 @@ + + +# Add vision processing components + +Adding a vision language model (VLM) requires two image processor classes on top of the standard [modular](./modular_transformers) approach. + +> [!NOTE] +> For the modeling and config steps, follow the [modular](./modular_transformers) guide first. + +- [torchvision](https://docs.pytorch.org/vision/stable/index.html) backend is the default and supports GPU acceleration. +- [PIL](https://pillow.readthedocs.io/en/stable/index.html) backend is a fallback when no GPU is available. + +Both classes share the same preprocessing logic but have different backends. Their constructor signatures and default values must be identical. [`AutoImageProcessor.from_pretrained()`] selects the backend at load time and falls back to PIL when torchvision isn't available. Mismatched signatures cause the same saved config to behave differently across environments. + +## torchvision + +Create `image_processing_<model_name>.py` with a class that inherits from [`TorchvisionBackend`]. Define a kwargs class first if your processor needs custom parameters beyond the standard [`ImagesKwargs`].

```py
from ...image_processing_backends import TorchvisionBackend
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import auto_docstring

class MyModelImageProcessorKwargs(ImagesKwargs, total=False):
    tile_size: int  # any model-specific kwargs

@auto_docstring
class MyModelImageProcessor(TorchvisionBackend):
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"shortest_edge": 224}
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True

    def __init__(self, **kwargs: Unpack[MyModelImageProcessorKwargs]):
        super().__init__(**kwargs)
```

## PIL

Create `image_processing_pil_<model_name>.py` with a class that inherits from [`PilBackend`]. Import the kwargs class from the torchvision file, but don't redefine it. Sharing the same class keeps both backends' kwargs in sync. For processors with no custom parameters, use [`ImagesKwargs`] directly.

```py
from ...image_processing_backends import PilBackend
from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
from ...utils import auto_docstring
from .image_processing_<model_name> import MyModelImageProcessorKwargs

@auto_docstring
class MyModelImageProcessorPil(PilBackend):
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"shortest_edge": 224}
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True

    def __init__(self, **kwargs: Unpack[MyModelImageProcessorKwargs]):
        super().__init__(**kwargs)
```

> [!TIP]
> See [`CLIPImageProcessor`]/[`CLIPImageProcessorPil`] and [`LlavaOnevisionImageProcessor`]/[`LlavaOnevisionImageProcessorPil`] for reference.

## Next steps

- Read the [Auto-generating docstrings](./auto_docstring) guide to auto-generate consistent docstrings with `@auto_docstring`.
+- Read the [Writing model tests](./testing) guide to write integration tests for your model.