Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ dependencies = [
"uvloop>=0.18",
"torch",
"more-itertools>=10.8.0",
"lazy-loader @ git+https://github.com/sjmonson/lazy-loader.git@feat/lazy_submodules",
]

[project.optional-dependencies]
Expand Down
16 changes: 3 additions & 13 deletions src/guidellm/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
handlers for processing streaming and non-streaming API responses.
"""

from guidellm.extras.vllm import HAS_VLLM

from .backend import Backend, BackendArgs, BackendType
from .openai import (
AudioRequestHandler,
Expand All @@ -20,13 +18,7 @@
OpenAIRequestHandlerFactory,
TextCompletionsRequestHandler,
)

# Conditionally import VLLM backend if available
if HAS_VLLM:
from .vllm_python import VLLMPythonBackend, VLLMResponseHandler
else:
VLLMPythonBackend = None # type: ignore[assignment, misc]
VLLMResponseHandler = None # type: ignore[assignment, misc]
from .vllm_python import VLLMPythonBackend, VLLMResponseHandler

__all__ = [
"AudioRequestHandler",
Expand All @@ -38,8 +30,6 @@
"OpenAIRequestHandler",
"OpenAIRequestHandlerFactory",
"TextCompletionsRequestHandler",
"VLLMPythonBackend",
"VLLMResponseHandler",
]

# Conditionally add VLLM backend and handler to exports
if HAS_VLLM:
__all__.extend(["VLLMPythonBackend", "VLLMResponseHandler"])
69 changes: 14 additions & 55 deletions src/guidellm/backends/vllm_python/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,7 @@

from guidellm.backends.backend import Backend, BackendArgs
from guidellm.backends.vllm_python.vllm_response import VLLMResponseHandler
from guidellm.extras.vllm import (
HAS_VLLM,
AsyncEngineArgs,
AsyncLLMEngine,
RequestOutput,
SamplingParams,
)
from guidellm.extras import audio, vision, vllm
from guidellm.logger import logger
from guidellm.schemas import (
GenerationRequest,
Expand All @@ -37,22 +31,6 @@
StandardBaseModel,
)

try:
from guidellm.extras.audio import _decode_audio

HAS_AUDIO = True
except ImportError:
_decode_audio = None # type: ignore[assignment]
HAS_AUDIO = False

try:
from guidellm.extras.vision import image_dict_to_pil

HAS_VISION = True
except ImportError:
image_dict_to_pil = None # type: ignore[assignment]
HAS_VISION = False

# Sentinel for "chat template not yet resolved" cache.
_CHAT_TEMPLATE_UNSET: object = object()

Expand Down Expand Up @@ -124,14 +102,6 @@ class _ResolvedRequest(StandardBaseModel):
)


def _check_vllm_available() -> None:
"""Check if vllm is available and raise helpful error if not."""
if not HAS_VLLM:
raise ImportError(
"vllm is not installed. Install vllm to use the vllm python backend."
)


def _has_jinja2_markers(s: str) -> bool:
"""Return True if the string contains Jinja2 template syntax ({{, {%, or {#)."""
return "{{" in s or "{%" in s or "{#" in s
Expand Down Expand Up @@ -197,7 +167,6 @@ def __init__(
:param audio_placeholder: Optional string to use as the audio placeholder when
using audio_column; if unset, falls back to "<|audio|>".
"""
_check_vllm_available()
super().__init__(type_="vllm_python")

self.model = model
Expand All @@ -209,7 +178,7 @@ def __init__(

# Runtime state
self._in_process = False
self._engine: AsyncLLMEngine | None = None
self._engine: vllm.AsyncLLMEngine | None = None
self._resolved_chat_template: str | None | object = _CHAT_TEMPLATE_UNSET

@property
Expand Down Expand Up @@ -270,8 +239,8 @@ async def process_startup(self):
if self._in_process:
raise RuntimeError("Backend already started up for process.")

engine_args = AsyncEngineArgs(**self.vllm_config) # type: ignore[misc]
self._engine = AsyncLLMEngine.from_engine_args(engine_args) # type: ignore[misc]
engine_args = vllm.AsyncEngineArgs(**self.vllm_config)
self._engine = vllm.AsyncLLMEngine.from_engine_args(engine_args)
self._in_process = True

async def process_shutdown(self):
Expand Down Expand Up @@ -320,7 +289,7 @@ async def default_model(self) -> str:
"""
return self.model

def _validate_backend_initialized(self) -> AsyncLLMEngine:
def _validate_backend_initialized(self) -> vllm.AsyncLLMEngine:
"""
Validate that the backend is initialized and return the engine.

Expand Down Expand Up @@ -360,14 +329,9 @@ def _build_multi_modal_data_from_columns( # noqa: C901, PLR0912
for item in image_items:
if not item or not isinstance(item, dict):
continue
if not HAS_VISION or image_dict_to_pil is None:
raise ImportError(
"Image column support requires guidellm[vision]. "
"Install with: pip install 'guidellm[vision]'"
)
# Convert raw image dicts into PIL Images as required by vLLM's vision
# processor
pil_image = image_dict_to_pil(item)
pil_image = vision.image_dict_to_pil(item)
if "image" not in multi_modal_data:
multi_modal_data["image"] = pil_image
else:
Expand All @@ -390,15 +354,10 @@ def _build_multi_modal_data_from_columns( # noqa: C901, PLR0912
else:
audio_bytes = first.get("audio")
if isinstance(audio_bytes, bytes) and len(audio_bytes) > 0:
if not HAS_AUDIO or _decode_audio is None:
raise ImportError(
"Audio column support requires guidellm[audio]. "
"Install with: pip install 'guidellm[audio]'"
)
try:
# Decode raw audio bytes into an array since vLLM audio models
# expect either raw numpy arrays or specific tensor formats
audio_samples = _decode_audio(audio_bytes)
audio_samples = audio._decode_audio(audio_bytes) # noqa: SLF001
# torchcodec decodes audio on CPU, so .data is always
# a CPU torch.Tensor. .cpu() is a no-op on CPU tensors.
audio_array = audio_samples.data.cpu().numpy()
Expand Down Expand Up @@ -731,7 +690,7 @@ def _update_token_timing(
request_info.timings.last_token_iteration = iter_time
request_info.timings.token_iterations += iterations

def _text_from_output(self, output: RequestOutput | None) -> str:
def _text_from_output(self, output: vllm.RequestOutput | None) -> str:
"""
Extract generated text from VLLM RequestOutput.

Expand All @@ -744,7 +703,7 @@ def _text_from_output(self, output: RequestOutput | None) -> str:

def _stream_usage_tokens(
self,
output: RequestOutput,
output: vllm.RequestOutput,
request_info: RequestInfo,
) -> tuple[int, int]:
"""
Expand All @@ -770,7 +729,7 @@ def _stream_usage_tokens(

def _usage_from_output(
self,
output: RequestOutput | None,
output: vllm.RequestOutput | None,
*,
request_info: RequestInfo | None = None,
) -> dict[str, int] | None:
Expand Down Expand Up @@ -805,7 +764,7 @@ def _build_final_response(
self,
request: GenerationRequest,
request_info: RequestInfo,
final_output: RequestOutput | None,
final_output: vllm.RequestOutput | None,
stream: bool,
text: str = "",
) -> tuple[GenerationResponse, RequestInfo] | None:
Expand All @@ -832,7 +791,7 @@ def _build_final_response(
def _create_sampling_params(
self,
max_tokens_override: int | None = None,
) -> SamplingParams:
) -> vllm.SamplingParams:
"""
Create VLLM SamplingParams.

Expand All @@ -850,7 +809,7 @@ def _create_sampling_params(
params["max_tokens"] = max_tokens_override
params["ignore_eos"] = True

return SamplingParams(**params) # type: ignore[misc]
return vllm.SamplingParams(**params)

def _raise_generation_error(self, exc: BaseException) -> None:
"""Re-raise generation failure with context.
Expand Down Expand Up @@ -895,7 +854,7 @@ async def _run_generation(
request_info: RequestInfo,
stream: bool,
generate_input: str | dict[str, Any],
sampling_params: SamplingParams,
sampling_params: vllm.SamplingParams,
request_id: str,
state: dict[str, Any],
) -> AsyncIterator[tuple[GenerationResponse, RequestInfo]]:
Expand Down
26 changes: 5 additions & 21 deletions src/guidellm/data/preprocessors/encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
DatasetPreprocessor,
PreprocessorRegistry,
)
from guidellm.extras import audio as guidellm_audio
from guidellm.extras import vision as guidellm_vision

__all__ = ["MediaEncoder"]

Expand All @@ -27,24 +29,6 @@ def __init__(
encode_kwargs.get("video", {}) if encode_kwargs else {}
)

@staticmethod
def encode_audio(*args, **kwargs):
from guidellm.extras.audio import encode_audio

return encode_audio(*args, **kwargs)

@staticmethod
def encode_image(*args, **kwargs):
from guidellm.extras.vision import encode_image

return encode_image(*args, **kwargs)

@staticmethod
def encode_video(*args, **kwargs):
from guidellm.extras.vision import encode_video

return encode_video(*args, **kwargs)

def __call__(self, items: list[dict[str, list[Any]]]) -> list[dict[str, list[Any]]]:
return [self.encode_turn(item) for item in items]

Expand All @@ -56,7 +40,7 @@ def encode_turn(self, columns: dict[str, list[Any]]) -> dict[str, list[Any]]:
continue

encoded_audio.append(
self.encode_audio(audio, **self.encode_audio_kwargs)
guidellm_audio.encode_audio(audio, **self.encode_audio_kwargs)
)
columns["audio_column"] = encoded_audio

Expand All @@ -67,7 +51,7 @@ def encode_turn(self, columns: dict[str, list[Any]]) -> dict[str, list[Any]]:
continue

encoded_images.append(
self.encode_image(image, **self.encode_image_kwargs)
guidellm_vision.encode_image(image, **self.encode_image_kwargs)
)
columns["image_column"] = encoded_images

Expand All @@ -78,7 +62,7 @@ def encode_turn(self, columns: dict[str, list[Any]]) -> dict[str, list[Any]]:
continue

encoded_videos.append(
self.encode_video(video, **self.encode_video_kwargs)
guidellm_vision.encode_video(video, **self.encode_video_kwargs)
)
columns["video_column"] = encoded_videos

Expand Down
20 changes: 19 additions & 1 deletion src/guidellm/extras/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
"""
Code that depends on optional dependencies.
Each submodule should be imported lazily (deferred until first access).

All dependent code should import in one of two ways:

1. import guidellm.extras
2. from guidellm.extras import submodule

As most of the codebase imports eagerly, importing specific functions or classes may
cause ImportErrors if the optional dependencies are missing. Importing at the module or
submodule level ensures such errors are deferred to the point of use.
"""

import lazy_loader as lazy

submodules = ["vllm", "vision", "audio"]

__getattr__, __dir__, __all__ = lazy.attach(
__name__,
submodules=submodules,
lazy_submodules=True, # Only import submodules when accessed
)
2 changes: 1 addition & 1 deletion src/guidellm/extras/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from torchcodec.decoders import AudioDecoder
from torchcodec.encoders import AudioEncoder
except ImportError as e:
raise ImportError("Please install guidellm[audio] to use audio features") from e
raise AttributeError("Please install guidellm[audio] to use audio features") from e

__all__ = [
"encode_audio",
Expand Down
2 changes: 1 addition & 1 deletion src/guidellm/extras/vision.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
try:
from PIL import Image as PILImage
except ImportError as e:
raise ImportError(
raise AttributeError(
"Please install guidellm[vision] to use image/video features"
) from e

Expand Down
26 changes: 14 additions & 12 deletions src/guidellm/extras/vllm.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""
Lazy wrapper module exposing the same public interface as vLLM.
"""

try:
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.outputs import RequestOutput

HAS_VLLM = True
except ImportError:
AsyncLLMEngine = None # type: ignore[assignment, misc]
AsyncEngineArgs = None # type: ignore[assignment, misc]
SamplingParams = None # type: ignore[assignment, misc]
RequestOutput = None # type: ignore[assignment, misc]
HAS_VLLM = False
import vllm
except ImportError as e:
raise AttributeError("Please install vllm to use vLLM features") from e


def __getattr__(name: str):
return getattr(vllm, name)


__all__ = vllm.__all__
10 changes: 10 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading