invoke-ai · lstein · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 10, 2026
diff --git a/USER_ISOLATION_IMPLEMENTATION.md b/USER_ISOLATION_IMPLEMENTATION.md
diff --git a/docs/src/content/docs/start-here/system-requirements.mdx b/docs/src/content/docs/start-here/system-requirements.mdx
@@ -2,7 +2,7 @@
 title: Hardware Requirements
 sidebar:
   order: 1
-lastUpdated: 2026-02-18
+lastUpdated: 2026-05-11
 ---
 
 import { Tabs, TabItem, Steps } from '@astrojs/starlight/components'
@@ -28,6 +28,8 @@ The requirements below are rough guidelines for best performance. GPUs with less
 | FLUX.2 Klein 4B | 1024x1024 | Nvidia 30xx+ | 12GB | 16GB | FP8 works with 8GB+; Diffusers + encoder |
 | FLUX.2 Klein 9B | 1024x1024 | Nvidia 40xx | 24GB | 32GB | FP8 works with 12GB+; Diffusers + encoder |
 | Z-Image Turbo | 1024x1024 | Nvidia 20xx+ | 8GB | 16GB | Q4_K 8GB; Q8/BF16 16GB+ |
+| Wan 2.2 A14B (T2V/I2V) | 1280x720 | Nvidia 30xx+ | 12GB | 32GB | Dual-expert MoE; Q4_K_M 12GB; Q8 18GB+; Diffusers requires 32GB+ |
+| Wan 2.2 TI2V-5B | 1280x720 | Nvidia 20xx+ | 8GB | 16GB | Single transformer; Q4_K_M 6GB+; Q8 8GB+; Diffusers 12GB+ |
 
 :::tip[`tmpfs` on Linux]
   If your temporary directory is mounted as a `tmpfs`, ensure it has sufficient space.

@@ -62,6 +62,7 @@
     QwenImageConditioningInfo,
     SD3ConditioningInfo,
     SDXLConditioningInfo,
+    WanConditioningInfo,
     ZImageConditioningInfo,
 )
 from invokeai.backend.util.logging import InvokeAILogger
@@ -152,6 +153,7 @@ def initialize(
                     ZImageConditioningInfo,
                     QwenImageConditioningInfo,
                     AnimaConditioningInfo,
+                    WanConditioningInfo,
                 ],
                 ephemeral=True,
             ),

@@ -173,6 +173,9 @@ class FieldDescriptions:
     z_image_model = "Z-Image model (Transformer) to load"
     qwen_image_model = "Qwen Image Edit model (Transformer) to load"
     qwen_vl_encoder = "Qwen2.5-VL tokenizer, processor and text/vision encoder"
+    wan_model = "Wan 2.2 model (Transformer) to load"
+    wan_t5_encoder = "UMT5-XXL tokenizer and text encoder for Wan 2.2"
+    wan_ref_image = "Reference-image (VAE-latent) conditioning for Wan 2.2 I2V."
     sdxl_main_model = "SDXL Main model (UNet, VAE, CLIP1, CLIP2) to load"
     sdxl_refiner_model = "SDXL Refiner Main Modde (UNet, VAE, CLIP2) to load"
     onnx_main_model = "ONNX Main model (UNet, VAE, CLIP) to load"
@@ -364,6 +367,33 @@ class AnimaConditioningField(BaseModel):
     )
 
 
+class WanConditioningField(BaseModel):
+    """A Wan 2.2 conditioning primitive, referenced by name rather than by value.
+
+    Wan conditioning is the UMT5-XXL hidden state for the prompt plus an attention
+    mask marking valid (non-padding) tokens; this field carries only its name.
+    """
+
+    conditioning_name: str = Field(description="The name of conditioning tensor")
+
+
+class WanRefImageConditioningField(BaseModel):
+    """Reference-image conditioning for Wan 2.2 I2V.
+
+    Names the saved 20-channel VAE-latent condition tensor (4-channel first-frame
+    mask + 16-channel ref-image latents). The denoise loop concatenates this
+    to the 16-channel noise latents along the channel dim each step, producing
+    the 36-channel input the I2V-A14B transformer expects.
+
+    Also carries the image width/height used during VAE encoding, so the denoise
+    node can sanity-check that the user's width/height match.
+    """
+
+    condition_tensor_name: str = Field(description="Name of the saved [1, 20, 1, H/8, W/8] condition tensor.")
+    width: int = Field(description="Image width used during VAE encoding (matches denoise width).")
+    height: int = Field(description="Image height used during VAE encoding (matches denoise height).")
+
+
 class ConditioningField(BaseModel):
     """A conditioning tensor primitive value"""
 

@@ -174,6 +174,11 @@ def invoke(self, context: InvocationContext) -> MetadataOutput:
     "anima_img2img",
     "anima_inpaint",
     "anima_outpaint",
+    "wan_txt2img",
+    "wan_img2img",
+    "wan_inpaint",
+    "wan_outpaint",
+    "wan_i2v",
 ]
 
 

@@ -87,6 +87,14 @@ class Qwen3EncoderField(BaseModel):
     loras: List[LoRAField] = Field(default_factory=list, description="LoRAs to apply on model loading")
 
 
+class WanT5EncoderField(BaseModel):
+    """Tokenizer/text-encoder submodel identifiers and LoRAs for the UMT5-XXL encoder used by Wan 2.2 models."""
+
+    tokenizer: ModelIdentifierField = Field(description="Info to load tokenizer submodel")
+    text_encoder: ModelIdentifierField = Field(description="Info to load text_encoder submodel")
+    loras: List[LoRAField] = Field(default_factory=list, description="LoRAs to apply on model loading")
+
+
 class VAEField(BaseModel):
     vae: ModelIdentifierField = Field(description="Info to load vae submodel")
     seamless_axes: List[str] = Field(default_factory=list, description='Axes("x" and "y") to which apply seamless')
@@ -101,6 +109,46 @@ class TransformerField(BaseModel):
     loras: List[LoRAField] = Field(description="LoRAs to apply on model loading")
 
 
+class WanTransformerField(BaseModel):
+    """Transformer field for Wan 2.2 models.
+
+    Wan 2.2 A14B is a Mixture-of-Experts model with two transformer experts:
+    a high-noise expert (active at large timesteps) and a low-noise expert
+    (active at small timesteps). TI2V-5B is a single-transformer model and only
+    populates ``transformer``; ``transformer_low_noise`` stays ``None``.
+
+    ``boundary_ratio`` matches Diffusers' ``WanPipeline`` semantics: it's the
+    boundary timestep as a fraction of ``num_train_timesteps`` (typically 1000),
+    so ``boundary_ratio=0.875`` means the high-noise expert handles t >= 875 and
+    the low-noise expert handles t < 875. It is validated to lie in [0.0, 1.0].
+    """
+
+    transformer: ModelIdentifierField = Field(
+        description="Primary transformer submodel. For A14B this is the high-noise expert."
+    )
+    transformer_low_noise: ModelIdentifierField | None = Field(
+        default=None,
+        description="Low-noise transformer expert (Wan 2.2 A14B only). None for TI2V-5B.",
+    )
+    loras: List[LoRAField] = Field(
+        default_factory=list,
+        description="LoRAs to apply to the primary transformer. For A14B applied to the high-noise expert.",
+    )
+    loras_low_noise: List[LoRAField] = Field(
+        default_factory=list,
+        description="Optional separate LoRAs for the low-noise expert (Wan 2.2 A14B). "
+        "If empty and transformer_low_noise is set, the primary 'loras' list is reused.",
+    )
+    boundary_ratio: float = Field(
+        default=0.875,
+        ge=0.0,
+        le=1.0,
+        description="Boundary timestep as a fraction of num_train_timesteps (Wan 2.2 A14B only). "
+        "High-noise expert: t >= boundary_ratio * num_train_timesteps. Low-noise expert: t below. "
+        "Ignored for TI2V-5B.",
+    )
+
+
 @invocation_output("unet_output")
 class UNetOutput(BaseInvocationOutput):
     """Base class for invocations that output a UNet field."""

@@ -29,6 +29,8 @@
     SD3ConditioningField,
     TensorField,
     UIComponent,
+    WanConditioningField,
+    WanRefImageConditioningField,
     ZImageConditioningField,
 )
 from invokeai.app.services.images.images_common import ImageDTO
@@ -497,6 +499,35 @@ def build(cls, conditioning_name: str) -> "AnimaConditioningOutput":
         return cls(conditioning=AnimaConditioningField(conditioning_name=conditioning_name))
 
 
+@invocation_output("wan_conditioning_output")
+class WanConditioningOutput(BaseInvocationOutput):
+    """Invocation output carrying one Wan 2.2 text conditioning field; ``build`` wraps a conditioning name."""
+
+    conditioning: WanConditioningField = OutputField(description=FieldDescriptions.cond)
+
+    @classmethod
+    def build(cls, conditioning_name: str) -> "WanConditioningOutput":
+        return cls(conditioning=WanConditioningField(conditioning_name=conditioning_name))
+
+
+@invocation_output("wan_ref_image_output")
+class WanRefImageOutput(BaseInvocationOutput):
+    """Output of a Wan 2.2 reference-image VAE-encoder: a named condition tensor plus its encode width/height."""
+
+    ref_image: WanRefImageConditioningField = OutputField(
+        description="VAE-latent reference-image conditioning for Wan 2.2 I2V.",
+        title="Reference Image",
+    )
+
+    @classmethod
+    def build(cls, condition_tensor_name: str, width: int, height: int) -> "WanRefImageOutput":
+        return cls(
+            ref_image=WanRefImageConditioningField(
+                condition_tensor_name=condition_tensor_name, width=width, height=height
+            )
+        )
+
+
 @invocation_output("conditioning_output")
 class ConditioningOutput(BaseInvocationOutput):
     """Base class for nodes that output a single conditioning tensor"""