huggingface · gagandhakrey · Apr 5, 2026 · Apr 5, 2026
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
@@ -867,8 +867,10 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor:
         """
         if encoder_attention_mask.dim() == 3:
             encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-        if encoder_attention_mask.dim() == 2:
+        elif encoder_attention_mask.dim() == 2:
             encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+        else:
+            raise ValueError(f"Wrong shape for encoder_attention_mask (shape {encoder_attention_mask.shape})")
         # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
         # encoder_extended_attention_mask = (encoder_extended_attention_mask ==
         # encoder_extended_attention_mask.transpose(-1, -2))