InternLM · 43758726 · Apr 2, 2026 · Apr 2, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/lmdeploy/lite/apis/auto_awq.py b/lmdeploy/lite/apis/auto_awq.py
@@ -8,12 +8,11 @@
 import torch
 from torch import nn
 
+from lmdeploy.lite.apis.calibrate import LAYER_TYPE_MAP, calibrate
 from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP, awq_layers, quant_weights, smooth_layers
 from lmdeploy.lite.utils import collect_target_modules
 from lmdeploy.utils import try_import_deeplink
 
-from .calibrate import LAYER_TYPE_MAP, calibrate
-
 
 def save_vl_model(vl_model, model_path, dst_path):
     vl_model.save_pretrained(dst_path, safe_serialization=True)

diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
@@ -251,23 +251,25 @@ def calibrate(model: str,
         model = load_hf_from_pretrained(model, dtype=dtype, trust_remote_code=True)
         vl_model = None
     elif model_type == 'vlm':
+        from transformers import AutoConfig
+        original_torch_dtype = AutoConfig.from_pretrained(model, trust_remote_code=True).torch_dtype
         vl_model = load_vl_model(model, backend=None, with_llm=True).vl_model
         model = vl_model
         if hasattr(vl_model, 'language_model'):  # deepseek-vl, ...
             model = vl_model.language_model
         if hasattr(vl_model, 'llm'):  # MiniCPMV, ...
             model = vl_model.llm
         model.config.use_cache = False
-        if dtype == 'float16':
+        if hasattr(model.config, 'text_config'):
+            model.config.text_config.use_cache = False
+        elif hasattr(model.config, 'llm_config'):
+            model.config.llm_config.use_cache = False
+        if dtype == 'float16' or (dtype == 'auto' and original_torch_dtype == torch.float16):
             model.half()
-        elif dtype == 'bfloat16':
+        elif dtype == 'bfloat16' or (dtype == 'auto' and original_torch_dtype == torch.bfloat16):
             assert torch.cuda.is_bf16_supported(
             ), 'your device does not support bfloat16 please set --dtype float16'  # noqa
             model.to(torch.bfloat16)
-        elif dtype == 'auto' and model.config.torch_dtype == torch.bfloat16:
-            print('Warning: we cast model to float16 to prevent OOM. You'
-                  ' may enforce it bfloat16 by `--dtype bfloat16`')
-            model.half()
         model.eval()
 
     model_type = type(model).__name__

diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
@@ -236,7 +236,10 @@ def smooth_fc_fcs(pre_fc: torch.nn.Module,
               'clamping w_scales.pow(1 - alpha) to 1e-4')
         w_scales_pow = w_scales_pow.clamp(min=1e-4)
     scales = (act_scales.pow(alpha) / w_scales_pow).clamp(min=1e-4).to(device).to(dtype)
-    scales = scales / (scales.max() * scales.min()).sqrt()
+    # Normalize in float32 so that scales.max() * scales.min() cannot
+    # overflow to inf in a low-precision dtype; cast back to `dtype` once.
+    denom = (scales.max().float() * scales.min().float()).sqrt()
+    scales = (scales.float() / denom).to(device=device, dtype=dtype)
 
     # (for qwen&baichuan) pre_fc is packed QKV, only V needs to scale
     # phi3 fused qkv and gate_up

diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
@@ -53,9 +53,9 @@ def __init__(self,
         self.norm_type = norm_type
         self.batch_size = batch_size
 
-        num_kv_heads, num_attn_heads = self._guess_num_heads(model)
+        num_kv_heads, num_attn_heads, text_config = self._guess_num_heads(model)
         self.num_kv_heads = num_kv_heads
-        self.head_dim = model.config.hidden_size // num_attn_heads
+        self.head_dim = text_config.hidden_size // num_attn_heads
         self.model = model
 
         self.tokenizer = tokenizer
@@ -80,14 +80,21 @@ def __init__(self,
 
     def _guess_num_heads(self, model):
 
-        if hasattr(model.config, 'num_key_value_heads'):
-            num_kv_heads = model.config.num_key_value_heads
+        if hasattr(model.config, 'text_config'):
+            text_config = model.config.text_config
+        elif hasattr(model.config, 'llm_config'):
+            text_config = model.config.llm_config
         else:
-            num_kv_heads = model.config.num_attention_heads
+            text_config = model.config
 
-        num_attn_heads = model.config.num_attention_heads
+        if hasattr(text_config, 'num_key_value_heads'):
+            num_kv_heads = text_config.num_key_value_heads
+        else:
+            num_kv_heads = text_config.num_attention_heads
+
+        num_attn_heads = text_config.num_attention_heads
 
-        return num_kv_heads, num_attn_heads
+        return num_kv_heads, num_attn_heads, text_config
 
     def _init_input_observers(self, name2mod):
         """Initialize input observers for given modules."""

diff --git a/lmdeploy/lite/utils/load.py b/lmdeploy/lite/utils/load.py
@@ -66,10 +66,19 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, dtype: Literal['float
         torch_dtype = torch.bfloat16
     elif dtype == 'float16':
         torch_dtype = torch.float16
-    elif dtype == 'auto' and torch_dtype == torch.bfloat16:
-        print('Warning: we cast model to float16 to prevent OOM. '
-              'You may enforce it bfloat16 by `--dtype bfloat16`')
-        torch_dtype = torch.float16
 
+    # bfloat16 is not supported on every device. With dtype='auto' fall back
+    # to float16 (keeping the config's bf16/fp16 flags in sync when present);
+    # for any other dtype setting raise instead of silently changing precision.
+    if torch_dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported():
+        if dtype == 'auto':
+            torch_dtype = torch.float16
+            if hasattr(hf_config, 'bf16'):
+                hf_config.bf16 = False
+            if hasattr(hf_config, 'fp16'):
+                hf_config.fp16 = True
+        else:
+            raise RuntimeError('Your device does not supports bf16(bfloat16), '
+                               'please change to fp16(float16)')
+
     with LoadNoInit():
         # Load model

diff --git a/lmdeploy/pytorch/models/q_modules.py b/lmdeploy/pytorch/models/q_modules.py
@@ -48,7 +48,11 @@ def from_float(cls, mod: nn.Module, initialization: bool = True, quant_dtype=tor
         `initialization = True` for real init. `initialization = False` for dummy init.
         """
         hidden_size = mod.weight.shape[0]
-        eps = mod.variance_epsilon
+        # Read epsilon from `variance_epsilon`, else from `eps`, else use 1e-6.
+        # Compare against None explicitly so a falsy epsilon (0.0) is kept.
+        eps = getattr(mod, 'variance_epsilon', None)
+        if eps is None:
+            eps = getattr(mod, 'eps', 1e-6)
         q_mod = cls(hidden_size, eps, quant_dtype=quant_dtype)
         if initialization:
             q_mod.weight = nn.Parameter(mod.weight.detach())