-
Notifications
You must be signed in to change notification settings - Fork 682
Make fp8 models quantized by llm-compressor inferable in TurboMind #4509
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -31,7 +31,8 @@ def get_input_model_registered_name(model_path: str, model_format: str): | |||||
| return register_name | ||||||
|
|
||||||
|
|
||||||
| def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int): | ||||||
| def get_output_model_registered_name_and_config(model_path: str, model_format: str, dtype: str, group_size: int, | ||||||
| quantized_format: str): | ||||||
| """Get the registered name of the turbomind model and its configuration | ||||||
| according to the input model path, format and user-input config. The name | ||||||
| will be used to access the OUTPUT_MODELS registry. | ||||||
|
|
@@ -42,6 +43,8 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s | |||||
| ['hf', 'awq', 'gptq'] | ||||||
| dtype (str): the data type of the model's weights and activations | ||||||
| group_size (int): the size of group used by awq model | ||||||
| quantized_format (str): the quantized format of compressed-tensors model, | ||||||
| which can be one of ['pack-quantized', 'float-quantized'] | ||||||
| """ | ||||||
|
Comment on lines
68
to
82
|
||||||
| register_name = 'tm' | ||||||
|
|
||||||
|
|
@@ -75,11 +78,18 @@ def get_output_model_registered_name_and_config(model_path: str, model_format: s | |||||
| session_len = _get_and_verify_max_len(model_config, None) | ||||||
|
|
||||||
| if model_format in ['awq', 'gptq', 'compressed-tensors']: | ||||||
| weight_type = 'int4' | ||||||
| dtype = 'float16' # force float16 for int4 quantized weights | ||||||
| if model_format in ['awq', 'gptq']: | ||||||
| weight_type = 'int4' | ||||||
| dtype = 'float16' # force float16 for int4 quantized weights | ||||||
| elif model_format == 'compressed-tensors': | ||||||
| if quantized_format == 'pack-quantized': | ||||||
| weight_type = 'int4' | ||||||
| model_format = 'awq' | ||||||
| dtype = 'float16' # force float16 for int4 quantized weights | ||||||
| elif quantized_format == 'float-quantized': | ||||||
| weight_type = 'fp8' | ||||||
| model_format = 'fp8' | ||||||
| group_size = 128 if group_size == 0 else group_size | ||||||
|
||||||
| if model_format == 'compressed-tensors': | ||||||
| model_format = 'awq' | ||||||
| elif model_format == 'fp8': | ||||||
| weight_type = 'fp8' | ||||||
| group_size = 128 | ||||||
|
|
@@ -196,18 +206,23 @@ def get_tm_model(model_path, | |||||
| _group_size = 32 | ||||||
| elif quant_method == 'compressed-tensors': | ||||||
| _format = quant_config['config_groups']['group_0']['format'] | ||||||
| assert _format == 'pack-quantized', ('compressed-tennsors only supports pack-quantized format, ' | ||||||
| f'but got {_format}') | ||||||
| assert _format in ['pack-quantized', 'float-quantized' | ||||||
| ], ('compressed-tennsors only supports pack-quantized/float-quantized format, ' | ||||||
|
||||||
| ], ('compressed-tennsors only supports pack-quantized/float-quantized format, ' | |
| ], ('compressed-tensors only supports pack-quantized/float-quantized format, ' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"pack-quantized requires int4 or fp8, but got type {_type} and {_num_bits} bits"
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -78,6 +78,15 @@ def __call__(self, f, g, i): | |
| f(i, g('weight'), 'weight', identity) | ||
|
|
||
|
|
||
| class WeightScale(Parameter): | ||
| KEYS = '.weight_scale', '.weight' | ||
|
|
||
| # TODO: flag any operations crossing the quant blocks as illegal | ||
| def __call__(self, f, g, i): | ||
| f(i, g('weight_scale'), 'scales', to_float, apply_gs=['w1', 'w3', 'w2']) | ||
| f(i, g('weight'), 'weight', identity) | ||
|
Comment on lines
+115
to
+121
|
||
|
|
||
|
|
||
| class CompressedWeight(Parameter): | ||
| KEYS = '.weight_packed', '.weight_scale', '.weight_zero_point' | ||
|
|
||
|
|
@@ -133,6 +142,8 @@ def get_params(keys: list[str], bias=0): | |
| ps.append(QuantWeightOnly()) | ||
| if WeightScaleInv.take(keys): | ||
| ps.append(WeightScaleInv()) | ||
| if WeightScale.take(keys): | ||
| ps.append(WeightScale()) | ||
| xs = CompressedWeight.take(keys) | ||
| if xs: | ||
| ps.append(CompressedWeight(xs)) | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -68,7 +68,7 @@ def process_fp8(x: torch.Tensor, kind: str): | |||||||
| return x.to(dtype=torch.bfloat16) | ||||||||
|
|
||||||||
|
|
||||||||
| def process_compressed_tensor(x: torch.Tensor, kind: str): | ||||||||
| def process_compressed_packed_tensor(x: torch.Tensor, kind: str): | ||||||||
| x = x.cuda() | ||||||||
| if x.dtype == torch.int32: | ||||||||
| xs = get_u4_slices(x, torch.uint8) | ||||||||
|
|
@@ -79,7 +79,7 @@ def process_compressed_tensor(x: torch.Tensor, kind: str): | |||||||
| return x | ||||||||
|
|
||||||||
|
|
||||||||
| def get_input_policy(model_format): | ||||||||
| def get_input_policy(model_format, quantized_format=None): | ||||||||
| if model_format == 'awq': | ||||||||
| return process_awq_gemm | ||||||||
| elif model_format == 'gptq': | ||||||||
|
|
@@ -89,6 +89,9 @@ def get_input_policy(model_format): | |||||||
| elif model_format == 'fp8': | ||||||||
| return process_fp8 | ||||||||
| elif model_format == 'compressed-tensors': | ||||||||
| return process_compressed_tensor | ||||||||
| if quantized_format == 'pack-quantized': | ||||||||
| return process_compressed_packed_tensor | ||||||||
| elif quantized_format == 'float-quantized': | ||||||||
| return process_fp8 | ||||||||
|
||||||||
| return process_fp8 | |
| return process_fp8 | |
| raise ValueError(f'Unsupported quantized_format for compressed-tensors: {quantized_format!r}') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Docstring for
`get_output_model_registered_name_and_config()` still says `model_format` is one of `['hf', 'awq', 'gptq']`, but the function now handles `fp8`, `mxfp4`, and `compressed-tensors` as well. Please update the docstring to reflect the supported values (and clarify how `compressed-tensors` maps to `awq`/`fp8`).