Skip to content

Commit 2d03817

Browse files
committed
refactor: introduce QuantPolicy enum and consolidate TurboQuant utilities
- Add `QuantPolicy` IntEnum to replace magic numbers (0, 4, 8, 42) for KV cache quantization policies: NONE, INT4, INT8, TURBO_QUANT
- Update `TurbomindEngineConfig` and `PytorchEngineConfig` to use the `QuantPolicy` type
- Extract TurboQuant utilities (Hadamard rotation, Lloyd-Max codebook) from `fill_kv_cache.py` into a new dedicated module, `turbo_quant.py`
- Rename `butterfly_rotate`/`butterfly_rotate_inv` to `hadamard_rotate`/`hadamard_rotate_inv` for naming accuracy (the transform uses a Hadamard matrix)
- Update all call sites across attention kernels, cache engine, and tests
- Update test fixtures and assertions to use `QuantPolicy` constants

This improves type safety and code readability, and maintains backward compatibility because the enum integer values match the previous magic numbers.
1 parent bbde920 commit 2d03817

File tree

17 files changed

+362
-301
lines changed

17 files changed

+362
-301
lines changed

lmdeploy/messages.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,14 @@
1616

1717
logger = get_logger('lmdeploy')
1818

19+
20+
class QuantPolicy(enum.IntEnum):
21+
"""Quantization policy constants for KV cache."""
22+
NONE = 0
23+
INT4 = 4 # 4-bit KV cache
24+
INT8 = 8 # 8-bit KV cache
25+
TURBO_QUANT = 42 # TurboQuant: K=4bit QJL4 + V=2bit MSE
26+
1927
LogitsProcessor = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
2028
"""LogitsProcessor is a function that takes a tensor of input_ids, the logits
2129
tensor for the next token, and returns a modified tensor of logits to sample
@@ -298,7 +306,8 @@ def __post_init__(self):
298306
assert self.dtype in ['auto', 'float16', 'bfloat16']
299307
assert self.tp >= 1, 'tp must be a positive integer'
300308
assert self.cache_max_entry_count > 0, 'invalid cache_max_entry_count'
301-
assert self.quant_policy in (0, 4, 8, 42), 'invalid quant_policy'
309+
assert self.quant_policy in (QuantPolicy.NONE, QuantPolicy.INT4, QuantPolicy.INT8, QuantPolicy.TURBO_QUANT), \
310+
'invalid quant_policy'
302311
assert self.rope_scaling_factor >= 0, 'invalid rope_scaling_factor'
303312
assert self.max_prefill_token_num >= 0, \
304313
'invalid max_prefill_token_num'
@@ -403,7 +412,7 @@ class PytorchEngineConfig:
403412
custom_module_map: dict[str, str] = None
404413
download_dir: str = None
405414
revision: str = None
406-
quant_policy: Literal[0, 4, 8, 42] = 0
415+
quant_policy: QuantPolicy = QuantPolicy.NONE
407416
distributed_executor_backend: str = None
408417
empty_init: bool = False
409418
enable_microbatch: bool = False
@@ -440,7 +449,8 @@ def __post_init__(self):
440449
assert self.max_prefill_token_num >= 0, \
441450
'invalid max_prefill_token_num'
442451
assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks'
443-
assert self.quant_policy in (0, 4, 8, 42), 'invalid quant_policy'
452+
assert self.quant_policy in (QuantPolicy.NONE, QuantPolicy.INT4, QuantPolicy.INT8, QuantPolicy.TURBO_QUANT), \
453+
'invalid quant_policy'
444454
assert self.device_type in ['cuda', 'ascend', 'maca', 'camb'], (f'invalid device_type: {self.device_type}')
445455
assert self.block_size >= 16 and (self.block_size & (self.block_size - 1)) == 0, \
446456
f'block_size must be >= 16 and a power of 2, but got {self.block_size}'

lmdeploy/pytorch/backends/attention.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,12 @@
22
from abc import ABC, abstractmethod
33
from dataclasses import dataclass
44
from functools import lru_cache
5-
from typing import Generic, Literal, TypeVar
5+
from typing import Generic, TypeVar
66

77
import torch
88

9+
from lmdeploy.messages import QuantPolicy
10+
911

1012
@dataclass
1113
class AttentionMetadata:
@@ -18,7 +20,7 @@ class AttentionMetadata:
1820
fill_seqlens: torch.Tensor = None
1921
cu_seqlens_q: torch.Tensor = None
2022
cu_seqlens_k: torch.Tensor = None
21-
quant_policy: Literal[0, 4, 8, 42] = 0
23+
quant_policy: QuantPolicy = 0
2224

2325

2426
T = TypeVar('T', bound=AttentionMetadata)

lmdeploy/pytorch/backends/cuda/attention/default.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
from dataclasses import dataclass
3-
from typing import Literal
43

54
import torch
65

6+
from lmdeploy.messages import QuantPolicy
77
from lmdeploy.pytorch.backends.attention import AttentionImpl, AttentionMetadata
88
from lmdeploy.utils import get_logger
99

@@ -40,7 +40,7 @@ class TritonAttentionMetadata(AttentionMetadata):
4040
q_seqlens: torch.Tensor = None
4141
kv_start_loc: torch.Tensor = None
4242
kv_seqlens: torch.Tensor = None
43-
quant_policy: Literal[0, 4, 8, 42] = 0
43+
quant_policy: QuantPolicy = 0
4444
kv_flatten_size: int = None
4545
# flash mla
4646
tile_scheduler_metadata: torch.Tensor = None
@@ -279,15 +279,15 @@ def _forward_prefill(
279279
flatten_kv_layout=kv_layout,
280280
)
281281

282-
# For quant_policy==42, flattened K/V are in rotated domain.
282+
# For quant_policy==QuantPolicy.TURBO_QUANT, flattened K/V are in rotated domain.
283283
# Rotate Q to match, and inverse-rotate output afterwards.
284-
if quant_policy == 42:
285-
from lmdeploy.pytorch.kernels.cuda.fill_kv_cache import (
286-
butterfly_rotate,
287-
butterfly_rotate_inv,
284+
if quant_policy == QuantPolicy.TURBO_QUANT:
285+
from lmdeploy.pytorch.kernels.cuda.turbo_quant import (
286+
hadamard_rotate,
287+
hadamard_rotate_inv,
288288
)
289289
orig_dtype = query.dtype
290-
query = butterfly_rotate(query.float()).to(orig_dtype)
290+
query = hadamard_rotate(query.float()).to(orig_dtype)
291291

292292
attn_output = self.flash_attention_fwd(
293293
query,
@@ -309,8 +309,8 @@ def _forward_prefill(
309309
)
310310

311311
# Inverse-rotate output back to original domain
312-
if quant_policy == 42:
313-
attn_output = butterfly_rotate_inv(
312+
if quant_policy == QuantPolicy.TURBO_QUANT:
313+
attn_output = hadamard_rotate_inv(
314314
attn_output.float()
315315
).to(orig_dtype)
316316

lmdeploy/pytorch/backends/cuda/attention/mla.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import torch
66

7+
from lmdeploy.messages import QuantPolicy
78
from lmdeploy.utils import get_logger
89

910
from .default import TritonAttentionImpl, TritonAttentionMetadata
@@ -405,7 +406,7 @@ def _fill_kv_cache_impl(self,
405406
block_offsets = attn_metadata.block_offsets
406407
kv_seqlens = attn_metadata.kv_seqlens
407408
quant_policy = attn_metadata.quant_policy
408-
assert quant_policy == 0
409+
assert quant_policy == QuantPolicy.NONE
409410

410411
# fill seqlen args
411412
fill_seqlens, fill_max_q_seqlen, fill_q_start_loc = self._get_fill_meta(

lmdeploy/pytorch/config.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,11 @@
22
import enum
33
from collections.abc import Callable
44
from dataclasses import dataclass, field
5-
from typing import Any, Literal
5+
from typing import Any
66

77
import torch
88

9-
from lmdeploy.messages import PytorchEngineConfig
9+
from lmdeploy.messages import PytorchEngineConfig, QuantPolicy
1010
from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend
1111
from lmdeploy.pytorch.utils import maybe_register_config_serialize_by_value
1212
from lmdeploy.utils import get_logger, is_bf16_supported
@@ -98,7 +98,7 @@ class CacheConfig:
9898
cache_max_entry_count: float = 0.8
9999
max_prefill_token_num: int = 4096
100100
enable_prefix_caching: bool = False
101-
quant_policy: Literal[0, 4, 8, 42] = 0
101+
quant_policy: QuantPolicy = QuantPolicy.NONE
102102
device_type: str = 'cuda'
103103
num_state_caches: int = None
104104
states_shapes: list[tuple] = field(default_factory=list)

lmdeploy/pytorch/engine/cache_engine.py

Lines changed: 12 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
import math
55
from collections.abc import Sequence
66
from dataclasses import dataclass
7-
from typing import Literal
87

98
import torch
109

@@ -20,6 +19,7 @@
2019
)
2120
from lmdeploy.utils import get_logger
2221

22+
from ...messages import QuantPolicy
2323
from ..config import CacheConfig, ModelConfig
2424

2525
KVCache = tuple[torch.Tensor, torch.Tensor]
@@ -140,7 +140,7 @@ def _get_key_block_shape_impl(cls,
140140
block_size: int,
141141
head_size: int,
142142
world_size: int = 1,
143-
quant_policy: Literal[0, 4, 8, 42] = 0):
143+
quant_policy: QuantPolicy = 0):
144144
"""Get single block shape."""
145145
attn_backend = get_backend()
146146
dtype = model_config.dtype
@@ -155,7 +155,8 @@ def _get_key_block_shape_impl(cls,
155155
if model_config.use_mla_fp8_cache:
156156
return (block_size, num_heads, MLA_FP8_HEAD_DIM)
157157

158-
if quant_policy == 4 or quant_policy == 42: # pack head_dim to uint8 (4-bit)
158+
# pack head_dim to uint8 (4-bit)
159+
if quant_policy == QuantPolicy.INT4 or quant_policy == QuantPolicy.TURBO_QUANT:
159160
assert head_size % 2 == 0, \
160161
f'head_size: {head_size}, quant_policy: {quant_policy}'
161162
head_size = head_size // 2
@@ -167,7 +168,7 @@ def _get_value_block_shape_impl(cls,
167168
block_size: int,
168169
head_size: int,
169170
world_size: int = 1,
170-
quant_policy: Literal[0, 4, 8, 42] = 0):
171+
quant_policy: QuantPolicy = 0):
171172
"""Get single block shape."""
172173
attn_backend = get_backend()
173174
dtype = model_config.dtype
@@ -183,11 +184,11 @@ def _get_value_block_shape_impl(cls,
183184
# flash mla shared key and value
184185
return (block_size, num_heads, 0)
185186

186-
if quant_policy == 42: # pack head_dim to uint8 (2-bit for V cache)
187+
if quant_policy == QuantPolicy.TURBO_QUANT: # pack head_dim to uint8 (2-bit for V cache)
187188
assert head_size % 4 == 0, \
188189
f'head_size: {head_size}, quant_policy: {quant_policy}'
189190
head_size = head_size // 4
190-
elif quant_policy == 4: # pack head_dim to uint8 (4-bit)
191+
elif quant_policy == QuantPolicy.INT4: # pack head_dim to uint8 (4-bit)
191192
assert head_size % 2 == 0, \
192193
f'head_size: {head_size}, quant_policy: {quant_policy}'
193194
head_size = head_size // 2
@@ -209,7 +210,7 @@ def get_k_cache_desc(cls, model_config: ModelConfig, cache_config: CacheConfig,
209210
)
210211
shape = list(shape)
211212
dtype = _get_kv_cache_dtype(model_config)
212-
if cache_config.quant_policy in (4, 8, 42):
213+
if cache_config.quant_policy in (QuantPolicy.INT4, QuantPolicy.INT8, QuantPolicy.TURBO_QUANT):
213214
dtype = torch.uint8
214215
return CacheDesc(shape=shape, dtype=dtype)
215216

@@ -228,21 +229,21 @@ def get_v_cache_desc(cls, model_config: ModelConfig, cache_config: CacheConfig,
228229
)
229230
shape = list(shape)
230231
dtype = _get_kv_cache_dtype(model_config)
231-
if cache_config.quant_policy in (4, 8, 42):
232+
if cache_config.quant_policy in (QuantPolicy.INT4, QuantPolicy.INT8, QuantPolicy.TURBO_QUANT):
232233
dtype = torch.uint8
233234
return CacheDesc(shape=shape, dtype=dtype)
234235

235236
@classmethod
236237
def get_quant_cache_descs(cls, k_cache_desc: CacheDesc, v_cache_desc: CacheDesc, model_config: ModelConfig,
237238
cache_config: CacheConfig):
238239
"""Get quant cache descs."""
239-
if cache_config.quant_policy == 0:
240+
if cache_config.quant_policy == QuantPolicy.NONE:
240241
return []
241242

242243
dtype = model_config.dtype
243-
# For quant_policy==42, K uses 4-bit quantization (has MSE norm and QJL norm),
244+
# For quant_policy==QuantPolicy.TURBO_QUANT, K uses 4-bit quantization (has MSE norm and QJL norm),
244245
# V uses 2-bit quantization (only has MSE norm)
245-
if cache_config.quant_policy == 42:
246+
if cache_config.quant_policy == QuantPolicy.TURBO_QUANT:
246247
key_scale_zero_shape = k_cache_desc.shape[:-1] + [2]
247248
val_scale_zero_shape = v_cache_desc.shape[:-1] + [1]
248249
else:

0 commit comments

Comments (0)