Commit 21a7d08: Apply suggestions from code review

Authored by windreamer and Copilot
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent: c31225a

File tree: 2 files changed (+6, -3 lines)


lmdeploy/pytorch/backends/attention.py

Lines changed: 1 addition, 1 deletion

@@ -20,7 +20,7 @@ class AttentionMetadata:
     fill_seqlens: torch.Tensor = None
     cu_seqlens_q: torch.Tensor = None
     cu_seqlens_k: torch.Tensor = None
-    quant_policy: QuantPolicy = 0
+    quant_policy: QuantPolicy = QuantPolicy.NONE


 T = TypeVar('T', bound=AttentionMetadata)
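The change above swaps a bare integer default for an enum member. A minimal sketch of why this is equivalent but clearer, assuming `QuantPolicy` is (or behaves like) an `IntEnum` with a `NONE = 0` member; the member names other than `NONE` are illustrative, not lmdeploy's actual definitions:

```python
from enum import IntEnum

class QuantPolicy(IntEnum):
    """Hypothetical sketch; the real lmdeploy enum may differ."""
    NONE = 0      # no KV-cache quantization
    KV_INT8 = 4   # assumed member, for illustration only
    KV_INT4 = 8   # assumed member, for illustration only

# An IntEnum member compares equal to the old literal default, so
# existing `quant_policy == 0` checks keep working, while the default
# now documents its own meaning at the definition site.
assert QuantPolicy.NONE == 0
assert isinstance(QuantPolicy.NONE, int)
```

Because `IntEnum` members are `int` subclasses, the replacement is behavior-preserving for arithmetic and comparisons; only readability changes.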

lmdeploy/pytorch/kernels/cuda/pagedattention.py

Lines changed: 5 additions, 2 deletions

@@ -691,8 +691,11 @@ def _get_block_d(Lk):
         return BLOCK_DMODEL, BLOCK_DMODEL1, BLOCK_DV

     turbo_quant = False
-    turbo_k_codebook = None
-    turbo_v_codebook = None
+    # Triton still receives these arguments for quantized paths, so keep
+    # valid tensor-backed pointers even when turbo quant is not enabled.
+    # They will be overwritten with real codebooks when quant_policy == 42.
+    turbo_k_codebook = q.new_empty((1, ))
+    turbo_v_codebook = q.new_empty((1, ))

     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k_cache.shape[d_dim], v_cache.shape[d_dim]
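The hunk above replaces `None` placeholders with 1-element tensors (`q.new_empty((1, ))`), because a kernel launcher that unconditionally reads a pointer from every argument will fail on `None` before the quantization branch is ever taken. A minimal plain-Python sketch of that failure mode, with no Triton or torch dependency; `FakeTensor`, `launch_kernel`, and `data_ptr` here are illustrative stand-ins, not lmdeploy's API:

```python
class FakeTensor:
    """Stand-in for a tensor that a kernel launcher can take a pointer from."""
    def __init__(self, numel):
        self.numel = numel

    def data_ptr(self):
        # Any non-null address-like value; real tensors return a device pointer.
        return id(self)

def launch_kernel(k_codebook, v_codebook, quant_enabled):
    # Like a Triton launch, the wrapper extracts a pointer from *every*
    # argument up front, even ones the non-quantized path never reads.
    ptrs = (k_codebook.data_ptr(), v_codebook.data_ptr())
    if not quant_enabled:
        return "quant disabled, placeholders ignored"
    return ptrs

# A tiny but real buffer (analogous to q.new_empty((1, ))) launches fine:
placeholder = FakeTensor(1)
result = launch_kernel(placeholder, placeholder, quant_enabled=False)

# Passing None instead would raise AttributeError at pointer extraction,
# before quant_enabled is even consulted.
```

The design choice is to pay for two throwaway 1-element allocations on every call so the launch signature stays uniform across quantized and non-quantized paths.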
