22 changes: 7 additions & 15 deletions aiter/ops/attention.py
@@ -81,7 +81,7 @@ def gen_pa_fwd_asm(
return torch.empty_like(Q)


@compile_ops("module_attention", gen_fake=gen_pa_fwd_native_fake)
@compile_ops("module_attention", gen_fake=gen_pa_fwd_native_fake, develop=True)
def pa_fwd_naive(
# [num_seqs, num_heads, head_size]
query: torch.Tensor,
@@ -652,7 +652,7 @@ def paged_attention_ragged(
MD_NAME = "module_mla_asm"


@compile_ops(MD_NAME, ffi_type="ctypes")
@compile_ops(MD_NAME, ffi_type="ctypes", develop=True)
Copilot AI Apr 17, 2026

compile_ops(..., ffi_type="ctypes") currently ignores the develop flag (it only affects the pybind wrapper). Setting develop=True here is misleading; either remove it or extend compile_ops to apply the intended behavior for ctypes as well.

Suggested change
@compile_ops(MD_NAME, ffi_type="ctypes", develop=True)
@compile_ops(MD_NAME, ffi_type="ctypes")
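A minimal sketch of the second option the comment mentions, i.e. making the decorator honor develop for the ctypes backend as well. The _load_or_build helper, its signature, and the force_rebuild flag are assumptions made for illustration, not the actual aiter build machinery:

import functools

def _load_or_build(md_name, ffi_type, force_rebuild):
    # Placeholder for aiter's real JIT build/load step; assumed to return an
    # object that exposes the compiled kernels as attributes.
    raise NotImplementedError

def compile_ops(md_name, ffi_type="pybind", gen_fake=None, develop=False):
    # Sketch only: gen_fake handling is omitted here.
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Force a rebuild in develop mode for both backends, so a stale
            # ctypes shared object is never silently reused.
            module = _load_or_build(md_name, ffi_type=ffi_type, force_rebuild=develop)
            return getattr(module, fn.__name__)(*args, **kwargs)
        return wrapper
    return decorator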

def mla_decode_stage1_asm_fwd(
# [num_seqs, num_heads, head_size]
Q: torch.Tensor,
@@ -688,7 +688,7 @@ def mla_decode_stage1_asm_fwd(
) -> None: ...


@compile_ops(MD_NAME, ffi_type="ctypes")
@compile_ops(MD_NAME, ffi_type="ctypes", develop=True)
def mla_prefill_asm_fwd(
Comment on lines +691 to 692

Copilot AI Apr 17, 2026


compile_ops(..., ffi_type="ctypes") currently ignores the develop flag (it only affects the pybind wrapper). Setting develop=True here is misleading; either remove it or extend compile_ops to apply the intended behavior for ctypes as well.

# [num_seqs, num_heads, head_size]
Q: torch.Tensor,
@@ -873,7 +873,7 @@ def get_ps_metadata_v1(
) -> None: ...


@compile_ops(MD_NAME, ffi_type="ctypes")
@compile_ops(MD_NAME, ffi_type="ctypes", develop=True)

Copilot AI Apr 17, 2026


compile_ops(..., ffi_type="ctypes") currently ignores the develop flag (it only affects the pybind wrapper). Setting develop=True here is misleading; either remove it or extend compile_ops to apply the intended behavior for ctypes as well.

Suggested change
@compile_ops(MD_NAME, ffi_type="ctypes", develop=True)
@compile_ops(MD_NAME, ffi_type="ctypes")

def mla_prefill_ps_asm_fwd(
Q: torch.Tensor,
K: torch.Tensor,
@@ -933,32 +933,24 @@ def get_mla_metadata_info_v1(
)
)

# In sparse mode, each expanded batch has 1 Q token
effective_seqlen_qo = 1 if is_sparse else max_seqlen_qo
max_qo_tiles_per_batch = (
int(math.ceil(effective_seqlen_qo * num_head_qo / 128))
int(math.ceil(max_seqlen_qo * num_head_qo / 128))
if num_head_qo == 16
or (
get_gfx() == "gfx942"
and num_head_qo == 128
and kv_dtype == dtypes.fp8
and q_dtype == dtypes.fp8
)
or (
get_gfx() == "gfx950"
and (num_head_qo * effective_seqlen_qo) % 128 == 0
and kv_dtype == dtypes.bf16
and q_dtype == dtypes.bf16
)
or (
get_gfx() == "gfx950"
and num_head_qo == 64
and q_dtype == dtypes.fp8
and kv_dtype == dtypes.fp8
and effective_seqlen_qo == 1
and max_seqlen_qo == 1
)
or use_qseqlen_fold
else int(math.ceil(effective_seqlen_qo * num_head_qo / 16))
else int(math.ceil(max_seqlen_qo * num_head_qo / 16))
)
batch_size = batch_size * max_seqlen_qo if is_sparse else batch_size
tile_cnt = batch_size * max_qo_tiles_per_batch
Comment on lines 936 to 956

Copilot AI Apr 17, 2026


For is_sparse=True, batch_size is expanded by max_seqlen_qo, but max_qo_tiles_per_batch is also computed using max_seqlen_qo. This effectively double-counts the sequence length (tile count grows ~O(max_seqlen_qo^2)) and contradicts the sparse-mode assumption that each expanded batch has 1 Q token. Consider restoring an effective_seqlen_qo = 1 if is_sparse else max_seqlen_qo and using it consistently in the tile math.
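A sketch of the consistent tile math the comment asks for, using a hypothetical helper name and collapsing the gfx/dtype branching into a single tile-width parameter:

import math

def mla_tile_count(batch_size, max_seqlen_qo, num_head_qo, is_sparse, tile=128):
    # In sparse mode each expanded batch carries exactly one Q token, so the
    # per-batch tile count must not scale with max_seqlen_qo a second time.
    effective_seqlen_qo = 1 if is_sparse else max_seqlen_qo
    max_qo_tiles_per_batch = int(math.ceil(effective_seqlen_qo * num_head_qo / tile))
    expanded_batch = batch_size * max_seqlen_qo if is_sparse else batch_size
    return expanded_batch * max_qo_tiles_per_batch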

2 changes: 2 additions & 0 deletions aiter/utility/dtypes.py
@@ -23,6 +23,8 @@ def get_dtype_fp8():
i4x2 = getattr(torch, "int4", _8bit_fallback)
fp4x2 = getattr(torch, "float4_e2m1fn_x2", _8bit_fallback)
fp8 = get_dtype_fp8()
fp8_e4m3fn = torch.float8_e4m3fn
fp8_e4m3fnuz = torch.float8_e4m3fnuz
fp8_e8m0 = getattr(torch, "float8_e8m0fnu", _8bit_fallback)
fp16 = torch.float16
bf16 = torch.bfloat16
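For illustration, a hedged sketch of the per-device resolution implied by the ambiguous fp8 alias; the arch-to-variant mapping below is an assumption, not necessarily what get_dtype_fp8() implements:

import torch

def resolve_fp8(gfx_arch: str) -> torch.dtype:
    # gfx94x parts are assumed to use the fnuz encoding, newer parts the OCP
    # e4m3fn variant; adjust the mapping to match the real device table.
    if gfx_arch.startswith("gfx94"):
        return torch.float8_e4m3fnuz
    return torch.float8_e4m3fn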
8 changes: 7 additions & 1 deletion csrc/include/aiter_enum.h
@@ -24,7 +24,9 @@ enum class QuantType : int
};
typedef enum
{
AITER_DTYPE_fp8,
AITER_DTYPE_fp8, // Ambiguous dtype, use the corresponding type per device.
AITER_DTYPE_fp8_e4m3fn,
AITER_DTYPE_fp8_e4m3fnuz,
AITER_DTYPE_fp8_e8m0,
AITER_DTYPE_fp16,
AITER_DTYPE_bf16,
@@ -45,6 +47,8 @@ static inline size_t AiterDtype_element_size(AiterDtype dtype)
switch(dtype)
{
case AITER_DTYPE_fp8:
case AITER_DTYPE_fp8_e4m3fn:
case AITER_DTYPE_fp8_e4m3fnuz:
case AITER_DTYPE_fp8_e8m0:
case AITER_DTYPE_i4x2:
case AITER_DTYPE_fp4x2:
@@ -67,6 +71,8 @@ static inline std::string AiterDtype_to_str(int dtype)
switch(dtype)
{
case AITER_DTYPE_fp8: return "fp8";
case AITER_DTYPE_fp8_e4m3fn: return "fp8_e4m3fn";
case AITER_DTYPE_fp8_e4m3fnuz: return "fp8_e4m3fnuz";
case AITER_DTYPE_fp8_e8m0: return "fp8_e8m0";
case AITER_DTYPE_fp16: return "fp16";
case AITER_DTYPE_bf16: return "bf16";
15 changes: 8 additions & 7 deletions csrc/include/attention.h
@@ -1,14 +1,15 @@
#pragma once
// SPDX-License-Identifier: MIT
// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <torch/extension.h>
#include "aiter_tensor.h"
#include <string>

void paged_attention(
torch::Tensor &out, torch::Tensor &exp_sums, torch::Tensor &max_logits,
torch::Tensor &tmp_out, torch::Tensor &query, torch::Tensor &key_cache,
torch::Tensor &value_cache, int64_t num_kv_heads, double scale,
torch::Tensor &block_tables, torch::Tensor &context_lens,
const aiter_tensor_t &out, const aiter_tensor_t &exp_sums, const aiter_tensor_t &max_logits,
const aiter_tensor_t &tmp_out, const aiter_tensor_t &query, const aiter_tensor_t &key_cache,
const aiter_tensor_t &value_cache, int64_t num_kv_heads, double scale,
const aiter_tensor_t &block_tables, const aiter_tensor_t &context_lens,
int64_t block_size, int64_t max_context_len,
const std::optional<torch::Tensor> &alibi_slopes,
const aiter_tensor_t *alibi_slopes,
const std::string &kv_cache_dtype, double k_scale, double v_scale,
const std::optional<torch::Tensor> &fp8_out_scale, int64_t partition_size);
const aiter_tensor_t *fp8_out_scale, int64_t partition_size);
30 changes: 14 additions & 16 deletions csrc/include/attention_ragged.h
@@ -1,24 +1,22 @@
#pragma once
// SPDX-License-Identifier: MIT
// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <torch/extension.h>
#include "aiter_tensor.h"
#include <optional>
#include <string>

void paged_attention_ragged(
torch::Tensor &out, // [num_seqs, num_heads, head_size]
torch::Tensor &workspace_buffer,
torch::Tensor &query, // [num_seqs, num_heads, head_size]
torch::Tensor
&key_cache, // [num_blocks, num_heads, block_size, head_size] or
// [num_blocks, block_size, num_heads, head_size]
torch::Tensor
&value_cache, // [num_blocks, num_heads, block_size, head_size] or
// [num_blocks, block_size, num_heads, head_size]
const aiter_tensor_t &out, // [num_seqs, num_heads, head_size]
const aiter_tensor_t &workspace_buffer,
const aiter_tensor_t &query, // [num_seqs, num_heads, head_size]
const aiter_tensor_t &key_cache,
const aiter_tensor_t &value_cache,
double scale,
torch::Tensor &kv_indptr, // [num_seqs + 1]
torch::Tensor &kv_page_indices, // [max_num_blocks]
std::optional<torch::Tensor> &kv_last_page_lens, // [num_seqs]
const aiter_tensor_t &kv_indptr, // [num_seqs + 1]
const aiter_tensor_t &kv_page_indices, // [max_num_blocks]
const aiter_tensor_t *kv_last_page_lens, // [num_seqs]
int64_t block_size, int64_t max_num_partitions,
const std::optional<torch::Tensor> &alibi_slopes,
const aiter_tensor_t *alibi_slopes,
const std::string &kv_cache_dtype, const std::string &kv_cache_layout,
float logits_soft_cap, torch::Tensor &k_scale, torch::Tensor &v_scale,
const std::optional<torch::Tensor> &fp8_out_scale, int64_t partition_size);
float logits_soft_cap, const aiter_tensor_t &k_scale, const aiter_tensor_t &v_scale,
const aiter_tensor_t *fp8_out_scale, int64_t partition_size);
31 changes: 15 additions & 16 deletions csrc/include/attention_v1.h
@@ -1,23 +1,22 @@
#pragma once
// SPDX-License-Identifier: MIT
// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#include <torch/extension.h>
#include "aiter_tensor.h"
#include <optional>
#include <string>

void paged_attention_v1(
torch::Tensor &out, // [num_seqs, num_heads, head_size]
torch::Tensor &workspace_buffer,
torch::Tensor &query, // [num_seqs, num_heads, head_size]
torch::Tensor
&key_cache, // [num_blocks, num_heads, block_size, head_size] or
// [num_blocks, block_size, num_heads, head_size]
torch::Tensor
&value_cache, // [num_blocks, num_heads, block_size, head_size] or
// [num_blocks, block_size, num_heads, head_size]
const aiter_tensor_t &out, // [num_seqs, num_heads, head_size]
const aiter_tensor_t &workspace_buffer,
const aiter_tensor_t &query, // [num_seqs, num_heads, head_size]
const aiter_tensor_t &key_cache,
const aiter_tensor_t &value_cache,
double scale,
torch::Tensor &block_tables, // [num_seqs, max_num_blocks_per_seq]
const std::optional<torch::Tensor>& cu_query_lens, // [num_seqs+1]
torch::Tensor &context_lens, // [num_seqs]
int64_t max_context_len, const std::optional<torch::Tensor> &alibi_slopes,
const aiter_tensor_t &block_tables, // [num_seqs, max_num_blocks_per_seq]
const aiter_tensor_t *cu_query_lens, // [num_seqs+1]
const aiter_tensor_t &context_lens, // [num_seqs]
int64_t max_context_len,
const aiter_tensor_t *alibi_slopes,
const std::string &kv_cache_dtype, const std::string &kv_cache_layout,
float logits_soft_cap, torch::Tensor &k_scale, torch::Tensor &v_scale,
const std::optional<torch::Tensor> &fp8_out_scale, int64_t partition_size);
float logits_soft_cap, const aiter_tensor_t &k_scale, const aiter_tensor_t &v_scale,
const aiter_tensor_t *fp8_out_scale, int64_t partition_size);