diff --git a/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd.cuh b/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd.cuh index bf0e460c8f..6a05d67b7e 100644 --- a/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd.cuh +++ b/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd.cuh @@ -77,6 +77,9 @@ sm100_store_cd(const utils::PatternVisitor& smem_cd, uint32_t& tma uint32_t tmem_addr = tmem_base_addr + // Accumulator offset w * BLOCK_N + // Wave offset s * STORE_BLOCK_N + i * kNumElemsPerBankGroup; // In-block offset +#ifdef __CUDACC_DEBUG__ + tmem_addr |= ((epilogue_warp_idx % 4) * 32) << 16; +#endif auto smem_ptr = smem_base_ptr + // Base pointer epilogue_warp_idx * 32 * kSwizzleCDMode + // Warp offset row * (kNumBankGroupBytes * 8) + col * kNumBankGroupBytes; // In-atom offset diff --git a/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd_swap_ab.cuh b/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd_swap_ab.cuh index f3f5351e6a..75e18cb842 100644 --- a/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd_swap_ab.cuh +++ b/deep_gemm/include/deep_gemm/epilogue/sm100_store_cd_swap_ab.cuh @@ -60,6 +60,9 @@ sm100_store_cd_swap_ab(const utils::PatternVisitor& smem_cd, uint3 uint32_t tmem_addr = tmem_base_addr + s * STORE_BLOCK_M + // Store stage offset i * kNumSwizzleAtomRows; // In-block offset +#ifdef __CUDACC_DEBUG__ + tmem_addr |= ((epilogue_warp_idx % 4) * 32) << 16; +#endif uint32_t values[kNumSwizzleAtomRows]; // Warps cooperatively write an atomic block to shared memory diff --git a/deep_gemm/include/deep_gemm/impls/sm100_bmk_bnk_mn.cuh b/deep_gemm/include/deep_gemm/impls/sm100_bmk_bnk_mn.cuh index 13bb087232..36a400b72e 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_bmk_bnk_mn.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_bmk_bnk_mn.cuh @@ -242,6 +242,9 @@ sm100_bmn_bnk_mn_gemm_impl(uint32_t shape_s, // Load from tensor memory, store into shared memory uint32_t values[kNumElemsPerBankGroup]; DG_STATIC_ASSERT(kNumElemsPerBankGroup == 4, "Invalid type"); +#ifdef __CUDACC_DEBUG__ + tmem_addr |= ((warp_idx % 4) * 32) << 16; +#endif cute::SM100_TMEM_LOAD_32dp32b4x::copy(tmem_addr, values[0], values[1], values[2], values[3]); cutlass::arch::fence_view_async_tmem_load(); diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp4_mqa_logits.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp4_mqa_logits.cuh index b8a99fd042..ff14be25db 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp4_mqa_logits.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp4_mqa_logits.cuh @@ -353,8 +353,13 @@ void sm100_fp4_mqa_logits(const uint32_t seq_len, const uint32_t seq_len_kv, using Loader = cute::conditional_t; +#ifdef __CUDACC_DEBUG__ + uint32_t addr = tmem_addr | (((threadIdx.x / 32) % 4 * 32) << 16); +#else + const auto& addr = tmem_addr; +#endif [&](cute::index_sequence) { - Loader::copy(tmem_addr, reinterpret_cast(accum)[Is]...); + Loader::copy(addr, reinterpret_cast(accum)[Is]...); }(cute::make_index_sequence{}); cutlass::arch::fence_view_async_tmem_load(); }; diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp4_paged_mqa_logits.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp4_paged_mqa_logits.cuh index d9add53425..3240e1b039 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp4_paged_mqa_logits.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp4_paged_mqa_logits.cuh @@ -386,8 +386,13 @@ void sm100_fp4_paged_mqa_logits(const uint32_t batch_size, using Loader = cute::conditional_t; +#ifdef __CUDACC_DEBUG__ + uint32_t addr = tmem_addr | (((threadIdx.x / 32) % 4 * 32) << 16); +#else + const auto& addr = tmem_addr; +#endif [&](cute::index_sequence) { - Loader::copy(tmem_addr, reinterpret_cast(accum)[Is]...); + Loader::copy(addr, reinterpret_cast(accum)[Is]...); }(cute::make_index_sequence{}); cutlass::arch::fence_view_async_tmem_load(); }; diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh index b2adc6c7ad..33811d2ee7 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_fp4_mega_moe.cuh @@ -978,6 +978,9 @@ sm100_fp8_fp4_mega_moe_impl(void* y, // Load from TMEM uint32_t tmem_addr = accum_stage_idx * UMMA_N + epilogue_wg_idx * WG_BLOCK_M + j * ATOM_M; uint32_t values[ATOM_M]; +#ifdef __CUDACC_DEBUG__ + tmem_addr |= (warp_idx_in_wg * 32) << 16; +#endif cute::SM100_TMEM_LOAD_16dp256b1x::copy(tmem_addr, values[0], values[1], values[2], values[3]); cute::SM100_TMEM_LOAD_16dp256b1x::copy(tmem_addr | 0x00100000, @@ -1141,6 +1144,9 @@ sm100_fp8_fp4_mega_moe_impl(void* y, // Load from TMEM using .16x256b shape to satisfy STSM layout requirements // Start from lane index 0 and 16 uint32_t tmem_addr = accum_stage_idx * UMMA_N + epilogue_wg_idx * WG_BLOCK_M + s * STORE_BLOCK_M + i * ATOM_M; +#ifdef __CUDACC_DEBUG__ + tmem_addr |= (warp_idx_in_wg * 32) << 16; +#endif uint32_t values[ATOM_M]; cute::SM100_TMEM_LOAD_16dp256b1x::copy(tmem_addr, values[0], values[1], values[2], values[3]); diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh index 7ce008e5ea..2e57781872 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh @@ -501,6 +501,9 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, // Load from tensor memory, store into shared memory uint32_t values[kNumElemsPerBankGroup]; +#ifdef __CUDACC_DEBUG__ + tmem_addr |= ((epilogue_warp_idx % 4) * 32) << 16; +#endif if constexpr (cute::is_same_v) { // For FP32 output, read and store DG_STATIC_ASSERT(kNumElemsPerBankGroup == 4, "Invalid type"); diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_mqa_logits.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_mqa_logits.cuh index e6744f59ac..f87871240b 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_mqa_logits.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_mqa_logits.cuh @@ -297,8 +297,13 @@ void sm100_fp8_mqa_logits(const uint32_t seq_len, const uint32_t seq_len_kv, using Loader = cute::conditional_t; +#ifdef __CUDACC_DEBUG__ + uint32_t addr = tmem_addr | (((threadIdx.x / 32) % 4 * 32) << 16); +#else + const auto& addr = tmem_addr; +#endif [&](cute::index_sequence) { - Loader::copy(tmem_addr, reinterpret_cast(accum)[Is]...); + Loader::copy(addr, reinterpret_cast(accum)[Is]...); }(cute::make_index_sequence{}); cutlass::arch::fence_view_async_tmem_load(); }; diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_paged_mqa_logits.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_paged_mqa_logits.cuh index 9a5bddbf37..8ffaa76266 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_paged_mqa_logits.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_paged_mqa_logits.cuh @@ -311,8 +311,13 @@ void sm100_fp8_paged_mqa_logits(const uint32_t batch_size, using Loader = cute::conditional_t; +#ifdef __CUDACC_DEBUG__ + uint32_t addr = tmem_addr | (((threadIdx.x / 32) % 4 * 32) << 16); +#else + const auto& addr = tmem_addr; +#endif [&](cute::index_sequence) { - Loader::copy(tmem_addr, reinterpret_cast(accum)[Is]...); + Loader::copy(addr, reinterpret_cast(accum)[Is]...); }(cute::make_index_sequence{}); cutlass::arch::fence_view_async_tmem_load(); }; diff --git a/deep_gemm/include/deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh b/deep_gemm/include/deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh index aaf7fd9aea..02c453a95a 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_tf32_hc_prenorm_gemm.cuh @@ -240,6 +240,9 @@ sm100_tf32_hc_prenorm_gemm_impl(const uint32_t shape_m, // Load from tensor memory, store into shared memory uint32_t values[kNumElemsPerBankGroup]; DG_STATIC_ASSERT(kNumElemsPerBankGroup == 4, "Invalid type"); +#ifdef __CUDACC_DEBUG__ + tmem_addr |= ((warp_idx % 4) * 32) << 16; +#endif cute::SM100_TMEM_LOAD_32dp32b4x::copy(tmem_addr, values[0], values[1], values[2], values[3]); cutlass::arch::fence_view_async_tmem_load();