NVIDIA · timmoon10 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/tests/cpp/operator/test_act.cu b/tests/cpp/operator/test_act.cu
@@ -193,7 +193,8 @@ void performTestGLU(const size_t N, const size_t H) {
   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
 
-  if (otype == DType::kFloat8E4M3 || otype == DType::kFloat8E5M2) {
+  if ((otype == DType::kFloat8E4M3 || otype == DType::kFloat8E5M2)
+      && N * H > 0) {
     auto [atol, rtol] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol, rtol);
     if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
@@ -392,7 +393,9 @@ std::vector<std::pair<size_t, size_t>> act_test_cases = {{2048, 12288},
                                                          {65536, 128},
                                                          {256, 256},
                                                          {257, 259},
-                                                         {128, 128+1}};
+                                                         {128, 128+1},
+                                                         {0, 128},
+                                                         {128, 0}};
 
 }  // namespace
 

diff --git a/tests/cpp/operator/test_cast.cu b/tests/cpp/operator/test_cast.cu
@@ -64,7 +64,7 @@ void performTest(const std::vector<size_t>& shape) {
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-  if (isFp8Type(otype)) {
+  if (isFp8Type(otype) && full_size > 0) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
     float ref_scale_inv = 1.f / output_c.scale();
@@ -91,6 +91,8 @@ std::vector<std::vector<size_t>> test_cases = {
   {5, 160},
   {5, 4, 3, 160},
   {217, 256},
+  {0, 128},
+  {128, 0},
 };
 }  // namespace
 

diff --git a/tests/cpp/operator/test_cast_gated_swiglu.cu b/tests/cpp/operator/test_cast_gated_swiglu.cu
@@ -97,7 +97,7 @@ void performTest(const std::vector<size_t>& shape) {
                                  rows,
                                  cols);
 
-  if (isFp8Type(otype)) {
+  if (isFp8Type(otype) && input_size > 0) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
     float ref_scale_inv = 1.f / output_c.scale();
@@ -118,6 +118,8 @@ std::vector<std::vector<size_t>> test_cases = {
   {217, 256},
   {1296},
   {5, 4, 3, 160},
+  {0, 128},
+  {128, 0},
 };
 
 }  // namespace

diff --git a/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu b/tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu
@@ -415,6 +415,8 @@ std::vector<std::pair<size_t, size_t>> matrix_sizes = {
     {768, 1024},
     {8192, 128},
     {577, 1632},
+    {0, 128},
+    {128, 0},
 };
 
 std::vector<std::pair<size_t, size_t>> block_sizes = {

diff --git a/transformer_engine/common/cast/fp8/gated_fp8.cuh b/transformer_engine/common/cast/fp8/gated_fp8.cuh
@@ -282,9 +282,14 @@ void cast_gated_tma(const Tensor &gated_input, const Tensor &grad, Tensor *outpu
   checkCuDriverContext(stream);
 
   NVTE_CHECK(!output->has_columnwise_data(), "Only rowwise cast supported in this function.");
+
+  // Tensor dimensions
   const size_t rows = gated_input.flat_first_dim();
   const size_t cols = gated_input.flat_last_dim() / 2;
   const size_t output_cols = (IS_BWD ? 2 : 1) * cols;
+  if (rows == 0 || cols == 0) {
+    return;
+  }
 
   const size_t blocks_Y = DIVUP(rows, CHUNK_DIM_Y);
   const size_t blocks_X = DIVUP(cols, CHUNK_DIM_X);

diff --git a/transformer_engine/common/cast/fp8/quantize_fp8.cuh b/transformer_engine/common/cast/fp8/quantize_fp8.cuh
@@ -355,7 +355,12 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK)
 template <bool IS_ACT, typename ParamOP, float (*OP)(float, const ParamOP &)>
 void quantize_1D(const Tensor &input, Tensor *output, cudaStream_t stream) {
   using namespace quantize_1D_kernel;
+
+  // Tensor size
   const size_t N = product(input.data.shape);
+  if (N == 0) {
+    return;
+  }
 
   const bool isFullTile = (N % ELEMS_PER_BLOCK == 0);
   NVTE_CHECK(isFullTile, "Only full tiles are supported.");
@@ -391,8 +396,18 @@ void quantize_2D(const Tensor &input, const Tensor *act_input, Tensor *output, T
   using namespace quantize_2D_kernel;
   checkCuDriverContext(stream);
 
+  // Tensor dimensions
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
+
+  // Skip kernel if tensor size is zero
+  if (rows == 0 || cols == 0) {
+    if constexpr (IS_DBIAS) {
+      NVTE_ERROR("Invalid tensor shape for DBias computation (shape=", input.shape(), ").");
+    }
+    return;
+  }
+
   const size_t chunks_Y = DIVUP(rows, FP8_CHUNK_DIM_Y);
   const size_t chunks_X = DIVUP(cols, FP8_CHUNK_DIM_X);
   const size_t blocks_Y = chunks_Y;

diff --git a/transformer_engine/common/cast/mxfp8/dequantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/dequantize_mxfp8.cuh
@@ -249,6 +249,10 @@ inline void dequantize(const Tensor &input, Tensor *output, cudaStream_t stream)
 
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
+  if (rows == 0 || cols == 0) {
+    return;
+  }
+
   const size_t chunks_Y = DIVUP(rows, CHUNK_DIM_Y);
   const size_t chunks_X = DIVUP(cols, CHUNK_DIM_X);
 

diff --git a/transformer_engine/common/cast/mxfp8/gated_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/gated_mxfp8.cuh
@@ -707,6 +707,13 @@ void quantize_gated(const Tensor &gated_input, const Tensor &grad, Tensor *outpu
   using namespace gated_kernel;
   checkCuDriverContext(stream);
 
+  const size_t rows = gated_input.flat_first_dim();
+  const size_t cols = gated_input.flat_last_dim() / 2;
+  const size_t output_cols = (IS_BWD ? 2 : 1) * cols;
+  if (rows == 0 || cols == 0) {
+    return;
+  }
+
   const bool USE_ROWWISE_SCALING = output->has_data();
   const bool USE_COLWISE_SCALING = output->has_columnwise_data();
   const bool with_gemm_swizzled_scales = output->with_gemm_swizzled_scales;
@@ -725,12 +732,10 @@ void quantize_gated(const Tensor &gated_input, const Tensor &grad, Tensor *outpu
     scaling_type = ScalingType::COLWISE;
   } else if (USE_ROWWISE_SCALING && USE_COLWISE_SCALING) {
     scaling_type = ScalingType::BIDIMENSIONAL;
+  } else {
+    NVTE_ERROR("Missing both row-wise and column-wise data.");
   }
 
-  const size_t rows = gated_input.flat_first_dim();
-  const size_t cols = gated_input.flat_last_dim() / 2;
-  const size_t output_cols = (IS_BWD ? 2 : 1) * cols;
-
   const size_t blocks_Y = DIVUP(rows, CHUNK_DIM_Y);
   const size_t blocks_X = DIVUP(cols, CHUNK_DIM_X);
 

diff --git a/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh
@@ -870,6 +870,15 @@ void group_quantize(const GroupedTensor *input, const GroupedTensor *activations
     }
   }
 
+  // Skip kernel if tensor size is zero
+  if (elts_total == 0) {
+    if constexpr (IS_DBIAS) {
+      NVTE_ERROR("Invalid grouped tensor shape for DBias computation (first_logical_dim=",
+                 first_logical_dim, ", last_logical_dim=", last_logical_dim, ")");
+    }
+    return;
+  }
+
   TRANSFORMER_ENGINE_TYPE_SWITCH_NON_FP8ONLY(
       input->dtype(), IType,
       TRANSFORMER_ENGINE_TYPE_SWITCH_FP8ONLY(

diff --git a/transformer_engine/common/cast/mxfp8/quantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/quantize_mxfp8.cuh
@@ -579,6 +579,14 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
 
+  // Skip kernel if tensor size is zero
+  if (rows == 0 || cols == 0) {
+    if constexpr (IS_DBIAS) {
+      NVTE_ERROR("Invalid tensor shape for DBias computation (shape=", input.shape(), ").");
+    }
+    return;
+  }
+
   // Tensor chunk handled by each CUDA block
   constexpr size_t CHUNK_DIM_Y = CAST_DBIAS_ONLY ? 128 : 64;
   constexpr size_t CHUNK_DIM_X = CAST_DBIAS_ONLY ? 128 : 64;

diff --git a/transformer_engine/common/cast/nvfp4/dequantize_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/dequantize_nvfp4.cuh
@@ -87,6 +87,9 @@ inline void dequantize(const Tensor &input, Tensor *output, cudaStream_t stream)
   constexpr int FP4_BLOCK_SIZE = 16;
   const size_t N = input.flat_first_dim();
   const size_t M = input.flat_last_dim();
+  if (N == 0 || M == 0) {
+    return;
+  }
 
   NVTE_CHECK(M % FP4_BLOCK_SIZE == 0, "Last dimension of FP4 tensors needs to be divisible by ",
              FP4_BLOCK_SIZE, ", but got ", input.data.shape, ".");

diff --git a/transformer_engine/common/cast/nvfp4/group_quantize_transpose_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/group_quantize_transpose_nvfp4.cuh
@@ -785,6 +785,9 @@ void group_quantize_transpose(const Tensor &input, const Tensor *noop,
 
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
+  if (rows == 0 || cols == 0) {
+    return;
+  }
 
   NVTE_CHECK(rows % 32 == 0,
              "Number of tensor rows must be a multiple of 32");  // 16B alignment for TMA

diff --git a/transformer_engine/common/cast/nvfp4/quantize_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_nvfp4.cuh
@@ -560,6 +560,9 @@ inline void quantize(const Tensor &input, const Tensor *noop, Tensor *output, cu
 
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
+  if (rows == 0 || cols == 0) {
+    return;
+  }
 
   constexpr size_t CHUNK_DIM_Y = 128;
   constexpr size_t CHUNK_DIM_X = 128;

diff --git a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
@@ -1197,6 +1197,9 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
 
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
+  if (rows == 0 || cols == 0) {
+    return;
+  }
 
   NVTE_CHECK(rows % 32 == 0,
              "Number of tensor rows must be a multiple of 32");  // 16B alignment for TMA

diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
@@ -704,6 +704,9 @@ inline void quantize_transpose_tuned_1D(const Tensor &input, const Tensor *noop,
 
   const size_t rows = input.flat_first_dim();
   const size_t cols = input.flat_last_dim();
+  if (rows == 0 || cols == 0) {
+    return;
+  }
 
   NVTE_CHECK(rows % 32 == 0,
              "Number of tensor rows must be a multiple of 32");  // 16B alignment for TMA