Skip to content

Commit 59ab765

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 5177c60 commit 59ab765

File tree

5 files changed

+11
-10
lines changed

5 files changed

+11
-10
lines changed

tests/pytorch/debug/run_distributed.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747

4848
fp8_available = is_fp8_available()
4949

50+
5051
def _cmp_dist(ground_truth, output, parallel_mode):
5152
if parallel_mode == "column" and torch.cuda.get_device_capability() == (12, 0):
5253
# SM120: distributed column-parallel path may show a single-element

tests/pytorch/test_custom_recipe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def test_custom_recipe_grouped_linear_sanity():
121121
out_features = 64
122122
# Use 16-aligned splits on SM120 to satisfy FP8 GEMM leading-dimension requirements in backward.
123123
is_sm120 = torch.cuda.get_device_capability() == (12, 0)
124-
if is_sm120:
124+
if is_sm120:
125125
split_m = 16
126126
batch = num_gemms * split_m
127127
m_splits = [split_m] * num_gemms

transformer_engine/common/cast/dispatch/gated.cuh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,10 @@ void quantize_gated_fwd_helper(const NVTETensor nvte_input, NVTETensor nvte_outp
4848
case NVTE_DELAYED_TENSOR_SCALING: {
4949
//const bool use_tma_kernels = (cols % 32 == 0) && is_supported_by_CC_100();
5050
// sm120 shared memory capabilities are much smaller than sm100, so we disable TMA kernels on sm120
51-
// KL: It is possible that for fwd, the limits are not exceeded for sm120. To be investigated -
51+
// KL: It is possible that for fwd, the limits are not exceeded for sm120. To be investigated -
5252
// are there any forward only tests we'd like to keep enabled on sm120?
53-
const bool use_tma_kernels = (cols % 32 == 0) && is_supported_by_CC_100() && !is_supported_by_CC_120();
53+
const bool use_tma_kernels =
54+
(cols % 32 == 0) && is_supported_by_CC_100() && !is_supported_by_CC_120();
5455
if (use_tma_kernels) {
5556
Tensor dummy_grad_tensor;
5657
fp8::cast_gated_tma</*IS_BWD=*/false, ParamOP, ActOP, nullptr>(input, dummy_grad_tensor,
@@ -143,7 +144,8 @@ void quantize_gated_bwd_helper(const NVTETensor nvte_grad, const NVTETensor nvte
143144
case NVTE_DELAYED_TENSOR_SCALING: {
144145
//const bool use_tma_kernels = (cols % 32 == 0) && is_supported_by_CC_100();
145146
// sm120 shared memory capabilities are much smaller than sm100, so we disable TMA kernels on sm120
146-
const bool use_tma_kernels = (cols % 32 == 0) && is_supported_by_CC_100() && !is_supported_by_CC_120();
147+
const bool use_tma_kernels =
148+
(cols % 32 == 0) && is_supported_by_CC_100() && !is_supported_by_CC_120();
147149
if (use_tma_kernels) {
148150
fp8::cast_gated_tma</*IS_BWD=*/true, ParamOP, ActOP, DActOP>(gated_input, grad, output, p,
149151
stream);

transformer_engine/common/gemm/cublaslt_grouped_gemm.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,7 @@ inline size_t grouped_gemm_setup_workspace_size(size_t num_tensors) {
302302
inline void check_grouped_gemm_requirements(const char *api_name) {
303303
const int current_device = transformer_engine::cuda::current_device();
304304
const int sm_arch = transformer_engine::cuda::sm_arch(current_device);
305-
NVTE_CHECK(sm_arch >= 100, api_name,
306-
" requires Blackwell (SM100) or newer architecture.");
305+
NVTE_CHECK(sm_arch >= 100, api_name, " requires Blackwell (SM100) or newer architecture.");
307306
NVTE_CHECK(sm_arch != 120, api_name,
308307
" is currently unsupported on SM120. Grouped cuBLASLt GEMM heuristic selection "
309308
"returns CUBLAS_STATUS_NOT_SUPPORTED on this architecture (even with relaxed hints)");

transformer_engine/pytorch/csrc/extensions/cast.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ inline bool is_sm120_device() {
8989
return device_prop.major == 12 && device_prop.minor == 0;
9090
}
9191

92-
9392
// helper functions for NVFP4 grouped quantization (cuda graph safe with shapes stored in device without D2H copy)
9493
void group_quantize_nvfp4_impl(const GroupedTensorWrapper &grouped_input_tensor,
9594
GroupedTensorWrapper &grouped_output_tensor,
@@ -1192,9 +1191,9 @@ void split_quantize_nvfp4_impl_with_rht_helper(const TensorWrapper &input,
11921191
auto rht_output_t = allocateTorchTensor(cols, rows, input_list[i].dtype());
11931192
rht_output_t_tensors.push_back(rht_output_t);
11941193
TensorWrapper rht_output_t_cpp;
1195-
rht_output_t_cpp.set_rowwise_data(rht_output_t.data_ptr(), input_list[i].dtype(),
1196-
std::vector<size_t>{static_cast<size_t>(cols),
1197-
static_cast<size_t>(rows)});
1194+
rht_output_t_cpp.set_rowwise_data(
1195+
rht_output_t.data_ptr(), input_list[i].dtype(),
1196+
std::vector<size_t>{static_cast<size_t>(cols), static_cast<size_t>(rows)});
11981197
nvte_hadamard_transform(input_list[i].data(), rht_output_t_cpp.data(), 0,
11991198
quantizer.rht_matrix_random_sign_mask_t, stream);
12001199
nvte_quantize_v2(rht_output_t_cpp.data(), out_transpose_list[i].data(),

0 commit comments

Comments
 (0)