intel · hdharpure9922 · Jul 1, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/cmake/onnxruntime_providers_vsinpu.cmake b/cmake/onnxruntime_providers_vsinpu.cmake
@@ -14,7 +14,7 @@
     safeint_interface )
   add_dependencies(onnxruntime_providers_vsinpu ${onnxruntime_EXTERNAL_DEPENDENCIES})
   set_target_properties(onnxruntime_providers_vsinpu PROPERTIES FOLDER "ONNXRuntime" LINKER_LANGUAGE CXX)
-  target_include_directories(onnxruntime_providers_vsinpu PRIVATE ${ONNXRUNTIME_ROOT} $ENV{TIM_VX_INSTALL}/include)
+  target_include_directories(onnxruntime_providers_vsinpu PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} $ENV{TIM_VX_INSTALL}/include)
 
   find_library(TIMVX_LIBRARY NAMES tim-vx PATHS $ENV{TIM_VX_INSTALL}/lib NO_DEFAULT_PATH)
   if(NOT TIMVX_LIBRARY)

diff --git a/docs/contrib_ops/cuda/moe_qmoe.md b/docs/contrib_ops/cuda/moe_qmoe.md
@@ -989,6 +989,27 @@ per-column INT4, block-wise INT4/INT8, and interleaved-SwiGLU GEMV kernels.
 | Kernel instantiation | `moe_gemv.cu` adds `__nv_bfloat16` details/instantiations (group sizes 0/32/64/128, INT4/INT8, bias on/off) under `ENABLE_BF16`. | The custom FC1/FC2 GEMV kernels run for BF16; no grouped-GEMM fallback when the FP16 gate would route. |
 | Profiling | GPT-OSS-20B, Qwen3.6-35B-A3B, and Gemma model shapes profiled with `block_size=64` for both dtypes. | BF16 matches FP16 routing and latency within noise (about 1.3x–1.5x faster than grouped GEMM); SwiGLU BF16 parity tests pass. |
 
+#### Split-K2 SwiGLU GEMV default path
+
+The fp16 INT4 interleaved-SwiGLU GEMV path uses a two-pass Split-K2 FC1 kernel by
+default for supported decode shapes. The first pass computes two K-split FP32
+partials into QMoE workspace, and the second pass reduces those partials, adds
+optional bias, and applies the interleaved SwiGLU epilogue. FC2 stays on the
+regular `moe_gemv_kernel` path.
+
+Set `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1` before process start to force the
+previous single-kernel FC1 SwiGLU GEMV path for debugging, A/B benchmarking, or
+bisecting numerical differences. On GPT-OSS-20B, Split-K2 reduced FC1 kernel
+time from about 21.42 us to 19.98 us and improved CUDA-graph decode throughput
+by about 0.9% to 1.6% across paired runs, with valid output in both modes. A
+1000-sample MMLU smoke matched the opt-out fallback within noise. A future
+autotuner can replace this hand-selected default with per-shape route selection.
+
+```bash
+onnxruntime/test/python/transformers/profile_qmoe_gemv.py \
+  --case gpt_oss_20b_m1_top4_fp16_2880x2880_e32 \
+  --disable-splitk2-swiglu --warmup 5 --repeat 100 --nvtx
+```
 #### Accumulation policy
 
 The QMoE GEMV fast path accumulates fp16 activations in fp16 by default. Set

diff --git a/docs/contrib_ops/cuda/qmoe_gemv_experiments.md b/docs/contrib_ops/cuda/qmoe_gemv_experiments.md
@@ -979,6 +979,169 @@ Every case reported `has_invalid_output=false`.
 - Per-column INT8 W8A16 decode shapes route to GEMV for both FP16 and BF16 and
   beat the grouped-GEMM fallback at every profiled shape.
 
+## 2026-06-19: Split-K2 Two-Pass SwiGLU GEMV Experiment
+
+### Change Under Test
+
+- Code commit: `f1d6718be719c1237be392c0389874b6a8926a3c`
+  (`Experiment QMoE split-K SwiGLU GEMV`).
+- Added default Split-K2 route with opt-out env knob:
+  `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1`.
+- Scope: FP16 INT4/interleaved-SwiGLU FC1 GEMV path for decode-shaped QMoE.
+- Implementation:
+  - First pass launches `moe_gemv_splitk_partials_kernel` with `SplitK=2` and
+    writes FP32 partials into QMoE workspace.
+  - Second pass launches `moe_gemv_splitk_reduce_swiglu_kernel` to reduce the
+    partials, add optional bias, and apply SwiGLU.
+  - FC2 remains on the existing `moe_gemv_kernel`.
+  - Scratch is allocated only for the supported Split-K2 route. Setting
+    `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1` restores the previous single-kernel
+    FC1 SwiGLU GEMV path.
+
+### Repro Notes
+
+- Build: `cmake --build build/cu130/Release --target onnxruntime_providers_cuda --parallel $(nproc)`.
+- Important provider sync: Python tests importing from
+  `build/cu130/Release/onnxruntime` load
+  `build/cu130/Release/onnxruntime/capi/libonnxruntime_providers_cuda.so`, not
+  only the top-level `build/cu130/Release/libonnxruntime_providers_cuda.so` or
+  the venv copy. Sync all relevant copies before measuring:
+
+  ```bash
+  cp build/cu130/Release/libonnxruntime_providers_cuda.so \
+     build/cu130/Release/onnxruntime/capi/libonnxruntime_providers_cuda.so
+  cp build/cu130/Release/libonnxruntime_providers_cuda.so \
+     .venv_cu130/lib/python3.14/site-packages/onnxruntime/capi/libonnxruntime_providers_cuda.so
+  ```
+
+- Focused QMoE helper:
+
+  ```bash
+  cd ~
+  CUDA_VISIBLE_DEVICES=1 \
+  LD_LIBRARY_PATH=~/onnxruntime/build/cu130/Release:~/cuda13.0/lib64:~/cudnn9.19_cuda13/lib:~/cudnn9.19_cuda13/lib64:${LD_LIBRARY_PATH:-} \
+  PYTHONPATH=~/onnxruntime/build/cu130/Release:~/onnxruntime/onnxruntime/test/python/transformers \
+  ~/onnxruntime/.venv_cu130/bin/python \
+  ~/onnxruntime/onnxruntime/test/python/transformers/profile_qmoe_gemv.py \
+    --case gpt_oss_20b_m1_top4_fp16_2880x2880_e32 --warmup 3 --repeat 20
+  ```
+
+### Focused QMoE Smoke
+
+Both modes reported `has_invalid_output=false`.
+
+| Mode | Env | Latency ms |
+|------|-----|------------|
+| Baseline | `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1` | 0.072344 |
+| Split-K2 | none | 0.073816 |
+
+The short helper run was slightly slower with Split-K2, so Nsight Systems was
+used to confirm route selection and isolate per-kernel time.
+
+### Nsight Systems Kernel Results
+
+Artifacts:
+
+- Baseline: `/tmp/qmoe_gptoss_baseline_final.{nsys-rep,sqlite}`
+- Split-K2: `/tmp/qmoe_gptoss_splitk_final.{nsys-rep,sqlite}`
+
+Command shape:
+
+```bash
+~/cuda13.0/bin/nsys profile -t cuda,nvtx --force-overwrite true \
+  -o /tmp/qmoe_gptoss_splitk_final --export=sqlite \
+  ~/onnxruntime/.venv_cu130/bin/python \
+  ~/onnxruntime/onnxruntime/test/python/transformers/profile_qmoe_gemv.py \
+    --case gpt_oss_20b_m1_top4_fp16_2880x2880_e32 --warmup 3 --repeat 30 --nvtx
+```
+
+Parsed with `parse_nsys.py --nvtx-range benchmark --pattern '%'`.
+
+| Mode | Kernel | Calls | Avg us |
+|------|--------|-------|--------|
+| Baseline | `moe_gemv_interleaved_swiglu_kernel` | 30 | 21.42 |
+| Baseline | `moe_gemv_kernel` | 30 | 12.13 |
+| Split-K2 | `moe_gemv_splitk_partials_kernel` | 30 | 17.59 |
+| Split-K2 | `moe_gemv_splitk_reduce_swiglu_kernel` | 30 | 2.39 |
+| Split-K2 | `moe_gemv_kernel` | 30 | 12.22 |
+
+Split-K2 reduced FC1 kernel work from about `21.42 us` to `17.59 + 2.39 =
+19.98 us`, a net FC1 reduction of about `1.44 us` per QMoE invocation. End-to-end
+under Nsight was effectively tied:
+
+| Mode | Helper latency ms |
+|------|-------------------|
+| Baseline | 0.079855 |
+| Split-K2 | 0.079728 |
+
+### Model-Level Decode Benchmark With CUDA Graph
+
+Model-level decode was also measured with CUDA graph enabled. Both runs used
+the GPT-OSS-20B INT4 QMoE model package, CUDA graph enabled, XQA enabled, and
+deterministic MoE tactic selection:
+
+```bash
+MODEL=models/gpt-oss-20b/variants/cuda_int4_int4_qmoe_rtn_matmul_only \
+GPU=0 PROMPT_LEN=512 GEN_LEN=128 REPS=10 WARMUP=3 CUDA_GRAPH=1 XQA=1 SYNC_LIB=1 \
+ORT_FORCE_DETERMINISTIC_MOE=1 \
+bash scripts/bench_gpt_oss_ort_decode.sh
+```
+
+Baseline additionally set `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1`.
+
+| Run | Mode | Decode latency ms/token | Decode throughput tok/s |
+|-----|------|-------------------------|-------------------------|
+| R1, `REPS=5`, `WARMUP=2` | Baseline | 2.869450 | 348.498901 |
+| R1, `REPS=5`, `WARMUP=2` | Split-K2 | 2.823800 | 354.132707 |
+| R2, `REPS=10`, `WARMUP=3` | Baseline | 2.865840 | 348.937861 |
+| R2, `REPS=10`, `WARMUP=3` | Split-K2 | 2.839335 | 352.195107 |
+
+The longer CUDA-graph pair showed about `+0.9%` decode throughput. The shorter
+pair showed about `+1.6%`. Since the focused helper reported valid output and
+the model-level gain repeated in the same direction, even this modest gain is
+worth enabling for GPT-OSS-20B decode while keeping an opt-out for A/B checks.
+
+After flipping Split-K2 to the default and adding
+`ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1` as the opt-out, three more paired
+CUDA-graph model runs were collected with `REPS=10`, `WARMUP=3`, prompt length
+512, and generation length 128:
+
+| Run | Mode | Decode latency ms/token | Decode throughput tok/s |
+|-----|------|-------------------------|-------------------------|
+| R3 | Default Split-K2 | 3.017252 | 331.427448 |
+| R3 | Split-K2 disabled | 3.055736 | 327.253380 |
+| R4 | Default Split-K2 | 3.006739 | 332.586260 |
+| R4 | Split-K2 disabled | 3.047570 | 328.130314 |
+| R5 | Default Split-K2 | 3.009466 | 332.284898 |
+| R5 | Split-K2 disabled | 3.047015 | 328.190090 |
+| Average | Default Split-K2 | 3.011152 | 332.099536 |
+| Average | Split-K2 disabled | 3.050107 | 327.857928 |
+
+The default Split-K2 route was faster in all three pairs, averaging `+1.29%`
+decode throughput and `-1.28%` decode latency versus the opt-out fallback.
+
+### Accuracy Smoke
+
+A 1000-sample `match_mmlu` smoke was run with the local parallel eval harness on
+all eight H200 GPUs, using the same GPT-OSS-20B INT4 QMoE model package and the
+current ORT build package. The default Split-K2 run scored `0.8380` pooled
+accuracy; the opt-out fallback with `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1`
+scored `0.8350`. The small positive difference is within smoke-test noise, and
+there is no accuracy regression signal from enabling Split-K2 by default.
+
+### Decision
+
+- Enable Split-K2 by default for its supported fp16 INT4 interleaved-SwiGLU GEMV
+  scope.
+- Keep `ORT_DISABLE_MOE_GEMV_SPLITK2_SWIGLU=1` as the fallback and A/B knob.
+- The 1000-sample MMLU smoke matched the opt-out fallback within noise, so the
+  default flip has an accuracy sanity check in addition to focused-helper valid
+  output.
+- Future work:
+  - Add per-shape autotune so route selection is data-driven instead of a fixed
+    default.
+  - Try a launch-fused reduction strategy or cooperative approach to keep the
+    FC1 parallelism benefit without the extra reduce launch.
 ## 2026-06-19 FP16 Accumulation Default: SM90, GPT-OSS Decode Shape
 
 ### Setup

diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h
@@ -33,8 +33,8 @@ struct OrtCUDAProviderOptionsV2 {
   int tunable_op_enable = 0;                                                                                   // flag specifying if TunableOp is enabled.
   int tunable_op_tuning_enable = 0;                                                                            // flag specifying if TunableOp is enabled for tuning, this relies on TunableOp is enabled.
   int tunable_op_max_tuning_duration_ms = 0;                                                                   // Max tuning duration time limit for TunableOp.
-  int enable_skip_layer_norm_strict_mode = 0;                                                                  // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel.
-                                                                                                               // The strict mode has better accuracy but lower performance.
+  int enable_skip_layer_norm_strict_mode = 0;                                                                  // [Deprecated] Accepted for ABI/back-compat but not stored in EP info. SkipLayerNorm always accumulates in fp32.
+                                                                                                               // Setting it has no effect on computation or output.
   int prefer_nhwc = 0;                                                                                         // make the CUDA EP NHWC preferred
   int use_ep_level_unified_stream = 0;                                                                         // flag specifying if ep level stream is used or not
   int use_tf32 = 1;                                                                                            // use TF32

diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h
@@ -97,7 +97,7 @@ Status Check_QKV(const T* packed_qkv, const T* value, const int num_heads, const
 
 template <typename T>
 Status CheckPast(const T* past_key, const T* past_value, int batch_size, int kv_num_heads, int head_size, int kv_cache_bit_width,
-                 int& past_sequence_length) {
+                 int& past_sequence_length, int kv_cache_extra_bits = 0) {
   const auto& past_key_dims = past_key->Shape().GetDims();
   const auto& past_value_dims = past_value->Shape().GetDims();
 
@@ -140,17 +140,25 @@ Status CheckPast(const T* past_key, const T* past_value, int batch_size, int kv_
   // We assume all sequence in past kv are right-padded to max or past sequence length
   past_sequence_length = static_cast<int>(past_key_dims[2]);
 
-  // For 4-bit quantized KV cache, actual dimension is head_size / 2 because 2 nibbles are packed into one byte.
-  // Note that we have checked that head_size is a multiple of 8 in Check_QKV.
-  int packed_head_size = (kv_cache_bit_width == 4) ? (head_size / 2) : head_size;
+  // Compute expected KV cache head dimension from quantization parameters.
+  // kv_cache_bit_width: bits per element (4 or 8). 0 means no quantization.
+  // kv_cache_extra_bits: additional metadata bits per head
+  // (e.g., 32 bits for TurboQuant, which stores a scale).
+  int packed_head_size;
+  if (kv_cache_bit_width == 0) {
+    packed_head_size = head_size;
+  } else {
+    int bits_per_element = static_cast<int>(past_key->DataType()->Size()) * 8;
+    packed_head_size = (head_size * kv_cache_bit_width + kv_cache_extra_bits) / bits_per_element;
+  }
   if (past_key_dims[3] != packed_head_size) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "Input 'past_key' dimension 3 should be same as head_size, got ",
+                           "Input 'past_key' dimension 3 should match the packed KV head dimension, got ",
                            past_key_dims[3], " expected ", packed_head_size);
   }
   if (past_value_dims[3] != packed_head_size) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "Input 'past_value' dimension 3 should be same as head_size, got ",
+                           "Input 'past_value' dimension 3 should match the packed KV head dimension, got ",
                            past_value_dims[3], " expected ", packed_head_size);
   }
   return Status::OK();
@@ -206,7 +214,12 @@ Status CheckInputs(const T* query,
                    const T* total_seqlen,
                    float scale,
                    float softcap,
-                   int kv_cache_bit_width) {
+                   int kv_cache_bit_width,
+                   int max_threads_per_block = 0,
+                   int kv_cache_extra_bits = 0) {
+  if (max_threads_per_block > 0 && num_heads > max_threads_per_block) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block);
+  }
   // Note: Here S* is seqlen_past_kv_cache, S+ is seqlen_present_kv_cache
   //     past_key                   : (B, N_k, S*, H) or (B, N_k, S+, H) or nullptr
   //     past_value                 : (B, N_k, S*, H) or (B, N_k, S+, H) or nullptr
@@ -246,10 +259,15 @@ Status CheckInputs(const T* query,
     kv_sequence_length = sequence_length;
   }
 
+  if (kv_cache_extra_bits != 0 && kv_cache_bit_width == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "kv_cache_extra_bits requires kv_cache_bit_width to be non-zero.");
+  }
+
   // Check past-present KV
   int32_t past_sequence_length = 0;
   if (past_key != nullptr && past_value != nullptr) {
-    ORT_RETURN_IF_ERROR(CheckPast(past_key, past_value, batch_size, kv_num_heads, head_size, kv_cache_bit_width, past_sequence_length));
+    ORT_RETURN_IF_ERROR(CheckPast(past_key, past_value, batch_size, kv_num_heads, head_size, kv_cache_bit_width, past_sequence_length, kv_cache_extra_bits));
     // When past KV exists, Q and K/V must have the same sequence length,
     // UNLESS kv_sequence_length is 0 (shared KV: new K/V are empty, past buffer
     // already contains the full shared KV cache — no append needed).
@@ -377,30 +395,6 @@ Status CheckInputs(const T* query,
   return Status::OK();
 }
 
-template <typename T = Tensor>
-Status CheckInputs(const T* query,
-                   const T* key,
-                   const T* value,
-                   const T* past_key,
-                   const T* past_value,
-                   const T* cos_cache,
-                   const T* sin_cache,
-                   void* parameters,
-                   int num_heads,
-                   int kv_num_heads,
-                   const T* seqlens_k,
-                   const T* total_seqlen,
-                   float scale,
-                   float softcap,
-                   int kv_cache_bit_width,
-                   int max_threads_per_block) {
-  if (max_threads_per_block > 0 && num_heads > max_threads_per_block) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block);
-  }
-
-  return CheckInputs(query, key, value, past_key, past_value, cos_cache, sin_cache, parameters, num_heads, kv_num_heads, seqlens_k, total_seqlen, scale, softcap, kv_cache_bit_width);
-}
-
 template <typename T = Tensor>
 Status CheckCustomAttentionInputs(const T* position_ids,
                                   const T* attention_bias,

diff --git a/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h b/onnxruntime/contrib_ops/cpu/maxpool_with_mask.h
@@ -216,6 +216,16 @@ class MaxpoolWithMask : public OpKernel, public PoolBase {
                         "Mask and input spatial dimensions mismatch at dimension ", i,
                         ": mask=", m_shape[i], " input=", x_shape[i]);
     }
+    // x_shape.NumDimensions() >= 3 is guaranteed above, so this subtraction cannot underflow.
+    const size_t input_spatial_rank = x_shape.NumDimensions() - 2;
+    // The pooling kernel rank drives the 1D/2D/3D dispatch below, which reads x_shape[2..4] and
+    // output_dims[2..4]. Require it to match the input spatial rank so those reads stay in bounds.
+    ORT_RETURN_IF_NOT(pool_attrs_.kernel_shape.size() == input_spatial_rank,
+                      "Pooling kernel rank must equal input spatial rank. Got kernel rank: ",
+                      pool_attrs_.kernel_shape.size(), " input spatial rank: ", input_spatial_rank);
+    // Only 1D/2D/3D pooling is implemented by the dispatch below; a larger rank would match no case.
+    ORT_RETURN_IF_NOT(input_spatial_rank >= 1 && input_spatial_rank <= 3,
+                      "Only 1D, 2D, and 3D pooling are supported. Got input spatial rank: ", input_spatial_rank);
 
     TensorShapeVector pads = pool_attrs_.pads;
     TensorShapeVector kernel_shape = pool_attrs_.kernel_shape;