diff --git a/docs_new/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx b/docs_new/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx index 5bbcd399cedd..87a03fe6c8a2 100644 --- a/docs_new/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx +++ b/docs_new/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx @@ -110,7 +110,7 @@ The Playground is where you experiment with **SGLang features beyond the verifie The knobs come in two flavors: - **Built-in SGLang features** — parallelism overrides (TP / CP / DP-Attention — DP-Attention's value is the DP degree, with `off` to disable), MoE backend + EP, reasoning / tool-call parsers, speculative-decoding presets, prefill/decode disaggregation, HiCache tiers, and HiSparse hierarchical sparse attention (decode-role only — the card appears once PD-Disagg mode is set to decode). -- **DeepSeek-V4 specific features** — MegaMoE W4A8 / W4A4 fused kernel (Blackwell only). +- **DeepSeek-V4 specific features** — MegaMoE W4A8 / W4A4 fused kernel (Blackwell only; Hopper SM90 uses a separate all-FP8 MegaMoE path — see Configuration Tips below). Lines highlighted **green** are added by your overrides; lines with **red strikethrough** were in the verified base but stripped by an override. When no override differs from the base cell, the playground inherits the base's **Verified** badge; any actual change flips it to **Not Verified** until the new configuration is run end-to-end and submitted back. @@ -286,6 +286,8 @@ Two options are available for running DeepSeek-V4 on Hopper: - **Original FP4 checkpoints** — apply the W4A16 MoE kernels (Marlin) as the command generator picks for Hopper cells. This path works on both H100 and H200 and is the only option for H100 (no FP8 path). It is TP-only; on H200 the Pro variant fits on a single 8-GPU node, while H100 Pro needs 2 nodes (TP=16). 
- **Converted FP8 checkpoints** (H100 and H200 only) — pre-repackaged FP8 weights at [`sgl-project/DeepSeek-V4-Flash-FP8`](https://huggingface.co/sgl-project/DeepSeek-V4-Flash-FP8) and [`sgl-project/DeepSeek-V4-Pro-FP8`](https://huggingface.co/sgl-project/DeepSeek-V4-Pro-FP8) unlock DP-attention + DeepEP and richer parallelism (e.g. Pro TP=16 across 2 nodes). +On these FP8 checkpoints you can additionally enable the all-FP8 **MegaMoE** path on SM90 for higher long-context / large-decode throughput — see the **SM90 (Hopper) FP8 MegaMoE** note in Configuration Tips below. + PD-Disagg recipes on H200 may require `docker run --privileged --ulimit memlock=-1` (or `--device /dev/infiniband:/dev/infiniband --cap-add IPC_LOCK`) so mooncake can discover the IB HCAs; without IB exposure mooncake silently falls back to @@ -321,11 +323,38 @@ Two variants are exposed: (~89.5 GPQA on Pro). Notes: -- MegaMoE is **only supported on Blackwell GPUs** (B200 / B300 / GB200 / GB300). The chip is hidden when the Deploy panel's base cell sits on Hopper (H100 / H200). +- The W4A8 / W4A4 variants above are **Blackwell-only** (B200 / B300 / GB200 / GB300). On **Hopper (SM90, H100 / H200)** use the all-FP8 MegaMoE path described below instead. - MegaMoE is **only wired into the `high-throughput` recipe** on Blackwell (per [sgl-project/sglang#26451](https://github.com/sgl-project/sglang/pull/26451)). The chip is hidden on `low-latency` and `balanced` — switch to `high-throughput` to expose it. - When running MegaMoE, don't set `--moe-runner-backend` manually. - Adjust `SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK` based on your workload and memory usage. Setting higher number of tokens for MegaMoE requires more HBM space (recommended: 8320 for high-throughput). 
+**SM90 (Hopper) FP8 MegaMoE (Experimental)** + +On SM90 (Hopper, H100 / H200), the all-FP8 MegaMoE path routes MoE through the +DeepGEMM `mega_moe` runner for higher long-context / large-decode throughput on +the FP8 checkpoints. Unlike the Blackwell W4A8 / W4A4 variants above, experts +stay in **FP8** — keep `SGLANG_DSV4_FP4_EXPERTS=0`. This path requires a `sgl-deep-gemm` +build with SM90 FP8 MegaMoE support. **Please use the latest image for this +feature.** + +Enable the MegaMoE path with `--moe-a2a-backend megamoe` — or equivalently set +`SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1`, which auto-configures the same backend: + +```bash Command +SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 \ +SGLANG_DSV4_FP4_EXPERTS=0 \ +sglang serve \ + --model-path sgl-project/DeepSeek-V4-Flash-FP8 \ + --tp 8 \ + --moe-a2a-backend megamoe \ + --chunked-prefill-size 4096 +``` + +`SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK` caps the number of tokens +the MegaMoE path processes per rank (i.e. per GPU); the MegaMoE path is only used +for batches at or below this cap. The right value depends on your parallelism / +token-split scheme, and larger values reserve more HBM. For example, with `--enable-dp-attention`, `--dp 4` and `--chunked-prefill-size 8192` the per-rank prefill bound is 8192 / 4 = 2048, so a cap of 2048 keeps every prefill chunk on the MegaMoE path, while a smaller cap makes larger chunks fall back. 
+ **GB300 PD-Disagg cross-pod MNNVL** On some GB300 clusters with cross-pod KV transfer over NVLink, mooncake may diff --git a/python/sglang/srt/layers/moe/mega_moe.py b/python/sglang/srt/layers/moe/mega_moe.py index 8e8474f161c5..723460a7ba70 100644 --- a/python/sglang/srt/layers/moe/mega_moe.py +++ b/python/sglang/srt/layers/moe/mega_moe.py @@ -25,8 +25,13 @@ from sglang.srt.environ import envs from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo from sglang.srt.layers.dp_attention import get_dp_global_num_tokens +from sglang.srt.layers.moe.mega_moe_sm90 import ( + is_sm90_fp8_mega_moe_available, + run_sm90_mega_routed, +) from sglang.srt.layers.moe.utils import get_moe_a2a_backend from sglang.srt.model_executor.runner import get_is_capture_mode +from sglang.srt.models.deepseek_common.utils import _device_sm if TYPE_CHECKING: from deep_gemm import SymmBuffer @@ -99,6 +104,9 @@ def should_use_mega_moe(moe: DeepseekV2MoE, hidden_states: torch.Tensor) -> bool return False if not getattr(moe.experts, "_mega_moe_weights_built", False): return False + if _device_sm == 90: + if not is_sm90_fp8_mega_moe_available(moe.experts): + return False if get_is_capture_mode(): return True @@ -213,6 +221,16 @@ def _run_mega_routed( topk_ids_in = hidden_states.new_empty((0, top_k), dtype=torch.int32) topk_weights_in = hidden_states.new_empty((0, top_k), dtype=torch.float32) + if _device_sm == 90: + return run_sm90_mega_routed( + moe, + hidden_states, + topk_ids_in, + topk_weights_in, + buf, + num_tokens, + ) + use_fp4_acts = envs.SGLANG_OPT_DEEPGEMM_MEGA_MOE_USE_FP4_ACTS.get() if use_fp4_acts: # FP4 path goes through DeepGEMM's mega_moe_pre_dispatch which diff --git a/python/sglang/srt/layers/moe/mega_moe_sm90.py b/python/sglang/srt/layers/moe/mega_moe_sm90.py new file mode 100644 index 000000000000..abe5214825ce --- /dev/null +++ b/python/sglang/srt/layers/moe/mega_moe_sm90.py @@ -0,0 +1,179 @@ +# Copyright 2023-2024 SGLang Team +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SM90 FP8 Mega-MoE forward path and expert-weight prep.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch + +from sglang.srt.environ import envs +from sglang.srt.models.deepseek_common.utils import _device_sm + +if TYPE_CHECKING: + from deep_gemm import SymmBuffer + + from sglang.srt.models.deepseek_v2 import DeepseekV2MoE + + +def is_sm90_fp8_mega_moe_available(experts) -> bool: + if _device_sm != 90: + return False + try: + import deep_gemm + except ImportError: + return False + return ( + hasattr(deep_gemm, "fp8_mega_moe") + and hasattr(deep_gemm, "mega_moe_pre_dispatch_sm90") + and getattr(experts, "_mega_moe_sm90_fp8_weights", False) + ) + + +def run_sm90_mega_routed( + moe: DeepseekV2MoE, + hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + buf: SymmBuffer, + num_tokens: int, +) -> torch.Tensor: + import deep_gemm + + if moe.experts.should_fuse_routed_scaling_factor_in_topk: + routed_scaling_factor = 1.0 + else: + routed_scaling_factor = float(moe.routed_scaling_factor) + + deep_gemm.mega_moe_pre_dispatch_sm90( + hidden_states, + topk_ids, + topk_weights, + buf.x, + buf.x_sf, + buf.topk_idx, + buf.topk_weights, + num_tokens=num_tokens, + group_size=128, + routed_scaling_factor=routed_scaling_factor, + ) + + y = torch.empty( + (max(num_tokens, 1), moe.config.hidden_size), + 
dtype=torch.bfloat16, + device=hidden_states.device, + ) + deep_gemm.fp8_mega_moe( + y, + moe.experts.mega_l1_weights, + moe.experts.mega_l2_weights, + buf, + recipe=(128, 128, 128), + activation="swiglu", + activation_clamp=getattr(moe.config, "swiglu_limit", None), + fast_math=True, + ) + y = y[:num_tokens] + + return y + + +def _interleave_l1_weight_only(weight: torch.Tensor, gran: int = 8) -> torch.Tensor: + num_groups, n, *rest = weight.shape + half = n // 2 + gate = weight[:, :half].reshape(num_groups, half // gran, gran, *rest) + up = weight[:, half:].reshape(num_groups, half // gran, gran, *rest) + return torch.stack([gate, up], dim=2).reshape(num_groups, n, *rest) + + +def build_sm90_mega_moe_experts_weights(experts) -> None: + if getattr(experts, "_mega_moe_weights_built", False): + return + + w13 = experts.w13_weight.data + w13_sf_fp32 = experts.w13_weight_scale_inv.data + w2 = experts.w2_weight.data + w2_sf_fp32 = experts.w2_weight_scale_inv.data + + assert w13.dtype == torch.float8_e4m3fn + assert w2.dtype == torch.float8_e4m3fn + + num_groups, n1, k1 = w13.shape + _, n2, k2 = w2.shape + scale_group_mn, scale_group_k = 128, 128 + + assert k1 % scale_group_k == 0 and k2 % scale_group_k == 0, ( + f"invalid SM90 mega-moe K/group_size: k1={k1}, k2={k2}, " + f"group_k={scale_group_k}" + ) + expected_n_groups_1 = (n1 + scale_group_mn - 1) // scale_group_mn + expected_n_groups_2 = (n2 + scale_group_mn - 1) // scale_group_mn + expected_k_groups_1 = k1 // scale_group_k + expected_k_groups_2 = k2 // scale_group_k + assert w13_sf_fp32.shape[1] == expected_n_groups_1, ( + f"w13 scale N groups mismatch: got {w13_sf_fp32.shape[1]}, " + f"expected {expected_n_groups_1} (n1={n1}, group_mn={scale_group_mn})" + ) + assert w2_sf_fp32.shape[1] == expected_n_groups_2, ( + f"w2 scale N groups mismatch: got {w2_sf_fp32.shape[1]}, " + f"expected {expected_n_groups_2} (n2={n2}, group_mn={scale_group_mn})" + ) + assert w13_sf_fp32.shape[2] == expected_k_groups_1, ( + f"w13 
scale K groups mismatch: got {w13_sf_fp32.shape[2]}, " + f"expected {expected_k_groups_1} (k1={k1}, group_k={scale_group_k})" + ) + assert w2_sf_fp32.shape[2] == expected_k_groups_2, ( + f"w2 scale K groups mismatch: got {w2_sf_fp32.shape[2]}, " + f"expected {expected_k_groups_2} (k2={k2}, group_k={scale_group_k})" + ) + + if envs.SGLANG_OPT_FIX_MEGA_MOE_MEMORY.get(): + w13_interleaved = _interleave_l1_weight_only(w13) + experts.w13_weight.data = w13_interleaved + experts.mega_l1_weights = ( + experts.w13_weight.data, + experts.w13_weight_scale_inv.data, + ) + experts.mega_l2_weights = ( + experts.w2_weight.data, + experts.w2_weight_scale_inv.data, + ) + else: + import deep_gemm + + w13_sf = deep_gemm.transform_sf_into_required_layout( + w13_sf_fp32, + mn=n1, + k=k1, + recipe=(128, 128), + num_groups=num_groups, + disable_ue8m0_cast=True, + ) + w2_sf = deep_gemm.transform_sf_into_required_layout( + w2_sf_fp32, + mn=n2, + k=k2, + recipe=(128, 128), + num_groups=num_groups, + disable_ue8m0_cast=True, + ) + l1_pair, l2_pair = deep_gemm.transform_weights_for_mega_moe_sm90( + (w13, w13_sf), (w2, w2_sf) + ) + experts.mega_l1_weights = l1_pair + experts.mega_l2_weights = l2_pair + + experts._mega_moe_sm90_fp8_weights = True + experts._mega_moe_weights_built = True diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index c11e7e0a53e6..34e37fb5dd8b 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -1359,6 +1359,15 @@ def process_weights_after_loading_block_quant(self, layer: Module) -> None: layer.w13_weight_scale_inv.format_ue8m0 = True layer.w2_weight_scale_inv.format_ue8m0 = True + if get_moe_a2a_backend().is_megamoe() and is_sm90_supported(): + from sglang.srt.layers.moe.mega_moe_sm90 import ( + build_sm90_mega_moe_experts_weights, + ) + + assert not self.is_fp4_expert + build_sm90_mega_moe_experts_weights(layer) + return + if not self.is_fp4_expert: 
weight_block_size = self.quant_config.weight_block_size if requant_block_scale_ue8m0_for_deepgemm( diff --git a/python/sglang/srt/models/deepseek_v4.py b/python/sglang/srt/models/deepseek_v4.py index bb4754ccd3be..49ad6547f195 100644 --- a/python/sglang/srt/models/deepseek_v4.py +++ b/python/sglang/srt/models/deepseek_v4.py @@ -1632,12 +1632,16 @@ def forward( and getattr(self.mlp, "_shared_expert_tp1", False) ) if _use_cp: - if get_moe_a2a_backend().is_none(): + moe_a2a_backend = get_moe_a2a_backend() + if moe_a2a_backend.is_none(): hidden_states = dsa_cp_gather_hidden_states(hidden_states) else: - assert get_moe_a2a_backend().is_deepep(), ( - "CP requires DeepEP (moe_a2a_backend == deepep). " - "Only DeepEP is tested with CP's per-rank token split." + cp_moe_backend_supported = ( + moe_a2a_backend.is_deepep() or moe_a2a_backend.is_megamoe() + ) + assert cp_moe_backend_supported, ( + "CP requires DeepEP (moe_a2a_backend == deepep) or MegaMoE " + "(moe_a2a_backend == megamoe)." ) elif _use_tp_moe_gather: hidden_states, local_hidden_states = ( diff --git a/test/registered/models_e2e/test_deepseek_v4_flash_fp8_h200.py b/test/registered/models_e2e/test_deepseek_v4_flash_fp8_h200.py index a44094ef02bb..f5a6e0cd7bec 100644 --- a/test/registered/models_e2e/test_deepseek_v4_flash_fp8_h200.py +++ b/test/registered/models_e2e/test_deepseek_v4_flash_fp8_h200.py @@ -22,7 +22,7 @@ try_cached_model, ) -register_cuda_ci(est_time=280, stage="extra-b", runner_config="deepep-8-gpu-h200") +register_cuda_ci(est_time=560, stage="extra-b", runner_config="deepep-8-gpu-h200") MODEL_FP8 = "sgl-project/DeepSeek-V4-Flash-FP8" SERVER_LAUNCH_TIMEOUT = 3600 @@ -87,5 +87,76 @@ def tearDownClass(cls): kill_process_tree(cls.process.pid) +class TestDSV4FlashFP8H200MegaMoE( + SpecDecodingMixin, + BasicDecodeCorrectnessMixin, + GSM8KMixin, + CustomTestCase, +): + """SM90 FP8 MegaMoE recipe: same 4-GPU split (TP=4) + EAGLE spec decoding + as the recipe above, but routes MoE through the SM90 all-FP8 
MegaMoE path + (DeepEP a2a + DeepGEMM ``deep_gemm`` runner + the ``SGLANG_OPT_*`` flags). + """ + + gsm8k_accuracy_thres = 0.93 + accept_length_thres = 1.8 + bs_1_speed_thres = 140 + + @classmethod + def setUpClass(cls): + cls.model = try_cached_model(MODEL_FP8) + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=SERVER_LAUNCH_TIMEOUT, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--dp", + "4", + "--enable-dp-attention", + "--moe-a2a-backend", + "deepep", + "--moe-runner-backend", + "deep_gemm", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "1", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "2", + "--chunked-prefill-size", + "8192", + "--cuda-graph-max-bs-decode", + "128", + "--max-running-requests", + "128", + "--watchdog-timeout", + "900", + ], + env={ + "SGLANG_DSV4_FP4_EXPERTS": "0", + "SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE": "1", + "SGLANG_OPT_FIX_MEGA_MOE_MEMORY": "1", + "SGLANG_OPT_USE_JIT_EP_ACTIVATION": "1", + # INVARIANT: this per-rank cap MUST equal + # chunked_prefill_size / dp_size (= 8192 / 4 = 2048), the per-rank + # prefill bound under --enable-dp-attention. If you change + # --chunked-prefill-size or --dp above, update this to match — + # otherwise MegaMoE falls back whenever a prefill chunk exceeds it. + "SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK": "2048", + }, + ) + + @classmethod + def tearDownClass(cls): + if hasattr(cls, "process") and cls.process: + kill_process_tree(cls.process.pid) + + if __name__ == "__main__": unittest.main()