Commit b7c4a47

add max token num branch for trtllm_allreduce_fusion
1 parent: 09cb26d

File tree: 3 files changed, +5 −3 lines

fastdeploy/model_executor/layers/linear.py

Lines changed: 4 additions & 1 deletion
@@ -940,7 +940,10 @@ def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor:
 
         out = self.quant_method.apply(self, x)
 
-        if self.reduce_results and self.tp_size > 1 and not self.enable_all_reduce_fusion:
+        need_tp_all_reduce = (
+            self.reduce_results and self.tp_size > 1 and not (self.enable_all_reduce_fusion and out.shape[0] <= 2048)
+        )
+        if need_tp_all_reduce:
             out = tensor_model_parallel_all_reduce(out, self.tp_group)
 
         return out
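
The new out.shape[0] <= 2048 check is the "max token num branch" from the commit message: the trtllm all-reduce fusion only covers batches up to a fixed maximum token count, so larger batches must fall back to the plain tensor-parallel all-reduce here. A minimal sketch of the gate, with hypothetical constant and helper names (the commit itself inlines the literal 2048 at both call sites):

# Sketch only, not from the commit; the names below are hypothetical.

FLASHINFER_FUSION_MAX_TOKEN_NUM = 2048  # max token num the fused path supports


def fusion_handles_reduction(enable_fusion: bool, num_tokens: int) -> bool:
    """True when the downstream fused allreduce+rmsnorm will do the reduction."""
    return enable_fusion and num_tokens <= FLASHINFER_FUSION_MAX_TOKEN_NUM


def need_tp_all_reduce(reduce_results: bool, tp_size: int,
                       enable_fusion: bool, num_tokens: int) -> bool:
    """linear.py's gate: reduce here unless the fused path will do it later."""
    return reduce_results and tp_size > 1 and not fusion_handles_reduction(
        enable_fusion, num_tokens
    )


# With fusion enabled, a 4096-token batch exceeds the cap, so the plain
# tensor-parallel all-reduce must still run; a 1024-token batch is fused.
assert need_tp_all_reduce(True, 8, enable_fusion=True, num_tokens=4096)
assert not need_tp_all_reduce(True, 8, enable_fusion=True, num_tokens=1024)

Hoisting the threshold into a shared constant like this would keep the two call sites from drifting apart if the supported token count ever changes; the commit keeps it inline.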

fastdeploy/model_executor/layers/normalization.py

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ def forward(
                 return norm_out.astype(x_dtype), residual_out
             norm_out = self.norm_func(x, residual_input, self.weight, self.eps)
         # enable trtllm all reduce fusion
-        elif self.enable_all_reduce_fusion:
+        elif self.enable_all_reduce_fusion and x.shape[0] <= 2048:
             norm_out = flashinfer_allreduce_residual_rmsnorm(
                 fd_config=self.fd_config, input_tensor=x, residual=residual_input, weight=self.weight, eps=self.eps
             )
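
This is the same 2048-token gate as in linear.py, applied on the normalization side. The two conditions are logical complements, so for any batch exactly one site performs the tensor-parallel reduction: the fused allreduce+rmsnorm kernel when fusion is enabled and the batch fits, the plain all-reduce in linear.py otherwise. A quick property check of that invariant (hypothetical test code, assuming reduce_results is set and tp_size > 1):

# Hypothetical check, not part of the commit.

MAX_TOKEN_NUM = 2048  # threshold inlined by this commit


def linear_reduces(enable_fusion: bool, num_tokens: int) -> bool:
    # linear.py: not (self.enable_all_reduce_fusion and out.shape[0] <= 2048)
    return not (enable_fusion and num_tokens <= MAX_TOKEN_NUM)


def norm_fuses(enable_fusion: bool, num_tokens: int) -> bool:
    # normalization.py: elif self.enable_all_reduce_fusion and x.shape[0] <= 2048
    return enable_fusion and num_tokens <= MAX_TOKEN_NUM


for enable_fusion in (False, True):
    for num_tokens in (1, 2047, 2048, 2049, 16384):
        # Exactly one of the two sites reduces: never both, never neither.
        assert linear_reduces(enable_fusion, num_tokens) != norm_fuses(enable_fusion, num_tokens)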

fastdeploy/model_executor/models/glm4_moe.py

Lines changed: 0 additions & 1 deletion
@@ -127,7 +127,6 @@ def __init__(
         self.tensor_parallel_size = fd_config.parallel_config.tensor_parallel_size
         self.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank
         self.tp_group = fd_config.parallel_config.tp_group
-        self.enable_all_reduce_fusion = fd_config.parallel_config.enable_flashinfer_allreduce_fusion
         self.use_ep = self.expert_parallel_size > 1
         self.use_tp = self.tensor_parallel_size > 1

The attribute is dropped here, presumably because nothing in this file reads it any longer: the fusion gate now lives at the linear.py and normalization.py call sites above.
