Skip to content

Commit 6119a07

Browse files
authored
[Cherry-Pick][BugFix] Fix real token exceeding max_batched_tokens limit(#7438) (#7440)
* fix incorrect max_num_batched_tokens computation * add temporary solution * fix bug
1 parent 43a657a commit 6119a07

1 file changed

Lines changed: 11 additions & 1 deletion

File tree

fastdeploy/engine/sched/resource_manager_v1.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,17 @@ def get_enough_request(request, scheduled_reqs):
718718
scheduled_reqs: list[Request] = []
719719
preempted_reqs: list[Request] = []
720720
error_reqs: list[tuple[str, str]] = []
721-
token_budget = self.config.scheduler_config.max_num_batched_tokens
721+
tokens_per_seq = (
722+
(self.config.speculative_config.num_speculative_tokens + 1)
723+
if self.config.speculative_config is not None
724+
else 1
725+
)
726+
token_budget = (
727+
self.config.scheduler_config.max_num_batched_tokens
728+
- self.config.scheduler_config.max_num_seqs * tokens_per_seq
729+
)
730+
# temporary solution to avoid negative token_budget
731+
token_budget = max(token_budget, min(self.config.scheduler_config.max_num_batched_tokens, 512))
722732
need_abort_requests = [] # users trigger abortion
723733

724734
# First, schedule the RUNNING requests.

0 commit comments

Comments
 (0)