Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 105 additions & 58 deletions lmdeploy/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import enum
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Literal, Optional
from typing import Any, Callable, Literal

import torch
from pydantic.dataclasses import dataclass as pydantic_dataclass
Expand Down Expand Up @@ -100,7 +100,7 @@ class GenerationConfig:
"""

n: int = 1
max_new_tokens: int = 512
max_new_tokens: int = None
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GenerationConfig.max_new_tokens now defaults to None, but other code paths construct GenerationConfig() and immediately treat max_new_tokens as an int. For example, lmdeploy/pytorch/engine/engine_instance.py creates gen_config = gen_config or GenerationConfig() and SamplingParam.from_gen_config() then does numeric comparisons (max_new_tokens < 0, etc.), which will raise a TypeError when max_new_tokens is None. Either keep the default as an int (e.g., 512) or update all consumers (not just AsyncEngine._determine_gen_config) to handle None safely.

Suggested change
max_new_tokens: int = None
max_new_tokens: int | None = 512

Copilot uses AI. Check for mistakes.
do_sample: bool = False
top_p: float = 1.0
top_k: int = 50
Expand All @@ -109,24 +109,24 @@ class GenerationConfig:
repetition_penalty: float = 1.0
ignore_eos: bool = False
random_seed: int = None
stop_words: List[str] = None
bad_words: List[str] = None
stop_token_ids: List[int] = None
bad_token_ids: List[int] = None
stop_words: list[str] = None
bad_words: list[str] = None
stop_token_ids: list[int] | list[list[int]] = None
bad_token_ids: list[int] = None
min_new_tokens: int = None
skip_special_tokens: bool = True
spaces_between_special_tokens: bool = True
logprobs: int = None
response_format: Optional[Dict] = None
logits_processors: Optional[List[LogitsProcessor]] = None
response_format: dict | None = None
logits_processors: list[LogitsProcessor] | None = None
output_logits: Literal['all', 'generation'] = None
output_last_hidden_state: Literal['all', 'generation'] = None
include_stop_str_in_output: bool = False

# for disaggregation
with_cache: bool = False
preserve_cache: bool = False
migration_request: Optional[MigrationRequest] = None
migration_request: MigrationRequest | None = None

# router replay
return_routed_experts: bool = False
Expand All @@ -135,46 +135,91 @@ class GenerationConfig:
repetition_ngram_size: int = 0
repetition_ngram_threshold: int = 0

@staticmethod
def _normalize_stop_token_ids(ids: list[int] | list[list[int]] | None) -> list[list[int]]:
"""Normalize stop_token_ids to list[list[int]]."""
if ids is None:
return []
out: list[list[int]] = []
for item in ids:
if isinstance(item, int):
out.append([item])
else:
out.append(list(item))
return out

def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer):
"""Convert stop_words/bad_sords to ids and append the ids to
"""Convert stop_words/bad_words to ids and append the ids to
stop_token_ids/bad_token_ids."""

def special_word_token_ids(words):
if words is not None:
assert isinstance(words, List) and \
all(isinstance(elem, str) for elem in words), \
f'stop_words must be a list of str but got {type(words)}'
indexes = []
for word in words:
indexes += tokenizer.indexes_containing_token(word)
return indexes
return None

stop_token_ids = special_word_token_ids(self.stop_words) or []
bad_token_ids = special_word_token_ids(self.bad_words) or []
stop_token_ids.extend(self.stop_token_ids or [])
bad_token_ids.extend(self.bad_token_ids or [])
self.stop_token_ids = list(set(stop_token_ids)) or None
self.bad_token_ids = list(set(bad_token_ids)) or None
def words_to_token_seqs(words: list[str]) -> list[list[int]]:
    """Map each word to one or more token-id sequences.

    Single tokens that contain the word are preferred (each becomes its
    own one-token sequence); when none exist, the word is encoded as a
    whole (without BOS) and kept as one multi-token sequence. Words that
    encode to nothing are dropped.
    """
    assert isinstance(words, list) and \
        all(isinstance(elem, str) for elem in words), \
        f'stop_words must be a list of str but got {type(words)}'
    seqs: list[list[int]] = []
    for word in words:
        matches = tokenizer.indexes_containing_token(word)
        if matches:
            seqs.extend([token_id] for token_id in matches)
        else:
            token_ids = tokenizer.encode(word, add_bos=False)
            if token_ids:
                seqs.append(token_ids)
    return seqs

stop_seqs = words_to_token_seqs(self.stop_words) if self.stop_words else []
bad_seqs = words_to_token_seqs(self.bad_words) if self.bad_words else []

stop_seqs.extend(self._normalize_stop_token_ids(self.stop_token_ids))
bad_seqs.extend([[i] for i in (self.bad_token_ids or [])])

# deduplicate stop_token_ids and bad_token_ids
seen = set()
deduped: list[list[int]] = []
for seq in stop_seqs:
key = tuple(seq)
if key not in seen:
seen.add(key)
deduped.append(seq)
self.stop_token_ids = deduped or None
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GenerationConfig.convert_stop_bad_words_to_ids() now mutates self.stop_token_ids to a nested list[list[int]]. Since GenerationConfig is a public config object (and previously exposed stop_token_ids as List[int]), this is a backward-incompatible behavioral change for downstream code that inspects or serializes stop_token_ids. Consider either (a) keeping stop_token_ids externally as list[int] when only single-token stops are present, (b) introducing a new field for multi-token stop sequences, or (c) clearly documenting this change and providing a helper accessor that returns the legacy flattened single-token view.

Suggested change
self.stop_token_ids = deduped or None
# Preserve backward-compatible shape: use flat list[int] when only
# single-token stop sequences are present; otherwise keep nested list[list[int]].
if deduped and all(len(seq) == 1 for seq in deduped):
self.stop_token_ids = [seq[0] for seq in deduped] or None
else:
self.stop_token_ids = deduped or None

Copilot uses AI. Check for mistakes.

Comment on lines +139 to +187
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR changes GenerationConfig.stop_token_ids to be normalized/stored as list[list[int]] (to support multi-token stop sequences). There are existing unit tests (e.g. tests/test_lmdeploy/test_messages.py::test_engine_generation_config) that assert stop_token_ids is List[int], so the current test suite will fail and there are no new tests covering multi-token stop sequences. Please update the existing tests and add coverage for: (1) mixed stop_token_ids input ([int] and [[...]]), (2) multi-token stop_words encoding path, and (3) streaming stop behavior (holdback) if applicable.

Copilot uses AI. Check for mistakes.
seen_bad = set()
deduped_bad: list[int] = []
for seq in bad_seqs:
if len(seq) > 1:
logger.warning(f'Multi-token bad word {seq} is not supported and '
'will be ignored. Only single-token bad words can be '
'masked in logits processing.')
continue
if seq[0] not in seen_bad:
seen_bad.add(seq[0])
deduped_bad.append(seq[0])
self.bad_token_ids = deduped_bad or None

def update_from_hf_gen_cfg(self, generation_config, tokenizer_eos_token_id):
    """Update the stop_token_ids.

    Merges the tokenizer's eos_token_id and any eos_token_id entries
    from the model's generation_config.json into ``stop_token_ids``,
    normalized to the nested list[list[int]] form, skipping duplicates.
    """
    stop_seqs = self._normalize_stop_token_ids(self.stop_token_ids)
    existing = {tuple(seq) for seq in stop_seqs}

    def _add_single(tok_id: int):
        # Append a one-token stop sequence unless it is already present.
        key = (tok_id, )
        if key not in existing:
            existing.add(key)
            stop_seqs.append([tok_id])

    # add tokenizer's eos_token_id
    if tokenizer_eos_token_id is not None:
        _add_single(tokenizer_eos_token_id)

    # add eos_token_id from model's generation_config.json file if there
    # is any; it may be a single id or a collection of ids.
    eos_token_id = generation_config.get('eos_token_id')
    if eos_token_id is not None:
        if isinstance(eos_token_id, int):
            _add_single(eos_token_id)
        else:
            for eid in eos_token_id:
                _add_single(eid)

    self.stop_token_ids = stop_seqs

def __post_init__(self):
"""Check input validation."""
Expand All @@ -184,6 +229,8 @@ def __post_init__(self):
assert self.temperature >= 0 and self.temperature <= 2 # [0,2]
assert 0 <= self.min_p <= 1, \
f'min_p should be in range [0, 1], but found {self.min_p}'
if self.stop_token_ids is not None:
self.stop_token_ids = self._normalize_stop_token_ids(self.stop_token_ids)


@pydantic_dataclass
Expand Down Expand Up @@ -251,7 +298,7 @@ class TurbomindEngineConfig:
"""

dtype: str = 'auto'
model_format: Optional[str] = None
model_format: str | None = None
tp: int = 1
dp: int = 1
cp: int = 1
Expand All @@ -264,9 +311,9 @@ class TurbomindEngineConfig:
outer_dp_size: int = None
nnodes: int = 1
node_rank: int = 0
dist_init_addr: Optional[str] = None
devices: List[int] = None
session_len: Optional[int] = None
dist_init_addr: str | None = None
devices: list[int] = None
session_len: int | None = None
max_batch_size: int = None
cache_max_entry_count: float = 0.8
cache_chunk_size: int = -1
Expand All @@ -275,16 +322,16 @@ class TurbomindEngineConfig:
quant_policy: int = 0
rope_scaling_factor: float = 0.0
use_logn_attn: bool = False
download_dir: Optional[str] = None
revision: Optional[str] = None
download_dir: str | None = None
revision: str | None = None
max_prefill_token_num: int = 8192
num_tokens_per_iter: int = 0
max_prefill_iters: int = 1
async_: int = 1
devices: Optional[List[int]] = None
devices: list[int] | None = None
empty_init: bool = False
Comment on lines 320 to 338
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TurbomindEngineConfig defines devices twice (once at line 315 and again at line 331). In a dataclass, the later field overwrites the earlier one, which is error-prone and can lead to confusing defaults/type hints. Remove the duplicate and keep a single devices definition with the intended type/default.

Copilot uses AI. Check for mistakes.
communicator: str = 'nccl'
hf_overrides: Optional[Dict[str, Any]] = None
hf_overrides: dict[str, Any] | None = None
enable_metrics: bool = True

def __post_init__(self):
Expand Down Expand Up @@ -388,13 +435,13 @@ class PytorchEngineConfig:
block_size: int = 64
num_cpu_blocks: int = 0
num_gpu_blocks: int = 0
adapters: Dict[str, str] = None
adapters: dict[str, str] = None
max_prefill_token_num: int = 4096
thread_safe: bool = False
enable_prefix_caching: bool = False
device_type: str = 'cuda'
eager_mode: bool = False
custom_module_map: Dict[str, str] = None
custom_module_map: dict[str, str] = None
download_dir: str = None
revision: str = None
quant_policy: Literal[0, 4, 8] = 0
Expand All @@ -406,7 +453,7 @@ class PytorchEngineConfig:
mp_engine_backend: str = 'mp'
model_format: str = None
enable_metrics: bool = True
hf_overrides: Optional[Dict[str, Any]] = None
hf_overrides: dict[str, Any] | None = None
disable_vision_encoder: bool = False
logprobs_mode: str = None
# router replay
Expand Down Expand Up @@ -488,9 +535,9 @@ class Response:
text: str
generate_token_len: int
input_token_len: int
finish_reason: Optional[Literal['stop', 'length']] = None
token_ids: List[int] = field(default_factory=list)
logprobs: List[Dict[int, float]] = None
finish_reason: Literal['stop', 'length'] | None = None
token_ids: list[int] = field(default_factory=list)
logprobs: list[dict[int, float]] = None
logits: torch.Tensor = None
last_hidden_state: torch.Tensor = None
index: int = 0
Expand All @@ -511,7 +558,7 @@ def _format_none_text_fields(self):
fields.append(f'logprobs={self.logprobs}')

# Helper function to format tensor information
def _format_tensor(name: str, tensor: Optional[torch.Tensor]) -> List[str]:
def _format_tensor(name: str, tensor: torch.Tensor | None) -> list[str]:
if tensor is None:
return [f'{name}=None']
try:
Expand Down Expand Up @@ -580,7 +627,7 @@ class EngineEvent:
timestamp: float

@classmethod
def new_event(cls, event_type: EventType, timestamp: Optional[float] = None) -> 'EngineEvent':
def new_event(cls, event_type: EventType, timestamp: float | None = None) -> 'EngineEvent':
# Timestamps MUST use wall-clock time (time.time()) to maintain consistency
# between csrc(std::chrono::system_clock) and python
timestamp = time.time() if timestamp is None else timestamp
Expand All @@ -604,11 +651,11 @@ class RequestMetrics:

Attributes:
token_timestamp: A wall-clock time when a token is generated.
engine_events: List of engine events during inference.
engine_events: list of engine events during inference.
"""
token_timestamp: float = 0.0
engine_events: List[EngineEvent] = field(default_factory=list)
spec_info: Optional[Dict[str, Any]] = None
engine_events: list[EngineEvent] = field(default_factory=list)
spec_info: dict[str, Any] | None = None


@dataclass
Expand All @@ -625,12 +672,12 @@ class EngineOutput:
req_metrics: request metrics information
"""
status: ResponseType
token_ids: List[int]
logprobs: List[Dict[int, float]] = None
token_ids: list[int]
logprobs: list[dict[int, float]] = None
logits: torch.Tensor = None
last_hidden_state: torch.Tensor = None
cache_block_ids: Optional[List[int]] = None
req_metrics: Optional[RequestMetrics] = None
cache_block_ids: list[int] | None = None
req_metrics: RequestMetrics | None = None
routed_experts: torch.Tensor = None


Expand Down
14 changes: 8 additions & 6 deletions lmdeploy/pytorch/engine/logits_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def _filter_repetition_ngram_(
return scores
# use first stop words
_, found = ngram(generated_ids, n, threshold, max_n, max_ngram_window_size)
stop_words = stop_words[:, 0]
stop_words = stop_words[:, 0, 0]
# fill all scores -inf
scores.masked_fill_(found[:, None], -float('inf'))
# set stop words to 0
Expand Down Expand Up @@ -245,7 +245,7 @@ class SamplingInputs:
bad_words: torch.LongTensor = None
bad_mask: torch.BoolTensor = None
stop_words: torch.LongTensor = None
stop_mask: torch.BoolTensor = None
stop_word_lens: torch.LongTensor = None
repetition_penalty: torch.Tensor = None
top_k: torch.LongTensor = None
top_p: torch.Tensor = None
Expand Down Expand Up @@ -428,11 +428,13 @@ async def __call__(self, scores: torch.Tensor) -> torch.Tensor:
scores = _process_bad_words_(scores, bad_words, bad_mask)

stop_words = sampling_inputs.stop_words
if stop_words is not None:
stop_word_lens = sampling_inputs.stop_word_lens
if stop_words is not None and stop_word_lens is not None:
ignore_eos = sampling_inputs.num_ignore_eos > 0
stop_mask = sampling_inputs.stop_mask
stop_mask = torch.where(ignore_eos[:, None], stop_mask, False)
scores = _process_bad_words_(scores, stop_words, stop_mask)
single_mask = (stop_word_lens == 1) & ignore_eos[:, None]
if single_mask.any():
single_tokens = stop_words[:, :, 0]
scores = _process_bad_words_(scores, single_tokens, single_mask)

return scores, logprobs

Expand Down
6 changes: 5 additions & 1 deletion lmdeploy/pytorch/engine/model_agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,11 +695,15 @@ async def _step_postprocess_with_output(self,
logger.debug(f'<ForwardTask> rank[{rank}]: synchronize token ids')

# stopping criteria
# Use output_token_ids (all tokens accepted this step) so that multi-token
# stop sequences whose last token is not the final spec-decoded token are
# detected correctly. For non-spec AR, output_token_ids == next_token_ids.
stopped, stop_pos, stopping_criteria = stopping_criteria.step(
next_token_ids,
output_token_ids,
sampling_inputs.stop_words,
inputs=inputs,
extra_inputs=extra_inputs,
stop_word_lens=sampling_inputs.stop_word_lens,
)
Comment on lines 703 to 713
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stopping_criteria.step(...) is now called with the new keyword argument stop_word_lens=..., but most StoppingCriteria implementations (e.g. ARSpecStoppingCriteria.step, DLLMStoppingCriteria.step, and the abstract base signature) do not accept this parameter. This will raise TypeError: got an unexpected keyword argument 'stop_word_lens' when using non-AR strategies. Either update the base StoppingCriteria.step contract (and all implementations) to accept stop_word_lens (or **kwargs), or only pass stop_word_lens when the concrete stopping criteria supports it.

Copilot uses AI. Check for mistakes.

# send output
Expand Down
8 changes: 6 additions & 2 deletions lmdeploy/pytorch/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class SamplingParam:
repetition_penalty: float = 1.0
ignore_eos: bool = False
random_seed: int = None
stop_words: List[int] = field(default_factory=list)
stop_words: List[List[int]] = field(default_factory=list)
bad_words: List[int] = field(default_factory=list)
max_new_tokens: int = 512
min_new_tokens: int = 0
Expand All @@ -75,7 +75,11 @@ def from_gen_config(cls, gen_config: GenerationConfig):
stop_words = gen_config.stop_token_ids or []
bad_words = gen_config.bad_token_ids or []
if gen_config.ignore_eos:
bad_words += stop_words
if any(len(s) > 1 for s in stop_words):
logger.warning('Multi-token stop words are not supported and '
'will be ignored. Only single-token stop words can '
'be used to stop generation.')
bad_words += [s[0] for s in stop_words if len(s) == 1]
stop_words = []

top_k = gen_config.top_k
Expand Down
Loading
Loading