NVIDIA
diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py
Lines changed: 40 additions & 4 deletions
diff --git a/megatron/core/inference/engines/dynamic_engine.py b/megatron/core/inference/engines/dynamic_engine.py
Lines changed: 15 additions & 7 deletions
@@ -1839,26 +1839,62 @@ def current_input_and_position_ids(
             self.token_to_pos_ids[:num_tokens].unsqueeze(0),
         )
 
-    def last_token_logits(self, logits: Tensor) -> Tensor:
-        """Last tokens of logits.
+    def speculative_required_logit_indices(self, device: torch.device) -> Tensor:
+        """Token-level indices needed for speculative decode verification.
+
+        Returns all decode token positions (base + speculative), which are taken to be the
+        leading tokens of the packed sequence, then the last token of each prefill request.
 
         Args:
-            logits (Tensor): Output logits of forward pass.
+            device (torch.device): Device on which to create the index tensor.
 
         Return:
-            (Tensor) Last token logits.
+            (Tensor) 1-D indices into the packed token sequence, length
+            ``num_decode_requests * (num_speculative_tokens + 1) + num_prefill_requests``.
         """
         paused = self.paused_request_count
         total = self.total_request_count
         query_lengths = self.request_query_lengths[paused:total]
+        num_decode = self.num_decode_requests
+
+        decode_token_count = num_decode * (self.num_speculative_tokens + 1)
+        decode_indices = torch.arange(decode_token_count, device=device)
+
+        cumsum = torch.cumsum(query_lengths, dim=0)
+        prefill_last_indices = cumsum[num_decode:] - 1
+
+        return torch.cat([decode_indices, prefill_last_indices])
+
+    def last_token_logits(self, logits: Tensor) -> Tensor:
+        """Select the logit positions needed for token generation.
+
+        When speculative decoding is active, decode requests need logits for all
+        their tokens (base + speculative) for verification, while prefill requests
+        only need the last token logit. This avoids materializing the full
+        vocab-sized logits for every prefill token, which causes large memory
+        spikes during prefill-heavy batches.
+
+        Args:
+            logits (Tensor): Output logits of forward pass, shape [1, S, V] (V = vocab size).
 
+        Return:
+            (Tensor) Selected logits, shape [N, V], one row per selected token position.
+        """
         # todo: @lmcafee, remove these asserts?
         assert logits.size(0) == 1, f"logits.size(0) ({tuple(logits.shape)}) != 1"
         assert logits.size(1) == self.padded_active_token_count, (
             f"logits.size(1) ({tuple(logits.shape)}) != "
             f"padded_active_token_count ({self.padded_active_token_count})."
         )
         logits_2d = logits.squeeze(0)
+
+        if self.num_speculative_tokens > 0:
+            selected = self.speculative_required_logit_indices(logits.device)
+            return logits_2d[selected, :]
+
+        paused = self.paused_request_count
+        total = self.total_request_count
+        query_lengths = self.request_query_lengths[paused:total]
         last_token_idxs = torch.cumsum(query_lengths, dim=0) - 1
         return logits_2d[last_token_idxs, :]
 
 
@@ -214,12 +214,9 @@ def __init__(self, controller: TextGenerationController, context: DynamicInferen
 
         if self.num_speculative_tokens > 0:
             assert (
-                self.num_speculative_tokens <= self.controller.num_mtp_heads
+                model_config.mtp_use_repeated_layer
+                or self.num_speculative_tokens <= self.controller.num_mtp_heads
             ), f"Number of speculative tokens {self.num_speculative_tokens} must be less than or equal to number of MTP heads {self.controller.num_mtp_heads}"
-            assert (
-                not self.materialize_only_last_token_logits
-            ), "materialize_only_last_token_logits must be False when num_speculative_tokens > 0"
-
         self.track_paused_request_events = inference_config.track_paused_request_events
         self.track_generated_token_events = inference_config.track_generated_token_events
         self.enable_chunked_prefill = inference_config.enable_chunked_prefill
@@ -1211,7 +1208,13 @@ def post_process_requests(
                     top_n_logprobs[req_idx] = top_n_logprobs[req_idx][:-num_stop_word_trim]
 
             # Process log_probs if available (unified for both regular and chunked prefill)
-            if request_log_probs is not None:
+            # Skip for requests being finished due to stop words — tokens are not
+            # appended for these requests, so log probs must also be skipped to keep
+            # the two lists in sync.
+            if (
+                request_log_probs is not None
+                and request_id not in self.stop_word_being_finished_ids
+            ):
                 # Initialize lists if they don't exist
                 if not request.prompt_log_probs:
                     request.prompt_log_probs = []
@@ -1244,7 +1247,12 @@ def post_process_requests(
                         request.generated_log_probs.extend(request_log_probs[split_idx:])
 
             # Process top_n_logprobs if available (unified for both regular and chunked prefill)
-            if top_n_logprobs is not None and req_idx in top_n_logprobs:
+            # Same stop-word guard as log probs above.
+            if (
+                top_n_logprobs is not None
+                and req_idx in top_n_logprobs
+                and request_id not in self.stop_word_being_finished_ids
+            ):
                 # Initialize lists if they don't exist
                 if request.prompt_top_n_logprobs is None:
                     request.prompt_top_n_logprobs = []