Commit be34e6d

Fix generation with beacon tokens
The key issue was how `flash_attn_with_kvcache` handles the causal mask during multi-token decoding: it gets complicated around padding tokens during generation. This may still be an issue for longer multi-token generation, but it works for now with up to 2 tokens per step (the beacon/pad approach).
1 parent 225bc98 commit be34e6d
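
The "beacon/pad approach" mentioned above widens a decode step to at most two slots per sequence: every row carries its newly sampled token, rows at the beacon interval get a beacon token in the second slot, and the rest get right padding. Below is a minimal standalone sketch of that shaping (the token ids, interval, and function name are illustrative; the real logic lives in `maybe_insert_beacon_tokens` in surya/foundation/__init__.py):

import torch

PAD, BEACON = 0, 1      # illustrative ids, not the model's real pad/beacon token ids
beacon_interval = 4     # hypothetical beacon_token_interval

def widen_with_beacons(input_ids, num_predicted_tokens):
    """input_ids: (batch, 1) newly sampled tokens; num_predicted_tokens: (batch, 1)."""
    batch_size = input_ids.shape[0]
    add_beacon = (num_predicted_tokens % beacon_interval == 0).squeeze(1)
    if not add_beacon.any():
        # No row is at a beacon boundary: keep the single-token step, one valid token per row
        return input_ids, torch.ones(batch_size, dtype=torch.long)
    # Widen every row to two slots, filled with pad by default (right padding)
    new_input_ids = torch.full((batch_size, 2), PAD, dtype=input_ids.dtype)
    new_input_ids[:, 0] = input_ids.squeeze(1)
    new_input_ids[add_beacon, 1] = BEACON
    # Rows that got a beacon carry 2 valid tokens this step, the rest carry 1
    return new_input_ids, add_beacon.long() + 1

tokens = torch.tensor([[17], [23]])
predicted = torch.tensor([[4], [5]])
print(widen_with_beacons(tokens, predicted))
# (tensor([[17,  1], [23,  0]]), tensor([2, 1]))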

File tree

surya/common/surya/decoder/__init__.py
surya/common/surya/flash_attn_utils.py
surya/foundation/__init__.py
surya/foundation/cache.py

4 files changed: +25 -35 lines changed
surya/common/surya/decoder/__init__.py

Lines changed: 1 addition & 9 deletions
@@ -178,14 +178,6 @@ def forward(
             query_states, key_states, cos, sin
         )

-        is_prefill = all(
-            (
-                input_shape[1] > 1,
-                (past_key_value is None)
-                or (past_key_value.get_seq_length(self.layer_idx) == 0),
-            )
-        )
-
         if past_key_value is not None:
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
             # cache_idxs, num_valid_tokens, and prefill add support for our new caching mechanism
@@ -212,7 +204,7 @@ def forward(
                 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
             )
         elif self.config._attn_implementation == "flash_attention_2":
-            if is_prefill:
+            if prefill:
                 attention_interface = flash_attn_prefill
             else:
                 attention_interface = flash_attn_decode
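
The deleted `is_prefill` inference relied on `input_shape[1] > 1`, which no longer distinguishes prefill from decode once a decode step can carry two tokens (token + beacon/pad). A rough, self-contained sketch of the dispatch after this change (the stub functions stand in for the real implementations in flash_attn_utils.py):

from typing import Callable

# Stand-ins so the sketch runs on its own; the real callables are
# flash_attn_prefill / flash_attn_decode from surya/common/surya/flash_attn_utils.py.
def flash_attn_prefill(*args, **kwargs): ...
def flash_attn_decode(*args, **kwargs): ...

def pick_attention_interface(attn_implementation: str, prefill: bool) -> Callable:
    # The explicit `prefill` flag from the caller is authoritative: a 2-token decode
    # step would fool any `query_length > 1` heuristic.
    if attn_implementation == "flash_attention_2":
        return flash_attn_prefill if prefill else flash_attn_decode
    raise NotImplementedError("other attention backends are elided in this sketch")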

surya/common/surya/flash_attn_utils.py

Lines changed: 2 additions & 9 deletions
@@ -111,7 +111,6 @@ def flash_attn_prefill(
     attention_mask: torch.Tensor,
     dropout: float,
     scaling: float,
-    sliding_window: Optional[int],
     query_length: int,
     batch_size: int,
     indices_k: torch.Tensor,
@@ -135,8 +134,6 @@ def flash_attn_prefill(
     cu_seqlens_q, cu_seqlens_k = cu_seq_lens
     max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

-    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if sliding_window else {}
-
     # Returning None for attn_weights to match other attention interfaces
     flash_attn_out = _flash_attn_varlen_func(
         q_flash,
@@ -149,7 +146,6 @@ def flash_attn_prefill(
         dropout_p=dropout,
         softmax_scale=scaling,
         causal=module.is_causal,
-        **flash_kwargs
     )
     return pad_input(flash_attn_out, indices_q, batch_size, query_length), None

@@ -161,13 +157,12 @@ def flash_attn_decode(
     value_states: torch.Tensor,
     attention_mask: torch.Tensor,
     scaling: float,
-    sliding_window: bool,
     **kwargs,
 ):
     """
     Wrapper for flash attention during the decode stage

-    query_states must have shape (batch_size, num_heads, 1, head_dim), 1 is the seq length in the decoding stage
+    query_states must have shape (batch_size, num_heads, seq_len, head_dim), 1 is the seq length in the decoding stage
     key_states and value_states must have shape (batch_size, num_kv_heads, kv_len, head_dim)

     This is the opposite of what is required by flash attention, but keeps parity with the HF convention
@@ -177,14 +172,12 @@ def flash_attn_decode(
     cache_leftpad = (attention_mask == 0).cumprod(dim=1).sum(dim=1)
     cache_leftpad = cache_leftpad.to(torch.int32)

-    flash_kwargs = {'window_size': (sliding_window, sliding_window)} if sliding_window else {}
     # Returning None for attn_weights to match other attention interfaces
     return _flash_attn_with_kvcache(
         q=query_states,
         k_cache=key_states,
         v_cache=value_states,
         cache_leftpad=cache_leftpad,
-        causal=module.is_causal,
+        causal=False,
         softmax_scale=scaling,
-        **flash_kwargs
     ), None
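
For reference, the `cache_leftpad` argument passed to `_flash_attn_with_kvcache` (now together with `causal=False`, as the diff shows) is simply the number of leading pad positions per row of the attention mask. A standalone sketch with made-up mask values:

import torch

# Left-padded attention masks for a batch of 2 sequences: 0 = pad, 1 = real token.
attention_mask = torch.tensor([
    [0, 0, 1, 1, 1],
    [1, 1, 1, 1, 1],
])

# cumprod over (mask == 0) stays 1 only while we are still inside the leading pad run,
# so the row-wise sum is exactly the number of leading pads flash-attn should skip.
cache_leftpad = (attention_mask == 0).cumprod(dim=1).sum(dim=1).to(torch.int32)
print(cache_leftpad)  # tensor([2, 0], dtype=torch.int32)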

surya/foundation/__init__.py

Lines changed: 16 additions & 7 deletions
@@ -230,7 +230,11 @@ def maybe_insert_beacon_tokens(

         token = input_ids.squeeze(1) # shape: [batch_size]
         add_beacon = (num_predicted_tokens % self.beacon_token_interval == 0).squeeze()
-
+
+        # Return if no beacon tokens need to be added
+        if torch.all(~add_beacon):
+            return input_ids, torch.ones((input_ids.shape[0]), dtype=torch.long, device=input_ids.device)
+
         # Output tensors
         new_input_ids = torch.full((batch_size, 2), self.device_pad_token, dtype=input_ids.dtype, device=input_ids.device)

@@ -251,10 +255,11 @@ def decode(self, current_inputs: Optional[ContinuousBatchInput] = None):
         position_ids = current_inputs.position_ids
         num_predicted_tokens = current_inputs.num_predicted_tokens
         num_valid_tokens = current_inputs.num_valid_tokens
+        batch_size = input_ids.shape[0]

         # Pre-shift the attention mask based on the cache update
         self.kv_cache.maybe_shift_attention_mask(
-            num_valid_tokens=num_valid_tokens, cache_idxs=list(range(input_ids.shape[0]))
+            num_valid_tokens=num_valid_tokens, cache_idxs=list(range(batch_size))
         )
         with settings.INFERENCE_MODE():
             outputs = self.model(
@@ -263,7 +268,8 @@ def decode(self, current_inputs: Optional[ContinuousBatchInput] = None):
                 position_ids=position_ids,
                 use_cache=True,
                 past_key_values=self.kv_cache,
-                logits_to_keep=torch.max(num_valid_tokens).item(),
+                # We may pass multiple input ids per batch element (right padded) and we need the original size to index into them
+                logits_to_keep=None,
                 prefill=False,
                 num_valid_tokens=num_valid_tokens
             )
@@ -274,9 +280,12 @@ def decode(self, current_inputs: Optional[ContinuousBatchInput] = None):
         input_ids = processed_output.input_ids
         num_predicted_tokens += 1

-        # input_ids, num_valid_tokens = self.maybe_insert_beacon_tokens(input_ids, num_predicted_tokens)
-        # TODO we should only consider position_ids upto the valid range for each batch element
-        position_ids = position_ids[:, -1:] + torch.arange(1, input_ids.shape[1] + 1, device=input_ids.device)
+        batch_indices = torch.arange(batch_size, device=position_ids.device)
+        last_token_indices = (num_valid_tokens - 1)
+        last_valid_positions = position_ids[batch_indices, last_token_indices].reshape(batch_size, 1)
+
+        input_ids, num_valid_tokens = self.maybe_insert_beacon_tokens(input_ids, num_predicted_tokens)
+        position_ids = last_valid_positions + torch.arange(1, input_ids.shape[1] + 1, device=input_ids.device)

         new_input = ContinuousBatchInput(
             input_ids=input_ids,
@@ -377,7 +386,7 @@ def prefill(self, current_inputs: Optional[ContinuousBatchInput] = None):

         # Process outputs
         # No extra tokens during prefill
-        num_valid_tokens = torch.ones((input_ids.shape[0], 1), device=self.model.device, dtype=torch.long)
+        num_valid_tokens = torch.ones((input_ids.shape[0]), device=self.model.device, dtype=torch.long)
         num_predicted_tokens = torch.ones((input_ids.shape[0], 1), device=self.model.device, dtype=torch.long)
         processed_outputs = self.process_outputs(outputs, num_valid_tokens=num_valid_tokens)
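
The position-id handling added above is easier to see with concrete numbers: each row continues counting from its last valid position (ignoring any right padding from the previous step), then extends by one position per slot of the next step. A standalone sketch with made-up values:

import torch

# Two sequences, previous step carried 2 slots each; row 0 had both valid,
# row 1 had only the first slot valid (its second slot was right padding).
position_ids = torch.tensor([[10, 11],
                             [ 7,  8]])
num_valid_tokens = torch.tensor([2, 1])
next_step_width = 2  # e.g. token + beacon/pad

batch_size = position_ids.shape[0]
batch_indices = torch.arange(batch_size)
last_valid_positions = position_ids[batch_indices, num_valid_tokens - 1].reshape(batch_size, 1)
# Row 0 continues from 11, row 1 continues from 7 (its padded slot is ignored)
new_position_ids = last_valid_positions + torch.arange(1, next_step_width + 1)
print(new_position_ids)  # tensor([[12, 13], [ 8,  9]])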

surya/foundation/cache.py

Lines changed: 6 additions & 10 deletions
@@ -97,13 +97,10 @@ def maybe_shift_attention_mask(
             shift = new_text_len
             self._shift_attention_mask_left(cache_idx, shift)
         else:
-            # We need to figure out how many text tokens to keep and where to place them
-            keep = self.text_sliding_window - new_text_len
-            assert keep > 0, "Cannot add more new text tokens than the sliding window"
-
             # Shift entire cache left to make room for full text sliding window
             shift_amount = self.text_sliding_window - curr_text_cache_len
-            if shift_amount > 0: # Cannot be negative, may be exactly 0
+            # If this is <=0, we are already above the sliding window, so the attention mask stays the same
+            if shift_amount > 0:
                 self._shift_attention_mask_left(cache_idx, shift_amount)

     # Mirrors the logic from _prefill_update
@@ -222,17 +219,16 @@ def _decode_update(

         curr_text_cache_len = self.text_token_counts[layer_idx][cache_idx].item()

-        k_new = key_states[batch_idx, :, :new_text_len, :] # (H, new_text_len, D)
+        k_new = key_states[batch_idx, :, :new_text_len, :]
         v_new = value_states[batch_idx, :, :new_text_len, :]

         if curr_text_cache_len + new_text_len <= self.text_sliding_window:
             # If we are under the sliding window length, shift the entire cache left
             # Since we setup the max cache length with enough buffer, this will ONLY drop
             # left padding tokens out
             shift = new_text_len
-            if curr_text_cache_len > 0:
-                k_cache[cache_idx, :, :-shift, :] = k_cache[cache_idx, :, shift:, :].clone()
-                v_cache[cache_idx, :, :-shift, :] = v_cache[cache_idx, :, shift:, :].clone()
+            k_cache[cache_idx, :, :-shift, :] = k_cache[cache_idx, :, shift:, :].clone()
+            v_cache[cache_idx, :, :-shift, :] = v_cache[cache_idx, :, shift:, :].clone()
             k_cache[cache_idx, :, -shift:, :] = k_new
             v_cache[cache_idx, :, -shift:, :] = v_new

@@ -268,4 +264,4 @@ def _decode_update(
         self.key_cache[layer_idx] = k_cache
         self.value_cache[layer_idx] = v_cache

-        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
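
The `_decode_update` change drops the `curr_text_cache_len > 0` guard, so the left shift now always runs; per the comments in the diff, the cache is sized with enough buffer that the shift only ever drops left-padding slots. A toy 1-D sketch of the roll-and-append pattern (the real caches are 4-D, shaped (batch, heads, seq, head_dim)):

import torch

PAD = 0
# Toy per-sequence cache with left padding: 3 pad slots, then 5 real entries.
cache = torch.tensor([PAD, PAD, PAD, 4, 5, 6, 7, 8])
new_entries = torch.tensor([9, 10])   # e.g. token + beacon for this step
shift = new_entries.shape[0]

# Shift everything left by `shift` (dropping `shift` pad slots on the left),
# then write the new entries into the freed slots on the right.
cache[:-shift] = cache[shift:].clone()
cache[-shift:] = new_entries
print(cache)  # tensor([ 0,  4,  5,  6,  7,  8,  9, 10])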
