Commit 5c7f088

Get multi-token predictions from the model

1 parent 660deb0
File tree: 2 files changed, +32 -11 lines changed

surya/common/surya/__init__.py

Lines changed: 27 additions & 6 deletions
@@ -304,6 +304,31 @@ def get_2d_learned_embeddings(
             all_embeddings, dim=0
         )  # Shape is num_image_tokens x embed_dim
 
+    def get_logits(self, hidden_states):
+        assert hidden_states.shape[1] == 1, "Multi-output predictions are only applied to the last token"
+
+        all_lm_logits = []
+        all_bbox_logits = []
+
+        current_hidden = hidden_states
+
+        # Loop includes the initial prediction (i=0) plus multi_output_distance additional predictions
+        for i in range(self.config.multi_output_distance + 1):
+            if i > 0:
+                current_hidden = self.multi_output_projections[i - 1](current_hidden)
+
+            lm_logits = self.lm_head(current_hidden)
+            bbox_logits = F.sigmoid(self.bbox_head(current_hidden))
+
+            all_lm_logits.append(lm_logits)
+            all_bbox_logits.append(bbox_logits)
+
+        # Concatenate along the sequence dimension (dim=1)
+        final_lm_logits = torch.cat(all_lm_logits, dim=1)
+        final_bbox_logits = torch.cat(all_bbox_logits, dim=1)
+
+        return final_lm_logits, final_bbox_logits
+
     def forward(
         self,
         input_ids=None,
@@ -317,7 +342,6 @@ def forward(
         output_hidden_states=False,
         output_attentions=False,
         use_cache=False,
-        logits_to_keep=None,
         encoder_chunk_size=None,
         cache_idxs=None,
         num_valid_tokens=None,
@@ -386,12 +410,9 @@ def forward(
 
         hidden_states = outputs.last_hidden_state
         # Only keep the last token's hidden state, should bring down memory usage during inference
-        if logits_to_keep is not None:
-            hidden_states = hidden_states[:, -logits_to_keep:, :]
-
+        hidden_states = hidden_states[:, -1:, :]
         hidden_states = hidden_states.contiguous()
-        bbox_logits = F.sigmoid(self.bbox_head(hidden_states))
-        lm_logits = self.lm_head(hidden_states)
+        lm_logits, bbox_logits = self.get_logits(hidden_states)
 
         return SuryaModelOutput(
             bbox_logits=bbox_logits,
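
For orientation, here is what the new get_logits does: it expands the final token's hidden state into multi_output_distance + 1 stacked predictions along dim 1, where index 0 is the ordinary next-token prediction and each projection maps the hidden state one step further ahead. Below is a minimal, self-contained sketch of that scheme. The sizes, the MultiTokenHead class name, and the assumption that multi_output_projections is an nn.ModuleList of Linear layers are illustrative guesses, not taken from the Surya codebase.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Illustrative sizes (hypothetical, not the real Surya config)
HIDDEN, VOCAB, BBOX_DIM, MULTI_OUTPUT_DISTANCE = 64, 100, 6, 2

class MultiTokenHead(nn.Module):
    """Toy version of the get_logits logic added in this commit."""

    def __init__(self):
        super().__init__()
        self.lm_head = nn.Linear(HIDDEN, VOCAB)
        self.bbox_head = nn.Linear(HIDDEN, BBOX_DIM)
        # Assumption: one projection per extra look-ahead step, each mapping
        # the hidden state for step i to a hidden state for step i + 1
        self.multi_output_projections = nn.ModuleList(
            [nn.Linear(HIDDEN, HIDDEN) for _ in range(MULTI_OUTPUT_DISTANCE)]
        )

    def get_logits(self, hidden_states):
        # hidden_states: [batch, 1, HIDDEN] - only the last token's state
        assert hidden_states.shape[1] == 1
        all_lm_logits, all_bbox_logits = [], []
        current_hidden = hidden_states
        for i in range(MULTI_OUTPUT_DISTANCE + 1):
            if i > 0:
                current_hidden = self.multi_output_projections[i - 1](current_hidden)
            all_lm_logits.append(self.lm_head(current_hidden))
            all_bbox_logits.append(F.sigmoid(self.bbox_head(current_hidden)))
        # Stack the per-step predictions along the sequence dimension
        return torch.cat(all_lm_logits, dim=1), torch.cat(all_bbox_logits, dim=1)

head = MultiTokenHead()
lm, bbox = head.get_logits(torch.randn(2, 1, HIDDEN))
print(lm.shape, bbox.shape)  # torch.Size([2, 3, 100]) torch.Size([2, 3, 6])

One consequence visible in the forward hunk above: since get_logits asserts a single input position, forward now always slices hidden_states[:, -1:, :] rather than honoring a caller-supplied logits_to_keep.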

surya/foundation/__init__.py

Lines changed: 5 additions & 5 deletions
@@ -180,8 +180,11 @@ def process_outputs(self, outputs: SuryaModelOutput, num_valid_tokens: torch.Tensor
         lm_logits = outputs["lm_logits"].float()  # shape: [B, T, V]
         bbox_logits = outputs["bbox_logits"].float()  # shape: [B, T, D]
 
-        next_token_logits = lm_logits[:, -1:, :]
-        next_bbox_logits = bbox_logits[:, -1:, :]
+        # We make multi-token predictions - currently only the first predicted token is used
+        # TODO: Add support for using all the predictions
+        # TODO: This requires a change to the beacon token logic
+        next_token_logits = lm_logits[:, :1, :]
+        next_bbox_logits = bbox_logits[:, :1, :]
 
         # Get predictions
         preds = torch.argmax(next_token_logits, dim=-1)  # shape: [B, 1]
@@ -263,8 +266,6 @@ def decode(self, current_inputs: Optional[ContinuousBatchInput] = None):
             position_ids=position_ids,
             use_cache=True,
             past_key_values=self.kv_cache,
-            # We may pass multiple input ids per batch element (right padded) and we need the original size to index into them
-            logits_to_keep=None,
             prefill=False,
             num_valid_tokens=num_valid_tokens
         )
@@ -371,7 +372,6 @@ def prefill(self, current_inputs: Optional[ContinuousBatchInput] = None):
             inputs_embeds=None,
             past_key_values=self.kv_cache,
             use_cache=True,
-            logits_to_keep=1,
             encoder_chunk_size=self.get_encoder_chunk_size(),
             cache_idxs=idxs_to_merge,
             prefill=True,
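
The process_outputs hunk above is also why the slice changes from [:, -1:, :] to [:, :1, :]: the logits now contain multi_output_distance + 1 stacked future positions along dim 1, so the first position, not the last, is the ordinary next-token prediction. A small shape check with toy tensors (hypothetical sizes, not Surya code):

import torch

B, V = 2, 100
# Suppose multi_output_distance = 2: the model emits 3 stacked
# predictions along dim 1, for positions t+1, t+2, and t+3
lm_logits = torch.randn(B, 3, V)

next_token_logits = lm_logits[:, :1, :]  # t+1: the ordinary next token
furthest_logits = lm_logits[:, -1:, :]   # t+3: decoding this as "next" would skip ahead
preds = torch.argmax(next_token_logits, dim=-1)  # shape: [B, 1]
print(next_token_logits.shape, preds.shape)  # [B, 1, V] and [B, 1]

Per the TODOs in the first hunk of this file, the remaining stacked positions are computed but unused until the beacon token logic can accommodate them.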
