import torch.nn.functional as F
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.cache_utils import Cache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter

from surya.common.s3 import S3DownloaderMixin
from surya.common.surya.config import SuryaModelConfig
@@ -108,6 +110,13 @@ def __init__(
        self.bbox_head = nn.Linear(config.hidden_size, 6)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)

+        if self.config.multi_output_distance is not None and self.config.multi_output_distance > 0:
+            self.multi_output_embeds = nn.Embedding(
+                config.max_multi_out,
+                config.hidden_size,
+                padding_idx=0,
+            )
+
    def tie_weights(self):
        self._tie_weights()

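Note on the new `multi_output_embeds` table: `nn.Embedding` with `padding_idx=0` keeps row 0 as a zero vector and never updates it, so index 0 can act as a no-op slot. A minimal sketch of that PyTorch behavior (toy sizes, not part of this diff):

import torch
import torch.nn as nn

# Stand-ins for config.max_multi_out and config.hidden_size
embeds = nn.Embedding(4, 8, padding_idx=0)

idx = torch.tensor([[0, 2, 3]])
out = embeds(idx)                          # shape (1, 3, 8)
print(out[0, 0].abs().sum())               # tensor(0.) - the padding row is all zeros
out.sum().backward()
print(embeds.weight.grad[0].abs().sum())   # tensor(0.) - no gradient reaches the padding row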
@@ -279,6 +288,7 @@ def forward(
        inputs_embeds=None,
        attention_mask=None,
        position_ids=None,
+        cache_position=None,
        past_key_values=None,
        output_hidden_states=False,
        output_attentions=False,
@@ -309,11 +319,33 @@ def forward(
        kwargs["cu_seqlens_k"] = cu_seqlens_k
        kwargs["max_seqlen_in_batch_k"] = max_seqlen_in_batch_k

+        if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length() if past_key_values is not None else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            past_key_values,
+            output_attentions,
+        )
+
+        attention_mask = causal_mask
        outputs = self.decoder(
-            input_ids=None,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
+            cache_position=cache_position,
            past_key_values=past_key_values,
            return_dict=True,
            use_cache=use_cache,
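Note on the `cache_position` default added above: when it is not supplied, it becomes the positions of the new tokens offset by however many tokens are already held in the KV cache, and `position_ids` falls back to the same values with a batch dimension added. A small standalone illustration of the arithmetic (example sizes assumed, not part of this diff):

import torch

past_seen_tokens = 5   # e.g. past_key_values.get_seq_length()
new_tokens = 3         # inputs_embeds.shape[1] for this forward pass

cache_position = torch.arange(past_seen_tokens, past_seen_tokens + new_tokens)
position_ids = cache_position.unsqueeze(0)  # add a batch dimension

print(cache_position)  # tensor([5, 6, 7])
print(position_ids)    # tensor([[5, 6, 7]])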
@@ -336,3 +368,128 @@ def forward(
            attentions=outputs.attentions if output_attentions else None,
            past_key_values=outputs.past_key_values,
        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            return attention_mask
+
+        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = (
+            past_key_values.get_seq_length() if past_key_values is not None else 0
+        )
+
+        # We always pass in a 2D attention mask from the processor - in both static and dynamic cache cases
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length + 1
+        )
+
+        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+            config=self.config,
+            past_key_values=past_key_values,
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by the F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(
+                causal_mask, min_dtype
+            )
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        config: SuryaModelConfig,
+        past_key_values: Cache,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or does nothing if the input `attention_mask` is already 4D.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with a static cache, the mask should be as long as the static cache to account for the zero padding (the part of the cache that is not filled yet).
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+            config (`SuryaModelConfig`):
+                The model's configuration class.
+            past_key_values (`Cache`):
+                The cache class currently being used to generate.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length),
+                fill_value=min_dtype,
+                dtype=dtype,
+                device=device,
+            )
+            diagonal_attend_mask = torch.arange(
+                target_length, device=device
+            ) > cache_position.reshape(-1, 1)
+            # NOTE - Removed sliding window handling here from original impl. since we manage it differently
+            causal_mask *= diagonal_attend_mask
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = (
+                    causal_mask.clone()
+                )  # copy to contiguous memory for in-place edit
+                if attention_mask.shape[-1] > target_length:
+                    attention_mask = attention_mask[:, :target_length]
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[
+                    :, None, None, :
+                ].to(causal_mask.device)
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[
+                    :, :, :, :mask_length
+                ].masked_fill(padding_mask, min_dtype)
+        return causal_mask
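Note on the mask construction above: the helper starts from a fully masked `(sequence_length, target_length)` grid, unmasks keys at or before each query's `cache_position`, broadcasts to `(batch_size, 1, query_length, key_value_length)`, and then re-applies the 2D padding mask additively. A minimal standalone sketch of that expansion (toy sizes and the padding pattern are assumptions for illustration, not values from this PR):

import torch

batch_size, sequence_length, target_length = 1, 2, 5  # 3 cached tokens + 2 new queries
dtype = torch.float32
min_dtype = torch.finfo(dtype).min

# Positions of the two new query tokens within the full sequence
cache_position = torch.tensor([3, 4])

# Start fully masked, then allow each query to attend to keys at or before its position
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
diagonal_attend_mask = torch.arange(target_length) > cache_position.reshape(-1, 1)
causal_mask *= diagonal_attend_mask
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()

# 2D padding mask from the processor: 1 = real token, 0 = padding (first key padded here)
attention_mask = torch.tensor([[0, 1, 1, 1, 1]], dtype=dtype)
padding_mask = (causal_mask + attention_mask[:, None, None, :]) == 0
causal_mask = causal_mask.masked_fill(padding_mask, min_dtype)

print(causal_mask[0, 0])  # 0.0 where attention is allowed, min_dtype where it is blocked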