Commit a515579

Merge pull request #14 from huggingface/norope
Add support for no rope
2 parents aa8daba + 04b302a · commit a515579

15 files changed · +470 −593 lines

src/transformers/cache_utils.py

+59 −65
@@ -1204,7 +1204,7 @@ def __init__(
             config.num_attention_heads
             if getattr(config, "num_key_value_heads", None) is None
             else config.num_key_value_heads
-        )
+        ) // 8  # TODO use TP!

         self.key_cache: List[torch.Tensor] = []
         self.value_cache: List[torch.Tensor] = []
@@ -1663,84 +1663,75 @@ def __init__(
         max_batch_size: int,
         max_cache_len: Optional[int] = None,
         device: Union[torch.device, str, None] = None,
-        dtype: torch.dtype = torch.float32,
+        dtype: torch.dtype = torch.bfloat16,
         layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
     ) -> None:
         super().__init__()
         if not hasattr(config, "sliding_window") or config.sliding_window is None:
-            raise ValueError(
-                "Setting `cache_implementation` to 'sliding_window' requires the model config supporting "
-                "sliding window attention, please check if there is a `sliding_window` field in the model "
-                "config and it's not set to None."
-            )
+            self.sliding_window = getattr(config.get_text_config(), "attention_chunk_size", 8092)
+        else:
+            self.sliding_window = config.sliding_window
         self.max_cache_len = max_cache_len
         self.max_batch_size = max_batch_size
-        # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
-        self.head_dim = (
-            config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
-        )
-
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self._dtype = dtype
-        self.num_key_value_heads = (
-            config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
-        )

-        layer_switch = config.sliding_window_pattern if hasattr(config, "sliding_window_pattern") else 2  # 2 is for BC
-        self.is_sliding = torch.tensor(
-            [bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)], dtype=torch.bool
-        )
+        if hasattr(config.get_text_config(), "no_rope_layers"):
+            self.is_sliding = torch.tensor(config.no_rope_layers)
+        else:
+            layer_switch = getattr(config, "sliding_window_pattern", 2)
+            self.is_sliding = torch.tensor(
+                [bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)], dtype=torch.bool
+            )
+
         self.key_cache: List[torch.Tensor] = []
         self.value_cache: List[torch.Tensor] = []
-        global_cache_shape = (self.max_batch_size, self.num_key_value_heads, max_cache_len, self.head_dim)
+        self.cumulative_length = [0 for _ in range(config.num_hidden_layers)]
+
+    def initialise_cache_layer(self, layer_idx, key_states):
+        if len(self.key_cache) > layer_idx:
+            return
+
+        num_key_value_heads = key_states.shape[1]
+        device = key_states.device
+        global_cache_shape = (self.max_batch_size, num_key_value_heads, self.max_cache_len, self.head_dim)
         sliding_cache_shape = (
             self.max_batch_size,
-            self.num_key_value_heads,
-            min(config.sliding_window, max_cache_len),
+            num_key_value_heads,
+            self.sliding_window,
             self.head_dim,
         )
-        device = torch.device(device) if device is not None and isinstance(device, str) else None
-        for i in range(config.num_hidden_layers):
-            if layer_device_map is not None:
-                layer_device = layer_device_map[i]
-            else:
-                layer_device = device
-            # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
-            # breaks when updating the cache.
-            cache_shape = global_cache_shape if not self.is_sliding[i] else sliding_cache_shape
-            new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device)
-            new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device)
-            torch._dynamo.mark_static_address(new_layer_key_cache)
-            torch._dynamo.mark_static_address(new_layer_value_cache)
-            self.key_cache.append(new_layer_key_cache)
-            self.value_cache.append(new_layer_value_cache)
+        # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
+        # breaks when updating the cache.
+        cache_shape = sliding_cache_shape if self.is_sliding[layer_idx] else global_cache_shape
+        new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=device)
+        new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=device)
+        torch._dynamo.mark_static_address(new_layer_key_cache)
+        torch._dynamo.mark_static_address(new_layer_value_cache)
+        self.key_cache.append(new_layer_key_cache)
+        self.value_cache.append(new_layer_value_cache)

     def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len):
-        if cache_position.shape[0] > max_cache_len:
-            k_out = key_states[:, :, -max_cache_len:, :]
-            v_out = value_states[:, :, -max_cache_len:, :]
-            # Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly
-            self.key_cache[layer_idx] += k_out
-            self.value_cache[layer_idx] += v_out
-            # we should return the whole states instead of k_out, v_out to take the whole prompt
-            # into consideration when building kv cache instead of just throwing away tokens outside of the window
-            return key_states, value_states
-
-        slicing = torch.ones(max_cache_len, dtype=torch.long, device=value_states.device).cumsum(0)
-        cache_position = cache_position.clamp(0, max_cache_len - 1)
-        to_shift = cache_position >= max_cache_len - 1
-        indices = (slicing + to_shift[-1].int() - 1) % max_cache_len
-        k_out = k_out[:, :, indices]
-        v_out = v_out[:, :, indices]
-
-        k_out[:, :, cache_position] = key_states
-        v_out[:, :, cache_position] = value_states
-        # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment)
-        self.key_cache[layer_idx].zero_()
-        self.value_cache[layer_idx].zero_()
-
-        self.key_cache[layer_idx] += k_out
-        self.value_cache[layer_idx] += v_out
-        return k_out, v_out
+        cumulative_length = self.cumulative_length[layer_idx]
+        is_full = cumulative_length >= max_cache_len
+        if is_full:
+            full_key_states = torch.cat((k_out[:, :, 1:, :], key_states), dim=-2)
+            full_value_states = torch.cat((v_out[:, :, 1:, :], value_states), dim=-2)
+        elif not is_full and cumulative_length + key_states.shape[2] > max_cache_len:
+            full_key_states = torch.cat((k_out[:, :, :cumulative_length, :], key_states), dim=-2)
+            full_value_states = torch.cat((v_out[:, :, :cumulative_length, :], value_states), dim=-2)
+        else:
+            self.key_cache[layer_idx].index_copy_(2, cache_position, key_states)
+            self.value_cache[layer_idx].index_copy_(2, cache_position, value_states)
+            self.cumulative_length[layer_idx] += key_states.shape[-2]
+            return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+        self.key_cache[layer_idx].copy_(full_key_states[:, :, -max_cache_len:, :])
+        self.value_cache[layer_idx].copy_(full_value_states[:, :, -max_cache_len:, :])
+        self.cumulative_length[layer_idx] += key_states.shape[-2]
+        # we should return the whole states instead of k_out, v_out to take the whole prompt
+        # into consideration when building kv cache instead of just throwing away tokens outside of the window
+        return full_key_states, full_value_states

     def _static_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len):
         k_out[:, :, cache_position] = key_states
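
A small standalone sketch of the two `is_sliding` branches introduced above; the values below are made up for illustration and do not come from any real config:

import torch

num_hidden_layers = 8
layer_switch = 4  # stands in for `sliding_window_pattern`

# Fallback branch: every `layer_switch`-th layer is global (False), the others use the sliding/chunked cache (True)
is_sliding = torch.tensor(
    [bool((i + 1) % layer_switch) for i in range(num_hidden_layers)], dtype=torch.bool
)
print(is_sliding)  # tensor([ True,  True,  True, False,  True,  True,  True, False])

# `no_rope_layers` branch: the per-layer flags are taken directly from the config list
no_rope_layers = [1, 1, 1, 0, 1, 1, 1, 0]  # hypothetical values
print(torch.tensor(no_rope_layers))  # nonzero entries select the sliding/chunked cache shape for that layer
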
@@ -1760,7 +1751,7 @@ def update(
         if cache_kwargs is None:
             cache_kwargs = {}
         cache_position = cache_kwargs.get("cache_position")
-        sliding_window = cache_kwargs.get("sliding_window")
+        self.initialise_cache_layer(layer_idx, key_states)

         # These two `if` blocks are only reached in multigpu and if `layer_device_map` is not passed. They are used
         # when the cache is initialized in the forward pass (e.g. Gemma2)
@@ -1774,7 +1765,7 @@ def update(
         key_states = key_states.to(k_out.dtype)
         value_states = value_states.to(v_out.dtype)

-        if sliding_window:
+        if self.is_sliding[layer_idx]:
             update_fn = self._sliding_update
         else:
             update_fn = self._static_update
@@ -1801,6 +1792,8 @@ def get_seq_length(self, layer_idx: Optional[int] = 0):
                 "`get_seq_length` on `HybridCache` may get inconsistent results depending on the layer index. "
                 "Using the `layer_idx` argument is not supported."
             )
+        if len(self.key_cache) == 0:
+            return 0
         return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum()

     def reset(self):
@@ -1809,6 +1802,7 @@ def reset(self):
             # In-place ops prevent breaking the static address
             self.key_cache[layer_idx].zero_()
             self.value_cache[layer_idx].zero_()
+        self.cumulative_length = [0 for _ in range(len(self.cumulative_length))]


 class MambaCache:
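
A simplified, standalone sketch of what the new `_sliding_update` above does for one layer once its window is full: the oldest cached position is dropped and the new step is appended, with `cumulative_length` tracking how many tokens the layer has seen. Shapes are toy values and no `HybridCache` object is involved:

import torch

max_cache_len = 4
k_cache = torch.arange(8.0).reshape(1, 1, 4, 2)   # pretend cached keys: batch=1, heads=1, 4 positions, head_dim=2
new_key = torch.full((1, 1, 1, 2), 99.0)          # one new decoding step
cumulative_length = 5                             # more tokens seen than the window holds

if cumulative_length >= max_cache_len:
    # window is full: drop the oldest position, append the new one, write back in place
    full_keys = torch.cat((k_cache[:, :, 1:, :], new_key), dim=-2)
    k_cache.copy_(full_keys[:, :, -max_cache_len:, :])

print(k_cache[0, 0])
# tensor([[ 2.,  3.],
#         [ 4.,  5.],
#         [ 6.,  7.],
#         [99., 99.]])
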

src/transformers/generation/configuration_utils.py

+1
@@ -416,6 +416,7 @@ def __init__(self, **kwargs):
         if isinstance(self.cache_config, dict):
             self.cache_config = cache_config_class.from_dict(self.cache_config)
         self.return_legacy_cache = kwargs.pop("return_legacy_cache", None)
+        self.prefill_chunk_size = kwargs.pop("prefill_chunk_size", None)

         # Parameters for manipulation of the model output logits
         self.temperature = kwargs.pop("temperature", 1.0)
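
Since the new option is popped from the constructor kwargs like the other generation parameters, it can be set directly on a `GenerationConfig`. A minimal sketch, assuming a transformers build that includes this PR; the chunk size of 1024 is arbitrary:

from transformers import GenerationConfig

# `prefill_chunk_size` defaults to None (no chunked prefill); a positive int enables it
generation_config = GenerationConfig(prefill_chunk_size=1024, max_new_tokens=32)
print(generation_config.prefill_chunk_size)  # 1024
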

src/transformers/generation/utils.py

+42 −1
@@ -3318,7 +3318,12 @@ def _sample(
             os.environ["TOKENIZERS_PARALLELISM"] = "0"
             model_forward = self.get_compiled_call(generation_config.compile_config)

-        is_prefill = True
+        if generation_config.prefill_chunk_size is not None:
+            model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
+            is_prefill = False
+        else:
+            is_prefill = True
+
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
             # prepare model inputs
             model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
47684773
else:
47694774
return input_ids
47704775

4776+
def _prefill_chunking(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, **model_kwargs):
4777+
chunk_size = generation_config.prefill_chunk_size
4778+
# Only chunk up the token just before last, so that decoding is completely performed outside this function
4779+
# (here we simply prefill the cache)
4780+
input_chunks = torch.split(input_ids[:, :-1], chunk_size, dim=-1)
4781+
4782+
if "past_key_values" not in model_kwargs:
4783+
raise ValueError("Cannot use prefill chunkink without a cache")
4784+
4785+
model_forward = self.get_compiled_call(generation_config.compile_config)
4786+
attention_mask = model_kwargs.pop("attention_mask", None)
4787+
4788+
past_length = 0
4789+
for input_chunk in input_chunks:
4790+
current_length = past_length + input_chunk.shape[-1]
4791+
# Prepare inputs
4792+
if attention_mask is not None:
4793+
model_kwargs["attention_mask"] = attention_mask[:, :current_length]
4794+
model_kwargs["cache_position"] = torch.arange(
4795+
past_length, current_length, dtype=torch.long, device=input_chunk.device
4796+
)
4797+
model_kwargs["position_ids"] = model_kwargs["cache_position"].unsqueeze(0)
4798+
model_inputs = self.prepare_inputs_for_generation(input_chunk, **model_kwargs)
4799+
4800+
# outputs = model_forward(**model_inputs, return_dict=True) TODO REACTIVATE THIS!!!
4801+
outputs = self(**model_inputs, return_dict=True)
4802+
4803+
model_kwargs["past_key_values"] = outputs.past_key_values
4804+
past_length = current_length
4805+
4806+
model_kwargs["attention_mask"] = attention_mask
4807+
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + 1
4808+
_ = model_kwargs.pop("position_ids", None)
4809+
4810+
return model_kwargs
4811+
47714812

47724813
def _speculative_sampling(
47734814
candidate_input_ids,
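
A standalone sketch of the chunking arithmetic in `_prefill_chunking` above, without a model: everything except the final prompt token is split into `chunk_size` pieces, and `cache_position` advances chunk by chunk. Tensor contents and sizes are invented:

import torch

chunk_size = 4
input_ids = torch.arange(11).unsqueeze(0)  # batch of 1, 11 prompt tokens

# The last token is held back so the regular decoding loop performs the first real generation step
input_chunks = torch.split(input_ids[:, :-1], chunk_size, dim=-1)

past_length = 0
for input_chunk in input_chunks:
    current_length = past_length + input_chunk.shape[-1]
    cache_position = torch.arange(past_length, current_length, dtype=torch.long)
    print(input_chunk.shape[-1], cache_position.tolist())
    past_length = current_length
# 4 [0, 1, 2, 3]
# 4 [4, 5, 6, 7]
# 2 [8, 9]
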

src/transformers/integrations/compressed_tensors.py

+2 −2
@@ -1,4 +1,3 @@
-
 from transformers.utils import is_torch_available


@@ -10,7 +9,8 @@


 def skip(*args, **kwargs):
-    pass
+    pass
+

 class CompressedExpertsLinear(nn.Module):
     """

src/transformers/integrations/flex_attention.py

+44 −18
@@ -34,10 +34,7 @@


 if is_torch_flex_attn_available():
-    from torch.nn.attention.flex_attention import (
-        BlockMask,
-        flex_attention,
-    )
+    from torch.nn.attention.flex_attention import BlockMask, flex_attention
     from torch.nn.attention.flex_attention import (
         create_block_mask as create_block_causal_mask_flex,
     )
@@ -64,14 +61,23 @@ def __init__(self):
         Initialize or update the singleton instance.
         """
         if self._is_flex_compiled is False:
-            self._compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
+            self._compiled_flex_attention = torch.compile(flex_attention, backend="inductor")
             self._is_flex_compiled = True

     def __call__(self):
         return self._compiled_flex_attention


-def make_flex_block_causal_mask(attention_mask_2d: torch.Tensor) -> "BlockMask":
+Offset = Union[torch.Tensor, int]
+
+
+def make_flex_block_causal_mask(
+    attention_mask_2d: torch.Tensor,
+    attention_chunk_size: Optional[int] = None,
+    query_length=None,
+    key_length=None,
+    offsets: Optional[Tuple[Offset, Offset]] = None,
+) -> "BlockMask":
     """
     Create a block causal document mask for a batch of sequences, both packed and unpacked.
     Create Block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
@@ -94,10 +100,13 @@ def make_flex_block_causal_mask(attention_mask_2d: torch.Tensor) -> "BlockMask":
     Returns:
         BlockMask
     """
+    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=(0, key_length))
     device = attention_mask_2d.device
+    document_ids = attention_mask_2d.clone()

-    document_ids = attention_mask_2d
-    batch_size, total_seq_len = document_ids.shape
+    if attention_chunk_size is not None:
+        # we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
+        document_ids = (document_ids.fill_(1).cumsum(-1) - 1) // (attention_chunk_size)

     # Instead of passing a tensor mask, flex attention requires a mask_mod function
     # that determines which elements of QK^T should be included in the attention
@@ -112,18 +121,30 @@ def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
         See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
         for an illustration.
         """
-        causal_mask = q_idx >= kv_idx
+        causal_mask = q_idx >= kv_idx  # not valid when decoding
         document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
-        padding_mask = document_ids[batch_idx, q_idx] > 0
-        return causal_mask & document_mask & padding_mask
-
+        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
+        final_mask = causal_mask & padding_mask & document_mask
+        return final_mask
+
+    if offsets is not None:
+        q_offset = offsets[0]
+        kv_offset = offsets[1]
+
+        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
+            offset_q = q_idx + q_offset
+            offset_kv = kv_idx + kv_offset
+            return causal_mask_mod(batch_idx, head_idx, offset_q, offset_kv)
+    else:
+        mask_mod = causal_mask_mod
     return create_block_causal_mask_flex(
-        mask_mod=causal_mask_mod,
-        B=batch_size,
+        mask_mod=mask_mod,
+        B=1,
         H=None,  # attention head
-        Q_LEN=total_seq_len,
-        KV_LEN=total_seq_len,
+        Q_LEN=query_length,
+        KV_LEN=key_length,
         device=device,
+        _compile=True,
     )


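A standalone sketch of the mask logic above (chunked `document_ids` plus the `offsets` re-indexing), evaluated directly instead of through `create_block_mask`; the sizes and offsets are invented for illustration:

import torch

attention_chunk_size = 3
attention_mask_2d = torch.ones(1, 12, dtype=torch.long)  # batch of 1, 12 key positions, no padding

# Same arithmetic as above: position i belongs to chunk i // attention_chunk_size
document_ids = (attention_mask_2d.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size
print(document_ids[0].tolist())  # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]

def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
    causal_mask = q_idx >= kv_idx
    document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
    padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
    return causal_mask & padding_mask & document_mask

# With offsets, local indices are shifted into global coordinates before the checks are applied,
# e.g. a single decoding query whose global position is 7 while the keys start at 0
q_offset, kv_offset = 7, 0

def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
    return causal_mask_mod(batch_idx, head_idx, q_idx + q_offset, kv_idx + kv_offset)

allowed = [bool(mask_mod(0, 0, torch.tensor(0), torch.tensor(kv))) for kv in range(12)]
print(allowed)  # only key positions 6 and 7 (same chunk as position 7, and not in the future) are kept
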
@@ -155,6 +176,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

+
 def flex_attention_forward(
     module: torch.nn.Module,
     query: torch.Tensor,
@@ -169,7 +191,7 @@ def flex_attention_forward(
     block_mask = None
     causal_mask = None
     if isinstance(attention_mask, BlockMask):
-        block_mask = attention_mask
+        block_mask = attention_mask  # ._adjust(query.shape[2], key.shape[2])
     else:
         causal_mask = attention_mask

@@ -187,11 +209,14 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):

     enable_gqa = True
     num_local_query_heads = query.shape[1]
-    if not((num_local_query_heads & (num_local_query_heads)) == 0):
+
+    # When running TP this helps:
+    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
         key = repeat_kv(key, num_local_query_heads)
         value = repeat_kv(value, num_local_query_heads)
         enable_gqa = False

+    kernel_options = kwargs.get("kernel_options", None)
     attn_output, attention_weights = compile_friendly_flex_attention(
         query,
         key,
@@ -200,6 +225,7 @@ def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
         block_mask=block_mask,
         enable_gqa=enable_gqa,
         scale=scaling,
+        kernel_options=kernel_options,
         # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless.
         # For simplification, we thus always return it as no additional computations are introduced.
         return_lse=True,
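
One small fix worth calling out in the `flex_attention_forward` hunks above: the old guard used `num_local_query_heads & (num_local_query_heads)`, which is zero only when the head count itself is zero, so the `repeat_kv` fallback ran for every nonzero head count; with the `- 1` it now runs only when the local head count is not a power of two, which the added comment ties to tensor parallelism. A quick standalone check of the bit trick:

def is_power_of_two(n: int) -> bool:
    # n & (n - 1) clears the lowest set bit, so the result is 0 exactly for powers of two
    return n > 0 and (n & (n - 1)) == 0

print([h for h in range(1, 17) if is_power_of_two(h)])  # [1, 2, 4, 8, 16]
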
