
Commit 084239f

Attention mask as a buffer instead of parameter
1 parent 2b4ede1 commit 084239f

File tree

1 file changed (+1, -2 lines)

surya/foundation/cache.py

Lines changed: 1 addition & 2 deletions
@@ -29,8 +29,7 @@ def __init__(
         self.text_sliding_window = text_sliding_window
         self.num_layers = config.num_hidden_layers
 
-        # TODO Setup these as buffers since its a nn.Module
-        self.attention_mask = torch.zeros((self.batch_size, self.max_cache_len), device=device, dtype=torch.int)
+        self.register_buffer(f"attention_mask", torch.zeros((self.batch_size, self.max_cache_len), device=device, dtype=torch.int))
         self.text_token_counts = [torch.zeros(self.batch_size) for _ in range(self.num_layers)]
 
     def _shift_attention_mask_left(self, batch_idx: int, shift_amount: int):
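
For context on what the change buys: register_buffer ties the tensor to the nn.Module, so it follows .to()/.cuda() device and dtype moves and is included in state_dict(), while still not being returned by parameters() (it is not trainable). The snippet below is a minimal, hypothetical sketch (CacheSketch is not the actual surya cache class) contrasting a plain tensor attribute with a registered buffer.

import torch
import torch.nn as nn

# Minimal sketch, assuming a toy module with the same shape arguments as the
# surya cache; names here are illustrative only.
class CacheSketch(nn.Module):
    def __init__(self, batch_size: int = 2, max_cache_len: int = 8):
        super().__init__()
        # Plain attribute: invisible to the nn.Module machinery.
        self.plain_mask = torch.zeros((batch_size, max_cache_len), dtype=torch.int)
        # Buffer: tracked by the module, moved by .to(), saved in state_dict(),
        # but never a trainable parameter.
        self.register_buffer(
            "attention_mask",
            torch.zeros((batch_size, max_cache_len), dtype=torch.int),
        )

cache = CacheSketch()
print("attention_mask" in cache.state_dict())  # True: buffer is persisted
print("plain_mask" in cache.state_dict())      # False: plain attribute is not
print(len(list(cache.parameters())))           # 0: buffers are not parameters

if torch.cuda.is_available():
    cache = cache.to("cuda")
    print(cache.attention_mask.device)  # cuda:0 (buffer moved with the module)
    print(cache.plain_mask.device)      # cpu (plain attribute left behind)

Since the attention mask is bookkeeping state rather than a learned weight, a buffer (rather than an nn.Parameter) is the usual PyTorch idiom for this kind of tensor.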
