@@ -119,9 +119,7 @@ def __init__(self, config: cfg.ModelConfig):
         config.vocab_size, config.embedding_dim, padding_idx=0
     )
     self.lm_head = nn.Linear(
-        config.embedding_dim,
-        config.vocab_size,
-        bias=config.lm_head_use_bias,
+        config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
     )
     # Gemma3 re-uses the embedding as the head projection layer.
     self.lm_head.weight.data = self.tok_embedding.weight.data
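Aside (not part of the diff): the last context line above is the weight tying that the comment describes; the nn.Linear head is still constructed, but its weight tensor is replaced by the embedding table so both layers share one set of parameters. A minimal standalone sketch of that pattern, with made-up toy sizes rather than the real Gemma3 config:

import torch
from torch import nn

vocab_size, embedding_dim = 32, 8  # toy sizes, not the Gemma3 values
tok_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
lm_head = nn.Linear(embedding_dim, vocab_size, bias=False)

# Re-point the head's weight at the embedding table; both now share one tensor.
lm_head.weight.data = tok_embedding.weight.data

ids = torch.tensor([[1, 2, 3]])
logits = lm_head(tok_embedding(ids))  # shape [1, 3, vocab_size]
assert lm_head.weight.data_ptr() == tok_embedding.weight.data_ptr()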
@@ -130,30 +128,13 @@ def __init__(self, config: cfg.ModelConfig):
         for idx in range(config.num_layers)
     )
     self.final_norm = builder.build_norm(
-        config.embedding_dim,
-        config.final_norm_config,
+        config.embedding_dim, config.final_norm_config
     )
     self.mask_cache = attn_utils.build_causal_mask_cache(
         size=config.kv_cache_max,
     )
-    # Gemma3 has same hyper parameters for each layer except for attention
-    # types. Use the first layer.
-    attn_config = config.block_config(0).attn_config
-    self.sliding_window_mask_cache = attn_utils.build_sliding_window_mask_cache(
-        size=config.kv_cache_max,
-        window_size=attn_config.sliding_window_size,
-    )
     self.config = config
 
-  def get_attention_mask(
-      self,
-      attn_type: cfg.AttentionType,
-      input_pos: torch.Tensor,
-  ) -> torch.Tensor:
-    if attn_type == cfg.AttentionType.LOCAL_SLIDING:
-      return self.sliding_window_mask_cache.index_select(2, input_pos)
-    return self.mask_cache.index_select(2, input_pos)
-
   def get_local_global_attention_mask(
       self,
       attention_mask: torch.Tensor,
@@ -200,9 +181,7 @@ def create_sliding_mask(
         sliding_mask_bool,
         torch.zeros_like(sliding_mask_bool, dtype=torch.float),
         torch.full_like(
-            sliding_mask_bool,
-            self.config.causal_mask_value,
-            dtype=torch.float,
+            sliding_mask_bool, self.config.causal_mask_value, dtype=torch.float
         ),
     )
 
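Aside (not part of the diff): the torch.zeros_like / torch.full_like pair being reflowed here is the usual way a boolean sliding-window mask becomes an additive attention mask, 0.0 where attention is allowed and config.causal_mask_value elsewhere. A self-contained sketch of that pattern under assumed values (a toy window size, -inf standing in for causal_mask_value, and torch.where used for the select):

import torch

causal_mask_value = float("-inf")  # stand-in for self.config.causal_mask_value
seq_len, window_size = 6, 3        # toy sizes

pos = torch.arange(seq_len)
causal = pos[None, :] <= pos[:, None]                    # no attending to the future
in_window = (pos[:, None] - pos[None, :]) < window_size  # bounded look-back
sliding_mask_bool = causal & in_window                   # [seq_len, seq_len] bool

sliding_mask = torch.where(
    sliding_mask_bool,
    torch.zeros_like(sliding_mask_bool, dtype=torch.float),
    torch.full_like(sliding_mask_bool, causal_mask_value, dtype=torch.float),
)
print(sliding_mask)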
@@ -272,12 +251,8 @@ def forward(
         for i in range(self.config.num_layers)
     ]
     if mask is None:
-      mask = [
-          self.get_attention_mask(
-              self.config.block_config(i).attn_config.attn_type, input_pos
-          )
-          for i in range(self.config.num_layers)
-      ]
+      mask = self.mask_cache.index_select(2, input_pos)
+      mask = mask[:, :, :, : self.config.kv_cache_max]
 
     return self._forward_with_embeds(
         input_embeds, rope, mask, input_pos, kv_cache, pixel_mask, export_config
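Aside (not part of the diff): with get_attention_mask and the per-layer sliding-window cache removed, the `mask is None` fallback above no longer varies by attention type; it always takes rows from the single causal mask cache and clamps the key dimension to kv_cache_max. A rough standalone sketch of that path, assuming the cache is a [1, 1, kv_cache_max, kv_cache_max] additive mask (the real one comes from attn_utils.build_causal_mask_cache, whose exact layout isn't shown here):

import torch

kv_cache_max = 8  # assumed small value for illustration
mask_cache = torch.triu(
    torch.full((1, 1, kv_cache_max, kv_cache_max), float("-inf")), diagonal=1
)

input_pos = torch.tensor([3, 4])              # positions of the current tokens
mask = mask_cache.index_select(2, input_pos)  # rows for these query positions
mask = mask[:, :, :, :kv_cache_max]           # clamp keys to the cache length
print(mask.shape)                             # torch.Size([1, 1, 2, 8])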
@@ -329,6 +304,7 @@ def _forward_with_embeds(
       if kv_entry:
         updated_kv_entries.append(kv_entry)
     updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entries))
+
     if export_config is not None:
       if (
           torch.numel(input_pos) > 1