
Commit 856a868

ai-edge-bot authored and copybara-github committed
Remove kv_cache_max_len from ModelConfig.
- This is the first step to make kv_cache_max_len configurable when the model is loaded for inference.
- Infer kv_cache_max_len from kv_cache or mask; either of them must not be null.
- Pass kv_cache_max_len as a parameter during export.
- Build mask_cache only when mask_as_input is false.
- Confirmed that conversion generates the same tflite files before and after for gemma3, llama, and deepseek.

PiperOrigin-RevId: 766348863
1 parent 93edc84 commit 856a868
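
The "infer kv_cache_max_len from kv_cache or mask" rule mentioned in the commit message is not visible in the hunks below. The following is a minimal sketch of that rule only; the helper name and tensor layouts are assumptions for illustration, not the actual ai_edge_torch implementation.

```python
# Hypothetical helper illustrating the inference rule from the commit message.
def infer_kv_cache_max_len(kv_cache, mask) -> int:
  """Derives the KV-cache length from whichever of kv_cache or mask is given."""
  if kv_cache is None and mask is None:
    raise ValueError("Either kv_cache or mask must be provided.")
  if kv_cache is not None:
    # Assumes each cache entry stores keys as [batch, kv_len, num_heads, head_dim].
    return kv_cache.caches[0].k_cache.shape[1]
  # Assumes the causal mask spans all cache positions in its last dimension.
  return mask.shape[-1]
```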

50 files changed: +545 −470 lines

ai_edge_torch/generative/examples/amd_llama_135m/amd_llama_135m.py

Lines changed: 15 additions & 7 deletions
```diff
@@ -29,8 +29,16 @@ class AmdLlama(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config() -> cfg.ModelConfig:
-  """Returns the model config for an AMD-Llama-135m model."""
+def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for an AMD-Llama-135m model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for an AMD-Llama-135m model.
+  """
   attn_config = cfg.AttentionConfig(
       num_heads=12,
       head_dim=64,
@@ -55,15 +63,16 @@ def get_model_config() -> cfg.ModelConfig:
       num_layers=12,
       max_seq_len=2048,
       embedding_dim=768,
+      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_share_weight_with_embedding=False,
   )
   return config
 
 
-def get_fake_model_config() -> cfg.ModelConfig:
-  config = get_model_config()
+def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config(**kwargs)
   config.vocab_size = 128
   config.num_layers = 2
   config.block_config(0).ff_config.intermediate_size = 64
@@ -73,13 +82,12 @@ def get_fake_model_config() -> cfg.ModelConfig:
 def build_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] | None = None,
-    mask_cache_size: int = 0,
+    **kwargs
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config(),
+      config=get_model_config(**kwargs),
       tensor_names=TENSOR_NAMES,
       model_class=AmdLlama,
       custom_loader=custom_loader,
-      mask_cache_size=mask_cache_size,
   )
```
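
With build_model now forwarding **kwargs to get_model_config, callers size the KV cache when the PyTorch model is built rather than passing mask_cache_size. A small usage sketch; the checkpoint path and cache length are placeholders:

```python
from ai_edge_torch.generative.examples.amd_llama_135m import amd_llama_135m

model = amd_llama_135m.build_model(
    "/path/to/amd-llama-135m",   # hypothetical checkpoint path
    kv_cache_max_len=1280,       # forwarded to get_model_config via **kwargs
)
```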

ai_edge_torch/generative/examples/amd_llama_135m/convert_to_tflite.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -31,14 +31,13 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      mask_cache_size=converter.get_mask_cache_size_from_flags(),
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
```

ai_edge_torch/generative/examples/deepseek/convert_to_tflite.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -23,22 +23,20 @@
 
 flags = converter.define_conversion_flags('deepseek')
 
-
 def main(_):
   checkpoint_path = flags.FLAGS.checkpoint_path
   pytorch_model = deepseek.build_model(
       checkpoint_path,
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      mask_cache_size=converter.get_mask_cache_size_from_flags(),
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
```
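
End to end, the conversion flow after this change builds the model with kv_cache_max_len and then calls convert_to_tflite without it. A sketch using only names visible in the diff; paths, the prefix, and sequence lengths are placeholders, and the quantize/lora/export_config arguments are omitted for brevity:

```python
pytorch_model = deepseek.build_model(
    "/path/to/deepseek-r1-distill-qwen",   # hypothetical checkpoint path
    kv_cache_max_len=1024,                 # KV-cache length fixed at build time
)
converter.convert_to_tflite(
    pytorch_model,
    output_path="/tmp/deepseek_tflite",    # hypothetical output directory
    output_name_prefix="deepseek",
    prefill_seq_len=[128, 512],            # illustrative prefill lengths
)
```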

ai_edge_torch/generative/examples/deepseek/deepseek.py

Lines changed: 15 additions & 7 deletions
```diff
@@ -29,8 +29,16 @@ class DeepSeekDistillQwen(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config() -> cfg.ModelConfig:
-  """Returns the model config for a Qwen 2.5 3B model."""
+def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a Qwen 2.5 3B model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for a SmolLM model.
+  """
   attn_config = cfg.AttentionConfig(
       num_heads=12,
       head_dim=128,
@@ -58,15 +66,16 @@ def get_model_config() -> cfg.ModelConfig:
       num_layers=28,
       max_seq_len=4096,
       embedding_dim=1536,
+      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_share_weight_with_embedding=False,
   )
   return config
 
 
-def get_fake_model_config() -> cfg.ModelConfig:
-  config = get_model_config()
+def get_fake_model_config(**kwargs) -> cfg.ModelConfig:
+  config = get_model_config(**kwargs)
   config.vocab_size = 128
   config.num_layers = 2
   # DeepSeek-R1-Distill-Qwen has only one block config.
@@ -77,13 +86,12 @@ def get_fake_model_config() -> cfg.ModelConfig:
 def build_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    mask_cache_size: int = 0,
+    **kwargs
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config(),
+      config=get_model_config(**kwargs),
       tensor_names=TENSOR_NAMES,
       model_class=DeepSeekDistillQwen,
       custom_loader=custom_loader,
-      mask_cache_size=mask_cache_size,
   )
```

ai_edge_torch/generative/examples/gemma/convert_gemma1_to_tflite.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -31,14 +31,13 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      mask_cache_size=converter.get_mask_cache_size_from_flags(),
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
```

ai_edge_torch/generative/examples/gemma/convert_gemma2_to_tflite.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -33,14 +33,13 @@ def main(_):
       custom_loader=loader.maybe_get_custom_loader(
           checkpoint_path, flags.FLAGS.custom_checkpoint_loader
       ),
-      mask_cache_size=converter.get_mask_cache_size_from_flags(),
+      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
   )
   converter.convert_to_tflite(
       pytorch_model,
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
```

ai_edge_torch/generative/examples/gemma/gemma1.py

Lines changed: 16 additions & 8 deletions
```diff
@@ -42,8 +42,16 @@ class Gemma1(model_builder.DecoderOnlyModel):
   pass
 
 
-def get_model_config_2b() -> cfg.ModelConfig:
-  """Returns the model config for a Gemma 2B model."""
+def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a Gemma 2B model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for a Gemma 2B model.
+  """
   attn_config = cfg.AttentionConfig(
       num_heads=8,
       head_dim=256,
@@ -72,33 +80,33 @@ def get_model_config_2b() -> cfg.ModelConfig:
       max_seq_len=8192,
       embedding_dim=embedding_dim,
       embedding_scale=embedding_dim**0.5,
+      kv_cache_max_len=kv_cache_max_len,
       block_configs=block_config,
       final_norm_config=norm_config,
       lm_head_use_bias=False,
   )
   return config
 
 
-def get_fake_model_config() -> cfg.ModelConfig:
-  config = get_model_config_2b()
+def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
+  config = get_model_config_2b(kv_cache_max_len)
   # Gemma has only one block config.
   config.block_config(0).ff_config.intermediate_size = 128
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 256
+  config.max_seq_len = 2 * kv_cache_max_len
   return config
 
 
 def build_2b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    mask_cache_size: int = 0,
+    **kwargs
 ) -> nn.Module:
   return model_builder.build_decoder_only_model(
       checkpoint_path=checkpoint_path,
-      config=get_model_config_2b(),
+      config=get_model_config_2b(**kwargs),
       tensor_names=TENSOR_NAMES,
       model_class=Gemma1,
       custom_loader=custom_loader,
-      mask_cache_size=mask_cache_size,
   )
```
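
Note that get_fake_model_config now derives max_seq_len from the requested cache size (max_seq_len = 2 * kv_cache_max_len), so its default of 128 reproduces the previously hard-coded 256. A small illustrative check, assuming the module is imported as gemma1 and reflects the state after this commit:

```python
from ai_edge_torch.generative.examples.gemma import gemma1

config = gemma1.get_fake_model_config()                     # kv_cache_max_len defaults to 128
assert config.max_seq_len == 256                            # 2 * 128, matching the old constant
config = gemma1.get_fake_model_config(kv_cache_max_len=64)
assert config.max_seq_len == 128
```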

ai_edge_torch/generative/examples/gemma/gemma2.py

Lines changed: 24 additions & 24 deletions
```diff
@@ -104,7 +104,7 @@ def forward(
 class Gemma2(nn.Module):
   """A Gemma2 model built from the Edge Generative API layers."""
 
-  def __init__(self, config: cfg.ModelConfig, mask_cache_size: int = 0):
+  def __init__(self, config: cfg.ModelConfig):
     super().__init__()
 
     # Construct model layers.
@@ -126,24 +126,17 @@ def __init__(self, config: cfg.ModelConfig, mask_cache_size: int = 0):
         config.embedding_dim,
         config.final_norm_config,
     )
-    self.config = config
-    self.build_mask_cache(mask_cache_size)
-
-  def build_mask_cache(self, mask_cache_size: int):
-    assert (
-        mask_cache_size <= self.config.max_seq_len
-    ), "Mask cache size must be less than or equal to the max seq length."
-    if mask_cache_size <= 0:
-      self.mask_cache = None
-      self.sliding_window_mask_cache = None
-      return
-    self.mask_cache = attn_utils.build_causal_mask_cache(mask_cache_size)
+    self.mask_cache = attn_utils.build_causal_mask_cache(
+        size=config.kv_cache_max,
+    )
     # Gemma2 has same hyper parameters for each layer except for attention
     # types. Use the first layer.
+    attn_config = config.block_config(0).attn_config
     self.sliding_window_mask_cache = attn_utils.build_sliding_window_mask_cache(
-        size=mask_cache_size,
-        window_size=self.config.block_config(0).attn_config.sliding_window_size,
+        size=config.kv_cache_max,
+        window_size=attn_config.sliding_window_size,
     )
+    self.config = config
 
   def get_attention_mask(
       self, attn_type: cfg.AttentionType, input_pos: torch.Tensor
@@ -174,7 +167,6 @@ def forward(
     n_elem = int(attn_config.rotary_percentage * attn_config.head_dim)
     rope = rotary_pos_emb.build_rope(input_pos, n_elem, attn_config.rotary_base)
     if mask is None:
-      assert self.mask_cache is not None, "Mask cache must be built."
       mask = [
           self.get_attention_mask(
               self.config.block_config(i).attn_config.attn_type, input_pos
@@ -230,8 +222,16 @@ def _forward_with_embeds(
     return {"logits": res, "kv_cache": updated_kv_cache}
 
 
-def get_model_config_2b() -> cfg.ModelConfig:
-  """Returns the model config for a Gemma2 2B model."""
+def get_model_config_2b(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
+  """Returns the model config for a Gemma2 2B model.
+
+  Args:
+    kv_cache_max_len (int): The maximum sequence length of the KV cache. Default
+      is 1024.
+
+  Returns:
+    The model config for a Gemma 2B model.
+  """
   norm_config = cfg.NormalizationConfig(
       type=cfg.NormalizationType.RMS_NORM, epsilon=1e-6, zero_centered=True
   )
@@ -277,6 +277,7 @@ def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
       max_seq_len=8192,
       embedding_dim=embedding_dim,
       embedding_scale=embedding_dim**0.5,
+      kv_cache_max_len=kv_cache_max_len,
       block_configs=[get_block_config(i) for i in range(num_layers)],
       final_norm_config=norm_config,
       lm_head_use_bias=False,
@@ -285,11 +286,11 @@ def get_block_config(idx: int) -> cfg.TransformerBlockConfig:
   return config
 
 
-def get_fake_model_config() -> cfg.ModelConfig:
-  config = get_model_config_2b()
+def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
+  config = get_model_config_2b(kv_cache_max_len)
   config.vocab_size = 128
   config.num_layers = 2
-  config.max_seq_len = 256
+  config.max_seq_len = 2 * kv_cache_max_len
   config.embedding_dim = 128
   config.embedding_scale = config.embedding_dim**0.5
   config.block_configs = config.block_configs[: config.num_layers]
@@ -304,17 +305,16 @@ def get_fake_model_config() -> cfg.ModelConfig:
 def build_2b_model(
     checkpoint_path: str,
     custom_loader: Callable[[str], Dict[str, torch.Tensor]] = None,
-    mask_cache_size: int = 0,
+    **kwargs,
 ) -> nn.Module:
   for tensor_names in TENSOR_NAMES_DICT.values():
     try:
       return model_builder.build_decoder_only_model(
           checkpoint_path=checkpoint_path,
-          config=get_model_config_2b(),
+          config=get_model_config_2b(**kwargs),
           tensor_names=tensor_names,
           model_class=Gemma2,
           custom_loader=custom_loader,
-          mask_cache_size=mask_cache_size,
       )
     except KeyError as _:
       continue
```
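
Gemma2.__init__ now always builds both mask caches, sized by config.kv_cache_max instead of a separate mask_cache_size argument. A construction sketch using the fake config defined in this diff; it assumes config.kv_cache_max reflects kv_cache_max_len, and the printed shapes are only illustrative:

```python
from ai_edge_torch.generative.examples.gemma import gemma2

config = gemma2.get_fake_model_config(kv_cache_max_len=128)
model = gemma2.Gemma2(config)   # no mask_cache_size argument anymore
# Both caches cover config.kv_cache_max positions: a full causal mask and a
# sliding-window variant for the layers that use local attention.
print(model.mask_cache.shape, model.sliding_window_mask_cache.shape)
```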

ai_edge_torch/generative/examples/gemma3/convert_gemma3_to_tflite.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -40,7 +40,7 @@ def main(_):
         custom_loader=loader.maybe_get_custom_loader(
             checkpoint_path, flags.FLAGS.custom_checkpoint_loader
         ),
-        mask_cache_size=converter.get_mask_cache_size_from_flags(),
+        kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
     )
   else:
     raise ValueError(f'Unsupported model size: {_MODEL_SIZE.value}')
@@ -50,7 +50,6 @@ def main(_):
       output_path=flags.FLAGS.output_path,
       output_name_prefix=flags.FLAGS.output_name_prefix,
       prefill_seq_len=flags.FLAGS.prefill_seq_lens,
-      kv_cache_max_len=flags.FLAGS.kv_cache_max_len,
       quantize=flags.FLAGS.quantize,
       lora_ranks=flags.FLAGS.lora_ranks,
       export_config=export_config.get_from_flags(),
```
