@@ -47,8 +47,9 @@ def apply_rotary_pos_emb(x: jnp.ndarray, cos_emb: jnp.ndarray, sin_emb: jnp.ndar
     return (x * cos_emb) + (rotate_half(x) * sin_emb)


-def precompute_rotary_embeddings(seq_len: int, head_dim: int,
-                                 base: float = 10000.0) -> tuple[jnp.ndarray, jnp.ndarray]:
+def precompute_rotary_embeddings(
+    seq_len: int, head_dim: int, base: float = 10000.0
+) -> tuple[jnp.ndarray, jnp.ndarray]:
     """Precomputes the RoPE cosine and sine embeddings.

     Args:
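For context on the two helpers referenced in this hunk, here is a minimal sketch of how `rotate_half` and `precompute_rotary_embeddings` are commonly implemented for RoPE. The signatures match the diff above, but the bodies are an assumption and may differ from the repository's actual code.

```python
import jax.numpy as jnp


def rotate_half(x: jnp.ndarray) -> jnp.ndarray:
    """Swaps and negates the two halves of the last dimension (RoPE rotation)."""
    x1, x2 = jnp.split(x, 2, axis=-1)
    return jnp.concatenate((-x2, x1), axis=-1)


def precompute_rotary_embeddings(
    seq_len: int, head_dim: int, base: float = 10000.0
) -> tuple[jnp.ndarray, jnp.ndarray]:
    """Builds cos/sin tables of shape (1, 1, seq_len, head_dim) for broadcasting."""
    # Inverse frequencies per dimension pair (assumed standard RoPE formulation).
    inv_freq = 1.0 / (base ** (jnp.arange(0, head_dim, 2) / head_dim))
    angles = jnp.einsum("i,j->ij", jnp.arange(seq_len), inv_freq)  # (seq_len, head_dim // 2)
    angles = jnp.concatenate((angles, angles), axis=-1)            # (seq_len, head_dim)
    return jnp.cos(angles)[None, None], jnp.sin(angles)[None, None]
```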
@@ -91,11 +92,11 @@ class RoPEMultiHeadAttention(nn.Module):
     rope_base: float = 10000.0
     dtype: jnp.dtype = jnp.float32

-    def setup(self) -> None: # Added -> None return type
+    def setup(self) -> None:  # Added -> None return type
         """Initializes the attention projections."""
         # Check head_dim validity early during setup
         if self.head_dim % 2 != 0:
-            raise ValueError(f"head_dim ({self.head_dim}) must be even for RoPE.")
+            raise ValueError(f"head_dim ({self.head_dim}) must be even for RoPE.")

         # Define layers here - they will be initialized when the module is first called
         total_head_dim = self.num_heads * self.head_dim
@@ -109,13 +110,12 @@ def setup(self) -> None: # Added -> None return type
             features=total_head_dim, use_bias=False, dtype=self.dtype, name="value_proj"
         )
         self.output_proj = nn.Dense(
-            features=self.num_heads * self.head_dim, # Output should match embed_dim
+            features=self.num_heads * self.head_dim,  # Output should match embed_dim
             use_bias=False,
             dtype=self.dtype,
-            name="output_proj"
+            name="output_proj",
         )

-
     @nn.compact
     # Also using Optional for the mask type hint for clarity with None default
     def __call__(self, x: jnp.ndarray, mask: jnp.ndarray | None = None) -> jnp.ndarray:
@@ -136,8 +136,7 @@ def __call__(self, x: jnp.ndarray, mask: jnp.ndarray | None = None) -> jnp.ndarr

         if embed_dim != total_head_dim:
             raise ValueError(
-                f"embed_dim ({embed_dim}) must equal num_heads*head_dim"
-                f" ({total_head_dim})"
+                f"embed_dim ({embed_dim}) must equal num_heads*head_dim ({total_head_dim})"
             )
         # Note: head_dim even check moved to setup for earlier failure

@@ -159,7 +158,6 @@ def __call__(self, x: jnp.ndarray, mask: jnp.ndarray | None = None) -> jnp.ndarr
         cos_emb = cos_emb.astype(self.dtype)
         sin_emb = sin_emb.astype(self.dtype)

-
         # 4. Apply RoPE to Query and Key
         query = apply_rotary_pos_emb(query, cos_emb, sin_emb)
         key = apply_rotary_pos_emb(key, cos_emb, sin_emb)
@@ -172,44 +170,46 @@ def __call__(self, x: jnp.ndarray, mask: jnp.ndarray | None = None) -> jnp.ndarr
         # 6. Scaled Dot-Product Attention
         # Attention scores: (batch, num_heads, seq_len, seq_len)
         attn_scores = jnp.matmul(query, key.transpose((0, 1, 3, 2))) / jnp.sqrt(
-            self.head_dim).astype(self.dtype) # Ensure sqrt is correct dtype
+            self.head_dim
+        ).astype(self.dtype)  # Ensure sqrt is correct dtype

         # Apply mask (if provided)
         if mask is not None:
             # Standard Flax causal mask is boolean (True means mask)
             # nn.make_causal_mask returns (1, seq_len, seq_len) or (batch, 1, seq_len, seq_len)
             # Check if mask needs broadcasting or conversion
-            if mask.ndim == 2: # Likely (seq_len, seq_len)
-                mask = mask[None, None, :, :] # -> (1, 1, seq_len, seq_len)
+            if mask.ndim == 2:  # Likely (seq_len, seq_len)
+                mask = mask[None, None, :, :]  # -> (1, 1, seq_len, seq_len)
             elif mask.ndim == 3 and mask.shape[1] != self.num_heads:
-                # Likely (batch, seq_len, seq_len) or causal (1, sl, sl)
+                # Likely (batch, seq_len, seq_len) or causal (1, sl, sl)
                 mask = mask[:, None, :, :]
-                # Assume (batch, seq_len, seq_len) -> (batch, 1, seq_len, seq_len)
+                # Assume (batch, seq_len, seq_len) -> (batch, 1, seq_len, seq_len)

             # Ensure mask is broadcastable to attn_scores shape
             mask_shape_expected = (batch_size, self.num_heads, seq_len, seq_len)
             if mask.shape != mask_shape_expected:
-                # Attempt broadcasting common causal mask shapes
-                if mask.shape == (1, 1, seq_len, seq_len) or mask.shape == (batch_size, 1,
-                        seq_len, seq_len): # Causal mask for all batches/heads
-                    mask = jnp.broadcast_to(mask, mask_shape_expected)
-                # Add other broadcasting cases if needed
-                else:
-                    raise ValueError(f"Mask shape {mask.shape} != exp shape {mask_shape_expected}")
-
+                # Attempt broadcasting common causal mask shapes
+                if mask.shape == (1, 1, seq_len, seq_len) or mask.shape == (
+                    batch_size,
+                    1,
+                    seq_len,
+                    seq_len,
+                ):  # Causal mask for all batches/heads
+                    mask = jnp.broadcast_to(mask, mask_shape_expected)
+                # Add other broadcasting cases if needed
+                else:
+                    raise ValueError(f"Mask shape {mask.shape} != exp shape {mask_shape_expected}")

             # Apply mask: Use large negative number where mask is True
             # (or where mask value is 0 if using 0/-inf convention)
             # Assuming boolean mask convention (True = mask) common in Flax examples
             # If using 0/-inf mask, the logic would be: attn_scores = attn_scores + mask
             attn_scores = jnp.where(mask, jnp.finfo(self.dtype).min, attn_scores)

-
         # Softmax to get attention weights
-        attn_weights = jax.nn.softmax(
-            attn_scores, axis=-1
-        ).astype(self.dtype) # Shape: (batch, num_heads, seq_len, seq_len)
-
+        attn_weights = jax.nn.softmax(attn_scores, axis=-1).astype(
+            self.dtype
+        )  # Shape: (batch, num_heads, seq_len, seq_len)

         # Apply attention weights to Value
         # Output per head: (batch, num_heads, seq_len, head_dim)
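As a standalone sanity check of the masked scaled dot-product attention in this hunk, the sketch below reproduces the score / mask / softmax / weighted-sum sequence with small illustrative shapes. The True-means-mask convention mirrors the `jnp.where` call above; the shapes and variable names are illustrative assumptions, not taken from the repository.

```python
import jax
import jax.numpy as jnp

batch, heads, seq_len, head_dim = 1, 2, 4, 8  # illustrative sizes
q_rng, k_rng, v_rng = jax.random.split(jax.random.PRNGKey(0), 3)
q = jax.random.normal(q_rng, (batch, heads, seq_len, head_dim))
k = jax.random.normal(k_rng, (batch, heads, seq_len, head_dim))
v = jax.random.normal(v_rng, (batch, heads, seq_len, head_dim))

# Scores, causal mask (True = masked out), softmax, weighted sum of values.
scores = jnp.matmul(q, k.transpose((0, 1, 3, 2))) / jnp.sqrt(head_dim)
causal_mask = ~jnp.tril(jnp.ones((seq_len, seq_len), dtype=bool))
scores = jnp.where(causal_mask[None, None], jnp.finfo(scores.dtype).min, scores)
weights = jax.nn.softmax(scores, axis=-1)
out = jnp.matmul(weights, v)
assert out.shape == (batch, heads, seq_len, head_dim)
```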
@@ -222,6 +222,6 @@ def __call__(self, x: jnp.ndarray, mask: jnp.ndarray | None = None) -> jnp.ndarr
         attn_output = attn_output.reshape(batch_size, seq_len, total_head_dim)

         # Final linear projection
-        output = self.output_proj(attn_output) # Use self.output_proj defined in setup
+        output = self.output_proj(attn_output)  # Use self.output_proj defined in setup

         return output
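Finally, a minimal usage sketch for the module as a whole, assuming the `RoPEMultiHeadAttention` class from this diff is importable and behaves as a standard Flax module. The shapes, the PRNG key, and the boolean causal mask (True = masked, matching the module's `jnp.where` convention) are illustrative assumptions.

```python
import jax
import jax.numpy as jnp

num_heads, head_dim = 4, 32
embed_dim = num_heads * head_dim  # must equal num_heads * head_dim per the check in __call__
batch, seq_len = 2, 16

attn = RoPEMultiHeadAttention(num_heads=num_heads, head_dim=head_dim)

# Boolean causal mask where True means "mask out"; a (seq_len, seq_len) mask is
# one of the shapes __call__ broadcasts to (batch, num_heads, seq_len, seq_len).
causal_mask = ~jnp.tril(jnp.ones((seq_len, seq_len), dtype=bool))

x = jnp.ones((batch, seq_len, embed_dim), dtype=jnp.float32)
variables = attn.init(jax.random.PRNGKey(0), x, causal_mask)
out = attn.apply(variables, x, causal_mask)
print(out.shape)  # (2, 16, 128)
```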