AI-Hypercomputer
diff --git a/‎MaxText/benchmark_chunked_prefill.py
+15-16 b/‎MaxText/benchmark_chunked_prefill.py
+15-16
diff --git a/‎MaxText/checkpointing.py
+14-5 b/‎MaxText/checkpointing.py
+14-5
diff --git a/‎MaxText/configs/base.yml
+7-8 b/‎MaxText/configs/base.yml
+7-8
diff --git a/‎MaxText/decode.py
+2 b/‎MaxText/decode.py
+2
@@ -32,22 +32,21 @@
 """
 
 
-# pylint: disable=ungrouped-imports
-import datetime
 import os
 from typing import Any, Sequence
+import datetime
 
 import jax
-from absl import app
-from jetstream.core import prefix_cache
+
 from jetstream.engine import chunked_prefill
 from jetstream.engine import engine_api
 
-from MaxText import max_utils
+from absl import app
+
+from MaxText import max_utils, prefix_cache
 from MaxText import maxengine
 from MaxText import pyconfig
 
-
 _WARMUP_ITERS = 2
 _BENCHMARK_ITERS = 5
 
@@ -81,7 +80,7 @@ def copy_prefix():
     return jax.tree.map(lambda x: x.copy(), prefix)
 
   # --- Fill the cache with dummy entries ---
-  print(f"Filling cache with {cache_num} dummy entries...")
+  print("Filling cache with", cache_num, "dummy entries...")
   for i in range(cache_num):
     # Create a unique dummy key, ensuring it's different from key_to_hit
     # and has the same length for consistency (though not strictly required by Trie).
@@ -105,10 +104,10 @@ def copy_prefix():
     jax.block_until_ready(load_result.prefix)
     del load_result
 
-  print(f"Finished filling cache with {cache_num} dummy entries.")
+  print("Finished filling cache with", cache_num, "dummy entries.")
 
   # --- Add the actual target entry ---
-  print(f"Adding the target entry with key length {len(key_to_hit)}...")
+  print("Adding the target entry with key length ", len(key_to_hit), "...", sep="")
 
   value_to_hit = prefix_cache.Value(
       prefix=copy_prefix(),
@@ -171,7 +170,7 @@ def run_chunked_prefill_utility():
     prefill_result = run_chunked_prefill_utility()
     jax.block_until_ready(prefill_result)
     end = datetime.datetime.now()
-    print(f"  Warmup iteration {i+1} time: {end - start}")
+    print("  Warmup iteration", i + 1, "time:", end - start)
 
   print("\nStarting benchmark...")
   total_time = datetime.timedelta()
@@ -182,10 +181,10 @@ def run_chunked_prefill_utility():
     end = datetime.datetime.now()
     iter_time = end - start
     total_time += iter_time
-    print(f"  Benchmark iteration {i+1} time: {iter_time}")
+    print("  Benchmark iteration", i + 1, "time:", iter_time)
 
   average_time = total_time / _BENCHMARK_ITERS
-  print(f"\nAverage time taken for chunked prefill over {_BENCHMARK_ITERS} iterations: {average_time}")
+  print("\nAverage time taken for chunked prefill over", _BENCHMARK_ITERS, "iterations:", average_time)
 
   # Run prefix caching benchmark
   prefill_result = run_chunked_prefill_utility()
@@ -235,13 +234,13 @@ def run_chunked_prefill_with_prefix_caching(cache_hit_chunk: int, need_save: boo
 
   for cache_hit_chunk in range(len(chunked_tokens_list)):
     for need_save in [True, False]:
-      print(f"\nBenchmark prefix caching {cache_hit_chunk=}, {need_save=}")
+      print("\nBenchmark prefix caching cache_hit_chunk=", cache_hit_chunk, " need_save=", need_save, sep="")
       for i in range(_WARMUP_ITERS):
         start = datetime.datetime.now()
         prefill_result = run_chunked_prefill_with_prefix_caching(cache_hit_chunk, need_save)
         jax.block_until_ready(prefill_result)
         end = datetime.datetime.now()
-        print(f"  Warmup iteration {i+1} time: {end - start}")
+        print("  Warmup iteration", i + 1, "time:", end - start)
 
       total_time = datetime.timedelta()
       for i in range(_BENCHMARK_ITERS):
@@ -251,10 +250,10 @@ def run_chunked_prefill_with_prefix_caching(cache_hit_chunk: int, need_save: boo
         end = datetime.datetime.now()
         iter_time = end - start
         total_time += iter_time
-        print(f"  Benchmark iteration {i+1} time: {iter_time}")
+        print("  Benchmark iteration", i + 1, "time:", iter_time)
 
       average_time = total_time / _BENCHMARK_ITERS
-      print(f"\nAverage time taken for prefix caching chunked prefill: {average_time}")
+      print("\nAverage time taken for prefix caching chunked prefill:", average_time)
 
 
 if __name__ == "__main__":
 
@@ -184,8 +184,8 @@ def load_state_if_possible(
     enable_single_replica_ckpt_restoring: Optional[bool] = False,
     dataset_type: Optional[str] = "tfds",
     step: int = -1,  # -1 means latest
-    use_ocdbt = True,
-    use_zarr3 = True,
+    use_ocdbt=True,
+    use_zarr3=True,
 ):
   """Loads TrainState as possible from the inputs.
 
@@ -293,7 +293,11 @@ def map_to_pspec(data):
 
   if load_parameters_from_path != "":
     restored_params = load_params_from_path(
-        load_parameters_from_path, abstract_unboxed_pre_state.params, checkpoint_storage_concurrent_gb, use_ocdbt=use_ocdbt, use_zarr3=use_zarr3
+        load_parameters_from_path,
+        abstract_unboxed_pre_state.params,
+        checkpoint_storage_concurrent_gb,
+        use_ocdbt=use_ocdbt,
+        use_zarr3=use_zarr3,
     )
     return None, restored_params
   elif load_full_state_from_path != "":
@@ -329,7 +333,9 @@ def setup_checkpoint_logger(config) -> Any | None:  # pytype: disable=attribute-
   return orbax_cloud_logger
 
 
-def load_params_from_path(load_parameters_from_path, abstract_unboxed_params, checkpoint_storage_concurrent_gb, use_ocdbt=True, use_zarr3=True):
+def load_params_from_path(
+    load_parameters_from_path, abstract_unboxed_params, checkpoint_storage_concurrent_gb, use_ocdbt=True, use_zarr3=True
+):
   """Load decode params from checkpoint at specified path."""
   assert load_parameters_from_path, "load_parameters_from_path is not defined."
   max_logging.log(f"restoring params from {load_parameters_from_path}")
@@ -338,7 +344,10 @@ def load_params_from_path(load_parameters_from_path, abstract_unboxed_params, ch
   # *_concurrent_gb should be set for large models, the default is 96.
   ckptr = ocp.Checkpointer(
       ocp.PyTreeCheckpointHandler(
-          restore_concurrent_gb=checkpoint_storage_concurrent_gb, save_concurrent_gb=checkpoint_storage_concurrent_gb, use_ocdbt=use_ocdbt, use_zarr3=use_zarr3
+          restore_concurrent_gb=checkpoint_storage_concurrent_gb,
+          save_concurrent_gb=checkpoint_storage_concurrent_gb,
+          use_ocdbt=use_ocdbt,
+          use_zarr3=use_zarr3,
       )
   )
 
 
@@ -323,8 +323,8 @@ logical_axis_rules: [
                       ['cache_kv', []],
                       ['cache_sequence', []],
                       ['exp', 'expert'],
-                      ['paged_kv_heads', ['tensor']],
-                      ['num_pages', []],
+                      ['paged_kv_heads', []],
+                      ['num_pages', ['tensor']],
                       ['tokens_per_page', []],
                       ['paged_kv_head_dim_size', []],
                     ]
@@ -654,15 +654,14 @@ sa_v_layout: "HEAD_DIM_MINOR"
 ### Determine if we want to use load balance for context parallelism
 context_parallel_load_balance: True
 
+#######################
 ### Paged Attention ###
+#######################
 # These settings take effect only when `attention=paged`.
 # They should be adjusted based on the available HBM and model config.
-# Note: one page group corresponds to one request/slot 
-pagedattn_num_pages: 64  # total number of pages to allocate
-pagedattn_tokens_per_page: 32  # number of tokens each page can hold
-pagedattn_pages_per_compute_block: 4  # number of pages processed together in pallas kernels
-pagedattn_max_pages_per_group: -1  # defaults to number of pages needed to reach max_target_length
-
+pagedattn_num_pages: 64
+pagedattn_tokens_per_page: 32
+pagedattn_pages_per_compute_block: 8
 
 # Chunked Prefill Parameters
 prefill_chunk_size: 256
 
@@ -19,7 +19,9 @@
 
 import jax
 import jax.numpy as jnp
+
 from absl import app
+
 from jetstream.engine import engine_api
 
 from MaxText import max_utils