
Commit 298690a

Consolidate sparse matmul preprocessed inputs into a single NamedTuple

PiperOrigin-RevId: 750724913
1 parent 95cac49 · commit 298690a

Showing 17 changed files with 298 additions and 463 deletions.
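
Note for orientation: preprocess_sparse_dense_matmul_input previously returned four loose per-table buffers (lhs_row_pointers, lhs_local_embedding_ids, lhs_local_sample_ids, lhs_gains) plus stats; after this commit it returns a single preprocessed_inputs value plus stats, and every sparse-dense matmul call site takes that one value. The NamedTuple's definition is not shown in the hunks below, so the following is only a minimal sketch of its assumed shape: the class name SparseDenseMatmulInput and the field types are assumptions inferred from the call sites, with the real definition living in jax_tpu_embedding/sparsecore/lib/nn/embedding.py.

# Hedged sketch only: the class name and field types are inferred from the
# call sites in this commit, not copied from the actual definition.
from typing import Any, Mapping, NamedTuple


class SparseDenseMatmulInput(NamedTuple):  # hypothetical name
  # Per-table CSR-style buffers: row pointers, device-local embedding ids,
  # the sample ids they belong to, and per-id combiner gains.
  lhs_row_pointers: Mapping[str, Any]
  lhs_embedding_ids: Mapping[str, Any]
  lhs_sample_ids: Mapping[str, Any]
  lhs_gains: Mapping[str, Any]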

jax_tpu_embedding/sparsecore/examples/shakespeare/jax_sc_shakespeare_jit.py (+8 -26)

@@ -370,18 +370,15 @@ def run_model():
           None,
           emb_var_outsharding,
       ),
-      donate_argnums=(9),
+      donate_argnums=(6),
   )
   def train_step_fn(
       mesh: jax.sharding.Mesh,
       model: nn.Module,
       optimizer,
       feature_specs,
       train_state: TrainState,
-      lhs_row_pointers,
-      lhs_local_embedding_ids,
-      lhs_local_sample_ids,
-      lhs_gains,
+      preprocessed_inputs,
       emb_variables,
       labels,
   ) -> tuple[TrainState, TrainMetrics, Nested[jax.Array]]:
@@ -398,15 +395,12 @@ def train_step_fn(
     tpu_sparse_dense_matmul = shard_map(
         f=tpu_sparse_dense_matmul,
         mesh=mesh,
-        in_specs=(pd, pd, pd, pd, pe),
+        in_specs=(pd, pe),
         out_specs=pd,
         check_rep=False,
     )
     emb_act = tpu_sparse_dense_matmul(
-        lhs_row_pointers,
-        lhs_local_embedding_ids,
-        lhs_local_sample_ids,
-        lhs_gains,
+        preprocessed_inputs,
         emb_variables,
     )

@@ -443,16 +437,13 @@ def train_step_fn(
     tpu_sparse_dense_matmul_grad = shard_map(
         f=tpu_sparse_dense_matmul_grad,
         mesh=mesh,
-        in_specs=(pd, pd, pd, pd, pd, pe),
+        in_specs=(pd, pd, pe),
         out_specs=pe,
         check_rep=False,
     )
     emb_variables = tpu_sparse_dense_matmul_grad(
         emb_grad,
-        lhs_row_pointers,
-        lhs_local_embedding_ids,
-        lhs_local_sample_ids,
-        lhs_gains,
+        preprocessed_inputs,
         emb_variables,
     )

@@ -512,13 +503,7 @@ def train_step_fn(
         lambda y: jax.make_array_from_process_local_data(global_sharding, y),
         x,
     )
-    (
-        lhs_row_pointers,
-        lhs_local_embedding_ids,
-        lhs_local_sample_ids,
-        lhs_gains,
-        stats,
-    ) = map(
+    preprocessed_inputs, stats = map(
         make_global_view,
         embedding.preprocess_sparse_dense_matmul_input(
             features,
@@ -541,10 +526,7 @@ def train_step_fn(
         optimizer,
         feature_specs,
         train_state,
-        lhs_row_pointers,
-        lhs_local_embedding_ids,
-        lhs_local_sample_ids,
-        lhs_gains,
+        preprocessed_inputs,
         emb_variables,
         labels,
     )
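
Why donate_argnums moves from 9 to 6: the donated argument is emb_variables, and collapsing four positional inputs into one preprocessed_inputs shifts its zero-based position down by three. A quick check against the new signature in the first hunk above:

# Zero-based parameter positions of train_step_fn after this change.
params = [
    "mesh", "model", "optimizer", "feature_specs", "train_state",
    "preprocessed_inputs", "emb_variables", "labels",
]
assert params.index("emb_variables") == 6  # matches donate_argnums=(6)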

jax_tpu_embedding/sparsecore/examples/shakespeare/jax_sc_shakespeare_pmap.py (+14 -31)

@@ -265,10 +265,7 @@ def train_step_fn(
     optimizer,
     feature_specs,
     train_state: TrainState,
-    lhs_row_pointers,
-    lhs_local_embedding_ids,
-    lhs_local_sample_ids,
-    lhs_gains,
+    preprocessed_inputs,
     emb_variables: Mapping[str, embedding.EmbeddingVariables],
     labels,
 ) -> tuple[
@@ -284,10 +281,7 @@ def train_step_fn(
       sharding_strategy='MOD',
   )
   emb_act = tpu_sparse_dense_matmul(
-      lhs_row_pointers,
-      lhs_local_embedding_ids,
-      lhs_local_sample_ids,
-      lhs_gains,
+      preprocessed_inputs,
       emb_variables,
   )

@@ -323,10 +317,7 @@ def train_step_fn(
   )
   emb_variables = tpu_sparse_dense_matmul_grad(
       emb_grad,
-      lhs_row_pointers,
-      lhs_local_embedding_ids,
-      lhs_local_sample_ids,
-      lhs_gains,
+      preprocessed_inputs,
       emb_variables,
   )

@@ -385,17 +376,15 @@ def train_step_fn(
   )

   # Preprocess the inputs.
-  (lhs_row_pointers, lhs_embedding_ids, lhs_sample_ids, lhs_gains, _) = (
-      embedding.preprocess_sparse_dense_matmul_input(
-          features,
-          feature_weights,
-          feature_specs,
-          local_device_count=global_mesh.local_mesh.size,
-          global_device_count=global_mesh.size,
-          num_sc_per_device=num_sc_per_device,
-          sharding_strategy='MOD',
-          has_leading_dimension=True,
-      )
+  preprocessed_inputs, _ = embedding.preprocess_sparse_dense_matmul_input(
+      features,
+      feature_weights,
+      feature_specs,
+      local_device_count=global_mesh.local_mesh.size,
+      global_device_count=global_mesh.size,
+      num_sc_per_device=num_sc_per_device,
+      sharding_strategy='MOD',
+      has_leading_dimension=True,
   )

   # TODO(patn): This (local_slice)will go away once the input processor is
@@ -432,10 +421,7 @@ def train_step_fn(
       continue
     jaxpr = jax.make_jaxpr(p_train_step_fn)(
         train_state,
-        lhs_row_pointers,
-        lhs_embedding_ids,
-        lhs_sample_ids,
-        lhs_gains,
+        preprocessed_inputs,
         emb_variables,
         labels_sharded,
     )
@@ -448,10 +434,7 @@ def train_step_fn(

     train_state, metrics_update, emb_variables = p_train_step_fn(
         train_state,
-        lhs_row_pointers,
-        lhs_embedding_ids,
-        lhs_sample_ids,
-        lhs_gains,
+        preprocessed_inputs,
         emb_variables,
         labels_sharded,
     )

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_cc_test.py (+6 -2)

@@ -763,15 +763,19 @@ def test_multi_process_fdo(self, has_leading_dimension):
             allow_id_dropping=False,
         )
     )
+    stats = embedding.SparseDenseMatmulInputStats(
+        max_ids_per_partition=stats["max_ids"],
+        max_unique_ids_per_partition=stats["max_unique_ids"],
+    )
     fdo_client.record(stats)
     fdo_client.publish()
     # Duplicated ids on row 0 and 6 are combined.
     np.testing.assert_equal(
-        stats["max_ids"]["one_table_to_rule_them_all"],
+        stats.max_ids_per_partition["one_table_to_rule_them_all"],
         np.array([7, 4, 6, 5, 9, 5, 5, 5], dtype=np.int32),
     )
     np.testing.assert_equal(
-        stats["max_unique_ids"]["one_table_to_rule_them_all"],
+        stats.max_unique_ids_per_partition["one_table_to_rule_them_all"],
         np.array([3, 3, 4, 4, 5, 3, 3, 5], dtype=np.int32),
     )

jax_tpu_embedding/sparsecore/lib/fdo/BUILD (+6 -2)

@@ -22,14 +22,18 @@ package(
 pytype_strict_library(
     name = "fdo_client",
     srcs = ["fdo_client.py"],
-    deps = [pypi_requirement("numpy")],
+    deps = [
+        "//jax_tpu_embedding/sparsecore/lib/nn:embedding",
+        pypi_requirement("numpy"),
+    ],
 )

 pytype_strict_library(
     name = "file_fdo_client",
     srcs = ["file_fdo_client.py"],
     deps = [
         ":fdo_client",
+        "//jax_tpu_embedding/sparsecore/lib/nn:embedding",
         pypi_requirement("absl/logging"),
         pypi_requirement("jax"),
         pypi_requirement("numpy"),
@@ -42,8 +46,8 @@ pytype_strict_contrib_test(
     env = {"JAX_PLATFORMS": "cpu"},
     deps = [
         ":file_fdo_client",
+        "//jax_tpu_embedding/sparsecore/lib/nn:embedding",
         pypi_requirement("absl/testing:absltest"),
-        pypi_requirement("jax"),
         pypi_requirement("numpy"),
     ],
 )

jax_tpu_embedding/sparsecore/lib/fdo/fdo_client.py (+2 -1)

@@ -16,6 +16,7 @@
 import abc
 from collections.abc import Mapping

+from jax_tpu_embedding.sparsecore.lib.nn import embedding
 import numpy as np


@@ -41,7 +42,7 @@ class FDOClient(abc.ABC):
   @abc.abstractmethod
   def record(
       self,
-      data: Mapping[str, Mapping[str, np.ndarray]],
+      data: embedding.SparseDenseMatmulInputStats,
   ) -> None:
     """Records the raw stats to local memory.
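
For implementers and callers of FDOClient, record() now takes the typed stats object instead of a raw nested dict. A minimal usage sketch based on the updated test call sites in this commit; the table name and values are illustrative, and `client` is hypothetical, standing for any FDOClient implementation:

from jax_tpu_embedding.sparsecore.lib.nn import embedding
import numpy as np

stats = embedding.SparseDenseMatmulInputStats(
    max_ids_per_partition={"my_table": np.array([7, 4, 6, 5])},
    max_unique_ids_per_partition={"my_table": np.array([3, 3, 4, 4])},
)
# Previously: client.record({"max_ids": ..., "max_unique_ids": ...})
client.record(stats)
client.publish()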

jax_tpu_embedding/sparsecore/lib/fdo/file_fdo_client.py (+14 -19)

@@ -24,13 +24,14 @@
 from absl import logging
 import jax
 from jax_tpu_embedding.sparsecore.lib.fdo import fdo_client
+from jax_tpu_embedding.sparsecore.lib.nn import embedding
 import numpy as np


 _FILE_NAME = 'fdo_stats'
 _FILE_EXTENSION = 'npz'
-_MAX_ID_STATS_KEY = '_max_ids'
-_MAX_UNIQUE_ID_STATS_KEY = '_max_unique_ids'
+_MAX_ID_STATS_SUFFIX = '_max_ids'
+_MAX_UNIQUE_ID_STATS_SUFFIX = '_max_unique_ids'


 class NPZFileFDOClient(fdo_client.FDOClient):
@@ -57,7 +58,7 @@ def __init__(self, base_dir: str):
     self._max_ids_per_partition = collections.defaultdict(np.ndarray)
     self._max_unique_ids_per_partition = collections.defaultdict(np.ndarray)

-  def record(self, data: Mapping[str, Mapping[str, np.ndarray]]) -> None:
+  def record(self, data: embedding.SparseDenseMatmulInputStats) -> None:
     """Records stats per process.

     Accumulates the max ids observed per process per sparsecore per device for
@@ -67,9 +68,7 @@ def record(self, data: Mapping[str, Mapping[str, np.ndarray]]) -> None:
     Args:
       data: A mapping representing data to be recorded.
     """
-    if _MAX_ID_STATS_KEY[1:] not in data:
-      raise ValueError(f'Expected stat ({_MAX_ID_STATS_KEY[1:]}) not found.')
-    max_ids_per_process = data[_MAX_ID_STATS_KEY[1:]]
+    max_ids_per_process = data.max_ids_per_partition
     for table_name, stats in max_ids_per_process.items():
       logging.vlog(
           2, 'Recording observed max ids for table: %s -> %s', table_name, stats
@@ -80,11 +79,7 @@ def record(self, data: Mapping[str, Mapping[str, np.ndarray]]) -> None:
       self._max_ids_per_partition[table_name] = np.vstack(
           (self._max_ids_per_partition[table_name], stats)
       )
-    if _MAX_UNIQUE_ID_STATS_KEY[1:] not in data:
-      raise ValueError(
-          f'Expected stats ({_MAX_UNIQUE_ID_STATS_KEY[1:]}) not found.'
-      )
-    max_uniques_per_process = data[_MAX_UNIQUE_ID_STATS_KEY[1:]]
+    max_uniques_per_process = data.max_unique_ids_per_partition
     for table_name, stats in max_uniques_per_process.items():
       logging.vlog(
           2,
@@ -107,7 +102,7 @@ def _generate_file_name(self) -> str:
         _FILE_NAME, jax.process_index(), time.time_ns(), _FILE_EXTENSION
     )
     return os.path.join(self._base_dir, filename)
-  # LINT.ThenChange(:_get_latest_files_by_process)
+  # LINT.ThenChange(:_get_latest_files_by_process)

   def _get_latest_files_by_process(self, files: list[str]) -> list[str]:
     """Returns the latest file for each process."""
@@ -150,11 +145,11 @@ def publish(self) -> None:
     processes.
     """
     merged_stats = {
-        f'{table_name}{_MAX_ID_STATS_KEY}': stats
+        f'{table_name}{_MAX_ID_STATS_SUFFIX}': stats
         for table_name, stats in self._max_ids_per_partition.items()
     }
     merged_stats.update({
-        f'{table_name}{_MAX_UNIQUE_ID_STATS_KEY}': stats
+        f'{table_name}{_MAX_UNIQUE_ID_STATS_SUFFIX}': stats
         for table_name, stats in self._max_unique_ids_per_partition.items()
     })
     self._write_to_file(merged_stats)
@@ -197,16 +192,16 @@ def load(
     stats = self._read_from_file(files_glob)
     max_id_stats, max_unique_id_stats = {}, {}
     for table_name, stats in stats.items():
-      if table_name.endswith(f'{_MAX_ID_STATS_KEY}'):
-        max_id_stats[table_name[: -len(_MAX_ID_STATS_KEY)]] = stats
-      elif table_name.endswith(f'{_MAX_UNIQUE_ID_STATS_KEY}'):
-        max_unique_id_stats[table_name[: -len(_MAX_UNIQUE_ID_STATS_KEY)]] = (
+      if table_name.endswith(f'{_MAX_ID_STATS_SUFFIX}'):
+        max_id_stats[table_name[: -len(_MAX_ID_STATS_SUFFIX)]] = stats
+      elif table_name.endswith(f'{_MAX_UNIQUE_ID_STATS_SUFFIX}'):
+        max_unique_id_stats[table_name[: -len(_MAX_UNIQUE_ID_STATS_SUFFIX)]] = (
             stats
         )
       else:
         raise ValueError(
             f'Unexpected table name and stats key: {table_name}, expected to'
-            f' end with {_MAX_ID_STATS_KEY} or {_MAX_UNIQUE_ID_STATS_KEY}'
+            f' end with {_MAX_ID_STATS_SUFFIX} or {_MAX_UNIQUE_ID_STATS_SUFFIX}'
         )
     self._max_ids_per_partition = max_id_stats
     self._max_unique_ids_per_partition = max_unique_id_stats
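
The _KEY to _SUFFIX rename matches how these strings are actually used: publish() appends them to table names to build the .npz keys, and load() strips them to recover the table names. A small illustration, with values mirroring the tests below:

import numpy as np

# publish() flattens the per-table stats into one dict whose keys are
# '<table>' plus the suffix, then writes it as a single .npz archive.
merged_stats = {
    "tab_one_max_ids": np.array([10, 20, 30, 40]),
    "tab_one_max_unique_ids": np.array([1, 2, 3, 4]),
}
# load() reverses the naming by stripping the suffix to recover the table name.
assert "tab_one_max_ids"[: -len("_max_ids")] == "tab_one"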

jax_tpu_embedding/sparsecore/lib/fdo/file_fdo_client_test.py (+15 -14)

@@ -17,6 +17,7 @@

 from absl.testing import absltest
 from jax_tpu_embedding.sparsecore.lib.fdo import file_fdo_client
+from jax_tpu_embedding.sparsecore.lib.nn import embedding
 import numpy as np


@@ -36,26 +37,26 @@ def _assert_stats_equal(self, actual, expected):

   def test_record_and_publish_load(self):
     fdo_client = file_fdo_client.NPZFileFDOClient(self.base_dir)
-    max_id_stats = {"tab_one": np.array([10, 20, 30, 40])}
-    max_unique_stats = {"tab_one": np.array([1, 2, 3, 4])}
-    fdo_client.record(
-        {"max_ids": max_id_stats, "max_unique_ids": max_unique_stats}
+    stats = embedding.SparseDenseMatmulInputStats(
+        max_ids_per_partition={"tab_one": np.array([10, 20, 30, 40])},
+        max_unique_ids_per_partition={"tab_one": np.array([1, 2, 3, 4])},
     )
+    fdo_client.record(stats)
     fdo_client.publish()
     loaded_max_ids, loaded_max_uniques = fdo_client.load()
-    self._assert_stats_equal(loaded_max_ids, max_id_stats)
-    self._assert_stats_equal(loaded_max_uniques, max_unique_stats)
+    self._assert_stats_equal(loaded_max_ids, stats.max_ids_per_partition)
+    self._assert_stats_equal(
+        loaded_max_uniques, stats.max_unique_ids_per_partition
+    )

   def test_multiple_record(self):
     fdo_client = file_fdo_client.NPZFileFDOClient(self.base_dir)
-    fdo_client.record({
-        "max_ids": {"tab_one": np.array([10, 20, 30, 40])},
-        "max_unique_ids": {"tab_one": np.array([1, 2, 3, 4])},
-    })
-    fdo_client.record({
-        "max_ids": {"tab_one": np.array([10, 20, 30, 40])},
-        "max_unique_ids": {"tab_one": np.array([1, 2, 3, 4])},
-    })
+    stats = embedding.SparseDenseMatmulInputStats(
+        max_ids_per_partition={"tab_one": np.array([10, 20, 30, 40])},
+        max_unique_ids_per_partition={"tab_one": np.array([1, 2, 3, 4])},
+    )
+    fdo_client.record(stats)
+    fdo_client.record(stats)
     fdo_client.publish()
     loaded_max_ids, loaded_max_uniques = fdo_client.load()
