* Adding an argument to set the reduction ratio thresholds for automatic damping adjustment.

botev · KfacJaxDev · commit 5f2ec527fcac · 2022-04-24T05:54:35.000-07:00
* Bug fix: get_default_tag_to_block_ctor now correctly returns None if the tag is not present.
* Adding an option to skip the check that raises an error when the function is re-traced into a different graph.
* Fixing a bug in ExplicitExactCurvature.update_cache().

PiperOrigin-RevId: 443657951
diff --git a/kfac_jax/_src/curvature_estimator.py b/kfac_jax/_src/curvature_estimator.py
@@ -80,10 +80,12 @@
 )
 
 
-def get_default_tag_to_block_ctor(tag_name: str) -> CurvatureBlockCtor:
+def get_default_tag_to_block_ctor(
+    tag_name: str
+) -> Optional[CurvatureBlockCtor]:
   """Returns the default curvature block constructor for the give tag name."""
   global _DEFAULT_TAG_TO_BLOCK_CTOR
-  return _DEFAULT_TAG_TO_BLOCK_CTOR[tag_name]
+  return _DEFAULT_TAG_TO_BLOCK_CTOR.get(tag_name)
 
 
 def set_default_tag_to_block_ctor(
@@ -1262,7 +1264,7 @@ def blocks_vectors_to_params_vector(
 
   def update_curvature_matrix_estimate(
       self,
-      state: curvature_blocks.Full.State,
+      state: BlockDiagonalCurvature.State,
       ema_old: chex.Numeric,
       ema_new: chex.Numeric,
       batch_size: int,
@@ -1297,18 +1299,19 @@ def single_state_update(
 
   def update_cache(
       self,
-      state: curvature_blocks.Full.State,
+      state: BlockDiagonalCurvature.State,
       identity_weight: chex.Numeric,
       exact_powers: Optional[curvature_blocks.ScalarOrSequence],
       approx_powers: Optional[curvature_blocks.ScalarOrSequence],
       eigenvalues: bool,
       pmap_axis_name: Optional[str],
   ) -> curvature_blocks.Full.State:
-    return self.blocks[0].update_cache(
-        state=state,
+    block_state = self.blocks[0].update_cache(
+        state=state.blocks_states[0],
         identity_weight=identity_weight,
         exact_powers=exact_powers,
         approx_powers=approx_powers,
         eigenvalues=eigenvalues,
         pmap_axis_name=pmap_axis_name,
     )
+    return BlockDiagonalCurvature.State(blocks_states=(block_state,))
diff --git a/kfac_jax/_src/optimizer.py b/kfac_jax/_src/optimizer.py
@@ -101,6 +101,8 @@ def __init__(
       include_damping_in_quad_change: bool = False,
       damping_adaptation_interval: int = 5,
       damping_adaptation_decay: chex.Numeric = 0.9,
+      damping_lower_threshold: chex.Numeric = 0.25,
+      damping_upper_threshold: chex.Numeric = 0.75,
       always_use_exact_qmodel_for_damping_adjustment: bool = False,
       norm_constraint: Optional[chex.Numeric] = None,
       num_burnin_steps: int = 10,
@@ -200,6 +202,10 @@ def __init__(
       damping_adaptation_decay: Scalar. The ``damping`` parameter is multiplied
         by the ``damping_adaptation_decay`` every
         ``damping_adaptation_interval`` number of iterations. (Default: ``0.9``)
+      damping_lower_threshold: Scalar. The ``damping`` parameter is increased if
+        the reduction ratio is below this threshold. (Default: ``0.25``)
+      damping_upper_threshold: Scalar. The ``damping`` parameter is decreased if
+        the reduction ratio is above this threshold. (Default: ``0.75``)
       always_use_exact_qmodel_for_damping_adjustment: Boolean. When using
         learning rate and/or momentum adaptation, the quadratic model change
         used for damping adaption is always computed using the exact curvature
@@ -314,6 +320,8 @@ def schedule_with_first_step_zero(global_step: chex.Array) -> chex.Array:
     self._include_damping_in_quad_change = include_damping_in_quad_change
     self._damping_adaptation_decay = damping_adaptation_decay
     self._damping_adaptation_interval = damping_adaptation_interval
+    self._damping_lower_threshold = damping_lower_threshold
+    self._damping_upper_threshold = damping_upper_threshold
     self._always_use_exact_qmodel_for_damping_adjustment = (
         always_use_exact_qmodel_for_damping_adjustment)
     self._norm_constraint = norm_constraint
@@ -1111,10 +1119,10 @@ def _compute_new_damping_and_rho(
     rho = (new_loss - old_loss) / quad_change
 
     # Update damping
-    should_decrease = rho > 0.75
-    decreased_damping = current_damping * self.damping_decay_factor
-    should_increase = rho < 0.25
+    should_increase = rho < self._damping_lower_threshold
     increased_damping = current_damping / self.damping_decay_factor
+    should_decrease = rho > self._damping_upper_threshold
+    decreased_damping = current_damping * self.damping_decay_factor
 
     # This is basically an if-else statement
     damping = (should_decrease * decreased_damping +
diff --git a/kfac_jax/_src/tracer.py b/kfac_jax/_src/tracer.py
@@ -282,6 +282,7 @@ def cached_transformation(
     auto_register_tags: bool = True,
     allow_left_out_params: bool = False,
     allow_no_losses: bool = False,
+    raise_error_on_diff_jaxpr: bool = True,
     **auto_registration_kwargs: Any,
 ) -> TransformedFunction[T, T]:
   """Caches ``transformation(preprocessed_jaxpr, func_args, *args)``.
@@ -302,6 +303,9 @@ def cached_transformation(
       tag.
     allow_no_losses: If this is set to ``False`` an error would be raised if no
       registered losses have been found when tracing the function.
+    raise_error_on_diff_jaxpr: Whether to raise an exception if the function has
+      been traced before, with different arguments, and the new Jaxpr graph
+      differs in more than just the shapes and dtypes of the Jaxpr equations.
     **auto_registration_kwargs: Any additional keyword arguments, to be passed
       to the automatic registration pass.
 
@@ -341,8 +345,8 @@ def wrapped_transformation(
     if not allow_no_losses and not processed_jaxpr.loss_tags:
       raise ValueError("No registered losses have been found during tracing.")
 
-    # If any previous `ProcessedJaxpr` exits verify that they are equivalent
-    if cache:
+    if cache and raise_error_on_diff_jaxpr:
+      # If any previous `ProcessedJaxpr` exists verify that they are equivalent
       ref_jaxpr, _ = cache[next(iter(cache))]
       if ref_jaxpr != processed_jaxpr:
         raise ValueError("The consecutive tracing of the provided function "
@@ -889,6 +893,7 @@ def layer_tags_vjp(
     func: utils.Func,
     params_index: int = 0,
     auto_register_tags: bool = True,
+    raise_error_on_diff_jaxpr: bool = True,
     **auto_registration_kwargs,
 ) -> ...:
   """Creates a function for primal values and tangents w.r.t. all layer tags.
@@ -910,6 +915,8 @@ def layer_tags_vjp(
       parameters.
     auto_register_tags: Whether to run an automatic layer registration on the
       function (e.g. :func:`~auto_register_tags`).
+    raise_error_on_diff_jaxpr: Whether to raise an exception if tracing with
+      different arguments returns a jaxpr with a different graph.
     **auto_registration_kwargs: Any additional keyword arguments, to be passed
       to the automatic registration pass.
 
@@ -924,5 +931,6 @@ def layer_tags_vjp(
       params_index=params_index,
       auto_register_tags=auto_register_tags,
       allow_left_out_params=False,
+      raise_error_on_diff_jaxpr=raise_error_on_diff_jaxpr,
       **auto_registration_kwargs
   )
diff --git a/tests/test_estimator.py b/tests/test_estimator.py
@@ -106,7 +106,7 @@ def test_explicit_exact_full(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)
 
@@ -167,7 +167,7 @@ def test_block_diagonal_full(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)
 
@@ -231,7 +231,7 @@ def test_block_diagonal_full_to_hessian(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)
 
@@ -300,7 +300,7 @@ def test_diagonal(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)
 
@@ -366,7 +366,7 @@ def test_kronecker_factored(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)
 
@@ -446,7 +446,7 @@ def test_eigenvalues(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)
 
@@ -534,7 +534,7 @@ def test_matmul(
     data = {}
     for name, shape in data_point_shapes.items():
       data_key, key = jax.random.split(data_key)
-      data[name] = jax.random.uniform(key, (data_size,) + shape)
+      data[name] = jax.random.uniform(key, (data_size, *shape))
       if name == "labels":
         data[name] = jnp.argmax(data[name], axis=-1)