Commit e2c3dfd

Merge pull request #3463 from levskaya:vjp_fix

PiperOrigin-RevId: 579977891
Author: Flax Authors
Parents: 85245ad + 5ff36ba

4 files changed: +316 −3 lines changed

docs/api_reference/flax.linen/transformations.rst (2 additions, 0 deletions)

@@ -28,6 +28,8 @@ Transformations
   map_variables
   jvp
   vjp
+  grad
+  value_and_grad
   custom_vjp
   while_loop
   cond

flax/linen/__init__.py (2 additions, 0 deletions)

@@ -144,6 +144,8 @@
     scan as scan,
     switch as switch,
     vjp as vjp,
+    grad as grad,
+    value_and_grad as value_and_grad,
     vmap as vmap,
     while_loop as while_loop,
 )
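With these re-exports, the new transforms become reachable from the public ``flax.linen`` namespace. A quick sanity check (illustrative only, not part of this commit)::

    import flax.linen as nn

    # grad and value_and_grad are now re-exported alongside vjp, vmap, etc.
    assert callable(nn.grad)
    assert callable(nn.value_and_grad)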

flax/linen/transforms.py (183 additions, 3 deletions)

@@ -1055,7 +1055,8 @@ def vjp(
     vjp_variables: lift.CollectionFilter = 'params',
     variables: lift.CollectionFilter = True,
     rngs: lift.PRNGSequenceFilter = True,
-) -> Tuple[Any, Any]:
+    multi_scope: bool = False,
+):
   """A lifted version of ``jax.vjp``.

   See ``jax.vjp`` for the unlifted vector-Jacobian product (backward gradient).
@@ -1105,7 +1106,8 @@ def __call__(self, x, y):
     variables: other variable collections that are available inside `fn` but
       do not receive a cotangent.
     rngs: the prngs that are available inside `fn`.
-
+    multi_scope: for Modules containing multiple scopes from outside modules passed in,
+      allow variable gradients to be returned for multiple scopes instead of raising an error.
   Returns:
     If ``has_aux`` is ``False``, returns a ``(primals_out, vjpfun)`` pair, where
     ``primals_out`` is ``fn(*primals)``.
@@ -1121,7 +1123,7 @@ def __call__(self, x, y):
       (fn,),
       mdl,
       *primals,
-      multi_scope=False,
+      multi_scope=multi_scope,
      has_aux=has_aux,
      reduce_axes=reduce_axes,
      vjp_variables=vjp_variables,
@@ -1130,6 +1132,184 @@
   )


+def value_and_grad(
+    fn: Callable[..., Any],
+    mdl: Module,
+    *primals,
+    has_aux: bool = False,
+    reduce_axes=(),
+    variables: lift.CollectionFilter = True,
+    rngs: lift.PRNGSequenceFilter = True,
+):
+  """A limited, lifted equivalent of ``jax.value_and_grad``.
+
+  Note that for this convenience function, gradients are only calculated for
+  the function inputs, and not with respect to any module variables. The
+  target function must return a scalar-valued output. For a more general
+  lifted vjp, see ``nn.vjp`` for the lifted vector-Jacobian product.
+
+  Example::
+
+    class LearnScale(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        p = self.param('scale', nn.initializers.zeros_init(), ())
+        return p * x * y
+
+    class Foo(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        z, (x_grad, y_grad) = nn.value_and_grad(
+            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y)
+        return z, x_grad, y_grad
+
+  Args:
+    fn: Function to be differentiated. Its arguments should be arrays, scalars,
+      or standard Python containers of arrays or scalars. It should return an
+      array, scalar, or standard Python container of arrays or scalars. It will
+      receive the scope and primals as arguments.
+    mdl: The module of which the variables will be differentiated.
+    *primals: A sequence of primal values at which the Jacobian of ``fn``
+      should be evaluated. The length of ``primals`` should be equal to the
+      number of positional parameters to ``fn``. Each primal value should be a
+      tuple of arrays, scalar, or standard Python containers thereof.
+    has_aux: Optional, bool. Indicates whether ``fn`` returns a pair where the
+      first element is considered the output of the mathematical function to be
+      differentiated and the second element is auxiliary data. Default False.
+    reduce_axes: Optional, tuple of axis names. If an axis is listed here, and
+      ``fn`` implicitly broadcasts a value over that axis, the backward pass
+      will perform a ``psum`` of the corresponding gradient. Otherwise, the
+      grad will be per-example over named axes. For example, if ``'batch'``
+      is a named batch axis, ``vjp(f, *args, reduce_axes=('batch',))`` will
+      create a grad function that sums over the batch while ``grad(f, *args)``
+      will create a per-example grad.
+    variables: variable collections that are available inside `fn` but
+      do not receive a cotangent.
+    rngs: the prngs that are available inside `fn`.
+  Returns:
+    If ``has_aux`` is ``False``, returns a ``(primals_out, grads)`` pair, where
+    ``primals_out`` is ``fn(*primals)``. ``grads`` are the gradients for the
+    corresponding primals and do not include the gradients for module variables.
+    If ``has_aux`` is ``True``, returns a
+    ``((primals_out, aux), grads)`` tuple where ``aux`` is the auxiliary data
+    returned by ``fn``.
+  """
+
+  vjp_partial = functools.partial(
+      vjp,
+      fn,
+      mdl,
+      *primals,
+      has_aux=has_aux,
+      reduce_axes=reduce_axes,
+      vjp_variables=False,
+      variables=variables,
+      rngs=rngs,
+      multi_scope=True,
+  )
+
+  if has_aux:
+    out, vjp_fun, aux = vjp_partial()
+    if out.shape != ():
+      raise ValueError(
+          'grad can only work on functions with '
+          f'scalar-valued outputs. out shape={out.shape}'
+      )
+    _, *argument_grads = vjp_fun(jax.numpy.ones_like(out))
+    return (out, aux), argument_grads
+  else:
+    out, vjp_fun = vjp_partial()
+    if out.shape != ():
+      raise ValueError(
+          'grad can only work on functions with '
+          f'scalar-valued outputs. out shape={out.shape}'
+      )
+    _, *argument_grads = vjp_fun(jax.numpy.ones_like(out))
+    return out, argument_grads
+
+
+def grad(
+    fn: Callable[..., Any],
+    mdl: Module,
+    *primals,
+    has_aux: bool = False,
+    reduce_axes=(),
+    variables: lift.CollectionFilter = True,
+    rngs: lift.PRNGSequenceFilter = True,
+):
+  """A limited, lifted equivalent of ``jax.grad``.
+
+  Note that for this convenience function, gradients are only calculated for
+  the function inputs, and not with respect to any module variables. The
+  target function must return a scalar-valued output. For a more general
+  lifted vjp, see ``nn.vjp`` for the lifted vector-Jacobian product.
+
+  Example::
+
+    class LearnScale(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        p = self.param('scale', nn.initializers.zeros_init(), ())
+        return p * x * y
+
+    class Foo(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        x_grad, y_grad = nn.grad(
+            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y)
+        return x_grad, y_grad
+
+  Args:
+    fn: Function to be differentiated. Its arguments should be arrays, scalars,
+      or standard Python containers of arrays or scalars. It should return an
+      array, scalar, or standard Python container of arrays or scalars. It will
+      receive the scope and primals as arguments.
+    mdl: The module of which the variables will be differentiated.
+    *primals: A sequence of primal values at which the Jacobian of ``fn``
+      should be evaluated. The length of ``primals`` should be equal to the
+      number of positional parameters to ``fn``. Each primal value should be a
+      tuple of arrays, scalar, or standard Python containers thereof.
+    has_aux: Optional, bool. Indicates whether ``fn`` returns a pair where the
+      first element is considered the output of the mathematical function to be
+      differentiated and the second element is auxiliary data. Default False.
+    reduce_axes: Optional, tuple of axis names. If an axis is listed here, and
+      ``fn`` implicitly broadcasts a value over that axis, the backward pass
+      will perform a ``psum`` of the corresponding gradient. Otherwise, the
+      grad will be per-example over named axes. For example, if ``'batch'``
+      is a named batch axis, ``vjp(f, *args, reduce_axes=('batch',))`` will
+      create a grad function that sums over the batch while ``grad(f, *args)``
+      will create a per-example grad.
+    variables: variable collections that are available inside `fn` but
+      do not receive a cotangent.
+    rngs: the prngs that are available inside `fn`.
+  Returns:
+    If ``has_aux`` is ``False``, returns ``grads``, where ``grads`` are the
+    gradients for the corresponding primals and do not include the gradients
+    for module variables.
+    If ``has_aux`` is ``True``, returns a
+    ``(grads, aux)`` tuple where ``aux`` is the auxiliary data
+    returned by ``fn``.
+  """
+
+  value_and_grad_partial = functools.partial(
+      value_and_grad,
+      fn,
+      mdl,
+      *primals,
+      has_aux=has_aux,
+      reduce_axes=reduce_axes,
+      variables=variables,
+      rngs=rngs,
+  )
+
+  if has_aux:
+    (_, aux), argument_grads = value_and_grad_partial()
+    return argument_grads, aux
+  else:
+    _, argument_grads = value_and_grad_partial()
+    return argument_grads
+
+
 def jvp(
     fn: Callable[..., Any],
     mdl: Module,
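For readers skimming the diff, a minimal end-to-end sketch of driving the new ``nn.value_and_grad`` from outside a module, mirroring the docstring example above and the tests added below (``LearnScale`` and ``Foo`` are illustrative names from those examples, not library API)::

    import jax.numpy as jnp
    from jax import random
    import flax.linen as nn

    class LearnScale(nn.Module):
      @nn.compact
      def __call__(self, x, y):
        # Scalar output, as required by nn.grad / nn.value_and_grad.
        p = self.param('scale', nn.initializers.ones_init(), ())
        return jnp.sum(p * x * y)

    class Foo(nn.Module):
      @nn.compact
      def __call__(self, x, y):
        # Gradients are taken w.r.t. the primals (x, y) only,
        # never w.r.t. the 'scale' parameter.
        z, (x_grad, y_grad) = nn.value_and_grad(
            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y)
        return z, x_grad, y_grad

    x = random.uniform(random.key(1), (4,))
    y = random.uniform(random.key(2), (4,))
    vs = Foo().init(random.key(0), x, y)
    z, x_grad, y_grad = Foo().apply(vs, x, y)
    # With scale initialized to 1, d/dx sum(x * y) == y and d/dy == x.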

tests/linen/linen_transforms_test.py (129 additions, 0 deletions)

@@ -2007,6 +2007,135 @@ def __call__(self, x):
     )
     self.assertEqual(jax.tree_map(jnp.shape, vs), outer_expect)

+  def test_grad_simple(self):
+    class LearnScale(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        p = self.param('scale', nn.initializers.ones_init(), ())
+        return jnp.sum(p * x * y)
+
+    class Foo(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        x_grad, y_grad = nn.grad(
+            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y
+        )
+        return x_grad, y_grad
+
+    x = random.uniform(random.key(1), (4,))
+    y = random.uniform(random.key(2), (4,))
+    vs = Foo().init(random.key(0), x, y)
+
+    x_grad, y_grad = Foo().apply(vs, x, y)
+    self.assertTrue(tree_allclose(x_grad, y))
+    self.assertTrue(tree_allclose(y_grad, x))
+
+  def test_grad_simple_with_aux(self):
+    class LearnScale(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        p = self.param('scale', nn.initializers.ones_init(), ())
+        return jnp.sum(p * x * y), p
+
+    class Foo(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        (x_grad, y_grad), aux = nn.grad(
+            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y, has_aux=True
+        )
+        return aux, x_grad, y_grad
+
+    x = random.uniform(random.key(1), (4,))
+    y = random.uniform(random.key(2), (4,))
+    vs = Foo().init(random.key(0), x, y)
+
+    aux, x_grad, y_grad = Foo().apply(vs, x, y)
+    self.assertTrue(tree_allclose(x_grad, y))
+    self.assertTrue(tree_allclose(y_grad, x))
+    self.assertTrue(tree_allclose(aux, vs['params']['LearnScale_0']['scale']))
+
+  def test_value_and_grad_simple(self):
+    class LearnScale(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        p = self.param('scale', nn.initializers.ones_init(), ())
+        return jnp.sum(p * x * y)
+
+    class Foo(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        z, (x_grad, y_grad) = nn.value_and_grad(
+            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y
+        )
+        return z, x_grad, y_grad
+
+    x = random.uniform(random.key(1), (4,))
+    y = random.uniform(random.key(2), (4,))
+    vs = Foo().init(random.key(0), x, y)
+
+    z, x_grad, y_grad = Foo().apply(vs, x, y)
+    self.assertTrue(tree_allclose(x_grad, y))
+    self.assertTrue(tree_allclose(y_grad, x))
+
+  def test_value_and_grad_simple_with_aux(self):
+    class LearnScale(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        p = self.param('scale', nn.initializers.ones_init(), ())
+        return jnp.sum(p * x * y), p
+
+    class Foo(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        (z, aux), (x_grad, y_grad) = nn.value_and_grad(
+            lambda mdl, x, y: mdl(x, y), LearnScale(), x, y, has_aux=True
+        )
+        return z, aux, x_grad, y_grad
+
+    x = random.uniform(random.key(1), (4,))
+    y = random.uniform(random.key(2), (4,))
+    vs = Foo().init(random.key(0), x, y)
+
+    z, aux, x_grad, y_grad = Foo().apply(vs, x, y)
+    self.assertTrue(tree_allclose(x_grad, y))
+    self.assertTrue(tree_allclose(y_grad, x))
+    self.assertTrue(tree_allclose(aux, vs['params']['LearnScale_0']['scale']))
+
+  def test_value_and_grad_multiscope(self):
+    class Foo(nn.Module):
+      bar: nn.Module
+
+      @nn.compact
+      def __call__(self, x, y):
+        def fn(self, x, y):
+          qup = nn.Dense(y.shape[-1])
+          delta = y - self.bar(qup(x))
+          return jnp.sum(delta**2)
+
+        z, (x_grad, y_grad) = nn.value_and_grad(fn, self, x, y)
+        return z, x_grad, y_grad
+
+    class Baz(nn.Module):
+      @nn.compact
+      def __call__(self, x, y):
+        bar = nn.Dense(y.shape[-1])
+        return Foo(bar=bar)(x, y)
+
+    x = random.uniform(random.key(1), (4,))
+    y = random.uniform(random.key(2), (4,))
+    vs = Baz().init(random.key(0), x, y)
+    z, x_grad, y_grad = Baz().apply(vs, x, y)
+
+    def comparison_fn(x, y):
+      w1 = vs['params']['Foo_0']['Dense_0']['kernel']
+      w2 = vs['params']['Dense_0']['kernel']
+      delta = y - jnp.dot(jnp.dot(x, w1), w2)
+      return jnp.sum(delta**2)
+
+    self.assertTrue(tree_allclose(comparison_fn(x, y), z))
+    self.assertTrue(tree_allclose(jax.grad(comparison_fn, 0)(x, y), x_grad))
+    self.assertTrue(tree_allclose(jax.grad(comparison_fn, 1)(x, y), y_grad))
+

 if __name__ == '__main__':
   absltest.main()
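The expectations in these tests follow from the mechanism the new transforms use: take a scalar-valued lifted ``vjp`` with variable cotangents disabled, then pull back a cotangent of ones. A rough plain-JAX analogue of that strategy (illustrative sketch only; ``value_and_grad_via_vjp`` is a hypothetical helper, not Flax API)::

    import jax
    import jax.numpy as jnp

    def value_and_grad_via_vjp(f, *primals):
      out, vjp_fun = jax.vjp(f, *primals)
      if out.shape != ():  # mirror the scalar-output check in nn.value_and_grad
        raise ValueError(f'scalar-valued outputs only, got shape {out.shape}')
      grads = vjp_fun(jnp.ones_like(out))  # pull back a cotangent of 1.0
      return out, grads

    x = jnp.arange(3.0)
    y = jnp.arange(3.0) + 1.0
    val, (gx, gy) = value_and_grad_via_vjp(lambda x, y: jnp.sum(x * y), x, y)
    # gx == y and gy == x, matching the x_grad/y_grad assertions above.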
