@@ -50,6 +50,7 @@ def dot_product_attention_weights(
     deterministic: bool = False,
     dtype: Optional[Dtype] = None,
     precision: PrecisionLike = None,
+    module: Optional[Module] = None,
 ):
   """Computes dot-product attention weights given query and key.
 
@@ -76,6 +77,10 @@ def dot_product_attention_weights(
     dtype: the dtype of the computation (default: infer from inputs and params)
     precision: numerical precision of the computation see `jax.lax.Precision`
       for details.
+    module: the Module that will sow the attention weights into the
+      'intermediates' collection. Remember to mark 'intermediates' as mutable via
+      `mutable=['intermediates']` in order to have that collection returned.
+      If `module` is None, the attention weights will not be sowed.
 
   Returns:
     Output of shape `[batch..., num_heads, q_length, kv_length]`.
@@ -107,6 +112,9 @@ def dot_product_attention_weights(
   # normalize the attention weights
   attn_weights = jax.nn.softmax(attn_weights).astype(dtype)
 
+  if module:
+    module.sow('intermediates', 'attention_weights', attn_weights)
+
   # apply attention dropout
   if not deterministic and dropout_rate > 0.0:
     keep_prob = 1.0 - dropout_rate
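For context, `Module.sow` appends its value to the named collection, and that collection is only handed back to the caller when it is marked mutable at `apply` time. Below is a minimal sketch of the functional-level flow, assuming the change above; the `WeightsProbe` wrapper is purely illustrative and not part of this change:

```python
import jax
import jax.numpy as jnp
import flax.linen as nn


class WeightsProbe(nn.Module):
  """Illustrative module that sows the attention weights it computes."""

  @nn.compact
  def __call__(self, q, k):
    # With module=self, dot_product_attention_weights calls
    # self.sow('intermediates', 'attention_weights', ...) as added above.
    return nn.dot_product_attention_weights(q, k, module=self)


q = k = jnp.ones((1, 4, 2, 8))  # [batch, length, num_heads, depth_per_head]
probe = WeightsProbe()
variables = probe.init(jax.random.PRNGKey(0), q, k)  # no params here

# Mark 'intermediates' as mutable so the sowed weights are returned.
weights, state = probe.apply(variables, q, k, mutable=['intermediates'])
# sow stores values as a tuple; take the first (and only) entry.
attn = state['intermediates']['attention_weights'][0]
print(attn.shape)  # (1, 2, 4, 4): [batch, num_heads, q_length, kv_length]
```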
@@ -134,6 +142,7 @@ def dot_product_attention(
     deterministic: bool = False,
     dtype: Optional[Dtype] = None,
     precision: PrecisionLike = None,
+    module: Optional[Module] = None,
 ):
   """Computes dot-product attention given query, key, and value.
 
@@ -164,6 +173,10 @@ def dot_product_attention(
     dtype: the dtype of the computation (default: infer from inputs)
     precision: numerical precision of the computation see `jax.lax.Precision`
       for details.
+    module: the Module that will sow the attention weights into the
+      'intermediates' collection. Remember to mark 'intermediates' as mutable via
+      `mutable=['intermediates']` in order to have that collection returned.
+      If `module` is None, the attention weights will not be sowed.
 
   Returns:
     Output of shape `[batch..., q_length, num_heads, v_depth_per_head]`.
@@ -191,6 +204,7 @@ def dot_product_attention(
       deterministic,
       dtype,
       precision,
+      module,
   )
 
   # return weighted sum over values for each query position
@@ -306,6 +320,7 @@ def __call__(
       mask: Optional[Array] = None,
       deterministic: Optional[bool] = None,
       dropout_rng: Optional[PRNGKey] = None,
+      return_weights: bool = False,
   ):
     ...
 
@@ -318,6 +333,7 @@ def __call__(
       mask: Optional[Array] = None,
       deterministic: Optional[bool] = None,
       dropout_rng: Optional[PRNGKey] = None,
+      return_weights: bool = False,
   ):
     ...
 
@@ -332,6 +348,7 @@ def __call__(
       mask: Optional[Array] = None,
       deterministic: Optional[bool] = None,
       dropout_rng: Optional[PRNGKey] = None,
+      return_weights: bool = False,
   ):
     """Applies multi-head dot product attention on the input data.
 
@@ -358,6 +375,10 @@ def __call__(
         dropout, whereas if true, the attention weights are deterministic.
       dropout_rng: optional rng key to pass to the attention layer's dropout
         mask. Otherwise, self.make_rng('dropout') is used instead.
+      return_weights: if `True`, the attention weights are sowed into the
+        'intermediates' collection. Remember to mark 'intermediates' as
+        mutable via `mutable=['intermediates']` in order to have that
+        collection returned.
 
     Returns:
       output of shape `[batch_sizes..., length, features]`.
@@ -506,18 +527,33 @@ def __call__(
       m_deterministic = True
 
     # apply attention
-    x = self.attention_fn(
-        query,
-        key,
-        value,
-        mask=mask,
-        dropout_rng=dropout_rng,
-        dropout_rate=self.dropout_rate,
-        broadcast_dropout=self.broadcast_dropout,
-        deterministic=m_deterministic,
-        dtype=self.dtype,
-        precision=self.precision,
-    )  # pytype: disable=wrong-keyword-args
+    if return_weights:
+      x = self.attention_fn(
+          query,
+          key,
+          value,
+          mask=mask,
+          dropout_rng=dropout_rng,
+          dropout_rate=self.dropout_rate,
+          broadcast_dropout=self.broadcast_dropout,
+          deterministic=m_deterministic,
+          dtype=self.dtype,
+          precision=self.precision,
+          module=self if return_weights else None,
+      )  # pytype: disable=wrong-keyword-args
+    else:
+      x = self.attention_fn(
+          query,
+          key,
+          value,
+          mask=mask,
+          dropout_rng=dropout_rng,
+          dropout_rate=self.dropout_rate,
+          broadcast_dropout=self.broadcast_dropout,
+          deterministic=m_deterministic,
+          dtype=self.dtype,
+          precision=self.precision,
+      )
     # back to the original inputs dimensions
     out = DenseGeneral(
         features=features,
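The duplicated call appears intended to keep custom `attention_fn` callables that predate this change, and therefore do not accept a `module` keyword, working as long as `return_weights` stays `False`. A hedged sketch of such a custom function (the name and the defensive `pop` are illustrative):

```python
import flax.linen as nn


def my_attention_fn(query, key, value, **kwargs):
  """Custom attention_fn that predates the `module` keyword."""
  # Tolerate the new keyword if a caller ever forwards it anyway.
  kwargs.pop('module', None)
  return nn.dot_product_attention(query, key, value, **kwargs)


layer = nn.MultiHeadDotProductAttention(num_heads=4, attention_fn=my_attention_fn)
```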