Commit f6e19a3

LO._bilinear_derivative only computes derivatives for args that require gradients

1 parent: 7dc7fb1
2 files changed: +13 -7 lines changed

linear_operator/operators/_linear_operator.py (+7, -4)
@@ -362,7 +362,7 @@ def _bilinear_derivative(self, left_vecs: Tensor, right_vecs: Tensor) -> Tuple[O
         # Construct a detached version of each argument in the linear operator
         args = []
         for arg in self.representation():
-            if torch.is_tensor(arg) and arg.dtype.is_floating_point:
+            if torch.is_tensor(arg) and arg.dtype.is_floating_point and arg.requires_grad:
                 args.append(arg.detach().requires_grad_(True))
             else:
                 args.append(arg.detach())
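
To make the effect of the new requires_grad check concrete, here is a minimal, self-contained sketch of the bilinear-derivative pattern this hunk modifies. It is not the library's implementation: bilinear_derivative_sketch and matmul_fn are hypothetical names, and the real method operates on self.representation() rather than an explicit args tuple. The idea is the same: re-run the forward pass on detached copies and ask autograd for gradients only with respect to the copies that actually required them.

    import torch

    def bilinear_derivative_sketch(matmul_fn, args, left_vecs, right_vecs):
        detached = []
        for arg in args:
            if torch.is_tensor(arg) and arg.dtype.is_floating_point and arg.requires_grad:
                # Re-enable autograd only on the detached copies that need it.
                detached.append(arg.detach().requires_grad_(True))
            else:
                detached.append(arg.detach() if torch.is_tensor(arg) else arg)

        with torch.enable_grad():
            loss = (left_vecs * matmul_fn(detached, right_vecs)).sum()

        diff_args = [a for a in detached if torch.is_tensor(a) and a.requires_grad]
        grads = iter(torch.autograd.grad(loss, diff_args, allow_unused=True))
        # Args that were skipped simply get no gradient.
        return tuple(
            next(grads) if torch.is_tensor(a) and a.requires_grad else None
            for a in detached
        )

    # Example: K = A @ A^T + shift, where only A requires gradients.
    def matmul_fn(args, rhs):
        A, shift = args
        return (A @ A.transpose(-1, -2) + shift) @ rhs

    A = torch.randn(5, 2, requires_grad=True)
    shift = torch.eye(5)  # requires_grad=False, so it is skipped entirely
    left, right = torch.randn(5, 3), torch.randn(5, 3)
    dA, dshift = bilinear_derivative_sketch(matmul_fn, (A, shift), left, right)
    assert dshift is None and dA.shape == A.shape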
@@ -467,11 +467,14 @@ def _args(self) -> Tuple[Union[torch.Tensor, "LinearOperator", int], ...]:
     def _args(self, args: Tuple[Union[torch.Tensor, "LinearOperator", int], ...]) -> None:
         self._args_memo = args
 
+    @property
+    def _differentiable_kwargs(self) -> Dict[str, Union[Tensor, "LinearOperator"]]:
+        return dict(zip(self._differentiable_kwarg_names, self._differentiable_kwarg_vals))
+
     @property
     def _kwargs(self) -> Dict[str, Any]:
-        kwargs = dict(
-            zip(self._differentiable_kwarg_names, self._differentiable_kwarg_vals), **self._nondifferentiable_kwargs
-        )
+        kwargs = self._differentiable_kwargs
+        kwargs.update(self._nondifferentiable_kwargs)
         return kwargs
 
     def _approx_diagonal(self: Float[LinearOperator, "*batch N N"]) -> Float[torch.Tensor, "*batch N"]:
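
The second hunk factors the tensor-valued ("differentiable") keyword arguments out into their own property, so callers can retrieve them without the non-differentiable ones. A rough stand-alone sketch of the resulting behavior is below; KwargSplitSketch and its constructor are hypothetical, only the attribute and property names come from the diff.

    from typing import Any, Dict

    import torch
    from torch import Tensor

    class KwargSplitSketch:
        # Hypothetical container mimicking how kwargs might be stored split
        # into differentiable (tensor-valued) and non-differentiable parts.
        def __init__(self, **kwargs: Any) -> None:
            diff = {name: val for name, val in kwargs.items() if torch.is_tensor(val)}
            self._differentiable_kwarg_names = tuple(diff.keys())
            self._differentiable_kwarg_vals = tuple(diff.values())
            self._nondifferentiable_kwargs = {
                name: val for name, val in kwargs.items() if not torch.is_tensor(val)
            }

        @property
        def _differentiable_kwargs(self) -> Dict[str, Tensor]:
            # The new property from the diff: re-pair names with values.
            return dict(zip(self._differentiable_kwarg_names, self._differentiable_kwarg_vals))

        @property
        def _kwargs(self) -> Dict[str, Any]:
            # Merge the two groups, as the rewritten _kwargs does.
            kwargs = self._differentiable_kwargs
            kwargs.update(self._nondifferentiable_kwargs)
            return kwargs

    op = KwargSplitSketch(lengthscale=torch.rand(3, requires_grad=True), num_tasks=2)
    print(op._differentiable_kwargs)  # only the tensor-valued kwarg
    print(op._kwargs)                 # both kwargs merged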

linear_operator/operators/kernel_linear_operator.py (+6, -3)
@@ -141,10 +141,13 @@ def __init__(
         )
 
         # Create a version of each argument that is expanded to the broadcast batch shape
-        x1 = x1.expand(*batch_broadcast_shape, *x1.shape[-2:]).contiguous()
-        x2 = x2.expand(*batch_broadcast_shape, *x2.shape[-2:]).contiguous()
+        #
+        # NOTE: we must explicitly call requires_grad on each of these arguments
+        # for the automatic _bilinear_derivative to work in torch.autograd.Functions
+        x1 = x1.expand(*batch_broadcast_shape, *x1.shape[-2:]).contiguous().requires_grad_(x1.requires_grad)
+        x2 = x2.expand(*batch_broadcast_shape, *x2.shape[-2:]).contiguous().requires_grad_(x2.requires_grad)
         tensor_params = dict(
-            (name, val.expand(*batch_broadcast_shape, *param_nonbatch_shapes[name]))
+            (name, val.expand(*batch_broadcast_shape, *param_nonbatch_shapes[name]).requires_grad_(val.requires_grad))
             for name, val in tensor_params.items()
         )
         new_param_batch_shapes = dict((name, batch_broadcast_shape) for name in param_batch_shapes.keys())
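
The NOTE in this hunk refers to the fact that a torch.autograd.Function runs its forward with grad mode disabled, so expand().contiguous() would return copies with requires_grad=False even when the inputs required gradients; the requires_grad check added to _bilinear_derivative above would then skip those arguments. A small illustrative snippet of that behavior, using a plain no_grad block as a stand-in for the autograd.Function context:

    import torch

    x1 = torch.randn(4, 3, requires_grad=True)
    batch_broadcast_shape = (2,)

    # With grad mode disabled, the expanded copy silently drops the flag ...
    with torch.no_grad():
        expanded = x1.expand(*batch_broadcast_shape, *x1.shape[-2:]).contiguous()
    print(expanded.requires_grad)  # False

    # ... unless it is restored explicitly, as the commit now does. The copy has
    # no autograd history here, so requires_grad_ simply marks it as a tensor
    # that _bilinear_derivative should differentiate with respect to.
    with torch.no_grad():
        expanded = (
            x1.expand(*batch_broadcast_shape, *x1.shape[-2:])
            .contiguous()
            .requires_grad_(x1.requires_grad)
        )
    print(expanded.requires_grad)  # True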
