
Commit 6b69b5d

gpytorch.settings.variational_cholesky_jitter can be set dynamically.
Previously, this context manager was only consulted when VariationalStrategy modules were initialized. With this change, gpytorch.settings.variational_cholesky_jitter dynamically changes the jitter value (even for variational models already in use), unless the user specifies a `jitter_val` in the VariationalStrategy constructor. In addition, this PR adds type hints to most of the VariationalStrategy modules. [Fixes #2244]
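For illustration, a minimal sketch of the new behavior. The model definition below follows the standard GPyTorch SVGP pattern and is not part of this commit; names such as `SVGPModel` and `test_x` are placeholders:

    import torch
    import gpytorch

    class SVGPModel(gpytorch.models.ApproximateGP):
        def __init__(self, inducing_points):
            variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(inducing_points.size(-2))
            # No jitter_val passed, so the jitter now tracks the global setting dynamically
            variational_strategy = gpytorch.variational.VariationalStrategy(
                self, inducing_points, variational_distribution, learn_inducing_locations=True
            )
            super().__init__(variational_strategy)
            self.mean_module = gpytorch.means.ConstantMean()
            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

        def forward(self, x):
            return gpytorch.distributions.MultivariateNormal(self.mean_module(x), self.covar_module(x))

    model = SVGPModel(torch.randn(16, 1))
    test_x = torch.randn(8, 1)

    with gpytorch.settings.variational_cholesky_jitter(1e-3):
        model(test_x)  # Cholesky factorizations inside the strategy now add 1e-3 jitter

    model(test_x)  # outside the context manager, the default jitter for the dtype applies again

    # A strategy constructed with an explicit jitter_val (e.g. jitter_val=1e-5) keeps that value
    # and ignores the context manager.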
1 parent 53c2c62 commit 6b69b5d

8 files changed: +221 -114 lines changed

gpytorch/variational/_variational_strategy.py

+52 -30
@@ -3,15 +3,19 @@
 import functools
 from abc import ABC, abstractproperty
 from copy import deepcopy
+from typing import Optional, Tuple

 import torch
+from linear_operator.operators import LinearOperator
+from torch import Tensor

 from .. import settings
 from ..distributions import Delta, MultivariateNormal
 from ..likelihoods import GaussianLikelihood
-from ..models import ExactGP
+from ..models import ApproximateGP, ExactGP
 from ..module import Module
 from ..utils.memoize import add_to_cache, cached, clear_cache_hook
+from . import _VariationalDistribution


 class _BaseExactGP(ExactGP):
@@ -42,14 +46,16 @@ class _VariationalStrategy(Module, ABC):
     has_fantasy_strategy = False

     def __init__(
-        self, model, inducing_points, variational_distribution, learn_inducing_locations=True, jitter_val=None
+        self,
+        model: ApproximateGP,
+        inducing_points: Tensor,
+        variational_distribution: _VariationalDistribution,
+        learn_inducing_locations: bool = True,
+        jitter_val: Optional[float] = None,
     ):
         super().__init__()

-        if jitter_val is None:
-            self.jitter_val = settings.variational_cholesky_jitter.value(inducing_points.dtype)
-        else:
-            self.jitter_val = jitter_val
+        self._jitter_val = jitter_val

         # Model
         object.__setattr__(self, "model", model)
@@ -70,7 +76,7 @@ def __init__(
     def _clear_cache(self):
         clear_cache_hook(self)

-    def _expand_inputs(self, x, inducing_points):
+    def _expand_inputs(self, x: Tensor, inducing_points: Tensor) -> Tuple[Tensor, Tensor]:
         """
         Pre-processing step in __call__ to make x the same batch_shape as the inducing points
         """
@@ -79,9 +85,19 @@ def _expand_inputs(self, x, inducing_points):
         x = x.expand(*batch_shape, *x.shape[-2:])
         return x, inducing_points

+    @property
+    def jitter_val(self) -> float:
+        if self._jitter_val is None:
+            return settings.variational_cholesky_jitter.value(dtype=self.inducing_points.dtype)
+        return self._jitter_val
+
+    @jitter_val.setter
+    def jitter_val(self, jitter_val: float):
+        self._jitter_val = jitter_val
+
     @abstractproperty
     @cached(name="prior_distribution_memo")
-    def prior_distribution(self):
+    def prior_distribution(self) -> MultivariateNormal:
         r"""
         The :func:`~gpytorch.variational.VariationalStrategy.prior_distribution` method determines how to compute the
         GP prior distribution of the inducing points, e.g. :math:`p(u) \sim N(\mu(X_u), K(X_u, X_u))`. Most commonly,
@@ -94,22 +110,29 @@ def prior_distribution(self):

     @property
     @cached(name="variational_distribution_memo")
-    def variational_distribution(self):
+    def variational_distribution(self) -> MultivariateNormal:
         return self._variational_distribution()

-    def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None, **kwargs):
+    def forward(
+        self,
+        x: Tensor,
+        inducing_points: Tensor,
+        inducing_values: Tensor,
+        variational_inducing_covar: Optional[LinearOperator] = None,
+        **kwargs,
+    ) -> MultivariateNormal:
         r"""
         The :func:`~gpytorch.variational.VariationalStrategy.forward` method determines how to marginalize out the
         inducing point function values. Specifically, forward defines how to transform a variational distribution
         over the inducing point values, :math:`q(u)`, in to a variational distribution over the function values at
         specified locations x, :math:`q(f|x)`, by integrating :math:`\int p(f|x, u)q(u)du`

-        :param torch.Tensor x: Locations :math:`\mathbf X` to get the
+        :param x: Locations :math:`\mathbf X` to get the
            variational posterior of the function values at.
-        :param torch.Tensor inducing_points: Locations :math:`\mathbf Z` of the inducing points
-        :param torch.Tensor inducing_values: Samples of the inducing function values :math:`\mathbf u`
+        :param inducing_points: Locations :math:`\mathbf Z` of the inducing points
+        :param inducing_values: Samples of the inducing function values :math:`\mathbf u`
            (or the mean of the distribution :math:`q(\mathbf u)` if q is a Gaussian.
-        :param ~linear_operator.operators.LinearOperator variational_inducing_covar: If
+        :param variational_inducing_covar: If
            the distribuiton :math:`q(\mathbf u)` is
            Gaussian, then this variable is the covariance matrix of that Gaussian.
            Otherwise, it will be None.
@@ -119,19 +142,19 @@ def forward(self, x, inducing_points, inducing_values, variational_inducing_cova
         """
         raise NotImplementedError

-    def kl_divergence(self):
+    def kl_divergence(self) -> Tensor:
         r"""
         Compute the KL divergence between the variational inducing distribution :math:`q(\mathbf u)`
         and the prior inducing distribution :math:`p(\mathbf u)`.
-
-        :rtype: torch.Tensor
         """
         with settings.max_preconditioner_size(0):
             kl_divergence = torch.distributions.kl.kl_divergence(self.variational_distribution, self.prior_distribution)
         return kl_divergence

     @cached(name="amortized_exact_gp")
-    def amortized_exact_gp(self, mean_module=None, covar_module=None):
+    def amortized_exact_gp(
+        self, mean_module: Optional[Module] = None, covar_module: Optional[Module] = None
+    ) -> ExactGP:
         mean_module = self.model.mean_module if mean_module is None else mean_module
         covar_module = self.model.covar_module if covar_module is None else covar_module

@@ -186,17 +209,17 @@ def amortized_exact_gp(self, mean_module=None, covar_module=None):
         inducing_exact_model.prediction_strategy = pred_strat
         return inducing_exact_model

-    def pseudo_points(self):
+    def pseudo_points(self) -> Tuple[Tensor, Tensor]:
         raise NotImplementedError("Each variational strategy must implement its own pseudo points method")

     def get_fantasy_model(
         self,
-        inputs,
-        targets,
-        mean_module=None,
-        covar_module=None,
+        inputs: Tensor,
+        targets: Tensor,
+        mean_module: Optional[Module] = None,
+        covar_module: Optional[Module] = None,
         **kwargs,
-    ):
+    ) -> ExactGP:
         r"""
         Performs the online variational conditioning (OVC) strategy of Maddox et al, '21 to return
         an exact GP model that incorporates the inputs and targets alongside the variational model's inducing
@@ -211,17 +234,16 @@ def get_fantasy_model(
         modules are attributes of the model itself called mean_module and covar_module respectively OR that you
         pass them into this method explicitly.

-        :param torch.Tensor inputs: (`b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`) Locations of fantasy
+        :param inputs: (`b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`) Locations of fantasy
            observations.
-        :param torch.Tensor targets: (`b1 x ... x bk x m` or `f x b1 x ... x bk x m`) Labels of fantasy observations.
-        :param torch.nn.Module mean_module: torch module describing the mean function of the GP model. Optional if
+        :param targets: (`b1 x ... x bk x m` or `f x b1 x ... x bk x m`) Labels of fantasy observations.
+        :param mean_module: torch module describing the mean function of the GP model. Optional if
            `mean_module` is already an attribute of the variational GP.
-        :param torch.nn.Module covar_module: torch module describing the covariance function of the GP model. Optional
+        :param covar_module: torch module describing the covariance function of the GP model. Optional
            if `covar_module` is already an attribute of the variational GP.
         :return: An `ExactGP` model with `k + m` training examples, where the `m` fantasy examples have been added
            and all test-time caches have been updated. We assume that there are `k` inducing points in this variational
            GP. Note that we return an `ExactGP` rather than a variational GP.
-        :rtype: ~gpytorch.models.ExactGP

         Reference: "Conditioning Sparse Variational Gaussian Processes for Online Decision-Making,"
            Maddox, Stanton, Wilson, NeurIPS, '21
@@ -282,7 +304,7 @@ def get_fantasy_model(
         fantasy_model.prediction_strategy = fant_pred_strat
         return fantasy_model

-    def __call__(self, x, prior=False, **kwargs):
+    def __call__(self, x: Tensor, prior: bool = False, **kwargs) -> MultivariateNormal:
         # If we're in prior mode, then we're done!
         if prior:
             return self.model.forward(x, **kwargs)
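
In short, the strategy above no longer freezes the jitter at construction time: the raw constructor argument is stored in `_jitter_val`, and the `jitter_val` property falls back to `gpytorch.settings.variational_cholesky_jitter` whenever `_jitter_val` is None, which is what lets the context manager affect models that already exist. Because the property also has a setter, an existing strategy can likewise be pinned to a fixed value after construction (a one-line sketch, assuming the `model` from the example above):

    model.variational_strategy.jitter_val = 1e-4  # pins the jitter; the global setting no longer applies to this strategy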

gpytorch/variational/batch_decoupled_variational_strategy.py

+26 -15
@@ -1,12 +1,17 @@
 #!/usr/bin/env python3

+from typing import Optional, Tuple
+
 import torch
-from linear_operator.operators import MatmulLinearOperator, SumLinearOperator
+from linear_operator.operators import LinearOperator, MatmulLinearOperator, SumLinearOperator
+from torch import Tensor
 from torch.distributions.kl import kl_divergence

 from ..distributions import Delta, MultivariateNormal
+from ..models import ApproximateGP
 from ..utils.errors import CachingError
 from ..utils.memoize import pop_from_cache_ignore_args
+from ._variational_distribution import _VariationalDistribution
 from .delta_variational_distribution import DeltaVariationalDistribution
 from .variational_strategy import VariationalStrategy

@@ -58,21 +63,20 @@ class BatchDecoupledVariationalStrategy(VariationalStrategy):
     :obj:`~gpytorch.variational.OrthogonallyDecoupledVariationalStrategy` (a variant proposed by
     `Salimbeni et al. (2018)`_ that uses orthogonal projections.)

-    :param ~gpytorch.models.ApproximateGP model: Model this strategy is applied to.
+    :param model: Model this strategy is applied to.
        Typically passed in when the VariationalStrategy is created in the
        __init__ method of the user defined model.
-    :param torch.Tensor inducing_points: Tensor containing a set of inducing
+    :param inducing_points: Tensor containing a set of inducing
        points to use for variational inference.
-    :param ~gpytorch.variational.VariationalDistribution variational_distribution: A
+    :param variational_distribution: A
        VariationalDistribution object that represents the form of the variational distribution :math:`q(\mathbf u)`
     :param learn_inducing_locations: (Default True): Whether or not
        the inducing point locations :math:`\mathbf Z` should be learned (i.e. are they
        parameters of the model).
-    :type learn_inducing_locations: `bool`, optional
-    :type mean_var_batch_dim: `int`, optional
     :param mean_var_batch_dim: (Default `None`):
        Set this parameter (ideally to `-1`) to indicate which dimension corresponds to different
        kernel hyperparameters for the mean/variance functions.
+    :param jitter_val: Amount of diagonal jitter to add for Cholesky factorization numerical stability

     .. _Cheng et al. (2017):
        https://arxiv.org/abs/1711.10127
@@ -133,12 +137,12 @@ class BatchDecoupledVariationalStrategy(VariationalStrategy):

     def __init__(
         self,
-        model,
-        inducing_points,
-        variational_distribution,
-        learn_inducing_locations=True,
-        mean_var_batch_dim=None,
-        jitter_val=None,
+        model: ApproximateGP,
+        inducing_points: Tensor,
+        variational_distribution: _VariationalDistribution,
+        learn_inducing_locations: bool = True,
+        mean_var_batch_dim: Optional[int] = None,
+        jitter_val: Optional[float] = None,
     ):
         if isinstance(variational_distribution, DeltaVariationalDistribution):
             raise NotImplementedError(
@@ -163,15 +167,22 @@ def __init__(
             model, inducing_points, variational_distribution, learn_inducing_locations, jitter_val=jitter_val
         )

-    def _expand_inputs(self, x, inducing_points):
+    def _expand_inputs(self, x: Tensor, inducing_points: Tensor) -> Tuple[Tensor, Tensor]:
         # If we haven't explicitly marked a dimension as batch, add the corresponding batch dimension to the input
         if self.mean_var_batch_dim is None:
             x = x.unsqueeze(-3)
         else:
             x = x.unsqueeze(self.mean_var_batch_dim - 2)
         return super()._expand_inputs(x, inducing_points)

-    def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None, **kwargs):
+    def forward(
+        self,
+        x: Tensor,
+        inducing_points: Tensor,
+        inducing_values: Tensor,
+        variational_inducing_covar: Optional[LinearOperator] = None,
+        **kwargs,
+    ) -> MultivariateNormal:
         # We'll compute the covariance, and cross-covariance terms for both the
         # pred-mean and pred-covar, using their different inducing points (and maybe kernel hypers)

@@ -225,7 +236,7 @@ def forward(self, x, inducing_points, inducing_values, variational_inducing_cova

         return MultivariateNormal(predictive_mean, predictive_covar)

-    def kl_divergence(self):
+    def kl_divergence(self) -> Tensor:
         variational_dist = self.variational_distribution
         prior_dist = self.prior_distribution

gpytorch/variational/ciq_variational_strategy.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
 from linear_operator import to_linear_operator
 from linear_operator.operators import DiagLinearOperator, MatmulLinearOperator, SumLinearOperator
 from linear_operator.utils import linear_cg
+from torch import Tensor

 from .. import settings
 from ..distributions import Delta, MultivariateNormal
@@ -141,17 +142,17 @@ class CiqVariationalStrategy(_VariationalStrategy):
     :obj:`~gpytorch.variational.NaturalVariationalDistribution` and
     `natural gradient descent`_.

-    :param ~gpytorch.models.ApproximateGP model: Model this strategy is applied to.
+    :param model: Model this strategy is applied to.
        Typically passed in when the VariationalStrategy is created in the
        __init__ method of the user defined model.
-    :param torch.Tensor inducing_points: Tensor containing a set of inducing
+    :param inducing_points: Tensor containing a set of inducing
        points to use for variational inference.
-    :param ~gpytorch.variational.VariationalDistribution variational_distribution: A
+    :param variational_distribution: A
        VariationalDistribution object that represents the form of the variational distribution :math:`q(\mathbf u)`
     :param learn_inducing_locations: (Default True): Whether or not
        the inducing point locations :math:`\mathbf Z` should be learned (i.e. are they
        parameters of the model).
-    :type learn_inducing_locations: `bool`, optional
+    :param jitter_val: Amount of diagonal jitter to add for Cholesky factorization numerical stability

     .. _Pleiss et al. (2020):
        https://arxiv.org/pdf/2006.11267.pdf
@@ -161,12 +162,12 @@ class CiqVariationalStrategy(_VariationalStrategy):
        examples/04_Variational_and_Approximate_GPs/Natural_Gradient_Descent.html
     """

-    def _ngd(self):
+    def _ngd(self) -> bool:
         return isinstance(self._variational_distribution, NaturalVariationalDistribution)

     @property
     @cached(name="prior_distribution_memo")
-    def prior_distribution(self):
+    def prior_distribution(self) -> MultivariateNormal:
         zeros = torch.zeros(
             self._variational_distribution.shape(),
             dtype=self._variational_distribution.dtype,
@@ -178,7 +179,7 @@ def prior_distribution(self):

     @property
     @cached(name="variational_distribution_memo")
-    def variational_distribution(self):
+    def variational_distribution(self) -> MultivariateNormal:
         if self._ngd():
             raise RuntimeError(
                 "Variational distribution for NGD-CIQ should be computed during forward calls. "
@@ -253,8 +254,8 @@ def forward(
         # Return the distribution
         return MultivariateNormal(predictive_mean, predictive_covar)

-    def kl_divergence(self):
-        r"""
+    def kl_divergence(self) -> Tensor:
+        """
         Compute the KL divergence between the variational inducing distribution :math:`q(\mathbf u)`
         and the prior inducing distribution :math:`p(\mathbf u)`.
