@@ -4,12 +4,13 @@

 import torch
 from linear_operator import to_linear_operator
-from linear_operator.operators import DiagLinearOperator, MatmulLinearOperator, SumLinearOperator
+from linear_operator.operators import DiagLinearOperator, LinearOperator, MatmulLinearOperator, SumLinearOperator
 from linear_operator.utils import linear_cg
 from torch import Tensor
+from torch.autograd.function import FunctionCtx

 from .. import settings
-from ..distributions import Delta, MultivariateNormal
+from ..distributions import Delta, Distribution, MultivariateNormal
 from ..module import Module
 from ..utils.memoize import cached
 from ._variational_strategy import _VariationalStrategy
@@ -35,7 +36,10 @@ class _NgdInterpTerms(torch.autograd.Function):

     @staticmethod
     def forward(
-        ctx, interp_term: torch.Tensor, natural_vec: torch.Tensor, natural_mat: torch.Tensor
+        ctx: FunctionCtx,
+        interp_term: torch.Tensor,
+        natural_vec: torch.Tensor,
+        natural_mat: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         # Compute precision
         prec = natural_mat.mul(-2.0)
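The context line `prec = natural_mat.mul(-2.0)` encodes the standard natural parameterization of a Gaussian. Below is a minimal sketch of that relationship, assuming N(m, S) with natural parameters S^{-1} m and -1/2 S^{-1}; the sketch is illustrative and not part of this commit:

import torch

# For a Gaussian N(m, S): natural_vec = S^{-1} m and natural_mat = -1/2 S^{-1},
# so the precision matrix is recovered as -2 * natural_mat.
m = torch.randn(3)
S = torch.diag(torch.tensor([0.5, 1.0, 2.0]))

natural_vec = torch.linalg.solve(S, m)    # S^{-1} m
natural_mat = torch.linalg.inv(S) * -0.5  # -1/2 S^{-1}

prec = natural_mat.mul(-2.0)              # same recovery as in forward()
assert torch.allclose(prec, torch.linalg.inv(S))
assert torch.allclose(torch.linalg.solve(prec, natural_vec), m)  # back to the mean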
@@ -80,8 +84,8 @@ def forward(

     @staticmethod
     def backward(
-        ctx, interp_mean_grad: torch.Tensor, interp_var_grad: torch.Tensor, kl_div_grad: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        ctx: FunctionCtx, interp_mean_grad: torch.Tensor, interp_var_grad: torch.Tensor, kl_div_grad: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None]:
         # Get the saved terms
         interp_term, s_times_interp_term, interp_mean, natural_vec, expec_vec, prec = ctx.saved_tensors

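The widened return annotation, `Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None]`, follows the `torch.autograd.Function` contract: `backward` returns one entry per `forward` input, with `None` standing in for inputs that receive no gradient. A self-contained sketch of that pattern (the `ScaleBy` function here is hypothetical, not from this file):

import torch
from torch.autograd.function import FunctionCtx


class ScaleBy(torch.autograd.Function):
    @staticmethod
    def forward(ctx: FunctionCtx, x: torch.Tensor, factor: float) -> torch.Tensor:
        ctx.factor = factor
        return x * factor

    @staticmethod
    def backward(ctx: FunctionCtx, grad_out: torch.Tensor):
        # One return value per forward() input (after ctx):
        # a gradient for x, and None for the non-differentiable factor.
        return grad_out * ctx.factor, None


x = torch.randn(4, requires_grad=True)
ScaleBy.apply(x, 3.0).sum().backward()
assert torch.allclose(x.grad, torch.full_like(x, 3.0))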
@@ -101,12 +105,10 @@ def backward(
         # interp_mean component: K^{-1/2} k
         # interp_var component: (k^T K^{-1/2} m) K^{-1/2} k
         # kl component: S^{-1} m
-        expec_vec_grad = sum(
-            [
-                (interp_var_grad * interp_mean.unsqueeze(-2) * interp_term).sum(dim=-1).mul(-2),
-                (interp_mean_grad * interp_term).sum(dim=-1),
-                (kl_div_grad.unsqueeze(-1) * natural_vec),
-            ]
+        expec_vec_grad = (
+            (interp_var_grad * interp_mean.unsqueeze(-2) * interp_term).sum(dim=-1).mul(-2)
+            + (interp_mean_grad * interp_term).sum(dim=-1)
+            + (kl_div_grad.unsqueeze(-1) * natural_vec)
         )

         # Compute gradient of expected matrix (mm^T + S)
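The `expec_vec_grad` rewrite is behavior-preserving: summing a list of tensors with Python's `sum()` and chaining `+` produce the same value, and the chained form simply skips the intermediate list and `sum()`'s integer start value. A quick check (illustrative, not from the diff):

import torch

a, b, c = torch.randn(5), torch.randn(5), torch.randn(5)
assert torch.allclose(sum([a, b, c]), a + b + c)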
@@ -179,7 +181,7 @@ def prior_distribution(self) -> MultivariateNormal:

     @property
     @cached(name="variational_distribution_memo")
-    def variational_distribution(self) -> MultivariateNormal:
+    def variational_distribution(self) -> Distribution:
         if self._ngd():
             raise RuntimeError(
                 "Variational distribution for NGD-CIQ should be computed during forward calls. "
@@ -192,12 +194,13 @@ def forward(
         x: torch.Tensor,
         inducing_points: torch.Tensor,
         inducing_values: torch.Tensor,
-        variational_inducing_covar: Optional[MultivariateNormal] = None,
+        variational_inducing_covar: Optional[LinearOperator] = None,
+        *params,
         **kwargs,
     ) -> MultivariateNormal:
         # Compute full prior distribution
         full_inputs = torch.cat([inducing_points, x], dim=-2)
-        full_output = self.model.forward(full_inputs)
+        full_output = self.model.forward(full_inputs, *params, **kwargs)
         full_covar = full_output.lazy_covariance_matrix

         # Covariance terms
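The new `*params` threading means extra positional arguments handed to the strategy now reach `self.model.forward` alongside `**kwargs`. A toy, self-contained illustration of the forwarding pattern, with all names hypothetical:

import torch


def model_forward(full_inputs, task_id, noise_scale=1.0):
    # Stand-in for a model whose forward takes extra positional arguments.
    return full_inputs * noise_scale + task_id


def strategy_forward(x, inducing_points, *params, **kwargs):
    full_inputs = torch.cat([inducing_points, x], dim=-2)
    # Previously only full_inputs was passed; extra args now flow through.
    return model_forward(full_inputs, *params, **kwargs)


out = strategy_forward(torch.randn(3, 2), torch.randn(5, 2), 7, noise_scale=0.5)
assert out.shape == (8, 2)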
@@ -272,7 +275,7 @@ def kl_divergence(self) -> Tensor:
         else:
             return super().kl_divergence()

-    def __call__(self, x: torch.Tensor, prior: bool = False, **kwargs) -> MultivariateNormal:
+    def __call__(self, x: torch.Tensor, prior: bool = False, *params, **kwargs) -> MultivariateNormal:
         # This is mostly the same as _VariationalStrategy.__call__()
         # but with special rules for natural gradient descent (to prevent O(M^3) computation)

@@ -310,6 +313,7 @@ def __call__(self, x: torch.Tensor, prior: bool = False, **kwargs) -> MultivariateNormal:
                 inducing_points,
                 inducing_values=None,
                 variational_inducing_covar=None,
+                *params,
                 **kwargs,
             )
         else:
@@ -332,7 +336,6 @@ def __call__(self, x: torch.Tensor, prior: bool = False, **kwargs) -> MultivariateNormal:
                 inducing_points,
                 inducing_values=variational_dist_u.mean,
                 variational_inducing_covar=None,
-                ngd=False,
                 **kwargs,
             )
         else: