Fix rendering of Google-style docstrings #2061

Merged: 6 commits, Jul 12, 2022

1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -80,6 +80,7 @@ def find_version(*file_paths):
 extensions = [
     "sphinx.ext.coverage",
     "sphinx.ext.mathjax",
+    'sphinx.ext.napoleon',
     "sphinx.ext.viewcode",
     "sphinx.ext.githubpages",
     "sphinx.ext.autodoc",
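
For context on what napoleon buys us: it converts Google-style "Args:" / "Returns:" sections, like the ones this PR cleans up below, into reST that autodoc can render. A minimal sketch of the style (hypothetical function, not from the repo):

    import torch

    def scale(x, factor=2.0):
        """Scale an input tensor.

        Args:
            x (torch.Tensor): The input tensor.
            factor (float): Multiplier applied elementwise. Default: 2.0.

        Returns:
            torch.Tensor: The tensor x * factor.
        """
        return x * factor
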
4 changes: 2 additions & 2 deletions gpytorch/distributions/multitask_multivariate_normal.py
@@ -119,7 +119,7 @@ def from_batch_mvn(cls, batch_mvn, task_dim=-1):
 def from_independent_mvns(cls, mvns):
     """
     Convert an iterable of MVNs into a :obj:`~gpytorch.distributions.MultitaskMultivariateNormal`.
-    The resulting distribution will have :attr:`len(mvns)` tasks, and the tasks will be independent.
+    The resulting distribution will have ``len(mvns)`` tasks, and the tasks will be independent.

     :param ~gpytorch.distributions.MultitaskNormal mvn: The base MVN distributions.
     :returns: the independent multitask distribution
@@ -247,7 +247,7 @@ def to_data_independent_dist(self):
"""
Convert a multitask MVN into a batched (non-multitask) MVNs
The result retains the intertask covariances, but gets rid of the inter-data covariances.
The resulting distribution will have :attr:`len(mvns)` tasks, and the tasks will be independent.
The resulting distribution will have ``len(mvns)`` tasks, and the tasks will be independent.

:returns: the bached data-independent MVN
:rtype: gpytorch.distributions.MultivariateNormal
Expand Down
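
A quick sketch of from_independent_mvns, with shapes following the docstring above:

    import torch
    from gpytorch.distributions import MultivariateNormal, MultitaskMultivariateNormal

    mean = torch.zeros(4)
    covar = torch.eye(4)
    mvns = [MultivariateNormal(mean, covar) for _ in range(2)]
    mtmvn = MultitaskMultivariateNormal.from_independent_mvns(mvns)
    print(mtmvn.event_shape)  # torch.Size([4, 2]): 4 data points, 2 independent tasks
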
14 changes: 7 additions & 7 deletions gpytorch/functions/__init__.py
@@ -13,9 +13,9 @@ def add_diag(input, diag):
 Adds a diagonal matrix s*I to the input matrix input.

 Args:
-    :attr:`input` (Tensor (nxn) or (bxnxn)):
+    input (Tensor (nxn) or (bxnxn)):
         Tensor or LazyTensor wrapping matrix to add diagonal component to.
-    :attr:`diag` (scalar or Tensor (n) or Tensor (bxn) or Tensor (bx1)):
+    diag (scalar or Tensor (n) or Tensor (bxn) or Tensor (bx1)):
         Diagonal component to add to tensor

 Returns:
@@ -89,7 +89,7 @@ def matmul(mat, rhs):

 def inv_matmul(mat, right_tensor, left_tensor=None):
     r"""
-    Computes a linear solve (w.r.t :attr:`mat` = :math:`A`) with several right hand sides :math:`R`.
+    Computes a linear solve (w.r.t mat = :math:`A`) with several right hand sides :math:`R`.
     I.e. computes

     .. math::
@@ -98,17 +98,17 @@ def inv_matmul(mat, right_tensor, left_tensor=None):
        \begin{equation}
           A^{-1} R,
        \end{equation}

-    where :math:`R` is :attr:`right_tensor` and :math:`A` is :attr:`mat`.
+    where :math:`R` is right_tensor and :math:`A` is mat.

-    If :attr:`left_tensor` is supplied, computes
+    If left_tensor is supplied, computes

     .. math::

        \begin{equation}
           L A^{-1} R,
        \end{equation}

-    where :math:`L` is :attr:`left_tensor`. Supplying this can reduce the number of
+    where :math:`L` is left_tensor. Supplying this can reduce the number of
     CG calls required.

     Args:
@@ -181,7 +181,7 @@ def pivoted_cholesky(mat, rank, error_tol=None, return_pivots=None):
 :type mat: ~gpytorch.lazy.LazyTensor or ~torch.Tensor
 :param int rank: The size of the partial pivoted Cholesky factor.
 :param error_tol: Defines an optional stopping criterion.
-    If the residual of the factorization is less than :attr:`error_tol`, then the
+    If the residual of the factorization is less than error_tol, then the
     factorization will exit early. This will result in a :math:`\leq \text{ rank}` factor.
 :type error_tol: float, optional
 :param bool return_pivots: (default: False) Whether or not to return the pivots alongside
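
A short sketch exercising add_diag, inv_matmul, and pivoted_cholesky on a dense PSD matrix (these also accept LazyTensors):

    import torch
    import gpytorch

    A = torch.randn(5, 5)
    A = A @ A.t() + 5 * torch.eye(5)                     # symmetric positive definite
    A_shifted = gpytorch.add_diag(A, torch.tensor(0.1))  # lazy representation of A + 0.1 * I
    rhs = torch.randn(5, 2)
    solve = gpytorch.inv_matmul(A, rhs)                  # approximately A^{-1} rhs
    L = gpytorch.pivoted_cholesky(A, rank=3)             # partial factor: L @ L.transpose(-1, -2) ~ A
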
2 changes: 1 addition & 1 deletion gpytorch/functions/_diagonalization.py
@@ -14,7 +14,7 @@ def forward(ctx, representation_tree, device, dtype, matrix_shape, max_iter, bat
 :param list matrix_args: The arguments representing the symmetric matrix A (or batch of PSD matrices A)

 :rtype: (torch.Tensor, torch.Tensor)
-:return: :attr:`Q`, :attr: `S` such that :math:`Q S Q^T \approx A`
+:return: Q, S such that :math:`Q S Q^T \approx A`
 """

 ctx.representation_tree = representation_tree
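
For intuition, the Q S Q^T ≈ A contract is an (approximate) eigendecomposition. A dense illustration with plain PyTorch, not the iterative routine used here:

    import torch

    A = torch.randn(6, 6)
    A = A @ A.t() + 6 * torch.eye(6)  # symmetric positive definite
    S, Q = torch.linalg.eigh(A)       # eigenvalues S, orthonormal eigenvectors Q
    assert torch.allclose(Q @ torch.diag(S) @ Q.t(), A, atol=1e-4)
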
2 changes: 1 addition & 1 deletion gpytorch/functions/_root_decomposition.py
@@ -26,7 +26,7 @@ def forward(
 :param list matrix_args: The arguments representing the symmetric matrix A (or batch of PSD matrices A)

 :rtype: (torch.Tensor, torch.Tensor)
-:return: :attr:`R`, such that :math:`R R^T \approx A`, and :attr:`R_inv`, such that
+:return: R, such that :math:`R R^T \approx A`, and R_inv, such that
     :math:`R_{inv} R_{inv}^T \approx A^{-1}` (will only be populated if self.inverse = True)
 """
 from ..lazy import lazify
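
Likewise, the returned pair satisfies R R^T ≈ A and R_inv R_inv^T ≈ A^{-1}. A dense sketch of that contract via Cholesky:

    import torch

    A = torch.randn(6, 6)
    A = A @ A.t() + 6 * torch.eye(6)  # symmetric positive definite
    R = torch.linalg.cholesky(A)      # R @ R.T equals A exactly in the dense case
    R_inv = torch.linalg.inv(R).t()   # then R_inv @ R_inv.T equals inv(A)
    assert torch.allclose(R_inv @ R_inv.t(), torch.linalg.inv(A), atol=1e-4)
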
6 changes: 3 additions & 3 deletions gpytorch/kernels/additive_structure_kernel.py
@@ -26,11 +26,11 @@ class AdditiveStructureKernel(Kernel):
 of the additive terms in batch, making it very fast.

 Args:
-    :attr:`base_kernel` (Kernel):
+    base_kernel (Kernel):
         The kernel to approximate with KISS-GP
-    :attr:`num_dims` (int):
+    num_dims (int):
         The dimension of the input data.
-    :attr:`active_dims` (tuple of ints, optional):
+    active_dims (tuple of ints, optional):
         Passed down to the `base_kernel`.
 """
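
Usage sketch, assuming an RBF base kernel:

    import torch
    import gpytorch

    # Sums the base kernel applied to each of the 3 input dimensions separately
    covar_module = gpytorch.kernels.AdditiveStructureKernel(
        gpytorch.kernels.RBFKernel(), num_dims=3
    )
    x = torch.randn(10, 3)
    covar = covar_module(x)  # lazily evaluated 10 x 10 covariance
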
14 changes: 7 additions & 7 deletions gpytorch/kernels/cosine_kernel.py
@@ -25,22 +25,22 @@ class CosineKernel(Kernel):
 where :math:`p` is the period length parameter.

 Args:
-    :attr:`batch_shape` (torch.Size, optional):
+    batch_shape (torch.Size, optional):
         Set this if you want a separate lengthscale for each
-        batch of input data. It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])`
-    :attr:`active_dims` (tuple of ints, optional):
+        batch of input data. It should be `b` if x1 is a `b x n x d` tensor. Default: `torch.Size([])`
+    active_dims (tuple of ints, optional):
         Set this if you want to compute the covariance of only a few input dimensions. The ints
         correspond to the indices of the dimensions. Default: `None`.
-    :attr:`period_length_prior` (Prior, optional):
+    period_length_prior (Prior, optional):
         Set this if you want to apply a prior to the period length parameter. Default: `None`
-    :attr:`period_length_constraint` (Constraint, optional):
+    period_length_constraint (Constraint, optional):
         Set this if you want to apply a constraint to the period length parameter. Default: `Positive`.
-    :attr:`eps` (float):
+    eps (float):
         The minimum value that the lengthscale/period length can take
         (prevents divide by zero errors). Default: `1e-6`.

 Attributes:
-    :attr:`period_length` (Tensor):
+    period_length (Tensor):
         The period length parameter. Size = `*batch_shape x 1 x 1`.

 Example:
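
Usage sketch for the parameters documented above:

    import torch
    import gpytorch

    covar_module = gpytorch.kernels.CosineKernel()
    covar_module.period_length = 0.5  # the period_length attribute from the docstring
    x = torch.randn(10, 5)
    covar = covar_module(x)           # lazily evaluated 10 x 10 covariance
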
14 changes: 7 additions & 7 deletions gpytorch/kernels/cylindrical_kernel.py
@@ -21,19 +21,19 @@ class CylindricalKernel(Kernel):
 The data must lie completely within the unit ball.

 Args:
-    :attr:`num_angular_weights` (int):
+    num_angular_weights (int):
         The number of components in the angular kernel
-    :attr:`radial_base_kernel` (gpytorch.kernel):
+    radial_base_kernel (gpytorch.kernel):
         The base kernel for computing the radial kernel
-    :attr:`batch_size` (int, optional):
+    batch_size (int, optional):
         Set this if the data is a batch of input data.
-        It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `1`
-    :attr:`eps` (float):
+        It should be `b` if x1 is a `b x n x d` tensor. Default: `1`
+    eps (float):
         Small floating point number used to improve numerical stability
         in kernel computations. Default: `1e-6`
-    :attr:`param_transform` (function, optional):
+    param_transform (function, optional):
         Set this if you want to use something other than softplus to ensure positiveness of parameters.
-    :attr:`inv_param_transform` (function, optional):
+    inv_param_transform (function, optional):
         Set this to allow setting parameters directly in transformed space and sampling from priors.
         Automatically inferred for common transformations such as torch.exp or torch.nn.functional.softplus.
 """
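
A sketch respecting the unit-ball constraint noted above (keyword names taken from this docstring; treat as illustrative):

    import torch
    import gpytorch

    x = torch.rand(10, 3) * 0.5  # every point stays inside the unit ball
    covar_module = gpytorch.kernels.CylindricalKernel(
        num_angular_weights=3,
        radial_base_kernel=gpytorch.kernels.MaternKernel(),
    )
    covar = covar_module(x)      # lazily evaluated 10 x 10 covariance
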
2 changes: 1 addition & 1 deletion gpytorch/kernels/distributional_input_kernel.py
@@ -20,7 +20,7 @@ class DistributionalInputKernel(Kernel):
 where :math:`a` is the lengthscale.

 Args:
-    :attr:`distance_function` (function) distance function between distributional inputs.
+    distance_function (function) distance function between distributional inputs.
 """
 has_lengthscale = True
6 changes: 3 additions & 3 deletions gpytorch/kernels/gaussian_symmetrized_kl_kernel.py
@@ -10,11 +10,11 @@ def _symmetrized_kl(dist1, dist2, eps=1e-8):
 the first half of the distribution tensors are the means, and the second half
 are the log variances.
 Args:
-    :attr:`dist1` (torch.Tensor) has shapes batch x n x dimensions. The first half
+    dist1 (torch.Tensor) has shapes batch x n x dimensions. The first half
         of the last dimensions are the means, while the second half are the log-variances.
-    :attr:`dist2` (torch.Tensor) has shapes batch x n x dimensions. The first half
+    dist2 (torch.Tensor) has shapes batch x n x dimensions. The first half
         of the last dimensions are the means, while the second half are the log-variances.
-    :attr:`eps` (float) jitter term for the noise variance
+    eps (float) jitter term for the noise variance
 """

 num_dims = int(dist1.shape[-1] / 2)
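
The quantity being computed has a closed form per dimension. A standalone univariate sketch using the same mean / log-variance encoding (hypothetical helper, not the repo's implementation):

    import torch

    def symmetrized_kl_1d(mu1, logvar1, mu2, logvar2, eps=1e-8):
        # KL(p || q) + KL(q || p) for p = N(mu1, v1), q = N(mu2, v2);
        # eps plays the role of the jitter term documented above
        v1 = logvar1.exp() + eps
        v2 = logvar2.exp() + eps
        kl_pq = 0.5 * (v1 / v2 + (mu2 - mu1) ** 2 / v2 - 1 + logvar2 - logvar1)
        kl_qp = 0.5 * (v2 / v1 + (mu1 - mu2) ** 2 / v1 - 1 + logvar1 - logvar2)
        return kl_pq + kl_qp
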
14 changes: 7 additions & 7 deletions gpytorch/kernels/grid_interpolation_kernel.py
@@ -37,12 +37,12 @@ class GridInterpolationKernel(GridKernel):
 * :math:`\mathbf{w_{x_1}}` and :math:`\mathbf{w_{x_2}}` are sparse vectors based on
   :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}` that apply cubic interpolation.

-The user should supply the size of the grid (using the :attr:`grid_size` attribute).
+The user should supply the size of the grid (using the grid_size attribute).
 To choose a reasonable grid value, we highly recommend using the
 :func:`gpytorch.utils.grid.choose_grid_size` helper function.
 The bounds of the grid will automatically be determined by data.

-(Alternatively, you can hard-code bounds using the :attr:`grid_bounds`, which
+(Alternatively, you can hard-code bounds using the grid_bounds, which
 will speed up this kernel's computations.)

 .. note::
@@ -51,18 +51,18 @@ class GridInterpolationKernel(GridKernel):
     Periodic, Spectral Mixture, etc.)

 Args:
-    - :attr:`base_kernel` (Kernel):
+    base_kernel (Kernel):
         The kernel to approximate with KISS-GP
-    - :attr:`grid_size` (Union[int, List[int]]):
+    grid_size (Union[int, List[int]]):
         The size of the grid in each dimension.
         If a single int is provided, then every dimension will have the same grid size.
-    - :attr:`num_dims` (int):
+    num_dims (int):
         The dimension of the input data. Required if `grid_bounds=None`
-    - :attr:`grid_bounds` (tuple(float, float), optional):
+    grid_bounds (tuple(float, float), optional):
         The bounds of the grid, if known (high performance mode).
         The length of the tuple must match the number of dimensions.
         The entries represent the min/max values for each dimension.
-    - :attr:`active_dims` (tuple of ints, optional):
+    active_dims (tuple of ints, optional):
         Passed down to the `base_kernel`.

 .. _Kernel Interpolation for Scalable Structured Gaussian Processes:
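
Usage sketch following the docstring's recommendation to size the grid with the helper:

    import torch
    import gpytorch
    from gpytorch.utils.grid import choose_grid_size

    train_x = torch.rand(100, 2)
    grid_size = choose_grid_size(train_x)  # helper recommended in the docstring
    covar_module = gpytorch.kernels.GridInterpolationKernel(
        gpytorch.kernels.RBFKernel(), grid_size=grid_size, num_dims=2
    )
    covar = covar_module(train_x)          # lazily evaluated 100 x 100 covariance
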
8 changes: 4 additions & 4 deletions gpytorch/kernels/grid_kernel.py
@@ -25,14 +25,14 @@ class GridKernel(Kernel):
     Periodic, Spectral Mixture, etc.)

 Args:
-    :attr:`base_kernel` (Kernel):
+    base_kernel (Kernel):
         The kernel to speed up with grid methods.
-    :attr:`grid` (Tensor):
+    grid (Tensor):
         A g x d tensor where column i consists of the projections of the
         grid in dimension i.
-    :attr:`active_dims` (tuple of ints, optional):
+    active_dims (tuple of ints, optional):
         Passed down to the `base_kernel`.
-    :attr:`interpolation_mode` (bool):
+    interpolation_mode (bool):
         Used for GridInterpolationKernel where we want the covariance
         between points in the projections of the grid of each dimension.
         We do this by treating `grid` as d batches of g x 1 tensors by
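
Usage sketch building the g x d grid tensor by hand (assuming create_data_from_grid accepts this layout to materialize the full grid):

    import torch
    import gpytorch
    from gpytorch.utils.grid import create_data_from_grid

    proj = torch.linspace(0, 1, 25)
    grid = torch.stack([proj, proj], dim=-1)  # g x d: one column of projections per dimension
    covar_module = gpytorch.kernels.GridKernel(gpytorch.kernels.RBFKernel(), grid=grid)
    full_x = create_data_from_grid(grid)      # 625 x 2 cartesian product of the projections
    covar = covar_module(full_x)
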
10 changes: 5 additions & 5 deletions gpytorch/kernels/index_kernel.py
@@ -25,18 +25,18 @@ class IndexKernel(Kernel):
 These parameters are learned.

 Args:
-    :attr:`num_tasks` (int):
+    num_tasks (int):
         Total number of indices.
-    :attr:`batch_shape` (torch.Size, optional):
+    batch_shape (torch.Size, optional):
         Set if the MultitaskKernel is operating on batches of data (and you want different
         parameters for each batch)
-    :attr:`rank` (int):
+    rank (int):
         Rank of :math:`B` matrix. Controls the degree of
         correlation between the outputs. With a rank of 1 the
         outputs are identical except for a scaling factor.
-    :attr:`prior` (:obj:`gpytorch.priors.Prior`):
+    prior (:obj:`gpytorch.priors.Prior`):
         Prior for :math:`B` matrix.
-    :attr:`var_constraint` (Constraint, optional):
+    var_constraint (Constraint, optional):
         Constraint for added diagonal component. Default: `Positive`.

 Attributes:
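
Usage sketch: task indices in, task covariances out:

    import torch
    import gpytorch

    index_kernel = gpytorch.kernels.IndexKernel(num_tasks=4, rank=1)
    i = torch.tensor([[0], [1], [3]])        # a task index for each of three observations
    covar = index_kernel(i)                  # 3 x 3 slice of B at those indices
    print(index_kernel.covar_matrix.shape)   # the full 4 x 4 task covariance B
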
46 changes: 23 additions & 23 deletions gpytorch/kernels/kernel.py
@@ -97,30 +97,30 @@ class Kernel(Module):

 .. note::

-    The :attr:`lengthscale` parameter is parameterized on a log scale to constrain it to be positive.
-    You can set a prior on this parameter using the :attr:`lengthscale_prior` argument.
+    The lengthscale parameter is parameterized on a log scale to constrain it to be positive.
+    You can set a prior on this parameter using the lengthscale_prior argument.

-Base Args:
-    :attr:`ard_num_dims` (int, optional):
+Args:
+    ard_num_dims (int, optional):
         Set this if you want a separate lengthscale for each input
-        dimension. It should be `d` if :attr:`x1` is a `n x d` matrix. Default: `None`
-    :attr:`batch_shape` (torch.Size, optional):
+        dimension. It should be `d` if x1 is a `n x d` matrix. Default: `None`
+    batch_shape (torch.Size, optional):
         Set this if you want a separate lengthscale for each batch of input
-        data. It should be `b1 x ... x bk` if :attr:`x1` is a `b1 x ... x bk x n x d` tensor.
-    :attr:`active_dims` (tuple of ints, optional):
+        data. It should be `b1 x ... x bk` if x1 is a `b1 x ... x bk x n x d` tensor.
+    active_dims (tuple of ints, optional):
         Set this if you want to compute the covariance of only a few input dimensions. The ints
         correspond to the indices of the dimensions. Default: `None`.
-    :attr:`lengthscale_prior` (Prior, optional):
+    lengthscale_prior (Prior, optional):
         Set this if you want to apply a prior to the lengthscale parameter. Default: `None`
-    :attr:`lengthscale_constraint` (Constraint, optional):
+    lengthscale_constraint (Constraint, optional):
         Set this if you want to apply a constraint to the lengthscale parameter. Default: `Positive`.
-    :attr:`eps` (float):
+    eps (float):
         The minimum value that the lengthscale can take (prevents divide by zero errors). Default: `1e-6`.

-Base Attributes:
-    :attr:`lengthscale` (Tensor):
+Attributes:
+    lengthscale (Tensor):
         The lengthscale parameter. Size/shape of parameter depends on the
-        :attr:`ard_num_dims` and :attr:`batch_shape` arguments.
+        ard_num_dims and batch_shape arguments.

 Example:
     >>> covar_module = gpytorch.kernels.LinearKernel()
@@ -188,13 +188,13 @@ def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
 This method should be implemented by all Kernel subclasses.

 Args:
-    :attr:`x1` (Tensor `n x d` or `b x n x d`):
+    x1 (Tensor `n x d` or `b x n x d`):
         First set of data
-    :attr:`x2` (Tensor `m x d` or `b x m x d`):
+    x2 (Tensor `m x d` or `b x m x d`):
         Second set of data
-    :attr:`diag` (bool):
+    diag (bool):
         Should the Kernel compute the whole kernel, or just the diag?
-    :attr:`last_dim_is_batch` (tuple, optional):
+    last_dim_is_batch (tuple, optional):
         If this is true, it treats the last dimension of the data as another batch dimension.
         (Useful for additive structure over the dimensions). Default: False
@@ -284,15 +284,15 @@ def covar_dist(
 all pairs of points in x1 and x2.

 Args:
-    :attr:`x1` (Tensor `n x d` or `b1 x ... x bk x n x d`):
+    x1 (Tensor `n x d` or `b1 x ... x bk x n x d`):
         First set of data.
-    :attr:`x2` (Tensor `m x d` or `b1 x ... x bk x m x d`):
+    x2 (Tensor `m x d` or `b1 x ... x bk x m x d`):
         Second set of data.
-    :attr:`diag` (bool):
+    diag (bool):
         Should we return the whole distance matrix, or just the diagonal? If True, we must have `x1 == x2`.
-    :attr:`last_dim_is_batch` (tuple, optional):
+    last_dim_is_batch (tuple, optional):
         Is the last dimension of the data a batch dimension or not?
-    :attr:`square_dist` (bool):
+    square_dist (bool):
         Should we square the distance matrix before returning?

 Returns:
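
Tying forward and covar_dist together, a sketch of a custom subclass (a hypothetical exponential-decay kernel, not part of the library):

    import torch
    from gpytorch.kernels import Kernel

    class ExpNegDistKernel(Kernel):
        """k(x1, x2) = exp(-||x1 - x2|| / lengthscale)."""

        has_lengthscale = True

        def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params):
            x1_ = x1.div(self.lengthscale)
            x2_ = x2.div(self.lengthscale)
            # covar_dist handles batch shapes, diag, and last_dim_is_batch
            dist = self.covar_dist(
                x1_, x2_, diag=diag, last_dim_is_batch=last_dim_is_batch, **params
            )
            return torch.exp(-dist)
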
8 changes: 4 additions & 4 deletions gpytorch/kernels/linear_kernel.py
@@ -24,7 +24,7 @@ class LinearKernel(Kernel):

 where

-* :math:`v` is a :attr:`variance` parameter.
+* :math:`v` is a variance parameter.


 .. note::
@@ -37,11 +37,11 @@ class LinearKernel(Kernel):
 :math:`O(nd)` time and space.

 Args:
-    :attr:`variance_prior` (:class:`gpytorch.priors.Prior`):
+    variance_prior (:class:`gpytorch.priors.Prior`):
         Prior over the variance parameter (default `None`).
-    :attr:`variance_constraint` (Constraint, optional):
+    variance_constraint (Constraint, optional):
         Constraint to place on variance parameter. Default: `Positive`.
-    :attr:`active_dims` (list):
+    active_dims (list):
         List of data dimensions to operate on.
         `len(active_dims)` should equal `num_dimensions`.
 """
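
Usage sketch in the docstring's own doctest style:

    >>> covar_module = gpytorch.kernels.LinearKernel()
    >>> x = torch.randn(10, 5)
    >>> covar = covar_module(x)  # lazily evaluated 10 x 10 matrix: v * x @ x.T
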