diff --git a/docs/source/conf.py b/docs/source/conf.py index 34f2818de..22070049a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -80,6 +80,7 @@ def find_version(*file_paths): extensions = [ "sphinx.ext.coverage", "sphinx.ext.mathjax", + 'sphinx.ext.napoleon', "sphinx.ext.viewcode", "sphinx.ext.githubpages", "sphinx.ext.autodoc", diff --git a/gpytorch/distributions/multitask_multivariate_normal.py b/gpytorch/distributions/multitask_multivariate_normal.py index 6da841a7f..7c8637979 100644 --- a/gpytorch/distributions/multitask_multivariate_normal.py +++ b/gpytorch/distributions/multitask_multivariate_normal.py @@ -119,7 +119,7 @@ def from_batch_mvn(cls, batch_mvn, task_dim=-1): def from_independent_mvns(cls, mvns): """ Convert an iterable of MVNs into a :obj:`~gpytorch.distributions.MultitaskMultivariateNormal`. - The resulting distribution will have :attr:`len(mvns)` tasks, and the tasks will be independent. + The resulting distribution will have ``len(mvns)`` tasks, and the tasks will be independent. :param ~gpytorch.distributions.MultitaskNormal mvn: The base MVN distributions. :returns: the independent multitask distribution @@ -247,7 +247,7 @@ def to_data_independent_dist(self): """ Convert a multitask MVN into a batched (non-multitask) MVNs The result retains the intertask covariances, but gets rid of the inter-data covariances. - The resulting distribution will have :attr:`len(mvns)` tasks, and the tasks will be independent. + The resulting distribution will have ``len(mvns)`` tasks, and the tasks will be independent. :returns: the bached data-independent MVN :rtype: gpytorch.distributions.MultivariateNormal diff --git a/gpytorch/functions/__init__.py b/gpytorch/functions/__init__.py index 5f40f591f..9084295ef 100644 --- a/gpytorch/functions/__init__.py +++ b/gpytorch/functions/__init__.py @@ -13,9 +13,9 @@ def add_diag(input, diag): Adds a diagonal matrix s*I to the input matrix input. Args: - :attr:`input` (Tensor (nxn) or (bxnxn)): + input (Tensor (nxn) or (bxnxn)): Tensor or LazyTensor wrapping matrix to add diagonal component to. - :attr:`diag` (scalar or Tensor (n) or Tensor (bxn) or Tensor (bx1)): + diag (scalar or Tensor (n) or Tensor (bxn) or Tensor (bx1)): Diagonal component to add to tensor Returns: @@ -89,7 +89,7 @@ def matmul(mat, rhs): def inv_matmul(mat, right_tensor, left_tensor=None): r""" - Computes a linear solve (w.r.t :attr:`mat` = :math:`A`) with several right hand sides :math:`R`. + Computes a linear solve (w.r.t mat = :math:`A`) with several right hand sides :math:`R`. I.e. computes ... math:: @@ -98,9 +98,9 @@ def inv_matmul(mat, right_tensor, left_tensor=None): A^{-1} R, \end{equation} - where :math:`R` is :attr:`right_tensor` and :math:`A` is :attr:`mat`. + where :math:`R` is right_tensor and :math:`A` is mat. - If :attr:`left_tensor` is supplied, computes + If left_tensor is supplied, computes ... math:: @@ -108,7 +108,7 @@ def inv_matmul(mat, right_tensor, left_tensor=None): L A^{-1} R, \end{equation} - where :math:`L` is :attr:`left_tensor`. Supplying this can reduce the number of + where :math:`L` is left_tensor. Supplying this can reduce the number of CG calls required. Args: @@ -181,7 +181,7 @@ def pivoted_cholesky(mat, rank, error_tol=None, return_pivots=None): :type mat: ~gpytorch.lazy.LazyTensor or ~torch.Tensor :param int rank: The size of the partial pivoted Cholesky factor. :param error_tol: Defines an optional stopping criterion. 
- If the residual of the factorization is less than :attr:`error_tol`, then the + If the residual of the factorization is less than error_tol, then the factorization will exit early. This will result in a :math:`\leq \text{ rank}` factor. :type error_tol: float, optional :param bool return_pivots: (default: False) Whether or not to return the pivots alongside diff --git a/gpytorch/functions/_diagonalization.py b/gpytorch/functions/_diagonalization.py index 81e60d08c..8f669efb6 100644 --- a/gpytorch/functions/_diagonalization.py +++ b/gpytorch/functions/_diagonalization.py @@ -14,7 +14,7 @@ def forward(ctx, representation_tree, device, dtype, matrix_shape, max_iter, bat :param list matrix_args: The arguments representing the symmetric matrix A (or batch of PSD matrices A) :rtype: (torch.Tensor, torch.Tensor) - :return: :attr:`Q`, :attr: `S` such that :math:`Q S Q^T \approx A` + :return: Q, S such that :math:`Q S Q^T \approx A` """ ctx.representation_tree = representation_tree diff --git a/gpytorch/functions/_root_decomposition.py b/gpytorch/functions/_root_decomposition.py index e906b5f2d..dd54df7dd 100644 --- a/gpytorch/functions/_root_decomposition.py +++ b/gpytorch/functions/_root_decomposition.py @@ -26,7 +26,7 @@ def forward( :param list matrix_args: The arguments representing the symmetric matrix A (or batch of PSD matrices A) :rtype: (torch.Tensor, torch.Tensor) - :return: :attr:`R`, such that :math:`R R^T \approx A`, and :attr:`R_inv`, such that + :return: R, such that :math:`R R^T \approx A`, and R_inv, such that :math:`R_{inv} R_{inv}^T \approx A^{-1}` (will only be populated if self.inverse = True) """ from ..lazy import lazify diff --git a/gpytorch/kernels/additive_structure_kernel.py b/gpytorch/kernels/additive_structure_kernel.py index 8565d95b4..1c2ba9c6e 100644 --- a/gpytorch/kernels/additive_structure_kernel.py +++ b/gpytorch/kernels/additive_structure_kernel.py @@ -26,11 +26,11 @@ class AdditiveStructureKernel(Kernel): of the additive terms in batch, making it very fast. Args: - :attr:`base_kernel` (Kernel): + base_kernel (Kernel): The kernel to approximate with KISS-GP - :attr:`num_dims` (int): + num_dims (int): The dimension of the input data. - :attr:`active_dims` (tuple of ints, optional): + active_dims (tuple of ints, optional): Passed down to the `base_kernel`. """ diff --git a/gpytorch/kernels/cosine_kernel.py b/gpytorch/kernels/cosine_kernel.py index 76261cc89..11add6f2f 100644 --- a/gpytorch/kernels/cosine_kernel.py +++ b/gpytorch/kernels/cosine_kernel.py @@ -25,22 +25,22 @@ class CosineKernel(Kernel): where :math:`p` is the period length parameter. Args: - :attr:`batch_shape` (torch.Size, optional): + batch_shape (torch.Size, optional): Set this if you want a separate lengthscale for each - batch of input data. It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])` - :attr:`active_dims` (tuple of ints, optional): + batch of input data. It should be `b` if x1 is a `b x n x d` tensor. Default: `torch.Size([])` + active_dims (tuple of ints, optional): Set this if you want to compute the covariance of only a few input dimensions. The ints corresponds to the indices of the dimensions. Default: `None`. - :attr:`period_length_prior` (Prior, optional): + period_length_prior (Prior, optional): Set this if you want to apply a prior to the period length parameter. 
Default: `None` - :attr:`period_length_constraint` (Constraint, optional): + period_length_constraint (Constraint, optional): Set this if you want to apply a constraint to the period length parameter. Default: `Positive`. - :attr:`eps` (float): + eps (float): The minimum value that the lengthscale/period length can take (prevents divide by zero errors). Default: `1e-6`. Attributes: - :attr:`period_length` (Tensor): + period_length (Tensor): The period length parameter. Size = `*batch_shape x 1 x 1`. Example: diff --git a/gpytorch/kernels/cylindrical_kernel.py b/gpytorch/kernels/cylindrical_kernel.py index 86251f943..48f24958c 100644 --- a/gpytorch/kernels/cylindrical_kernel.py +++ b/gpytorch/kernels/cylindrical_kernel.py @@ -21,19 +21,19 @@ class CylindricalKernel(Kernel): The data must lie completely within the unit ball. Args: - :attr:`num_angular_weights` (int): + num_angular_weights (int): The number of components in the angular kernel - :attr:`radial_base_kernel` (gpytorch.kernel): + radial_base_kernel (gpytorch.kernel): The base kernel for computing the radial kernel - :attr:`batch_size` (int, optional): + batch_size (int, optional): Set this if the data is batch of input data. - It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `1` - :attr:`eps` (float): + It should be `b` if x1 is a `b x n x d` tensor. Default: `1` + eps (float): Small floating point number used to improve numerical stability in kernel computations. Default: `1e-6` - :attr:`param_transform` (function, optional): + param_transform (function, optional): Set this if you want to use something other than softplus to ensure positiveness of parameters. - :attr:`inv_param_transform` (function, optional): + inv_param_transform (function, optional): Set this to allow setting parameters directly in transformed space and sampling from priors. Automatically inferred for common transformations such as torch.exp or torch.nn.functional.softplus. """ diff --git a/gpytorch/kernels/distributional_input_kernel.py b/gpytorch/kernels/distributional_input_kernel.py index e01b3956c..082fbc917 100644 --- a/gpytorch/kernels/distributional_input_kernel.py +++ b/gpytorch/kernels/distributional_input_kernel.py @@ -20,7 +20,7 @@ class DistributionalInputKernel(Kernel): where :math:`a` is the lengthscale. Args: - :attr:`distance_function` (function) distance function between distributional inputs. + distance_function (function) distance function between distributional inputs. """ has_lengthscale = True diff --git a/gpytorch/kernels/gaussian_symmetrized_kl_kernel.py b/gpytorch/kernels/gaussian_symmetrized_kl_kernel.py index 11cec5948..2c5bded31 100644 --- a/gpytorch/kernels/gaussian_symmetrized_kl_kernel.py +++ b/gpytorch/kernels/gaussian_symmetrized_kl_kernel.py @@ -10,11 +10,11 @@ def _symmetrized_kl(dist1, dist2, eps=1e-8): the first half of the distribution tensors are the mean, and the second half are the log variances. Args: - :attr:`dist1` (torch.Tensor) has shapes batch x n x dimensions. The first half + dist1 (torch.Tensor) has shapes batch x n x dimensions. The first half of the last dimensions are the means, while the second half are the log-variances. - :attr:`dist2` (torch.Tensor) has shapes batch x n x dimensions. The first half + dist2 (torch.Tensor) has shapes batch x n x dimensions. The first half of the last dimensions are the means, while the second half are the log-variances. 
- :attr:`eps` (float) jitter term for the noise variance + eps (float) jitter term for the noise variance """ num_dims = int(dist1.shape[-1] / 2) diff --git a/gpytorch/kernels/grid_interpolation_kernel.py b/gpytorch/kernels/grid_interpolation_kernel.py index 4c2c22f4e..527e2f57f 100644 --- a/gpytorch/kernels/grid_interpolation_kernel.py +++ b/gpytorch/kernels/grid_interpolation_kernel.py @@ -37,12 +37,12 @@ class GridInterpolationKernel(GridKernel): * :math:`\mathbf{w_{x_1}}` and :math:`\mathbf{w_{x_2}}` are sparse vectors based on :math:`\mathbf{x_1}` and :math:`\mathbf{x_2}` that apply cubic interpolation. - The user should supply the size of the grid (using the :attr:`grid_size` attribute). + The user should supply the size of the grid (using the grid_size attribute). To choose a reasonable grid value, we highly recommend using the :func:`gpytorch.utils.grid.choose_grid_size` helper function. The bounds of the grid will automatically be determined by data. - (Alternatively, you can hard-code bounds using the :attr:`grid_bounds`, which + (Alternatively, you can hard-code bounds using the grid_bounds, which will speed up this kernel's computations.) .. note:: @@ -51,18 +51,18 @@ class GridInterpolationKernel(GridKernel): Periodic, Spectral Mixture, etc.) Args: - - :attr:`base_kernel` (Kernel): + base_kernel (Kernel): The kernel to approximate with KISS-GP - - :attr:`grid_size` (Union[int, List[int]]): + grid_size (Union[int, List[int]]): The size of the grid in each dimension. If a single int is provided, then every dimension will have the same grid size. - - :attr:`num_dims` (int): + num_dims (int): The dimension of the input data. Required if `grid_bounds=None` - - :attr:`grid_bounds` (tuple(float, float), optional): + grid_bounds (tuple(float, float), optional): The bounds of the grid, if known (high performance mode). The length of the tuple must match the number of dimensions. The entries represent the min/max values for each dimension. - - :attr:`active_dims` (tuple of ints, optional): + active_dims (tuple of ints, optional): Passed down to the `base_kernel`. .. _Kernel Interpolation for Scalable Structured Gaussian Processes: diff --git a/gpytorch/kernels/grid_kernel.py b/gpytorch/kernels/grid_kernel.py index 41ac23c0d..fabc0ab05 100644 --- a/gpytorch/kernels/grid_kernel.py +++ b/gpytorch/kernels/grid_kernel.py @@ -25,14 +25,14 @@ class GridKernel(Kernel): Periodic, Spectral Mixture, etc.) Args: - :attr:`base_kernel` (Kernel): + base_kernel (Kernel): The kernel to speed up with grid methods. - :attr:`grid` (Tensor): + grid (Tensor): A g x d tensor where column i consists of the projections of the grid in dimension i. - :attr:`active_dims` (tuple of ints, optional): + active_dims (tuple of ints, optional): Passed down to the `base_kernel`. - :attr:`interpolation_mode` (bool): + interpolation_mode (bool): Used for GridInterpolationKernel where we want the covariance between points in the projections of the grid of each dimension. We do this by treating `grid` as d batches of g x 1 tensors by diff --git a/gpytorch/kernels/index_kernel.py b/gpytorch/kernels/index_kernel.py index 76e0bc05b..c455f0c95 100644 --- a/gpytorch/kernels/index_kernel.py +++ b/gpytorch/kernels/index_kernel.py @@ -25,18 +25,18 @@ class IndexKernel(Kernel): These parameters are learned. Args: - :attr:`num_tasks` (int): + num_tasks (int): Total number of indices. 
- :attr:`batch_shape` (torch.Size, optional): + batch_shape (torch.Size, optional): Set if the MultitaskKernel is operating on batches of data (and you want different parameters for each batch) - :attr:`rank` (int): + rank (int): Rank of :math:`B` matrix. Controls the degree of correlation between the outputs. With a rank of 1 the outputs are identical except for a scaling factor. - :attr:`prior` (:obj:`gpytorch.priors.Prior`): + prior (:obj:`gpytorch.priors.Prior`): Prior for :math:`B` matrix. - :attr:`var_constraint` (Constraint, optional): + var_constraint (Constraint, optional): Constraint for added diagonal component. Default: `Positive`. Attributes: diff --git a/gpytorch/kernels/kernel.py b/gpytorch/kernels/kernel.py index ac7a90b00..ecd7283a5 100644 --- a/gpytorch/kernels/kernel.py +++ b/gpytorch/kernels/kernel.py @@ -97,30 +97,30 @@ class Kernel(Module): .. note:: - The :attr:`lengthscale` parameter is parameterized on a log scale to constrain it to be positive. - You can set a prior on this parameter using the :attr:`lengthscale_prior` argument. + The lengthscale parameter is parameterized on a log scale to constrain it to be positive. + You can set a prior on this parameter using the lengthscale_prior argument. - Base Args: - :attr:`ard_num_dims` (int, optional): + Args: + ard_num_dims (int, optional): Set this if you want a separate lengthscale for each input - dimension. It should be `d` if :attr:`x1` is a `n x d` matrix. Default: `None` - :attr:`batch_shape` (torch.Size, optional): + dimension. It should be `d` if x1 is a `n x d` matrix. Default: `None` + batch_shape (torch.Size, optional): Set this if you want a separate lengthscale for each batch of input - data. It should be `b1 x ... x bk` if :attr:`x1` is a `b1 x ... x bk x n x d` tensor. - :attr:`active_dims` (tuple of ints, optional): + data. It should be `b1 x ... x bk` if x1 is a `b1 x ... x bk x n x d` tensor. + active_dims (tuple of ints, optional): Set this if you want to compute the covariance of only a few input dimensions. The ints corresponds to the indices of the dimensions. Default: `None`. - :attr:`lengthscale_prior` (Prior, optional): + lengthscale_prior (Prior, optional): Set this if you want to apply a prior to the lengthscale parameter. Default: `None` - :attr:`lengthscale_constraint` (Constraint, optional): + lengthscale_constraint (Constraint, optional): Set this if you want to apply a constraint to the lengthscale parameter. Default: `Positive`. - :attr:`eps` (float): + eps (float): The minimum value that the lengthscale can take (prevents divide by zero errors). Default: `1e-6`. - Base Attributes: - :attr:`lengthscale` (Tensor): + Attributes: + lengthscale (Tensor): The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. + ard_num_dims and batch_shape arguments. Example: >>> covar_module = gpytorch.kernels.LinearKernel() @@ -188,13 +188,13 @@ def forward(self, x1, x2, diag=False, last_dim_is_batch=False, **params): This method should be imlemented by all Kernel subclasses. Args: - :attr:`x1` (Tensor `n x d` or `b x n x d`): + x1 (Tensor `n x d` or `b x n x d`): First set of data - :attr:`x2` (Tensor `m x d` or `b x m x d`): + x2 (Tensor `m x d` or `b x m x d`): Second set of data - :attr:`diag` (bool): + diag (bool): Should the Kernel compute the whole kernel, or just the diag? 
- :attr:`last_dim_is_batch` (tuple, optional): + last_dim_is_batch (tuple, optional): If this is true, it treats the last dimension of the data as another batch dimension. (Useful for additive structure over the dimensions). Default: False @@ -284,15 +284,15 @@ def covar_dist( all pairs of points in x1 and x2. Args: - :attr:`x1` (Tensor `n x d` or `b1 x ... x bk x n x d`): + x1 (Tensor `n x d` or `b1 x ... x bk x n x d`): First set of data. - :attr:`x2` (Tensor `m x d` or `b1 x ... x bk x m x d`): + x2 (Tensor `m x d` or `b1 x ... x bk x m x d`): Second set of data. - :attr:`diag` (bool): + diag (bool): Should we return the whole distance matrix, or just the diagonal? If True, we must have `x1 == x2`. - :attr:`last_dim_is_batch` (tuple, optional): + last_dim_is_batch (tuple, optional): Is the last dimension of the data a batch dimension or not? - :attr:`square_dist` (bool): + square_dist (bool): Should we square the distance matrix before returning? Returns: diff --git a/gpytorch/kernels/linear_kernel.py b/gpytorch/kernels/linear_kernel.py index 5c4fc5a9a..27cc33958 100644 --- a/gpytorch/kernels/linear_kernel.py +++ b/gpytorch/kernels/linear_kernel.py @@ -24,7 +24,7 @@ class LinearKernel(Kernel): where - * :math:`v` is a :attr:`variance` parameter. + * :math:`v` is a variance parameter. .. note:: @@ -37,11 +37,11 @@ class LinearKernel(Kernel): :math:`O(nd)` time and space. Args: - :attr:`variance_prior` (:class:`gpytorch.priors.Prior`): + variance_prior (:class:`gpytorch.priors.Prior`): Prior over the variance parameter (default `None`). - :attr:`variance_constraint` (Constraint, optional): + variance_constraint (Constraint, optional): Constraint to place on variance parameter. Default: `Positive`. - :attr:`active_dims` (list): + active_dims (list): List of data dimensions to operate on. `len(active_dims)` should equal `num_dimensions`. """ diff --git a/gpytorch/kernels/matern_kernel.py b/gpytorch/kernels/matern_kernel.py index 04503316b..0173ced30 100644 --- a/gpytorch/kernels/matern_kernel.py +++ b/gpytorch/kernels/matern_kernel.py @@ -26,7 +26,7 @@ class MaternKernel(Kernel): * :math:`d = (\mathbf{x_1} - \mathbf{x_2})^\top \Theta^{-2} (\mathbf{x_1} - \mathbf{x_2})` is the distance between - :math:`x_1` and :math:`x_2` scaled by the :attr:`lengthscale` parameter :math:`\Theta`. + :math:`x_1` and :math:`x_2` scaled by the lengthscale parameter :math:`\Theta`. * :math:`\nu` is a smoothness parameter (takes values 1/2, 3/2, or 5/2). Smaller values are less smooth. * :math:`K_\nu` is a modified Bessel function. @@ -41,7 +41,7 @@ class MaternKernel(Kernel): :param nu: (Default: 2.5) The smoothness parameter. :type nu: float (0.5, 1.5, or 2.5) :param ard_num_dims: (Default: `None`) Set this if you want a separate lengthscale for each - input dimension. It should be `d` if :attr:`x1` is a `... x n x d` matrix. + input dimension. It should be `d` if x1 is a `... x n x d` matrix. :type ard_num_dims: int, optional :param batch_shape: (Default: `None`) Set this if you want a separate lengthscale for each batch of input data. It should be `torch.Size([b1, b2])` for a `b1 x b2 x n x m` kernel output. @@ -59,9 +59,6 @@ class MaternKernel(Kernel): :param eps: (Default: 1e-6) The minimum value that the lengthscale can take (prevents divide by zero errors). :type eps: float, optional - :var torch.Tensor lengthscale: The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. 
- Example: >>> x = torch.randn(10, 5) >>> # Non-batch: Simple option diff --git a/gpytorch/kernels/multi_device_kernel.py b/gpytorch/kernels/multi_device_kernel.py index 73c32c69f..7e5acccee 100644 --- a/gpytorch/kernels/multi_device_kernel.py +++ b/gpytorch/kernels/multi_device_kernel.py @@ -15,9 +15,9 @@ class MultiDeviceKernel(DataParallel, Kernel): Allocates the covariance matrix on distributed devices, e.g. multiple GPUs. Args: - - :attr:`base_kernel`: Base kernel to distribute - - :attr:`device_ids`: list of `torch.device` objects to place kernel chunks on - - :attr:`output_device`: Device where outputs will be placed + base_kernel: Base kernel to distribute + device_ids: list of `torch.device` objects to place kernel chunks on + output_device: Device where outputs will be placed """ def __init__( diff --git a/gpytorch/kernels/periodic_kernel.py b/gpytorch/kernels/periodic_kernel.py index df08f00db..1232b96ae 100644 --- a/gpytorch/kernels/periodic_kernel.py +++ b/gpytorch/kernels/periodic_kernel.py @@ -38,7 +38,7 @@ class PeriodicKernel(Kernel): decorate this kernel with a :class:`gpytorch.kernels.ScaleKernel`. :param ard_num_dims: (Default: `None`) Set this if you want a separate lengthscale for each - input dimension. It should be `d` if :attr:`x1` is a `... x n x d` matrix. + input dimension. It should be `d` if x1 is a `... x n x d` matrix. :type ard_num_dims: int, optional :param batch_shape: (Default: `None`) Set this if you want a separate lengthscale for each batch of input data. It should be `torch.Size([b1, b2])` for a `b1 x b2 x n x m` kernel output. @@ -62,10 +62,8 @@ class PeriodicKernel(Kernel): :param eps: (Default: 1e-6) The minimum value that the lengthscale can take (prevents divide by zero errors). :type eps: float, optional - :var torch.Tensor lengthscale: The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. :var torch.Tensor period_length: The period length parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. + ard_num_dims and batch_shape arguments. Example: >>> x = torch.randn(10, 5) diff --git a/gpytorch/kernels/piecewise_polynomial_kernel.py b/gpytorch/kernels/piecewise_polynomial_kernel.py index 97437b2e7..26abf0951 100644 --- a/gpytorch/kernels/piecewise_polynomial_kernel.py +++ b/gpytorch/kernels/piecewise_polynomial_kernel.py @@ -33,7 +33,7 @@ class PiecewisePolynomialKernel(Kernel): :param int q: (default= 2) The smoothness parameter. :type q: int (0, 1, 2 or 3) :param ard_num_dims: (Default: `None`) Set this if you want a separate lengthscale for each - input dimension. It should be `d` if :attr:`x1` is a `... x n x d` matrix. + input dimension. It should be `d` if x1 is a `... x n x d` matrix. :type ard_num_dims: int, optional :param batch_shape: (Default: `None`) Set this if you want a separate lengthscale for each batch of input data. It should be `torch.Size([b1, b2])` for a `b1 x b2 x n x m` kernel output. @@ -51,9 +51,6 @@ class PiecewisePolynomialKernel(Kernel): :param eps: (Default: 1e-6) The minimum value that the lengthscale can take (prevents divide by zero errors). :type eps: float, optional - :var torch.Tensor lengthscale: The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. - .. 
_Rasmussen and Williams (2006): http://www.gaussianprocess.org/gpml/ diff --git a/gpytorch/kernels/polynomial_kernel.py b/gpytorch/kernels/polynomial_kernel.py index 405051bc9..3a98e8d4e 100644 --- a/gpytorch/kernels/polynomial_kernel.py +++ b/gpytorch/kernels/polynomial_kernel.py @@ -22,14 +22,14 @@ class PolynomialKernel(Kernel): where - * :math:`c` is an :attr:`offset` parameter. + * :math:`c` is an offset parameter. Args: - :attr:`offset_prior` (:class:`gpytorch.priors.Prior`): + offset_prior (:class:`gpytorch.priors.Prior`): Prior over the offset parameter (default `None`). - :attr:`offset_constraint` (Constraint, optional): + offset_constraint (Constraint, optional): Constraint to place on offset parameter. Default: `Positive`. - :attr:`active_dims` (list): + active_dims (list): List of data dimensions to operate on. `len(active_dims)` should equal `num_dimensions`. """ diff --git a/gpytorch/kernels/product_structure_kernel.py b/gpytorch/kernels/product_structure_kernel.py index b21fcd9f0..89fa87998 100644 --- a/gpytorch/kernels/product_structure_kernel.py +++ b/gpytorch/kernels/product_structure_kernel.py @@ -29,11 +29,11 @@ class ProductStructureKernel(Kernel): See `Product Kernel Interpolation for Scalable Gaussian Processes`_ for more detail. Args: - - :attr:`base_kernel` (Kernel): + base_kernel (Kernel): The kernel to approximate with KISS-GP - - :attr:`num_dims` (int): + num_dims (int): The dimension of the input data. - - :attr:`active_dims` (tuple of ints, optional): + active_dims (tuple of ints, optional): Passed down to the `base_kernel`. .. _Product Kernel Interpolation for Scalable Gaussian Processes: diff --git a/gpytorch/kernels/rbf_kernel.py b/gpytorch/kernels/rbf_kernel.py index 09b3b63a2..4e708f26b 100644 --- a/gpytorch/kernels/rbf_kernel.py +++ b/gpytorch/kernels/rbf_kernel.py @@ -21,7 +21,7 @@ class RBFKernel(Kernel): (\mathbf{x_1} - \mathbf{x_2})^\top \Theta^{-2} (\mathbf{x_1} - \mathbf{x_2}) \right) \end{equation*} - where :math:`\Theta` is a :attr:`lengthscale` parameter. + where :math:`\Theta` is a lengthscale parameter. See :class:`gpytorch.kernels.Kernel` for descriptions of the lengthscale options. .. note:: @@ -30,26 +30,26 @@ class RBFKernel(Kernel): decorate this kernel with a :class:`gpytorch.kernels.ScaleKernel`. Args: - :attr:`ard_num_dims` (int, optional): + ard_num_dims (int, optional): Set this if you want a separate lengthscale for each - input dimension. It should be `d` if :attr:`x1` is a `n x d` matrix. Default: `None` - :attr:`batch_shape` (torch.Size, optional): + input dimension. It should be `d` if x1 is a `n x d` matrix. Default: `None` + batch_shape (torch.Size, optional): Set this if you want a separate lengthscale for each - batch of input data. It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])`. - :attr:`active_dims` (tuple of ints, optional): + batch of input data. It should be `b` if x1 is a `b x n x d` tensor. Default: `torch.Size([])`. + active_dims (tuple of ints, optional): Set this if you want to compute the covariance of only a few input dimensions. The ints corresponds to the indices of the dimensions. Default: `None`. - :attr:`lengthscale_prior` (Prior, optional): + lengthscale_prior (Prior, optional): Set this if you want to apply a prior to the lengthscale parameter. Default: `None`. - :attr:`lengthscale_constraint` (Constraint, optional): + lengthscale_constraint (Constraint, optional): Set this if you want to apply a constraint to the lengthscale parameter. Default: `Positive`. 
- :attr:`eps` (float): + eps (float): The minimum value that the lengthscale can take (prevents divide by zero errors). Default: `1e-6`. Attributes: - :attr:`lengthscale` (Tensor): + lengthscale (Tensor): The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. + ard_num_dims and batch_shape arguments. Example: >>> x = torch.randn(10, 5) diff --git a/gpytorch/kernels/rbf_kernel_grad.py b/gpytorch/kernels/rbf_kernel_grad.py index 568bc280f..4e66e8d5e 100644 --- a/gpytorch/kernels/rbf_kernel_grad.py +++ b/gpytorch/kernels/rbf_kernel_grad.py @@ -19,23 +19,23 @@ class RBFKernelGrad(RBFKernel): decorate this kernel with a :class:`gpytorch.kernels.ScaleKernel`. Args: - :attr:`batch_shape` (torch.Size, optional): + batch_shape (torch.Size, optional): Set this if you want a separate lengthscale for each - batch of input data. It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])`. - :attr:`active_dims` (tuple of ints, optional): + batch of input data. It should be `b` if x1 is a `b x n x d` tensor. Default: `torch.Size([])`. + active_dims (tuple of ints, optional): Set this if you want to compute the covariance of only a few input dimensions. The ints corresponds to the indices of the dimensions. Default: `None`. - :attr:`lengthscale_prior` (Prior, optional): + lengthscale_prior (Prior, optional): Set this if you want to apply a prior to the lengthscale parameter. Default: `None`. - :attr:`lengthscale_constraint` (Constraint, optional): + lengthscale_constraint (Constraint, optional): Set this if you want to apply a constraint to the lengthscale parameter. Default: `Positive`. - :attr:`eps` (float): + eps (float): The minimum value that the lengthscale can take (prevents divide by zero errors). Default: `1e-6`. Attributes: - :attr:`lengthscale` (Tensor): + lengthscale (Tensor): The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. + ard_num_dims and batch_shape arguments. Example: >>> x = torch.randn(10, 5) diff --git a/gpytorch/kernels/rq_kernel.py b/gpytorch/kernels/rq_kernel.py index 6694831f0..2b13fe214 100644 --- a/gpytorch/kernels/rq_kernel.py +++ b/gpytorch/kernels/rq_kernel.py @@ -20,7 +20,7 @@ class RQKernel(Kernel): (\mathbf{x_1} - \mathbf{x_2})^\top \Theta^{-2} (\mathbf{x_1} - \mathbf{x_2}) \right)^{-\alpha} \end{equation*} - where :math:`\Theta` is a :attr:`lengthscale` parameter, and :math:`\alpha` is the + where :math:`\Theta` is a lengthscale parameter, and :math:`\alpha` is the rational quadratic relative weighting parameter. See :class:`gpytorch.kernels.Kernel` for descriptions of the lengthscale options. @@ -30,31 +30,31 @@ class RQKernel(Kernel): decorate this kernel with a :class:`gpytorch.kernels.ScaleKernel`. Args: - :attr:`ard_num_dims` (int, optional): + ard_num_dims (int, optional): Set this if you want a separate lengthscale for each - input dimension. It should be `d` if :attr:`x1` is a `n x d` matrix. Default: `None` - :attr:`batch_shape` (torch.Size, optional): + input dimension. It should be `d` if x1 is a `n x d` matrix. Default: `None` + batch_shape (torch.Size, optional): Set this if you want a separate lengthscale for each - batch of input data. It should be `b` if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])`. - :attr:`active_dims` (tuple of ints, optional): + batch of input data. It should be `b` if x1 is a `b x n x d` tensor. Default: `torch.Size([])`. 
+ active_dims (tuple of ints, optional): Set this if you want to compute the covariance of only a few input dimensions. The ints corresponds to the indices of the dimensions. Default: `None`. - :attr:`lengthscale_prior` (Prior, optional): + lengthscale_prior (Prior, optional): Set this if you want to apply a prior to the lengthscale parameter. Default: `None`. - :attr:`lengthscale_constraint` (Constraint, optional): + lengthscale_constraint (Constraint, optional): Set this if you want to apply a constraint to the lengthscale parameter. Default: `Positive`. - :attr:`alpha_constraint` (Constraint, optional): + alpha_constraint (Constraint, optional): Set this if you want to apply a constraint to the alpha parameter. Default: `Positive`. - :attr:`eps` (float): + eps (float): The minimum value that the lengthscale can take (prevents divide by zero errors). Default: `1e-6`. Attributes: - :attr:`lengthscale` (Tensor): + lengthscale (Tensor): The lengthscale parameter. Size/shape of parameter depends on the - :attr:`ard_num_dims` and :attr:`batch_shape` arguments. - :attr:`alpha` (Tensor): + ard_num_dims and batch_shape arguments. + alpha (Tensor): The rational quadratic relative weighting parameter. Size/shape of parameter depends - on the :attr:`batch_shape` argument + on the batch_shape argument """ has_lengthscale = True diff --git a/gpytorch/kernels/scale_kernel.py b/gpytorch/kernels/scale_kernel.py index 12f2823a8..a2a6fffda 100644 --- a/gpytorch/kernels/scale_kernel.py +++ b/gpytorch/kernels/scale_kernel.py @@ -27,25 +27,25 @@ class ScaleKernel(Kernel): keyword argument to the appropriate number of batches. .. note:: - The :attr:`outputscale` parameter is parameterized on a log scale to constrain it to be positive. - You can set a prior on this parameter using the :attr:`outputscale_prior` argument. + The outputscale parameter is parameterized on a log scale to constrain it to be positive. + You can set a prior on this parameter using the outputscale_prior argument. Args: - :attr:`base_kernel` (Kernel): + base_kernel (Kernel): The base kernel to be scaled. - :attr:`batch_shape` (int, optional): + batch_shape (int, optional): Set this if you want a separate outputscale for each batch of input data. It should be `b` - if :attr:`x1` is a `b x n x d` tensor. Default: `torch.Size([])` - :attr:`outputscale_prior` (Prior, optional): Set this if you want to apply a prior to the outputscale + if x1 is a `b x n x d` tensor. Default: `torch.Size([])` + outputscale_prior (Prior, optional): Set this if you want to apply a prior to the outputscale parameter. Default: `None` - :attr:`outputscale_constraint` (Constraint, optional): Set this if you want to apply a constraint to the + outputscale_constraint (Constraint, optional): Set this if you want to apply a constraint to the outputscale parameter. Default: `Positive`. Attributes: - :attr:`base_kernel` (Kernel): + base_kernel (Kernel): The kernel module to be scaled. - :attr:`outputscale` (Tensor): - The outputscale parameter. Size/shape of parameter depends on the :attr:`batch_shape` arguments. + outputscale (Tensor): + The outputscale parameter. Size/shape of parameter depends on the batch_shape arguments. Example: >>> x = torch.randn(10, 5) diff --git a/gpytorch/kernels/spectral_mixture_kernel.py b/gpytorch/kernels/spectral_mixture_kernel.py index 40f8ad1e8..e63185ff4 100644 --- a/gpytorch/kernels/spectral_mixture_kernel.py +++ b/gpytorch/kernels/spectral_mixture_kernel.py @@ -23,14 +23,14 @@ class SpectralMixtureKernel(Kernel): .. 
note:: Unlike other kernels, - * :attr:`ard_num_dims` **must equal** the number of dimensions of the data. + * ard_num_dims **must equal** the number of dimensions of the data. * This kernel should not be combined with a :class:`gpytorch.kernels.ScaleKernel`. :param int num_mixtures: The number of components in the mixture. :param int ard_num_dims: Set this to match the dimensionality of the input. - It should be `d` if :attr:`x1` is a `... x n x d` matrix. (Default: `1`.) + It should be `d` if x1 is a `... x n x d` matrix. (Default: `1`.) :param batch_shape: Set this if the data is batch of input data. It should - be `b_1 x ... x b_j` if :attr:`x1` is a `b_1 x ... x b_j x n x d` tensor. (Default: `torch.Size([])`.) + be `b_1 x ... x b_j` if x1 is a `b_1 x ... x b_j x n x d` tensor. (Default: `torch.Size([])`.) :type batch_shape: torch.Size, optional :param active_dims: Set this if you want to compute the covariance of only a few input dimensions. The ints corresponds to the indices of the dimensions. (Default: `None`.) @@ -38,17 +38,17 @@ class SpectralMixtureKernel(Kernel): :param eps: The minimum value that the lengthscale can take (prevents divide by zero errors). (Default: `1e-6`.) :type eps: float, optional - :param mixture_scales_prior: A prior to set on the :attr:`mixture_scales` parameter + :param mixture_scales_prior: A prior to set on the mixture_scales parameter :type mixture_scales_prior: ~gpytorch.priors.Prior, optional - :param mixture_scales_constraint: A constraint to set on the :attr:`mixture_scales` parameter + :param mixture_scales_constraint: A constraint to set on the mixture_scales parameter :type mixture_scales_constraint: ~gpytorch.constraints.Interval, optional - :param mixture_means_prior: A prior to set on the :attr:`mixture_means` parameter + :param mixture_means_prior: A prior to set on the mixture_means parameter :type mixture_means_prior: ~gpytorch.priors.Prior, optional - :param mixture_means_constraint: A constraint to set on the :attr:`mixture_means` parameter + :param mixture_means_constraint: A constraint to set on the mixture_means parameter :type mixture_means_constraint: ~gpytorch.constraints.Interval, optional - :param mixture_weights_prior: A prior to set on the :attr:`mixture_weights` parameter + :param mixture_weights_prior: A prior to set on the mixture_weights parameter :type mixture_weights_prior: ~gpytorch.priors.Prior, optional - :param mixture_weights_constraint: A constraint to set on the :attr:`mixture_weights` parameter + :param mixture_weights_constraint: A constraint to set on the mixture_weights parameter :type mixture_weights_constraint: ~gpytorch.constraints.Interval, optional :ivar torch.Tensor mixture_scales: The lengthscale parameter. Given diff --git a/gpytorch/lazy/block_diag_lazy_tensor.py b/gpytorch/lazy/block_diag_lazy_tensor.py index 439fa393f..95cacba82 100644 --- a/gpytorch/lazy/block_diag_lazy_tensor.py +++ b/gpytorch/lazy/block_diag_lazy_tensor.py @@ -13,15 +13,15 @@ class BlockDiagLazyTensor(BlockLazyTensor): """ Represents a lazy tensor that is the block diagonal of square matrices. - The :attr:`block_dim` attribute specifies which dimension of the base LazyTensor + The block_dim attribute specifies which dimension of the base LazyTensor specifies the blocks. For example, (with `block_dim=-3` a `k x n x n` tensor represents `k` `n x n` blocks (a `kn x kn` matrix). A `b x k x n x n` tensor represents `k` `b x n x n` blocks (a `b x kn x kn` batch matrix). 
Args: - :attr:`base_lazy_tensor` (LazyTensor or Tensor): + base_lazy_tensor (LazyTensor or Tensor): Must be at least 3 dimensional. - :attr:`block_dim` (int): + block_dim (int): The dimension that specifies the blocks. """ diff --git a/gpytorch/lazy/block_interleaved_lazy_tensor.py b/gpytorch/lazy/block_interleaved_lazy_tensor.py index 307c99200..0aadea0c5 100644 --- a/gpytorch/lazy/block_interleaved_lazy_tensor.py +++ b/gpytorch/lazy/block_interleaved_lazy_tensor.py @@ -9,15 +9,15 @@ class BlockInterleavedLazyTensor(BlockLazyTensor): """ Represents a lazy tensor that is the block diagonal of square matrices. - The :attr:`block_dim` attribute specifies which dimension of the base LazyTensor + The block_dim attribute specifies which dimension of the base LazyTensor specifies the blocks. For example, (with `block_dim=-3` a `k x n x n` tensor represents `k` `n x n` blocks (a `kn x kn` matrix). A `b x k x n x n` tensor represents `k` `b x n x n` blocks (a `b x kn x kn` batch matrix). Args: - :attr:`base_lazy_tensor` (LazyTensor or Tensor): + base_lazy_tensor (LazyTensor or Tensor): Must be at least 3 dimensional. - :attr:`block_dim` (int): + block_dim (int): The dimension that specifies the blocks. """ diff --git a/gpytorch/lazy/block_lazy_tensor.py b/gpytorch/lazy/block_lazy_tensor.py index 5b87f4d3e..277a1df51 100644 --- a/gpytorch/lazy/block_lazy_tensor.py +++ b/gpytorch/lazy/block_lazy_tensor.py @@ -16,15 +16,15 @@ class BlockLazyTensor(LazyTensor): (e.g. block diagonal, sum over blocks, etc.) BlockLazyTensors represent the groups of blocks as a batched Tensor. - The :attr:block_dim` attribute specifies which dimension of the base LazyTensor + The block_dim attribute specifies which dimension of the base LazyTensor specifies the blocks. For example, (with `block_dim=-3` a `k x n x n` tensor represents `k` `n x n` blocks. A `b x k x n x n` tensor represents `k` `b x n x n` blocks. Args: - - :attr:`base_lazy_tensor` (LazyTensor or Tensor): + base_lazy_tensor (LazyTensor or Tensor): Must be at least 3 dimenional. - - :attr:`block_dim` (int): + block_dim (int): The dimension that specifies blocks. """ diff --git a/gpytorch/lazy/cat_lazy_tensor.py b/gpytorch/lazy/cat_lazy_tensor.py index 24e56882f..21bf13de6 100644 --- a/gpytorch/lazy/cat_lazy_tensor.py +++ b/gpytorch/lazy/cat_lazy_tensor.py @@ -35,14 +35,14 @@ class CatLazyTensor(LazyTensor): dimension. Args: - - :attr:`lazy_tensors` (list of LazyTensors): + lazy_tensors (list of LazyTensors): A list of LazyTensors whose sizes are the same except in - concatenating dimension :attr:`dim` - - :attr:`dim` (int): + concatenating dimension dim + dim (int): The concatenating dimension which can be a batch dimension. - - :attr:`output_device` (torch.device): - The CatLazyTensor will appear to appear on :attr:`output_device` - and place any output `torch.Tensors` on :attr:`output_device` + output_device (torch.device): + The CatLazyTensor will appear to appear on output_device + and place any output `torch.Tensors` on output_device """ def _check_args(self, *lazy_tensors, dim=0, output_device=None): diff --git a/gpytorch/lazy/diag_lazy_tensor.py b/gpytorch/lazy/diag_lazy_tensor.py index c50643b01..cce6a635c 100644 --- a/gpytorch/lazy/diag_lazy_tensor.py +++ b/gpytorch/lazy/diag_lazy_tensor.py @@ -19,7 +19,7 @@ def __init__(self, diag): Diagonal lazy tensor. Supports arbitrary batch sizes. Args: - :attr:`diag` (Tensor): + diag (Tensor): A `b1 x ... x bk x n` Tensor, representing a `b1 x ... 
x bk`-sized batch of `n x n` diagonal matrices """ @@ -208,10 +208,10 @@ def __init__(self, diag_values, diag_shape): Used e.g. for adding jitter to matrices. Args: - :attr:`diag_values` (Tensor): + diag_values (Tensor): A `b1 x ... x bk x 1` Tensor, representing a `b1 x ... x bk`-sized batch of `diag_shape x diag_shape` diagonal matrices - :attr:`diag_shape` (int): + diag_shape (int): The (non-batch) dimension of the (square) matrix """ if settings.debug.on(): diff --git a/gpytorch/lazy/identity_lazy_tensor.py b/gpytorch/lazy/identity_lazy_tensor.py index 1300f1715..84426afa6 100644 --- a/gpytorch/lazy/identity_lazy_tensor.py +++ b/gpytorch/lazy/identity_lazy_tensor.py @@ -19,7 +19,7 @@ def __init__(self, diag_shape, batch_shape=torch.Size([]), dtype=None, device=No Identity matrix lazy tensor. Supports arbitrary batch sizes. Args: - :attr:`diag` (Tensor): + diag (Tensor): A `b1 x ... x bk x n` Tensor, representing a `b1 x ... x bk`-sized batch of `n x n` identity matrices """ diff --git a/gpytorch/lazy/lazy_tensor.py b/gpytorch/lazy/lazy_tensor.py index c477759bc..7bd664615 100644 --- a/gpytorch/lazy/lazy_tensor.py +++ b/gpytorch/lazy/lazy_tensor.py @@ -213,11 +213,11 @@ def _getitem(self, row_index, col_index, *batch_indices): handled by the `_getindices` method Args: - :attr:`row_index` (slice, Tensor): + row_index (slice, Tensor): Index for the row of the LazyTensor - :attr:`col_index` (slice, Tensor): + col_index (slice, Tensor): Index for the col of the LazyTensor - :attr:`batch_indices` (tuple of slice, int, Tensor): + batch_indices (tuple of slice, int, Tensor): Indices for the batch dimensions Returns: @@ -1178,9 +1178,9 @@ def inv_matmul(self, right_tensor, left_tensor=None): A^{-1} R, \end{equation} - where :math:`R` is :attr:`right_tensor` and :math:`A` is the LazyTensor. + where :math:`R` is right_tensor and :math:`A` is the LazyTensor. - If :attr:`left_tensor` is supplied, computes + If left_tensor is supplied, computes ... math:: @@ -1188,7 +1188,7 @@ def inv_matmul(self, right_tensor, left_tensor=None): L A^{-1} R, \end{equation} - where :math:`L` is :attr:`left_tensor`. Supplying this can reduce the number of + where :math:`L` is left_tensor. Supplying this can reduce the number of CG calls required. Args: @@ -1523,7 +1523,7 @@ def pivoted_cholesky(self, rank, error_tol=None, return_pivots=False): :param int rank: The size of the partial pivoted Cholesky factor. :param error_tol: Defines an optional stopping criterion. - If the residual of the factorization is less than :attr:`error_tol`, then the + If the residual of the factorization is less than error_tol, then the factorization will exit early. This will result in a :math:`\leq \text{ rank}` factor. :type error_tol: float, optional :param bool return_pivots: (default: False) Whether or not to return the pivots alongside @@ -1553,7 +1553,7 @@ def prod(self, dim=None): Returns a `b/k x n x m` LazyTensor. Args: - :attr:`mul_batch_size` (int or None): + mul_batch_size (int or None): Controls the number of groups that are multiplied over (default: None). Returns: @@ -1903,7 +1903,7 @@ def sum(self, dim=None): If set to None, then sums all dimensions Args: - :attr:`dim` (int): + dim (int): Which dimension is being summed over (default=None) Returns: @@ -1949,12 +1949,8 @@ def svd(self) -> Tuple["LazyTensor", Tensor, "LazyTensor"]: Does NOT sort the sigular values. Returns: - :obj:`~gpytorch.lazy.LazyTensor`: - The left singular vectors (`U`). - :obj:`torch.Tensor`: - The singular values (`S`). 
- :obj:`~gpytorch.lazy.LazyTensor`: - The right singular vectors (`V`). + Tuple containing the left singular vectors (`U`), the singular values (`S`), + and the right singular vectors (`V`). """ return self._svd() @@ -1966,13 +1962,11 @@ def symeig(self, eigenvectors: bool = False) -> Tuple[Tensor, Optional["LazyTens structure. Does NOT sort the eigenvalues. Args: - :attr:`eigenvectors` (bool): If True, compute the eigenvectors in addition to the eigenvalues. + eigenvectors (bool): If True, compute the eigenvectors in addition to the eigenvalues. Returns: - :obj:`torch.Tensor`: - The eigenvalues. - :obj:`~gpytorch.lazy.LazyTensor`: - The eigenvectors. If `eigenvectors=False`, this is None. Otherwise, this LazyTensor - contains the orthonormal eigenvectors of the matrix. + Tuple containing the eigenvalues and eigenvectors. If `eigenvectors=False`, + this is None. Otherwise, this LazyTensor contains the orthonormal eigenvectors + of the matrix. """ try: evals, evecs = pop_from_cache(self, "symeig", eigenvectors=True) @@ -2099,7 +2093,7 @@ def zero_mean_mvn_samples(self, num_samples): Self should be symmetric, either (batch_size x num_dim x num_dim) or (num_dim x num_dim) Args: - :attr:`num_samples` (int): + num_samples (int): Number of samples to draw. Returns: @@ -2150,7 +2144,7 @@ def __add__(self, other): or lazy tensor. Args: - :attr:`other` (:obj:`torch.tensor` or :obj:`gpytorch.lazy.LazyTensor`): + other (:obj:`torch.tensor` or :obj:`gpytorch.lazy.LazyTensor`): Matrix to add to this one. Returns: @@ -2187,7 +2181,7 @@ def __div__(self, other): the elementwise reciprocal of another matrix or lazy tensor. Args: - :attr:`other` (:obj:`torch.tensor` or :obj:`gpytorch.lazy.LazyTensor`): + other (:obj:`torch.tensor` or :obj:`gpytorch.lazy.LazyTensor`): Matrix to divide this one by. Returns: diff --git a/gpytorch/lazy/sum_batch_lazy_tensor.py b/gpytorch/lazy/sum_batch_lazy_tensor.py index 7956a5503..ae59f87ed 100644 --- a/gpytorch/lazy/sum_batch_lazy_tensor.py +++ b/gpytorch/lazy/sum_batch_lazy_tensor.py @@ -10,15 +10,15 @@ class SumBatchLazyTensor(BlockLazyTensor): """ Represents a lazy tensor that is actually the sum of several lazy tensors blocks. - The :attr:`block_dim` attribute specifies which dimension of the base LazyTensor + The block_dim attribute specifies which dimension of the base LazyTensor specifies the blocks. For example, (with `block_dim=-3` a `k x n x n` tensor represents `k` `n x n` blocks (a `n x n` matrix). A `b x k x n x n` tensor represents `k` `b x n x n` blocks (a `b x n x n` batch matrix). Args: - :attr:`base_lazy_tensor` (LazyTensor): + base_lazy_tensor (LazyTensor): A `k x n x n` LazyTensor, or a `b x k x n x n` LazyTensor. - :attr:`block_dim` (int): + block_dim (int): The dimension that specifies the blocks. """ diff --git a/gpytorch/lazy/toeplitz_lazy_tensor.py b/gpytorch/lazy/toeplitz_lazy_tensor.py index ca1170d60..0b2f0e57a 100644 --- a/gpytorch/lazy/toeplitz_lazy_tensor.py +++ b/gpytorch/lazy/toeplitz_lazy_tensor.py @@ -10,7 +10,7 @@ class ToeplitzLazyTensor(LazyTensor): def __init__(self, column): """ Args: - :attr: `column` (Tensor) + column (Tensor) If `column` is a 1D Tensor of length `n`, this represents a Toeplitz matrix with `column` as its first column. If `column` is `b_1 x b_2 x ... 
x b_k x n`, then this represents a batch diff --git a/gpytorch/lazy/triangular_lazy_tensor.py b/gpytorch/lazy/triangular_lazy_tensor.py index 91e2c4087..b7a80791e 100644 --- a/gpytorch/lazy/triangular_lazy_tensor.py +++ b/gpytorch/lazy/triangular_lazy_tensor.py @@ -27,10 +27,10 @@ def __init__(self, tensor: Allsor, upper: bool = False) -> None: Triangular lazy tensor. Supports arbitrary batch sizes. Args: - :attr:`tensor` (Tensor or LazyTensor): + tensor (Tensor or LazyTensor): A `b1 x ... x bk x n x n` Tensor, representing a `b1 x ... x bk`-sized batch of `n x n` triangular matrices. - :attr:`upper` (bool): + upper (bool): If True, the tensor is considered to be upper-triangular, otherwise lower-triangular. """ if isinstance(tensor, TriangularLazyTensor): diff --git a/gpytorch/likelihoods/likelihood.py b/gpytorch/likelihoods/likelihood.py index 63430204c..30d6c5269 100644 --- a/gpytorch/likelihoods/likelihood.py +++ b/gpytorch/likelihoods/likelihood.py @@ -96,7 +96,7 @@ class Likelihood(_Likelihood): \end{cases} In either case, to implement a likelihood function, GPyTorch only - requires a :attr:`forward` method that computes the conditional distribution + requires a forward method that computes the conditional distribution :math:`p(y \mid f(\mathbf x))`. Calling this object does one of two things: @@ -191,7 +191,7 @@ def forward(self, function_samples, *args, data={}, **kwargs): :type data: dict {str: torch.Tensor}, optional - Pyro integration only :param args: Additional args :param kwargs: Additional kwargs - :rtype: :obj:`Distribution` (with same shape as :attr:`function_samples` ) + :rtype: :obj:`Distribution` (with same shape as function_samples ) """ raise NotImplementedError @@ -228,7 +228,7 @@ def marginal(self, function_dist, *args, **kwargs): With both exact inference and variational inference, the form of :math:`p(\mathbf f|\mathcal D, \mathbf x)` or :math:`p(\mathbf f| - \mathbf x)` should usually be Gaussian. As a result, :attr:`function_dist` + \mathbf x)` should usually be Gaussian. As a result, function_dist should usually be a :obj:`~gpytorch.distributions.MultivariateNormal` specified by the mean and (co)variance of :math:`p(\mathbf f|...)`. diff --git a/gpytorch/mlls/_approximate_mll.py b/gpytorch/mlls/_approximate_mll.py index c5f4f2449..530019e4c 100644 --- a/gpytorch/mlls/_approximate_mll.py +++ b/gpytorch/mlls/_approximate_mll.py @@ -10,22 +10,22 @@ class _ApproximateMarginalLogLikelihood(MarginalLogLikelihood, ABC): r""" An approximate marginal log likelihood (typically a bound) for approximate GP models. - We expect that :attr:`model` is a :obj:`gpytorch.models.ApproximateGP`. + We expect that model is a :obj:`gpytorch.models.ApproximateGP`. Args: - :attr:`likelihood` (:obj:`gpytorch.likelihoods.Likelihood`): + likelihood (:obj:`gpytorch.likelihoods.Likelihood`): The likelihood for the model - :attr:`model` (:obj:`gpytorch.models.ApproximateGP`): + model (:obj:`gpytorch.models.ApproximateGP`): The approximate GP model - :attr:`num_data` (int): + num_data (int): The total number of training data points (necessary for SGD) - :attr:`beta` (float - default 1.): + beta (float - default 1.): A multiplicative factor for the KL divergence term. Setting it to 1 (default) recovers true variational inference (as derived in `Scalable Variational Gaussian Process Classification`_). Setting it to anything less than 1 reduces the regularization effect of the model (similarly to what was proposed in `the beta-VAE paper`_). 
- :attr:`combine_terms` (bool): + combine_terms (bool): Whether or not to sum the expected NLL with the KL terms (default True) """ @@ -45,12 +45,13 @@ def forward(self, approximate_dist_f, target, **kwargs): Calling this function will call the likelihood's `expected_log_prob` function. Args: - :attr:`approximate_dist_f` (:obj:`gpytorch.distributions.MultivariateNormal`): + approximate_dist_f (:obj:`gpytorch.distributions.MultivariateNormal`): :math:`q(\mathbf f)` the outputs of the latent function (the :obj:`gpytorch.models.ApproximateGP`) - :attr:`target` (`torch.Tensor`): + target (`torch.Tensor`): :math:`\mathbf y` The target values - :attr:`**kwargs`: - Additional arguments passed to the likelihood's `expected_log_prob` function. + + Keyword Args: + Additional arguments passed to the likelihood's `expected_log_prob` function. """ # Get likelihood term and KL term num_batch = approximate_dist_f.event_shape[0] diff --git a/gpytorch/mlls/leave_one_out_pseudo_likelihood.py b/gpytorch/mlls/leave_one_out_pseudo_likelihood.py index 6252515f6..e89ddb3e6 100644 --- a/gpytorch/mlls/leave_one_out_pseudo_likelihood.py +++ b/gpytorch/mlls/leave_one_out_pseudo_likelihood.py @@ -52,7 +52,7 @@ def forward(self, function_dist: MultivariateNormal, target: Tensor, *params) -> :param ~gpytorch.distributions.MultivariateNormal output: the outputs of the latent function (the :obj:`~gpytorch.models.GP`) :param torch.Tensor target: :math:`\mathbf y` The target values - :param dict kwargs: Additional arguments to pass to the likelihood's :attr:`forward` function. + :param dict kwargs: Additional arguments to pass to the likelihood's forward function. """ output = self.likelihood(function_dist, *params) m, L = output.mean, output.lazy_covariance_matrix.cholesky(upper=False) diff --git a/gpytorch/mlls/marginal_log_likelihood.py b/gpytorch/mlls/marginal_log_likelihood.py index be696c9c8..ab5dc4c7b 100644 --- a/gpytorch/mlls/marginal_log_likelihood.py +++ b/gpytorch/mlls/marginal_log_likelihood.py @@ -43,6 +43,6 @@ def forward(self, output, target, **kwargs): :param ~gpytorch.distributions.MultivariateNormal output: the outputs of the latent function (the :obj:`~gpytorch.models.GP`) :param torch.Tensor target: :math:`\mathbf y` The target values - :param dict kwargs: Additional arguments to pass to the likelihood's :attr:`forward` function. + :param dict kwargs: Additional arguments to pass to the likelihood's forward function. """ raise NotImplementedError diff --git a/gpytorch/models/exact_prediction_strategies.py b/gpytorch/models/exact_prediction_strategies.py index 94ba7f8ab..77c578943 100644 --- a/gpytorch/models/exact_prediction_strategies.py +++ b/gpytorch/models/exact_prediction_strategies.py @@ -83,7 +83,7 @@ def _exact_predictive_covar_inv_quad_form_cache(self, train_train_covar_inv_root test_train_covar (:obj:`torch.tensor`): the observed noise (from the likelihood) Returns - - A precomputed cache + A precomputed cache """ res = train_train_covar_inv_root if settings.detach_test_caches.on(): @@ -120,19 +120,18 @@ def get_fantasy_strategy(self, inputs, targets, full_inputs, full_targets, full_ GP model, use the :meth:`~gpytorch.models.ExactGP.get_fantasy_model` method. Args: - - :attr:`inputs` (Tensor `b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`): Locations of fantasy + inputs (Tensor `b1 x ... x bk x m x d` or `f x b1 x ... x bk x m x d`): Locations of fantasy observations. - - :attr:`targets` (Tensor `b1 x ... x bk x m` or `f x b1 x ... x bk x m`): Labels of fantasy observations. 
- - :attr:`full_inputs` (Tensor `b1 x ... x bk x n+m x d` or `f x b1 x ... x bk x n+m x d`): Training data + targets (Tensor `b1 x ... x bk x m` or `f x b1 x ... x bk x m`): Labels of fantasy observations. + full_inputs (Tensor `b1 x ... x bk x n+m x d` or `f x b1 x ... x bk x n+m x d`): Training data concatenated with fantasy inputs - - :attr:`full_targets` (Tensor `b1 x ... x bk x n+m` or `f x b1 x ... x bk x n+m`): Training labels + full_targets (Tensor `b1 x ... x bk x n+m` or `f x b1 x ... x bk x n+m`): Training labels concatenated with fantasy labels. - - :attr:`full_output` (:class:`gpytorch.distributions.MultivariateNormal`): Prior called on full_inputs + full_output (:class:`gpytorch.distributions.MultivariateNormal`): Prior called on full_inputs Returns: - - :class:`DefaultPredictionStrategy` - A `DefaultPredictionStrategy` model with `n + m` training examples, where the `m` fantasy examples have - been added and all test-time caches have been updated. + A `DefaultPredictionStrategy` model with `n + m` training examples, where the `m` fantasy examples have + been added and all test-time caches have been updated. """ full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix diff --git a/gpytorch/models/model_list.py b/gpytorch/models/model_list.py index 3a1ff5931..b122777fd 100644 --- a/gpytorch/models/model_list.py +++ b/gpytorch/models/model_list.py @@ -56,13 +56,12 @@ def get_fantasy_model(self, inputs, targets, **kwargs): and returns the same class of fantasy models. Args: - - :attr:`inputs`: List of locations of fantasy observations, one for each model. - - :attr:`targets` List of labels of fantasy observations, one for each model. + inputs: List of locations of fantasy observations, one for each model. + targets List of labels of fantasy observations, one for each model. Returns: - - :class:`IndependentModelList` - An `IndependentModelList` model, where each sub-model is the fantasy model of the respective - sub-model in the original model at the corresponding input locations / labels. + An `IndependentModelList` model, where each sub-model is the fantasy model of the respective + sub-model in the original model at the corresponding input locations / labels. """ if "noise" in kwargs: diff --git a/gpytorch/models/pyro/pyro_gp.py b/gpytorch/models/pyro/pyro_gp.py index 460d34eca..12c0527e4 100644 --- a/gpytorch/models/pyro/pyro_gp.py +++ b/gpytorch/models/pyro/pyro_gp.py @@ -18,16 +18,16 @@ class PyroGP(GP, _PyroMixin): See `the Pyro examples `_ for detailed examples. Args: - :attr:`variational_strategy` (:obj:`~gpytorch.variational.VariationalStrategy`): + variational_strategy (:obj:`~gpytorch.variational.VariationalStrategy`): The variational strategy that defines the variational distribution and the marginalization strategy. - :attr:`likelihood` (:obj:`~gpytorch.likelihoods.Likelihood`): + likelihood (:obj:`~gpytorch.likelihoods.Likelihood`): The likelihood for the model - :attr:`num_data` (int): + num_data (int): The total number of training data points (necessary for SGD) - :attr:`name_prefix` (str, optional): + name_prefix (str, optional): A prefix to put in front of pyro sample/plate sites - :attr:`beta` (float - default 1.): + beta (float - default 1.): A multiplicative factor for the KL divergence term. Setting it to 1 (default) recovers true variational inference (as derived in `Scalable Variational Gaussian Process Classification`_). 
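The `num_data` and `beta` arguments documented above (for the approximate marginal log likelihood and for PyroGP) are easiest to see in use. The following is a minimal, self-contained sketch — not part of this patch — using `VariationalELBO`, a concrete subclass of `_ApproximateMarginalLogLikelihood`; the toy data and model are invented purely for illustration:

    import torch
    import gpytorch

    # Toy training data, invented for this sketch only.
    train_x = torch.linspace(0, 1, 20).unsqueeze(-1)
    train_y = torch.sin(6 * train_x).squeeze(-1)

    class ToyApproximateGP(gpytorch.models.ApproximateGP):
        def __init__(self, inducing_points):
            variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
                inducing_points.size(-2)
            )
            variational_strategy = gpytorch.variational.VariationalStrategy(
                self, inducing_points, variational_distribution, learn_inducing_locations=True
            )
            super().__init__(variational_strategy)
            self.mean_module = gpytorch.means.ConstantMean()
            self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

        def forward(self, x):
            return gpytorch.distributions.MultivariateNormal(
                self.mean_module(x), self.covar_module(x)
            )

    model = ToyApproximateGP(train_x[:5])
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    # num_data is the total training set size (needed for minibatch SGD);
    # beta=1.0 recovers the standard variational ELBO.
    mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0), beta=1.0)
    loss = -mll(model(train_x), train_y)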
diff --git a/gpytorch/module.py b/gpytorch/module.py
index 3a583798e..0cf0af6dc 100644
--- a/gpytorch/module.py
+++ b/gpytorch/module.py
@@ -190,9 +190,9 @@ def register_parameter(self, name, parameter):
 Adds a parameter to the module. The parameter can be accessed as an attribute using the given name.
 Args:
-    :attr:`name` (str):
+    name (str):
         The name of the parameter
-    :attr:`parameter` (torch.nn.Parameter):
+    parameter (torch.nn.Parameter):
         The parameter
 """
 if "_parameters" not in self.__dict__:
@@ -204,11 +204,11 @@ def register_prior(self, name, prior, param_or_closure, setting_closure=None):
 Adds a prior to the module. The prior can be accessed as an attribute using the given name.
 Args:
-    :attr:`name` (str):
+    name (str):
         The name of the prior
-    :attr:`prior` (Prior):
+    prior (Prior):
         The prior to be registered`
-    :attr:`param_or_closure` (string or callable):
+    param_or_closure (string or callable):
         Either the name of the parameter, or a closure (which upon calling evalutes a function on
         the module instance and one or more parameters):
         single parameter without a transform: `.register_prior("foo_prior", foo_prior, "foo_param")`
@@ -216,7 +216,7 @@
         `.register_prior("foo_prior", NormalPrior(0, 1), lambda module: torch.log(module.foo_param))`
         function of multiple parameters:
         `.register_prior("foo2_prior", foo2_prior, lambda module: f(module.param1, module.param2)))`
-    :attr:`setting_closure` (callable, optional):
+    setting_closure (callable, optional):
         A function taking in the module instance and a tensor in (transformed) parameter space,
         initializing the internal parameter representation to the proper value by applying the
         inverse transform. Enables setting parametres directly in the transformed space, as well
@@ -408,7 +408,7 @@ def pyro_load_from_samples(self, samples_dict):
 the prior to properly set the unconstrained parameter.
 Args:
-    :attr:`samples_dict` (dict): Dictionary mapping *prior names* to sample values.
+    samples_dict (dict): Dictionary mapping *prior names* to sample values.
 """
 return _pyro_load_from_samples(module=self, samples_dict=samples_dict, memo=None, prefix="")
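As an illustration of the two registration methods documented above, here is a small sketch; the module name, the `raw_foo` parameter, and the softplus transform are hypothetical choices and assume only the `register_parameter` / `register_prior` signatures shown in the docstrings::

    # A hypothetical module: register a raw parameter, then register a prior on a
    # transformed view of it, with a setting_closure applying the inverse transform.
    import torch
    import gpytorch
    from gpytorch.priors import NormalPrior

    class ToyModule(gpytorch.Module):
        def __init__(self):
            super().__init__()
            self.register_parameter("raw_foo", torch.nn.Parameter(torch.zeros(1)))
            self.register_prior(
                "foo_prior",
                NormalPrior(0.0, 1.0),
                lambda module: torch.nn.functional.softplus(module.raw_foo),                  # param_or_closure
                lambda module, value: module.initialize(raw_foo=torch.log(torch.expm1(value))),  # setting_closure
            )

    module = ToyModule()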
diff --git a/gpytorch/settings.py b/gpytorch/settings.py
index cf3a698c7..c85579e1e 100644
--- a/gpytorch/settings.py
+++ b/gpytorch/settings.py
@@ -311,7 +311,7 @@ class fast_computations:
 functions used in GP inference.
 The functions that can be controlled are:
-    * :attr:`covar_root_decomposition`
+    * covar_root_decomposition
         This feature flag controls how matrix root decompositions
         (:math:`K = L L^\top`) are computed (e.g. for sampling, computing caches, etc.).
@@ -323,7 +323,7 @@ class fast_computations:
         * If set to False, covariance matrices :math:`K` are decomposed
          using the Cholesky decomposition.
-    * :attr:`log_prob`
+    * log_prob
         This feature flag controls how GPyTorch computes the marginal log likelihood for exact GPs
         and `log_prob` for multivariate normal distributions
@@ -336,7 +336,7 @@ class fast_computations:
         * If set to False, `log_prob` is computed using the Cholesky decomposition.
-    * :attr:`fast_solves`
+    * fast_solves
         This feature flag controls how GPyTorch computes the solves of positive-definite matrices.
         * If set to True,
diff --git a/gpytorch/utils/cholesky.py b/gpytorch/utils/cholesky.py
index 2d2d3001f..135cc5c3c 100644
--- a/gpytorch/utils/cholesky.py
+++ b/gpytorch/utils/cholesky.py
@@ -50,16 +50,16 @@ def _psd_safe_cholesky(A, out=None, jitter=None, max_tries=None):
 def psd_safe_cholesky(A, upper=False, out=None, jitter=None, max_tries=None):
     """Compute the Cholesky decomposition of A. If A is only p.s.d, add a small jitter to the diagonal.
     Args:
-        :attr:`A` (Tensor):
+        A (Tensor):
             The tensor to compute the Cholesky decomposition of
-        :attr:`upper` (bool, optional):
+        upper (bool, optional):
            See torch.cholesky
-        :attr:`out` (Tensor, optional):
+        out (Tensor, optional):
            See torch.cholesky
-        :attr:`jitter` (float, optional):
+        jitter (float, optional):
            The jitter to add to the diagonal of A in case A is only p.s.d. If omitted,
            uses settings.cholesky_jitter.value()
-        :attr:`max_tries` (int, optional):
+        max_tries (int, optional):
            Number of attempts (with successively increasing jitter) to make before raising an error.
     """
     L = _psd_safe_cholesky(A, out=out, jitter=jitter, max_tries=max_tries)
diff --git a/gpytorch/utils/permutation.py b/gpytorch/utils/permutation.py
index a112cda7a..53bba5a99 100644
--- a/gpytorch/utils/permutation.py
+++ b/gpytorch/utils/permutation.py
@@ -20,12 +20,12 @@ def apply_permutation(
     \end{equation}
 
     where the permutation matrices :math:`\boldsymbol{\Pi}_\text{left}` and :math:`\boldsymbol{\Pi}_\text{right}^\top`
-    are represented by vectors :attr:`left_permutation` and :attr:`right_permutation`.
+    are represented by vectors left_permutation and right_permutation.
     The permutation matrices may be partial permutations (only selecting a subset of rows/columns)
     or full permutations (permuting all rows/columns).
 
-    Importantly, if :math:`\mathbf K` is a batch of matrices, :attr:`left_permutation` and :attr:`right_permutation`
+    Importantly, if :math:`\mathbf K` is a batch of matrices, left_permutation and right_permutation
     can be a batch of permutation vectors, and this function will apply the appropriate permutation
     to each batch entry. Broadcasting rules apply.
diff --git a/gpytorch/variational/_variational_strategy.py b/gpytorch/variational/_variational_strategy.py
index 779d1fc04..de4dae08b 100644
--- a/gpytorch/variational/_variational_strategy.py
+++ b/gpytorch/variational/_variational_strategy.py
@@ -105,7 +105,7 @@ def forward(self, x, inducing_points, inducing_values, variational_inducing_cova
     (or the mean of the distribution :math:`q(\mathbf u)` if q is a Gaussian.
     :param ~gpytorch.lazy.LazyTensor variational_inducing_covar: If the distribuiton :math:`q(\mathbf u)`
         is Gaussian, then this variable is the covariance matrix of that Gaussian. Otherwise, it will be
-        :attr:`None`.
+        None.
     :rtype: :obj:`~gpytorch.distributions.MultivariateNormal`
     :return: The distribution :math:`q( \mathbf f(\mathbf X))`
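A short sketch of the jittered Cholesky helper and the `fast_computations` flags documented above; the matrix here is deliberately rank-deficient, and the specific jitter and flag values are illustrative only::

    # psd_safe_cholesky retries with increasing diagonal jitter when a plain
    # Cholesky factorization of a merely positive semi-definite matrix fails.
    import torch
    import gpytorch
    from gpytorch.utils.cholesky import psd_safe_cholesky

    x = torch.randn(5, 2)
    A = x @ x.transpose(-1, -2)                          # 5 x 5 PSD matrix of rank 2
    L = psd_safe_cholesky(A, jitter=1e-6, max_tries=5)   # lower-triangular by default
    print(torch.dist(L @ L.transpose(-1, -2), A))        # small, up to the added jitter

    # The fast_computations flags can be toggled locally as a context manager,
    # e.g. to force Cholesky-based paths for a single computation.
    with gpytorch.settings.fast_computations(covar_root_decomposition=False, log_prob=False):
        pass  # GP calls placed here would avoid the iterative (Lanczos/CG) routines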
diff --git a/gpytorch/variational/batch_decoupled_variational_strategy.py b/gpytorch/variational/batch_decoupled_variational_strategy.py
index c9fc52116..40cb86d2f 100644
--- a/gpytorch/variational/batch_decoupled_variational_strategy.py
+++ b/gpytorch/variational/batch_decoupled_variational_strategy.py
@@ -42,11 +42,11 @@ class BatchDecoupledVariationalStrategy(VariationalStrategy):
     Additionally, you can use a different set of kernel hyperparameters for the mean and the variance function.
     We recommend using this feature only with the :obj:`~gpytorch.mlls.PredictiveLogLikelihood` objective function
     as proposed in "Parametric Gaussian Process Regressors" (`Jankowiak et al. (2020)`_).
-    Use the :attr:`mean_var_batch_dim` to indicate which batch dimension corresponds to the different mean/var
+    Use the mean_var_batch_dim to indicate which batch dimension corresponds to the different mean/var
     kernels.
 
     .. note::
-        We recommend using the "right-most" batch dimension (i.e. :attr:`mean_var_batch_dim=-1`) for the dimension
+        We recommend using the "right-most" batch dimension (i.e. ``mean_var_batch_dim=-1``) for the dimension
         that corresponds to the different mean/variance kernel parameters.
 
     Assuming you want `b1` many independent GPs, the :obj:`~gpytorch.variational._VariationalDistribution`
diff --git a/gpytorch/variational/independent_multitask_variational_strategy.py b/gpytorch/variational/independent_multitask_variational_strategy.py
index 9fbb461c3..0e4c89b5b 100644
--- a/gpytorch/variational/independent_multitask_variational_strategy.py
+++ b/gpytorch/variational/independent_multitask_variational_strategy.py
@@ -24,7 +24,7 @@ class IndependentMultitaskVariationalStrategy(_VariationalStrategy):
     dimensions corresponds to the multiple tasks.
 
     :param ~gpytorch.variational.VariationalStrategy base_variational_strategy: Base variational strategy
-    :param int num_tasks: Number of tasks. Should correspond to the batch size of :attr:`task_dim`.
+    :param int num_tasks: Number of tasks. Should correspond to the batch size of task_dim.
     :param int task_dim: (Default: -1) Which batch dimension is the task dimension
     """
@@ -102,7 +102,7 @@ class MultitaskVariationalStrategy(IndependentMultitaskVariationalStrategy):
     dimensions corresponds to the multiple tasks.
 
     :param ~gpytorch.variational.VariationalStrategy base_variational_strategy: Base variational strategy
-    :param int num_tasks: Number of tasks. Should correspond to the batch size of :attr:`task_dim`.
+    :param int num_tasks: Number of tasks. Should correspond to the batch size of task_dim.
     :param int task_dim: (Default: -1) Which batch dimension is the task dimension
     """
diff --git a/gpytorch/variational/lmc_variational_strategy.py b/gpytorch/variational/lmc_variational_strategy.py
index 22657b535..7690c8fab 100644
--- a/gpytorch/variational/lmc_variational_strategy.py
+++ b/gpytorch/variational/lmc_variational_strategy.py
@@ -157,11 +157,11 @@ def __call__(self, x, task_indices=None, prior=False, **kwargs):
     There are two modes:
 
     1. Compute **all tasks** for all inputs.
-       If this is the case, the :attr:`task_indices` attribute should be None.
+       If this is the case, the task_indices attribute should be None.
        The return type will be a (... x N x num_tasks)
        :class:`~gpytorch.distributions.MultitaskMultivariateNormal`.
 
     2. Compute **one task** per inputs.
-       If this is the case, the (... x N) :attr:`task_indices` tensor should contain
+       If this is the case, the (... x N) task_indices tensor should contain
        the indices of each input's assigned task. The return type will be a (... x N)
        :class:`~gpytorch.distributions.MultivariateNormal`.
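To ground the two calling modes described for the LMC strategy, here is a compact sketch modeled on the usual GPyTorch multitask-SVGP pattern; the toy model, sizes, and tensors (`ToyLMCModel`, `num_latents`, `task_indices`) are hypothetical and the pass-through of ``task_indices`` via the model call assumes the standard `ApproximateGP.__call__` kwarg forwarding::

    # Mode 1 returns a MultitaskMultivariateNormal over all tasks; mode 2 takes a
    # task_indices tensor and returns a plain MultivariateNormal, one task per input.
    import torch
    import gpytorch

    num_latents, num_tasks = 3, 4

    class ToyLMCModel(gpytorch.models.ApproximateGP):
        def __init__(self):
            inducing_points = torch.rand(num_latents, 16, 1)
            variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
                inducing_points.size(-2), batch_shape=torch.Size([num_latents])
            )
            variational_strategy = gpytorch.variational.LMCVariationalStrategy(
                gpytorch.variational.VariationalStrategy(
                    self, inducing_points, variational_distribution, learn_inducing_locations=True
                ),
                num_tasks=num_tasks,
                num_latents=num_latents,
                latent_dim=-1,
            )
            super().__init__(variational_strategy)
            self.mean_module = gpytorch.means.ConstantMean(batch_shape=torch.Size([num_latents]))
            self.covar_module = gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.RBFKernel(batch_shape=torch.Size([num_latents])),
                batch_shape=torch.Size([num_latents]),
            )

        def forward(self, x):
            return gpytorch.distributions.MultivariateNormal(self.mean_module(x), self.covar_module(x))

    model = ToyLMCModel()
    x = torch.rand(10, 1)
    all_tasks = model(x)                                  # 10 x 4 MultitaskMultivariateNormal
    task_indices = torch.randint(0, num_tasks, (10,))
    one_task_each = model(x, task_indices=task_indices)   # length-10 MultivariateNormal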