Commit d8b6328
Fix KeOps regressions from #2296.
KernelLinearOperator was throwing errors when computing the diagonal of a KeOps kernel. (This computation happens during preconditioning, which requires the diagonal of the already-formed kernel LinearOperator object.) The error occurred because KeopsLinearOperator.diagonal calls to_dense on the output of a batch kernel operation, but to_dense is not defined for KeOps LazyTensors. This PR is in some sense a hacky fix for the bug (a less hacky fix will require changes to KernelLinearOperator), but it is also a generally helpful refactor that improves KeOps kernels across the board.

The fixes:

- KeOpsKernels now define only a forward function, which is used both when we want to use KeOps and when we want to bypass it.
- KeOpsKernels now use a `_lazify_and_expand_inputs` helper method, which (potentially) wraps the inputs as KeOps LazyTensors, or leaves the inputs as torch Tensors.
- The KeOps wrapping happens unless we want to bypass KeOps, which occurs when either (1) the matrix is small (below the max Cholesky size) or (2) the user has turned off the `gpytorch.settings.use_keops` option (*NEW IN THIS PR*).

Why this is beneficial:

- KeOps kernels now follow the same API as non-KeOps kernels (define a forward method).
- The user only has to define one forward method, which works in both the KeOps and non-KeOps cases.
- The `diagonal` call in KeopsLinearOperator constructs a batch 1x1 matrix, which is small enough to bypass KeOps and thus avoid the current bug. (This is why the solution is currently a hack; it could become less hacky with a small modification to KernelLinearOperator and/or the to_dense method in LinearOperator.)

Other changes:

- Fix stability issues with the KeOps MaternKernel. (There were some NaN issues.)
- Introduce a `gpytorch.settings.use_keops` feature flag (a usage sketch follows below).
- Clean up the KeOps notebook.

[Fixes #2363]
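For illustration, here is a minimal sketch of the new flag in action, assuming `use_keops` follows the same context-manager pattern as other `gpytorch.settings` flags (the data and kernel below are illustrative, not part of this PR):

```python
import torch
import gpytorch
from gpytorch.kernels.keops import MaternKernel

train_x = torch.randn(10_000, 3)  # large enough to exceed max_cholesky_size

# A KeOps kernel is constructed like any other GPyTorch kernel
covar_module = gpytorch.kernels.ScaleKernel(MaternKernel(nu=2.5))

# Assumption: the flag acts as a context manager, like other gpytorch
# settings; disabling it forces the plain-torch code path in forward.
with gpytorch.settings.use_keops(False):
    covar = covar_module(train_x)

# Matrices below settings.max_cholesky_size bypass KeOps automatically;
# the batch 1x1 matrices built by `diagonal` fall in this regime, which
# is what sidesteps the to_dense bug described above.
```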
1 parent 43383c2 commit d8b6328

File tree

8 files changed: +317 -349 lines changed

examples/02_Scalable_Exact_GPs/KeOps_GP_Regression.ipynb
+77 -78
@@ -17,22 +17,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import math\n",
     "import torch\n",
     "import gpytorch\n",
+    "import tqdm.notebook as tqdm\n",
     "from matplotlib import pyplot as plt\n",
     "\n",
     "%matplotlib inline\n",
@@ -45,22 +37,16 @@
    "metadata": {},
    "source": [
     "### Downloading Data\n",
-    "We will be using the 3droad UCI dataset which contains a total of 278,319 data points. The next cell will download this dataset from a Google drive and load it."
+    "We will be using the 3droad UCI dataset which contains a total of 434,874 data points. We will split the data in half for training and half for testing.\n",
+    "\n",
+    "The next cell will download this dataset from a Google drive and load it."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading '3droad' UCI dataset...\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import urllib.request\n",
     "import os.path\n",
@@ -76,15 +62,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Num train: 217437\n",
+      "Num test: 217437\n"
+     ]
+    }
+   ],
    "source": [
     "import numpy as np\n",
     "\n",
     "N = data.shape[0]\n",
     "# make train/val/test\n",
-    "n_train = int(0.8 * N)\n",
+    "n_train = int(0.5 * N)\n",
+    "\n",
     "train_x, train_y = data[:n_train, :-1], data[:n_train, -1]\n",
     "test_x, test_y = data[n_train:, :-1], data[n_train:, -1]\n",
     "\n",
@@ -106,7 +102,12 @@
     "output_device = torch.device('cuda:0')\n",
     "\n",
     "train_x, train_y = train_x.to(output_device), train_y.to(output_device)\n",
-    "test_x, test_y = test_x.to(output_device), test_y.to(output_device)"
+    "test_x, test_y = test_x.to(output_device), test_y.to(output_device)\n",
+    "\n",
+    "print(\n",
+    "    f\"Num train: {train_y.size(-1)}\\n\"\n",
+    "    f\"Num test: {test_y.size(-1)}\"\n",
+    ")"
    ]
   },
   {
@@ -120,7 +121,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -139,16 +140,36 @@
     "\n",
     "# initialize likelihood and model\n",
     "likelihood = gpytorch.likelihoods.GaussianLikelihood().cuda()\n",
-    "model = ExactGPModel(train_x, train_y, likelihood).cuda()"
+    "model = ExactGPModel(train_x, train_y, likelihood).cuda()\n",
+    "\n",
+    "# Because we know some properties about this dataset,\n",
+    "# we will initialize the lengthscale to be somewhat small\n",
+    "# This step isn't necessary, but it will help the model converge faster.\n",
+    "model.covar_module.base_kernel.lengthscale = 0.05"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {
     "scrolled": false
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "691194d2d51e4d389fef9f0f7cb34f6b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Training: 0%| | 0/25 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "# Find optimal model hyperparameters\n",
     "model.train()\n",
@@ -158,64 +179,44 @@
     "optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # Includes GaussianLikelihood parameters\n",
     "\n",
     "# \"Loss\" for GPs - the marginal log likelihood\n",
-    "mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n",
+    "mll = gpytorch.mlls.ExactMarginalLogLikelihood(model.likelihood, model)\n",
     "\n",
     "import time\n",
-    "training_iter = 50\n",
-    "for i in range(training_iter):\n",
+    "training_iter = 25\n",
+    "iterator = tqdm.tqdm(range(training_iter), desc=\"Training\")\n",
+    "for i in iterator:\n",
     "    start_time = time.time()\n",
    "    # Zero gradients from previous iteration\n",
     "    optimizer.zero_grad()\n",
     "    # Output from model\n",
     "    output = model(train_x)\n",
     "    # Calc loss and backprop gradients\n",
     "    loss = -mll(output, train_y)\n",
+    "    print_values = dict(\n",
+    "        loss=loss.item(),\n",
+    "        ls=model.covar_module.base_kernel.lengthscale.norm().item(),\n",
+    "        os=model.covar_module.outputscale.item(),\n",
+    "        noise=model.likelihood.noise.item(),\n",
+    "        mu=model.mean_module.constant.item(),\n",
+    "    )\n",
+    "    iterator.set_postfix(**print_values)\n",
     "    loss.backward()\n",
-    "    print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (\n",
-    "        i + 1, training_iter, loss.item(),\n",
-    "        model.covar_module.base_kernel.lengthscale.item(),\n",
-    "        model.likelihood.noise.item()\n",
-    "    ))\n",
-    "    optimizer.step()\n",
-    "    print(time.time() - start_time)"
+    "    optimizer.step()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Compiling libKeOpstorchd7ba409487 in /home/jake.gardner/.cache/pykeops-1.1.1//build-libKeOpstorchd7ba409487:\n",
-      "  formula: Sum_Reduction(((((Var(0,1,2) * Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1)))))) + (IntCst(1) + (Var(3,1,2) * Square(Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1))))))))) * Exp((Var(4,1,2) * Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1)))))))) * Var(5,3320,1)),0)\n",
-      "  aliases: Var(0,1,2); Var(1,18,0); Var(2,18,1); Var(3,1,2); Var(4,1,2); Var(5,3320,1); \n",
-      "  dtype : float32\n",
-      "... Done.\n",
-      "Compiling libKeOpstorch7385e76d34 in /home/jake.gardner/.cache/pykeops-1.1.1//build-libKeOpstorch7385e76d34:\n",
-      "  formula: Sum_Reduction(((((Var(0,1,2) * Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1)))))) + (IntCst(1) + (Var(3,1,2) * Square(Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1))))))))) * Exp((Var(4,1,2) * Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1)))))))) * Var(5,1,1)),0)\n",
-      "  aliases: Var(0,1,2); Var(1,18,0); Var(2,18,1); Var(3,1,2); Var(4,1,2); Var(5,1,1); \n",
-      "  dtype : float32\n",
-      "... Done.\n",
-      "Compiling libKeOpstorch97105370ea in /home/jake.gardner/.cache/pykeops-1.1.1//build-libKeOpstorch97105370ea:\n",
-      "  formula: Sum_Reduction(((((Var(0,1,2) * Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1)))))) + (IntCst(1) + (Var(3,1,2) * Square(Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1))))))))) * Exp((Var(4,1,2) * Sqrt(Sum(Square((Var(1,18,0) - Var(2,18,1)))))))) * Var(5,100,1)),0)\n",
-      "  aliases: Var(0,1,2); Var(1,18,0); Var(2,18,1); Var(3,1,2); Var(4,1,2); Var(5,100,1); \n",
-      "  dtype : float32\n",
-      "... Done.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Get into evaluation (predictive posterior) mode\n",
     "model.eval()\n",
-    "likelihood.eval()\n",
     "\n",
     "# Test points are regularly spaced along [0,1]\n",
     "# Make predictions by feeding model through likelihood\n",
     "with torch.no_grad(), gpytorch.settings.fast_pred_var():\n",
-    "    observed_pred = likelihood(model(test_x))"
+    "    observed_pred = model.likelihood(model(test_x))"
    ]
   },
   {
@@ -227,29 +228,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "tensor(0.1068, device='cuda:0')"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "RMSE: 0.138\n"
+     ]
     }
    ],
    "source": [
-    "torch.sqrt(torch.mean(torch.pow(observed_pred.mean - test_y, 2)))"
+    "rmse = (observed_pred.mean - test_y).square().mean().sqrt().item()\n",
+    "print(f\"RMSE: {rmse:.3f}\")"
    ]
   }
  ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
-  "display_name": "Python 3",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -263,7 +262,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
-  "version": "3.7.1"
+  "version": "3.8.0"
  }
 },
 "nbformat": 4,

gpytorch/kernels/keops/__init__.py
+2 -1
@@ -1,5 +1,6 @@
+from .keops_kernel import KeOpsKernel
 from .matern_kernel import MaternKernel
 from .periodic_kernel import PeriodicKernel
 from .rbf_kernel import RBFKernel
 
-__all__ = ["MaternKernel", "RBFKernel", "PeriodicKernel"]
+__all__ = ["KeOpsKernel", "MaternKernel", "PeriodicKernel", "RBFKernel"]
gpytorch/kernels/keops/keops_kernel.py
+45 -29
@@ -1,48 +1,64 @@
-from abc import abstractmethod
-from typing import Any
+import warnings
+from typing import Any, Tuple, Union
 
 import torch
+from linear_operator import LinearOperator
 from torch import Tensor
 
 from ... import settings
 from ..kernel import Kernel
 
 try:
     import pykeops  # noqa F401
+    from pykeops.torch import LazyTensor
+
+    def _lazify_and_expand_inputs(
+        x1: Tensor, x2: Tensor
+    ) -> Tuple[Union[Tensor, LazyTensor], Union[Tensor, LazyTensor]]:
+        r"""
+        Potentially wrap inputs x1 and x2 as KeOps LazyTensors,
+        depending on whether or not we want to use KeOps under the hood.
+        """
+        x1_ = x1[..., :, None, :]
+        x2_ = x2[..., None, :, :]
+        if _use_keops(x1, x2):
+            res = LazyTensor(x1_), LazyTensor(x2_)
+            return res
+        return x1_, x2_
+
+    def _use_keops(x1: Tensor, x2: Tensor) -> bool:
+        r"""
+        Determine whether or not to use KeOps under the hood.
+        This largely depends on the size of the kernel matrix.
+
+        There are situations where we do not want the KeOps linear operator to use KeOps under the hood.
+        See https://github.com/cornellius-gp/gpytorch/pull/1319
+        """
+        return (
+            settings.use_keops.on()
+            and x1.size(-2) >= settings.max_cholesky_size.value()
+            and x2.size(-2) >= settings.max_cholesky_size.value()
+        )
 
     class KeOpsKernel(Kernel):
-        @abstractmethod
-        def _nonkeops_forward(self, x1: Tensor, x2: Tensor, diag: bool = False, **kwargs: Any):
-            r"""
-            Computes the covariance matrix (or diagonal) without using KeOps.
-            This function must implement both the diag=True and diag=False options.
-            """
-            raise NotImplementedError
-
-        @abstractmethod
-        def _keops_forward(self, x1: Tensor, x2: Tensor, **kwargs: Any):
-            r"""
-            Computes the covariance matrix with KeOps.
-            This function only implements the diag=False option, and no diag keyword should be passed in.
-            """
-            raise NotImplementedError
-
-        def forward(self, x1: Tensor, x2: Tensor, diag: bool = False, **kwargs: Any):
-            if diag:
-                return self._nonkeops_forward(x1, x2, diag=True, **kwargs)
-            elif x1.size(-2) < settings.max_cholesky_size.value() or x2.size(-2) < settings.max_cholesky_size.value():
-                return self._nonkeops_forward(x1, x2, diag=False, **kwargs)
-            else:
-                return self._keops_forward(x1, x2, **kwargs)
-
-        def __call__(self, *args: Any, **kwargs: Any):
+        def __call__(self, *args: Any, **kwargs: Any) -> Union[LinearOperator, Tensor, LazyTensor]:
             # Hotfix for zero gradients. See https://github.com/cornellius-gp/gpytorch/issues/1543
             args = [arg.contiguous() if torch.is_tensor(arg) else arg for arg in args]
             kwargs = {k: v.contiguous() if torch.is_tensor(v) else v for k, v in kwargs.items()}
             return super().__call__(*args, **kwargs)
 
 except ImportError:
 
+    def _lazify_and_expand_inputs(x1: Tensor, x2: Tensor) -> Tuple[Tensor, Tensor]:
+        return x1, x2
+
+    def _use_keops(x1: Tensor, x2: Tensor) -> bool:
+        return False
+
     class KeOpsKernel(Kernel):
-        def __init__(self, *args: Any, **kwargs: Any):
-            raise RuntimeError("You must have KeOps installed to use a KeOpsKernel")
+        def __call__(self, *args: Any, **kwargs: Any) -> Union[LinearOperator, Tensor]:
+            warnings.warn(
+                "KeOps is not installed. " f"{type(self)} will revert to the non-keops version of this kernel.",
+                RuntimeWarning,
+            )
+            return super().__call__(*args, **kwargs)

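With this refactor, a concrete KeOps kernel needs only a single forward that runs on both code paths. A minimal sketch of such a subclass (this `ToyRBFKernel` is illustrative, not the RBFKernel shipped in this PR; diag handling is omitted):

```python
from torch import Tensor

from gpytorch.kernels.keops.keops_kernel import KeOpsKernel, _lazify_and_expand_inputs


class ToyRBFKernel(KeOpsKernel):
    has_lengthscale = True

    def forward(self, x1: Tensor, x2: Tensor, **kwargs) -> Tensor:
        x1_ = x1 / self.lengthscale
        x2_ = x2 / self.lengthscale
        # Broadcast to (... x n x 1 x d) and (... x 1 x m x d); wrapped as
        # KeOps LazyTensors only when _use_keops allows it (matrix is large
        # and settings.use_keops is on), else left as plain torch Tensors.
        x1_, x2_ = _lazify_and_expand_inputs(x1_, x2_)
        # The same arithmetic works on Tensors and LazyTensors alike.
        sq_dist = ((x1_ - x2_) ** 2).sum(-1)
        return (-0.5 * sq_dist).exp()
```

Because both branches of the module define `_lazify_and_expand_inputs` and `_use_keops` (the non-KeOps branch as no-op fallbacks), the same forward also works when pykeops is not installed.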