Two questions about the likelihood in ExactGPs, and fitting GPRs that smoothly account for noise
#2094
In issue #2092 I had a couple of questions about the difference between the posterior `model(x)` and the posterior predictive `likelihood(model(x))`. First, I'm curious why we have to call `likelihood(model(x))` rather than just `model(x)` when making predictions. Second, I was expecting samples from `model(x)` to account for the observation noise, but they don't. Do you have any tips for building a model whose posterior samples account for the noise? And do you have any references you could point me to to better understand the difference between noise modelling and the likelihood? Thanks!

For completeness, here's the code used to create the image above.

```python
"""
Just a slightly modified copy of
https://docs.gpytorch.ai/en/latest/examples/01_Exact_GPs/Simple_GP_Regression.html
"""
import numpy as np
import torch
from matplotlib import pyplot as plt

import gpytorch

noise_level = 1e-1
train_x = torch.linspace(0, 2 * np.pi, 100)
train_y = torch.sin(train_x) + noise_level * torch.randn(train_x.size())


# Everything's the same as in the tutorial, except
# we add a prior to encourage high lengthscales.
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y):
        likelihood = gpytorch.likelihoods.GaussianLikelihood()
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(
                lengthscale_prior=gpytorch.priors.GammaPrior(10.0, 1.0)
            )
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


model = ExactGPModel(train_x, train_y)

# Training
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(model.likelihood, model)
for i in range(100):
    optimizer.zero_grad()
    output = model(train_x)
    loss = -mll(output, train_y)
    loss.backward()
    optimizer.step()

# Prediction
model.eval()
new_inputs = torch.linspace(0, 2 * np.pi, 50)
pred1 = model(new_inputs)
pred2 = model.likelihood(model(new_inputs))

# Plot the predictive means, the training data, and a few samples from each distribution
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(2 * 7, 7))
ax1.plot(new_inputs.detach().numpy(), pred1.mean.detach().numpy())
ax2.plot(new_inputs.detach().numpy(), pred2.mean.detach().numpy())
ax1.plot(train_x.detach().numpy(), train_y.detach().numpy(), "*k", alpha=0.2)
ax2.plot(train_x.detach().numpy(), train_y.detach().numpy(), "*k", alpha=0.2)
for _ in range(5):
    sample1 = pred1.sample().detach().numpy()
    ax1.plot(new_inputs.detach().numpy(), sample1, "--")
    sample2 = pred2.sample().detach().numpy()
    ax2.plot(new_inputs.detach().numpy(), sample2, "--")
ax1.set_title("Samples from model(new_data)")
ax2.set_title("Samples from likelihood(model(new_data))")
plt.tight_layout()
plt.show()
plt.close()
```
`model(x)` returns the posterior distribution

f(x*) | y ~ N( k^* (K + \sigma^2 I)^{-1} y,  k^{**} - k^* (K + \sigma^2 I)^{-1} k^{*T} )

`likelihood(model(x))` returns the posterior predictive distribution

y(x*) | y ~ N( k^* (K + \sigma^2 I)^{-1} y,  k^{**} - k^* (K + \sigma^2 I)^{-1} k^{*T} + \sigma^2 )

The difference between them is an additional \sigma^2 variance term. This is because y(x*) | y = f(x*) | y + \epsilon, where \epsilon ~ N(0, \sigma^2) is the observational noise.
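To make the extra \sigma^2 term concrete, here is a minimal sketch (assuming the `model` and likelihood trained in the code above; `test_x` is just an illustrative grid, and the `fast_pred_var` setting is optional) that compares the two predictive variances and draws samples that include the observation noise:

```python
# Minimal sketch, assuming `model` (with its GaussianLikelihood) has been trained as in the code above.
model.eval()

test_x = torch.linspace(0, 2 * np.pi, 50)

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    f_post = model(test_x)             # posterior over the latent function f(x*)
    y_post = model.likelihood(f_post)  # posterior predictive over the observations y(x*)

    # The predictive variance differs from the latent variance by the learned noise sigma^2.
    noise = model.likelihood.noise     # learned observational noise variance
    print(torch.allclose(y_post.variance, f_post.variance + noise, atol=1e-5))

    # Samples that account for the observation noise come from the posterior predictive.
    noisy_samples = y_post.sample(torch.Size([5]))  # shape: (5, 50)
```

So if you want posterior samples that account for the noise, sample from `likelihood(model(x))`; samples from `model(x)` are draws of the latent function f and will always look smooth.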