diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index ccde13925bc..f3035fe825e 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -45,9 +45,16 @@ All of the above apply to: - Changes to the BART implementation: - A BART variable can be combined with other random variables. The `inv_link` argument has been removed (see [4914](https://github.com/pymc-devs/pymc3/pull/4914)). - Moved BART to its own module (see [5058](https://github.com/pymc-devs/pymc3/pull/5058)). +- Changes to the Gaussian Process (GP) submodule (see [5055](https://github.com/pymc-devs/pymc/pull/5055)): + - For all implementations, `gp.Latent`, `gp.Marginal`, etc., `cov_func` and `mean_func` must now be passed as keyword arguments. + - In the Windows test conda environment, the `mkl` version is fixed to 2020.4 and `mkl-service` is fixed to `2.3.0`. This was required for `gp.MarginalKron` to function properly. + - `gp.TP`, the Student's T process, now uses rotated samples from `pm.StudentT` directly, instead of sampling from `pm.Chi2` and then from `pm.Normal`. + - The "jitter" parameter (the diagonal noise term added to Gram matrices so that the Cholesky factorization is numerically stable) is now exposed to the user instead of being hard-coded. See the function `gp.util.stabilize`. + - The `is_observed` argument for `gp.Marginal*` implementations has been deprecated. + - In `gp.util`, the `kmeans_inducing_points` function now passes `kmeans_kwargs` through to scipy's k-means function. + - The function `replace_with_values` has been added to `gp.util`. - ... - ### Expected breaks + New API was already available in `v3`. + Old API had deprecation warnings since at least `3.11.0` (2021-01). diff --git a/conda-envs/windows-environment-test-py38.yml b/conda-envs/windows-environment-test-py38.yml index 5a1f3629c2e..79f0d8c599b 100644 --- a/conda-envs/windows-environment-test-py38.yml +++ b/conda-envs/windows-environment-test-py38.yml @@ -12,7 +12,8 @@ dependencies: - fastprogress>=0.2.0 - h5py>=2.7 - libpython -- mkl-service +- mkl==2020.4 +- mkl-service==2.3.0 - m2w64-toolchain - numpy>=1.15.0 - pandas diff --git a/pymc/gp/cov.py b/pymc/gp/cov.py index ccbbe59b81e..1a19edb2bbf 100644 --- a/pymc/gp/cov.py +++ b/pymc/gp/cov.py @@ -64,7 +64,7 @@ def __init__(self, input_dim, active_dims=None): if active_dims is None: self.active_dims = np.arange(input_dim) else: - self.active_dims = np.asarray(active_dims, np.int) + self.active_dims = np.asarray(active_dims, int) def __call__(self, X, Xs=None, diag=False): r""" @@ -152,7 +152,9 @@ def __array_wrap__(self, result): elif isinstance(result[0][0], Prod): return result[0][0].factor_list[0] * A else: - raise RuntimeError + raise TypeError( + f"Unknown Covariance combination type {result[0][0]}. Known types are `Add` or `Prod`." + ) class Combination(Covariance): diff --git a/pymc/gp/gp.py b/pymc/gp/gp.py index 0ed4d719f9d..b8a76519598 100644 --- a/pymc/gp/gp.py +++ b/pymc/gp/gp.py @@ -24,9 +24,11 @@ from pymc.gp.cov import Constant, Covariance from pymc.gp.mean import Zero from pymc.gp.util import ( + JITTER_DEFAULT, cholesky, conditioned_vars, - infer_shape, + infer_size, + replace_with_values, solve_lower, solve_upper, stabilize, @@ -41,7 +43,7 @@ class Base: Base class.
""" - def __init__(self, mean_func=Zero(), cov_func=Constant(0.0)): + def __init__(self, *, mean_func=Zero(), cov_func=Constant(0.0)): self.mean_func = mean_func self.cov_func = cov_func @@ -51,7 +53,7 @@ def __add__(self, other): raise TypeError("Cannot add different GP types") mean_total = self.mean_func + other.mean_func cov_total = self.cov_func + other.cov_func - return self.__class__(mean_total, cov_total) + return self.__class__(mean_func=mean_total, cov_func=cov_total) def prior(self, name, X, *args, **kwargs): raise NotImplementedError @@ -62,7 +64,7 @@ def marginal_likelihood(self, name, X, *args, **kwargs): def conditional(self, name, Xnew, *args, **kwargs): raise NotImplementedError - def predict(self, Xnew, point=None, given=None, diag=False): + def predict(self, Xnew, point=None, given=None, diag=False, model=None): raise NotImplementedError @@ -122,21 +124,21 @@ class Latent(Base): fcond = gp.conditional("fcond", Xnew=Xnew) """ - def __init__(self, mean_func=Zero(), cov_func=Constant(0.0)): - super().__init__(mean_func, cov_func) + def __init__(self, *, mean_func=Zero(), cov_func=Constant(0.0)): + super().__init__(mean_func=mean_func, cov_func=cov_func) - def _build_prior(self, name, X, reparameterize=True, **kwargs): + def _build_prior(self, name, X, reparameterize=True, jitter=JITTER_DEFAULT, **kwargs): mu = self.mean_func(X) - cov = stabilize(self.cov_func(X)) - shape = infer_shape(X, kwargs.pop("shape", None)) + cov = stabilize(self.cov_func(X), jitter) if reparameterize: - v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=shape, **kwargs) + size = infer_size(X, kwargs.pop("size", None)) + v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=size, **kwargs) f = pm.Deterministic(name, mu + cholesky(cov).dot(v)) else: - f = pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + f = pm.MvNormal(name, mu=mu, cov=cov, **kwargs) return f - def prior(self, name, X, reparameterize=True, **kwargs): + def prior(self, name, X, reparameterize=True, jitter=JITTER_DEFAULT, **kwargs): R""" Returns the GP prior distribution evaluated over the input locations `X`. @@ -157,11 +159,14 @@ def prior(self, name, X, reparameterize=True, **kwargs): reparameterize: bool Reparameterize the distribution by rotating the random variable by the Cholesky factor of the covariance matrix. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. Default value is 1e-6. **kwargs Extra keyword arguments that are passed to distribution constructor. 
""" - f = self._build_prior(name, X, reparameterize, **kwargs) + f = self._build_prior(name, X, reparameterize, jitter, **kwargs) self.X = X self.f = f return f @@ -181,10 +186,10 @@ def _get_given_vals(self, given): X, f = self.X, self.f return X, f, cov_total, mean_total - def _build_conditional(self, Xnew, X, f, cov_total, mean_total): + def _build_conditional(self, Xnew, X, f, cov_total, mean_total, jitter): Kxx = cov_total(X) Kxs = self.cov_func(X, Xnew) - L = cholesky(stabilize(Kxx)) + L = cholesky(stabilize(Kxx, jitter)) A = solve_lower(L, Kxs) v = solve_lower(L, f - mean_total(X)) mu = self.mean_func(Xnew) + at.dot(at.transpose(A), v) @@ -192,7 +197,7 @@ def _build_conditional(self, Xnew, X, f, cov_total, mean_total): cov = Kss - at.dot(at.transpose(A), A) return mu, cov - def conditional(self, name, Xnew, given=None, **kwargs): + def conditional(self, name, Xnew, given=None, jitter=0.0, **kwargs): R""" Returns the conditional distribution evaluated over new input locations `Xnew`. @@ -217,14 +222,17 @@ def conditional(self, name, Xnew, given=None, **kwargs): Can optionally take as key value pairs: `X`, `y`, `noise`, and `gp`. See the section in the documentation on additive GP models in PyMC for more information. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. For conditionals + the default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. """ givens = self._get_given_vals(given) - mu, cov = self._build_conditional(Xnew, *givens) - shape = infer_shape(Xnew, kwargs.pop("shape", None)) - return pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + mu, cov = self._build_conditional(Xnew, *givens, jitter) + return pm.MvNormal(name, mu=mu, cov=cov, **kwargs) @conditioned_vars(["X", "f", "nu"]) @@ -257,28 +265,27 @@ class TP(Latent): Processes as Alternatives to Gaussian Processes. arXiv preprint arXiv:1402.4306. 
""" - def __init__(self, mean_func=Zero(), cov_func=Constant(0.0), nu=None): + def __init__(self, *, mean_func=Zero(), cov_func=Constant(0.0), nu=None): if nu is None: raise ValueError("Student's T process requires a degrees of freedom parameter, 'nu'") self.nu = nu - super().__init__(mean_func, cov_func) + super().__init__(mean_func=mean_func, cov_func=cov_func) def __add__(self, other): raise TypeError("Student's T processes aren't additive") - def _build_prior(self, name, X, reparameterize=True, **kwargs): + def _build_prior(self, name, X, reparameterize=True, jitter=JITTER_DEFAULT, **kwargs): mu = self.mean_func(X) - cov = stabilize(self.cov_func(X)) - shape = infer_shape(X, kwargs.pop("shape", None)) + cov = stabilize(self.cov_func(X), jitter) if reparameterize: - chi2 = pm.ChiSquared(name + "_chi2_", self.nu) - v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=shape, **kwargs) - f = pm.Deterministic(name, (at.sqrt(self.nu) / chi2) * (mu + cholesky(cov).dot(v))) + size = infer_size(X, kwargs.pop("size", None)) + v = pm.StudentT(name + "_rotated_", mu=0.0, sigma=1.0, nu=self.nu, size=size, **kwargs) + f = pm.Deterministic(name, mu + cholesky(cov).dot(v)) else: - f = pm.MvStudentT(name, nu=self.nu, mu=mu, cov=cov, size=shape, **kwargs) + f = pm.MvStudentT(name, nu=self.nu, mu=mu, cov=cov, **kwargs) return f - def prior(self, name, X, reparameterize=True, **kwargs): + def prior(self, name, X, reparameterize=True, jitter=JITTER_DEFAULT, **kwargs): R""" Returns the TP prior distribution evaluated over the input locations `X`. @@ -299,16 +306,16 @@ def prior(self, name, X, reparameterize=True, **kwargs): Extra keyword arguments that are passed to distribution constructor. """ - f = self._build_prior(name, X, reparameterize, **kwargs) + f = self._build_prior(name, X, reparameterize, jitter, **kwargs) self.X = X self.f = f return f - def _build_conditional(self, Xnew, X, f): + def _build_conditional(self, Xnew, X, f, jitter): Kxx = self.cov_func(X) Kxs = self.cov_func(X, Xnew) Kss = self.cov_func(Xnew) - L = cholesky(stabilize(Kxx)) + L = cholesky(stabilize(Kxx, jitter)) A = solve_lower(L, Kxs) cov = Kss - at.dot(at.transpose(A), A) v = solve_lower(L, f - self.mean_func(X)) @@ -318,7 +325,7 @@ def _build_conditional(self, Xnew, X, f): covT = (self.nu + beta - 2) / (nu2 - 2) * cov return nu2, mu, covT - def conditional(self, name, Xnew, **kwargs): + def conditional(self, name, Xnew, jitter=0.0, **kwargs): R""" Returns the conditional distribution evaluated over new input locations `Xnew`. @@ -333,6 +340,10 @@ def conditional(self, name, Xnew, **kwargs): Name of the random variable Xnew: array-like Function input values. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. For conditionals + the default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. 
@@ -340,9 +351,8 @@ def conditional(self, name, Xnew, **kwargs): X = self.X f = self.f - nu2, mu, cov = self._build_conditional(Xnew, X, f) - shape = infer_shape(Xnew, kwargs.pop("shape", None)) - return pm.MvStudentT(name, nu=nu2, mu=mu, cov=cov, size=shape, **kwargs) + nu2, mu, cov = self._build_conditional(Xnew, X, f, jitter) + return pm.MvStudentT(name, nu=nu2, mu=mu, cov=cov, **kwargs) @conditioned_vars(["X", "y", "noise"]) @@ -391,14 +401,14 @@ class Marginal(Base): fcond = gp.conditional("fcond", Xnew=Xnew) """ - def _build_marginal_likelihood(self, X, noise): + def _build_marginal_likelihood(self, X, noise, jitter): mu = self.mean_func(X) Kxx = self.cov_func(X) Knx = noise(X) cov = Kxx + Knx - return mu, cov + return mu, stabilize(cov, jitter) - def marginal_likelihood(self, name, X, y, noise, is_observed=True, **kwargs): + def marginal_likelihood(self, name, X, y, noise, jitter=0.0, is_observed=True, **kwargs): R""" Returns the marginal likelihood distribution, given the input locations `X` and the data `y`. @@ -422,9 +432,9 @@ def marginal_likelihood(self, name, X, y, noise, is_observed=True, **kwargs): noise: scalar, Variable, or Covariance Standard deviation of the Gaussian noise. Can also be a Covariance for non-white noise. - is_observed: bool - Whether to set `y` as an `observed` variable in the `model`. - Default is `True`. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. Default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. @@ -432,15 +442,19 @@ def marginal_likelihood(self, name, X, y, noise, is_observed=True, **kwargs): if not isinstance(noise, Covariance): noise = pm.gp.cov.WhiteNoise(noise) - mu, cov = self._build_marginal_likelihood(X, noise) + mu, cov = self._build_marginal_likelihood(X, noise, jitter) self.X = X self.y = y self.noise = noise if is_observed: return pm.MvNormal(name, mu=mu, cov=cov, observed=y, **kwargs) else: - shape = infer_shape(X, kwargs.pop("shape", None)) - return pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + warnings.warn( + "The 'is_observed' argument has been deprecated. If the GP is " + "unobserved use gp.Latent instead.", + FutureWarning, + ) + return pm.MvNormal(name, mu=mu, cov=cov, **kwargs) def _get_given_vals(self, given): if given is None: @@ -460,12 +474,14 @@ def _get_given_vals(self, given): X, y, noise = self.X, self.y, self.noise return X, y, noise, cov_total, mean_total - def _build_conditional(self, Xnew, pred_noise, diag, X, y, noise, cov_total, mean_total): + def _build_conditional( + self, Xnew, pred_noise, diag, X, y, noise, cov_total, mean_total, jitter + ): Kxx = cov_total(X) Kxs = self.cov_func(X, Xnew) Knx = noise(X) rxx = y - mean_total(X) - L = cholesky(stabilize(Kxx) + Knx) + L = cholesky(stabilize(Kxx, jitter) + Knx) A = solve_lower(L, Kxs) v = solve_lower(L, rxx) mu = self.mean_func(Xnew) + at.dot(at.transpose(A), v) @@ -480,9 +496,9 @@ def _build_conditional(self, Xnew, pred_noise, diag, X, y, noise, cov_total, mea cov = Kss - at.dot(at.transpose(A), A) if pred_noise: cov += noise(Xnew) - return mu, cov if pred_noise else stabilize(cov) + return mu, cov if pred_noise else stabilize(cov, jitter) - def conditional(self, name, Xnew, pred_noise=False, given=None, **kwargs): + def conditional(self, name, Xnew, pred_noise=False, given=None, jitter=0.0, **kwargs): R""" Returns the conditional distribution evaluated over new input locations `Xnew`. 
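As an aside for readers of this patch, here is a minimal usage sketch of the reworked `TP` and `Marginal` APIs exercised in the hunks above. It is illustrative only and not part of the diff; the data, lengthscales, `nu`, noise level, and the `find_MAP` step are placeholder choices.

import numpy as np
import pymc as pm

X = np.random.randn(50, 3)    # placeholder training inputs
y = np.random.randn(50)       # placeholder observations
Xnew = np.random.randn(20, 3) # placeholder prediction inputs

# Student's T process: keyword-only constructor, prior built from a rotated pm.StudentT
with pm.Model():
    tp = pm.gp.TP(cov_func=pm.gp.cov.ExpQuad(3, ls=0.5), nu=5)
    f = tp.prior("f", X=X)
    # jitter for conditionals defaults to 0.0 and can be overridden
    fcond = tp.conditional("fcond", Xnew=Xnew, jitter=1e-6)

# Marginal GP: keyword-only constructor, exposed jitter, refactored predict
with pm.Model() as model:
    gp = pm.gp.Marginal(cov_func=pm.gp.cov.ExpQuad(3, ls=[0.1, 0.2, 0.3]))
    # is_observed is deprecated; unobserved GPs should use gp.Latent instead
    y_ = gp.marginal_likelihood("y", X=X, y=y, noise=0.1, jitter=1e-6)
    pcond = gp.conditional("pcond", Xnew=Xnew)

mp = pm.find_MAP(model=model)
mu, cov = gp.predict(Xnew, point=mp, model=model)

The `predict` call resolves the symbolic mean and covariance through the new `replace_with_values` helper rather than the removed `draw_values` path.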
@@ -510,17 +526,22 @@ def conditional(self, name, Xnew, pred_noise=False, given=None, **kwargs): Can optionally take as key value pairs: `X`, `y`, `noise`, and `gp`. See the section in the documentation on additive GP models in PyMC for more information. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. For conditionals + the default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. """ givens = self._get_given_vals(given) - mu, cov = self._build_conditional(Xnew, pred_noise, False, *givens) - shape = infer_shape(Xnew, kwargs.pop("shape", None)) - return pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + mu, cov = self._build_conditional(Xnew, pred_noise, False, *givens, jitter) + return pm.MvNormal(name, mu=mu, cov=cov, **kwargs) - def predict(self, Xnew, point=None, diag=False, pred_noise=False, given=None): + def predict( + self, Xnew, point=None, diag=False, pred_noise=False, given=None, jitter=0.0, model=None + ): R""" Return the mean vector and covariance matrix of the conditional distribution as numpy arrays, given a `point`, such as the MAP @@ -541,16 +562,17 @@ def predict(self, Xnew, point=None, diag=False, pred_noise=False, given=None): Default is `False`. given: dict Same as `conditional` method. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. For conditionals + the default value is 0.0. """ if given is None: given = {} + mu, cov = self._predict_at(Xnew, diag, pred_noise, given, jitter) + return replace_with_values([mu, cov], replacements=point, model=model) - mu, cov = self.predictt(Xnew, diag, pred_noise, given) - # XXX: This needs to be refactored - # return draw_values([mu, cov], point=point) - return None, None - - def predictt(self, Xnew, diag=False, pred_noise=False, given=None): + def _predict_at(self, Xnew, diag=False, pred_noise=False, given=None, jitter=0.0): R""" Return the mean vector and covariance matrix of the conditional distribution as symbolic variables. @@ -570,7 +592,7 @@ def predictt(self, Xnew, diag=False, pred_noise=False, given=None): Same as `conditional` method. """ givens = self._get_given_vals(given) - mu, cov = self._build_conditional(Xnew, pred_noise, diag, *givens) + mu, cov = self._build_conditional(Xnew, pred_noise, diag, *givens, jitter) return mu, cov @@ -643,7 +665,7 @@ def __init__(self, mean_func=Zero(), cov_func=Constant(0.0), approx="FITC"): if approx not in self._available_approx: raise NotImplementedError(approx) self.approx = approx - super().__init__(mean_func, cov_func) + super().__init__(mean_func=mean_func, cov_func=cov_func) def __add__(self, other): # new_gp will default to FITC approx @@ -657,11 +679,11 @@ def __add__(self, other): # Use y as first argument, so that we can use functools.partial # in marginal_likelihood instead of lambda. This makes pickling # possible. 
- def _build_marginal_likelihood_logp(self, y, X, Xu, sigma): + def _build_marginal_likelihood_logp(self, y, X, Xu, sigma, jitter): sigma2 = at.square(sigma) Kuu = self.cov_func(Xu) Kuf = self.cov_func(Xu, X) - Luu = cholesky(stabilize(Kuu)) + Luu = cholesky(stabilize(Kuu, jitter)) A = solve_lower(Luu, Kuf) Qffd = at.sum(A * A, 0) if self.approx == "FITC": @@ -686,7 +708,9 @@ def _build_marginal_likelihood_logp(self, y, X, Xu, sigma): quadratic = 0.5 * (at.dot(r, r_l) - at.dot(c, c)) return -1.0 * (constant + logdet + quadratic + trace) - def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kwargs): + def marginal_likelihood( + self, name, X, Xu, y, noise=None, is_observed=True, jitter=0.0, **kwargs + ): R""" Returns the approximate marginal likelihood distribution, given the input locations `X`, inducing point locations `Xu`, data `y`, and white noise @@ -709,6 +733,9 @@ def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kw is_observed: bool Whether to set `y` as an `observed` variable in the `model`. Default is `True`. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. Default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. @@ -717,24 +744,19 @@ def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kw self.X = X self.Xu = Xu self.y = y + if noise is None: - sigma = kwargs.get("sigma") - if sigma is None: - raise ValueError("noise argument must be specified") - else: - self.sigma = sigma - warnings.warn( - "The 'sigma' argument has been deprecated. Use 'noise' instead.", - FutureWarning, - ) + raise ValueError("noise argument must be specified") else: self.sigma = noise + if is_observed: return pm.DensityDist( name, X, Xu, self.sigma, + jitter, logp=self._build_marginal_likelihood_logp, observed=y, ndims_params=[2, 2, 0], @@ -742,11 +764,17 @@ def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kw **kwargs, ) else: + warnings.warn( + "The 'is_observed' argument has been deprecated. 
If the GP is " + "unobserved use gp.Latent instead.", + FutureWarning, + ) return pm.DensityDist( name, X, Xu, self.sigma, + jitter, logp=self._build_marginal_likelihood_logp, observed=y, ndims_params=[2, 2, 0], @@ -755,11 +783,13 @@ def marginal_likelihood(self, name, X, Xu, y, noise=None, is_observed=True, **kw **kwargs, ) - def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total): + def _build_conditional( + self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total, jitter + ): sigma2 = at.square(sigma) Kuu = cov_total(Xu) Kuf = cov_total(Xu, X) - Luu = cholesky(stabilize(Kuu)) + Luu = cholesky(stabilize(Kuu, jitter)) A = solve_lower(Luu, Kuf) Qffd = at.sum(A * A, 0) if self.approx == "FITC": @@ -786,7 +816,7 @@ def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, cov = self.cov_func(Xnew) - at.dot(at.transpose(As), As) + at.dot(at.transpose(C), C) if pred_noise: cov += sigma2 * at.identity_like(cov) - return mu, cov if pred_noise else stabilize(cov) + return mu, cov if pred_noise else stabilize(cov, jitter) def _get_given_vals(self, given): if given is None: @@ -803,7 +833,7 @@ def _get_given_vals(self, given): X, Xu, y, sigma = self.X, self.Xu, self.y, self.sigma return X, Xu, y, sigma, cov_total, mean_total - def conditional(self, name, Xnew, pred_noise=False, given=None, **kwargs): + def conditional(self, name, Xnew, pred_noise=False, given=None, jitter=0.0, **kwargs): R""" Returns the approximate conditional distribution of the GP evaluated over new input locations `Xnew`. @@ -822,15 +852,18 @@ def conditional(self, name, Xnew, pred_noise=False, given=None, **kwargs): Can optionally take as key value pairs: `X`, `Xu`, `y`, `noise`, and `gp`. See the section in the documentation on additive GP models in PyMC for more information. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. For conditionals + the default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. 
""" givens = self._get_given_vals(given) - mu, cov = self._build_conditional(Xnew, pred_noise, False, *givens) - shape = infer_shape(Xnew, kwargs.pop("shape", None)) - return pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + mu, cov = self._build_conditional(Xnew, pred_noise, False, *givens, jitter) + return pm.MvNormal(name, mu=mu, cov=cov, **kwargs) @conditioned_vars(["Xs", "f"]) @@ -887,27 +920,26 @@ class LatentKron(Base): fcond = gp.conditional("fcond", Xnew=Xnew) """ - def __init__(self, mean_func=Zero(), cov_funcs=(Constant(0.0))): + def __init__(self, *, mean_func=Zero(), cov_funcs=(Constant(0.0))): try: self.cov_funcs = list(cov_funcs) except TypeError: self.cov_funcs = [cov_funcs] cov_func = pm.gp.cov.Kron(self.cov_funcs) - super().__init__(mean_func, cov_func) + super().__init__(mean_func=mean_func, cov_func=cov_func) def __add__(self, other): raise TypeError("Additive, Kronecker-structured processes not implemented") - def _build_prior(self, name, Xs, **kwargs): - self.N = np.prod([len(X) for X in Xs]) + def _build_prior(self, name, Xs, jitter, **kwargs): + self.N = int(np.prod([len(X) for X in Xs])) mu = self.mean_func(cartesian(*Xs)) - chols = [cholesky(stabilize(cov(X))) for cov, X in zip(self.cov_funcs, Xs)] - # remove reparameterization option + chols = [cholesky(stabilize(cov(X), jitter)) for cov, X in zip(self.cov_funcs, Xs)] v = pm.Normal(name + "_rotated_", mu=0.0, sigma=1.0, size=self.N, **kwargs) f = pm.Deterministic(name, mu + at.flatten(kron_dot(chols, v))) return f - def prior(self, name, Xs, **kwargs): + def prior(self, name, Xs, jitter=JITTER_DEFAULT, **kwargs): """ Returns the prior distribution evaluated over the input locations `Xs`. @@ -921,22 +953,26 @@ def prior(self, name, Xs, **kwargs): must be passable to its respective covariance without error. The total covariance function is measured on the full grid `cartesian(*Xs)`. + jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. Default value is 1e-6. **kwargs Extra keyword arguments that are passed to the `KroneckerNormal` distribution constructor. """ if len(Xs) != len(self.cov_funcs): raise ValueError("Must provide a covariance function for each X") - f = self._build_prior(name, Xs, **kwargs) + + f = self._build_prior(name, Xs, jitter, **kwargs) self.Xs = Xs self.f = f return f - def _build_conditional(self, Xnew): + def _build_conditional(self, Xnew, jitter): Xs, f = self.Xs, self.f X = cartesian(*Xs) delta = f - self.mean_func(X) - covs = [stabilize(cov(Xi)) for cov, Xi in zip(self.cov_funcs, Xs)] + covs = [stabilize(cov(Xi), jitter) for cov, Xi in zip(self.cov_funcs, Xs)] chols = [cholesky(cov) for cov in covs] cholTs = [at.transpose(chol) for chol in chols] Kss = self.cov_func(Xnew) @@ -946,10 +982,10 @@ def _build_conditional(self, Xnew): alpha = kron_solve_upper(cholTs, alpha) mu = at.dot(Ksx, alpha).ravel() + self.mean_func(Xnew) A = kron_solve_lower(chols, Kxs) - cov = stabilize(Kss - at.dot(at.transpose(A), A)) + cov = stabilize(Kss - at.dot(at.transpose(A), A), jitter) return mu, cov - def conditional(self, name, Xnew, **kwargs): + def conditional(self, name, Xnew, jitter=0.0, **kwargs): """ Returns the conditional distribution evaluated over new input locations `Xnew`. @@ -976,13 +1012,16 @@ def conditional(self, name, Xnew, **kwargs): Xnew: array-like Function input values. If one-dimensional, must be a column vector with shape `(n, 1)`. 
+ jitter: scalar + A small correction added to the diagonal of positive semi-definite + covariance matrices to ensure numerical stability. For conditionals + the default value is 0.0. **kwargs Extra keyword arguments that are passed to `MvNormal` distribution constructor. """ - mu, cov = self._build_conditional(Xnew) - shape = infer_shape(Xnew, kwargs.pop("shape", None)) - return pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + mu, cov = self._build_conditional(Xnew, jitter) + return pm.MvNormal(name, mu=mu, cov=cov, **kwargs) @conditioned_vars(["Xs", "y", "sigma"]) @@ -1043,13 +1082,13 @@ class MarginalKron(Base): fcond = gp.conditional("fcond", Xnew=Xnew) """ - def __init__(self, mean_func=Zero(), cov_funcs=(Constant(0.0))): + def __init__(self, *, mean_func=Zero(), cov_funcs=(Constant(0.0))): try: self.cov_funcs = list(cov_funcs) except TypeError: self.cov_funcs = [cov_funcs] cov_func = pm.gp.cov.Kron(self.cov_funcs) - super().__init__(mean_func, cov_func) + super().__init__(mean_func=mean_func, cov_func=cov_func) def __add__(self, other): raise TypeError("Additive, Kronecker-structured processes not implemented") @@ -1061,14 +1100,12 @@ def _build_marginal_likelihood(self, Xs): return mu, covs def _check_inputs(self, Xs, y): - N = np.prod([len(X) for X in Xs]) + N = int(np.prod([len(X) for X in Xs])) if len(Xs) != len(self.cov_funcs): raise ValueError("Must provide a covariance function for each X") if N != len(y): raise ValueError( - ( - "Length of y ({}) must match length of cartesian" "cartesian product of Xs ({})" - ).format(len(y), N) + f"Length of y ({len(y)}) must match length of cartesian product of Xs ({N})" ) def marginal_likelihood(self, name, Xs, y, sigma, is_observed=True, **kwargs): @@ -1105,10 +1142,15 @@ def marginal_likelihood(self, name, Xs, y, sigma, is_observed=True, **kwargs): if is_observed: return pm.KroneckerNormal(name, mu=mu, covs=covs, sigma=sigma, observed=y, **kwargs) else: - shape = np.prod([len(X) for X in Xs]) - return pm.KroneckerNormal(name, mu=mu, covs=covs, sigma=sigma, size=shape, **kwargs) + warnings.warn( + "The 'is_observed' argument has been deprecated. If the GP is " + "unobserved use gp.LatentKron instead.", + FutureWarning, + ) + size = int(np.prod([len(X) for X in Xs])) + return pm.KroneckerNormal(name, mu=mu, covs=covs, sigma=sigma, size=size, **kwargs) - def _build_conditional(self, Xnew, pred_noise, diag): + def _build_conditional(self, Xnew, diag, pred_noise): Xs, y, sigma = self.Xs, self.y, self.sigma # Old points @@ -1121,7 +1163,6 @@ def _build_conditional(self, Xnew, pred_noise, diag): if sigma is not None: eigs += sigma ** 2 - # New points Km = self.cov_func(Xnew, diag=diag) Knm = self.cov_func(X, Xnew) Kmn = Knm.T @@ -1147,7 +1188,7 @@ def _build_conditional(self, Xnew, pred_noise, diag): cov += sigma * at.identity_like(cov) return mu, cov - def conditional(self, name, Xnew, pred_noise=False, **kwargs): + def conditional(self, name, Xnew, pred_noise=False, diag=False, **kwargs): """ Returns the conditional distribution evaluated over new input locations `Xnew`, just as in `Marginal`. @@ -1181,11 +1222,10 @@ def conditional(self, name, Xnew, pred_noise=False, **kwargs): Extra keyword arguments that are passed to `MvNormal` distribution constructor. 
""" - mu, cov = self._build_conditional(Xnew, pred_noise, False) - shape = infer_shape(Xnew, kwargs.pop("shape", None)) - return pm.MvNormal(name, mu=mu, cov=cov, size=shape, **kwargs) + mu, cov = self._build_conditional(Xnew, diag, pred_noise) + return pm.MvNormal(name, mu=mu, cov=cov, **kwargs) - def predict(self, Xnew, point=None, diag=False, pred_noise=False): + def predict(self, Xnew, point=None, diag=False, pred_noise=False, model=None): R""" Return the mean vector and covariance matrix of the conditional distribution as numpy arrays, given a `point`, such as the MAP @@ -1205,12 +1245,10 @@ def predict(self, Xnew, point=None, diag=False, pred_noise=False): Whether or not observation noise is included in the conditional. Default is `False`. """ - mu, cov = self._build_conditional(Xnew, pred_noise, diag) - # XXX: This needs to be refactored - # return draw_values([mu, cov], point=point) - return None, None + mu, cov = self._predict_at(Xnew, diag, pred_noise) + return replace_with_values([mu, cov], replacements=point, model=model) - def predictt(self, Xnew, diag=False, pred_noise=False): + def _predict_at(self, Xnew, diag=False, pred_noise=False): R""" Return the mean vector and covariance matrix of the conditional distribution as symbolic variables. @@ -1227,5 +1265,5 @@ def predictt(self, Xnew, diag=False, pred_noise=False): Whether or not observation noise is included in the conditional. Default is `False`. """ - mu, cov = self._build_conditional(Xnew, pred_noise, diag) + mu, cov = self._build_conditional(Xnew, diag, pred_noise) return mu, cov diff --git a/pymc/gp/util.py b/pymc/gp/util.py index f47331248c1..730e664eec4 100644 --- a/pymc/gp/util.py +++ b/pymc/gp/util.py @@ -17,6 +17,7 @@ import aesara.tensor as at import numpy as np +from aesara.compile import SharedVariable from aesara.tensor.slinalg import ( # noqa: W0611; pylint: disable=unused-import cholesky, solve, @@ -30,10 +31,66 @@ from aesara.tensor.var import TensorConstant from scipy.cluster.vq import kmeans +from pymc.aesaraf import compile_rv_inplace, walk_model -def infer_shape(X, n_points=None): +# Avoid circular dependency when importing modelcontext +from pymc.distributions.distribution import NoDistribution + +assert NoDistribution # keep both pylint and black happy +from pymc.model import modelcontext + +JITTER_DEFAULT = 1e-6 + + +def replace_with_values(vars_needed, replacements=None, model=None): R""" - Maybe attempt to infer the shape of a Gaussian process input matrix. + Replace random variable nodes in the graph with values given by the replacements dict. + Uses untransformed versions of the inputs, performs some basic input validation. + + Parameters + ---------- + vars_needed: list of TensorVariables + A list of variable outputs + replacements: dict with string keys, numeric values + The variable name and values to be replaced in the model graph. 
+ model: Model + A PyMC model object + """ + model = modelcontext(model) + + inputs, input_names = [], [] + for rv in walk_model(vars_needed, walk_past_rvs=True): + if rv in model.named_vars.values() and not isinstance(rv, SharedVariable): + inputs.append(rv) + input_names.append(rv.name) + + # Then it's deterministic, no inputs are required, can eval and return + if len(inputs) == 0: + return tuple(v.eval() for v in vars_needed) + + fn = compile_rv_inplace( + inputs, + vars_needed, + allow_input_downcast=True, + accept_inplace=True, + on_unused_input="ignore", + ) + + # Remove unneeded inputs + replacements = {name: val for name, val in replacements.items() if name in input_names} + missing = set(input_names) - set(replacements.keys()) + + # Error if more inputs are needed + if len(missing) > 0: + missing_str = ", ".join(missing) + raise ValueError(f"Values for {missing_str} must be included in `replacements`.") + + return fn(**replacements) + + +def infer_size(X, n_points=None): + R""" + Maybe attempt to infer the size, or N, of a Gaussian process input matrix. If a specific shape cannot be inferred, for instance if X is symbolic, then an error is raised. @@ -48,31 +105,31 @@ def infer_shape(X, n_points=None): """ if n_points is None: try: - n_points = np.int(X.shape[0]) + n_points = int(X.shape[0]) except TypeError: raise TypeError("Cannot infer 'shape', provide as an argument") return n_points -def stabilize(K, c=1e-6): +def stabilize(K, jitter=JITTER_DEFAULT): R""" Adds small diagonal to a covariance matrix. Often the matrices calculated from covariance functions, `K = cov_func(X)` do not appear numerically to be positive semi-definite. Adding a small - correction, `c`, to the diagonal is usually enough to fix this. + correction, `jitter`, to the diagonal is usually enough to fix this. Parameters ---------- K: array-like A square covariance or kernel matrix. - c: float + jitter: float A small constant. """ - return K + c * at.identity_like(K) + return K + jitter * at.identity_like(K) -def kmeans_inducing_points(n_inducing, X): +def kmeans_inducing_points(n_inducing, X, **kmeans_kwargs): R""" Use the K-means algorithm to initialize the locations `X` for the inducing points `fu`. @@ -83,6 +140,8 @@ def kmeans_inducing_points(n_inducing, X): The number of inducing points (or k, the number of clusters) X: array-like Gaussian process input matrix. 
+ **kmeans_kwargs: + Extra keyword arguments that are passed to `scipy.cluster.vq.kmeans` """ # first whiten X if isinstance(X, TensorConstant): @@ -100,7 +159,11 @@ def kmeans_inducing_points(n_inducing, X): # if std of a column is very small (zero), don't normalize that column scaling[scaling <= 1e-6] = 1.0 Xw = X / scaling - Xu, distortion = kmeans(Xw, n_inducing) + + if "k_or_guess" in kmeans_kwargs: + warn.UserWarning("Use `n_inducing` to set the `k_or_guess` parameter instead.") + + Xu, distortion = kmeans(Xw, k_or_guess=n_inducing, **kmeans_kwargs) return Xu * scaling diff --git a/pymc/tests/test_gp.py b/pymc/tests/test_gp.py index 424e87e5e8b..7ae9b23c866 100644 --- a/pymc/tests/test_gp.py +++ b/pymc/tests/test_gp.py @@ -34,7 +34,7 @@ def test_value(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: zero_mean = pm.gp.mean.Zero() - M = aesara.function([], zero_mean(X))() + M = zero_mean(X).eval() assert np.all(M == 0) assert M.shape == (10,) @@ -44,7 +44,7 @@ def test_value(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: const_mean = pm.gp.mean.Constant(6) - M = aesara.function([], const_mean(X))() + M = const_mean(X).eval() assert np.all(M == 6) assert M.shape == (10,) @@ -54,7 +54,7 @@ def test_value(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: linear_mean = pm.gp.mean.Linear(2, 0.5) - M = aesara.function([], linear_mean(X))() + M = linear_mean(X).eval() npt.assert_allclose(M[1], 0.7222, atol=1e-3) assert M.shape == (10,) @@ -66,7 +66,7 @@ def test_add(self): mean1 = pm.gp.mean.Linear(coeffs=2, intercept=0.5) mean2 = pm.gp.mean.Constant(2) mean = mean1 + mean2 + mean2 - M = aesara.function([], mean(X))() + M = mean(X).eval() npt.assert_allclose(M[1], 0.7222 + 2 + 2, atol=1e-3) def test_prod(self): @@ -75,7 +75,7 @@ def test_prod(self): mean1 = pm.gp.mean.Linear(coeffs=2, intercept=0.5) mean2 = pm.gp.mean.Constant(2) mean = mean1 * mean2 * mean2 - M = aesara.function([], mean(X))() + M = mean(X).eval() npt.assert_allclose(M[1], 0.7222 * 2 * 2, atol=1e-3) def test_add_multid(self): @@ -86,7 +86,7 @@ def test_add_multid(self): mean1 = pm.gp.mean.Linear(coeffs=A, intercept=b) mean2 = pm.gp.mean.Constant(2) mean = mean1 + mean2 + mean2 - M = aesara.function([], mean(X))() + M = mean(X).eval() npt.assert_allclose(M[1], 10.8965 + 2 + 2, atol=1e-3) def test_prod_multid(self): @@ -97,7 +97,7 @@ def test_prod_multid(self): mean1 = pm.gp.mean.Linear(coeffs=A, intercept=b) mean2 = pm.gp.mean.Constant(2) mean = mean1 * mean2 * mean2 - M = aesara.function([], mean(X))() + M = mean(X).eval() npt.assert_allclose(M[1], 10.8965 * 2 * 2, atol=1e-3) @@ -108,10 +108,10 @@ def test_symadd_cov(self): cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov2 = pm.gp.cov.ExpQuad(1, 0.1) cov = cov1 + cov2 - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightadd_scalar(self): @@ -119,10 +119,10 @@ def test_rightadd_scalar(self): with pm.Model() as model: a = 1 cov = pm.gp.cov.ExpQuad(1, 0.1) + a - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 1.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftadd_scalar(self): @@ -130,10 +130,10 @@ def test_leftadd_scalar(self): with pm.Model() as model: a = 1 cov 
= a + pm.gp.cov.ExpQuad(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 1.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightadd_matrix(self): @@ -141,10 +141,10 @@ def test_rightadd_matrix(self): M = 2 * np.ones((10, 10)) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, 0.1) + M - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftadd_matrixt(self): @@ -152,10 +152,10 @@ def test_leftadd_matrixt(self): M = 2 * at.ones((10, 10)) with pm.Model() as model: cov = M + pm.gp.cov.ExpQuad(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftprod_matrix(self): @@ -164,8 +164,8 @@ def test_leftprod_matrix(self): with pm.Model() as model: cov = M + pm.gp.cov.ExpQuad(1, 0.1) cov_true = pm.gp.cov.ExpQuad(1, 0.1) + M - K = aesara.function([], cov(X))() - K_true = aesara.function([], cov_true(X))() + K = cov(X).eval() + K_true = cov_true(X).eval() assert np.allclose(K, K_true) def test_inv_rightadd(self): @@ -181,10 +181,10 @@ def test_symprod_cov(self): cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov2 = pm.gp.cov.ExpQuad(1, 0.1) cov = cov1 * cov2 - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940 * 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightprod_scalar(self): @@ -192,10 +192,10 @@ def test_rightprod_scalar(self): with pm.Model() as model: a = 2 cov = pm.gp.cov.ExpQuad(1, 0.1) * a - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftprod_scalar(self): @@ -203,10 +203,10 @@ def test_leftprod_scalar(self): with pm.Model() as model: a = 2 cov = a * pm.gp.cov.ExpQuad(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_rightprod_matrix(self): @@ -214,10 +214,10 @@ def test_rightprod_matrix(self): M = 2 * np.ones((10, 10)) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, 0.1) * M - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2 * 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_leftprod_matrix(self): @@ -226,8 +226,8 @@ def test_leftprod_matrix(self): with pm.Model() as model: cov = M * pm.gp.cov.ExpQuad(1, 0.1) cov_true = pm.gp.cov.ExpQuad(1, 0.1) * M - K = aesara.function([], cov(X))() - K_true = aesara.function([], cov_true(X))() + K = cov(X).eval() + K_true = cov_true(X).eval() assert np.allclose(K, K_true) def test_multiops(self): @@ -244,12 +244,12 @@ def 
test_multiops(self): + pm.gp.cov.ExpQuad(1, 0.1) + 3 ) - K1 = aesara.function([], cov1(X))() - K2 = aesara.function([], cov2(X))() + K1 = cov1(X).eval() + K2 = cov2(X).eval() assert np.allclose(K1, K2) # check diagonal - K1d = aesara.function([], cov1(X, diag=True))() - K2d = aesara.function([], cov2(X, diag=True))() + K1d = cov1(X, diag=True).eval() + K2d = cov2(X, diag=True).eval() npt.assert_allclose(np.diag(K1), K2d, atol=1e-5) npt.assert_allclose(np.diag(K2), K1d, atol=1e-5) @@ -265,10 +265,10 @@ def test_symexp_cov(self): with pm.Model() as model: cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov = cov1 ** 2 - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_covexp_numpy(self): @@ -276,10 +276,10 @@ def test_covexp_numpy(self): with pm.Model() as model: a = np.array([[2]]) cov = pm.gp.cov.ExpQuad(1, 0.1) ** a - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_covexp_aesara(self): @@ -287,10 +287,10 @@ def test_covexp_aesara(self): with pm.Model() as model: a = at.alloc(2.0, 1, 1) cov = pm.gp.cov.ExpQuad(1, 0.1) ** a - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_covexp_shared(self): @@ -298,10 +298,10 @@ def test_covexp_shared(self): with pm.Model() as model: a = aesara.shared(2.0) cov = pm.gp.cov.ExpQuad(1, 0.1) ** a - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940 ** 2, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_invalid_covexp(self): @@ -321,11 +321,11 @@ def test_symprod_cov(self): cov1 = pm.gp.cov.ExpQuad(1, 0.1) cov2 = pm.gp.cov.ExpQuad(1, 0.1) cov = pm.gp.cov.Kron([cov1, cov2]) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 1 * 0.53940, atol=1e-3) npt.assert_allclose(K[0, 11], 0.53940 * 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_multiops(self): @@ -342,8 +342,8 @@ def test_multiops(self): ) cov2 = pm.gp.cov.ExpQuad(1, 0.1) * pm.gp.cov.ExpQuad(2, 0.1) cov = pm.gp.cov.Kron([cov1, cov2]) - K_true = kronecker(aesara.function([], cov1(X1))(), aesara.function([], cov2(X2))()).eval() - K = aesara.function([], cov(X))() + K_true = kronecker(cov1(X1).eval(), cov2(X2).eval()).eval() + K = cov(X).eval() npt.assert_allclose(K_true, K) @@ -352,30 +352,30 @@ def test_slice1(self): X = np.linspace(0, 1, 30).reshape(10, 3) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(3, 0.1, active_dims=[0, 0, 1]) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.20084298, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_slice2(self): X = np.linspace(0, 1, 30).reshape(10, 3) with pm.Model() as model: cov = 
pm.gp.cov.ExpQuad(3, ls=[0.1, 0.1], active_dims=[1, 2]) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.34295549, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_slice3(self): X = np.linspace(0, 1, 30).reshape(10, 3) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(3, ls=np.array([0.1, 0.1]), active_dims=[1, 2]) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.34295549, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_diffslice(self): @@ -384,10 +384,10 @@ def test_diffslice(self): cov = pm.gp.cov.ExpQuad(3, ls=0.1, active_dims=[1, 0, 0]) + pm.gp.cov.ExpQuad( 3, ls=[0.1, 0.2, 0.3] ) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.683572, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -402,7 +402,7 @@ def test_stable(self): X = np.random.uniform(low=320.0, high=400.0, size=[2000, 2]) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(2, 0.1) - dists = aesara.function([], cov.square_dist(X, X))() + dists = cov.square_dist(X, X).eval() assert not np.any(dists < 0) @@ -411,44 +411,44 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_2d(self): X = np.linspace(0, 1, 10).reshape(5, 2) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(2, 0.5) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.820754, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_2dard(self): X = np.linspace(0, 1, 10).reshape(5, 2) with pm.Model() as model: cov = pm.gp.cov.ExpQuad(2, np.array([1, 2])) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.969607, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_inv_lengthscale(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.ExpQuad(1, ls_inv=10) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.53940, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -457,14 +457,14 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.WhiteNoise(sigma=0.5) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.0, atol=1e-3) npt.assert_allclose(K[0, 0], 0.5 ** 2, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = 
cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) # check predict - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.0, atol=1e-3) # white noise predicting should return all zeros npt.assert_allclose(K[0, 0], 0.0, atol=1e-3) @@ -475,14 +475,14 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Constant(2.5) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 2.5, atol=1e-3) npt.assert_allclose(K[0, 0], 2.5, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 2.5, atol=1e-3) npt.assert_allclose(K[0, 0], 2.5, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -491,12 +491,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.RatQuad(1, ls=0.1, alpha=0.5) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.66896, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.66896, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -505,12 +505,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Exponential(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.57375, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.57375, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -519,12 +519,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Matern52(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.46202, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.46202, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -533,12 +533,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Matern32(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.42682, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.42682, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -547,11 +547,11 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Matern12(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.32919, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.32919, atol=1e-3) - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -560,12 +560,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Cosine(1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 
0.766, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.766, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -574,12 +574,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Periodic(1, 0.1, 0.1) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.00288, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.00288, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -588,12 +588,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Linear(1, 0.5) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.19444, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.19444, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -602,12 +602,12 @@ def test_1d(self): X = np.linspace(0, 1, 10)[:, None] with pm.Model() as model: cov = pm.gp.cov.Polynomial(1, 0.5, 2, 0) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.03780, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.03780, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) @@ -621,12 +621,12 @@ def warp_func(x, a, b, c): with pm.Model() as model: cov_m52 = pm.gp.cov.Matern52(1, 0.2) cov = pm.gp.cov.WarpedInput(1, warp_func=warp_func, args=(1, 10, 1), cov_func=cov_m52) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 0.79593, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 0.79593, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -646,12 +646,12 @@ def tanh_func(x, x1, x2, w, x0): with pm.Model() as model: cov = pm.gp.cov.Gibbs(1, tanh_func, args=(0.05, 0.6, 0.4, 1.0)) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[2, 3], 0.136683, atol=1e-4) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[2, 3], 0.136683, atol=1e-4) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -673,12 +673,12 @@ def scaling_func(x, a, b): with pm.Model() as model: cov_m52 = pm.gp.cov.Matern52(1, 0.2) cov = pm.gp.cov.ScaledCov(1, scaling_func=scaling_func, args=(2, -1), cov_func=cov_m52) - K = aesara.function([], cov(X))() + K = cov(X).eval() npt.assert_allclose(K[0, 1], 3.00686, atol=1e-3) - K = aesara.function([], cov(X, X))() + K = cov(X, X).eval() npt.assert_allclose(K[0, 1], 3.00686, atol=1e-3) # check diagonal - Kd = aesara.function([], cov(X, diag=True))() + Kd = cov(X, diag=True).eval() npt.assert_allclose(np.diag(K), Kd, atol=1e-5) def test_raises(self): @@ -689,6 +689,34 @@ def test_raises(self): pm.gp.cov.ScaledCov(1, "str is not Covariance 
object", lambda x: x) +class TestCircular: + def test_1d_tau1(self): + X = np.linspace(0, 1, 10)[:, None] + etalon = 0.600881 + with pm.Model(): + cov = pm.gp.cov.Circular(1, 1, tau=5) + K = cov(X).eval() + npt.assert_allclose(K[0, 1], etalon, atol=1e-3) + K = cov(X, X).eval() + npt.assert_allclose(K[0, 1], etalon, atol=1e-3) + # check diagonal + Kd = cov(X, diag=True).eval() + npt.assert_allclose(np.diag(K), Kd, atol=1e-5) + + def test_1d_tau2(self): + X = np.linspace(0, 1, 10)[:, None] + etalon = 0.691239 + with pm.Model(): + cov = pm.gp.cov.Circular(1, 1, tau=4) + K = cov(X).eval() + npt.assert_allclose(K[0, 1], etalon, atol=1e-3) + K = cov(X, X).eval() + npt.assert_allclose(K[0, 1], etalon, atol=1e-3) + # check diagonal + Kd = cov(X, diag=True).eval() + npt.assert_allclose(np.diag(K), Kd, atol=1e-5) + + class TestHandleArgs: def test_handleargs(self): def func_noargs(x): @@ -767,22 +795,21 @@ def test_raises3(self): B = pm.gp.cov.Coregion(1) -@pytest.mark.xfail(reason="MvNormal was not yet refactored") class TestMarginalVsLatent: R""" Compare the logp of models Marginal, noise=0 and Latent. """ def setup_method(self): - X = np.random.randn(50, 3) - y = np.random.randn(50) * 0.01 - Xnew = np.random.randn(60, 3) - pnew = np.random.randn(60) * 0.01 + X = np.random.randn(20, 3) + y = np.random.randn(20) + Xnew = np.random.randn(30, 3) + pnew = np.random.randn(30) with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.Marginal(mean_func, cov_func) - f = gp.marginal_likelihood("f", X, y, noise=0.0, is_observed=False, observed=y) + gp = pm.gp.Marginal(mean_func=mean_func, cov_func=cov_func) + f = gp.marginal_likelihood("f", X, y, noise=0.0) p = gp.conditional("p", Xnew) self.logp = model.logp({"p": pnew}) self.X = X @@ -794,7 +821,7 @@ def testLatent1(self): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.Latent(mean_func, cov_func) + gp = pm.gp.Latent(mean_func=mean_func, cov_func=cov_func) f = gp.prior("f", self.X, reparameterize=False) p = gp.conditional("p", self.Xnew) latent_logp = model.logp({"f": self.y, "p": self.pnew}) @@ -804,7 +831,7 @@ def testLatent2(self): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.Latent(mean_func, cov_func) + gp = pm.gp.Latent(mean_func=mean_func, cov_func=cov_func) f = gp.prior("f", self.X, reparameterize=True) p = gp.conditional("p", self.Xnew) chol = np.linalg.cholesky(cov_func(self.X).eval()) @@ -813,7 +840,6 @@ def testLatent2(self): npt.assert_allclose(latent_logp, self.logp, atol=5) -@pytest.mark.xfail(reason="The `gp.predict` method was not yet refactored") class TestMarginalVsMarginalSparse: R""" Compare logp of models Marginal and MarginalSparse. 
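Before the sparse-approximation tests below, a short sketch of how the `MarginalSparse` pieces touched by this patch fit together, including the `kmeans_kwargs` pass-through to scipy's k-means. This is illustrative only, with placeholder data and arbitrary `iter`, lengthscale, and noise values.

import numpy as np
import pymc as pm

X = np.random.randn(100, 3)  # placeholder inputs
y = np.random.randn(100)     # placeholder observations

# extra keyword arguments (here `iter`) are forwarded to scipy.cluster.vq.kmeans
Xu = pm.gp.util.kmeans_inducing_points(10, X, iter=30)

with pm.Model():
    gp = pm.gp.MarginalSparse(cov_func=pm.gp.cov.ExpQuad(3, ls=0.5), approx="FITC")
    # noise is now required; jitter stabilizes the Kuu Cholesky factorization
    y_ = gp.marginal_likelihood("y", X=X, Xu=Xu, y=y, noise=0.1, jitter=1e-6)

Note that `k_or_guess` itself should not be passed through `kmeans_kwargs`; the patch reserves it for `n_inducing`.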
@@ -822,13 +848,13 @@ class TestMarginalVsMarginalSparse: def setup_method(self): X = np.random.randn(50, 3) - y = np.random.randn(50) * 0.01 + y = np.random.randn(50) Xnew = np.random.randn(60, 3) - pnew = np.random.randn(60) * 0.01 + pnew = np.random.randn(60) with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.Marginal(mean_func, cov_func) + gp = pm.gp.Marginal(mean_func=mean_func, cov_func=cov_func) sigma = 0.1 f = gp.marginal_likelihood("f", X, y, noise=sigma) p = gp.conditional("p", Xnew) @@ -845,7 +871,7 @@ def testApproximations(self, approx): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.MarginalSparse(mean_func, cov_func, approx=approx) + gp = pm.gp.MarginalSparse(mean_func=mean_func, cov_func=cov_func, approx=approx) f = gp.marginal_likelihood("f", self.X, self.X, self.y, self.sigma) p = gp.conditional("p", self.Xnew) approx_logp = model.logp({"f": self.y, "p": self.pnew}) @@ -856,10 +882,10 @@ def testPredictVar(self, approx): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.MarginalSparse(mean_func, cov_func, approx=approx) + gp = pm.gp.MarginalSparse(mean_func=mean_func, cov_func=cov_func, approx=approx) f = gp.marginal_likelihood("f", self.X, self.X, self.y, self.sigma) - mu1, var1 = self.gp.predict(self.Xnew, diag=True) - mu2, var2 = gp.predict(self.Xnew, diag=True) + mu1, var1 = self.gp.predict(self.Xnew, diag=True) + mu2, var2 = gp.predict(self.Xnew, diag=True) npt.assert_allclose(mu1, mu2, atol=0, rtol=1e-3) npt.assert_allclose(var1, var2, atol=0, rtol=1e-3) @@ -867,10 +893,10 @@ def testPredictCov(self): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) mean_func = pm.gp.mean.Constant(0.5) - gp = pm.gp.MarginalSparse(mean_func, cov_func, approx="DTC") - f = gp.marginal_likelihood("f", self.X, self.X, self.y, self.sigma, is_observed=False) - mu1, cov1 = self.gp.predict(self.Xnew, pred_noise=True) - mu2, cov2 = gp.predict(self.Xnew, pred_noise=True) + gp = pm.gp.MarginalSparse(mean_func=mean_func, cov_func=cov_func, approx="DTC") + f = gp.marginal_likelihood("f", self.X, self.X, self.y, self.sigma) + mu1, cov1 = self.gp.predict(self.Xnew, pred_noise=True) + mu2, cov2 = gp.predict(self.Xnew, pred_noise=True) npt.assert_allclose(mu1, mu2, atol=0, rtol=1e-3) npt.assert_allclose(cov1, cov2, atol=0, rtol=1e-3) @@ -878,7 +904,7 @@ def testPredictCov(self): class TestGPAdditive: def setup_method(self): self.X = np.random.randn(50, 3) - self.y = np.random.randn(50) * 0.01 + self.y = np.random.randn(50) self.Xnew = np.random.randn(60, 3) self.noise = pm.gp.cov.WhiteNoise(0.1) self.covs = ( @@ -888,19 +914,20 @@ def setup_method(self): ) self.means = (pm.gp.mean.Constant(0.5), pm.gp.mean.Constant(0.5), pm.gp.mean.Constant(0.5)) - @pytest.mark.xfail(reason="MvNormal was not yet refactored") def testAdditiveMarginal(self): with pm.Model() as model1: - gp1 = pm.gp.Marginal(self.means[0], self.covs[0]) - gp2 = pm.gp.Marginal(self.means[1], self.covs[1]) - gp3 = pm.gp.Marginal(self.means[2], self.covs[2]) + gp1 = pm.gp.Marginal(mean_func=self.means[0], cov_func=self.covs[0]) + gp2 = pm.gp.Marginal(mean_func=self.means[1], cov_func=self.covs[1]) + gp3 = pm.gp.Marginal(mean_func=self.means[2], cov_func=self.covs[2]) gpsum = gp1 + gp2 + gp3 fsum = gpsum.marginal_likelihood("f", self.X, self.y, noise=self.noise) model1_logp = 
model1.logp({"fsum": self.y}) with pm.Model() as model2: - gptot = pm.gp.Marginal(reduce(add, self.means), reduce(add, self.covs)) + gptot = pm.gp.Marginal( + mean_func=reduce(add, self.means), cov_func=reduce(add, self.covs) + ) fsum = gptot.marginal_likelihood("f", self.X, self.y, noise=self.noise) model2_logp = model2.logp({"fsum": self.y}) npt.assert_allclose(model1_logp, model2_logp, atol=0, rtol=1e-2) @@ -913,17 +940,24 @@ def testAdditiveMarginal(self): fp2 = gptot.conditional("fp2", self.Xnew) fp = np.random.randn(self.Xnew.shape[0]) - npt.assert_allclose(fp1.logp({"fp1": fp}), fp2.logp({"fp2": fp}), atol=0, rtol=1e-2) + logp1 = model1.logp({"fp1": fp}) + logp2 = model2.logp({"fp2": fp}) + npt.assert_allclose(logp1, logp2, atol=0, rtol=1e-2) - @pytest.mark.xfail(reason="MarginalSparse must be refactored to not use DensityDist") @pytest.mark.parametrize("approx", ["FITC", "VFE", "DTC"]) def testAdditiveMarginalSparse(self, approx): Xu = np.random.randn(10, 3) sigma = 0.1 with pm.Model() as model1: - gp1 = pm.gp.MarginalSparse(self.means[0], self.covs[0], approx=approx) - gp2 = pm.gp.MarginalSparse(self.means[1], self.covs[1], approx=approx) - gp3 = pm.gp.MarginalSparse(self.means[2], self.covs[2], approx=approx) + gp1 = pm.gp.MarginalSparse( + mean_func=self.means[0], cov_func=self.covs[0], approx=approx + ) + gp2 = pm.gp.MarginalSparse( + mean_func=self.means[1], cov_func=self.covs[1], approx=approx + ) + gp3 = pm.gp.MarginalSparse( + mean_func=self.means[2], cov_func=self.covs[2], approx=approx + ) gpsum = gp1 + gp2 + gp3 fsum = gpsum.marginal_likelihood("f", self.X, Xu, self.y, noise=sigma) @@ -931,7 +965,7 @@ def testAdditiveMarginalSparse(self, approx): with pm.Model() as model2: gptot = pm.gp.MarginalSparse( - reduce(add, self.means), reduce(add, self.covs), approx=approx + mean_func=reduce(add, self.means), cov_func=reduce(add, self.covs), approx=approx ) fsum = gptot.marginal_likelihood("f", self.X, Xu, self.y, noise=sigma) model2_logp = model2.logp({"fsum": self.y}) @@ -946,24 +980,25 @@ def testAdditiveMarginalSparse(self, approx): with model2: fp2 = gptot.conditional("fp2", self.Xnew) - fp = np.random.randn(self.Xnew.shape[0], 1) - npt.assert_allclose( - pm.logp(fp1, fp1).eval({fp1: fp}), pm.logp(fp2, fp2).eval({fp2: fp}), atol=0, rtol=1e-2 - ) + fp = np.random.randn(self.Xnew.shape[0]) + + model1_logp = model1.logp({"fp1": fp, "fsum": self.y}) + model2_logp = model2.logp({"fp2": fp, "fsum": self.y}) + + npt.assert_allclose(model1_logp, model2_logp, atol=0, rtol=1e-2) - @pytest.mark.xfail(reason="MvNormal was not yet refactored") def testAdditiveLatent(self): with pm.Model() as model1: - gp1 = pm.gp.Latent(self.means[0], self.covs[0]) - gp2 = pm.gp.Latent(self.means[1], self.covs[1]) - gp3 = pm.gp.Latent(self.means[2], self.covs[2]) + gp1 = pm.gp.Latent(mean_func=self.means[0], cov_func=self.covs[0]) + gp2 = pm.gp.Latent(mean_func=self.means[1], cov_func=self.covs[1]) + gp3 = pm.gp.Latent(mean_func=self.means[2], cov_func=self.covs[2]) gpsum = gp1 + gp2 + gp3 fsum = gpsum.prior("fsum", self.X, reparameterize=False) model1_logp = model1.logp({"fsum": self.y}) with pm.Model() as model2: - gptot = pm.gp.Latent(reduce(add, self.means), reduce(add, self.covs)) + gptot = pm.gp.Latent(mean_func=reduce(add, self.means), cov_func=reduce(add, self.covs)) fsum = gptot.prior("fsum", self.X, reparameterize=False) model2_logp = model2.logp({"fsum": self.y}) npt.assert_allclose(model1_logp, model2_logp, atol=0, rtol=1e-2) @@ -974,12 +1009,9 @@ def testAdditiveLatent(self): fp2 = 
gptot.conditional("fp2", self.Xnew) fp = np.random.randn(self.Xnew.shape[0]) - npt.assert_allclose( - fp1.logp({"fsum": self.y, "fp1": fp}), - fp2.logp({"fsum": self.y, "fp2": fp}), - atol=0, - rtol=1e-2, - ) + logp1 = model1.logp({"fsum": self.y, "fp1": fp}) + logp2 = model2.logp({"fsum": self.y, "fp2": fp}) + npt.assert_allclose(logp1, logp2, atol=0, rtol=1e-2) def testAdditiveSparseRaises(self): # cant add different approximations @@ -1007,7 +1039,6 @@ def testAdditiveTypeRaises2(self): gp1 + gp2 -@pytest.mark.xfail(reason="MvNormal was not yet refactored") class TestTP: R""" Compare TP with high degress of freedom to GP @@ -1015,41 +1046,41 @@ class TestTP: def setup_method(self): X = np.random.randn(20, 3) - y = np.random.randn(20) * 0.01 - Xnew = np.random.randn(50, 3) - pnew = np.random.randn(50) * 0.01 - with pm.Model() as model: + y = np.random.randn(20) + Xnew = np.random.randn(30, 3) + pnew = np.random.randn(30) + + with pm.Model() as model1: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) gp = pm.gp.Latent(cov_func=cov_func) f = gp.prior("f", X, reparameterize=False) p = gp.conditional("p", Xnew) + self.gp_latent_logp = model1.logp({"f": y, "p": pnew}) self.X = X self.y = y self.Xnew = Xnew self.pnew = pnew - self.latent_logp = model.logp({"f": y, "p": pnew}) - self.plogp = p.logp({"f": y, "p": pnew}) + self.nu = 10000 def testTPvsLatent(self): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) - tp = pm.gp.TP(cov_func=cov_func, nu=10000) + tp = pm.gp.TP(cov_func=cov_func, nu=self.nu) f = tp.prior("f", self.X, reparameterize=False) p = tp.conditional("p", self.Xnew) tp_logp = model.logp({"f": self.y, "p": self.pnew}) - npt.assert_allclose(self.latent_logp, tp_logp, atol=0, rtol=1e-2) + npt.assert_allclose(self.gp_latent_logp, tp_logp, atol=0, rtol=1e-2) def testTPvsLatentReparameterized(self): with pm.Model() as model: cov_func = pm.gp.cov.ExpQuad(3, [0.1, 0.2, 0.3]) - tp = pm.gp.TP(cov_func=cov_func, nu=10000) + tp = pm.gp.TP(cov_func=cov_func, nu=self.nu) f = tp.prior("f", self.X, reparameterize=True) p = tp.conditional("p", self.Xnew) chol = np.linalg.cholesky(cov_func(self.X).eval()) - y_rotated = np.linalg.solve(chol, self.y) - # testing full model logp unreliable due to introduction of f_chi2__log__ - plogp = p.logp({"f_rotated_": y_rotated, "p": self.pnew, "f_chi2__log__": np.log(1e20)}) - npt.assert_allclose(self.plogp, plogp, atol=0, rtol=1e-2) + f_rotated = np.linalg.solve(chol, self.y) + tp_logp = model.logp({"f_rotated_": f_rotated, "p": self.pnew}) + npt.assert_allclose(self.gp_latent_logp, tp_logp, atol=0, rtol=1e-2) def testAdditiveTPRaises(self): with pm.Model() as model: @@ -1060,7 +1091,6 @@ def testAdditiveTPRaises(self): gp1 + gp2 -@pytest.mark.xfail(reason="MvNormal was not yet refactored") class TestLatentKron: """ Compare gp.LatentKron to gp.Latent, both with Gaussian noise. @@ -1077,7 +1107,7 @@ def setup_method(self): self.y = np.random.randn(self.N) * 0.1 self.Xnews = (np.random.randn(5, 1), np.random.randn(5, 1), np.random.randn(5, 1)) self.Xnew = np.concatenate(self.Xnews, axis=1) - self.pnew = np.random.randn(len(self.Xnew)) * 0.01 + self.pnew = np.random.randn(len(self.Xnew)) ls = 0.2 with pm.Model() as latent_model: self.cov_funcs = ( @@ -1116,7 +1146,6 @@ def testLatentKronRaisesSizes(self): gp.prior("f", Xs=[np.linspace(0, 1, 7)[:, None], np.linspace(0, 1, 5)[:, None]]) -@pytest.mark.xfail(reason="The `gp.predict` method was not yet refactored") class TestMarginalKron: """ Compare gp.MarginalKron to gp.Marginal. 
@@ -1134,7 +1163,8 @@ def setup_method(self): self.Xnews = (np.random.randn(5, 1), np.random.randn(5, 1), np.random.randn(5, 1)) self.Xnew = np.concatenate(self.Xnews, axis=1) self.sigma = 0.2 - self.pnew = np.random.randn(len(self.Xnew)) * 0.01 + self.pnew = np.random.randn(len(self.Xnew)) + ls = 0.2 with pm.Model() as model: self.cov_funcs = [ @@ -1153,16 +1183,19 @@ def setup_method(self): def testMarginalKronvsMarginalpredict(self): with pm.Model() as kron_model: kron_gp = pm.gp.MarginalKron(mean_func=self.mean, cov_funcs=self.cov_funcs) - f = kron_gp.marginal_likelihood("f", self.Xs, self.y, sigma=self.sigma, shape=self.N) + f = kron_gp.marginal_likelihood("f", self.Xs, self.y, sigma=self.sigma) p = kron_gp.conditional("p", self.Xnew) mu, cov = kron_gp.predict(self.Xnew) npt.assert_allclose(mu, self.mu, atol=1e-5, rtol=1e-2) npt.assert_allclose(cov, self.cov, atol=1e-5, rtol=1e-2) + with kron_model: + _, var = kron_gp.predict(self.Xnew, diag=True) + npt.assert_allclose(np.diag(cov), var, atol=1e-5, rtol=1e-2) def testMarginalKronvsMarginal(self): with pm.Model() as kron_model: kron_gp = pm.gp.MarginalKron(mean_func=self.mean, cov_funcs=self.cov_funcs) - f = kron_gp.marginal_likelihood("f", self.Xs, self.y, sigma=self.sigma, shape=self.N) + f = kron_gp.marginal_likelihood("f", self.Xs, self.y, sigma=self.sigma) p = kron_gp.conditional("p", self.Xnew) kron_logp = kron_model.logp({"p": self.pnew}) npt.assert_allclose(kron_logp, self.logp, atol=0, rtol=1e-2) @@ -1175,7 +1208,7 @@ def testMarginalKronRaises(self): gp1 + gp2 -class TestUtil: +class TestPlotGP: def test_plot_gp_dist(self): """Test that the plotting helper works with the stated input shapes.""" import matplotlib.pyplot as plt @@ -1204,29 +1237,54 @@ def test_plot_gp_dist_warn_nan(self): pass -class TestCircular: - def test_1d_tau1(self): - X = np.linspace(0, 1, 10)[:, None] - etalon = 0.600881 - with pm.Model(): - cov = pm.gp.cov.Circular(1, 1, tau=5) - K = aesara.function([], cov(X))() - npt.assert_allclose(K[0, 1], etalon, atol=1e-3) - K = aesara.function([], cov(X, X))() - npt.assert_allclose(K[0, 1], etalon, atol=1e-3) - # check diagonal - Kd = aesara.function([], cov(X, diag=True))() - npt.assert_allclose(np.diag(K), Kd, atol=1e-5) +class TestKmeansInducing: + def setup_method(self): + self.centers = (-5, 5) + self.x = np.concatenate( + (self.centers[0] + np.random.randn(500), self.centers[1] + np.random.randn(500)) + ) - def test_1d_tau2(self): - X = np.linspace(0, 1, 10)[:, None] - etalon = 0.691239 - with pm.Model(): - cov = pm.gp.cov.Circular(1, 1, tau=4) - K = aesara.function([], cov(X))() - npt.assert_allclose(K[0, 1], etalon, atol=1e-3) - K = aesara.function([], cov(X, X))() - npt.assert_allclose(K[0, 1], etalon, atol=1e-3) - # check diagonal - Kd = aesara.function([], cov(X, diag=True))() - npt.assert_allclose(np.diag(K), Kd, atol=1e-5) + def test_kmeans(self): + X = self.x[:, None] + Xu = pm.gp.util.kmeans_inducing_points(2, X).flatten() + npt.assert_allclose(np.asarray(self.centers), np.sort(Xu), atol=0.1) + + X = at.as_tensor_variable(self.x[:, None]) + Xu = pm.gp.util.kmeans_inducing_points(2, X).flatten() + npt.assert_allclose(np.asarray(self.centers), np.sort(Xu), atol=0.1) + + def test_kmeans_raises(self): + with pytest.raises(TypeError): + Xu = pm.gp.util.kmeans_inducing_points(2, "str is the wrong type").flatten() + + +class TestReplaceWithValues: + def test_basic_replace(self): + with pm.Model() as model: + a = pm.Normal("a") + b = pm.Normal("b", mu=a) + c = a * b + + (c_val,) = 
pm.gp.util.replace_with_values( + [c], replacements={"a": 2, "b": 3, "x": 100}, model=model + ) + assert c_val == np.array(6.0) + + def test_replace_no_inputs_needed(self): + with pm.Model() as model: + a = at.as_tensor_variable(2.0) + b = 1.0 + a + c = a * b + (c_val,) = pm.gp.util.replace_with_values([c], replacements={"x": 100}) + assert c_val == np.array(6.0) + + def test_missing_input(self): + with pm.Model() as model: + a = pm.Normal("a") + b = pm.Normal("b", mu=a) + c = a * b + + with pytest.raises(ValueError): + (c_val,) = pm.gp.util.replace_with_values( + [c], replacements={"a": 2, "x": 100}, model=model + )
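The last two test classes exercise the `gp.util` helpers touched by this change set, `kmeans_inducing_points` and `replace_with_values`. A minimal usage sketch based on those tests (the data and replacement values are illustrative):

```python
import numpy as np
import pymc as pm

# k-means selection of inducing point locations from the input data.
x = np.concatenate((-5 + np.random.randn(500), 5 + np.random.randn(500)))
Xu = pm.gp.util.kmeans_inducing_points(2, x[:, None])

# Substitute values for named model variables and evaluate an expression.
with pm.Model() as model:
    a = pm.Normal("a")
    b = pm.Normal("b", mu=a)
    (c_val,) = pm.gp.util.replace_with_values(
        [a * b], replacements={"a": 2, "b": 3}, model=model
    )
assert c_val == 6.0
```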