Reputation: 21
I am using gaussian_kde from scipy.stats to fit a joint PDF from a multivariate data on, let's say, X and Y.
Now I want to resample from this PDF conditionally on a value of X. For example, once my X=x, generate Y from its conditional distribution.
Let's use the example from the documentation here. kernel.resample(1)
would generate a pair of (X,Y) over all of the distribution. How could I generate Y once X is, for example, 0?
Upvotes: 2
Views: 1428
Reputation: 1521
I wrote a small package (kdetools
on PyPI) to do conditional sampling using a drop-in replacement superclass of scipy.stats.gaussian_kde
. Applying it to JohanC's example:
import numpy as np
import matplotlib.pyplot as plt
import kdetools as kt
def measure(n):
"Measurement model, return two coupled measurements."
m1 = np.random.normal(size=n)
m2 = np.random.normal(scale=0.5, size=n)
return m1 + m2, m1 - m2**2
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = kt.gaussian_kde(values)
kernel.set_bandwidth(bw_method='cv', bw_type='diagonal')
Z = np.reshape(kernel(positions).T, X.shape)
fig, ax = plt.subplots()
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
extent=[xmin, xmax, ymin, ymax])
ax.plot(m1, m2, 'k.', markersize=1)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
ax.axvline(-1, ls='--', lw=1, color='C0')
ax.axvline(0, ls='--', lw=1, color='C1')
ax.axvline(1, ls='--', lw=1, color='C2')
plt.show()
# Take samples of y|x=-1, 0, 1
x_cond = np.array([-1,0,1])
samples = kernel.conditional_resample(10000, x_cond=x_cond, dims_cond=[0])
fig, ax = plt.subplots(1, 1, figsize=(6,4))
xs = np.linspace(-4, 4, 1000)
for i in range(3):
kde1d = kt.gaussian_kde(samples[i].ravel())
ax.plot(xs, kde1d(xs), label=f'y|x={x_cond[i]}')
ax.hist(samples[i].ravel(), density=True, bins=50, color=f'C{i}', alpha=0.5)
ax.legend();
It's pretty quick too:
%timeit samples = kernel.conditional_resample(10000, x_cond=x_cond, dims_cond=[0])
3.36 ms ± 123 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Upvotes: 0
Reputation: 80449
An approach could be to create a custom continuous distribution from a pdf.
The pdf can be created from the kernel
function. As the pdf needs an area of 1, the kernel limited to a given x0
should be scaled by the area.
The custom distribution seems to be quite slow though. A faster solution could be to create a histogram from ys = np.linspace(-10, 10, 1000); kernel(np.vstack([np.full_like(ys, x0), ys]))
and use rv_histogram
. Still faster (but much less random) would be to use np.random.choice(..., p=...)
with p calculated from the constrained kernel.
The following code starts from an adoption of the linked example code of a 2D kde.
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
def measure(n):
m1 = np.random.normal(size=n)
m2 = np.random.normal(scale=0.5, size=n)
return m1 + m2, m1 - m2 ** 2
m1, m2 = measure(2000)
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
x0 = 0.678
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))
ax1.imshow(np.rot90(Z), cmap=plt.cm.magma_r, alpha=0.4, extent=[xmin, xmax, ymin, ymax])
ax1.plot(m1, m2, 'k.', markersize=2)
ax1.axvline(x0, color='dodgerblue', ls=':')
ax1.set_xlim([xmin, xmax])
ax1.set_ylim([ymin, ymax])
# create a distribution given the kernel function limited to x=x0
class Special_distrib(stats.rv_continuous):
def _pdf(self, y, x0, area_x0):
return kernel(np.vstack([np.full_like(y, x0), y])) / area_x0
ys = np.linspace(-10, 10, 1000)
area_x0 = np.trapz(kernel(np.vstack([np.full_like(ys, x0), ys])), ys)
special_distr = Special_distrib(name="special")
vals = special_distr.rvs(x0, area_x0, size=500)
ax2.hist(vals, bins=20, color='dodgerblue')
plt.show()
Upvotes: 1