Dave
Dave

Reputation: 1208

scipy gaussian_kde and circular data

I am using scipys gaussian_kde to get probability density of some bimodal data. However, as my data is angular (it's directions in degrees) I have a problem when values occur near the limits. The code below gives two example kde's, when the domain is 0-360 it under estimates as it cannot deal with the circular nature of the data. The pdf needs to be defined on the unit circle but I can't find anything in scipy.stats suitable to this type of data (von mises distribution is there but only works for unimodal data). Has anyone out there ran into this one before? Is there anything (preferable python based) available to estimate bimodal pdf's on the unit circle?

import numpy as np
import scipy as sp
from pylab import plot,figure,subplot,show,hist
from scipy import stats



baz = np.array([-92.29061004, -85.42607874, -85.42607874, -70.01689348,
               -63.43494882, -63.43494882, -70.01689348, -70.01689348,
               -59.93141718, -63.43494882, -59.93141718, -63.43494882,
               -63.43494882, -63.43494882, -57.52880771, -53.61564818,
               -57.52880771, -63.43494882, -63.43494882, -92.29061004,
               -16.92751306, -99.09027692, -99.09027692, -16.92751306,
               -99.09027692, -16.92751306,  -9.86580694,  -8.74616226,
                -9.86580694,  -8.74616226,  -8.74616226,  -2.20259816,
                -2.20259816,  -2.20259816,  -9.86580694,  -2.20259816,
                -2.48955292,  -2.48955292,  -2.48955292,  -2.48955292,
                 4.96974073,   4.96974073,   4.96974073,   4.96974073,
                -2.48955292,  -2.48955292,  -2.48955292,  -2.48955292,
                -2.48955292,  -9.86580694,  -9.86580694,  -9.86580694,
               -16.92751306, -19.29004622, -19.29004622, -26.56505118,
               -19.29004622, -19.29004622, -19.29004622, -19.29004622])


xx = np.linspace(-180, 180, 181)
scipy_kde = stats.gaussian_kde(baz)              
print scipy_kde.integrate_box_1d(-180,180)

figure()
plot(xx, scipy_kde(xx), c='green')             

baz[baz<0] += 360             
xx = np.linspace(0, 360, 181)
scipy_kde = stats.gaussian_kde(baz)              
print scipy_kde.integrate_box_1d(-180,180)
plot(xx, scipy_kde(xx), c='red')             

Upvotes: 11

Views: 4262

Answers (4)

Reinderien
Reinderien

Reputation: 15231

As a small but very important addition to the answer from @kingjr: I expect that many developers will want to show this KDE in a polar projection, as in

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import vonmises


def vonmises_kde(data, kappa, n_bins=100):
    from scipy.special import i0
    bins = np.linspace(-np.pi, np.pi, n_bins)
    x = np.linspace(-np.pi, np.pi, n_bins)
    # integrate vonmises kernels
    kde = np.exp(kappa*np.cos(x[:, None]-data[None, :])).sum(1)/(2*np.pi*i0(kappa))
    kde /= np.trapz(kde, x=bins)
    return bins, kde


# generate complex circular distribution
data = np.r_[vonmises(-1, 5, 1000), vonmises(2, 10, 500), vonmises(3, 20, 100)]

# plot data histogram
fig, axes = plt.subplots(2, 1, subplot_kw={'projection': 'polar'})
axes[0].hist(data, 100)

# plot kernel density estimates
x, kde = vonmises_kde(data, 20)
axes[1].plot(x, kde)

plt.show()

radial plots

Upvotes: 1

kingjr
kingjr

Reputation: 183

Dave's answer isn't correct, because scipy's vonmises doesn't wrap around [-pi, pi].

Instead you can use the following code, which is based on the same principle. It is based on the equations described in numpy.

def vonmises_kde(data, kappa, n_bins=100):
    from scipy.special import i0
    bins = np.linspace(-np.pi, np.pi, n_bins)
    x = np.linspace(-np.pi, np.pi, n_bins)
    # integrate vonmises kernels
    kde = np.exp(kappa*np.cos(x[:, None]-data[None, :])).sum(1)/(2*np.pi*i0(kappa))
    kde /= np.trapz(kde, x=bins)
    return bins, kde

Here is an example

import matplotlib.pyplot as plt
import numpy as np
from numpy.random import vonmises

# generate complex circular distribution
data = np.r_[vonmises(-1, 5, 1000), vonmises(2, 10, 500), vonmises(3, 20, 100)]

# plot data histogram
fig, axes = plt.subplots(2, 1)
axes[0].hist(data, 100)

# plot kernel density estimates
x, kde = vonmises_kde(data, 20)
axes[1].plot(x, kde)

Histogram and kernel density plots

Upvotes: 15

rudolfbyker
rudolfbyker

Reputation: 2417

Here is a fast approximation to @kingjr's more exact answer:

def vonmises_pdf(x, mu, kappa):
    return np.exp(kappa * np.cos(x - mu)) / (2. * np.pi * scipy.special.i0(kappa))


def vonmises_fft_kde(data, kappa, n_bins):
    bins = np.linspace(-np.pi, np.pi, n_bins + 1, endpoint=True)
    hist_n, bin_edges = np.histogram(data, bins=bins)
    bin_centers = np.mean([bin_edges[1:], bin_edges[:-1]], axis=0)
    kernel = vonmises_pdf(
        x=bin_centers,
        mu=0,
        kappa=kappa
    )
    kde = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kernel) * np.fft.rfft(hist_n)))
    kde /= np.trapz(kde, x=bin_centers)
    return bin_centers, kde

Test (using tqdm for progress bar and timing, and matplotlib to verify results):

import numpy as np
from tqdm import tqdm
import scipy.stats
import matplotlib.pyplot as plt

n_runs = 1000
n_bins = 100
kappa = 10

for _ in tqdm(xrange(n_runs)):
    bins1, kde1 = vonmises_kde(
        data=np.r_[
            np.random.vonmises(-1, 5, 1000),
            np.random.vonmises(2, 10, 500),
            np.random.vonmises(3, 20, 100)
        ],
        kappa=kappa,
        n_bins=n_bins
    )


for _ in tqdm(xrange(n_runs)):
    bins2, kde2 = vonmises_fft_kde(
        data=np.r_[
            np.random.vonmises(-1, 5, 1000),
            np.random.vonmises(2, 10, 500),
            np.random.vonmises(3, 20, 100)
        ],
        kappa=kappa,
        n_bins=n_bins
    )

plt.figure()
plt.plot(bins1, kde1, label="kingjr's solution")
plt.plot(bins2, kde2, label="dolf's FFT solution")
plt.legend()
plt.show()

Results:

100%|██████████| 1000/1000 [00:07<00:00, 135.29it/s]
100%|██████████| 1000/1000 [00:00<00:00, 1945.14it/s]

(1945 / 135 = 14 times faster)

This is how close the results are for the FFT-approximation with 100 bins.

For even more speed, use an integer power of 2 as the number of bins. It also scales better (i.e. it stays fast with many bins and lots of data). On my PC it's 118 times faster than the original answer with n_bins=1024.

Why does it work?

The product of the FFTs of two signals (without zero-padding) is equal to the circular (or cyclic) convolution of the two signals. A kernel density estimation is basically a kernel convolved with a signal that has an impulse at the position of each data point.

Why is it not exact?

Since I use a histogram to space the data evenly, I lose the exact position of each sample, and only use the centre of the bin to which it belongs. The number of samples in each bin is used as the magnitude of the impulse at that point. For example: Ignoring the normalisation for a moment, if you have a bin from 0 to 1, and two samples in that bin, at 0.1 and at 0.2, the exact KDE will be the kernel centred around 0.1 + the kernel centred around 0.2. The approximation will be 2x `the kernel centred around 0.5, which is the centre of the bin.

Upvotes: 7

Dave
Dave

Reputation: 1208

So I have what I think is a reasonable solution. Basically I use Von Mises distribution as a basis function for a kernel density estimate. Code is below in case it is useful for anyone else.

def vonmises_KDE(data, kappa, plot=None):       

    """    
    Create a kernal densisity estimate of circular data using the von mises 
    distribution as the basis function.

    """

    # imports
    from scipy.stats import vonmises
    from scipy.interpolate import interp1d

    # convert to radians
    data = np.radians(data)

    # set limits for von mises
    vonmises.a = -np.pi
    vonmises.b = np.pi
    x_data = np.linspace(-np.pi, np.pi, 100)

    kernels = []

    for d in data:

        # Make the basis function as a von mises PDF
        kernel = vonmises(kappa, loc=d)
        kernel = kernel.pdf(x_data)
        kernels.append(kernel)

        if plot:
            # For plotting
            kernel /= kernel.max()
            kernel *= .2
            plt.plot(x_data, kernel, "grey", alpha=.5)


    vonmises_kde = np.sum(kernels, axis=0)
    vonmises_kde = vonmises_kde / np.trapz(vonmises_kde, x=x_data)
    f = interp1d( x_data, vonmises_kde )


    if plot:
        plt.plot(x_data, vonmises_kde, c='red')  

    return x_data, vonmises_kde, f

baz = np.array([-92.29061004, -85.42607874, -85.42607874, -70.01689348,
               -63.43494882, -63.43494882, -70.01689348, -70.01689348,
               -59.93141718, -63.43494882, -59.93141718, -63.43494882,
               -63.43494882, -63.43494882, -57.52880771, -53.61564818,
               -57.52880771, -63.43494882, -63.43494882, -92.29061004,
               -16.92751306, -99.09027692, -99.09027692, -16.92751306,
               -99.09027692, -16.92751306,  -9.86580694,  -8.74616226,
                -9.86580694,  -8.74616226,  -8.74616226,  -2.20259816,
                -2.20259816,  -2.20259816,  -9.86580694,  -2.20259816,
                -2.48955292,  -2.48955292,  -2.48955292,  -2.48955292,
                 4.96974073,   4.96974073,   4.96974073,   4.96974073,
                -2.48955292,  -2.48955292,  -2.48955292,  -2.48955292,
                -2.48955292,  -9.86580694,  -9.86580694,  -9.86580694,
               -16.92751306, -19.29004622, -19.29004622, -26.56505118,
               -19.29004622, -19.29004622, -19.29004622, -19.29004622])   
kappa = 12
x_data, vonmises_kde, f = vonmises_KDE(baz, kappa, plot=1)

Upvotes: 0

Related Questions