cuda_hpc80
cuda_hpc80

Reputation: 607

numpy interpolation using pandas

I am trying to interpolate pandas columns that belong to different DataFrames with different sampling rates. I stripped the timestamps and used the sample count as the index. I looked at several ways to do interpolation in pandas and could not come up with an elegant solution. Here's my hack using the np.interp method. Is there a better method or alternative in pandas? Thanks in advance!

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# The bundled seaborn styles were renamed to 'seaborn-v0_8-<name>' in
# Matplotlib 3.6 and the old names were later removed, so use whichever
# spelling this installation provides.
plt.style.use('seaborn-v0_8-deep'
              if 'seaborn-v0_8-deep' in plt.style.available
              else 'seaborn-deep')

# Two random signals "sampled" at different rates: 10 points vs. 5 points.
df1 = pd.DataFrame({'S1': np.random.random(10)})
df2 = pd.DataFrame({'S2': np.random.random(5)})

# Place both signals on a normalised axis in [0, 1): sample i of an n-sample
# signal sits at i / n.  Dividing an integer range (instead of stepping
# np.arange by a float) always yields exactly n points, and the builtin
# float() replaces the np.float alias that was removed in NumPy 1.24.
df1_index = np.arange(len(df1)) / float(len(df1))
df2_index = np.arange(len(df2)) / float(len(df2))

# Resample S2 at df1's positions by linear interpolation so that the result
# has len(df1) points.  np.interp holds the last S2 value for positions
# beyond df2_index[-1], so the first and last values of S2 are kept.
df2_on_df1 = np.interp(df1_index, df2_index, df2['S2'])

# %-formatting inside parentheses prints the same text on Python 2 and 3.
print('%d %d %d' % (len(df1), len(df2), len(df2_on_df1)))
# Draw both curves against their normalised positions so that they share one
# x-axis and can be compared directly.  (plt.hold was removed in
# Matplotlib 3.0; successive plot calls already draw on the same axes.)
plt.plot(df1_index, df2_on_df1, label='S2 interpolated onto df1')
# plt.plot(df1_index, df1['S1'], label='S1')  # uncomment to overlay S1
plt.plot(df2_index, df2['S2'], label='S2')
# The legend only lists artists that carry a label.
plt.legend(loc='upper right')
plt.show()

enter image description here

Upvotes: 2

Views: 2535

Answers (1)

Nathaniel
Nathaniel

Reputation: 3290

You can perform a left join on df1 and then use pandas.Series.interpolate to fill in the NaN values using linear interpolation:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# The bundled seaborn styles were renamed to 'seaborn-v0_8-<name>' in
# Matplotlib 3.6 and the old names were later removed, so use whichever
# spelling this installation provides.
plt.style.use('seaborn-v0_8-deep'
              if 'seaborn-v0_8-deep' in plt.style.available
              else 'seaborn-deep')

# Two random signals "sampled" at different rates: 10 points vs. 5 points.
df1 = pd.DataFrame({'S1': np.random.random(10)})
df2 = pd.DataFrame({'S2': np.random.random(5)})

# Normalised sample positions in [0, 1): sample i of an n-sample signal sits
# at i / n.  Computing i / n directly (rather than stepping np.arange by a
# float) gives exactly n points and makes positions shared by both signals
# bit-identical floats, which the index-based join below depends on.  The
# builtin float() replaces the np.float alias removed in NumPy 1.24.
df1_index = np.arange(len(df1)) / float(len(df1))
df2_index = np.arange(len(df2)) / float(len(df2))

# Use the normalised positions as the row labels of each frame.
df1 = df1.set_index(df1_index)
df2 = df2.set_index(df2_index)

# Perform a left join: keep every row of df1 and attach S2 where df2 has a
# sample at the same position; the remaining rows get NaN in S2.
df1 = df1.join(df2, how='left')
print(df1)

               S1        S2
    0.0  0.931240  0.484878
    0.1  0.356905       NaN
    0.2  0.729272  0.109443
    0.3  0.340487       NaN
    0.4  0.876859  0.858113
    0.5  0.937151       NaN
    0.6  0.551431  0.924410
    0.7  0.303283       NaN
    0.8  0.550821  0.810445
    0.9  0.689091       NaN

# Fill the NaN values in S2 with linear interpolation to match the length of
# S1.  method='index' interpolates according to the actual index positions
# instead of assuming evenly spaced rows; on this evenly spaced grid the
# result is the same as the default.  Trailing NaNs after the last S2 sample
# are filled with that last value.
df1['S2'] = df1['S2'].interpolate(method='index')
print(df1)

               S1        S2
    0.0  0.931240  0.484878
    0.1  0.356905  0.297161
    0.2  0.729272  0.109443
    0.3  0.340487  0.483778
    0.4  0.876859  0.858113
    0.5  0.937151  0.891262
    0.6  0.551431  0.924410
    0.7  0.303283  0.867428
    0.8  0.550821  0.810445
    0.9  0.689091  0.810445

# Plot the data as three stacked panels that share the same x-range.
plt.figure()

# Top panel: S1 on df1's grid.
plt.subplot(3, 1, 1)
plt.title('df1')
plt.plot(df1.index, df1['S1'], c='C0')
plt.scatter(df1.index, df1['S1'], c='C0')
plt.xlim(-0.05, 0.95)

# Middle panel: the original S2 samples on df2's coarser grid.
plt.subplot(3, 1, 2)
plt.title('df2')
plt.plot(df2.index, df2['S2'], c='C1')
plt.scatter(df2.index, df2['S2'], c='C1')
plt.xlim(-0.05, 0.95)

# Bottom panel: S2 after interpolation onto df1's grid.  The line keeps S2's
# colour while the markers use df1's colour to show the new sample positions.
plt.subplot(3, 1, 3)
plt.title('df2 on df1')
plt.plot(df1.index, df1['S2'], c='C1')
plt.scatter(df1.index, df1['S2'], c='C0')
plt.xlim(-0.05, 0.95)

plt.tight_layout()
# Display the figure; without this nothing is shown when run as a script.
plt.show()

enter image description here

Upvotes: 1

Related Questions