Reputation: 607
I am trying to interpolate pandas columns that belong to different dataframes with different sampling rates. I stripped the timestamps and used the sample count as the index. I looked at multiple ways to do interpolation in pandas and could not come up with an elegant solution. Here's my hack using the np.interp method.
Is there a better method or alternative in pandas? Thanks in advance!
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# The bundled seaborn styles were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were later removed, so use whichever
# spelling this matplotlib version provides.
_deep_style = 'seaborn-v0_8-deep' if 'seaborn-v0_8-deep' in plt.style.available else 'seaborn-deep'
plt.style.use(_deep_style)
df1 = pd.DataFrame({'S1': np.random.random(10)})
df2 = pd.DataFrame({'S2': np.random.random(5)})
# Interpolate df2 keeping first and last values and increase length of df2 to len(df1).
# Both series are placed on a normalised [0, 1] axis that includes both end
# points, so df2's first and last samples land exactly on df1's first and last
# positions. A half-open np.arange(0, 1, 1/len) grid would stop at 0.9 for df1
# and 0.8 for df2, which makes np.interp clamp and repeat df2's last value at
# the tail instead of stretching df2 over the full length.
# (np.float is not used: the alias has been removed from NumPy.)
df1_index = np.linspace(0.0, 1.0, num=len(df1))
df2_index = np.linspace(0.0, 1.0, num=len(df2))
df2_on_df1 = np.interp(df1_index, df2_index, df2['S2'])
print(len(df1), len(df2), len(df2_on_df1))
# Draw both curves against the shared normalised x axis so they overlay;
# plotted against their own sample counters the 5-point and 10-point series
# would span different x ranges. No plt.hold is needed: successive plot()
# calls already draw onto the same axes.
plt.plot(df1_index, df2_on_df1, label='S2 interpolated onto df1')
# plt.plot(df1_index, df1['S1'], label='S1')
plt.plot(df2_index, df2['S2'], label='S2 original')
# The curves carry labels so that legend() has entries to display.
plt.legend(loc='upper right')
plt.show()
Upvotes: 2
Views: 2535
Reputation: 3290
You can perform a left join on df1 and then use pandas.Series.interpolate to fill in the NaN values using linear interpolation:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later dropped the old names; fall back to the old name on older releases.
plt.style.use(
    'seaborn-v0_8-deep'
    if 'seaborn-v0_8-deep' in plt.style.available
    else 'seaborn-deep'
)
df1 = pd.DataFrame({'S1': np.random.random(10)})
df2 = pd.DataFrame({'S2': np.random.random(5)})
# Put both frames on a shared normalised index: row i of an n-row frame sits
# at i / n. Computing each label as a single division (rather than
# np.arange with a fractional step) makes positions that coincide
# mathematically, e.g. 2/10 and 1/5, come out as bit-identical floats, which
# the join below relies on. float() keeps this a true division on Python 2
# as well; np.float is not used because NumPy has removed that alias.
df1_index = np.arange(len(df1)) / float(len(df1))
df2_index = np.arange(len(df2)) / float(len(df2))
df1 = df1.set_index(df1_index)
df2 = df2.set_index(df2_index)
# Perform a left join: every df1 row is kept, and S2 is NaN wherever df2 has
# no sample at that index label.
df1 = df1.join(df2, how='left')
print(df1)
S1 S2
0.0 0.931240 0.484878
0.1 0.356905 NaN
0.2 0.729272 0.109443
0.3 0.340487 NaN
0.4 0.876859 0.858113
0.5 0.937151 NaN
0.6 0.551431 0.924410
0.7 0.303283 NaN
0.8 0.550821 0.810445
0.9 0.689091 NaN
# Replace the gaps in S2 by linear interpolation between neighbouring known
# samples so that S2 has a value on every row of S1. As the printed result
# shows, the trailing gap after the last known sample repeats that sample.
s2_filled = df1['S2'].interpolate(method='linear')
df1['S2'] = s2_filled
print(df1)
S1 S2
0.0 0.931240 0.484878
0.1 0.356905 0.297161
0.2 0.729272 0.109443
0.3 0.340487 0.483778
0.4 0.876859 0.858113
0.5 0.937151 0.891262
0.6 0.551431 0.924410
0.7 0.303283 0.867428
0.8 0.550821 0.810445
0.9 0.689091 0.810445
# Plot the data: three stacked panels sharing the same x range.
plt.figure()

# Panel 1: df1 (S1) as line plus sample markers.
plt.subplot(3, 1, 1)
plt.title('df1')
plt.plot(df1.index, df1.S1, c='C0')
plt.scatter(df1.index, df1.S1, c='C0')
plt.xlim(-0.05, 0.95)

# Panel 2: df2 (S2) at its original, coarser sampling. The S2 column is
# passed explicitly instead of the whole DataFrame.
plt.subplot(3, 1, 2)
plt.title('df2')
plt.plot(df2.index, df2.S2, c='C1')
plt.scatter(df2.index, df2.S2, c='C1')
plt.xlim(-0.05, 0.95)

# Panel 3: S2 after interpolation onto df1's index. The markers use df1's
# colour; x comes from df1.index, like the other panels.
plt.subplot(3, 1, 3)
plt.title('df2 on df1')
plt.plot(df1.index, df1.S2, c='C1')
plt.scatter(df1.index, df1.S2, c='C0')
plt.xlim(-0.05, 0.95)

plt.tight_layout()
# Display the figure; without this a non-interactive run shows nothing.
plt.show()
Upvotes: 1