Reputation: 265
I have two sets of x-y data, whose x
values should be merged. To illustrate, the first set looks something like this:
0.5;3.4
0.8;3.8
0.9;1.2
1.3;1.1
1.9;2.3
And the second set like this:
0.3;-0.2
0.8;-0.9
1.0;0.1
1.5;1.2
1.6;6.3
The data is in two separate csv files. I would like to merge both files into one, so that the x
values are in order, and the y
values appear in two columns with their (linearly) interpolated values (y1
and y2
) completed. The second column contains the y
values of the first dataset (plus interpolated values) and the third column the y
values of the second dataset.
0.3;y1;-0.2
0.5;3.4;y2
0.8;3.8;-0.9
0.9;1.2;y2
1.0;y1;0.1
1.3;1.1;y2
1.5;y1;1.2
1.6;y1;6.3
1.9;2.3;y2
My only idea so far would be to read the data into numpy arrays, join them together, sort the values and calculate the average of preceding and following values, in case the value is empty.
Is there a more elegant way to do this in Python?
Edit: Here is my attempt. It works and delivers the result I imagined, although the script is quite long.
#-*- coding: utf-8 -*-
import numpy as np
from matplotlib import pyplot as plt
from scipy.interpolate import interp1d
import csv
# Read data files and turn them into numpy array for further processing
def read_datafile(file_name):
data = np.loadtxt(file_name, delimiter=";")
return data
data1 = read_datafile("testcsv1.csv")
data2 = read_datafile("testcsv2.csv")
# Add empty column at the appropriate position
emptycol1 = np.empty((len(data1), 3))
emptycol1[:] = np.nan
emptycol2 = np.empty((len(data2), 3))
emptycol2[:] = np.nan
emptycol1[:,:-1] = data1
emptycol2[:,[0, 2]] = data2
# Merge and sort the data sets. Create empty array to add final results
merged_temp = np.concatenate((emptycol1, emptycol2))
merged_temp = np.array(sorted(merged_temp, key = lambda x: float(x[0])))
merged = np.empty((1, 3))
# Check for entries where the x values already match. Merge those into one row
i = 0
while i < len(merged_temp)-1:
if merged_temp[i, 0] == merged_temp[i+1, 0]:
newrow = np.array([merged_temp[i, 0], merged_temp[i, 1], merged_temp[i+1, 2]])
merged = np.vstack((merged, newrow))
i += 2
else:
newrow = np.array([merged_temp[i, 0], merged_temp[i, 1], merged_temp[i, 2]])
merged = np.vstack((merged, newrow))
i += 1
# Check for so far undefined values (gaps in the data). Interpolate between them (linearly)
for i in range(len(merged)-1):
# First y column
if np.isnan(merged[i, 1]) == True:
# If only one value is missing (maybe not necessary to separate this case)
if (np.isnan(merged[i-1, 1]) == False) and (np.isnan(merged[i+1, 1]) == False):
merged[i, 1] = (merged[i-1, 1] + merged[i+1, 1])/2
# If two or more values are missing
elif np.isnan(merged[i, 1]) == True:
l = 0
while (np.isnan(merged[i+l, 1]) == True) and (i+l != len(merged)-1):
l += 1
x1 = np.array([i-1, i+l]) # endpoints
x = np.linspace(i, i+l-1, l, endpoint=True) # missing points
y = np.array([merged[i-1, 1], merged[i+l, 1]]) # values at endpoints
f = interp1d(x1, y) # linear interpolation
for k in x:
merged[k, 1] = f(k)
# Second y column
if np.isnan(merged[i, 2]) == True:
# If only one value is missing
if (np.isnan(merged[i-1, 2]) == False) and (np.isnan(merged[i+1, 2]) == False):
merged[i, 2] = (merged[i-1, 2] + merged[i+1, 2])/2
# If two or more values are missing
elif np.isnan(merged[i, 2]) == True:
l = 0
while (np.isnan(merged[i+l, 2]) == True) and (i+l != len(merged)-1):
l += 1
x1 = np.array([i-1, i+l]) # endpoints
x = np.linspace(i, i+l-1, l, endpoint=True) # missing points
y = np.array([merged[i-1, 2], merged[i+l, 2]]) # values at endpoints
f = interp1d(x1, y) # linear interpolation
for k in x:
merged[k, 2] = f(k)
# Remove lines which still have "nan" values (beginning and end). This could be prevented by an extrapolation
merged = merged[~np.isnan(merged).any(axis=1)]
merged = np.delete(merged, (0), axis=0)
# Write table to new csv file in the same directory
with open("testcsv_merged.csv", "w") as mergedfile:
writer = csv.writer(mergedfile)
[writer.writerow(r) for r in merged]
Upvotes: 0
Views: 2058
Reputation: 18668
A one liner : dfi = pd.merge(df1,df2,'outer',0).set_index(0).sort_index().interpolate()
In [383]: dfi
Out[383]:
1_x 1_y
0
0.3 NaN -0.20
0.5 3.40 -0.55
0.8 3.80 -0.90
0.9 1.20 -0.40
1.0 1.15 0.10
1.3 1.10 0.65
1.5 1.50 1.20
1.6 1.90 6.30
1.9 2.30 6.30
A complete pandas version + numpy interpolate for better tuning at the edges :
#df1 = pd.read_clipboard(header=None,sep=';')
#df2 = pd.read_clipboard(header=None,sep=';')
import pylab as pl
df = pd.merge(df1,df2,'outer',0).sort_values(0)
df['y1']=scipy.interpolate.interp1d(*df1.values.T,fill_value='extrapolate')(df[0])
df['y2']=scipy.interpolate.interp1d(*df2.values.T,fill_value='extrapolate')(df[0])
ax=pl.gca()
df1.set_index(0).plot(lw=0,marker='o',ax=ax)
df2.set_index(0).plot(lw=0,marker='o',ax=ax)
df.set_index(0).loc[:,['y1','y2']].plot(ax=ax)
pl.show()
plots :
data :
In [344]: df1
Out[344]:
0 1
0 0.5 3.4
1 0.8 3.8
2 0.9 1.2
3 1.3 1.1
4 1.9 2.3
In [345]: df2
Out[345]:
0 1
0 0.3 -0.2
1 0.8 -0.9
2 1.0 0.1
3 1.5 1.2
4 1.6 6.3
In [346]: df
Out[346]:
0 1_x 1_y y1 y2
5 0.3 NaN -0.2 -20.713281 -0.200000
0 0.5 3.4 NaN 3.400000 -3.021563
1 0.8 3.8 -0.9 3.800000 -0.900000
2 0.9 1.2 NaN 1.200000 -0.092830
6 1.0 NaN 0.1 -0.265527 0.100000
3 1.3 1.1 NaN 1.100000 -1.960323
7 1.5 NaN 1.2 3.760937 1.200000
8 1.6 NaN 6.3 4.701230 6.300000
4 1.9 2.3 NaN 2.300000 44.318059
Upvotes: 1
Reputation: 918
I would use pandas
for this type of processing:
import pandas as pd
#I assumed you have no headers in the data files
df1 = pd.read_csv('./dataset1.txt',sep=';',header=None)
df2 = pd.read_csv('./dataset2.txt',sep=';',header=None)
#Join the datasets using full outer join on the first column in both datasets
df_merged = df1.merge(df2, on=0, how='outer')
#Fill the nulls with the desirable values in this case the average of the column
df_merged['1_x'].fillna(df_merged['1_x'].mean(),inplace=True)
df_merged['1_y'].fillna(df_merged['1_y'].mean(),inplace=True)
Output:
print(df_merged)
0 1_x 1_y
0 0.5 3.4 y2
1 0.8 3.8 -0.9
2 0.9 1.2 y2
3 1.3 1.1 y2
4 1.9 2.3 y2
5 0.3 y1 -0.2
6 1.0 y1 0.1
7 1.5 y1 1.2
8 1.6 y1 6.3
You can change the column names easily:
df_merged.columns = ['col1','col2','col3']
You can also sort values easily using sort_values
method:
df_merged.sort_values('col1')
Finally, you can convert this final DataFrame
into a numpy
array easily using:
import numpy as np
np.array(df_merged)
Upvotes: 1