Reputation: 45
First of all, I would like to say that I am new to python and this code has been created alonside advice and suggestions from users on stackoverflow. The code is shown below:
f = open('E:\Python27\WASP DATA\Sample Data.txt',"r")
num=0
line = f.readlines()
X = []
for n, lines in enumerate(line, 0): #6621
# make it 109 to remove the first line "['# Column 3: Magnitude error\n']"
if (n > 109):
linSplit = lines.split(' ')
joined = ' '.join(linSplit)
# apply the float function to every item in joined.split
# create a new list of floats in tmp variable
tmp = map((lambda x: float(x)), joined.split())
X.append(tmp)
#print X[0] # print first element in the list
Period_1 = float(line[28][23:31])
Epoch_1 = float(line[27][22:31])
Period_2 = float(line[44][23:31])
Epoch_2 = float(line[43][22:31])
#Period_3 = float(line[60][23:31])
#Epoch_3 = float(line[59][22:31])
#Period_4 = float(line[76][23:31])
#Epoch_4 = float(line[75][22:31])
#Period_5 = float(line[108][23:31])
#Epoch_5 = float(line[91][22:31])
print("The time periods are:")
print Period_1
print Period_2
#print Period_3
#print Period_4
#print Period_5
print("\nThe Epoch times are:")
print Epoch_1
print Epoch_2
#print Epoch_3
#print Epoch_4
#print Epoch_5
print('respectively.')
P = []
phase_var = float
for j in range(0,len(X),1):
phase_var = (X[j][0] + (10*Period_1) - Epoch_1)/Period_1
P.append(phase_var)
print P[0]
for m in range(0,len(P),1):
P[m]=float(P[m]-int(P[m]))
#print P[0]
Mag = []
for n in range(0,len(X),1):
temp = X[n][1]
Mag.append(temp)
#print Mag[0]
#print X[0]
from pylab import *
#Plotting the first scatter diagram to see if data is phased correctly.
#plot(P, Mag)
scatter(P, Mag)
xlabel('Phase (Periods)')
ylabel('Magnitude')
#title('Dunno yet')
grid(True)
savefig("test.png")
show()
#Bin the data to create graph where magnitudes are averaged, and B lets us mess around with the binning resolution, and reducing effect of extraneous data points.
B = 2050
minv = min(P)
maxv = max(P)
bincounts = []
for i in range(B+1):
bincounts.append(0)
for d in P:
b = int((d - minv) / (maxv - minv) * B)
bincounts[b] += 1
# plot new scatter
scatter(bincounts, Mag)
show()
The original graph is the scatter plot of P and Mag. However there are multiple Mag points for each period time. I am hoping to try to create a new scatter in which I can take all these Y values and average them for each individual X value, therefore creating a tighter graph which has two dips.
I have tried looking at various ways of binning the data, however no matter which method I use, the graph containing the binned data does not seem to display correctly. The X values should run from 0 to 1 like on the pre binned data graph.
This is the data with which I am working, just incase you need to see it.
Can anyone offer any suggestions or advice on how to created the binned data graph? My knowledge of data binning is quite minimal.
Thank you for your time!
Upvotes: 1
Views: 5649
Reputation: 19760
This actually solves a number of problems, not only the binning part. I've included code to parse the blocks at the beginning of the data file, so you can get all the peak data
import numpy
import re
import matplotlib.pyplot as plt
f = open('sample_data.txt')
f.next()
pair = re.compile(r'# (.*?)[ \t]*:[ \t]*([0-9e\.-]+).*')
blocks = []
block = {}
blocks.append(block)
for line in f:
if line[0] <> '#':
blocks.append(block)
break
line = line.strip()
m = pair.match(line)
if m:
print line
key, valstr = m.groups()
print key, valstr
try:
value = float(valstr)
except:
value = valstr
block[key] = value
if (line == "#") and len(block) > 0:
blocks.append(block)
block = {}
peaks = sorted([block for block in blocks if 'PEAK' in block],
key=lambda b: b['PEAK'])
print peaks
colnames = ['HJD', 'Tamuz-corrected magnitude', 'Magnitude error']
data = numpy.loadtxt(f, [(colname, 'float64') for colname in colnames])
Nbins = 50
for peak in peaks:
plt.figure()
phase, _ = numpy.modf((data['HJD'] + 10*peak['Period (days)'] - peak['Epoch'])/peak['Period (days)'])
mag = data['Tamuz-corrected magnitude']
# use numpy.histogram to calculate the sum and the number of points in the bins
sums, _ = numpy.histogram(phase, bins=Nbins, weights=mag)
N, bin_edges = numpy.histogram(phase, bins=Nbins)
# We'll plot the value at the center of each bin
centers = (bin_edges[:-1] + bin_edges[1:])/2
plt.scatter(phase, mag, alpha=0.2)
plt.plot(centers, sums/N, color='red', linewidth=2)
plt.show()
Upvotes: 2
Reputation: 7159
Full code: http://pastebin.com/4aBjZC7Q
Here's the snippet doing the binning:
x = [] # Column 1: HJD
y = [] # Column 2: Tamuz-corrected magnitude
# code to read "sample_data.txt" into lists x and y
# the full code in http://pastebin.com/4aBjZC7Q includes this part as well
import numpy as np
# these will characterize the bins
startBinValue = 5060
endBinValue = 5176
binSize = 0.1
# numpy.arange() will generate the bins of size binSize in between the limiting values
bins = np.arange(startBinValue, endBinValue, binSize)
# numpy.digitize() will "bin" the values; i.e. x_binned[i] will give the bin number of value in i-th index
x_binned = np.digitize(x, bins)
y_numpyArray = np.array(y)
# There is quite a bit of jugglery here.
# x_binned==i gives a boolean of size x_binned
# y_numpyArray[x_binned==i] returns the elements of y_numpyArray where the boolean is true
# The len() check is to make sure that mean() is not called for an empty array (which results in NAN
y_means = np.array([
y_numpyArray[x_binned == i].mean()
if len(y_numpyArray[x_binned == i]) > 0
else 0
for i in range(1, len(bins))])
# binnedData is a list of tuples; tuple elements are bin's-low-limit, bin's-high-limit, y-mean value
binnedData = [(bins[i], bins[i + 1], y_means[i]) for i in range(len(y_means))]
I have commented the code heavily. But to understand the full functionality of the numpy functions, please refer to numpy.digitize(), numpy.arange(). I used numpy.arange() to create the bins assuming you know the bin sizes beforehand, but in case you need a fixed number of bins (say 100 bins for the x-data), use numpy.linspace() instead.
Upvotes: 1