Reputation: 137
I'm making a program which fits a piecewise linear regression with up to 4-5 breakpoints in the data, and then deciding how many breakpoints is best to prevent over and underfitting. However, my code is extremely slow to run, due to how ungraceful it is.
A rough draft I have of my code is:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit, differential_evolution
import matplotlib.pyplot as plt
import warnings
def segmentedRegression_two(xData,yData):
def func(xVals,break1,break2,slope1,offset1,slope_mid,offset_mid,slope2,offset2):
returnArray=[]
for x in xVals:
if x < break1:
returnArray.append(slope1 * x + offset1)
elif (np.logical_and(x >= break1,x<break2)):
returnArray.append(slope_mid * x + offset_mid)
else:
returnArray.append(slope2 * x + offset2)
return returnArray
def sumSquaredError(parametersTuple): #Definition of an error function to minimize
model_y=func(xData,*parametersTuple)
warnings.filterwarnings("ignore") # Ignore warnings by genetic algorithm
return np.sum((yData-model_y)**2.0)
def generate_genetic_Parameters():
initial_parameters=[]
x_max=np.max(xData)
x_min=np.min(xData)
y_max=np.max(yData)
y_min=np.min(yData)
slope=10*(y_max-y_min)/(x_max-x_min)
initial_parameters.append([x_max,x_min]) #Bounds for model break point
initial_parameters.append([x_max,x_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([-y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([-y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([y_max,y_min])
result=differential_evolution(sumSquaredError,initial_parameters,seed=3)
return result.x
geneticParameters = generate_genetic_Parameters() #Generates genetic parameters
fittedParameters, pcov= curve_fit(func, xData, yData, geneticParameters) #Fits the data
print('Parameters:', fittedParameters)
model=func(xData,*fittedParameters)
absError = model - yData
SE = np.square(absError)
MSE = np.mean(SE)
RMSE = np.sqrt(MSE)
Rsquared = 1.0 - (np.var(absError) / np.var(yData))
return Rsquared
def segmentedRegression_three(xData,yData):
def func(xVals,break1,break2,break3,slope1,offset1,slope2,offset2,slope3,offset3,slope4,offset4):
returnArray=[]
for x in xVals:
if x < break1:
returnArray.append(slope1 * x + offset1)
elif (np.logical_and(x >= break1,x<break2)):
returnArray.append(slope2 * x + offset2)
elif (np.logical_and(x >= break2,x<break3)):
returnArray.append(slope3 * x + offset3)
else:
returnArray.append(slope4 * x + offset4)
return returnArray
def sumSquaredError(parametersTuple): #Definition of an error function to minimize
model_y=func(xData,*parametersTuple)
warnings.filterwarnings("ignore") # Ignore warnings by genetic algorithm
return np.sum((yData-model_y)**2.0)
def generate_genetic_Parameters():
initial_parameters=[]
x_max=np.max(xData)
x_min=np.min(xData)
y_max=np.max(yData)
y_min=np.min(yData)
slope=10*(y_max-y_min)/(x_max-x_min)
initial_parameters.append([x_max,x_min]) #Bounds for model break point
initial_parameters.append([x_max,x_min])
initial_parameters.append([x_max,x_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([-y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([-y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([y_max,y_min])
result=differential_evolution(sumSquaredError,initial_parameters,seed=3)
return result.x
geneticParameters = generate_genetic_Parameters() #Generates genetic parameters
fittedParameters, pcov= curve_fit(func, xData, yData, geneticParameters) #Fits the data
print('Parameters:', fittedParameters)
model=func(xData,*fittedParameters)
absError = model - yData
SE = np.square(absError)
MSE = np.mean(SE)
RMSE = np.sqrt(MSE)
Rsquared = 1.0 - (np.var(absError) / np.var(yData))
return Rsquared
def segmentedRegression_four(xData,yData):
def func(xVals,break1,break2,break3,break4,slope1,offset1,slope2,offset2,slope3,offset3,slope4,offset4,slope5,offset5):
returnArray=[]
for x in xVals:
if x < break1:
returnArray.append(slope1 * x + offset1)
elif (np.logical_and(x >= break1,x<break2)):
returnArray.append(slope2 * x + offset2)
elif (np.logical_and(x >= break2,x<break3)):
returnArray.append(slope3 * x + offset3)
elif (np.logical_and(x >= break3,x<break4)):
returnArray.append(slope4 * x + offset4)
else:
returnArray.append(slope5 * x + offset5)
return returnArray
def sumSquaredError(parametersTuple): #Definition of an error function to minimize
model_y=func(xData,*parametersTuple)
warnings.filterwarnings("ignore") # Ignore warnings by genetic algorithm
return np.sum((yData-model_y)**2.0)
def generate_genetic_Parameters():
initial_parameters=[]
x_max=np.max(xData)
x_min=np.min(xData)
y_max=np.max(yData)
y_min=np.min(yData)
slope=10*(y_max-y_min)/(x_max-x_min)
initial_parameters.append([x_max,x_min]) #Bounds for model break point
initial_parameters.append([x_max,x_min])
initial_parameters.append([x_max,x_min])
initial_parameters.append([x_max,x_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([-y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([-y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([y_max,y_min])
initial_parameters.append([-slope,slope])
initial_parameters.append([y_max,y_min])
result=differential_evolution(sumSquaredError,initial_parameters,seed=3)
return result.x
geneticParameters = generate_genetic_Parameters() #Generates genetic parameters
fittedParameters, pcov= curve_fit(func, xData, yData, geneticParameters) #Fits the data
print('Parameters:', fittedParameters)
model=func(xData,*fittedParameters)
absError = model - yData
SE = np.square(absError)
MSE = np.mean(SE)
RMSE = np.sqrt(MSE)
Rsquared = 1.0 - (np.var(absError) / np.var(yData))
return Rsquared
And from here, so far thinking something like this:
r2s=[segmentedRegression_two(xData,yData),segmentedRegression_three(xData,yData),segmentedRegression_four(xData,yData)]
best_fit=np.max(r2s)
Although I may need to use AIC or something.
Is there some way I can make this more efficient at running?
Upvotes: 0
Views: 103
Reputation: 231615
I grabbed one of your func
, and put it in a test script:
import numpy as np
def func(xVals,break1,break2,break3,slope1,offset1,slope2,offset2,slope3,offset3,slope4,offset4):
returnArray=[]
for x in xVals:
if x < break1:
returnArray.append(slope1 * x + offset1)
elif (np.logical_and(x >= break1,x<break2)):
returnArray.append(slope2 * x + offset2)
elif (np.logical_and(x >= break2,x<break3)):
returnArray.append(slope3 * x + offset3)
else:
returnArray.append(slope4 * x + offset4)
return returnArray
arr = np.linspace(0,20,10000)
breaks = [4, 10, 15]
slopes = [.1, .2, .3, .4]
offsets = [1,2,3,4]
sl_off = np.array([slopes,offsets]).T.ravel().tolist()
print(sl_off)
ret = func(arr, *breaks, *sl_off)
if len(ret)<25:
print(ret)
I then took a first step of 'vectorizing', evaluating the function in blocks of values, rather than element by element.
def func1(xVals, breaks, slopes, offsets):
res = np.zeros(xVals.shape)
i = 0
mask = xVals<breaks[i]
res[mask] = slopes[i]*xVals[mask]+offsets[i]
for i in [1,2]:
mask = np.logical_and(xVals>=breaks[i-1], xVals<breaks[i])
res[mask] = slopes[i]*xVals[mask]+offsets[i]
i=3
mask = xVals>=breaks[i-1]
res[mask] = slopes[i]*xVals[mask]+offsets[i]
return res
ret1 = func1(arr, breaks, slopes, offsets)
print(np.allclose(ret, ret1))
The allclose
test prints True
. I also ran
it in ipython
, and timed the two versions.
In [41]: timeit func(arr, *breaks, *sl_off)
66.2 ms ± 337 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [42]: timeit func1(arr, breaks, slopes, offsets)
165 µs ± 586 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
I also did a plt.plot(xVals, ret)
to see a simple plot of the function.
I wrote func1
with an eye toward making it work for all 3 of your cases. It's not there, but shouldn't be hard to change to depend on the length of the input lists (or arrays).
I'm sure more can be done, but this should be start in the right direction.
There is also a numpy
piecewise
evaluator:
np.piecewise(x, condlist, funclist, *args, **kw)
but I it appears that constructing the two input lists will be just as much work.
Upvotes: 1