genjong

Reputation: 137

Code too slow when selecting optimal piecewise regression fit

I'm writing a program that fits a piecewise linear regression with up to 4-5 breakpoints to the data, and then decides how many breakpoints fit best so as to avoid over- and underfitting. However, my code runs extremely slowly because of how clumsily it's written.

A rough draft I have of my code is:

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit, differential_evolution
import matplotlib.pyplot as plt
import warnings


def segmentedRegression_two(xData,yData):

    def func(xVals,break1,break2,slope1,offset1,slope_mid,offset_mid,slope2,offset2):
            returnArray=[]
            for x in xVals:
                if x < break1:
                    returnArray.append(slope1 * x + offset1)
                elif (np.logical_and(x >= break1,x<break2)):
                    returnArray.append(slope_mid * x + offset_mid)
                else:
                    returnArray.append(slope2 * x + offset2)

            return returnArray

    def sumSquaredError(parametersTuple): #Definition of an error function to minimize
        model_y=func(xData,*parametersTuple)
        warnings.filterwarnings("ignore") # Ignore warnings by genetic algorithm

        return np.sum((yData-model_y)**2.0)

    def generate_genetic_Parameters():
            initial_parameters=[]
            x_max=np.max(xData)
            x_min=np.min(xData)
            y_max=np.max(yData)
            y_min=np.min(yData)
            slope=10*(y_max-y_min)/(x_max-x_min)

            initial_parameters.append([x_min,x_max]) #Bounds for the break points
            initial_parameters.append([x_min,x_max])
            initial_parameters.append([-slope,slope]) #Bounds for the slopes
            initial_parameters.append([-y_max,y_max]) #Bounds for the offsets
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])



            result=differential_evolution(sumSquaredError,initial_parameters,seed=3)

            return result.x

    geneticParameters = generate_genetic_Parameters() #Generates genetic parameters



    fittedParameters, pcov= curve_fit(func, xData, yData, geneticParameters) #Fits the data 
    print('Parameters:', fittedParameters)





    model=func(xData,*fittedParameters)

    absError = model - yData

    SE = np.square(absError) 
    MSE = np.mean(SE) 
    RMSE = np.sqrt(MSE) 
    Rsquared = 1.0 - (np.var(absError) / np.var(yData))




    return Rsquared

def segmentedRegression_three(xData,yData):

    def func(xVals,break1,break2,break3,slope1,offset1,slope2,offset2,slope3,offset3,slope4,offset4):
            returnArray=[]
            for x in xVals:
                if x < break1:
                    returnArray.append(slope1 * x + offset1)
                elif (np.logical_and(x >= break1,x<break2)):
                    returnArray.append(slope2 * x + offset2)
                elif (np.logical_and(x >= break2,x<break3)):
                    returnArray.append(slope3 * x + offset3)
                else:
                    returnArray.append(slope4 * x + offset4)

            return returnArray

    def sumSquaredError(parametersTuple): #Definition of an error function to minimize
        model_y=func(xData,*parametersTuple)
        warnings.filterwarnings("ignore") # Ignore warnings by genetic algorithm

        return np.sum((yData-model_y)**2.0)

    def generate_genetic_Parameters():
            initial_parameters=[]
            x_max=np.max(xData)
            x_min=np.min(xData)
            y_max=np.max(yData)
            y_min=np.min(yData)
            slope=10*(y_max-y_min)/(x_max-x_min)

            initial_parameters.append([x_min,x_max]) #Bounds for the break points
            initial_parameters.append([x_min,x_max])
            initial_parameters.append([x_min,x_max])
            initial_parameters.append([-slope,slope]) #Bounds for the slopes
            initial_parameters.append([-y_max,y_max]) #Bounds for the offsets
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])



            result=differential_evolution(sumSquaredError,initial_parameters,seed=3)

            return result.x

    geneticParameters = generate_genetic_Parameters() #Generates genetic parameters



    fittedParameters, pcov= curve_fit(func, xData, yData, geneticParameters) #Fits the data 
    print('Parameters:', fittedParameters)





    model=func(xData,*fittedParameters)

    absError = model - yData

    SE = np.square(absError) 
    MSE = np.mean(SE) 
    RMSE = np.sqrt(MSE) 
    Rsquared = 1.0 - (np.var(absError) / np.var(yData))


    return Rsquared

def segmentedRegression_four(xData,yData):

    def func(xVals,break1,break2,break3,break4,slope1,offset1,slope2,offset2,slope3,offset3,slope4,offset4,slope5,offset5):
            returnArray=[]
            for x in xVals:
                if x < break1:
                    returnArray.append(slope1 * x + offset1)
                elif (np.logical_and(x >= break1,x<break2)):
                    returnArray.append(slope2 * x + offset2)
                elif (np.logical_and(x >= break2,x<break3)):
                    returnArray.append(slope3 * x + offset3)
                elif (np.logical_and(x >= break3,x<break4)):
                    returnArray.append(slope4 * x + offset4)
                else:
                    returnArray.append(slope5 * x + offset5)

            return returnArray

    def sumSquaredError(parametersTuple): #Definition of an error function to minimize
        model_y=func(xData,*parametersTuple)
        warnings.filterwarnings("ignore") # Ignore warnings by genetic algorithm

        return np.sum((yData-model_y)**2.0)

    def generate_genetic_Parameters():
            initial_parameters=[]
            x_max=np.max(xData)
            x_min=np.min(xData)
            y_max=np.max(yData)
            y_min=np.min(yData)
            slope=10*(y_max-y_min)/(x_max-x_min)

            initial_parameters.append([x_min,x_max]) #Bounds for the break points
            initial_parameters.append([x_min,x_max])
            initial_parameters.append([x_min,x_max])
            initial_parameters.append([x_min,x_max])
            initial_parameters.append([-slope,slope]) #Bounds for the slopes
            initial_parameters.append([-y_max,y_max]) #Bounds for the offsets
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])
            initial_parameters.append([-slope,slope])
            initial_parameters.append([-y_max,y_max])

            result=differential_evolution(sumSquaredError,initial_parameters,seed=3)

            return result.x

    geneticParameters = generate_genetic_Parameters() #Generates genetic parameters

    fittedParameters, pcov= curve_fit(func, xData, yData, geneticParameters) #Fits the data
    print('Parameters:', fittedParameters)

    model=func(xData,*fittedParameters)

    absError = model - yData

    SE = np.square(absError)
    MSE = np.mean(SE)
    RMSE = np.sqrt(MSE)
    Rsquared = 1.0 - (np.var(absError) / np.var(yData))

    return Rsquared

And from here, so far I'm thinking of something like this:

r2s=[segmentedRegression_two(xData,yData),segmentedRegression_three(xData,yData),segmentedRegression_four(xData,yData)]

best_fit=np.max(r2s)

Although I may need to use AIC or something.
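If I go the AIC route, something like this sketch is what I have in mind (assuming I change each segmentedRegression_* function to return its sum of squared errors and parameter count instead of R squared; the aic_from_sse helper is just a sketch):

import numpy as np

def aic_from_sse(sse, n_points, n_params):
    # AIC for a least-squares fit: n*ln(SSE/n) + 2k.
    # Lower is better; the 2k term penalizes extra breakpoints.
    return n_points * np.log(sse / n_points) + 2 * n_params

# fits = [segmentedRegression_two(xData, yData),
#         segmentedRegression_three(xData, yData),
#         segmentedRegression_four(xData, yData)]   # each returning (sse, k)
# aics = [aic_from_sse(sse, len(xData), k) for (sse, k) in fits]
# best_index = int(np.argmin(aics))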

Is there some way I can make this run more efficiently?

Upvotes: 0

Views: 103

Answers (1)

hpaulj

Reputation: 231615

I grabbed one of your func definitions and put it in a test script:

import numpy as np

def func(xVals,break1,break2,break3,slope1,offset1,slope2,offset2,slope3,offset3,slope4,offset4):
    returnArray=[]
    for x in xVals:
        if x < break1:
            returnArray.append(slope1 * x + offset1)
        elif (np.logical_and(x >= break1,x<break2)):
            returnArray.append(slope2 * x + offset2)
        elif (np.logical_and(x >= break2,x<break3)):
            returnArray.append(slope3 * x + offset3)
        else:
            returnArray.append(slope4 * x + offset4)

    return returnArray

arr = np.linspace(0,20,10000)
breaks = [4, 10, 15]
slopes = [.1, .2, .3, .4]
offsets = [1,2,3,4]
sl_off = np.array([slopes,offsets]).T.ravel().tolist()
print(sl_off)
ret = func(arr, *breaks, *sl_off)
if len(ret)<25:
    print(ret)

I then took a first step of 'vectorizing', evaluating the function in blocks of values, rather than element by element.

def func1(xVals, breaks, slopes, offsets):
    res = np.zeros(xVals.shape)
    i = 0 
    mask = xVals<breaks[i]
    res[mask] = slopes[i]*xVals[mask]+offsets[i]
    for i in [1,2]:
        mask = np.logical_and(xVals>=breaks[i-1], xVals<breaks[i])
        res[mask] = slopes[i]*xVals[mask]+offsets[i]
    i=3
    mask = xVals>=breaks[i-1]
    res[mask] = slopes[i]*xVals[mask]+offsets[i]
    return res

ret1 = func1(arr, breaks, slopes, offsets)
print(np.allclose(ret, ret1))

The allclose test prints True. I also ran it in ipython, and timed the two versions.

In [41]: timeit func(arr, *breaks, *sl_off)                                                            
66.2 ms ± 337 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [42]: timeit func1(arr, breaks, slopes, offsets)                                                    
165 µs ± 586 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

I also did a plt.plot(arr, ret) to see a simple plot of the function.

I wrote func1 with an eye toward making it work for all 3 of your cases. It's not there yet, but it shouldn't be hard to change it to depend on the length of the input lists (or arrays), as sketched below.
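For example, a minimal sketch of a generalized version (my own naming, assuming numpy imported as np as above, and one more slope/offset pair than breaks):

def func_n(xVals, breaks, slopes, offsets):
    # Generalized piecewise evaluation: pad the break list with -inf/+inf
    # so every segment, including the first and last, uses the same mask logic.
    # Expects len(slopes) == len(offsets) == len(breaks) + 1.
    res = np.zeros(xVals.shape)
    edges = np.concatenate(([-np.inf], breaks, [np.inf]))
    for i in range(len(slopes)):
        mask = np.logical_and(xVals >= edges[i], xVals < edges[i + 1])
        res[mask] = slopes[i] * xVals[mask] + offsets[i]
    return res

ret2 = func_n(arr, breaks, slopes, offsets)
print(np.allclose(ret1, ret2))    # True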

I'm sure more can be done, but this should be a start in the right direction.

There is also a numpy piecewise evaluator:

np.piecewise(x, condlist, funclist, *args, **kw)

but it appears that constructing the two input lists will be just as much work.
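For what it's worth, here's a quick, unbenchmarked sketch of that construction for the 3-break case (reusing arr, breaks, slopes, offsets from the test script above):

condlist = [arr < breaks[0],
            np.logical_and(arr >= breaks[0], arr < breaks[1]),
            np.logical_and(arr >= breaks[1], arr < breaks[2]),
            arr >= breaks[2]]
# i=i pins the loop index in each lambda; otherwise all four would see i==3
funclist = [lambda x, i=i: slopes[i] * x + offsets[i] for i in range(4)]
ret3 = np.piecewise(arr, condlist, funclist)
print(np.allclose(ret1, ret3))    # True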

Upvotes: 1
