Reputation: 51
I am working on a customized estimator that should be passed in gridsearchcv function in sklearn. I have created the estimator now but run into a memory error. In the following code, you will see some constants like 'KxRange[0]' or arrays like retain_rate. They are just pre-defined with some random values in them. Here is my code:
# sklearn grid search
from sklearn.model_selection import GridSearchCV
# import the base estimator
from sklearn.base import BaseEstimator, RegressorMixin
# define my own estimator
class MyEstimator(BaseEstimator,RegressorMixin):
# define constructor
# possible tau: int/float
# other parameters: array of int/floats, length 9
def __init__(self, tau=0, \
K1=K1Range[0], K2=K2Range[0], K3=K3Range[0], K4=K4Range[0], K5=K5Range[0], K6=K6Range[0], K7=K7Range[0], K8=K8Range[0], K9=K9Range[0], \
S1=0, S2=0, S3=0, S4=0, S5=0, S6=0, S7=0, S8=0, S9=0, \
alpha1=1, alpha2=1, alpha3=1, alpha4=1, alpha5=1, alpha6=1, alpha7=1, alpha8=1, alpha9=1, \
beta1=1, beta2=1, beta3=1, beta4=1, beta5=1, beta6=1, beta7=1, beta8=1, beta9=1):
# initialize parameters
self.tau = tau
self.K1 = K1
self.K2 = K2
self.K3 = K3
self.K4 = K4
self.K5 = K5
self.K6 = K6
self.K7 = K7
self.K8 = K8
self.K9 = K9
self.S1 = S1
self.S2 = S2
self.S3 = S3
self.S4 = S4
self.S5 = S5
self.S6 = S6
self.S7 = S7
self.S8 = S8
self.S9 = S9
self.alpha1 = alpha1
self.alpha2 = alpha2
self.alpha3 = alpha3
self.alpha4 = alpha4
self.alpha5 = alpha5
self.alpha6 = alpha6
self.alpha7 = alpha7
self.alpha8 = alpha8
self.alpha9 = alpha9
self.beta1 = beta1
self.beta2 = beta2
self.beta3 = beta3
self.beta4 = beta4
self.beta5 = beta5
self.beta6 = beta6
self.beta7 = beta7
self.beta8 = beta8
self.beta9 = beta9
# to fit the model
def fit(self, X,y=None):
# define the mu vector
self.mu_ = np.ones((N))
# define lag weights
lag_weights = np.ones((max_lag))
# define retain_rate
retain_rate = np.array([alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7, alpha8, alpha9])
# define cum_effect, set to a random value
cum_effect = 1
# define cum_effects_hill
cum_effects_hill = np.ones((N, num_media))
# parameter transformation
for nn in range(N):
for m in range(num_media):
for l in range(max_lag):
lag_weights[l] = retain_rate[m]**l
cum_effect = Adstock(X[nn][m], lag_weights)
cum_effects_hill[nn][m] = Hill(cum_effect, ec[m], slope[m])
self.mu_[nn] = tau + np.dot(cum_effects_hill[nn], beta_medias)
return self
# the predict function
def predict(self, X, y=None):
# try to get the mu_ argument. If it does not exist, we throw an error
try:
getattr(self, "mu_")
except AttributeError:
raise RuntimeError("You must train classifer before predicting data!")
return self.mu_
# the score function
def score(self, X, y):
# calculate the MSE
return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
The following is like the "main" function
# initiliaze estimator
t = MyEstimator()
# parameter grid
# tau
param_grid = {'tau': [100,200], \
# K
'K1': [K1Range[0], K1Range[1]], 'K2' : [K2Range[0], K2Range[1]], 'K3': [K3Range[0], K3Range[1]], 'K4' : [K4Range[0], K4Range[1]], 'K5' : [K5Range[0], K5Range[1]], 'K6' : [K6Range[0], K6Range[1]], 'K7' : [K7Range[0], K7Range[1]], 'K8': [K8Range[0], K8Range[1]], 'K9': [K9Range[0], K9Range[1]], \
# S
'S1': [1, 100], 'S2': [1, 100], 'S3': [1, 100], 'S4': [1, 100], 'S5': [1, 100], 'S6': [1, 100], 'S7': [1, 100], 'S8': [1, 100], 'S9': [1, 100], \
# alpha
'alpha1': [0.1, 0.5], 'alpha2': [0.1, 0.5], 'alpha3': [0.1, 0.5], 'alpha4': [0.1, 0.5], 'alpha5': [0.1, 0.5], 'alpha6': [0.1, 0.5], 'alpha7': [0.1, 0.5], 'alpha8': [0.1, 0.5], 'alpha9': [0.1, 0.5], \
# beta
'beta1': [100,200], 'beta2': [100,200], 'beta3': [100,200], 'beta4': [100,200], 'beta5': [100,200], 'beta6': [100,200], 'beta7': [100,200], 'beta8': [100,200], 'beta9': [100,200]}
#
clf = GridSearchCV(t, param_grid)
clf.fit(X_media, actual_sales)
#clf.predict(X_media)
This is error message:
MemoryError Traceback (most recent call last)
<ipython-input-22-de0388db8453> in <module>
14 #
15 clf = GridSearchCV(t, param_grid)
---> 16 clf.fit(X_media, actual_sales)
17 #clf.predict(X_media)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
698
699 def evaluate_candidates(candidate_params):
--> 700 candidate_params = list(candidate_params)
701 n_candidates = len(candidate_params)
702
MemoryError:
Can anyone tell me how to fix this error? Or is there any problem with my code? Thanks!
Upvotes: 4
Views: 3175
Reputation: 12602
The traceback shows that the grid search runs out of memory already at trying to generate the list of candidates, i.e. the grid. You apparently have 37 parameters, each with two possible values, so the number of candidates is 2^(37)
, more than 137 billion. You really probably don't want to try that many candidates anyway, so maybe a RandomizedSearchCV
is more appropriate?
Upvotes: 2