Reputation: 832
I have a customized estimator object in Python (`mkl_regressor`). One of the learning parameters of this object is a `numpy.array` of floats. Usually sklearn estimator objects are tuned through single-valued parameters, like the `C` of an SVM. Thus, for each parameter, the `RandomizedSearchCV` search object takes either a list of values or a distribution from which to draw a value (in my example, `scipy.stats.expon`). I have tried to pass a list of distributions, but I had no success, because `RandomizedSearchCV` does not sample from the elements of a list of distributions. This is what I tried:
from modshogun import *
import Gnuplot, Gnuplot.funcutils
from numpy import *
from sklearn.metrics import r2_score
class mkl_regressor(object):
    """Multiple-kernel-learning regressor built on Shogun's ``MKLRegression``.

    The combined kernel holds one Gaussian kernel per entry of ``widths``
    plus one polynomial kernel.  The class implements the small estimator
    protocol (``fit`` / ``predict`` / ``score`` / ``get_params`` /
    ``set_params``) that scikit-learn search objects rely on.

    Parameters
    ----------
    widths : sequence of float
        Width handed to ``GaussianKernel.set_width`` for each Gaussian
        sub-kernel.  Every entry must already be a float.
    kernel_weights : sequence of float
        Initial value only; ``fit`` overwrites it with the result of
        ``get_subkernel_weights()``.
    svm_c : float
        Passed as both arguments of ``MKLRegression.set_C``.
    mkl_c : float
        Passed to ``MKLRegression.set_C_mkl``.
    svm_norm : int
        Stored on the instance; not consumed by the code in this copy.
    mkl_norm : int
        Passed to ``MKLRegression.set_mkl_norm``.
    degree : int
        Degree of the polynomial sub-kernel.
    """

    # Constructor arguments, i.e. the only names get_params may report.
    _PARAM_NAMES = ('widths', 'kernel_weights', 'svm_c', 'mkl_c',
                    'svm_norm', 'mkl_norm', 'degree')

    def __init__(self, widths = [0.01, 0.1, 1.0, 10.0, 50.0, 100.0], kernel_weights = [0.01, 0.1, 1.0,], svm_c = 0.01, mkl_c = 1.0, svm_norm = 1, mkl_norm = 1, degree = 2):
        # The defaults are lists, but they are only read or rebound on the
        # instance, never mutated in place.
        self.svm_c = svm_c
        self.mkl_c = mkl_c
        self.svm_norm = svm_norm
        # These were fused into "self.mkl_norm = mkl_norm = degree", which
        # stored ``degree`` as the norm and never set ``self.degree``.
        self.mkl_norm = mkl_norm
        self.degree = degree
        self.widths = widths
        self.kernel_weights = kernel_weights

    def fit(self, X, y, **params):
        """Build the combined kernel from ``X`` and train the MKL machine.

        ``X`` is transposed before being wrapped in ``RealFeatures`` and
        ``y`` is flattened to one dimension.  Extra keyword arguments are
        set as attributes before training.  Returns ``self``.
        """
        for parameter, value in params.items():
            setattr(self, parameter, value)

        self.feats_train = RealFeatures(X.T)
        labels_train = RegressionLabels(y.reshape((len(y), )))

        self._kernels_ = CombinedKernel()
        # Loop body restored from the traceback quoted with this code.
        for width in self.widths:
            kernel = GaussianKernel()
            kernel.set_width(width)
            kernel.init(self.feats_train, self.feats_train)
            self._kernels_.append_kernel(kernel)
            del kernel

        # NOTE(review): this call was truncated to "PolyKernel(10," in this
        # copy; the second argument and the append are reconstructed from
        # the otherwise unused ``degree`` parameter -- confirm.
        kernel = PolyKernel(10, self.degree)
        self._kernels_.append_kernel(kernel)
        del kernel

        self._kernels_.init(self.feats_train, self.feats_train)

        binary_svm_solver = SVRLight()
        self.mkl = MKLRegression(binary_svm_solver)
        self.mkl.set_C(self.svm_c, self.svm_c)
        # NOTE(review): the remaining setup calls are absent from this copy
        # and reconstructed; without them the machine has no kernel or
        # labels and is never trained.  ``svm_norm`` stays unwired because
        # nothing here shows where it belongs -- confirm.
        self.mkl.set_C_mkl(self.mkl_c)
        self.mkl.set_mkl_norm(self.mkl_norm)
        self.mkl.set_kernel(self._kernels_)
        self.mkl.set_labels(labels_train)
        self.mkl.train()

        self.kernel_weights = self._kernels_.get_subkernel_weights()
        return self

    def predict(self, X):
        """Return the regression outputs of the trained machine for ``X``."""
        self.feats_test = RealFeatures(X.T)
        # Re-initialise the kernel between training and test features.
        self._kernels_.init(self.feats_train, self.feats_test)
        self.mkl.set_kernel(self._kernels_)
        return self.mkl.apply_regression().get_labels()

    def set_params(self, **params):
        """Set each keyword argument as an attribute and return ``self``."""
        for parameter, value in params.items():
            setattr(self, parameter, value)
        return self

    def get_params(self, deep=False):
        """Return the constructor parameters as a dict.

        Only constructor arguments are reported.  Scanning ``dir(self)``
        also returned fitted state (``feats_train``, ``mkl``, ...) after
        ``fit``, which the constructor cannot accept when the estimator is
        rebuilt from its parameters.  ``deep`` is accepted and ignored.
        """
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def score(self, X_t, y_t):
        """Return the R^2 of the predictions for ``X_t`` against ``y_t``."""
        predicted = self.predict(X_t)
        # r2_score expects (y_true, y_pred); R^2 is not symmetric.
        return r2_score(y_t, predicted)
if __name__ == "__main__":
    # RandomizedSearchCV moved to sklearn.model_selection in scikit-learn
    # 0.18; fall back to the old module for older installations.
    try:
        from sklearn.model_selection import RandomizedSearchCV as RS
    except ImportError:
        from sklearn.grid_search import RandomizedSearchCV as RS
    from scipy.stats import randint as sp_randint
    from scipy.stats import expon

    # Toy training set: six samples with three features each.
    labels = array([2.0,0.0,2.0,1.0,3.0,2.0])
    labels = labels.reshape((len(labels), 1))
    data = array([[1.0,2.0,3.0],[1.0,2.0,9.0],[1.0,2.0,3.0],[1.0,2.0,0.0],[0.0,2.0,3.0],[1.0,2.0,3.0]])

    # Toy test set: three samples.
    labels_t = array([1.,3.,4])
    labels_t = labels_t.reshape((len(labels_t), 1))
    data_t = array([[20.0,30.0,40.0],[10.0,20.0,30.0],[10.0,20.0,40.0]])

    k = 3  # number of cross-validation folds

    param_grid = [ {'svm_c': expon(scale=100, loc=5),
                    'mkl_c': expon(scale=100, loc=5),
                    'degree': sp_randint(0, 32),
                    # Working variant: a list of candidate arrays, one of
                    # which is picked per iteration.
                    #'widths': [array([4.0,6.0,8.9,3.0]), array([4.0,6.0,8.9,3.0,2.0, 3.0, 4.0]), array( [100.0, 200.0, 300.0, 400.0])],
                    # Failing attempt shown in the question: a list is
                    # treated as a set of candidate values, so the
                    # estimator receives the distribution objects
                    # themselves instead of floats.
                    'widths': [[expon, expon]] } ]

    mkl = mkl_regressor()
    rs = RS(mkl, param_distributions = param_grid[0], n_iter = 10, n_jobs = 24, cv = k)#, scoring="r2", verbose=True)
    # The search has to be fitted before predict/score/best_params_ exist.
    rs.fit(data, labels)

    preds = rs.predict(data_t)
    # %-formatting prints the same text under Python 2 and Python 3.
    print("R^2:  %s" % (rs.score(data_t, labels_t),))
    print("Parameters:  %s" % (rs.best_params_,))
The above code works well when numpy arrays are passed as the elements of the list `'widths'` in the dictionary of parameters. However, when I try to pass a list of distributions, the `RandomizedSearchCV` object does not behave as desired:
/home/ignacio/distributionalSemanticStabilityThesis/ in fit(self=<__main__.mkl_regressor instance>, X=array([[ 1., 2., 3.],
[ 1., 2., 0.],
[ 0., 2., 3.],
[ 1., 2., 3.]]), y=array([[ 2.],
[ 1.],
[ 3.],
[ 2.]]), **params={})
24 self.feats_train = RealFeatures(X.T)
25 labels_train = RegressionLabels(y.reshape((len(y), )))
26 self._kernels_ = CombinedKernel()
27 for width in self.widths:
28 kernel = GaussianKernel()
---> 29 kernel.set_width(width)
kernel.set_width = <built-in method set_width of GaussianKernel object>
width = <scipy.stats._continuous_distns.expon_gen object>
30 kernel.init(self.feats_train,self.feats_train)
31 self._kernels_.append_kernel(kernel)
32 del kernel
TypeError: in method 'GaussianKernel_set_width', argument 2 of type 'float64_t'
I wouldn't like to force the estimator to execute each distribution generator itself, because in that case `RandomizedSearchCV` wouldn't have control over the values used.
Any suggestions? Thank you.
Upvotes: 2
Views: 1970
Reputation: 832
The solution @bpachev suggested worked for me. The distribution class:
class expon_vector(stats.rv_continuous):
    """Sampler that returns a variable-length vector of exponential draws.

    Each call to ``rvs`` first draws a length from
    ``randint(min_size, max_size)`` and then returns that many samples
    from ``expon(loc, scale)``.  It is meant to be passed as a single
    parameter distribution to a randomized parameter search.

    Parameters
    ----------
    loc, scale : float
        Forwarded to ``expon.rvs``.
    min_size, max_size : int
        Forwarded to ``randint.rvs`` as ``low`` and ``high``.
    """

    def __init__(self, loc = 1.0, scale = 50.0, min_size=2, max_size = 10):
        self.loc = loc
        self.scale = scale
        self.min_size = min_size
        self.max_size = max_size
        self.size = max_size - min_size # Only for initialization

    def rvs(self, random_state=None):
        """Return one random vector.

        ``random_state`` is optional and is forwarded to both underlying
        draws, so callers that pass it (as scikit-learn's sampler does) no
        longer hit a ``TypeError`` and sampling can be made reproducible.
        The drawn length is also stored in ``self.size``.
        """
        # Forward the keyword only when given, so a plain rvs() call makes
        # exactly the same scipy calls as before.
        extra = {} if random_state is None else {'random_state': random_state}
        self.size = int(randint.rvs(low = self.min_size,
                                    high = self.max_size, **extra))
        return expon.rvs(loc = self.loc, scale = self.scale,
                         size = self.size, **extra)
Which is included in the dictionary of parameters for the customized estimator I'm using:
# Search space for the randomized search: scalar hyper-parameters use
# scipy distributions, while 'widths' uses the vector-valued sampler.
param_grid = [
    {
        'svm_c': expon(loc=5, scale=100),
        'mkl_c': expon(loc=5, scale=100),
        'degree': sp_randint(0, 24),
        'widths': expon_vector(
            loc=0.1,
            scale=100.0,
            min_size=2,
            max_size=10,
        ),
    },
]
Upvotes: 2
Reputation: 2212
RandomizedSearchCV can take either a list of parameter values to try or a distribution object with an rvs method for sampling. If you pass it a list, it will assume you passed a discrete set of parameter values to sample from. It does not support a list of distributions for a single parameter. If existing distributions don't suit your needs, make a custom one.
If you need a distribution that returns an array, simply create a class that has an rvs() method to return a random sample and pass an instance of that instead of a list of single-variate distributions.
Upvotes: 2