Reputation: 131
I'm trying to use GridSearchCV to search over specified parameters, scoring with negative log loss:
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = 'neg_log_loss', cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
ValueError: y_true contains only one label (1.0). Please provide the true labels explicitly through the labels argument.
The same code works fine when scoring with accuracy. I found that for log loss the labels need to be specified explicitly, which works when using sklearn.metrics directly:
y_labels = np.unique(y_true)
y_labels
array([0., 1., 2.])
metrics.log_loss(y_true, y_pred, labels = y_labels)
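To make the problem concrete, here's a minimal standalone sketch (with made-up toy arrays, not my actual data) of why the labels argument matters: if a fold's y_true happens to contain only one class, log_loss can't infer the full label set on its own:
import numpy as np
from sklearn import metrics

# Toy fold where the true labels happen to contain only class 1.0,
# but the classifier still returns probabilities for all three classes.
y_true_fold = np.array([1.0, 1.0, 1.0])
y_pred_fold = np.array([[0.2, 0.7, 0.1],
                        [0.1, 0.8, 0.1],
                        [0.3, 0.5, 0.2]])

# metrics.log_loss(y_true_fold, y_pred_fold)  # raises the same ValueError as above
metrics.log_loss(y_true_fold, y_pred_fold, labels=[0.0, 1.0, 2.0])  # fine once labels are given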
So I tried:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
ValueError: not enough values to unpack (expected 2, got 1)
I've tried quite a few variations of the above, and also tried creating my own scorer with:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
But everything I try comes down to one of the two errors above. Obviously I'm missing something, so any help is much appreciated.
Update:
Made a small mistake in the above - this is a three-class problem, not a binary problem as I first implied.
I've tried Ben's suggestion (thanks!):
LogLoss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(order_inner_x, y_inner, groups=names_inner)
I'm getting a different error, so hopefully that's one step closer. Here's the full traceback:
ValueError Traceback (most recent call last)
<ipython-input-164-43d9f1633dc9> in <module>
2
3 grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
----> 4 grid.fit(order_inner_x, y_inner, groups=names_inner)
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
133 ' but need classifier with two'
134 ' classes for {} scoring'.format(
--> 135 y_pred.shape, self._score_func.__name__))
136 if sample_weight is not None:
137 return self._sign * self._score_func(y, y_pred,
ValueError: got predict_proba of shape (200, 3), but need classifier with two classes for log_loss scoring
Upvotes: 0
Views: 836
Reputation: 12748
You're most of the way there: you need to provide the labels to your metric. In this attempt:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
you pass the labels, but to the grid search's fit method rather than to the scorer itself. make_scorer allows other keyword arguments to be passed on to the metric function, so this should work:
from sklearn.metrics import log_loss, make_scorer

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid=spec_params, scoring=LogLoss, cv=logo, verbose=10)
grid.fit(X, y_true, groups=names)
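For completeness, here's a rough, self-contained sketch of the same pattern on synthetic three-class data; the pipeline, parameter grid, and groups below are only stand-ins for your spec_pipeline, spec_params, logo, and names, so adjust them to your setup:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic three-class data plus a grouping variable for LeaveOneGroupOut.
X, y = make_classification(n_samples=300, n_classes=3, n_informative=5, random_state=0)
groups = np.repeat(np.arange(10), 30)  # 10 groups of 30 samples

pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
params = {'logisticregression__C': [0.1, 1.0, 10.0]}

# Bake the full label set into the scorer so folds missing a class still score correctly.
scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])

grid = GridSearchCV(pipeline, param_grid=params, scoring=scorer, cv=LeaveOneGroupOut(), verbose=10)
grid.fit(X, y, groups=groups)
print(grid.best_params_, grid.best_score_)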
Upvotes: 3