Reputation: 2314
Using the caret package, I am having trouble getting the following user defined summary function to work. It is supposed to calculate the logloss, but I keep getting that logloss is not found. Below, a reproducible example:
data <- data.frame('target' = sample(c('Y','N'),100,replace = T), 'X1' = runif(100), 'X2' = runif(100))
log.loss2 <- function(data, lev = NULL, model = NULL) {
logloss = -sum(data$obs*log(data$Y) + (1-data$obs)*log(1-data$Y))/length(data$obs)
names(logloss) <- c('LL')
logloss
}
fitControl <- trainControl(method="cv",number=1, classProbs = T, summaryFunction = log.loss2)
my.grid <- expand.grid(.decay = c(0.05), .size = c(2))
fit.nnet2 <- train(target ~., data = data,
method = "nnet", maxit = 500, metric = 'LL',
tuneGrid = my.grid, verbose = T)
Upvotes: 1
Views: 2030
Reputation: 19716
The error was due to the fact you did not include trControl = fitControl
in the call to train. However that would bring you to another error that is due to the fact data$obs
and data$pred
are factors - one needs to convert to numeric which gives 1
or 2
, subtracting 1
gives desired 0
and 1
log.loss2 <- function(data, lev = NULL, model = NULL) {
data$pred <- as.numeric(data$pred)-1
data$obs <- as.numeric(data$obs)-1
logloss = -sum(data$obs*log(data$Y) + (1-data$obs)*log(1-data$Y))/length(data$obs)
names(logloss) <- c('LL')
logloss
}
fitControl <- trainControl(method="cv",number=1, classProbs = T, summaryFunction = log.loss2)
fit.nnet2 <- train(target ~., data = data,
method = "nnet", maxit = 500, metric = "LL" ,
tuneGrid = my.grid, verbose = T, trControl = fitControl,
maximize = FALSE)
#output
Neural Network
100 samples
2 predictor
2 classes: 'N', 'Y'
No pre-processing
Resampling: Cross-Validated (1 fold)
Summary of sample sizes: 0
Resampling results:
LL
0.6931472
Tuning parameter 'size' was held constant at a value of 2
Tuning parameter 'decay' was held constant at a value of 0.05
Several things to note:
this loss function will work only with data containing N
/Y
as classes because probability is defined as data$Y
, a better approach is to find the name of the class and use that. Additionally its good practice to truncate the probability values since log(0)
is not a good idea:
LogLoss <- function (data, lev = NULL, model = NULL)
{
obs <- data[, "obs"]
cls <- levels(obs) #find class names
probs <- data[, cls[2]] #use second class name
probs <- pmax(pmin(as.numeric(probs), 1 - 1e-15), 1e-15) #bound probability
logPreds <- log(probs)
log1Preds <- log(1 - probs)
real <- (as.numeric(data$obs) - 1)
out <- c(mean(real * logPreds + (1 - real) * log1Preds)) * -1
names(out) <- c("LogLoss")
out
}
Upvotes: 3
Reputation: 123
@missuse has answered the question, but I would like to add the option of weights in the logloss function:
# Cross-entropy error function
LogLoss <- function(pred, true, eps = 1e-15, weights = NULL) {
# Bound the results
pred = pmin(pmax(pred, eps), 1 - eps)
if (is.null(weights)) {
return(-(sum(
true * log(pred) + (1 - true) * log(1 - pred)
)) / length(true))
} else{
return(-weighted.mean(true * log(pred) + (1 - true) * log(1 - pred), weights))
}
}
# Caret train weighted logloss summary function
caret_logloss <- function(data, lev = NULL, model = NULL) {
cls <- levels(data$obs) #find class names
loss <- LogLoss(
pred = data[, cls[2]],
true = as.numeric(data$obs) - 1,
weights = data$weights
)
names(loss) <- c('MyLogLoss')
loss
}
Upvotes: 2