Reputation: 7107
I am running into errors with xgboost and a for loop; the error I am obtaining is the following:
Error in xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval) :
[23:48:27] amalgamation/../src/metric/rank_metric.cc:135: Check failed: !auc_error AUC: the dataset only contains pos or neg samples
Somebody else asked a similar question here.
The creator of the package suggested the following:
This means some of your training data or evaluation data contains all 1 or all 0 as label
My problem is indeed a binary classification problem, with labels 0 and 1.
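For reference, the error seems easy to reproduce with a label vector that contains only one class (a minimal sketch, not my actual data):
library(xgboost)
# every label is 1, so train-auc cannot be computed
dtrain <- xgb.DMatrix(data = matrix(rnorm(20), nrow = 10), label = rep(1, 10))
xgb.train(params = list(objective = "binary:logistic", eval_metric = "auc"),
          data = dtrain, nrounds = 1, watchlist = list(train = dtrain))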
My code is as follows:
all <- NULL
for (i in 1:length(splitxgb)) {
  xgbdata <- splitxgb[[i]]
  smp_size <- floor(0.75 * nrow(xgbdata))
  train_ind <- sample(seq_len(nrow(xgbdata)), size = smp_size)
  train <- xgbdata[train_ind, ]
  test <- xgbdata[-train_ind, ]
  ids <- sample(nrow(train))
  nfolds <- 5 # TODO: take this out of the for loop
  score <- data.table()
  result <- data.table()
  x_train <- train %>%
    select(-BvD.ID.number, -Major.sectors, -Region.in.country,
           -Major.sectors.id, -Region.in.country.id, -status)
  x_test <- test %>%
    select(-BvD.ID.number, -Major.sectors, -Region.in.country,
           -Major.sectors.id, -Region.in.country.id, -status)
  y_train <- train$status
  y_test <- test$status
  nrounds <- 12 # TODO: take out of the for loop
  early_stopping_round <- NULL # TODO: take out of the for loop
  dtrain <- xgb.DMatrix(data = as.matrix(x_train), label = y_train, missing = NaN)
  dtest <- xgb.DMatrix(data = as.matrix(x_test), missing = NaN)
  watchlist <- list(train = dtrain)
  params <- list("eta" = 0.01,
                 "max_depth" = 10, # TODO: take out of the for loop
                 "colsample_bytree" = 0.50,
                 "min_child_weight" = 0.75,
                 "subsample" = 0.5,
                 "objective" = "reg:logistic", # should this be "binary:logistic" instead?
                 "eval_metric" = "auc")
  model_xgb <- xgb.train(params = params,
                         data = dtrain,
                         maximize = TRUE,
                         nrounds = nrounds,
                         watchlist = watchlist,
                         early_stopping_rounds = early_stopping_round,
                         print_every_n = 1)
  pred <- predict(model_xgb, dtest)
  result <- cbind(test %>% select(BvD.ID.number),
                  status = round(pred, 0), pred)
  compare <- merge(x = result,
                   y = test[, c("BvD.ID.number", "status", "Region.in.country", "Major.sectors")],
                   by = "BvD.ID.number", all.x = TRUE)
  all[[i]] <- compare
}
And I run into the error above. However, when I take it all out of the for loop and run it individually, for example as the following:
i <- 165
xgbdata <- splitxgb[[i]]
# ... exactly the same body as inside the loop above ...
all[[i]] <- compare
And when I run this for each i separately, I obtain no errors. There is some information online, but nothing specific to the problem I run into. Why am I obtaining errors in the loop but not individually?
Upvotes: 0
Views: 722
Reputation: 3208
It looks like your splits sometimes partition the data so that either train or test contains only 1's or only 0's as labels.
Try printing (or writing to CSV) all your splits and checking whether that is the case.
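For example, a quick check along those lines (this re-draws the random split, so it shows the kind of split that can occur rather than reproducing your exact run; status is the 0/1 label column from your code):
for (i in seq_along(splitxgb)) {
  xgbdata <- splitxgb[[i]]
  smp_size <- floor(0.75 * nrow(xgbdata))
  train_ind <- sample(seq_len(nrow(xgbdata)), size = smp_size)
  # a split is problematic when train or test holds only one label value
  if (length(unique(xgbdata$status[train_ind])) < 2 ||
      length(unique(xgbdata$status[-train_ind])) < 2) {
    cat("split", i, "has only one label in train or test\n")
  }
}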
If so, you want to make sure there is at least one row of each label in every split (both in train and in test).
You can do that by repeating the split until the condition holds, or force it in any other way you choose in the code.
I would suggest re-sampling until the condition holds, as in the sketch below.
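A minimal sketch of that re-sampling, using xgbdata, smp_size, and the status label column from your code:
repeat {
  train_ind <- sample(seq_len(nrow(xgbdata)), size = smp_size)
  train <- xgbdata[train_ind, ]
  test <- xgbdata[-train_ind, ]
  # keep re-drawing until both train and test contain both label values
  if (length(unique(train$status)) == 2 &&
      length(unique(test$status)) == 2) break
}
Note that if a given splitxgb[[i]] contains only one label in total, no amount of re-sampling will fix it, so you may want to detect and skip those splits instead.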
Upvotes: 1