Mariana da Costa
Mariana da Costa

Reputation: 173

Decision Tree in R is not splitting

I am trying to split my data into categories to understand which groups have a higher probability of being "Default". Therefore, I want to use a decision tree.

My data has 809054 observations and 8 variables. If I use just a small sample of my data (for example 1%), the code runs and I get my splits; the problem appears when I use, for example, 70% of the total observations:

Example of my data:

# Sample of the data in dput() form: an 80-row tibble (class "tbl_df") with
# eight columns:
#   Gender        factor  ("Man", "Woman")
#   Card          factor  ("Credit", "Debit")
#   Age           numeric
#   Leasing       factor  ("N", "Y")
#   District      factor  ("Zona_01" ... "Zona_11")
#   product_type  factor of string codes ("010", "020", ...)
#   Client_time   numeric
#   Default       factor  ("N", "Y") -- the response being modelled
# NOTE(review): Age contains the value 9999 in several rows -- presumably a
# sentinel for "unknown"; confirm. As a plain numeric it would be treated as a
# real age by any model fitted on this column.
# NOTE(review): this sample is assigned to `train`, while the script further
# down reads from `mydata`; to run the script on this sample, assign it to
# `mydata` first.
train <- structure(list(Gender = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("Man", "Woman"
), class = "factor"), Card = structure(c(1L, 1L, 2L, 1L, 1L, 
1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L), .Label = c("Credit", 
"Debit"), class = "factor"), Age = c(72, 62, 9999, 9999, 66, 
51, 44, 76, 47, 59, 40, 48, 40, 75, 57, 9999, 39, 49, 50, 65, 
67, 84, 58, 50, 50, 43, 45, 55, 64, 9999, 48, 73, 29, 9999, 29, 
63, 29, 9999, 49, 66, 48, 59, 57, 60, 50, 54, 9999, 57, 62, 59, 
62, 42, 50, 62, 9999, 48, 42, 52, 35, 80, 73, 46, 54, 76, 37, 
68, 46, 39, 37, 64, 43, 55, 9999, 33, 59, 66, 9999, 59, 45, 53
), Leasing = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), 
    District = structure(c(3L, 8L, 1L, 8L, 10L, 7L, 9L, 11L, 
    4L, 11L, 7L, 9L, 10L, 8L, 11L, 3L, 6L, 3L, 6L, 3L, 8L, 7L, 
    11L, 8L, 8L, 8L, 9L, 5L, 8L, 10L, 5L, 8L, 5L, 9L, 5L, 6L, 
    6L, 4L, 9L, 5L, 8L, 5L, 7L, 10L, 2L, 5L, 8L, 1L, 10L, 2L, 
    10L, 8L, 7L, 4L, 1L, 1L, 8L, 8L, 3L, 5L, 10L, 3L, 5L, 8L, 
    3L, 5L, 3L, 4L, 5L, 8L, 1L, 7L, 11L, 3L, 10L, 7L, 4L, 10L, 
    2L, 10L), .Label = c("Zona_01", "Zona_02", "Zona_03", "Zona_04", 
    "Zona_05", "Zona_06", "Zona_07", "Zona_08", "Zona_09", "Zona_10", 
    "Zona_11"), class = "factor"), product_type = structure(c(4L, 
    2L, 2L, 1L, 1L, 4L, 3L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 4L, 18L, 
    2L, 1L, 4L, 2L, 4L, 1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 5L, 1L, 
    1L, 4L, 1L, 2L, 2L, 3L, 2L, 1L, 1L, 22L, 1L, 2L, 2L, 1L, 
    1L, 49L, 1L, 2L, 2L, 4L, 2L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 
    4L, 4L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    2L, 53L, 5L, 1L, 1L, 1L), .Label = c("010", "020", "040", 
    "050", "060", "061", "062", "070", "071", "072", "080", "081", 
    "082", "090", "091", "092", "093", "100", "101", "102", "110", 
    "130", "140", "150", "160", "170", "171", "172", "250", "260", 
    "265", "270", "271", "280", "285", "290", "291", "300", "301", 
    "302", "303", "304", "305", "306", "307", "308", "309", "310", 
    "330", "331", "351", "354", "520", "521"), class = "factor"), 
    Client_time = c(10, 17, 7, 8, 23, 21, 4, 24, 2, 20, 19, 21, 
    22, 15, 18, 18, 19, 22, 8, 14, 33, 24, 23, 18, 21, 8, 23, 
    21, 29, 5, 23, 10, 27, 3, 22, 16, 7, 3, 13, 10, 7, 12, 20, 
    17, 14, 17, 19, 26, 18, 11, 21, 6, 12, 6, 22, 17, 19, 10, 
    11, 19, 17, 18, 6, 19, 16, 24, 29, 15, 12, 19, 15, 18, 24, 
    17, 14, 37, 15, 2, 16, 22), Default = structure(c(1L, 1L, 
    1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("N", "Y"), class = "factor")), row.names = c(NA, 
-80L), class = c("tbl_df", "tbl", "data.frame"))

R code:

# Fit a classification tree for `Default` on a 70/30 train/validation split
# of `mydata`. Creates `train`, `val` and `mtree` in the calling environment.

# rpart() lives in the rpart package, which must be attached before use.
library(rpart)

# Make the dependent variable and the categorical predictors factors, so
# that rpart fits a classification tree and treats them as categories.
factor_cols <- c(
  "Default", "Gender", "Card", "Leasing", "District", "product_type"
)
mydata[factor_cols] <- lapply(mydata[factor_cols], as.factor)

# Check attributes of data
str(mydata)

# Split data into training (70%) and validation (30%).
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
# sample() silently truncates a fractional size; floor() makes that explicit.
n_train <- floor(nrow(mydata) * 0.7)
dt <- sort(sample.int(nrow(mydata), n_train))
train <- mydata[dt, ]
val <- mydata[-dt, ]

# Check number of rows in training data set
nrow(train)

# Compute decision tree.
# `split` takes a single criterion ("gini" or "information"), not a vector
# of both. "information" is kept because it was listed first in the original
# call -- presumably the one in effect; confirm.
# cp = 0 with minsplit = 1 and minbucket = 1 places almost no limit on
# growth, so expect a very large tree.
mtree <- rpart(
  Default ~ .,
  data = train,
  parms = list(split = "information"),
  cp = 0, minsplit = 1, minbucket = 1
)

Due to these parameters, this results in a huge tree with many splits. So I included maxdepth in the code:

# Refit the tree with its depth capped at 5.
# `split` takes a single criterion ("gini" or "information"), not a vector
# of both. "information" is kept because it was listed first in the original
# call -- presumably the one in effect; confirm.
# NOTE(review): cp = 0 is kept so that this call still matches the root-only
# output shown below. A likely cause of that output: with only ~6.8% "Y",
# every node reachable within depth 5 may still predict "N", so no split
# lowers the misclassification error and rpart collapses the tree back to
# the root. Trying cp = -1 (keep every split), or class priors / a loss
# matrix in `parms`, should confirm this.
mtree <- rpart(
  Default ~ .,
  data = train,
  parms = list(split = "information"),
  cp = 0, minsplit = 1, minbucket = 1, maxdepth = 5
)

The problem is when I do that there is no splitting:

> mtree
n= 566337 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 566337 38293 N (0.93238478 0.06761522) *

I would really appreciate some help, please!

Thanks!

Upvotes: 0

Views: 453

Answers (0)

Related Questions