Eliminate highly correlated columns but conserving my non-numerical columns

Question

I have a data frame with categorical and numerical variables. I want to get rid of the highly correlated variables.

So, first I got rid of all my categorical columns, made the matrix and figured out which numerical columns wort eliminating out. Now I want to go back and also have my categorical-variable columns.

And I'm confused about the simplest way to do it... Maybe a join selecting which columns I want to add from the second table...

library(caret)
training_2 <- subset(training, select = -c(user_name ,timestamp, etc))
corr_matrix <- cor(training_2)
highCorr <- findCorrelation(corr_matrix, .75)
training_2<- training_2[,-highCorr]

EDIT: DATA

structure(list(X = 1:15, yaw_belt = c(-94.4, -94.4, -94.4, -94.4, 
-94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, -94.4, 
-94.4, -94.4), gyros_belt_x = c(0, 0.02, 0, 0.02, 0.02, 0.02, 
0.02, 0.02, 0.02, 0.03, 0.03, 0.02, 0.02, 0.02, 0), gyros_belt_y = c(0, 
0, 0, 0, 0.02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), gyros_belt_z = c(-0.02, 
-0.02, -0.02, -0.03, -0.02, -0.02, -0.02, -0.02, -0.02, 0, -0.02, 
-0.02, 0, -0.02, 0), magnet_belt_x = c(-3L, -7L, -2L, -6L, -6L, 
0L, -4L, -2L, 1L, -3L, -5L, -2L, -3L, -8L, -1L), magnet_belt_y = c(599L, 
608L, 600L, 604L, 600L, 603L, 599L, 603L, 602L, 609L, 596L, 602L, 
606L, 598L, 597L), roll_arm = c(-128, -128, -128, -128, -128, 
-128, -128, -128, -128, -128, -128, -128, -128, -128, -129), 
    pitch_arm = c(22.5, 22.5, 22.5, 22.1, 22.1, 22, 21.9, 21.8, 
    21.7, 21.6, 21.5, 21.5, 21.4, 21.4, 21.4), yaw_arm = c(-161, 
    -161, -161, -161, -161, -161, -161, -161, -161, -161, -161, 
    -161, -161, -161, -161), total_accel_arm = c(34L, 34L, 34L, 
    34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L, 34L
    ), gyros_arm_y = c(0, -0.02, -0.02, -0.03, -0.03, -0.03, 
    -0.03, -0.02, -0.03, -0.03, -0.03, -0.03, -0.02, 0, 0), gyros_arm_z = c(-0.02, 
    -0.02, -0.02, 0.02, 0, 0, 0, 0, -0.02, -0.02, 0, 0, -0.02, 
    -0.03, -0.03), magnet_arm_x = c(-368L, -369L, -368L, -372L, 
    -374L, -369L, -373L, -372L, -369L, -376L, -366L, -363L, -372L, 
    -371L, -374L), magnet_arm_z = c(516L, 513L, 513L, 512L, 506L, 
    513L, 509L, 510L, 518L, 516L, 509L, 520L, 509L, 523L, 510L
    ), roll_dumbbell = c(13.05217456, 13.13073959, 12.85074981, 
    13.43119971, 13.37871611, 13.38245941, 13.12694911, 12.75083041, 
    13.15463353, 13.33033728, 13.13073959, 13.10320545, 13.38245941, 
    13.41047767, 13.07948887), pitch_dumbbell = c(-70.49400371, 
    -70.63750507, -70.27811982, -70.39379464, -70.42855971, -70.81758832, 
    -70.24756905, -70.34768359, -70.42520377, -70.85058796, -70.63750507, 
    -70.45974712, -70.81758832, -70.99594236, -70.67116245), 
    yaw_dumbbell = c(-84.87393888, -84.71064711, -85.14078134, 
    -84.87362553, -84.85305745, -84.46500278, -85.09961258, -85.09708174, 
    -84.91563379, -84.44601983, -84.71064711, -84.89472246, -84.46500278, 
    -84.28004856, -84.69053461), total_accel_dumbbell = c(37L, 
    37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 
    37L, 37L), gyros_dumbbell_y = c(-0.02, -0.02, -0.02, -0.02, 
    -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0.02, 
    -0.02, -0.02), magnet_dumbbell_z = c(-65, -64, -63, -60, 
    -68, -66, -70, -74, -65, -69, -64, -65, -69, -68, -63), roll_forearm = c(28.4, 
    28.3, 28.3, 28.1, 28, 27.9, 27.9, 27.8, 27.7, 27.7, 27.6, 
    27.5, 27.2, 27.2, 27.2), pitch_forearm = c(-63.9, -63.9, 
    -63.9, -63.9, -63.9, -63.9, -63.9, -63.8, -63.8, -63.8, -63.8, 
    -63.8, -63.9, -63.9, -63.9), yaw_forearm = c(-153, -153, 
    -152, -152, -152, -152, -152, -152, -152, -152, -152, -152, 
    -151, -151, -151), total_accel_forearm = c(36L, 36L, 36L, 
    36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L
    ), gyros_forearm_x = c(0.03, 0.02, 0.03, 0.02, 0.02, 0.02, 
    0.02, 0.02, 0.03, 0.02, 0.02, 0.02, 0, 0, 0), gyros_forearm_z = c(-0.02, 
    -0.02, 0, 0, -0.02, -0.03, -0.02, 0, -0.02, -0.02, -0.02, 
    -0.03, -0.03, -0.03, -0.02), accel_forearm_x = c(192L, 192L, 
    196L, 189L, 189L, 193L, 195L, 193L, 193L, 190L, 193L, 191L, 
    193L, 193L, 192L), accel_forearm_z = c(-215L, -216L, -213L, 
    -214L, -214L, -215L, -215L, -213L, -214L, -215L, -214L, -215L, 
    -215L, -214L, -214L), magnet_forearm_x = c(-17L, -18L, -18L, 
    -16L, -17L, -9L, -18L, -9L, -16L, -22L, -17L, -11L, -15L, 
    -14L, -16L), magnet_forearm_y = c(654, 661, 658, 658, 655, 
    660, 659, 660, 653, 656, 657, 657, 655, 659, 656), magnet_forearm_z = c(476, 
    473, 469, 469, 473, 478, 470, 474, 476, 473, 465, 478, 472, 
    478, 472)), .Names = c("X", "yaw_belt", "gyros_belt_x", "gyros_belt_y", 
"gyros_belt_z", "magnet_belt_x", "magnet_belt_y", "roll_arm", 
"pitch_arm", "yaw_arm", "total_accel_arm", "gyros_arm_y", "gyros_arm_z", 
"magnet_arm_x", "magnet_arm_z", "roll_dumbbell", "pitch_dumbbell", 
"yaw_dumbbell", "total_accel_dumbbell", "gyros_dumbbell_y", "magnet_dumbbell_z", 
"roll_forearm", "pitch_forearm", "yaw_forearm", "total_accel_forearm", 
"gyros_forearm_x", "gyros_forearm_z", "accel_forearm_x", "accel_forearm_z", 
"magnet_forearm_x", "magnet_forearm_y", "magnet_forearm_z"), row.names = c(NA, 
15L), class = "data.frame")

Thanks

LyzandeR · Accepted Answer

I wasn't able to use your data unfortunately @mclzc because one of the columns only has one value and it returns an error when I try to calculate the correlations, but I ll provide a sample data set.

library(caret)
#sample data, 4 numeric fields and 2 categorical
df <- data.frame(a=runif(10), b=runif(10), c=1:10, d=1:10, e=letters[1:10], f=letters[1:10])

#categorical columns
cat_cols <- c('e', 'f')
#remove categorical
df2 <- df[!names(df) %in% cat_cols]

#run correlations
cor_mat <- cor(df2)

#the index of the columns to be removed because they have a high correlation
index <- findCorrelation(cor_mat, .75)

#the name of the columns chosen above
to_be_removed <- colnames(cor_mat)[index]

#now go back to df and use to_be_removed to subset the original df
df[!names(df) %in% to_be_removed]

Output (only column c is removed as instructed by findCorrelations, whereas categorical columns e and f are still preserved):

           a          b  d e f
1  0.8495639 0.52388246  1 a a
2  0.9478409 0.11466655  2 b b
3  0.9086955 0.12097760  3 c c
4  0.7724303 0.03160203  4 d d
5  0.1269406 0.10740182  5 e e
6  0.7454943 0.95892265  6 f f
7  0.2651961 0.15223472  7 g g
8  0.5714763 0.49808509  8 h h
9  0.1226760 0.06088693  9 i i
10 0.5526266 0.14715471 10 j j

Notice above that the findCorrelation function returns the index of the column to be removed but not the name. You need to have the name so that you can exclude that from the original data set (you cannot use the index as the number of columns changed after excluding the categorical columns). So, in my code above, I used the index to find the column name and then used this to remove the unneeded column (preserving the categorical columns)

Eliminate highly correlated columns but conserving my non-numerical columns

Answers (1)

Related Questions