Reputation: 1
I have to remove the variables with zero variance from this dataset, which has 530 variables. I used the `nearZeroVar` function from the caret package, but it eliminates almost all of the variables and leaves me with only ten.
str(Dtrain)
'data.frame': 19937 obs. of 530 variables:
$ LONGITUDE : num -7541 -7537 -7519 -7525 -7632 ...
$ LATITUDE : num 4864921 4864934 4864950 4864934 4864982 ...
$ FLOOR : Factor w/ 5 levels "0","1","2","3",..: 3 3 3 3 1 3 3 3 3 3 ...
$ BUILDINGID : Factor w/ 3 levels "0","1","2": 2 2 2 2 1 2 2 2 2 2 ...
$ SPACEID : Factor w/ 123 levels "1","2","3","4",..: 31 31 28 27 47 30 28 26 ...
$ RELATIVEPOSITION: Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 1 ...
$ USERID : int 2 2 2 2 11 2 2 2 2 2 ...
$ PHONEID : int 23 23 23 23 13 23 23 23 23 23 ...
$ TIMESTAMP : int 1371713733 1371713691 1371714095 1371713807 1369909710 ...
$ IDidentifier : Factor w/ 905 levels "0_0_102_2","0_0_106_2",..: 400 400 394 392 16 ...
$ WAP001 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP002 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP003 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP004 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP005 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP006 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP007 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP008 : int 100 100 -97 100 100 100 100 100 100 100 ...
$ WAP009 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP010 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP011 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP012 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP013 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP014 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP015 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP016 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP017 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP018 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP019 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP020 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP021 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP022 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP023 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP024 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP025 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP026 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP027 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP028 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP029 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP030 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP031 : int 100 100 100 100 100 100 100 100 100 100 ...
$ WAP032 : int 100 100 100 100 100 100 100 100 100 100 ...
...
$ WAP520 : int 100 100 100 100 100 100 100 100 100 100 ...
I want to eliminate all WAP variables with zero variance.
Upvotes: 0
Views: 252
Reputation: 160607
Try this:
# Flag the numeric WAP columns, then keep the flag only for those whose
# variance is (numerically) zero; every other column is left untouched.
is_wap <- grepl("WAP", names(Dtrain), fixed = TRUE) &
  vapply(Dtrain, is.numeric, logical(1))
wap_var <- vapply(Dtrain[is_wap], var, numeric(1), na.rm = TRUE)
iszv <- is_wap
# var() is NA when a column has fewer than two non-missing values. Such a
# column carries no variation either, and an NA in the mask would make the
# column subset below fail, so it is treated as zero-variance.
iszv[is_wap] <- is.na(wap_var) | wap_var < 1e-9
# drop = FALSE keeps a data frame even if a single column survives.
Dtrain[, !iszv, drop = FALSE]
# LONGITUDE LATITUDE FLOOR BUILDINGID SPACEID RELATIVEPOSITION USERID PHONEID TIMESTAMP IDidentifier WAP008
# 1 -7541 4864921 2 1 L0031 L02 2 23 1371713733 L0400 100
# 2 -7537 4864934 2 1 L0031 L02 2 23 1371713691 L0400 100
# 3 -7519 4864950 2 1 L0028 L02 2 23 1371714095 L0394 -97
# 4 -7525 4864934 2 1 L0027 L02 2 23 1371713807 L0392 100
# 5 -7632 4864982 0 0 L0047 L02 11 13 1369909710 L0016 100
# 6 NA NA 2 1 L0030 L02 2 23 NA <NA> 100
# 7 NA NA 2 1 L0028 L02 2 23 NA <NA> 100
# 8 NA NA 2 1 L0026 L02 2 23 NA <NA> 100
# 9 NA NA 2 1 <NA> L02 2 23 NA <NA> 100
# 10 NA NA 2 1 <NA> 1 2 23 NA <NA> 100
Though admittedly, on this sample data, this gives effectively the same result as the caret equivalent:
# With saveMetrics = TRUE, nearZeroVar() returns one row of metrics per column.
# Filtering on `zeroVar` drops only truly constant columns; the default index
# output also includes *near*-zero-variance columns, and negating an empty
# index would select no columns at all.
nzv_metrics <- caret::nearZeroVar(Dtrain, saveMetrics = TRUE)
Dtrain[, !nzv_metrics$zeroVar, drop = FALSE]
Sample data:
# Ten-row sample data: ten metadata columns followed by WAP001..WAP032.
# Every WAP column is a constant 100 except WAP008, whose third value is -97.
# Rows 6-10 have NA for LONGITUDE, LATITUDE, TIMESTAMP and IDidentifier.
Dtrain <- local({
  n_obs <- 10L
  run_4_1_5 <- c(4L, 1L, 5L)  # row 5 is the odd one out in several columns

  meta <- list(
    LONGITUDE = c(-7541, -7537, -7519, -7525, -7632, rep(NA_real_, 5L)),
    LATITUDE = c(
      4864921, 4864934, 4864950, 4864934, 4864982, rep(NA_real_, 5L)
    ),
    FLOOR = factor(
      rep(c("2", "0", "2"), times = run_4_1_5),
      levels = c("0", "2")
    ),
    BUILDINGID = factor(
      rep(c("1", "0", "1"), times = run_4_1_5),
      levels = c("0", "1")
    ),
    SPACEID = factor(
      c(
        "L0031", "L0031", "L0028", "L0027", "L0047",
        "L0030", "L0028", "L0026", NA, NA
      ),
      levels = c("L0026", "L0027", "L0028", "L0030", "L0031", "L0047")
    ),
    RELATIVEPOSITION = factor(
      rep(c("L02", "1"), times = c(9L, 1L)),
      levels = c("1", "L02")
    ),
    USERID = rep(c(2L, 11L, 2L), times = run_4_1_5),
    PHONEID = rep(c(23L, 13L, 23L), times = run_4_1_5),
    TIMESTAMP = c(
      1371713733L, 1371713691L, 1371714095L, 1371713807L, 1369909710L,
      rep(NA_integer_, 5L)
    ),
    IDidentifier = factor(
      c("L0400", "L0400", "L0394", "L0392", "L0016", rep(NA_character_, 5L)),
      levels = c("L0016", "L0392", "L0394", "L0400")
    )
  )

  # 32 identical integer columns, then the single deviating reading.
  waps <- rep(list(rep(100L, n_obs)), 32L)
  names(waps) <- sprintf("WAP%03d", 1:32)
  waps[["WAP008"]][3L] <- -97L

  structure(c(meta, waps), class = "data.frame", row.names = c(NA, -n_obs))
})
Upvotes: 2