Reputation: 2049
Suppose a data set has several rows and columns, and some of the columns are all zeros (I mean every value in the column is 0). How can one filter out those columns? I have tried the following code, but to no avail.
training_data <- Filer(function(x) { !(all(x[, 1:99]==0))}, training_data)
UPDATE:
Sorry. In the data set, NOT all columns are numeric, so I need to specify a range from 1:99 for columns.
UPDATE V2:
Added a part of my data set (using `dput`):
structure(list(label = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), f1 = c(15, 24, 10, 9, 6, 9), f2 = c(6,
14, 5, 4, 2, 4), f3 = c(6, 7, 2, 2, 1, 2), f4 = c(0, 0, 0, 0,
0, 0), f5 = c(9, 15, 6, 5, 3, 5), f6 = c(3, 7, 2, 2, 1, 2), f7 = c(1,
0, 0, 0, 0, 0), f8 = c(4, 11, 5, 4, 2, 4), f9 = c(5, 3, 0, 0,
0, 0), f10 = c(1, 3, 0, 0, 0, 0), f11 = c(1, 4, 2, 2, 1, 2),
f12 = c(0, 0, 0, 0, 0, 0), f13 = c(13, 15, 7, 6, 3, 6), f14 = c(0,
7, 1, 1, 1, 1), f15 = c(0, 0, 0, 0, 0, 0), f16 = c(20, 30,
11, 10, 6, 10), f17 = c(5, 0, 0, 0, 0, 0), f18 = c(0, 0,
0, 0, 0, 0), ft19 = c(28, 344, 399, 28, 82, 42), f20 = c(2.15,
15.64, 49.88, 4, 20.5, 6), f21 = c(0, 0, 0, 0, 0, 0), f22 = c(0,
0, 0, 0, 0, 0), f23 = c(6, 7, 2, 2, 1, 2), f24 = c(0, 0,
0, 0, 0, 0), f25 = c(19, 334, 395, 23, 79, 37), f26 = c(0,
26, 37, 6, 16, 7), f27 = c(11, 64, 101, 5, 17, 12), f28 = c(0,
0, 0, 0, 0, 0), f29 = c(2, 37, 101, 7, 26, 8), f30 = c(0,
18, 32, 2, 16, 4), f31 = c(0, 0, 0, 0, 0, 0), f32 = c(0,
0, 0, 0, 0, 0), f33 = c(3, 0, 1, 0, 1, 0), f34 = c(5, 44,
32, 4, 15, 5), f35 = c(0, 0, 0, 0, 0, 0), f36 = c(0, 0, 0,
0, 0, 0), f37 = c(0, 0, 0, 0, 0, 0), f38 = c(0, 0, 0, 0,
0, 0), f39 = c(6, 8, 10, 3, 2, 3), f40 = c(4, 6, 16, 4, 4,
3), f41 = c(18, 36, 37, 7, 5, 7), f42 = c(0, 18, 27, 0, 14,
1), f43 = c(0, 0, 0, 0, 0, 0), f44 = c(54, 743, 910, 65,
184, 100), f45 = c(14, 133, 91, 25, 18, 40), f46 = c(0, 0,
0, 0, 0, 0), f47 = c(4, 25, 17, 6, 6, 8), f48 = c(0, 0, 0,
0, 0, 0), f49 = c(0.46, 1, 1.5, 1.14, 1.5, 1.14), f50 = c(2.67,
1.86, 1.83, 1.88, 1.67, 1.88), f51 = c(3, 9, 1, 2, 1, 2),
f52 = c(0, 1, 2, 1, 1, 1), f53 = c(10, 12, 5, 4, 2, 4), f54 = c(0,
0, 0, 0, 0, 0), ft55 = c(3, 10, 3, 3, 2, 3), f56 = c(0.54,
0.07, 0.03, 0.32, 0.07, 0.21), f57 = c(0.21, 0.04, 0.01,
0.14, 0.02, 0.1), f58 = c(0.21, 0.02, 0.01, 0.07, 0.01, 0.05
), f59 = c(0, 0, 0, 0, 0, 0), f60 = c(0.32, 0.04, 0.02, 0.18,
0.04, 0.12), f61 = c(0.11, 0.02, 0.01, 0.07, 0.01, 0.05),
f62 = c(0.04, 0, 0, 0, 0, 0), f63 = c(0.14, 0.03, 0.01, 0.14,
0.02, 0.1), f64 = c(0.18, 0.01, 0, 0, 0, 0), f65 = c(0.04,
0.01, 0, 0, 0, 0), f66 = c(0.04, 0.01, 0.01, 0.07, 0.01,
0.05), f67 = c(0, 0, 0, 0, 0, 0), f68 = c(0.46, 0.04, 0.02,
0.21, 0.04, 0.14), f69 = c(0, 0.02, 0, 0.04, 0.01, 0.02),
f70 = c(0, 0, 0, 0, 0, 0), f71 = c(0.71, 0.09, 0.03, 0.36,
0.07, 0.24), f72 = c(0.18, 0, 0, 0, 0, 0), f73 = c(0, 0,
0, 0, 0, 0), f74 = c(1, 1, 1, 1, 1, 1), f75 = c(0.08, 0.05,
0.12, 0.14, 0.25, 0.14), f76 = c(0, 0, 0, 0, 0, 0), f77 = c(0,
0, 0, 0, 0, 0), f78 = c(0.21, 0.02, 0.01, 0.07, 0.01, 0.05
), f79 = c(0, 0, 0, 0, 0, 0), f80 = c(0.68, 0.97, 0.99, 0.82,
0.96, 0.88), f81 = c(0, 0.08, 0.09, 0.21, 0.2, 0.17), f82 = c(0.39,
0.19, 0.25, 0.18, 0.21, 0.29), f83 = c(0, 0, 0, 0, 0, 0),
f84 = c(0.07, 0.11, 0.25, 0.25, 0.32, 0.19), f85 = c(0, 0.05,
0.08, 0.07, 0.2, 0.1), f86 = c(0, 0, 0, 0, 0, 0), f87 = c(0,
0, 0, 0, 0, 0), f88 = c(0.11, 0, 0, 0, 0.01, 0), f89 = c(0.18,
0.13, 0.08, 0.14, 0.18, 0.12), f90 = c(0, 0, 0, 0, 0, 0),
f91 = c(0, 0, 0, 0, 0, 0), f92 = c(0, 0, 0, 0, 0, 0), f93 = c(0,
0, 0, 0, 0, 0), f94 = c(0.21, 0.02, 0.03, 0.11, 0.02, 0.07
), f95 = c(0.14, 0.02, 0.04, 0.14, 0.05, 0.07), f96 = c(0.64,
0.1, 0.09, 0.25, 0.06, 0.17), f97 = c(0, 0.05, 0.07, 0, 0.17,
0.02), f98 = c(0, 0, 0, 0, 0, 0), f99 = c(1.93, 2.16, 2.28,
2.32, 2.24, 2.38), f100 = c(0.5, 0.39, 0.23, 0.89, 0.22,
0.95), f101 = c(0, 0, 0, 0, 0, 0), f102 = c(0.14, 0.07, 0.04,
0.21, 0.07, 0.19), f103 = c(0, 0, 0, 0, 0, 0), f104 = c(0.02,
0, 0, 0.04, 0.02, 0.03), f105 = c(0.1, 0.01, 0, 0.07, 0.02,
0.04), f106 = c(0.11, 0.03, 0, 0.07, 0.01, 0.05), f107 = c(0,
0, 0.01, 0.04, 0.01, 0.02), f108 = c(0.36, 0.03, 0.01, 0.14,
0.02, 0.1), f109 = c(0, 0, 0, 0, 0, 0), f110 = c(0.11, 0.03,
0.01, 0.11, 0.02, 0.07)), .Names = c("label", "f1", "f2",
"f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12",
"f13", "f14", "f15", "f16", "f17", "f18", "ft19", "f20", "f21",
"f22", "f23", "f24", "f25", "f26", "f27", "f28", "f29", "f30",
"f31", "f32", "f33", "f34", "f35", "f36", "f37", "f38", "f39",
"f40", "f41", "f42", "f43", "f44", "f45", "f46", "f47", "f48",
"f49", "f50", "f51", "f52", "f53", "f54", "ft55", "f56", "f57",
"f58", "f59", "f60", "f61", "f62", "f63", "f64", "f65", "f66",
"f67", "f68", "f69", "f70", "f71", "f72", "f73", "f74", "f75",
"f76", "f77", "f78", "f79", "f80", "f81", "f82", "f83", "f84",
"f85", "f86", "f87", "f88", "f89", "f90", "f91", "f92", "f93",
"f94", "f95", "f96", "f97", "f98", "f99", "f100", "f101", "f102",
"f103", "f104", "f105", "f106", "f107", "f108", "f109", "f110"
), class = "data.frame", row.names = c(NA, -6L))
Upvotes: 1
Views: 3769
Reputation: 7568
I think that in the solutions using `all(x == 0)` it is slightly more efficient to use `any(x != 0)`, because `any` stops after the first instance of an element being `!= 0`, which will be important with a growing number of rows.
To provide a different solution using `plyr` and `colwise` (`dat` being the `dput` data):
library(plyr)
f0 <- function(x) any(x!=0) & is.numeric(x)
colwise(identity, f0)(dat)
The idea is to go through every column in `dat` and return it (`identity`), but only if `f0` returns `TRUE`, i.e. the column has at least one entry `!= 0` and the column `is.numeric`.
EDIT:
To do this for every data.frame in your list, eg. training_data <- list(dat, dat, dat, dat)
training_data_clean <- lapply(training_data, function(z) colwise(identity, f0)(z))
sapply(training_data, dim)
[,1] [,2] [,3] [,4]
[1,] 6 6 6 6
[2,] 111 111 111 111
sapply(training_data_clean, dim)
[,1] [,2] [,3] [,4]
[1,] 6 6 6 6
[2,] 74 74 74 74
EDIT2: To retain the label column:
lapply(training_data, function(z) cbind(label = z$label, colwise(identity, f0)(z)))
Upvotes: 2
Reputation: 118779
Just another way, using `lapply`, as it is a `data.frame`. `apply` internally converts a `data.frame` to a `matrix`, I believe.
df[!unlist(lapply(df, function(x) all(x==0)))]
Or in your case:
df[, 1:99][!unlist(lapply(df[, 1:99], function(x) all(x==0)))]
Edit: Another way, using `colSums`. The trick is to use it after checking for `0`.
df[!colSums(df == 0) == nrow(df)]
If you know which columns are numeric (say, 1:99), then replace `df` with:
df[,1:99][!colSums(df[,1:99] == 0) == nrow(df)]
Upvotes: 2
Reputation: 81683
training_data[, !colSums(training_data == 0)]
Based on question update: (filter applied to columns 1 - 99)
idx <- which(as.logical(colSums(training_data[, 1:99] == 0))) # find columns
training_data[, setdiff(seq_along(test_data), idx)] # exclude columns
Upvotes: 3
Reputation: 121568
You can use colSums
dat <- diag(10)
dat[1,1] <- 0
dat[5,5] <- 0
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0 0 0 0 0 0 0 0 0 0
[2,] 0 1 0 0 0 0 0 0 0 0
[3,] 0 0 1 0 0 0 0 0 0 0
[4,] 0 0 0 1 0 0 0 0 0 0
[5,] 0 0 0 0 0 0 0 0 0 0
[6,] 0 0 0 0 0 1 0 0 0 0
[7,] 0 0 0 0 0 0 1 0 0 0
[8,] 0 0 0 0 0 0 0 1 0 0
[9,] 0 0 0 0 0 0 0 0 1 0
[10,] 0 0 0 0 0 0 0 0 0 1
colSums(dat) == 0
TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
So, to remove the columns that are all 0, you just do this:
dat[ ,colSums(dat)!=0]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0 0 0 0 0 0 0 0
[2,] 1 0 0 0 0 0 0 0
[3,] 0 1 0 0 0 0 0 0
[4,] 0 0 1 0 0 0 0 0
[5,] 0 0 0 0 0 0 0 0
[6,] 0 0 0 1 0 0 0 0
[7,] 0 0 0 0 1 0 0 0
[8,] 0 0 0 0 0 1 0 0
[9,] 0 0 0 0 0 0 1 0
[10,] 0 0 0 0 0 0 0 1
EDIT
This assumes that all data have the same sign. To avoid this assumption, use:
dat[ ,colSums(abs(dat[,1:99]))!=0]
Upvotes: 2
Reputation: 2144
apply(df, 2, Filter, f = function(x){!all(x==0)})
I had the same question.
Upvotes: 1
Reputation: 338
training_data[,apply(training_data, MARGIN = 2, FUN = function(x) !all(x == 0))]
Upvotes: 2