Reputation: 61
I have the following data.frame
:
dage ded dht dwt marital inc smoke time number
31 5 65 110 1 1 0 0 0
38 5 70 148 1 4 0 0 0
32 1 99 999 1 2 1 1 1
28 4 99 999 1 98 3 4 2
35 4 99 999 1 7 0 0 0
33 4 98 998 1 99 0 0 0
I want to remove any row that has the number 99
or 999
(or both).
data.frame structure:
df <- structure(list(dage = c(31L, 38L, 32L, 28L, 35L, 33L), ded = c(5L,
5L, 1L, 4L, 4L, 4L), dht = c(65L, 70L, 99L, 99L, 99L, 98L), dwt = c(110L,
148L, 999L, 999L, 999L, 998L), marital = c(1L, 1L, 1L, 1L, 1L,
1L), inc = c(1L, 4L, 2L, 98L, 7L, 99L), smoke = c(0L, 0L, 1L,
3L, 0L, 0L), time = c(0L, 0L, 1L, 4L, 0L, 0L), number = c(0L,
0L, 1L, 2L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")
Upvotes: 1
Views: 88
Reputation: 1986
Create data.frame as shown in original question:
df <- structure(list(dage = c(31L, 38L, 32L, 28L, 35L, 33L), ded = c(5L,
5L, 1L, 4L, 4L, 4L), dht = c(65L, 70L, 99L, 99L, 99L, 98L), dwt = c(110L,
148L, 999L, 999L, 999L, 998L), marital = c(1L, 1L, 1L, 1L, 1L,
1L), inc = c(1L, 4L, 2L, 98L, 7L, 99L), smoke = c(0L, 0L, 1L,
3L, 0L, 0L), time = c(0L, 0L, 1L, 4L, 0L, 0L), number = c(0L,
0L, 1L, 2L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")
data.table
solution:
library(data.table)
dt <- as.data.table(df)
dt[rowSums(df == 99)==0 & rowSums(df == 999)==0]
base
R solution:
df[!apply(df, 1, function(x) any(x %in% c(99,999))),]
dplyr
solution:
require(dplyr)
filter_all(df, all_vars(.!=99 & .!=999))
Benchmarks:
microbenchmark::microbenchmark(dt = dt[rowSums(df == 99)==0 & rowSums(df == 999)==0],
base = df[!apply(df, 1, function(x) any(x %in% c(99,999))),],
dplyr = filter_all(df, all_vars(.!=99 & .!=999)), times = 10000)
# Unit: microseconds
#expr min lq mean median uq max neval
#dt 588.000 645.801 701.4309 675.6005 723.2515 5203.801 10000
#base 264.601 296.901 324.2588 314.4005 335.7020 3435.600 10000
#dplyr 3671.400 3854.301 4036.3976 3915.3010 3983.0010 139226.802 10000
Upvotes: 0
Reputation: 39154
You can replace 99
and 999
with NA
first.
dat[dat == 99 | dat == 999] <- NA
And then use na.omit
or complete.cases
.
na.omit(dat)
# dage ded dht dwt marital inc smoke time number
# 1 31 5 65 110 1 1 0 0 0
# 2 38 5 70 148 1 4 0 0 0
dat[complete.cases(dat), ]
# dage ded dht dwt marital inc smoke time number
# 1 31 5 65 110 1 1 0 0 0
# 2 38 5 70 148 1 4 0 0 0
DATA
dat <- read.table(text = "dage ded dht dwt marital inc smoke time number
31 5 65 110 1 1 0 0 0
38 5 70 148 1 4 0 0 0
32 1 99 999 1 2 1 1 1
28 4 99 999 1 98 3 4 2
35 4 99 999 1 7 0 0 0
33 4 98 998 1 99 0 0 0",
header = TRUE)
Upvotes: 1
Reputation: 323226
Using rowSums
df[rowSums(df[,c('dht','dwt')]==99|df[,c('dht','dwt')]==999)==0,]
ded dht dwt
1 5 65 110
2 5 70 148
6 4 98 998
Upvotes: 0
Reputation: 69171
Here's a solution using any()
and apply()
that doesn't require any supplemental packages:
#fake data
d <- data.frame(a = c(1,2,3,4,99), b = c(99, 1,2,999,4))
#subset rows that don't contain a 99 or 999
d[!apply(d, 1, function(x) any(x %in% c(99,999))),]
Yields:
a b
2 2 1
3 3 2
Upvotes: 0
Reputation: 7592
If your dataframe is called df1
:
require(dplyr)
filter_all(df1, all_vars(.!=99 & .!=999))
Result:
dage ded dht dwt marital inc smoke time number
1 31 5 65 110 1 1 0 0 0
2 38 5 70 148 1 4 0 0 0
Upvotes: 0