Reputation: 77
I have a following line of code which replaces a value in a variable (var2
) based on a value in other variable (var1
)
df$var2[df$var1 > 0] <- NA
However I would like to extend this and replace variables (e.g var5
) in the data frame based on values in multiple other variables (var1
, var2
, var3
, var4
) which are coded in specific columns 13:16.
I tried
df$var5[df[c(13:16)] > 0] <- NA
which would not work correctly and I would like to know why and how to optimally amend the code.
Upvotes: 0
Views: 141
Reputation: 101064
Here is a base R solution, slightly modification based on your code
df$var5[rowSums(df[13:16] > 0)>0] <- NA
Note that df[c(13:16)]>0
gives you a matrix, but you need just a vector to subset df$var5
, so rowSums()
can help you check if the whole row is fill with TRUEs.
Example
Given df
as
df <- structure(list(X1 = c(0L, 3L, 0L, 1L, 4L, 2L, NA, 1L, 2L, 2L,
0L, 4L, 4L, 1L, NA, NA, 1L, 0L, 4L, 4L), X2 = c(0L, 0L, NA, 4L,
4L, 1L, 1L, NA, 0L, 3L, 0L, 3L, 2L, NA, 1L, 1L, NA, 3L, 3L, 3L
), X3 = c(1L, 3L, 0L, NA, 0L, 3L, 0L, NA, 1L, 2L, 1L, NA, NA,
1L, 4L, 1L, NA, NA, NA, 0L), X4 = c(2L, 2L, NA, 3L, NA, 2L, 0L,
3L, 4L, 0L, 0L, NA, 3L, 4L, 4L, 3L, NA, 4L, 3L, 3L), X5 = c(0L,
4L, 4L, NA, 0L, 0L, 2L, NA, 1L, 1L, 2L, NA, 1L, 3L, 2L, 4L, 1L,
1L, 0L, 2L), X6 = c(2L, 1L, 1L, 4L, 1L, 4L, 3L, 4L, 3L, NA, 0L,
2L, 1L, 2L, 2L, 0L, 4L, NA, NA, NA), X7 = c(3L, 3L, 0L, 4L, 4L,
NA, 0L, 2L, NA, 2L, NA, 2L, 2L, 3L, 0L, 0L, 3L, 1L, NA, 0L),
X8 = c(1L, 2L, 3L, 0L, 2L, 4L, 2L, 3L, 1L, 0L, 3L, 0L, 3L,
1L, 4L, 1L, 1L, 1L, 2L, 0L), X9 = c(1L, 2L, 2L, 2L, NA, 2L,
4L, 2L, 0L, 1L, 3L, 1L, 1L, 3L, 4L, 0L, 4L, 4L, 4L, 3L),
X10 = c(NA, NA, 3L, NA, 3L, 1L, 0L, 2L, 0L, NA, 0L, 3L, 4L,
0L, 2L, 3L, 4L, 3L, 0L, 0L), X11 = c(4L, 4L, 0L, 4L, 3L,
1L, NA, 1L, 0L, 4L, 4L, NA, NA, 1L, NA, NA, 4L, 1L, NA, NA
), X12 = c(3L, 1L, 4L, 4L, 3L, 3L, 0L, 1L, 3L, 0L, 0L, 2L,
0L, 0L, NA, NA, NA, 3L, 2L, 4L), X13 = c(2L, 4L, 0L, 0L,
0L, NA, 4L, 3L, 3L, 3L, NA, 3L, 4L, 1L, 3L, 0L, 3L, NA, 3L,
4L), X14 = c(3L, 1L, 1L, 1L, 0L, 0L, 3L, 3L, 4L, 4L, NA,
0L, 4L, 3L, NA, 0L, 1L, 0L, 4L, 1L), X15 = c(2L, 2L, 1L,
0L, 3L, 1L, 4L, 4L, 2L, 1L, 3L, 2L, 2L, NA, NA, 0L, 3L, 4L,
3L, NA), X16 = c(4L, 2L, 2L, 0L, 0L, 1L, 4L, 0L, 2L, 1L,
3L, 0L, 2L, 0L, NA, 4L, 3L, 1L, 4L, 4L), resp = c(1.86666666666667,
2.26666666666667, 1.5, 2.07692307692308, 1.92857142857143,
1.78571428571429, 1.92857142857143, 2.23076923076923, 1.73333333333333,
1.71428571428571, 1.46153846153846, 1.83333333333333, 2.35714285714286,
1.64285714285714, 2.6, 1.30769230769231, 2.66666666666667,
2, 2.66666666666667, 2.15384615384615)), row.names = c(NA,
-20L), class = "data.frame")
> df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 resp
1 0 0 1 2 0 2 3 1 1 NA 4 3 2 3 2 4 1.866667
2 3 0 3 2 4 1 3 2 2 NA 4 1 4 1 2 2 2.266667
3 0 NA 0 NA 4 1 0 3 2 3 0 4 0 1 1 2 1.500000
4 1 4 NA 3 NA 4 4 0 2 NA 4 4 0 1 0 0 2.076923
5 4 4 0 NA 0 1 4 2 NA 3 3 3 0 0 3 0 1.928571
6 2 1 3 2 0 4 NA 4 2 1 1 3 NA 0 1 1 1.785714
7 NA 1 0 0 2 3 0 2 4 0 NA 0 4 3 4 4 1.928571
8 1 NA NA 3 NA 4 2 3 2 2 1 1 3 3 4 0 2.230769
9 2 0 1 4 1 3 NA 1 0 0 0 3 3 4 2 2 1.733333
10 2 3 2 0 1 NA 2 0 1 NA 4 0 3 4 1 1 1.714286
11 0 0 1 0 2 0 NA 3 3 0 4 0 NA NA 3 3 1.461538
12 4 3 NA NA NA 2 2 0 1 3 NA 2 3 0 2 0 1.833333
13 4 2 NA 3 1 1 2 3 1 4 NA 0 4 4 2 2 2.357143
14 1 NA 1 4 3 2 3 1 3 0 1 0 1 3 NA 0 1.642857
15 NA 1 4 4 2 2 0 4 4 2 NA NA 3 NA NA NA 2.600000
16 NA 1 1 3 4 0 0 1 0 3 NA NA 0 0 0 4 1.307692
17 1 NA NA NA 1 4 3 1 4 4 4 NA 3 1 3 3 2.666667
18 0 3 NA 4 1 NA 1 1 4 3 1 3 NA 0 4 1 2.000000
19 4 3 NA 3 0 NA NA 2 4 0 NA 2 3 4 3 4 2.666667
20 4 3 0 3 2 NA 0 0 3 0 NA 4 4 1 NA 4 2.153846
then
df$resp[rowSums(df[12:16]>0,na.rm = T)>0] <- NA
such that
> df
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 resp
1 0 0 1 2 0 2 3 1 1 NA 4 3 2 3 2 4 NA
2 3 0 3 2 4 1 3 2 2 NA 4 1 4 1 2 2 NA
3 0 NA 0 NA 4 1 0 3 2 3 0 4 0 1 1 2 NA
4 1 4 NA 3 NA 4 4 0 2 NA 4 4 0 1 0 0 NA
5 4 4 0 NA 0 1 4 2 NA 3 3 3 0 0 3 0 NA
6 2 1 3 2 0 4 NA 4 2 1 1 3 NA 0 1 1 NA
7 NA 1 0 0 2 3 0 2 4 0 NA 0 4 3 4 4 NA
8 1 NA NA 3 NA 4 2 3 2 2 1 1 3 3 4 0 NA
9 2 0 1 4 1 3 NA 1 0 0 0 3 3 4 2 2 NA
10 2 3 2 0 1 NA 2 0 1 NA 4 0 3 4 1 1 NA
11 0 0 1 0 2 0 NA 3 3 0 4 0 NA NA 3 3 NA
12 4 3 NA NA NA 2 2 0 1 3 NA 2 3 0 2 0 NA
13 4 2 NA 3 1 1 2 3 1 4 NA 0 4 4 2 2 NA
14 1 NA 1 4 3 2 3 1 3 0 1 0 1 3 NA 0 NA
15 NA 1 4 4 2 2 0 4 4 2 NA NA 3 NA NA NA NA
16 NA 1 1 3 4 0 0 1 0 3 NA NA 0 0 0 4 NA
17 1 NA NA NA 1 4 3 1 4 4 4 NA 3 1 3 3 NA
18 0 3 NA 4 1 NA 1 1 4 3 1 3 NA 0 4 1 NA
19 4 3 NA 3 0 NA NA 2 4 0 NA 2 3 4 3 4 NA
20 4 3 0 3 2 NA 0 0 3 0 NA 4 4 1 NA 4 NA
Upvotes: 2
Reputation: 2783
Here is a tidyverse
answer. First, we create a dummy dataset. For printing purposes I have only created one with 10 variables, rather than the 16 you requested:
library(tidyverse)
set.seed(1)
df <-
replicate(9, sample(0:4, size = 10, replace = TRUE)) %>%
as_tibble() %>%
set_names(paste0("var", 1:9))
df
#> # A tibble: 10 x 9
#> var1 var2 var3 var4 var5 var6 var7 var8 var9
#> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 4 4 3 2 0 3 2 1
#> 2 3 4 1 3 1 3 3 1 1
#> 3 0 1 1 3 1 4 0 3 4
#> 4 1 1 0 1 4 0 4 2 1
#> 5 4 0 3 3 1 0 4 4 4
#> 6 2 4 0 0 0 3 0 1 3
#> 7 1 4 3 0 2 4 0 1 4
#> 8 2 0 2 3 2 4 2 0 3
#> 9 2 0 1 0 3 3 1 2 0
#> 10 0 4 1 1 2 4 1 2 2
Next, we conditionally mutate
variable var5
such that it is equal to NA
only if all of the variables var6:var9
are greater than 0
, and otherwise keep their original values:
df <-
df %>%
mutate(
var5 = ifelse(var6 > 0 & var7 > 0 & var8 > 0 & var9 > 0, NA, var5)
)
df
#> # A tibble: 10 x 9
#> var1 var2 var3 var4 var5 var6 var7 var8 var9
#> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 4 4 3 2 0 3 2 1
#> 2 3 4 1 3 NA 3 3 1 1
#> 3 0 1 1 3 1 4 0 3 4
#> 4 1 1 0 1 4 0 4 2 1
#> 5 4 0 3 3 1 0 4 4 4
#> 6 2 4 0 0 0 3 0 1 3
#> 7 1 4 3 0 2 4 0 1 4
#> 8 2 0 2 3 2 4 2 0 3
#> 9 2 0 1 0 3 3 1 2 0
#> 10 0 4 1 1 NA 4 1 2 2
Created on 2020-01-22 by the reprex package (v0.3.0)
EDIT
Based on your comment below, we use the |
operator to say 'or' instead of &
to say 'and'. First, we create a new dummy dataset with many more 0
s for demonstration purposes:
library(tidyverse)
set.seed(1)
df <-
replicate(9, sample(c(rep(0, 10), 1:4), size = 10, replace = TRUE)) %>%
as_tibble() %>%
set_names(paste0("var", 1:9))
df
#> # A tibble: 10 x 9
#> var1 var2 var3 var4 var5 var6 var7 var8 var9
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 1 0 4 0 2 0 0 0
#> 2 0 0 0 0 2 0 4 0 3
#> 3 0 0 0 2 0 0 0 0 0
#> 4 0 0 0 0 0 0 3 0 0
#> 5 0 0 0 0 2 0 0 2 0
#> 6 3 0 4 0 0 0 0 0 0
#> 7 0 0 0 0 0 0 4 0 4
#> 8 1 4 0 0 0 0 0 0 0
#> 9 4 0 0 0 0 0 0 1 0
#> 10 0 0 0 0 0 0 0 0 3
And now we replace &
with |
:
df <-
df %>%
mutate(
var5 = ifelse(var6 > 0 | var7 > 0 | var8 > 0 | var9 > 0, NA, var5)
)
df
#> # A tibble: 10 x 9
#> var1 var2 var3 var4 var5 var6 var7 var8 var9
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0 1 0 4 NA 2 0 0 0
#> 2 0 0 0 0 NA 0 4 0 3
#> 3 0 0 0 2 0 0 0 0 0
#> 4 0 0 0 0 NA 0 3 0 0
#> 5 0 0 0 0 NA 0 0 2 0
#> 6 3 0 4 0 0 0 0 0 0
#> 7 0 0 0 0 NA 0 4 0 4
#> 8 1 4 0 0 0 0 0 0 0
#> 9 4 0 0 0 NA 0 0 1 0
#> 10 0 0 0 0 NA 0 0 0 3
Created on 2020-01-22 by the reprex package (v0.3.0)
Upvotes: 0
Reputation: 4169
First some.dummy data
library(data.table)
dt1 <- data.table(
"V1" = rnorm(10,0,1),
"V2" = rnorm(10,0,1),
"V3" = rnorm(10,0,1),
"V4" = rnorm(10,0,1),
"V5" = rnorm(10,0,1))
Then for one variable
dt1[V1 < 0, V6 := NA]
And for multiple
dt1[V1 < 0 & V2 < 0 & V3 <0, V5 := NA]
Upvotes: 1
Reputation: 11981
You can can get a better understanding by looking at smaller parts of your code.
First look at df[c(13:16)] > 0
: the result is a matrix with just TRUE
and FALSE
in it. In particular you cannot use df$var5[df[c(13:16)] > 0]
because the inner object is a matrix and you cannot subset a vector with matrix.
So what can you do? You can use apply
to transform this matrix into a vector:
idx <- apply(df[c(13:16)] > 0, 1, all)
This will result in a vector which contains TRUE
if all elements in a row are TRUE
and FALSE
otherwise.
And finally you can use df$var5[idx] <- NA
Upvotes: 0