Reputation: 15

How to use dplyr with variables for column names

I want to mutate one column of a data frame dynamically with dplyr by passing column names with a variable. For example, I have the following data frame:

# Example data frame: four integer columns (A-D) and a logical column E
# that is TRUE in every row.
DF <- data.frame(
  A = 1:10,
  B = 11:20,
  C = c(23:30, 21:22),
  D = c(39:40, 31:38),
  E = rep(TRUE, 10)
)

DF
    A  B  C  D    E
1   1 11 23 39 TRUE
2   2 12 24 40 TRUE
3   3 13 25 31 TRUE
4   4 14 26 32 TRUE
5   5 15 27 33 TRUE
6   6 16 28 34 TRUE
7   7 17 29 35 TRUE
8   8 18 30 36 TRUE
9   9 19 21 37 TRUE
10 10 20 22 38 TRUE

Now I want to change the value of column E to FALSE for those rows where

the value in column B is smaller than the 0.1 quantile OR larger than the 0.9 quantile of all values in column B OR
if the same condition applies to column C OR
if the same condition applies to column D

So the resulting data frame should look like this:

    A  B  C  D    E
1   1 11 23 39 FALSE
2   2 12 24 40 FALSE
3   3 13 25 31 FALSE
4   4 14 26 32 TRUE
5   5 15 27 33 TRUE
6   6 16 28 34 TRUE
7   7 17 29 35 TRUE
8   8 18 30 36 FALSE
9   9 19 21 37 FALSE
10 10 20 22 38 FALSE

I wrote a script that mutates the data frame based on just one column (B), and it works well:

# Set E to FALSE for every row whose B value is not strictly between the
# 0.1 and 0.9 quantiles of B; rows where E is already FALSE stay FALSE.
DF <- dplyr::mutate(
  DF,
  E = if_else(B < quantile(B, 0.9), E, FALSE),
  E = if_else(B > quantile(B, 0.1), E, FALSE)
)

DF
    A  B  C  D     E
1   1 11 23 39 FALSE
2   2 12 24 40  TRUE
3   3 13 25 31  TRUE
4   4 14 26 32  TRUE
5   5 15 27 33  TRUE
6   6 16 28 34  TRUE
7   7 17 29 35  TRUE
8   8 18 30 36  TRUE
9   9 19 21 37  TRUE
10 10 20 22 38 FALSE

However, when I try to make this dynamic, it doesn't work:

# NOTE: `cols` is not defined in this snippet; presumably it holds the names
# of the columns to check as character strings.
for (col in cols) {
  DF <- DF %>%
      # `col` is a string here, so quantile() is apparently handed the column
      # name itself rather than the column's values -- hence the error below.
      dplyr::mutate_(E = if_else(col < quantile(col, 0.9), E, FALSE)) %>%
      dplyr::mutate_(E = if_else(col > quantile(col, 0.1), E, FALSE))
}

Error in (1 - h) * qs[i] : non-numeric argument to binary operator

How can I solve this?

Upvotes: 1

Answers (3)

Ivor

Reputation: 71

Using the base R function get():

# For each column name in `cols`, get() looks the column up by name so that
# the comparisons run on the column's values instead of on the name string.
for (col in cols) {
  DF <- dplyr::mutate(
    DF,
    E = if_else(get(col) < quantile(get(col), 0.9), E, FALSE),
    E = if_else(get(col) > quantile(get(col), 0.1), E, FALSE)
  )
}

Upvotes: 0

akrun

Reputation: 887811

We can use interp() from the lazyeval package:

library(dplyr)
library(lazyeval)
# Each condition is written as a formula with the placeholder `Col`;
# interp() substitutes the real column (as a symbol) before mutate_() runs it.
for (col in cols) {
  col_sym <- as.name(col)
  DF <- DF %>%
    mutate_(E = interp(~ if_else(Col < quantile(Col, 0.9), E, FALSE),
                       Col = col_sym)) %>%
    mutate_(E = interp(~ if_else(Col > quantile(Col, 0.1), E, FALSE),
                       Col = col_sym))
}

DF
#    A  B  C  D     E
#1   1 11 23 39 FALSE
#2   2 12 24 40 FALSE
#3   3 13 25 31 FALSE
#4   4 14 26 32  TRUE
#5   5 15 27 33  TRUE
#6   6 16 28 34  TRUE
#7   7 17 29 35  TRUE
#8   8 18 30 36 FALSE
#9   9 19 21 37 FALSE
#10 10 20 22 38 FALSE

where

# Names of the columns to check: positions 2 to 4 of DF
# (B, C and D in the example data).
cols <- names(DF)[2:4]

Update

If we also need to pass the 'E' column

# Same approach, but the flag column is passed by name as well: `Col2` is the
# placeholder for DF's fifth column, which is both read and overwritten.
flag_col <- names(DF)[5]
for (col in cols) {
  subs <- list(Col = as.name(col), Col2 = as.name(flag_col))
  below_upper <- interp(
    ~ if_else(Col < quantile(Col, 0.9), Col2, FALSE),
    .values = subs
  )
  above_lower <- interp(
    ~ if_else(Col > quantile(Col, 0.1), Col2, FALSE),
    .values = subs
  )
  DF <- DF %>%
    mutate_(.dots = setNames(list(below_upper), flag_col)) %>%
    mutate_(.dots = setNames(list(above_lower), flag_col))
}
 DF
#    A  B  C  D     E
#1   1 11 23 39 FALSE
#2   2 12 24 40 FALSE
#3   3 13 25 31 FALSE
#4   4 14 26 32  TRUE
#5   5 15 27 33  TRUE
#6   6 16 28 34  TRUE
#7   7 17 29 35  TRUE
#8   8 18 30 36 FALSE
#9   9 19 21 37 FALSE
#10 10 20 22 38 FALSE

Update2

With the devel version of dplyr (soon to be released as 0.6.0), we can also pass the variables as quosures and evaluate them by unquoting inside mutate:

 # Capture the target column E as a quosure.
 varN <- quo(E)
 # Build one quosure per checked column by parsing the names in positions
 # 2 to 4 of DF, joined with ";" into a single string.
 # NOTE: this replaces the character vector `cols` used in the earlier snippets.
 cols <- rlang::parse_quosures(paste(names(DF)[2:4], collapse=";"))
 # String form of the target column, used on the left-hand side of `:=`.
 varN1 <- quo_name(varN)

 for(i in seq_along(cols)) {
    DF <- DF %>%
         # `!!` unquotes the quosures/name so mutate() works on the actual
         # columns; the second assignment presumably sees the E written by
         # the first one, as mutate() evaluates its arguments in order.
         mutate(!!varN1 := if_else((!!cols[[i]]) < quantile((!!cols[[i]]), 0.9),
                      (!!varN), FALSE),
                !!varN1 := if_else((!!cols[[i]]) > quantile((!!cols[[i]]), 0.1),
                      (!!varN), FALSE))  


 }  
DF
#    A  B  C  D     E
#1   1 11 23 39 FALSE
#2   2 12 24 40 FALSE
#3   3 13 25 31 FALSE
#4   4 14 26 32  TRUE
#5   5 15 27 33  TRUE
#6   6 16 28 34  TRUE
#7   7 17 29 35  TRUE
#8   8 18 30 36 FALSE
#9   9 19 21 37 FALSE
#10 10 20 22 38 FALSE

Or another option is data.table

library(data.table) 
# Recompute E in place: a row is TRUE only if every one of columns 2-4 lies
# strictly between that column's 0.1 and 0.9 quantiles.
setDT(DF)[, E := Reduce(`&`, lapply(.SD, function(v) {
  v > quantile(v, 0.1) & v < quantile(v, 0.9)
})), .SDcols = 2:4]

 DF
 #    A  B  C  D     E
 #1:  1 11 23 39 FALSE
 #2:  2 12 24 40 FALSE
 #3:  3 13 25 31 FALSE
 #4:  4 14 26 32  TRUE
 #5:  5 15 27 33  TRUE
 #6:  6 16 28 34  TRUE
 #7:  7 17 29 35  TRUE
 #8:  8 18 30 36 FALSE
 #9:  9 19 21 37 FALSE
 #10:10 20 22 38 FALSE

Or with only base R functions

# Recompute E: a row is TRUE only if every one of columns 2-4 lies strictly
# between that column's 0.1 and 0.9 quantiles.
DF$E <- Reduce(
  `&`,
  lapply(DF[2:4], function(v) {
    v > quantile(v, 0.1) & v < quantile(v, 0.9)
  })
)

DF
#    A  B  C  D     E
#1   1 11 23 39 FALSE
#2   2 12 24 40 FALSE
#3   3 13 25 31 FALSE
#4   4 14 26 32  TRUE
#5   5 15 27 33  TRUE
#6   6 16 28 34  TRUE
#7   7 17 29 35  TRUE
#8   8 18 30 36 FALSE
#9   9 19 21 37 FALSE
#10 10 20 22 38 FALSE

Note: The base R option above uses no external packages.

Note2: All the options return the same output

Upvotes: 1

alistaire

Reputation: 43354

You can iterate directly within mutate:

# sapply() yields one logical column per checked variable; apply(..., 1, all)
# then keeps a row TRUE only if all of its checks pass.
DF %>%
  mutate(
    E = apply(
      sapply(list(B, C, D), function(v) {
        v > quantile(v, .1) & v < quantile(v, .9)
      }),
      1,
      all
    )
  )
##     A  B  C  D     E
## 1   1 11 23 39 FALSE
## 2   2 12 24 40 FALSE
## 3   3 13 25 31 FALSE
## 4   4 14 26 32  TRUE
## 5   5 15 27 33  TRUE
## 6   6 16 28 34  TRUE
## 7   7 17 29 35  TRUE
## 8   8 18 30 36 FALSE
## 9   9 19 21 37 FALSE
## 10 10 20 22 38 FALSE

or with purrr,

library(tidyverse)

# map() builds one logical vector per checked column; pmap_lgl() then walks
# the vectors in parallel and applies all() to each row's values.
DF %>%
  mutate(
    E = pmap_lgl(
      map(list(B, C, D), ~ .x > quantile(.x, .1) & .x < quantile(.x, .9)),
      all
    )
  )

or go all in on matrices:

# Bind the checked columns into a matrix, test each column (margin 2)
# against its own quantiles, then collapse each row (margin 1) with all().
DF %>%
  mutate(
    E = apply(
      apply(cbind(B, C, D), 2, function(v) {
        v > quantile(v, .1) & v < quantile(v, .9)
      }),
      1,
      all
    )
  )

All return the same thing.

If you like, substitute between for the inequalities, e.g. between(x, quantile(x, .1), quantile(x, .9)), though because it's defined as x >= left & x <= right it may differ when boundaries matter.

Upvotes: 0

How to use dplyr with variables for column names

Answers (3)

Update

Update2

Related Questions