stats_noob
stats_noob

Reputation: 5925

R: Selecting Rows Based on Conditions Stored in a Data Frame

I am working with the R programming language.

I have the following dataset:

num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)

factor_1 <- c("A","B", "C")
factor_2 <- c("AA","BB", "CC")
factor_3 <- c("AAA","BBB", "CCC", "DDD")
factor_4 <- c("AAAA","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA","BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

factor_var_1 <- as.factor(sample(factor_1, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.2)))
factor_var_2 <-  as.factor(sample(factor_2, 1000, replace=TRUE, prob=c(0.5, 0.3, 0.2)))
factor_var_3 <-  as.factor(sample(factor_3, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <-  as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
factor_var_5 <-  as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))

id = 1:1000

my_data = data.frame(id,num_var_1, num_var_2, num_var_3, num_var_4, num_var_5, factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5)


> head(my_data)
  id num_var_1 num_var_2 num_var_3 num_var_4  num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1  1  9.439524  5.021006  4.883963  8.496925  11.965498            B           AA          AAA         CCCC         AAAA
2  2  9.769823  4.800225 12.369379  6.722429  16.501132            B           AA          AAA         AAAA         AAAA
3  3 11.558708  9.910099  4.584108 -4.481653  16.710042            C           AA          BBB         AAAA         CCCC
4  4 10.070508  9.339124 22.192276  3.027154  -2.841578            B           CC          DDD         BBBB         AAAA
5  5 10.129288 -2.746714 11.741359 35.984902 -10.261096            B           AA          AAA         DDDD         DDDD
6  6 11.715065 15.202867  3.847317  9.625850  32.053261            B           AA          CCC         BBBB         EEEE

Based on the answer provided from a previous question (R: Randomly Sampling Mixed Variables), I learned how to randomly take samples from this data:

library(dplyr)
 library(purrr)

# calc the ratio of choosing variable
var_num <- ncol(my_data) - 1
var_select_ratio <- sum(1:var_num) / (var_num^2)

num_func <- function(vec, iter_num) {
  random_val = runif(iter_num, min(vec), max(vec))
  is_select <- sample(c(NA, 1), iter_num, 
                      prob = c(1 - var_select_ratio, var_select_ratio), replace = TRUE)
  return(random_val * is_select)
}

fac_func <- function(vec, iter_num) {
  nlevels <- sample.int(length(levels(vec)), iter_num, replace = TRUE)
  is_select <- sample(c(0, 1), iter_num, 
                      prob = c(1 - var_select_ratio, var_select_ratio), replace = TRUE)
  out <- map2(nlevels, is_select,  # NOTE: this process isn't vectorized
              function(nl, ic){
                if(ic == 0) NULL else sample(vec, nl)
              })
  return(out)
}

integ_func <- function(vec, iter_num) {
  if(is.factor(vec)) fac_func(vec, iter_num) else num_func(vec, iter_num)
}

After these functions are defined, you can now take random samples:

res <- my_data %>% 
  select(-id) %>% 
  map(~ integ_func(.x, iter_num = 10)) %>%  # use the func with each cols
  as_tibble()   

# if you want to paste factor_var
res2 <- res %>% 
  mutate_if(is.list, function(col) map_chr(col, function(cell) paste(sort(cell), collapse = " "))) %>%   # paste
  mutate_if(is.character, function(col) na_if(col, ""))  # replace "" to NA

This produces the following results:

 > res2 = data.frame(res2)

> res2
   num_var_1 num_var_2  num_var_3 num_var_4  num_var_5 factor_var_1 factor_var_2    factor_var_3             factor_var_4             factor_var_5
1   8.251683 27.791314  30.525573  33.95768   2.388074            B         <NA>             AAA                     AAAA                     DDDD
2   9.012602        NA         NA        NA  20.236515            A        AA BB            <NA>                     <NA>                     BBBB
3         NA 16.778085  28.097324   5.69020         NA            B           BB     CCC DDD DDD                     <NA> AAAA BBBB CCCC CCCC CCCC
4  12.838667 -3.694075  13.411877  -2.20004         NA         <NA>     AA AA BB     AAA BBB CCC                     <NA> AAAA AAAA BBBB CCCC DDDD
5         NA        NA  11.922439  17.63757         NA          A B     AA AA BB            <NA>                AAAA AAAA                     BBBB
6  12.768595        NA  28.507646        NA         NA            C           AA     BBB DDD DDD      AAAA AAAA CCCC DDDD AAAA AAAA BBBB EEEE EEEE
7         NA        NA -20.424906        NA  20.147004         <NA>        AA AA            <NA> AAAA AAAA AAAA CCCC EEEE                     <NA>
8         NA  6.299722   8.569485  24.82825 -17.715862         <NA>           BB AAA AAA BBB CCC                     <NA>                BBBB EEEE
9  10.846757        NA         NA        NA         NA        A B C     AA BB CC            <NA>                     <NA>                BBBB BBBB
10        NA  4.663916  22.335404        NA         NA        B B C        AA BB AAA AAA AAA DDD AAAA AAAA CCCC EEEE EEEE                     <NA>

My Question: Is it possible to select all rows from my_data based on the values from the 10th row of res_2?

For example:

#pseudocode 

final_result <- my_data[which(my_data$num_var_1 < res2[10,1] & my_data$num_var_2 < res2[10,2] & my_data$num_var_3 < res2[10,3] & my_data$num_var_4 < res2[10,4] & my_data$num_var_5 < res2[10,5] & my_data$factor_var_1 == res2[10,6] & my_data$factor_var_2 == res2[10,7] & my_data$factor_var_3 == res2[10,8] & my_data$factor_var_4 == res2[10,9] & my_data$factor_var_5 == res2[10,10]), ]

This of course does not work and returns an empty result:

[1] id           num_var_1    num_var_2    num_var_3    num_var_4    num_var_5    factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<0 rows> (or 0-length row.names)

This would be the equivalent of writing (see 10th row of res_2):

final_result <- my_data[which(my_data$num_var_2 < 4.66 & my_data$num_var_3 < 22.33 &  my_data$factor_var_1 %in% c( "B", "C") & my_data$factor_var_2 %in% c( "AA", "BB") & my_data$factor_var_3 %in% c( "AAA", "DDD") & my_data$factor_var_4 %in% c( "AAAA", "CCCC", "EEEE" )), ]

Can someone please show me how to do this?

Thanks!

Note: In this case, if a variable is <NA>, it would mean select ALL factors in that column. E.g. If factor_var_4 was assigned as , it would mean select ALL rows where factor_var_4 = AAAA or BBBB or CCCC or DDDD or EEEE

Upvotes: 0

Views: 453

Answers (2)

Valkyr
Valkyr

Reputation: 431

Using purrr and stringi within the tidyverse

Variant with %in%

library(stringi)
library(tidyverse)

row10 <- res2[10,]

check_num_func <- function(num){ 
  return(all(is.na(row10[1:5]) | num<row10[1:5]))
}

check_factor_func <- function(fac){
  result <- map2_lgl(fac, stri_extract_all_words(row10[6:10]), `%in%`)
  return(all(ifelse(is.na(row10[6:10]), TRUE, result)))
}

my_data_filtered <- my_data %>% 
  mutate(check_num = apply(my_data %>% select(num_var_1:num_var_5), 1, check_num_func),
         check_fac = apply(my_data %>% select(factor_var_1:factor_var_5), 1, check_factor_func)) %>% 
  filter(check_num & check_fac) %>% 
  select(-check_num, -check_fac)

Variant with is_in

library(stringi)
library(tidyverse)

row10 <- res2[10,]

check_num_func <- function(num){ 
  return(all(is.na(row10[1:5]) | num<row10[1:5]))
}

check_factor_func <- function(fac){
  result <- map2_lgl(fac, stri_extract_all_words(row10[6:10]), is_in)
  return(all(ifelse(is.na(row10[6:10]), TRUE, result)))
}

my_data_filtered <- my_data %>% 
  mutate(check_num = apply(my_data %>% select(num_var_1:num_var_5), 1, check_num_func),
         check_fac = apply(my_data %>% select(factor_var_1:factor_var_5), 1, check_factor_func)) %>% 
  filter(check_num & check_fac) %>% 
  select(-check_num, -check_fac)

My output (displaying only the head)

> res2[10,]
# A tibble: 1 x 10
  num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5       
      <dbl>     <dbl>     <dbl>     <dbl>     <dbl> <chr>        <chr>        <chr>        <chr>        <chr>              
1        NA        NA      11.0        NA      14.7 NA           NA           AAA          NA           AAAA BBBB BBBB CCCC
> my_data_filtered %>% head
  id num_var_1 num_var_2  num_var_3  num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1  3 11.846715 11.509069  10.840912  10.114141 11.554633            B           CC          AAA         BBBB         AAAA
2  6 10.690189 -1.335147  -1.028286  13.884934 12.047721            B           AA          AAA         BBBB         AAAA
3 12 10.022039 10.851465   8.905692  18.650092  7.823362            B           BB          AAA         DDDD         AAAA
4 15  9.156137  1.179674 -10.866282  20.058736  4.543155            A           BB          AAA         AAAA         BBBB
5 43 10.160805  9.347031 -10.808265  -2.588934  4.912227            B           CC          AAA         AAAA         BBBB
6 68  9.130132 10.531478 -20.125775 -10.083095 12.016262            A           BB          AAA         DDDD         BBBB

Explanation

row10 <- res2[10,] # take 10th row of res2

This next function checks if all the conditions on the numeric columns are fulfilled, given a numeric vector of length 5. row10[1:5] is row10 limited to the numeric columns. I use all to check if all elements of are TRUE. I get a logical element.

check_num_func <- function(num){                   # num = numeric vector of length 5
  return(all(is.na(row10[1:5]) | num<row10[1:5]))  # check cell by cell if it's either NA in row10 or if the cell in num < corresponding cell of row10
}

This next function checks if all the conditions on the character columns are fulfilled, given a character vector of length 5. row10[6:10] is row10 limited to the character columns. I use stringi::stri_extract_all_words to separate the factors in the strings and put them in a list. Then I purrr:map2_lgl on fac and this list, using the is_in function from magrittr (equivalent to ~.x%in%.y), which gives a logical vector. map2_lgl applies is_in to each element of fac and the corresponding element of result to check if the factor is in the factors proposed by row10. For the NA cases I use ifelse, replacing the result of map2_lgl with TRUE when row10[6:10] is NA. I use all again to see if all elements are TRUE. I get a logical element.

check_factor_func <- function(fac){ # fac = char vector of length 5
  result <- map2_lgl(fac, stri_extract_all_words(row10[6:10]), is_in)
  return(all(ifelse(is.na(row10[6:10]), TRUE, result)))
}

Then I use these 2 functions to check the conditions for all the rows using apply on the rows, selecting the relevant columns for apply, which I can then use to filter.

my_data_filtered <- my_data %>% 
  mutate(check_num = apply(my_data %>% select(num_var_1:num_var_5), 1, check_num_func),
         check_fac = apply(my_data %>% select(factor_var_1:factor_var_5), 1, check_factor_func)) %>% # create new variables applying each function to corresponding columns, row by row
  filter(check_num & check_fac) %>% 
  select(-check_num, -check_fac) # filter with my variables and remove filtering columns

Upvotes: 2

pawelru
pawelru

Reputation: 111

There are two possible solutions:

  1. revert (or don't do this at all) a pasting step that converts a vector of strings into a single string. Vector of strings will allow you to use %in% operator that I believe you are looking for.
  2. use a string in another string comparison

AD 1:

Like I said earlier - it would be easier to don't do it at all (i.e. use res instead of res2) but your question explicitly mention usage of res2. Therefore some revert step is needed:

res3 <- res2 %>% 
  mutate(
    across(where(is.character), function(vec) unlist(lapply(vec, strsplit, split = " "), recursive = FALSE))
  )

Here's the result (note randomness in the data):

> res3
# A tibble: 10 × 10
   num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
       <dbl>     <dbl>     <dbl>     <dbl>     <dbl> <list>       <list>       <list>       <list>       <list>      
 1     NA        NA        13.1       NA       NA    <chr [1]>    <chr [3]>    <chr [1]>    <chr [1]>    <chr [1]>   
 2     13.2      NA        17.3       39.4     15.4  <chr [1]>    <chr [1]>    <chr [1]>    <chr [4]>    <chr [4]>   
 3      8.52     NA        NA         NA       NA    <chr [1]>    <chr [2]>    <chr [3]>    <chr [1]>    <chr [3]>   
 4     11.5      NA        31.8       NA       NA    <chr [3]>    <chr [1]>    <chr [1]>    <chr [2]>    <chr [2]>   
 5     NA        NA        33.4       NA       30.1  <chr [1]>    <chr [2]>    <chr [1]>    <chr [1]>    <chr [1]>   
 6      7.21     -2.56      2.42      34.6     NA    <chr [2]>    <chr [2]>    <chr [3]>    <chr [2]>    <chr [5]>   
 7     12.9      NA        -4.54      13.4     17.5  <chr [1]>    <chr [1]>    <chr [1]>    <chr [5]>    <chr [2]>   
 8     10.8      NA       -19.4       NA       13.8  <chr [3]>    <chr [1]>    <chr [1]>    <chr [5]>    <chr [1]>   
 9      7.37     16.6      NA         33.3     NA    <chr [1]>    <chr [2]>    <chr [1]>    <chr [4]>    <chr [5]>   
10     NA         6.75     NA        -11.9      7.04 <chr [1]>    <chr [1]>    <chr [4]>    <chr [1]>    <chr [1]>   

Now goes the final comparison code. I noticed that you skipped all the NA in the res2 which simply mean you use a TRUE value in the comparison.

f_comp_in <- function(x, y) if (is.na(y[[1]])) T else x %in% y[[1]]
f_comp_l <- function(x, y) if (is.na(y)) T else x < y
my_data %>% 
  filter(
    f_comp_in(factor_var_1, res3[10, "factor_var_1"][[1]][[1]]) &
      f_comp_in(factor_var_2, res3[10, "factor_var_2"][[1]][[1]]) &
      f_comp_in(factor_var_3, res3[10, "factor_var_3"][[1]][[1]]) &
      f_comp_in(factor_var_4, res3[10, "factor_var_4"][[1]][[1]]) &
      f_comp_in(factor_var_5, res3[10, "factor_var_5"][[1]][[1]]) &
      f_comp_l(num_var_1, res3[10, "num_var_1"][[1]]) &
      f_comp_l(num_var_2, res3[10, "num_var_2"][[1]]) &
      f_comp_l(num_var_3, res3[10, "num_var_3"][[1]]) &
      f_comp_l(num_var_4, res3[10, "num_var_4"][[1]]) &
      f_comp_l(num_var_5, res3[10, "num_var_5"][[1]])
  )

Mind the list indexing. res2 as well as res3 is a tibble, therefore you need to run tibble_object[<row>, <col>][[1]] to get the value (tibble_object[<row>, <col>] would return a tibble). Moreover, all the factor_var_X are lists inside a list - one additional [[1]] is required to get the value(s).

Note that you can also do strsplit() within a comparison function without the need of creating res3 object (memory efficient):

f_comp_in <- function(x, y) if (is.na(y)) T else x %in% strsplit(y, split = " ")[[1]]
f_comp_l <- function(x, y) if (is.na(y)) T else x < y
my_data %>% 
  filter(
    f_comp_in(factor_var_1, res2[10, "factor_var_1"][[1]]) &
      f_comp_in(factor_var_2, res2[10, "factor_var_2"][[1]]) &
      f_comp_in(factor_var_3, res2[10, "factor_var_3"][[1]]) &
      f_comp_in(factor_var_4, res2[10, "factor_var_4"][[1]]) &
      f_comp_in(factor_var_5, res2[10, "factor_var_5"][[1]]) &
      f_comp_l(num_var_1, res2[10, "num_var_1"][[1]]) &
      f_comp_l(num_var_2, res2[10, "num_var_2"][[1]]) &
      f_comp_l(num_var_3, res2[10, "num_var_3"][[1]]) &
      f_comp_l(num_var_4, res2[10, "num_var_4"][[1]]) &
      f_comp_l(num_var_5, res2[10, "num_var_5"][[1]])
  )

AD 2:

Unfortunately, base string comparison with grepl() is not vectorised thus you need to execute it inside a for loop.

f_comp_in <- function(x, y) if (is.na(y)) T else vapply(x, grepl, logical(1), x = y)
f_comp_l <- function(x, y) if (is.na(y)) T else x < y
my_data %>% 
  filter(
    f_comp_in(factor_var_1, res2[10, "factor_var_1"][[1]]) &
      f_comp_in(factor_var_2, res2[10, "factor_var_2"][[1]]) &
      f_comp_in(factor_var_3, res2[10, "factor_var_3"][[1]]) &
      f_comp_in(factor_var_4, res2[10, "factor_var_4"][[1]]) &
      f_comp_in(factor_var_5, res2[10, "factor_var_5"][[1]]) &
      f_comp_l(num_var_1, res2[10, "num_var_1"][[1]]) &
      f_comp_l(num_var_2, res2[10, "num_var_2"][[1]]) &
      f_comp_l(num_var_3, res2[10, "num_var_3"][[1]]) &
      f_comp_l(num_var_4, res2[10, "num_var_4"][[1]]) &
      f_comp_l(num_var_5, res2[10, "num_var_5"][[1]])
  )

Upvotes: 3

Related Questions