Reputation: 5925
I am working with the R programming language.
I have the following dataset:
num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)
factor_1 <- c("A","B", "C")
factor_2 <- c("AA","BB", "CC")
factor_3 <- c("AAA","BBB", "CCC", "DDD")
factor_4 <- c("AAAA","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA","BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")
factor_var_1 <- as.factor(sample(factor_1, 1000, replace=TRUE, prob=c(0.3, 0.5, 0.2)))
factor_var_2 <- as.factor(sample(factor_2, 1000, replace=TRUE, prob=c(0.5, 0.3, 0.2)))
factor_var_3 <- as.factor(sample(factor_3, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <- as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
factor_var_5 <- as.factor(sample(factor_4, 1000, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
id = 1:1000
my_data = data.frame(id,num_var_1, num_var_2, num_var_3, num_var_4, num_var_5, factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5)
> head(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 1 9.439524 5.021006 4.883963 8.496925 11.965498 B AA AAA CCCC AAAA
2 2 9.769823 4.800225 12.369379 6.722429 16.501132 B AA AAA AAAA AAAA
3 3 11.558708 9.910099 4.584108 -4.481653 16.710042 C AA BBB AAAA CCCC
4 4 10.070508 9.339124 22.192276 3.027154 -2.841578 B CC DDD BBBB AAAA
5 5 10.129288 -2.746714 11.741359 35.984902 -10.261096 B AA AAA DDDD DDDD
6 6 11.715065 15.202867 3.847317 9.625850 32.053261 B AA CCC BBBB EEEE
Based on the answer provided from a previous question (R: Randomly Sampling Mixed Variables), I learned how to randomly take samples from this data:
library(dplyr)
library(purrr)
# calc the ratio of choosing variable
var_num <- ncol(my_data) - 1
var_select_ratio <- sum(1:var_num) / (var_num^2)
num_func <- function(vec, iter_num) {
random_val = runif(iter_num, min(vec), max(vec))
is_select <- sample(c(NA, 1), iter_num,
prob = c(1 - var_select_ratio, var_select_ratio), replace = TRUE)
return(random_val * is_select)
}
fac_func <- function(vec, iter_num) {
nlevels <- sample.int(length(levels(vec)), iter_num, replace = TRUE)
is_select <- sample(c(0, 1), iter_num,
prob = c(1 - var_select_ratio, var_select_ratio), replace = TRUE)
out <- map2(nlevels, is_select, # NOTE: this process isn't vectorized
function(nl, ic){
if(ic == 0) NULL else sample(vec, nl)
})
return(out)
}
integ_func <- function(vec, iter_num) {
if(is.factor(vec)) fac_func(vec, iter_num) else num_func(vec, iter_num)
}
After these functions are defined, you can now take random samples:
res <- my_data %>%
select(-id) %>%
map(~ integ_func(.x, iter_num = 10)) %>% # use the func with each cols
as_tibble()
# if you want to paste factor_var
res2 <- res %>%
mutate_if(is.list, function(col) map_chr(col, function(cell) paste(sort(cell), collapse = " "))) %>% # paste
mutate_if(is.character, function(col) na_if(col, "")) # replace "" to NA
This produces the following results:
> res2 = data.frame(res2)
> res2
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 8.251683 27.791314 30.525573 33.95768 2.388074 B <NA> AAA AAAA DDDD
2 9.012602 NA NA NA 20.236515 A AA BB <NA> <NA> BBBB
3 NA 16.778085 28.097324 5.69020 NA B BB CCC DDD DDD <NA> AAAA BBBB CCCC CCCC CCCC
4 12.838667 -3.694075 13.411877 -2.20004 NA <NA> AA AA BB AAA BBB CCC <NA> AAAA AAAA BBBB CCCC DDDD
5 NA NA 11.922439 17.63757 NA A B AA AA BB <NA> AAAA AAAA BBBB
6 12.768595 NA 28.507646 NA NA C AA BBB DDD DDD AAAA AAAA CCCC DDDD AAAA AAAA BBBB EEEE EEEE
7 NA NA -20.424906 NA 20.147004 <NA> AA AA <NA> AAAA AAAA AAAA CCCC EEEE <NA>
8 NA 6.299722 8.569485 24.82825 -17.715862 <NA> BB AAA AAA BBB CCC <NA> BBBB EEEE
9 10.846757 NA NA NA NA A B C AA BB CC <NA> <NA> BBBB BBBB
10 NA 4.663916 22.335404 NA NA B B C AA BB AAA AAA AAA DDD AAAA AAAA CCCC EEEE EEEE <NA>
My Question: Is it possible to select all rows from my_data
based on the values from the 10th row of res_2
?
For example:
#pseudocode
final_result <- my_data[which(my_data$num_var_1 < res2[10,1] & my_data$num_var_2 < res2[10,2] & my_data$num_var_3 < res2[10,3] & my_data$num_var_4 < res2[10,4] & my_data$num_var_5 < res2[10,5] & my_data$factor_var_1 == res2[10,6] & my_data$factor_var_2 == res2[10,7] & my_data$factor_var_3 == res2[10,8] & my_data$factor_var_4 == res2[10,9] & my_data$factor_var_5 == res2[10,10]), ]
This of course does not work and returns an empty result:
[1] id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<0 rows> (or 0-length row.names)
This would be the equivalent of writing (see 10th row of res_2
):
final_result <- my_data[which(my_data$num_var_2 < 4.66 & my_data$num_var_3 < 22.33 & my_data$factor_var_1 %in% c( "B", "C") & my_data$factor_var_2 %in% c( "AA", "BB") & my_data$factor_var_3 %in% c( "AAA", "DDD") & my_data$factor_var_4 %in% c( "AAAA", "CCCC", "EEEE" )), ]
Can someone please show me how to do this?
Thanks!
Note: In this case, if a variable is <NA>
, it would mean select ALL factors in that column. E.g. If factor_var_4 was assigned as , it would mean select ALL rows where factor_var_4 = AAAA or BBBB or CCCC or DDDD or EEEE
Upvotes: 0
Views: 453
Reputation: 431
Using purrr
and stringi
within the tidyverse
Variant with %in%
library(stringi)
library(tidyverse)
row10 <- res2[10,]
check_num_func <- function(num){
return(all(is.na(row10[1:5]) | num<row10[1:5]))
}
check_factor_func <- function(fac){
result <- map2_lgl(fac, stri_extract_all_words(row10[6:10]), `%in%`)
return(all(ifelse(is.na(row10[6:10]), TRUE, result)))
}
my_data_filtered <- my_data %>%
mutate(check_num = apply(my_data %>% select(num_var_1:num_var_5), 1, check_num_func),
check_fac = apply(my_data %>% select(factor_var_1:factor_var_5), 1, check_factor_func)) %>%
filter(check_num & check_fac) %>%
select(-check_num, -check_fac)
Variant with is_in
library(stringi)
library(tidyverse)
row10 <- res2[10,]
check_num_func <- function(num){
return(all(is.na(row10[1:5]) | num<row10[1:5]))
}
check_factor_func <- function(fac){
result <- map2_lgl(fac, stri_extract_all_words(row10[6:10]), is_in)
return(all(ifelse(is.na(row10[6:10]), TRUE, result)))
}
my_data_filtered <- my_data %>%
mutate(check_num = apply(my_data %>% select(num_var_1:num_var_5), 1, check_num_func),
check_fac = apply(my_data %>% select(factor_var_1:factor_var_5), 1, check_factor_func)) %>%
filter(check_num & check_fac) %>%
select(-check_num, -check_fac)
My output (displaying only the head
)
> res2[10,]
# A tibble: 1 x 10
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
1 NA NA 11.0 NA 14.7 NA NA AAA NA AAAA BBBB BBBB CCCC
> my_data_filtered %>% head
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 3 11.846715 11.509069 10.840912 10.114141 11.554633 B CC AAA BBBB AAAA
2 6 10.690189 -1.335147 -1.028286 13.884934 12.047721 B AA AAA BBBB AAAA
3 12 10.022039 10.851465 8.905692 18.650092 7.823362 B BB AAA DDDD AAAA
4 15 9.156137 1.179674 -10.866282 20.058736 4.543155 A BB AAA AAAA BBBB
5 43 10.160805 9.347031 -10.808265 -2.588934 4.912227 B CC AAA AAAA BBBB
6 68 9.130132 10.531478 -20.125775 -10.083095 12.016262 A BB AAA DDDD BBBB
Explanation
row10 <- res2[10,] # take 10th row of res2
This next function checks if all the conditions on the numeric columns are fulfilled, given a numeric vector of length 5.
row10[1:5]
is row10
limited to the numeric columns. I use all
to check if all elements of are TRUE
.
I get a logical element.
check_num_func <- function(num){ # num = numeric vector of length 5
return(all(is.na(row10[1:5]) | num<row10[1:5])) # check cell by cell if it's either NA in row10 or if the cell in num < corresponding cell of row10
}
This next function checks if all the conditions on the character columns are fulfilled, given a character vector of length 5.
row10[6:10]
is row10
limited to the character columns.
I use stringi::stri_extract_all_words
to separate the factors in the strings and put them in a list.
Then I purrr:map2_lgl
on fac
and this list, using the is_in
function from magrittr
(equivalent to ~.x%in%.y
), which gives a logical vector. map2_lgl
applies is_in
to each element of fac
and the corresponding element of result
to check if the factor is in the factors proposed by row10
.
For the NA
cases I use ifelse
, replacing the result of map2_lgl
with TRUE
when row10[6:10]
is NA
. I use all
again to see if all elements are TRUE
.
I get a logical element.
check_factor_func <- function(fac){ # fac = char vector of length 5
result <- map2_lgl(fac, stri_extract_all_words(row10[6:10]), is_in)
return(all(ifelse(is.na(row10[6:10]), TRUE, result)))
}
Then I use these 2 functions to check the conditions for all the rows using apply
on the rows, selecting the relevant columns for apply
, which I can then use to filter
.
my_data_filtered <- my_data %>%
mutate(check_num = apply(my_data %>% select(num_var_1:num_var_5), 1, check_num_func),
check_fac = apply(my_data %>% select(factor_var_1:factor_var_5), 1, check_factor_func)) %>% # create new variables applying each function to corresponding columns, row by row
filter(check_num & check_fac) %>%
select(-check_num, -check_fac) # filter with my variables and remove filtering columns
Upvotes: 2
Reputation: 111
There are two possible solutions:
%in%
operator that I believe you are looking for.AD 1:
Like I said earlier - it would be easier to don't do it at all (i.e. use res
instead of res2
) but your question explicitly mention usage of res2
. Therefore some revert step is needed:
res3 <- res2 %>%
mutate(
across(where(is.character), function(vec) unlist(lapply(vec, strsplit, split = " "), recursive = FALSE))
)
Here's the result (note randomness in the data):
> res3
# A tibble: 10 × 10
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
<dbl> <dbl> <dbl> <dbl> <dbl> <list> <list> <list> <list> <list>
1 NA NA 13.1 NA NA <chr [1]> <chr [3]> <chr [1]> <chr [1]> <chr [1]>
2 13.2 NA 17.3 39.4 15.4 <chr [1]> <chr [1]> <chr [1]> <chr [4]> <chr [4]>
3 8.52 NA NA NA NA <chr [1]> <chr [2]> <chr [3]> <chr [1]> <chr [3]>
4 11.5 NA 31.8 NA NA <chr [3]> <chr [1]> <chr [1]> <chr [2]> <chr [2]>
5 NA NA 33.4 NA 30.1 <chr [1]> <chr [2]> <chr [1]> <chr [1]> <chr [1]>
6 7.21 -2.56 2.42 34.6 NA <chr [2]> <chr [2]> <chr [3]> <chr [2]> <chr [5]>
7 12.9 NA -4.54 13.4 17.5 <chr [1]> <chr [1]> <chr [1]> <chr [5]> <chr [2]>
8 10.8 NA -19.4 NA 13.8 <chr [3]> <chr [1]> <chr [1]> <chr [5]> <chr [1]>
9 7.37 16.6 NA 33.3 NA <chr [1]> <chr [2]> <chr [1]> <chr [4]> <chr [5]>
10 NA 6.75 NA -11.9 7.04 <chr [1]> <chr [1]> <chr [4]> <chr [1]> <chr [1]>
Now goes the final comparison code. I noticed that you skipped all the NA
in the res2
which simply mean you use a TRUE
value in the comparison.
f_comp_in <- function(x, y) if (is.na(y[[1]])) T else x %in% y[[1]]
f_comp_l <- function(x, y) if (is.na(y)) T else x < y
my_data %>%
filter(
f_comp_in(factor_var_1, res3[10, "factor_var_1"][[1]][[1]]) &
f_comp_in(factor_var_2, res3[10, "factor_var_2"][[1]][[1]]) &
f_comp_in(factor_var_3, res3[10, "factor_var_3"][[1]][[1]]) &
f_comp_in(factor_var_4, res3[10, "factor_var_4"][[1]][[1]]) &
f_comp_in(factor_var_5, res3[10, "factor_var_5"][[1]][[1]]) &
f_comp_l(num_var_1, res3[10, "num_var_1"][[1]]) &
f_comp_l(num_var_2, res3[10, "num_var_2"][[1]]) &
f_comp_l(num_var_3, res3[10, "num_var_3"][[1]]) &
f_comp_l(num_var_4, res3[10, "num_var_4"][[1]]) &
f_comp_l(num_var_5, res3[10, "num_var_5"][[1]])
)
Mind the list indexing. res2
as well as res3
is a tibble
, therefore you need to run tibble_object[<row>, <col>][[1]]
to get the value (tibble_object[<row>, <col>]
would return a tibble
). Moreover, all the factor_var_X
are lists inside a list - one additional [[1]]
is required to get the value(s).
Note that you can also do strsplit()
within a comparison function without the need of creating res3
object (memory efficient):
f_comp_in <- function(x, y) if (is.na(y)) T else x %in% strsplit(y, split = " ")[[1]]
f_comp_l <- function(x, y) if (is.na(y)) T else x < y
my_data %>%
filter(
f_comp_in(factor_var_1, res2[10, "factor_var_1"][[1]]) &
f_comp_in(factor_var_2, res2[10, "factor_var_2"][[1]]) &
f_comp_in(factor_var_3, res2[10, "factor_var_3"][[1]]) &
f_comp_in(factor_var_4, res2[10, "factor_var_4"][[1]]) &
f_comp_in(factor_var_5, res2[10, "factor_var_5"][[1]]) &
f_comp_l(num_var_1, res2[10, "num_var_1"][[1]]) &
f_comp_l(num_var_2, res2[10, "num_var_2"][[1]]) &
f_comp_l(num_var_3, res2[10, "num_var_3"][[1]]) &
f_comp_l(num_var_4, res2[10, "num_var_4"][[1]]) &
f_comp_l(num_var_5, res2[10, "num_var_5"][[1]])
)
AD 2:
Unfortunately, base string comparison with grepl()
is not vectorised thus you need to execute it inside a for loop.
f_comp_in <- function(x, y) if (is.na(y)) T else vapply(x, grepl, logical(1), x = y)
f_comp_l <- function(x, y) if (is.na(y)) T else x < y
my_data %>%
filter(
f_comp_in(factor_var_1, res2[10, "factor_var_1"][[1]]) &
f_comp_in(factor_var_2, res2[10, "factor_var_2"][[1]]) &
f_comp_in(factor_var_3, res2[10, "factor_var_3"][[1]]) &
f_comp_in(factor_var_4, res2[10, "factor_var_4"][[1]]) &
f_comp_in(factor_var_5, res2[10, "factor_var_5"][[1]]) &
f_comp_l(num_var_1, res2[10, "num_var_1"][[1]]) &
f_comp_l(num_var_2, res2[10, "num_var_2"][[1]]) &
f_comp_l(num_var_3, res2[10, "num_var_3"][[1]]) &
f_comp_l(num_var_4, res2[10, "num_var_4"][[1]]) &
f_comp_l(num_var_5, res2[10, "num_var_5"][[1]])
)
Upvotes: 3