Reputation: 675
I am trying to sample the same number of rows from each class in two data frames of differing sizes. I can do this manually, but the number of classes in some of my data frames is quite large.
I have been able to use the dplyr::count
function to get a list of the classes of interest from the smaller data frame, as well as their counts. I then extract these classes and their counts as vectors. I then attempted to create a function using these vectors and call it using mapply
, so I can create filtered slices for each class then re-join the lists using do.call
, but I am getting errors when I attempt to run mapply
.
Below are example datasets. df is the smaller data frame which has 6 rows containing ControlVarA == "Group_1"
and 10 rows containing ControlVarA == "Group_2"
, and I want to extract the same number of rows per class from the larger data frame df2 (which has 6 rows containing ControlVarA == "Group_1"
and 20 rows containing ControlVarA == "Group_2"
).
# Smaller example data frame: 16 rows, 6 in Group_1 followed by 10 in Group_2.
df <- data.frame(
  ID = 1:16,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), 16),
  VarB = c(10, 0, 0, 0, rep(12, 4), 0, 14, NA_real_, 14, rep(16, 4)),
  VarC = rep(c(10, 12, 14, 16), times = 4),
  VarD = rep(c(10, 12, 14, 16), times = 4),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 10)))
)
# Print the data frame for inspection.
df
# Larger example data frame: 26 rows, 6 in Group_1 followed by 20 in Group_2.
df2 <- data.frame(
  ID = 1:26,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), rep(16, 11)),
  VarB = c(10, 0, 0, 0, rep(12, 4), 0, 14, NA_real_, 14, rep(16, 14)),
  VarC = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  VarD = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 20)))
)
# Print the data frame for inspection.
df2
To extract the class names and class counts I use the code below.
# Class names present in the smaller data frame, as a character vector
# (one entry per non-missing value of ControlVarA, in count() order).
# The NA check targets the class column itself: `is.na(.)` would test every
# cell of the counted table and hand filter() a matrix rather than one
# logical value per row.
slice_vars <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA)) %>%
  pull(ControlVarA) %>%
  as.character()
# Number of rows per class in the smaller data frame, as a numeric vector in
# the same order as the class names produced by count().
# Reading the `n` column directly avoids turning the counts into (padded)
# character column names and parsing them back into numbers.
slice_nums <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA)) %>%
  pull(n) %>%
  as.numeric()
The function
I created and mapply
statement are below
# Draw a random sample of rows belonging to a single class.
#
# dataset: data frame with a ControlVarA column identifying the class.
# x:       number of rows to draw (a single number).
# y:       class label to sample from (a single string).
#
# Returns the sampled rows of `dataset` whose ControlVarA equals `y`.
func_group <- function(dataset, x, y) {
  dataset %>%
    # Keep only the requested class first; `.env$y` forces the lookup of the
    # function argument even if `dataset` happens to have a column named `y`.
    filter(ControlVarA == .env$y) %>%
    # `n` must be a plain number; all_of() is a column-selection helper and
    # is not valid here.
    slice_sample(n = x)
}
# Sample each class from df2 using the per-class counts taken from df.
# SIMPLIFY = FALSE keeps the result as a list with one data frame per class;
# the default would try to collapse the equally-shaped results into a matrix.
combine_lists <- mapply(
  func_group,
  x = slice_nums,
  y = slice_vars,
  MoreArgs = list(dataset = df2),
  SIMPLIFY = FALSE
)
# Stack the per-class samples back into a single data frame.
do.call(rbind, combine_lists)
Upvotes: 0
Views: 218
Reputation: 389047
Use count
to get the number of rows for each value in ControlVarA
, join with df2
and select n
random rows from each group using sample_n
. (Unfortunately, slice_sample(n = first(n))
returns an error)
library(dplyr)
# Count rows per class in df, attach those counts to the rows of df2, then
# draw that many random rows from each class of df2.
df %>%
  count(ControlVarA) %>%
  # inner_join keeps only classes present in both data frames, so a class
  # missing from df2 does not turn into an all-NA row.
  inner_join(df2, by = "ControlVarA") %>%
  group_by(ControlVarA) %>%
  # `n` is the count column from df, `n()` the size of the current df2 group;
  # taking the smaller avoids an error when df2 has fewer rows than requested.
  sample_n(min(first(n), n())) %>%
  ungroup() %>%
  select(-n)
# ControlVarA ID VarA VarB VarC VarD
# <fct> <int> <dbl> <dbl> <dbl> <dbl>
# 1 Group_1 1 1 10 10 10
# 2 Group_1 4 1 0 16 16
# 3 Group_1 3 1 0 14 14
# 4 Group_1 2 1 0 12 12
# 5 Group_1 5 1 12 10 10
# 6 Group_1 6 1 12 12 12
# 7 Group_2 12 14 14 16 16
# 8 Group_2 25 16 16 16 16
# 9 Group_2 15 NA 16 14 14
#10 Group_2 22 16 16 16 16
#11 Group_2 9 1 0 10 10
#12 Group_2 8 1 12 16 16
#13 Group_2 24 16 16 16 16
#14 Group_2 21 16 16 16 16
#15 Group_2 7 1 12 14 14
#16 Group_2 14 NA 16 12 12
Upvotes: 2
Reputation: 1137
library(tidyverse)
# Smaller example data frame: 16 rows, 6 in Group_1 followed by 10 in Group_2.
cycle_vals <- NULL
df <- data.frame(
  ID = seq_len(16),
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), 16),
  VarB = c(10, rep(0, 3), rep(12, 4), 0, 14, NA_real_, 14, rep(16, 4)),
  VarC = rep(c(10, 12, 14, 16), 4),
  VarD = rep(c(10, 12, 14, 16), 4),
  ControlVarA = factor(c(rep("Group_1", 6), rep("Group_2", 10)))
)
rm(cycle_vals)
# Larger example data frame: 26 rows, 6 in Group_1 followed by 20 in Group_2.
df2 <- data.frame(
  ID = seq_len(26),
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), rep(16, 11)),
  VarB = c(10, rep(0, 3), rep(12, 4), 0, 14, NA_real_, 14, rep(16, 14)),
  VarC = c(rep(c(10, 12, 14, 16), 4), rep(16, 10)),
  VarD = c(rep(c(10, 12, 14, 16), 4), rep(16, 10)),
  ControlVarA = factor(c(rep("Group_1", 6), rep("Group_2", 20)))
)
# Convert both data frames to tibbles and tag each row with its source table,
# then stack them so both tables can be sampled in one grouped operation.
df <- mutate(as_tibble(df), table = "df")
df2 <- mutate(as_tibble(df2), table = "df2")
final_df <- bind_rows(df, df2)
# Fix the RNG seed so the sampled rows are reproducible.
set.seed(2021)
# Keep rows with no missing value in VarA..VarD, then draw 5 rows from every
# (table, class) combination. There is no ungroup(), so the result stays
# grouped by table and ControlVarA.
final_df %>%
  filter(if_all(.cols = VarA:VarD, .fns = ~ !is.na(.x))) %>%
  group_by(table, ControlVarA) %>%
  slice_sample(n = 5)
#> # A tibble: 20 x 7
#> # Groups: table, ControlVarA [4]
#> ID VarA VarB VarC VarD ControlVarA table
#> <int> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
#> 1 6 1 12 12 12 Group_1 df
#> 2 2 1 0 12 12 Group_1 df
#> 3 3 1 0 14 14 Group_1 df
#> 4 5 1 12 10 10 Group_1 df
#> 5 4 1 0 16 16 Group_1 df
#> 6 16 16 16 16 16 Group_2 df
#> 7 9 1 0 10 10 Group_2 df
#> 8 8 1 12 16 16 Group_2 df
#> 9 10 1 14 12 12 Group_2 df
#> 10 7 1 12 14 14 Group_2 df
#> 11 1 1 10 10 10 Group_1 df2
#> 12 4 1 0 16 16 Group_1 df2
#> 13 3 1 0 14 14 Group_1 df2
#> 14 2 1 0 12 12 Group_1 df2
#> 15 6 1 12 12 12 Group_1 df2
#> 16 22 16 16 16 16 Group_2 df2
#> 17 23 16 16 16 16 Group_2 df2
#> 18 9 1 0 10 10 Group_2 df2
#> 19 18 16 16 16 16 Group_2 df2
#> 20 20 16 16 16 16 Group_2 df2
Created on 2021-07-13 by the reprex package (v2.0.0)
Upvotes: 2