Reputation: 435
I have a data frame in the format:
head(subset)
# ants 0 1 1 0 1
# age 1 2 2 1 3
# lc 1 1 0 1 0
I need to create new data frame with random samples according to age and lc. For example I want 30 samples from age:1 and lc:1, 30 samples from age:1 and lc:0 etc.
I did look at random sampling method like;
newdata <- function(subset, age, 30)
But it is not the code that I want.
Upvotes: 40
Views: 100425
Reputation: 1
I have used the following approach. Using sampling you can get the index for which set the sample is in. If you want to stratify this sampling, it is not possible using sampling() in R. Therefore I created a function, using the splitTools package :
library("splitTools")
stratified_sampling <- function(y_values, k_folds, set_seed = 54321) {
index_array <- rep(0,length(y_values))
CVfolds <- create_folds(y_values, k = k_folds, type = "stratified", seed=set_seed)
for(i in 1:k_fold) {
index_array[!c(1:length(y_values)) %in% CVfolds[[i]]] <- i
}
return(index_array)
}
Upvotes: 0
Reputation: 1253
A one-liner using my function fslice_sample()
. .
The syntax is tidy-based but the grouping is done using a mix of collapse
and data.table
.
It's also a bit more flexible than dplyr::slice_sample()
.
# remotes::install_github("NicChr/timeplyr")
library(timeplyr)
fslice_sample(d, n = 30, .by = c(age, lc), seed = 1)
#> # A tibble: 300 x 3
#> age lc ants
#> * <int> <int> <int>
#> 1 1 0 1
#> 2 1 0 1
#> 3 1 0 0
#> 4 1 0 0
#> 5 1 0 0
#> 6 1 0 1
#> 7 1 0 1
#> 8 1 0 0
#> 9 1 0 0
#> 10 1 0 0
#> # ... with 290 more rows
Created on 2023-04-18 with reprex v2.0.2
Upvotes: 0
Reputation: 4910
This is ridiculously easy to do with base R.
Step 1: Create a stratum indicator using the interaction
function.
Step 2: Use tapply
on a sequence of row indicators to identify the indices of the random sample.
Step 3: Subset the data with those indices
Using the data example from @Thomas:
set.seed(1)
n <- 1e4
d <- data.frame(age = sample(1:5,n,TRUE),
lc = rbinom(n,1,.5),
ants = rbinom(n,1,.7))
## stratum indicator
d$group <- interaction(d[, c('age', 'lc')])
## sample selection
indices <- tapply(1:nrow(d), d$group, sample, 30)
## obtain subsample
subsampd <- d[unlist(indices, use.names = FALSE), ]
Verify appropriate stratification
> table(subsampd$group)
1.0 2.0 3.0 4.0 5.0 1.1 2.1 3.1 4.1 5.1
30 30 30 30 30 30 30 30 30 30
Upvotes: 3
Reputation: 1252
Here is an updated dplyr
version for stratified sampling when you need different numbers of samples from each group (i.e. 1:5 ratio or something in my case, but you can specify the n for each group combination).
set.seed(1)
n <- 1e4
d <- tibble::tibble(age = sample(1:5, n, T),
lc = rbinom(n, 1 , .5),
ants = rbinom(n, 1, .7))
> d
# A tibble: 10,000 x 3
age lc ants
<int> <int> <int>
1 2 0 1
2 2 1 1
3 3 1 1
4 5 0 1
5 2 0 1
6 5 0 1
7 5 1 1
8 4 1 1
9 4 1 1
10 1 0 1
# … with 9,990 more rows
there are 10 unique combos of age/lc:
> d %>% group_by(age, lc) %>% nest()
# A tibble: 10 x 3
# Groups: age, lc [10]
age lc data
<int> <int> <list>
1 2 0 <tibble [993 × 1]>
2 2 1 <tibble [1,026 × 1]>
3 3 1 <tibble [982 × 1]>
4 5 0 <tibble [1,012 × 1]>
5 5 1 <tibble [1,056 × 1]>
6 4 1 <tibble [940 × 1]>
7 1 0 <tibble [1,010 × 1]>
8 1 1 <tibble [1,002 × 1]>
9 4 0 <tibble [958 × 1]>
10 3 0 <tibble [1,021 × 1]>
We can sample a prespecified number of rows from each group of age/lc combinations:
> d %>%
group_by(age, lc) %>%
nest() %>%
ungroup() %>%
# you must supply `n` for each combination of groups in `group_by(age, lc)`
mutate(n = c(1, 1, 1, 2, 3, 1, 2, 3, 1, 1)) %>%
mutate(samp = purrr::map2(.x = data, .y= n,
.f = function(.x, .y) slice_sample(.data = .x, n = .y))) %>%
select(-data, -n) %>%
unnest(samp)
# A tibble: 16 x 3
age lc ants
<int> <int> <int>
1 2 0 0
2 2 1 1
3 3 1 1
4 5 0 0
5 5 0 1
6 5 1 1
7 5 1 1
8 5 1 1
9 4 1 1
10 1 0 1
11 1 0 1
12 1 1 1
13 1 1 1
14 1 1 0
15 4 0 1
16 3 0 1
Upvotes: 1
Reputation: 891
Here's a one-liner using data.table
:
set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T),
lc = rbinom(n, 1, .5),
ants = rbinom(n, 1, .7))
out <- d[, .SD[sample(1:.N, 30)], by=.(age, lc)]
# Check
out[, table(age, lc)]
## lc
## age 0 1
## 1 30 30
## 2 30 30
## 3 30 30
## 4 30 30
## 5 30 30
Upvotes: 8
Reputation: 5536
See the function strata
from the package sampling. The function selects stratified simple random sampling and gives a sample as a result. Extra two columns are added - inclusion probabilities (Prob
) and strata indicator (Stratum
). See the example.
require(data.table)
require(sampling)
set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T),
lc = rbinom(n, 1 , .5),
ants = rbinom(n, 1, .7))
# Sort
setkey(d, age, lc)
# Population size by strata
d[, .N, keyby = list(age, lc)]
# age lc N
# 1: 1 0 1010
# 2: 1 1 1002
# 3: 2 0 993
# 4: 2 1 1026
# 5: 3 0 1021
# 6: 3 1 982
# 7: 4 0 958
# 8: 4 1 940
# 9: 5 0 1012
# 10: 5 1 1056
# Select sample
set.seed(2)
s <- data.table(strata(d, c("age", "lc"), rep(30, 10), "srswor"))
# Sample size by strata
s[, .N, keyby = list(age, lc)]
# age lc N
# 1: 1 0 30
# 2: 1 1 30
# 3: 2 0 30
# 4: 2 1 30
# 5: 3 0 30
# 6: 3 1 30
# 7: 4 0 30
# 8: 4 1 30
# 9: 5 0 30
# 10: 5 1 30
Upvotes: 17
Reputation: 193517
I would suggest using either stratified
from my "splitstackshape" package, or sample_n
from the "dplyr" package:
## Sample data
set.seed(1)
n <- 1e4
d <- data.table(age = sample(1:5, n, T),
lc = rbinom(n, 1 , .5),
ants = rbinom(n, 1, .7))
# table(d$age, d$lc)
For stratified
, you basically specify the dataset, the stratifying columns, and an integer representing the size you want from each group OR a decimal representing the fraction you want returned (for example, .1 represents 10% from each group).
library(splitstackshape)
set.seed(1)
out <- stratified(d, c("age", "lc"), 30)
head(out)
# age lc ants
# 1: 1 0 1
# 2: 1 0 0
# 3: 1 0 1
# 4: 1 0 1
# 5: 1 0 0
# 6: 1 0 1
table(out$age, out$lc)
#
# 0 1
# 1 30 30
# 2 30 30
# 3 30 30
# 4 30 30
# 5 30 30
For sample_n
you first create a grouped table (using group_by
) and then specify the number of observations you want. If you wanted proportional sampling instead, you should use sample_frac
.
library(dplyr)
set.seed(1)
out2 <- d %>%
group_by(age, lc) %>%
sample_n(30)
# table(out2$age, out2$lc)
Upvotes: 56
Reputation: 44525
Here's some data:
set.seed(1)
n <- 1e4
d <- data.frame(age = sample(1:5,n,TRUE),
lc = rbinom(n,1,.5),
ants = rbinom(n,1,.7))
You want a split-apply-combine strategy, where you split
your data.frame (d
in this example), sample rows/observations from each subsample, and then combine then back together with rbind
. Here's how it works:
sp <- split(d, list(d$age, d$lc))
samples <- lapply(sp, function(x) x[sample(1:nrow(x), 30, FALSE),])
out <- do.call(rbind, samples)
The result:
> str(out)
'data.frame': 300 obs. of 3 variables:
$ age : int 1 1 1 1 1 1 1 1 1 1 ...
$ lc : int 0 0 0 0 0 0 0 0 0 0 ...
$ ants: int 1 1 0 1 1 1 1 1 1 1 ...
> head(out)
age lc ants
1.0.2242 1 0 1
1.0.4417 1 0 1
1.0.389 1 0 0
1.0.4578 1 0 1
1.0.8170 1 0 1
1.0.5606 1 0 1
Upvotes: 17