biscodyl
biscodyl

Reputation: 11

How to do stratified sampling by 2 variables?

My dataset has 5500 observations. I want to sample 1000 observations based on age category and diabetes. There are 5 categories in age:

  1. 31-40
  2. 41-50
  3. 51-60
  4. 61-70
  5. >70

and 2 categories in diabetes

  1. YES
  2. NO

I would like to select 1000 observations based on these proportions:

  1. from age categories: 0.8, 4.8, 12.8, 24.4, 57.2 and
  2. from diabetes frequency: 32.8, 67.2

I have look at previous discussion but unable to do it because of different scenario. Thank you.

Upvotes: 1

Views: 2801

Answers (1)

user974465
user974465

Reputation:

Here's an approach

Generate some fake data

nSamples <- 5500
set.seed(1)
data <- data.frame(age=sample(31:79,nSamples,replace=TRUE),
                   diabetic=sample(c('Yes','No'),nSamples,replace=TRUE))
data$age<-cut(data$age,c(30,40,50,60,70,80))

addmargins(prop.table(table(data)))

The fake data has the following distribution

         diabetic
age               No        Yes        Sum
  (30,40] 0.10145455 0.10563636 0.20709091
  (40,50] 0.10763636 0.10400000 0.21163636
  (50,60] 0.10363636 0.09509091 0.19872727
  (60,70] 0.09818182 0.09109091 0.18927273
  (70,80] 0.09581818 0.09745455 0.19327273
  Sum     0.50672727 0.49327273 1.00000000

Get a distribution representing only the target age probabilities

# Samples based on age probabilities
index_age_1 <- which(data$age==levels(data$age)[1])
index_age_2 <- which(data$age==levels(data$age)[2])
index_age_3 <- which(data$age==levels(data$age)[3])
index_age_4 <- which(data$age==levels(data$age)[4])
index_age_5 <- which(data$age==levels(data$age)[5])

prob_age=c(0.8, 4.8, 12.8, 24.4, 57.2)
prob_age=prob_age/sum(prob_age)
n_age=prob_age*1000
ageSampleIndex <- c(sample(index_age_1,n_age[1]),
               sample(index_age_2,n_age[2]),
               sample(index_age_3,n_age[3]),
               sample(index_age_4,n_age[4]),
               sample(index_age_5,n_age[5]))
addmargins(prop.table(table(data[ageSampleIndex,])))

Check the age distribution

         diabetic
age          No   Yes   Sum
  (30,40] 0.004 0.004 0.008
  (40,50] 0.024 0.024 0.048
  (50,60] 0.067 0.061 0.128
  (60,70] 0.130 0.114 0.244
  (70,80] 0.281 0.291 0.572
  Sum     0.506 0.494 1.000

Get the diabetic only distribution

# Samples based on diabetic probabilities
index_diabetic_1 <- which(data$diabetic==levels(data$diabetic)[1])
index_diabetic_2 <- which(data$diabetic==levels(data$diabetic)[2])

prob_diabetic=c(32.8, 67.2)
prob_diabetic=prob_diabetic/sum(prob_diabetic)
n_diabetic=prob_diabetic*1000
diabeticSampleIndex <- c(sample(index_diabetic_1,n_diabetic[1]),
                    sample(index_diabetic_2,n_diabetic[2]))
addmargins(prop.table(table(data[diabeticSampleIndex,])))

Check the diabetic only distribution

         diabetic
age               No        Yes        Sum
  (30,40] 0.06306306 0.15215215 0.21521522
  (40,50] 0.07407407 0.14914915 0.22322322
  (50,60] 0.07407407 0.13013013 0.20420420
  (60,70] 0.05905906 0.12112112 0.18018018
  (70,80] 0.05705706 0.12012012 0.17717718
  Sum     0.32732733 0.67267267 1.00000000

Get the age and diabetic distribution

# Samples based on age and diabetic probabilities

index_age_1_diabetic_1 <- which(data$age==levels(data$age)[1] &
                                  data$diabetic==levels(data$diabetic)[1])
index_age_1_diabetic_2 <- which(data$age==levels(data$age)[1] &
                                  data$diabetic==levels(data$diabetic)[2])
index_age_2_diabetic_1 <- which(data$age==levels(data$age)[2] & 
                                  data$diabetic==levels(data$diabetic)[1])
index_age_2_diabetic_2 <- which(data$age==levels(data$age)[2] & 
                                  data$diabetic==levels(data$diabetic)[2])
index_age_3_diabetic_1 <- which(data$age==levels(data$age)[3] & 
                                  data$diabetic==levels(data$diabetic)[1])
index_age_3_diabetic_2 <- which(data$age==levels(data$age)[3] & 
                                  data$diabetic==levels(data$diabetic)[2])
index_age_4_diabetic_1 <- which(data$age==levels(data$age)[4] & 
                                  data$diabetic==levels(data$diabetic)[1])
index_age_4_diabetic_2 <- which(data$age==levels(data$age)[4] & 
                                  data$diabetic==levels(data$diabetic)[2])
index_age_5_diabetic_1 <- which(data$age==levels(data$age)[5] & 
                                  data$diabetic==levels(data$diabetic)[1])
index_age_5_diabetic_2 <- which(data$age==levels(data$age)[5] & 
                                  data$diabetic==levels(data$diabetic)[2])

prob_age_diabetic = prob_age %*% t(prob_diabetic)
n_prob_age_diabetic = prob_age_diabetic * 1000
ageDiabeticSampleIndex <- c(sample(index_age_1_diabetic_1,n_prob_age_diabetic[1,1]),
                            sample(index_age_1_diabetic_2,n_prob_age_diabetic[1,2]),
                            sample(index_age_2_diabetic_1,n_prob_age_diabetic[2,1]),
                            sample(index_age_2_diabetic_2,n_prob_age_diabetic[2,2]),
                            sample(index_age_3_diabetic_1,n_prob_age_diabetic[3,1]),
                            sample(index_age_3_diabetic_2,n_prob_age_diabetic[3,2]),
                            sample(index_age_4_diabetic_1,n_prob_age_diabetic[4,1]),
                            sample(index_age_4_diabetic_2,n_prob_age_diabetic[4,2]),
                            sample(index_age_5_diabetic_1,n_prob_age_diabetic[5,1]),
                            sample(index_age_5_diabetic_2,n_prob_age_diabetic[5,2]))

addmargins(prop.table(table(data[ageDiabeticSampleIndex,])))

Check the age and diabetic distribution

         diabetic
age                No         Yes         Sum
  (30,40] 0.002010050 0.005025126 0.007035176
  (40,50] 0.015075377 0.032160804 0.047236181
  (50,60] 0.041206030 0.086432161 0.127638191
  (60,70] 0.080402010 0.163819095 0.244221106
  (70,80] 0.187939698 0.385929648 0.573869347
  Sum     0.326633166 0.673366834 1.000000000

Upvotes: 3

Related Questions