Reputation: 11
My dataset has 5500 observations. I want to sample 1000 observations based on age category and diabetes. There are 5 categories in age:
and 2 categories in diabetes
I would like to select 1000 observations based on these proportions:
I have look at previous discussion but unable to do it because of different scenario. Thank you.
Upvotes: 1
Views: 2801
Reputation:
Here's an approach
Generate some fake data
nSamples <- 5500
set.seed(1)
data <- data.frame(age=sample(31:79,nSamples,replace=TRUE),
diabetic=sample(c('Yes','No'),nSamples,replace=TRUE))
data$age<-cut(data$age,c(30,40,50,60,70,80))
addmargins(prop.table(table(data)))
The fake data has the following distribution
diabetic
age No Yes Sum
(30,40] 0.10145455 0.10563636 0.20709091
(40,50] 0.10763636 0.10400000 0.21163636
(50,60] 0.10363636 0.09509091 0.19872727
(60,70] 0.09818182 0.09109091 0.18927273
(70,80] 0.09581818 0.09745455 0.19327273
Sum 0.50672727 0.49327273 1.00000000
Get a distribution representing only the target age probabilities
# Samples based on age probabilities
index_age_1 <- which(data$age==levels(data$age)[1])
index_age_2 <- which(data$age==levels(data$age)[2])
index_age_3 <- which(data$age==levels(data$age)[3])
index_age_4 <- which(data$age==levels(data$age)[4])
index_age_5 <- which(data$age==levels(data$age)[5])
prob_age=c(0.8, 4.8, 12.8, 24.4, 57.2)
prob_age=prob_age/sum(prob_age)
n_age=prob_age*1000
ageSampleIndex <- c(sample(index_age_1,n_age[1]),
sample(index_age_2,n_age[2]),
sample(index_age_3,n_age[3]),
sample(index_age_4,n_age[4]),
sample(index_age_5,n_age[5]))
addmargins(prop.table(table(data[ageSampleIndex,])))
Check the age distribution
diabetic
age No Yes Sum
(30,40] 0.004 0.004 0.008
(40,50] 0.024 0.024 0.048
(50,60] 0.067 0.061 0.128
(60,70] 0.130 0.114 0.244
(70,80] 0.281 0.291 0.572
Sum 0.506 0.494 1.000
Get the diabetic only distribution
# Samples based on diabetic probabilities
index_diabetic_1 <- which(data$diabetic==levels(data$diabetic)[1])
index_diabetic_2 <- which(data$diabetic==levels(data$diabetic)[2])
prob_diabetic=c(32.8, 67.2)
prob_diabetic=prob_diabetic/sum(prob_diabetic)
n_diabetic=prob_diabetic*1000
diabeticSampleIndex <- c(sample(index_diabetic_1,n_diabetic[1]),
sample(index_diabetic_2,n_diabetic[2]))
addmargins(prop.table(table(data[diabeticSampleIndex,])))
Check the diabetic only distribution
diabetic
age No Yes Sum
(30,40] 0.06306306 0.15215215 0.21521522
(40,50] 0.07407407 0.14914915 0.22322322
(50,60] 0.07407407 0.13013013 0.20420420
(60,70] 0.05905906 0.12112112 0.18018018
(70,80] 0.05705706 0.12012012 0.17717718
Sum 0.32732733 0.67267267 1.00000000
Get the age and diabetic distribution
# Samples based on age and diabetic probabilities
index_age_1_diabetic_1 <- which(data$age==levels(data$age)[1] &
data$diabetic==levels(data$diabetic)[1])
index_age_1_diabetic_2 <- which(data$age==levels(data$age)[1] &
data$diabetic==levels(data$diabetic)[2])
index_age_2_diabetic_1 <- which(data$age==levels(data$age)[2] &
data$diabetic==levels(data$diabetic)[1])
index_age_2_diabetic_2 <- which(data$age==levels(data$age)[2] &
data$diabetic==levels(data$diabetic)[2])
index_age_3_diabetic_1 <- which(data$age==levels(data$age)[3] &
data$diabetic==levels(data$diabetic)[1])
index_age_3_diabetic_2 <- which(data$age==levels(data$age)[3] &
data$diabetic==levels(data$diabetic)[2])
index_age_4_diabetic_1 <- which(data$age==levels(data$age)[4] &
data$diabetic==levels(data$diabetic)[1])
index_age_4_diabetic_2 <- which(data$age==levels(data$age)[4] &
data$diabetic==levels(data$diabetic)[2])
index_age_5_diabetic_1 <- which(data$age==levels(data$age)[5] &
data$diabetic==levels(data$diabetic)[1])
index_age_5_diabetic_2 <- which(data$age==levels(data$age)[5] &
data$diabetic==levels(data$diabetic)[2])
prob_age_diabetic = prob_age %*% t(prob_diabetic)
n_prob_age_diabetic = prob_age_diabetic * 1000
ageDiabeticSampleIndex <- c(sample(index_age_1_diabetic_1,n_prob_age_diabetic[1,1]),
sample(index_age_1_diabetic_2,n_prob_age_diabetic[1,2]),
sample(index_age_2_diabetic_1,n_prob_age_diabetic[2,1]),
sample(index_age_2_diabetic_2,n_prob_age_diabetic[2,2]),
sample(index_age_3_diabetic_1,n_prob_age_diabetic[3,1]),
sample(index_age_3_diabetic_2,n_prob_age_diabetic[3,2]),
sample(index_age_4_diabetic_1,n_prob_age_diabetic[4,1]),
sample(index_age_4_diabetic_2,n_prob_age_diabetic[4,2]),
sample(index_age_5_diabetic_1,n_prob_age_diabetic[5,1]),
sample(index_age_5_diabetic_2,n_prob_age_diabetic[5,2]))
addmargins(prop.table(table(data[ageDiabeticSampleIndex,])))
Check the age and diabetic distribution
diabetic
age No Yes Sum
(30,40] 0.002010050 0.005025126 0.007035176
(40,50] 0.015075377 0.032160804 0.047236181
(50,60] 0.041206030 0.086432161 0.127638191
(60,70] 0.080402010 0.163819095 0.244221106
(70,80] 0.187939698 0.385929648 0.573869347
Sum 0.326633166 0.673366834 1.000000000
Upvotes: 3