stats_noob
stats_noob

Reputation: 5897

R: Calculating Quantiles with (group_by .add = TRUE)

I am working with the R programming language.

I have the following dataset:

set.seed(123)
library(dplyr)

# Simulated patient table ----
# NOTE: the random draws below must stay in this order; under a fixed seed,
# reordering them would change every simulated value.

n_patients <- 5000

Patient_ID <- seq_len(n_patients)

# Demographics: categorical draws with unequal class probabilities.
gender <- sample(c("Male", "Female"), size = n_patients, replace = TRUE,
                 prob = c(0.45, 0.55))
Gender <- as.factor(gender)

status <- sample(c("Immigrant", "Citizen"), size = n_patients, replace = TRUE,
                 prob = c(0.3, 0.7))
Status <- as.factor(status)

# Continuous measurements and a visit count drawn uniformly from 1..20.
Height <- rnorm(n_patients, mean = 150, sd = 10)
Weight <- rnorm(n_patients, mean = 90, sd = 10)
Hospital_Visits <- sample.int(20, size = n_patients, replace = TRUE)

# Outcome ----
disease <- sample(c("Yes", "No"), size = n_patients, replace = TRUE,
                  prob = c(0.4, 0.6))
Disease <- as.factor(disease)

# Assemble ----
# One row per patient; column names are taken from the vector names.
my_data <- data.frame(
  Patient_ID,
  Gender,
  Status,
  Height,
  Weight,
  Hospital_Visits,
  Disease
)

  Patient_ID Gender    Status   Height    Weight Hospital_Visits Disease
1          1 Female   Citizen 145.0583 113.70725               1      No
2          2   Male Immigrant 161.2759  88.33188              18      No
3          3 Female Immigrant 138.5305  99.26961               6     Yes
4          4   Male   Citizen 164.8102  84.31848              12      No
5          5   Male   Citizen 159.1619  92.25090              12     Yes
6          6 Female   Citizen 153.3513 101.31986              11     Yes

In a previous question (R: Calculating Proportions Based on Nested Groups), I learned how to calculate "nested proportions" based on ntiles (e.g. calculate 3 ntiles for one variable, group by these 3 ntiles, then calculate 3 ntiles for the second variable within each of those groups, etc.):

# e.g. using 3 ntiles

# Nested ntile summary of disease prevalence.
#
# Height is split into 3 ntiles within each Gender/Status group, Weight into
# 3 ntiles within each Height ntile, and Hospital_Visits into 3 ntiles within
# each Weight ntile.
#
# Each *_range label is computed only AFTER its ntile has been added to the
# grouping, so it reports the min-max of that ntile itself. Computing it in the
# same mutate() that creates the ntile would instead give the min-max of the
# parent group, i.e. the same label for all three ntiles.
#
# Result: one row per nested cell, with the share of rows where
# Disease == "Yes" (percent_disease) and the number of rows (count).
my_data %>%
  group_by(Gender, Status) %>%
  mutate(Height_ntile = ntile(Height, 3)) %>%
  group_by(Height_ntile, .add = TRUE) %>%
  mutate(
    Height_range = paste(min(Height), max(Height), sep = "-"),
    Weight_ntile = ntile(Weight, 3)
  ) %>%
  # A range label is constant within its ntile, so using it as a grouping key
  # only carries the label through to the summary; it never splits a group.
  group_by(Height_range, Weight_ntile, .add = TRUE) %>%
  mutate(
    Weight_range = paste(min(Weight), max(Weight), sep = "-"),
    Hospital_Visits_ntile = ntile(Hospital_Visits, 3)
  ) %>%
  group_by(Weight_range, Hospital_Visits_ntile, .add = TRUE) %>%
  mutate(
    Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")
  ) %>%
  group_by(Hospital_range, .add = TRUE) %>%
  summarize(percent_disease = mean(Disease == "Yes"),
            count = n(),
            .groups = "drop")

Now, I am trying to repeat this exact same procedure, but using cut points taken from "quantiles" instead of ntiles:

I tried to modify the above code - here is my attempt:

# Bin `x` into three right-closed intervals whose inner edges are the 33rd and
# 67th percentiles of `x`; the outer edges are left open-ended (-Inf / Inf),
# which is why the resulting factor labels contain "-Inf" and "Inf".
split_at_quantiles <- function(x) {
  inner_edges <- quantile(x, c(0.33, 0.67))
  cut(x, breaks = c(-Inf, inner_edges, Inf))
}

# NOTE: each *_range is computed in the same mutate() as its bin, i.e. before
# the bin is part of the grouping, so it spans the whole parent group rather
# than the individual bin.

# Level 1: Height bins within each Gender/Status group.
by_height <- my_data %>%
  group_by(Gender, Status) %>%
  mutate(
    Height_group = split_at_quantiles(Height),
    Height_range = paste(min(Height), max(Height), sep = "-")
  ) %>%
  group_by(Height_group, Height_range, .add = TRUE)

# Level 2: Weight bins within each Height bin.
by_weight <- by_height %>%
  mutate(
    Weight_group = split_at_quantiles(Weight),
    Weight_range = paste(min(Weight), max(Weight), sep = "-")
  ) %>%
  group_by(Weight_group, Weight_range, .add = TRUE)

# Level 3: Hospital_Visits bins within each Weight bin.
by_visits <- by_weight %>%
  mutate(
    Hospital_Visits_group = split_at_quantiles(Hospital_Visits),
    Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")
  ) %>%
  group_by(Hospital_Visits_group, Hospital_range, .add = TRUE)

# One row per nested cell: share of Disease == "Yes" and cell size.
by_visits %>%
  summarise(
    percent_disease = mean(Disease == "Yes"),
    count = n(),
    .groups = "drop"
  )

This code runs, but I am not sure whether I have done this correctly (e.g. the "-Inf" and "Inf" values that appear in the group labels):

 A tibble: 108 x 10
   Gender Status  Height_~1 Heigh~2 Weigh~3 Weigh~4 Hospi~5 Hospi~6 perce~7
   <fct>  <fct>   <fct>     <chr>   <fct>   <chr>   <fct>   <chr>     <dbl>
 1 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (-Inf,~ 1-20      0.314
 2 Female Citizen (-Inf,14~ 115.86~ (-Inf,~ 58.991~ (7,14]  1-20      0.458

Can someone please show me if I have done this correctly?

Thanks!

Upvotes: 0

Views: 87

Answers (1)

stats_noob
stats_noob

Reputation: 5897

Answer based on insights provided by @akrun:

# Nested quantile-based summary of disease prevalence.
#
# At each level the variable is cut at its 33rd and 67th percentiles into
# three right-closed bins. The outer breaks are -Inf / Inf so that every value
# falls into a bin, and as.integer() replaces the interval labels
# (e.g. "(-Inf,145]") with the plain bin codes 1, 2, 3.
#
# Height is binned within each Gender/Status group, Weight within each Height
# bin, and Hospital_Visits within each Weight bin.
#
# Each *_range label is computed only AFTER its bin has been added to the
# grouping, so it reports the min-max of that bin itself. Computing it in the
# same mutate() that creates the bin would instead give the min-max of the
# parent group, i.e. the same label for all three bins.
#
# Result: one row per nested cell, with the share of rows where
# Disease == "Yes" (percent_disease) and the number of rows (count).
my_data %>%
  group_by(Gender, Status) %>%
  mutate(
    Height_group = as.integer(cut(
      Height,
      breaks = c(-Inf, quantile(Height, c(0.33, 0.67)), Inf)
    ))
  ) %>%
  group_by(Height_group, .add = TRUE) %>%
  mutate(
    Height_range = paste(min(Height), max(Height), sep = "-"),
    Weight_group = as.integer(cut(
      Weight,
      breaks = c(-Inf, quantile(Weight, c(0.33, 0.67)), Inf)
    ))
  ) %>%
  # A range label is constant within its bin, so using it as a grouping key
  # only carries the label through to the summary; it never splits a group.
  group_by(Height_range, Weight_group, .add = TRUE) %>%
  mutate(
    Weight_range = paste(min(Weight), max(Weight), sep = "-"),
    Hospital_Visits_group = as.integer(cut(
      Hospital_Visits,
      breaks = c(-Inf, quantile(Hospital_Visits, c(0.33, 0.67)), Inf)
    ))
  ) %>%
  group_by(Weight_range, Hospital_Visits_group, .add = TRUE) %>%
  mutate(
    Hospital_range = paste(min(Hospital_Visits), max(Hospital_Visits), sep = "-")
  ) %>%
  group_by(Hospital_range, .add = TRUE) %>%
  summarize(percent_disease = mean(Disease == "Yes"),
            count = n(),
            .groups = "drop")

Have I understood this correctly?

Upvotes: 1

Related Questions