Wandering_geek
Wandering_geek

Reputation: 337

Facetted area plot of cumulative proportion with trend lines and Pearson's correlation coefficient?

I want to create a facetted area plot showing the cumulative proportion of each category, with a trend line and a Pearson's correlation coefficient.

I've so far used this code (exemplified with the iris data set):

    # Create data set ====
ciformat <- function(val) { paste0(sprintf("%.2f", val)) }
pformat_P <- function(val) { paste0("P ", ifelse(val < 0.001, "<.001", paste0("=", sub("^(-?)0.", "\\1.", sprintf("%.3f", val))))) }

figure_data <- iris[,c("Petal.Width", "Species")]
figure_data <- table(figure_data)
figure_data <- as.data.frame(figure_data)
figure_data <- figure_data %>% group_by(Species) %>% mutate(Proportion = Freq / sum(Freq)) %>% ungroup() %>% as.data.frame()
figure_data$Species <- factor(figure_data$Species, levels = c("setosa", "versicolor", "virginica"))
as.data.frame(figure_data)
figure_data$Species_f = factor(figure_data$Species, levels=c("setosa", "versicolor", "virginica"))

# Create correlation values ====
figure_setosa <- figure_data[(figure_data$Species == "setosa"),c("Species", "Petal.Width", "Proportion")]
figure_setosa$Petal.Width <- as.numeric(figure_setosa$Petal.Width)
figure_setosa$Proportion <- as.numeric(figure_setosa$Proportion)
figure_setosa_r <- paste0("r = ", ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$estimate))
figure_setosa_ci1 <- ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$conf.int[1])
figure_setosa_ci2 <- ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$conf.int[2])
figure_setosa_p <- pformat_P(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$p.value)
figure_setosa_ci <- paste0("(95% CI ", figure_setosa_ci1, " to ", figure_setosa_ci2, ")")

figure_versicolor <- figure_data[(figure_data$Species == "versicolor"),c("Species", "Petal.Width", "Proportion")]
figure_versicolor$Petal.Width <- as.numeric(figure_versicolor$Petal.Width)
figure_versicolor$Proportion <- as.numeric(figure_versicolor$Proportion)
figure_versicolor_r <- paste0("r = ", ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$estimate))
figure_versicolor_ci1 <- ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$conf.int[1])
figure_versicolor_ci2 <- ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$conf.int[2])
figure_versicolor_p <- pformat_P(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$p.value)
figure_versicolor_ci <- paste0("(95% CI ", figure_versicolor_ci1, " to ", figure_versicolor_ci2, ")")

figure_virginica <- figure_data[(figure_data$Species == "virginica"),c("Species", "Petal.Width", "Proportion")]
figure_virginica$Petal.Width <- as.numeric(figure_virginica$Petal.Width)
figure_virginica$Proportion <- as.numeric(figure_virginica$Proportion)
figure_virginica_r <- paste0("r = ", ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$estimate))
figure_virginica_ci1 <- ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$conf.int[1])
figure_virginica_ci2 <- ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$conf.int[2])
figure_virginica_p <- pformat_P(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$p.value)
figure_virginica_ci <- paste0("(95% CI ", figure_virginica_ci1, " to ", figure_virginica_ci2, ")")

# Create figure ====
figure_data %>% mutate(Petal.Width = as.numeric(Petal.Width), decimals=1) %>%
 mutate(Petal.Width = Petal.Width) %>% mutate(Proportion = as.numeric(Proportion)) %>%
  ggplot(aes(x = Petal.Width, y = Proportion, fill = Species)) +
  geom_area(position = "identity", color = "black", size = 0, alpha = 0.5) +
  geom_smooth(aes(x=Petal.Width, y=Proportion, fill = Species), method=lm, 
              se=T, color="black", size=0.7, alpha=0.7) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Petal.Width", y="Proportion (%)") +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_r), paste0(figure_versicolor_r), paste0(figure_virginica_r)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.45, .45, .45))), mapping = aes(x = x, y = y, label = label)) +
  geom_text(data = (data.frame(label = c(paste0(figure_setosa_ci), paste0(figure_versicolor_ci), paste0(figure_virginica_ci)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.40, .40, .40))), mapping = aes(x = x, y = y, label = label)) +
    geom_text(data = (data.frame(label = c(paste0(figure_setosa_p), paste0(figure_versicolor_p), paste0(figure_virginica_p)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.35, .35, .35))), mapping = aes(x = x, y = y, label = label)) +
  coord_cartesian(ylim = c(0, 1), clip = "on", expand = F) +
  facet_grid(~fct_relevel(Species,"setosa", "versicolor", "virginica"))

However, I see that the Pearson's correlation coefficients are not correct here, as they use the proportion instead of the original iris$Petal.Width vs iris$Species.

How can I correct this so that the trend line and the Pearson's r is correct?

Addition 230410: Thank you @TarJae. So what I want to do is that the trend line and the correlation coefficient is correct. But when I do it from the original data, without the proportions data:

iris$Species = ifelse(iris$Species == "setosa", 1, 0)
cor.test(x=iris$Petal.Width, y=iris$Species, method = "pearson")

Then I get -0.89 (-0.91 to -0.85), P <.001, for setosa for example. While in my example with the proportions I get -0.52 (-0.77 to -0.13), P = .013, as in your example above.

The true correlation between pedal width and setosa must be the -0.89 rather than -0.52, right?

Kind regards

Upvotes: 0

Views: 37

Answers (1)

TarJae
TarJae

Reputation: 79311

I am not sure what you are after finally, but here is a try to make your code shorter:

library(tidyverse)

# function for correlation info
get_correlation_info <- function(species) {
  
  figure_data <- iris %>% 
    dplyr::select(Petal.Width, Species) %>% 
    table() %>% 
    as.data.frame() %>% 
    mutate(Proportion = Freq / sum(Freq), .by=Species) %>% 
    mutate(Species = factor(Species, levels = c("setosa", "versicolor", "virginica")),
           Species_f = factor(Species, levels = c("setosa", "versicolor", "virginica")))
  
  data <- figure_data[(figure_data$Species == species), c("Species", "Petal.Width", "Proportion")]
  data$Petal.Width <- as.numeric(data$Petal.Width)
  data$Proportion <- as.numeric(data$Proportion)
  
  r <- paste0("r = ", ciformat(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$estimate))
  ci1 <- ciformat(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$conf.int[1])
  ci2 <- ciformat(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$conf.int[2])
  p <- pformat_P(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$p.value)
  ci <- paste0("(95% CI ", ci1, " to ", ci2, ")")
  
  return(list(r=r, p=p, ci=ci))
}
  
# call each correlation info
  setosa_info <- get_correlation_info("setosa")
  versicolor_info <- get_correlation_info("versicolor")
  virginica_info <- get_correlation_info("virginica")

# plot the figure  
figure_data %>% mutate(Petal.Width = as.numeric(Petal.Width), decimals=1) %>%
    mutate(Petal.Width = Petal.Width) %>% mutate(Proportion = as.numeric(Proportion)) %>%
    ggplot(aes(x = Petal.Width, y = Proportion, fill = Species)) +
    geom_area(position = "identity", color = "black", size = 0, alpha = 0.5) +
    geom_smooth(aes(x=Petal.Width, y=Proportion, fill = Species), method=lm, 
                se=T, color="black", size=0.7, alpha=0.7) +
    scale_y_continuous(labels = scales::percent) +
    labs(x = "Petal.Width", y="Proportion (%)") +
    geom_text(data = (data.frame(label = c(paste0(setosa_info$r), paste0(versicolor_info$r), paste0(virginica_info$r)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.45, .45, .45))), mapping = aes(x = x, y = y, label = label)) +
    geom_text(data = (data.frame(label = c(paste0(setosa_info$ci), paste0(versicolor_info$ci), paste0(virginica_info$ci)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.40, .40, .40))), mapping = aes(x = x, y = y, label = label)) +
    geom_text(data = (data.frame(label = c(paste0(setosa_info$p), paste0(versicolor_info$p), paste0(virginica_info$p)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.35, .35, .35))), mapping = aes(x = x, y = y, label = label)) +
    coord_cartesian(ylim = c(0, 1), clip = "on", expand = F) +
    facet_grid(~fct_relevel(Species,"setosa", "versicolor", "virginica"))

enter image description here

Upvotes: 1

Related Questions