Reputation: 337
I want to create a facetted area plot showing the cumulative proportion of each category, with a trend line and a Pearson's correlation coefficient.
I've so far used this code (exemplified with the iris data set):
# Create data set ====
ciformat <- function(val) { paste0(sprintf("%.2f", val)) }
pformat_P <- function(val) { paste0("P ", ifelse(val < 0.001, "<.001", paste0("=", sub("^(-?)0.", "\\1.", sprintf("%.3f", val))))) }
figure_data <- iris[,c("Petal.Width", "Species")]
figure_data <- table(figure_data)
figure_data <- as.data.frame(figure_data)
figure_data <- figure_data %>% group_by(Species) %>% mutate(Proportion = Freq / sum(Freq)) %>% ungroup() %>% as.data.frame()
figure_data$Species <- factor(figure_data$Species, levels = c("setosa", "versicolor", "virginica"))
as.data.frame(figure_data)
figure_data$Species_f = factor(figure_data$Species, levels=c("setosa", "versicolor", "virginica"))
# Create correlation values ====
figure_setosa <- figure_data[(figure_data$Species == "setosa"),c("Species", "Petal.Width", "Proportion")]
figure_setosa$Petal.Width <- as.numeric(figure_setosa$Petal.Width)
figure_setosa$Proportion <- as.numeric(figure_setosa$Proportion)
figure_setosa_r <- paste0("r = ", ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$estimate))
figure_setosa_ci1 <- ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$conf.int[1])
figure_setosa_ci2 <- ciformat(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$conf.int[2])
figure_setosa_p <- pformat_P(cor.test(x=figure_setosa$Petal.Width, y=figure_setosa$Proportion, method = "pearson")$p.value)
figure_setosa_ci <- paste0("(95% CI ", figure_setosa_ci1, " to ", figure_setosa_ci2, ")")
figure_versicolor <- figure_data[(figure_data$Species == "versicolor"),c("Species", "Petal.Width", "Proportion")]
figure_versicolor$Petal.Width <- as.numeric(figure_versicolor$Petal.Width)
figure_versicolor$Proportion <- as.numeric(figure_versicolor$Proportion)
figure_versicolor_r <- paste0("r = ", ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$estimate))
figure_versicolor_ci1 <- ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$conf.int[1])
figure_versicolor_ci2 <- ciformat(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$conf.int[2])
figure_versicolor_p <- pformat_P(cor.test(x=figure_versicolor$Petal.Width, y=figure_versicolor$Proportion, method = "pearson")$p.value)
figure_versicolor_ci <- paste0("(95% CI ", figure_versicolor_ci1, " to ", figure_versicolor_ci2, ")")
figure_virginica <- figure_data[(figure_data$Species == "virginica"),c("Species", "Petal.Width", "Proportion")]
figure_virginica$Petal.Width <- as.numeric(figure_virginica$Petal.Width)
figure_virginica$Proportion <- as.numeric(figure_virginica$Proportion)
figure_virginica_r <- paste0("r = ", ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$estimate))
figure_virginica_ci1 <- ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$conf.int[1])
figure_virginica_ci2 <- ciformat(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$conf.int[2])
figure_virginica_p <- pformat_P(cor.test(x=figure_virginica$Petal.Width, y=figure_virginica$Proportion, method = "pearson")$p.value)
figure_virginica_ci <- paste0("(95% CI ", figure_virginica_ci1, " to ", figure_virginica_ci2, ")")
# Create figure ====
figure_data %>% mutate(Petal.Width = as.numeric(Petal.Width), decimals=1) %>%
mutate(Petal.Width = Petal.Width) %>% mutate(Proportion = as.numeric(Proportion)) %>%
ggplot(aes(x = Petal.Width, y = Proportion, fill = Species)) +
geom_area(position = "identity", color = "black", size = 0, alpha = 0.5) +
geom_smooth(aes(x=Petal.Width, y=Proportion, fill = Species), method=lm,
se=T, color="black", size=0.7, alpha=0.7) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Petal.Width", y="Proportion (%)") +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_r), paste0(figure_versicolor_r), paste0(figure_virginica_r)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.45, .45, .45))), mapping = aes(x = x, y = y, label = label)) +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_ci), paste0(figure_versicolor_ci), paste0(figure_virginica_ci)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.40, .40, .40))), mapping = aes(x = x, y = y, label = label)) +
geom_text(data = (data.frame(label = c(paste0(figure_setosa_p), paste0(figure_versicolor_p), paste0(figure_virginica_p)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.35, .35, .35))), mapping = aes(x = x, y = y, label = label)) +
coord_cartesian(ylim = c(0, 1), clip = "on", expand = F) +
facet_grid(~fct_relevel(Species,"setosa", "versicolor", "virginica"))
However, I see that the Pearson's correlation coefficients are not correct here, as they use the proportion instead of the original iris$Petal.Width vs iris$Species.
How can I correct this so that the trend line and the Pearson's r is correct?
Addition 230410: Thank you @TarJae. So what I want to do is that the trend line and the correlation coefficient is correct. But when I do it from the original data, without the proportions data:
iris$Species = ifelse(iris$Species == "setosa", 1, 0)
cor.test(x=iris$Petal.Width, y=iris$Species, method = "pearson")
Then I get -0.89 (-0.91 to -0.85), P <.001, for setosa for example. While in my example with the proportions I get -0.52 (-0.77 to -0.13), P = .013, as in your example above.
The true correlation between pedal width and setosa must be the -0.89 rather than -0.52, right?
Kind regards
Upvotes: 0
Views: 37
Reputation: 79311
I am not sure what you are after finally, but here is a try to make your code shorter:
library(tidyverse)
# function for correlation info
get_correlation_info <- function(species) {
figure_data <- iris %>%
dplyr::select(Petal.Width, Species) %>%
table() %>%
as.data.frame() %>%
mutate(Proportion = Freq / sum(Freq), .by=Species) %>%
mutate(Species = factor(Species, levels = c("setosa", "versicolor", "virginica")),
Species_f = factor(Species, levels = c("setosa", "versicolor", "virginica")))
data <- figure_data[(figure_data$Species == species), c("Species", "Petal.Width", "Proportion")]
data$Petal.Width <- as.numeric(data$Petal.Width)
data$Proportion <- as.numeric(data$Proportion)
r <- paste0("r = ", ciformat(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$estimate))
ci1 <- ciformat(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$conf.int[1])
ci2 <- ciformat(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$conf.int[2])
p <- pformat_P(cor.test(x=data$Petal.Width, y=data$Proportion, method="pearson")$p.value)
ci <- paste0("(95% CI ", ci1, " to ", ci2, ")")
return(list(r=r, p=p, ci=ci))
}
# call each correlation info
setosa_info <- get_correlation_info("setosa")
versicolor_info <- get_correlation_info("versicolor")
virginica_info <- get_correlation_info("virginica")
# plot the figure
figure_data %>% mutate(Petal.Width = as.numeric(Petal.Width), decimals=1) %>%
mutate(Petal.Width = Petal.Width) %>% mutate(Proportion = as.numeric(Proportion)) %>%
ggplot(aes(x = Petal.Width, y = Proportion, fill = Species)) +
geom_area(position = "identity", color = "black", size = 0, alpha = 0.5) +
geom_smooth(aes(x=Petal.Width, y=Proportion, fill = Species), method=lm,
se=T, color="black", size=0.7, alpha=0.7) +
scale_y_continuous(labels = scales::percent) +
labs(x = "Petal.Width", y="Proportion (%)") +
geom_text(data = (data.frame(label = c(paste0(setosa_info$r), paste0(versicolor_info$r), paste0(virginica_info$r)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.45, .45, .45))), mapping = aes(x = x, y = y, label = label)) +
geom_text(data = (data.frame(label = c(paste0(setosa_info$ci), paste0(versicolor_info$ci), paste0(virginica_info$ci)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.40, .40, .40))), mapping = aes(x = x, y = y, label = label)) +
geom_text(data = (data.frame(label = c(paste0(setosa_info$p), paste0(versicolor_info$p), paste0(virginica_info$p)), Species = c("setosa", "versicolor", "virginica"), x = c(10, 10, 10), y = c(.35, .35, .35))), mapping = aes(x = x, y = y, label = label)) +
coord_cartesian(ylim = c(0, 1), clip = "on", expand = F) +
facet_grid(~fct_relevel(Species,"setosa", "versicolor", "virginica"))
Upvotes: 1