Reputation: 21
I am trying to add kruskal Wallis and pairwise Wilcoxon test to the figure to show which groups are significant different, but I have multiple groups/subgroups within each group and facet which makes it complicated.
Here is the R code by using iris dataset as an example, the idea is to perform Kruskal.test across different treatments (A, B, C) for different variables (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) per species, and also wilcox.test pairwise between them:
rm(list=ls(all=TRUE)); cat('\014') # clear workspace
library(tidyverse)
library(ggplot2)
library(viridis)
library(rstatix)
data(iris)
iris$treatment <- rep(c("A","B","C"), length(iris$Species)/3)
mydf <- gather(iris,Variable,value,Sepal.Length:Petal.Width)
# change number to create more difference
mydf[mydf$treatment=="B",]$value <- mydf[mydf$treatment=="B",]$value*1.2
#mydf[mydf$treatment=="C",]$value <- mydf[mydf$treatment=="C",]$value+0.3
# do pairwise Wilcoxon test for pairwise comparisons between groups
df_wilcox <- mydf %>%
group_by(Species,Variable) %>%
pairwise_wilcox_test(value ~ treatment) %>%
add_y_position(step.increase = 0.02)
# do Kruskal Wallis test to see whether or not there is statistically significant difference between three or more groups
df_kw <- compare_means(value ~ treatment, mydf, group.by = c("Species","Variable"), method="kruskal")
# plot boxplot with wilcoxon and kruskal test results
P <- ggplot(data=mydf,
aes(x=treatment, y=value, fill=Variable))+
stat_boxplot(geom = "errorbar")+geom_boxplot(outlier.shape = NA)+
facet_wrap(~Species,nrow=1)+
theme_bw()+
theme(axis.text=element_text(size=12),axis.title=element_text(size=16),plot.title=element_text(size=20)) +
theme(strip.text = element_text(size=14))+
scale_fill_viridis(discrete = TRUE) +
guides(fill=guide_legend(title="Variable"))+
stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02)
#stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02,hide.ns=T) #hide non-significant
# change legend title and wilcoxon test color
ggpar(P,legend.title = "Wilcoxon test",palette = c("#440154FF","#3B528BFF","#21908CFF","#FDE725FF"))
This produces the following plot:
To improve the figure, I want to:
I am stuck here and wondering anyone has a solution? Thank you very much!
Upvotes: 2
Views: 6368
Reputation: 55
I can answer the first part of your question regarding how to add the pvalues labels to the plot automatically. One way to do that is to combine mydf
anddf_kw
so that df_kw
includes all of the same columns as mydf
. here I do that using the data.table
package like this:
setDT(mydf); setDT(df_kw) # convert to data.tables by reference
df_kw <- mydf[df_kw, mult = "first", on = c("Variable", "Species"), nomatch=0L] #creates data table with the same columns as mydf
df_kw <- df_kw[df_kw$p < 0.05,] #removes non-significant values
Then you can add the labels automatically using geom_text
. I would generate a character vector of values to position the labels like this first:
y_lab_placement <- c(sort(rep(seq(max(mydf$value)*1.25, by = -0.35, length.out = length(unique(mydf$Variable))),
length(unique(mydf$Species))), decreasing = T)) # creates y values of where to place the labels
y_lab_placement <- y_lab_placement[1:nrow(df_kw)] # adjusts length of placements to the length of significant values
Then I would add this line to your ggplot to add the labels:
geom_text(data = df_kw, aes(x = 2 , y = y_lab_placement, label = c(paste(Variable, "KW p ~" , round(p, 5)))))+ #adds labels to the plot based on your data
Here is your entire code block including these editions.
rm(list=ls(all=TRUE)); cat('\014') # clear workspace
library(tidyverse)
library(ggplot2)
library(viridis)
library(rstatix)
library(data.table) # used in creating combined data table
data(iris)
iris$treatment <- rep(c("A","B","C"), length(iris$Species)/3)
mydf <- gather(iris,Variable,value,Sepal.Length:Petal.Width)
# change number to create more difference
mydf[mydf$treatment=="B",]$value <- mydf[mydf$treatment=="B",]$value*1.2
#mydf[mydf$treatment=="C",]$value <- mydf[mydf$treatment=="C",]$value+0.3
# do pairwise Wilcoxon test for pairwise comparisons between groups
df_wilcox <- mydf %>%
group_by(Species,Variable) %>%
pairwise_wilcox_test(value ~ treatment) %>%
add_y_position(step.increase = 0.02)
# do Kruskal Wallis test to see whether or not there is statistically significant difference between three or more groups
df_kw <- compare_means(value ~ treatment, mydf, group.by = c("Species","Variable"), method="kruskal")
setDT(mydf); setDT(df_kw) # convert to data.tables by reference
df_kw <- mydf[df_kw, mult = "first", on = c("Variable", "Species"), nomatch=0L] #creates data table with the same columns as mydf
df_kw <- df_kw[df_kw$p < 0.05,] #removes non-significant values
# plot boxplot with wilcoxon and kruskal test results
y_lab_placement <- c(sort(rep(seq(max(mydf$value)*1.25, by = -0.35, length.out = length(unique(mydf$Variable))),
length(unique(mydf$Species))), decreasing = T)) # creates y values of where to place the labels
y_lab_placement <- y_lab_placement[1:nrow(df_kw)] # adjusts length of placements to the length of significant values
P <- ggplot(data=mydf,
aes(x=treatment, y=value, fill=Variable))+
stat_boxplot(geom = "errorbar")+geom_boxplot(outlier.shape = NA)+
facet_wrap(~Species,nrow=1)+
theme_bw()+
theme(axis.text=element_text(size=12),axis.title=element_text(size=16),plot.title=element_text(size=20)) +
theme(strip.text = element_text(size=14))+
scale_fill_viridis(discrete = TRUE) +
guides(fill=guide_legend(title="Variable"))+
geom_text(data = df_kw, aes(x = 2 , y = y_lab_placement, label = c(paste(Variable, "KW p ~" , round(p, 5)))))+ #adds labels to the plot based on your data
stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02)
#stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02,hide.ns=T) #hide non-significant
# change legend title and wilcoxon test color
ggpar(P,legend.title = "Wilcoxon test",palette = c("#440154FF","#3B528BFF","#21908CFF","#FDE725FF"))
Upvotes: 0