Reputation: 223
Hi there, I recently got the desired output from geom_split_violin; however, I would like a much cleaner plot that gets rid of all potential outliers, which happen to connect the two aspects I'm comparing. See below for the plot
and the code:
library(dplyr)
library(readxl)
library(tibble)
library(ggplot2)
library(hrbrthemes)
library(introdataviz)
### IMPORT THE DATASET
# Read sheet 10 of the workbook (sheet passed by name of argument, not position).
# NOTE(review): the code below relies on the columns population_ID,
# variant_type and count being present in that sheet -- confirm against the file.
variants_dist <- read_excel("path/to/file.xlsm", sheet = 10)

### FEATURES WRANGLING TO GET THE RIGHT FORMAT
# Turn population_ID into a factor with an explicit level order, so sorting
# (and later the x axis) follows this order rather than the alphabetical one.
variants_dist <- variants_dist %>%
  mutate(
    population_ID = factor(
      population_ID,
      levels = c("AFR", "EUR", "MENA", "SAS", "CEA", "SIB", "OCE", "AME")
    )
  )

# Rows ordered by population, with variant_type as a factor (SNPs before INDELs).
pop_sort <- variants_dist %>%
  arrange(population_ID) %>%
  mutate(variant_type = factor(variant_type, levels = c("SNPs", "INDELs")))

# arrange() is stable, so rows end up ordered by variant_type first and keep
# the population order from pop_sort within each variant type.
variant_sort <- pop_sort %>%
  arrange(variant_type)

# Number of rows per population, stored in column `num`.
# (dplyr::count() is the function here; it is unrelated to the data column
# named `count` that is plotted later.)
df_var <- variant_sort %>%
  count(population_ID, name = "num")
### PLOT THE DATA
# Split violin plot of `count` per population: one half per variant_type,
# fill colour by population, transparency by variant type.
violin_variants <- variant_sort %>%
  # Join the per-population row counts on an explicit key, so the join cannot
  # silently pick up any other column name the two tables happen to share.
  left_join(df_var, by = "population_ID") %>%
  mutate(
    # Axis label "<population>\nn=<samples>". `num` is the number of rows per
    # population; halving it assumes every sample contributes exactly two rows
    # (one per variant type) -- TODO confirm this holds for the full dataset.
    pop_count = paste0(population_ID, "\n", "n=", num / 2)
  ) %>%
  ggplot(
    aes(
      # fct_inorder keeps the labels in row order (i.e. the sorted population
      # order) instead of sorting the label strings alphabetically.
      x = forcats::fct_inorder(pop_count),
      y = count,
      fill = population_ID,
      alpha = variant_type
    )
  ) +
  # trim = FALSE lets the density tails extend beyond the observed data range.
  geom_split_violin(position = "identity", trim = FALSE) +
  scale_fill_manual(
    values = c(
      EUR = "dodgerblue2", MENA = "mediumvioletred", SIB = "darkkhaki",
      CEA = "firebrick2", AFR = "olivedrab2", OCE = "powderblue",
      SAS = "darksalmon", AME = "plum2"
    )
  ) +
  scale_alpha_manual(values = c("SNPs" = 1, "INDELs" = .25)) +
  theme_bw() +
  theme(
    legend.title = element_text(face = "italic"),
    legend.position = "bottom"
  ) +
  guides(
    # The x axis already identifies the populations, so drop the fill legend.
    fill = "none",
    # Draw the alpha legend keys in black so only the transparency differs.
    alpha = guide_legend(
      override.aes = list(fill = c("black", "black")),
      title = "variant type",
      title.position = "top",
      title.hjust = .5
    )
  ) +
  # Empty strings blank out both axis titles.
  xlab("") +
  ylab("")

violin_variants
If helpful, I also share the dput() output for the first 35 samples, although I'm not sure whether any of them fall in the "outlier" category. Small toy dataset:
structure(list(samples = c("abh100 - number of:", "abh107 - number of:", "ALB212 - number of:", "Ale14 - number of:", "Ale20 - number of:", "Ale22 - number of:", "Ale32 - number of:", "altai363p - number of:", "armenia293 - number of:", "Armenian222 - number of:", "AV-21 - number of:", "Ayodo_430C - number of:", "Ayodo_502C - number of:", "Ayodo_81S - number of:", "B11 - number of:", "B17 - number of:", "Bishkek28439 - number of:", "Bishkek28440 - number of:", "Bu16 - number of:", "Bu5 - number of:", "BulgarianB4 - number of:", "BulgarianC1 - number of:", "ch113 - number of:", "CHI-007 - number of:", "CHI-034 - number of:", "DNK05 - number of:", "DNK07 - number of:", "DNK11 - number of:", "Dus16 - number of:", "Dus22 - number of:", "Esk29 - number of:", "Est375 - number of:", "Est400 - number of:", "HG00126 - number of:", "HG00128 - number of:"), population_ID = c("MENA", "MENA", "EUR", "SIB", "SIB", "SIB", "SIB", "SIB", "EUR", "EUR", "EUR", "AFR", "AFR", "AFR", "SAS", "SAS", "SIB", "SIB", "CEA", "CEA", "EUR", "EUR", "EUR", "CEA", "CEA", "AFR", "AFR", "AFR", "OCE", "OCE", "SIB", "EUR", "EUR", "EUR", "EUR"), snps = c(4847876, 4820146, 4875942, 4848405, 4846958, 4893150, 4886498, 4778500, 4868602, 4861225, 5513106, 5726596, 5766508, 5372587, 4974419, 4894272, 4870208, 4913870, 4923787, 4925207, 4840414, 4798908, 4891562, 4953420, 4881495, 5605004, 5703805, 5643221, 4831148, 4829405, 4688483, 4783761, 4778239, 4774887, 4811481)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -35L))
Upvotes: 0
Views: 112