Reputation: 35
I have a very simple ggplot2 lollipop plot from some dummy data:
mut.df <- data.frame("AA" = c(201, 203, 500, 601),
"Mut" = c("V201L", "R203H", "Q500*", "P601fs"),
"Type" = c("Missense", "Missense", "Nonsense", "Frameshift"),
"Freq" = c(2,3,4,1))
domain.df <- data.frame("Feature" = c("Start", "Dom1", "Dom2", "End"),
"Type" = c("str", "dom", "dom", "str"),
"Start" = c(1, 180, 480, 650),
"End" = c(1, 230, 630, 650))
str.fill <- "#E1E1E1"
str.col <- "#16161D"
dom.fill <- c("Dom2" = "#FF0000", "Dom1" = "#AA4BAB")
dom.col <- c("#16161D")
library(ggplot2)
library(ggrepel)
gp <- ggplot() +
geom_rect(data = subset(domain.df, Type == "str"),
mapping = aes(xmin = min(Start), xmax = max(End), ymin = 0.3, ymax = 0.7),
fill = str.fill,
colour = str.col)
gp <- gp + scale_y_continuous(limits = c(0, 10), breaks = 0:10)
gp <- gp + geom_segment(data = mut.df,
mapping = aes(x = AA, xend = AA, y = 0.7, yend = Freq)) +
geom_point(data = mut.df,
mapping = aes(x = AA, y = Freq, fill = Type),
shape = 21,
size = 2) +
geom_text_repel(data = mut.df,
mapping = aes(x = AA, y = Freq, label = Mut),
bg.colour = "white",
seed = 12345,
nudge_y = 0.25)
gp <- gp + geom_rect(data = subset(domain.df, Type == "dom"),
mapping = aes(xmin = Start, xmax = End, ymin = 0.2, ymax = 0.8, fill = Feature, group = Feature),
fill = dom.fill[subset(domain.df, Type == "dom")$Feature],
colour = dom.col)
gp <- gp +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dotted")) +
labs(x = "AA", y = "Freq", fill = "Mutation")
I would like to add some smart-formatted branches that would separate the first two lollipops rather than stacking them next to each other. For example:
Would someone be able to suggest how to implement this in ggplot2?
Any and all suggestions welcome!
Upvotes: 1
Views: 273
Reputation: 35
I had a try to do this without any fancy clustering, just by doing a little bit of spacing. I think this will work for a sparse set of points, but might become an issue once mutations get quite dense.
# function to shift the x-axis coordinates when points are too close
shift.lollipop.x <- function(mut.pos = NULL, total.length = NULL, shift.factor = 0.05){
pos.dif <- 0
for (i in 1:length(mut.pos)){
pos.dif <- c(pos.dif, mut.pos[i+1] - mut.pos[i])
}
idx <- which(pos.dif < shift.factor*total.length)
## deal with odd and even sets of points
if (median(idx) %% 1==0){
mut.pos[idx[idx < median(idx)]] <- mut.pos[idx[idx < median(idx)]] - shift.factor*total.length
mut.pos[idx[idx > median(idx)]] <- mut.pos[idx[idx > median(idx)]] + shift.factor*total.length
} else {
mut.pos[idx[idx == median(idx)-0.5]] <- mut.pos[idx[idx == median(idx)-0.5]] - 0.5*shift.factor*total.length
mut.pos[idx[idx == median(idx)+0.5]] <- mut.pos[idx[idx == median(idx)+0.5]] + 0.5*shift.factor*total.length
mut.pos[idx[idx < median(idx)-0.5]] <- mut.pos[idx[idx < median(idx)-0.5]] - shift.factor*total.length
mut.pos[idx[idx > median(idx)+0.5]] <- mut.pos[idx[idx > median(idx)+0.5]] + shift.factor*total.length
}
mut.pos
}
# function to split the segment into 3 parts
shift.lollipop.y <- function(x, start.y = 0.7){
mod.start <- x - start.y
set1 <- start.y + mod.start/3
set2 <- set1 + mod.start/3
as.data.frame(cbind(set1,set2))
}
mut.df$Shift.AA <- shift.lollipop.x(mut.df$AA, 650)
mut.df <- cbind(mut.df, shift.lollipop.y(mut.df$Freq, 0.7))
str.fill <- "#E1E1E1"
str.col <- "#16161D"
dom.fill <- c("Dom2" = "#FF0000", "Dom1" = "#AA4BAB")
dom.col <- c("#16161D")
library(ggplot2)
library(ggrepel)
gp <- ggplot() +
geom_rect(data = subset(domain.df, Type == "str"),
mapping = aes(xmin = min(Start), xmax = max(End), ymin = 0.3, ymax = 0.7),
fill = str.fill,
colour = str.col)
gp <- gp + scale_y_continuous(limits = c(0, 10), breaks = 0:10)
gp <- gp +
geom_segment(data = mut.df,
mapping = aes(x = AA, xend = AA, y = 0.7, yend = set1)) +
geom_segment(data = mut.df,
mapping = aes(x = AA, xend = Shift.AA, y = set1, yend = set2)) +
geom_segment(data = mut.df,
mapping = aes(x = Shift.AA, xend = Shift.AA, y = set2, yend = Freq)) +
geom_point(data = mut.df,
mapping = aes(x = Shift.AA, y = Freq, fill = Type),
shape = 21,
size = 2) +
geom_text_repel(data = mut.df,
mapping = aes(x = Shift.AA, y = Freq, label = Mut),
bg.colour = "white",
seed = 12345,
nudge_y = 0.25,
angle = 90)
gp <- gp + geom_rect(data = subset(domain.df, Type == "dom"),
mapping = aes(xmin = Start, xmax = End, ymin = 0.2, ymax = 0.8, fill = Feature, group = Feature),
fill = dom.fill[subset(domain.df, Type == "dom")$Feature],
colour = dom.col)
gp <- gp +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dotted")) +
labs(x = "AA", y = "Freq", fill = "Mutation")
gp
Upvotes: 1
Reputation: 173803
I don't know of any extension packages that allow this sort of conditional "branching" type of dodging. The closest thing that I know of would be to convert the lollipops to two-element graphs with a manual layout. The lower end of the lollipops would be fixed, but the upper ends get some random noise added to their position if they are close to another value of AA. The two ends of the nodes are joined by a geom_edge_elbow
:
library(tidyverse)
library(tidygraph)
library(ggraph)
mut.df %>%
select(Mut, AA, Type, Freq) %>%
mutate(Base = paste0(Mut, '_0'), .after = 'Mut') %>%
as_tbl_graph() %>%
mutate(AA = rep(mut.df$AA, 2),
Freq = c(mut.df$Freq, rep(0.5, nrow(mut.df))),
Type = c(mut.df$Type, rep(NA, nrow(mut.df)))) %>%
mutate(dist = sapply(AA, \(x) min(abs(x - mut.df$AA[!mut.df$AA %in% x])))) %>%
mutate(AA = ifelse(!is.na(Type) & dist < 20,
runif(n(), -50, 50), 0) + AA) %>%
ggraph(layout = 'manual', x = AA, y = Freq) +
geom_rect(data = subset(domain.df, Type == "str"),
mapping = aes(xmin = min(Start), xmax = max(End),
ymin = 0.3, ymax = 0.7),
fill = str.fill,
colour = str.col) +
geom_rect(data = subset(domain.df, Type == "dom"),
mapping = aes(xmin = Start, xmax = End, ymin = 0.2,
ymax = 0.8, fill = Feature, group = Feature),
fill = dom.fill[subset(domain.df, Type == "dom")$Feature],
colour = dom.col) +
geom_edge_elbow(aes(direction = 1), strength = 0.5) +
geom_node_point(shape = 21, aes(fill = Type, size = Type)) +
geom_node_text(aes(label = ifelse(is.na(Type), '', name)),
angle = 90, hjust = -0.3) +
scale_size_manual(values = rep(3, 3), breaks = unique(mut.df$Type),
guide = 'none') +
scale_fill_discrete(breaks = unique(mut.df$Type)) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(linetype = "dotted")) +
labs(x = "AA", y = "Freq", fill = "Mutation") +
ylim(c(0, 5))
Note that you may need to run the code a few times to get a plot that you are happy with in terms of the dodging, since it occurs randomly. You could arrange it so that the dodging was calculated, but this would be a very complex calculation, requiring either selective randomization (which ggrepel uses), or some form of clustering / circle packing algorithm. Unless I was going to produce a lot of these plots, or there were a lot of clashing clusters, I would probably stick to the "randomize and choose the best" strategy shown above.
Upvotes: 1