user22423300
user22423300

Reputation: 35

Adding branches to ggplot mutation lollipop plot

I have a very simple ggplot2 lollipop plot from some dummy data:

mut.df <- data.frame("AA" = c(201, 203, 500, 601), 
                     "Mut" = c("V201L", "R203H", "Q500*", "P601fs"), 
                     "Type" = c("Missense", "Missense", "Nonsense", "Frameshift"), 
                     "Freq" = c(2,3,4,1))

domain.df <- data.frame("Feature" = c("Start", "Dom1", "Dom2", "End"), 
                        "Type" = c("str", "dom", "dom", "str"),
                        "Start" = c(1, 180, 480, 650), 
                        "End" = c(1, 230, 630, 650))

str.fill <- "#E1E1E1"
str.col <- "#16161D"

dom.fill <- c("Dom2" = "#FF0000", "Dom1" = "#AA4BAB")
dom.col <- c("#16161D")

library(ggplot2)
library(ggrepel)

gp <- ggplot() +
  geom_rect(data = subset(domain.df, Type == "str"),
            mapping = aes(xmin = min(Start), xmax = max(End), ymin = 0.3, ymax = 0.7),
            fill = str.fill,
            colour = str.col)

gp <- gp + scale_y_continuous(limits = c(0, 10), breaks = 0:10)

gp <- gp + geom_segment(data = mut.df, 
                  mapping = aes(x = AA, xend = AA, y = 0.7, yend = Freq)) +
  geom_point(data = mut.df,
             mapping = aes(x = AA, y = Freq, fill = Type),
             shape = 21,
             size = 2) +
  geom_text_repel(data = mut.df,
                  mapping = aes(x = AA, y = Freq, label = Mut),
                  bg.colour = "white",
                  seed = 12345,
                  nudge_y = 0.25)
  
gp <- gp + geom_rect(data = subset(domain.df, Type == "dom"),
               mapping = aes(xmin = Start, xmax = End, ymin = 0.2, ymax = 0.8, fill = Feature, group = Feature),
               fill = dom.fill[subset(domain.df, Type == "dom")$Feature],
               colour = dom.col)

gp <- gp +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted")) +
  labs(x = "AA", y = "Freq", fill = "Mutation")

Example lollipop

I would like to add some smart-formatted branches that would separate the first two lollipops rather than stacking them next to each other. For example:

Example of branches

Would someone be able to suggest how to implement this in ggplot2?

Any and all suggestions welcome!

Upvotes: 1

Views: 273

Answers (2)

user22423300
user22423300

Reputation: 35

I had a try to do this without any fancy clustering, just by doing a little bit of spacing. I think this will work for a sparse set of points, but might become an issue once mutations get quite dense.

# function to shift the x-axis coordinates when points are too close
shift.lollipop.x <- function(mut.pos = NULL, total.length = NULL, shift.factor = 0.05){
  
  pos.dif <- 0
  for (i in 1:length(mut.pos)){
    pos.dif <- c(pos.dif, mut.pos[i+1] - mut.pos[i])
  }
  
  idx <- which(pos.dif < shift.factor*total.length)
  
  ## deal with odd and even sets of points
  if (median(idx) %% 1==0){
    mut.pos[idx[idx < median(idx)]] <- mut.pos[idx[idx < median(idx)]] - shift.factor*total.length
    mut.pos[idx[idx > median(idx)]] <- mut.pos[idx[idx > median(idx)]] + shift.factor*total.length
  } else {
    mut.pos[idx[idx == median(idx)-0.5]] <- mut.pos[idx[idx == median(idx)-0.5]] - 0.5*shift.factor*total.length
    mut.pos[idx[idx == median(idx)+0.5]] <- mut.pos[idx[idx == median(idx)+0.5]] + 0.5*shift.factor*total.length
    
    mut.pos[idx[idx < median(idx)-0.5]] <- mut.pos[idx[idx < median(idx)-0.5]] - shift.factor*total.length
    mut.pos[idx[idx > median(idx)+0.5]] <- mut.pos[idx[idx > median(idx)+0.5]] + shift.factor*total.length
  }
  
  mut.pos
  
}

# function to split the segment into 3 parts
shift.lollipop.y <- function(x, start.y = 0.7){
  mod.start <- x - start.y
  
  set1 <- start.y + mod.start/3
  set2 <- set1 + mod.start/3
  
  as.data.frame(cbind(set1,set2))
}

mut.df$Shift.AA <- shift.lollipop.x(mut.df$AA, 650)
mut.df <- cbind(mut.df, shift.lollipop.y(mut.df$Freq, 0.7))

str.fill <- "#E1E1E1"
str.col <- "#16161D"

dom.fill <- c("Dom2" = "#FF0000", "Dom1" = "#AA4BAB")
dom.col <- c("#16161D")

library(ggplot2)
library(ggrepel)

gp <- ggplot() +
  geom_rect(data = subset(domain.df, Type == "str"),
            mapping = aes(xmin = min(Start), xmax = max(End), ymin = 0.3, ymax = 0.7),
            fill = str.fill,
            colour = str.col)

gp <- gp + scale_y_continuous(limits = c(0, 10), breaks = 0:10)

gp <- gp + 
  geom_segment(data = mut.df, 
                        mapping = aes(x = AA, xend = AA, y = 0.7, yend = set1)) +
  geom_segment(data = mut.df, 
               mapping = aes(x = AA, xend = Shift.AA, y = set1, yend = set2)) +

  geom_segment(data = mut.df, 
               mapping = aes(x = Shift.AA, xend = Shift.AA, y = set2, yend = Freq)) +
  
  geom_point(data = mut.df,
             mapping = aes(x = Shift.AA, y = Freq, fill = Type),
             shape = 21,
             size = 2) +
  geom_text_repel(data = mut.df,
                  mapping = aes(x = Shift.AA, y = Freq, label = Mut),
                  bg.colour = "white",
                  seed = 12345,
                  nudge_y = 0.25,
                  angle = 90)

gp <- gp + geom_rect(data = subset(domain.df, Type == "dom"),
                     mapping = aes(xmin = Start, xmax = End, ymin = 0.2, ymax = 0.8, fill = Feature, group = Feature),
                     fill = dom.fill[subset(domain.df, Type == "dom")$Feature],
                     colour = dom.col)

gp <- gp +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted")) +
  labs(x = "AA", y = "Freq", fill = "Mutation")

gp

plot results

Upvotes: 1

Allan Cameron
Allan Cameron

Reputation: 173803

I don't know of any extension packages that allow this sort of conditional "branching" type of dodging. The closest thing that I know of would be to convert the lollipops to two-element graphs with a manual layout. The lower end of the lollipops would be fixed, but the upper ends get some random noise added to their position if they are close to another value of AA. The two ends of the nodes are joined by a geom_edge_elbow:

library(tidyverse)
library(tidygraph)
library(ggraph)

mut.df %>%
  select(Mut, AA, Type, Freq) %>%
  mutate(Base = paste0(Mut, '_0'), .after = 'Mut') %>%
  as_tbl_graph() %>%
  mutate(AA = rep(mut.df$AA, 2),
         Freq = c(mut.df$Freq, rep(0.5, nrow(mut.df))),
         Type = c(mut.df$Type, rep(NA, nrow(mut.df)))) %>%
  mutate(dist = sapply(AA, \(x) min(abs(x - mut.df$AA[!mut.df$AA %in% x])))) %>%
  mutate(AA = ifelse(!is.na(Type) & dist < 20, 
                     runif(n(), -50, 50), 0) + AA) %>%
  ggraph(layout = 'manual', x = AA, y = Freq) +
  geom_rect(data = subset(domain.df, Type == "str"),
            mapping = aes(xmin = min(Start), xmax = max(End), 
                          ymin = 0.3, ymax = 0.7),
            fill = str.fill,
            colour = str.col) +
  geom_rect(data = subset(domain.df, Type == "dom"),
            mapping = aes(xmin = Start, xmax = End, ymin = 0.2, 
                          ymax = 0.8, fill = Feature, group = Feature),
            fill = dom.fill[subset(domain.df, Type == "dom")$Feature],
            colour = dom.col) +
  geom_edge_elbow(aes(direction = 1), strength = 0.5) +
  geom_node_point(shape = 21, aes(fill = Type, size = Type)) +
  geom_node_text(aes(label = ifelse(is.na(Type), '', name)), 
                 angle = 90, hjust = -0.3) +
  scale_size_manual(values = rep(3, 3), breaks = unique(mut.df$Type),
                    guide = 'none') +
  scale_fill_discrete(breaks = unique(mut.df$Type)) +
  theme_bw() +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted")) +
  labs(x = "AA", y = "Freq", fill = "Mutation") +
  ylim(c(0, 5))

enter image description here

Note that you may need to run the code a few times to get a plot that you are happy with in terms of the dodging, since it occurs randomly. You could arrange it so that the dodging was calculated, but this would be a very complex calculation, requiring either selective randomization (which ggrepel uses), or some form of clustering / circle packing algorithm. Unless I was going to produce a lot of these plots, or there were a lot of clashing clusters, I would probably stick to the "randomize and choose the best" strategy shown above.

Upvotes: 1

Related Questions