how can I plot a complex data structure

Question

I have a data way much bigger than this representative one

df<- structure(list(Pama1 = structure(c(2L, 5L, 3L, 5L, 3L, 3L, 3L, 
3L, 3L, 4L, 1L), .Label = c("", "DD1", "n/a", "PAMANA", "zf"), class = "factor"), 
    X = structure(c(11L, 3L, 10L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 
    1L), .Label = c("", "116", "12", "138", "197", "219", "224", 
    "230", "280", "85", "Start1"), class = "factor"), X.1 = structure(c(11L, 
    10L, 2L, 4L, 3L, 5L, 8L, 6L, 7L, 9L, 1L), .Label = c("", 
    "101", "145", "199", "222", "227", "233", "238", "331", "89", 
    "End1"), class = "factor"), Pama2 = structure(c(2L, 4L, 4L, 
    4L, 3L, 4L, 4L, 6L, 5L, 1L, 1L), .Label = c("", "DD2", "GGTR", 
    "n/a", "PAMANA", "T_reg"), class = "factor"), X.2 = structure(c(9L, 
    2L, 2L, 8L, 3L, 4L, 5L, 6L, 7L, 1L, 1L), .Label = c("", "1", 
    "115", "208", "214", "232", "376", "85", "Start2"), class = "factor"), 
    X.3 = structure(c(10L, 8L, 2L, 9L, 3L, 4L, 5L, 6L, 7L, 1L, 
    1L), .Label = c("", "15", "195", "229", "231", "362", "577", 
    "76", "86", "End2"), class = "factor"), Pama3 = structure(c(1L, 
    3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("DD3", 
    "GGTR", "n/a"), class = "factor"), X.4 = structure(c(10L, 
    1L, 7L, 8L, 9L, 9L, 2L, 3L, 4L, 5L, 6L), .Label = c("1", 
    "129", "136", "153", "166", "178", "48", "65", "66", "Start1"
    ), class = "factor"), X.5 = structure(c(10L, 6L, 7L, 8L, 
    3L, 9L, 1L, 2L, 4L, 4L, 5L), .Label = c("131", "138", "144", 
    "168", "180", "34", "51", "70", "79", "End2"), class = "factor")), .Names = c("Pama1", 
"X", "X.1", "Pama2", "X.2", "X.3", "Pama3", "X.4", "X.5"), class = "data.frame", row.names = c(NA, 
-11L))

At first I put all starts and ends together and find the range

In this example it is 1 to 577

Then I want to plot or highlight the regions that there is a string

for example, something like this figure that I made

or even like this

eipi10 · Accepted Answer

The code below tidies up the data into a form suitable for plotting and then uses geom_segment to lay out the sequences. To tidy the data, we want each column to be a variable and each row to be an observation.

library(tidyverse)

## Clean up data frame and convert to long form
df = map_df(seq(1,ncol(df),3),                                    # Turn each group of three columns into separate data frames that we'll stack into long format
            ~ setNames(df[-1,.x:(.x+2)], c("DD","Start","End")),  # Column names appear to be in the first data row, so we'll remove this row and provide new column names
            .id="Pama") %>%                                       # This line and next add a "Pama" column
  mutate(Pama = paste0("Pama", Pama)) %>%
  filter(!DD %in% c("n/a","")) %>%                                # Remove empty rows
  mutate_at(vars(matches("^[SE]")), funs(as.numeric(as.character(.))))  # All columns are in character format. Convert the numbers to numeric format.

The data frame now looks like this:

   Pama     DD Start   End
      
1 Pama1     zf    12    89
2 Pama1     zf   116   199
3 Pama1 PAMANA   280   331
4 Pama2   GGTR   115   195
5 Pama2  T_reg   232   362
6 Pama2 PAMANA   376   577
7 Pama3   GGTR    66   144

ggplot(df, aes(y=Pama, yend=Pama)) +
  geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
               aes(x=x, xend=xend), colour="grey80", size=10) +
  geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
  geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
  geom_text(data=gather(df, key, value, Start:End), 
            aes(x=value, label=value, colour=DD), size=2.5, 
            fontface="bold", position=position_nudge(0,-0.3)) +
  guides(colour=FALSE) +
  scale_x_continuous(breaks=seq(0,1000,100)) +
  labs(x="", y="") +
  theme_classic(base_size=15) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank())

UPDATE: To address your comment, here's another way of positioning the numbers to avoid overlap.

ggplot(df, aes(y=Pama, yend=Pama)) +
  geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
               aes(x=x, xend=xend), colour="grey80", size=10) +
  geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
  geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
  geom_text(data=gather(df, key, value, Start:End), 
            aes(x=ifelse(key=="Start", value + 10, value - 10), label=value), 
                colour="white", size=2.8, fontface="bold", position=position_nudge(0,-0.2)) +
  guides(colour=FALSE) +
  scale_x_continuous(breaks=seq(0,1000,100)) +
  labs(x="", y="") +
  theme_classic(base_size=15) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank())

UPDATE 2: To address your second comment, we'll add a grouping column that we'll use to alternate high and low number labels:

# Add grouping variable to alternate high and low labels
df = df %>% group_by(Pama) %>% arrange(Start) %>%
  mutate(hilow = rep(c("high","low"),nrow(df))[1:n()])

ggplot(df, aes(y=Pama, yend=Pama)) +
  geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
               aes(x=x, xend=xend), colour="grey80", size=10) +
  geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
  geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
  geom_text(data=gather(df, key, value, Start:End) %>% filter(hilow=="high"), 
                      aes(x=value, label=value, colour=DD), hjust=0.5,
                      size=3, fontface="bold", position=position_nudge(0,0.3)) +
  geom_text(data=gather(df, key, value, Start:End) %>% filter(hilow=="low"), 
            aes(x=value, label=value, colour=DD), hjust=0.5,
            size=3, fontface="bold", position=position_nudge(0,-0.3)) +
  guides(colour=FALSE) +
  scale_x_continuous(breaks=seq(0,1000,100)) +
  labs(x="", y="") +
  theme_classic(base_size=15) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank())

how can I plot a complex data structure

Answers (2)

Update:

Related Questions