nik
nik

Reputation: 2584

how can I plot a complex data structure

I have a data way much bigger than this representative one

df<- structure(list(Pama1 = structure(c(2L, 5L, 3L, 5L, 3L, 3L, 3L, 
3L, 3L, 4L, 1L), .Label = c("", "DD1", "n/a", "PAMANA", "zf"), class = "factor"), 
    X = structure(c(11L, 3L, 10L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 
    1L), .Label = c("", "116", "12", "138", "197", "219", "224", 
    "230", "280", "85", "Start1"), class = "factor"), X.1 = structure(c(11L, 
    10L, 2L, 4L, 3L, 5L, 8L, 6L, 7L, 9L, 1L), .Label = c("", 
    "101", "145", "199", "222", "227", "233", "238", "331", "89", 
    "End1"), class = "factor"), Pama2 = structure(c(2L, 4L, 4L, 
    4L, 3L, 4L, 4L, 6L, 5L, 1L, 1L), .Label = c("", "DD2", "GGTR", 
    "n/a", "PAMANA", "T_reg"), class = "factor"), X.2 = structure(c(9L, 
    2L, 2L, 8L, 3L, 4L, 5L, 6L, 7L, 1L, 1L), .Label = c("", "1", 
    "115", "208", "214", "232", "376", "85", "Start2"), class = "factor"), 
    X.3 = structure(c(10L, 8L, 2L, 9L, 3L, 4L, 5L, 6L, 7L, 1L, 
    1L), .Label = c("", "15", "195", "229", "231", "362", "577", 
    "76", "86", "End2"), class = "factor"), Pama3 = structure(c(1L, 
    3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("DD3", 
    "GGTR", "n/a"), class = "factor"), X.4 = structure(c(10L, 
    1L, 7L, 8L, 9L, 9L, 2L, 3L, 4L, 5L, 6L), .Label = c("1", 
    "129", "136", "153", "166", "178", "48", "65", "66", "Start1"
    ), class = "factor"), X.5 = structure(c(10L, 6L, 7L, 8L, 
    3L, 9L, 1L, 2L, 4L, 4L, 5L), .Label = c("131", "138", "144", 
    "168", "180", "34", "51", "70", "79", "End2"), class = "factor")), .Names = c("Pama1", 
"X", "X.1", "Pama2", "X.2", "X.3", "Pama3", "X.4", "X.5"), class = "data.frame", row.names = c(NA, 
-11L))

At first I put all starts and ends together and find the range

In this example it is 1 to 577

Then I want to plot or highlight the regions that there is a string

for example, something like this figure that I made

enter image description here

or even like this

enter image description here

Upvotes: 2

Views: 167

Answers (2)

eipi10
eipi10

Reputation: 93851

The code below tidies up the data into a form suitable for plotting and then uses geom_segment to lay out the sequences. To tidy the data, we want each column to be a variable and each row to be an observation.

library(tidyverse)

## Clean up data frame and convert to long form
df = map_df(seq(1,ncol(df),3),                                    # Turn each group of three columns into separate data frames that we'll stack into long format
            ~ setNames(df[-1,.x:(.x+2)], c("DD","Start","End")),  # Column names appear to be in the first data row, so we'll remove this row and provide new column names
            .id="Pama") %>%                                       # This line and next add a "Pama" column
  mutate(Pama = paste0("Pama", Pama)) %>%
  filter(!DD %in% c("n/a","")) %>%                                # Remove empty rows
  mutate_at(vars(matches("^[SE]")), funs(as.numeric(as.character(.))))  # All columns are in character format. Convert the numbers to numeric format.

The data frame now looks like this:

   Pama     DD Start   End
  <chr>  <chr> <dbl> <dbl>
1 Pama1     zf    12    89
2 Pama1     zf   116   199
3 Pama1 PAMANA   280   331
4 Pama2   GGTR   115   195
5 Pama2  T_reg   232   362
6 Pama2 PAMANA   376   577
7 Pama3   GGTR    66   144
ggplot(df, aes(y=Pama, yend=Pama)) +
  geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
               aes(x=x, xend=xend), colour="grey80", size=10) +
  geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
  geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
  geom_text(data=gather(df, key, value, Start:End), 
            aes(x=value, label=value, colour=DD), size=2.5, 
            fontface="bold", position=position_nudge(0,-0.3)) +
  guides(colour=FALSE) +
  scale_x_continuous(breaks=seq(0,1000,100)) +
  labs(x="", y="") +
  theme_classic(base_size=15) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank())

enter image description here

UPDATE: To address your comment, here's another way of positioning the numbers to avoid overlap.

ggplot(df, aes(y=Pama, yend=Pama)) +
  geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
               aes(x=x, xend=xend), colour="grey80", size=10) +
  geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
  geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
  geom_text(data=gather(df, key, value, Start:End), 
            aes(x=ifelse(key=="Start", value + 10, value - 10), label=value), 
                colour="white", size=2.8, fontface="bold", position=position_nudge(0,-0.2)) +
  guides(colour=FALSE) +
  scale_x_continuous(breaks=seq(0,1000,100)) +
  labs(x="", y="") +
  theme_classic(base_size=15) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank())

enter image description here

UPDATE 2: To address your second comment, we'll add a grouping column that we'll use to alternate high and low number labels:

# Add grouping variable to alternate high and low labels
df = df %>% group_by(Pama) %>% arrange(Start) %>%
  mutate(hilow = rep(c("high","low"),nrow(df))[1:n()])

ggplot(df, aes(y=Pama, yend=Pama)) +
  geom_segment(data=data.frame(Pama=unique(df$Pama), x=min(df$Start), xend=max(df$End)),
               aes(x=x, xend=xend), colour="grey80", size=10) +
  geom_segment(aes(x=Start, xend=End, colour=DD), size=20) +
  geom_text(aes(x=(Start+End)/2, label=DD), colour="white", size=3, fontface="bold") +
  geom_text(data=gather(df, key, value, Start:End) %>% filter(hilow=="high"), 
                      aes(x=value, label=value, colour=DD), hjust=0.5,
                      size=3, fontface="bold", position=position_nudge(0,0.3)) +
  geom_text(data=gather(df, key, value, Start:End) %>% filter(hilow=="low"), 
            aes(x=value, label=value, colour=DD), hjust=0.5,
            size=3, fontface="bold", position=position_nudge(0,-0.3)) +
  guides(colour=FALSE) +
  scale_x_continuous(breaks=seq(0,1000,100)) +
  labs(x="", y="") +
  theme_classic(base_size=15) +
  theme(axis.line.y=element_blank(),
        axis.ticks.y=element_blank()) 

enter image description here

Upvotes: 2

GGamba
GGamba

Reputation: 13680

That is one weird data structure you have. This will probably work out better down the line if you can modify the source data in some way to obtain a tidy dataframe from the start, where each column is a single variable and each row is an observation.

We can wrangle the data to obtain such dataframe (this is in base R, you can arguably achieve the same in other ways, also using dplyr or data.table):

df2 <- rbind(setNames(cbind(rep('DD1', nrow(df) - 1), df[2:nrow(df), 1:3]), c('DD', 'Pama', 'Start', 'End')),
             setNames(cbind(rep('DD2', nrow(df) - 1), df[2:nrow(df), 4:6]), c('DD', 'Pama', 'Start', 'End')),
             setNames(cbind(rep('DD3', nrow(df) - 1), df[2:nrow(df), 7:9]), c('DD', 'Pama', 'Start', 'End'))
)

df2$Start <- as.numeric(as.character(df2$Start))
df2$End <- as.numeric(as.character(df2$End))
df2 <- df2[!df2$Pama %in% c('','n/a'), ]


df2
#>     DD   Pama Start End
#> 2  DD1     zf    12  89
#> 4  DD1     zf   116 199
#> 10 DD1 PAMANA   280 331
#> 51 DD2   GGTR   115 195
#> 81 DD2  T_reg   232 362
#> 91 DD2 PAMANA   376 577
#> 52 DD3   GGTR    66 144

This gives us a nice dataset, where we can map any ggplot2's aestethic to a simple column:

library(ggplot2)
ggplot(df2, aes(y = DD, color = Pama)) +
  geom_segment(aes(x = Start, xend = End, yend = DD), size = 10) +
  geom_text(aes(label = Start, x = Start), size = 2.5, nudge_y = -.15) +
  geom_text(aes(label = End, x = End), size = 2.5, nudge_y = -.15) +
  scale_y_discrete(position = 'right') +
  theme(panel.background = element_rect(fill = 'white'),
        axis.text.x = element_blank(),
        axis.text.y.right = element_text(size = 14),
        axis.ticks.y = element_blank(),
        axis.title = element_blank())


Update:

The text positioning can indeed cause problems, it does so in this example, where we had to reduce the text size to get it somewhat right.

Here is a solution, based on the ggrepel package:

library(ggplot2)
library(ggrepel)
ggplot(df2, aes(y = DD, color = Pama)) +
  geom_segment(aes(x = Start, xend = End, yend = DD), size = 10) +
  geom_text_repel(data = function(d) tidyr::gather(d, k, p, -DD, -Pama), aes(label = p, x = p), size = 5, nudge_y = -.15, segment.size = 0) +
  # geom_label_repel(aes(label = End, x = End), size = 5, nudge_y = -.15) +
  geom_text(aes(x = (Start + End) / 2, label = Pama), colour = "white", size = 2.5) +
  scale_y_discrete(position = 'right') +
  guides(color = FALSE) +
  theme(panel.background = element_rect(fill = 'white'),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y.right = element_text(size = 14),
        axis.ticks.y = element_blank(),
        axis.title = element_blank())

(The text size is forcibly increased to show it does not overlap)

PS: Yea, this update makes it even more similar to @eipi10 answer.. it's a great answer, of course I'd steal from him :P

Upvotes: 1

Related Questions