user977828
user977828

Reputation: 7679

merging multiple lists into a data frame for ggpot2

I am trying to reproduce the below plot:

plot.

I could plot each density plot using the following code:

plot_densities2 <- function(density) {
  print(ggplot(data = densities, aes(x = x, y = y, fill = id)) + 
    theme_bw() + geom_area(alpha = 0.5))
}

filenames <- c("~/sample5-21.sam-uniq.sorted.bam", "~/sample5-24.sam-uniq.sorted.bam")

for ( i in filenames){ 
  print(i)
  density <- extracting_pos_neg_reads(i)
  densities <- cbind(rbind(data.frame(density[[1]][1:2]), data.frame(density[[2]][1:2])), 
                                        id = rep(c("neg", "pos"), each = length(density[[1]]$x)))
  plot_densities2(densities)
}

Unfortunately, I do not know how to add the additional lists to the densities data frame inside the above for loop.

The full code can be found below and the data can be downloaded from here

#apt update && apt install zlib1g-dev

#install if necessary
#source("http://bioconductor.org/biocLite.R")
#biocLite("Rsamtools")

#load library
library(Rsamtools)

#install.packages("ggplot2")
library("ggplot2")

extracting_pos_neg_reads <- function(bam_fn) {

  #read in entire BAM file
  bam <- scanBam(bam_fn)

  #names of the BAM fields
  names(bam[[1]])
  # [1] "qname"  "flag"   "rname"  "strand" "pos"    "qwidth" "mapq"   "cigar"
  # [9] "mrnm"   "mpos"   "isize"  "seq"    "qual"

  #distribution of BAM flags
  table(bam[[1]]$flag)

  #      0       4      16
  #1472261  775200 1652949

  #function for collapsing the list of lists into a single list
  #as per the Rsamtools vignette
  .unlist <- function (x) {
    ## do.call(c, ...) coerces factor to integer, which is undesired
    x1 <- x[[1L]]
    if (is.factor(x1)) {
      structure(unlist(x), class = "factor", levels = levels(x1))
    } else {
      do.call(c, x)
    }
  }

  #store names of BAM fields
  bam_field <- names(bam[[1]])

  #go through each BAM field and unlist
  list <- lapply(bam_field, function(y)
    .unlist(lapply(bam, "[[", y)))

  #store as data frame
  bam_df <- do.call("DataFrame", list)
  names(bam_df) <- bam_field

  dim(bam_df)
  #[1] 3900410      13

  #---------

  #use chr22 as an example
  #how many entries on the negative strand of chr22?
  ###table(bam_df$rname == 'chr22' & bam_df$flag == 16)
  # FALSE    TRUE
  #3875997   24413

  #function for checking negative strand
  check_neg <- function(x) {
    if (intToBits(x)[5] == 1) {
      return(T)
    } else {
      return(F)
    }
  }

  #test neg function with subset of chr22
  test <- subset(bam_df)#, rname == 'chr22')
  dim(test)
  #[1] 56426    13
  table(apply(as.data.frame(test$flag), 1, check_neg))
  #number same as above
  #FALSE  TRUE
  #32013 24413

  #function for checking positive strand
  check_pos <- function(x) {
    if (intToBits(x)[3] == 1) {
      return(F)
    } else if (intToBits(x)[5] != 1) {
      return(T)
    } else {
      return(F)
    }
  }

  #check pos function
  table(apply(as.data.frame(test$flag), 1, check_pos))
  #looks OK
  #FALSE  TRUE
  #24413 32013

  #store the mapped positions on the plus and minus strands
  neg <- bam_df[apply(as.data.frame(bam_df$flag), 1, check_neg),
                'pos']
  length(neg)
  #[1] 24413
  pos <- bam_df[apply(as.data.frame(bam_df$flag), 1, check_pos),
                'pos']
  length(pos)
  #[1] 32013

  #calculate the densities
  neg_density <- density(neg)
  pos_density <- density(pos)

  #display the negative strand with negative values
  neg_density$y <- neg_density$y * -1

  return (list(neg_density, pos_density))

}

#https://stackoverflow.com/a/53698575/977828
plot_densities2 <- function(density) {
  print(ggplot(data = densities, aes(x = x, y = y, fill = id)) + 
    theme_bw() + geom_area(alpha = 0.5))
}

filenames <- c("~/josh/sample5-21.sam-uniq.sorted.bam", "~/josh/sample5-24.sam-uniq.sorted.bam")

for ( i in filenames){ 
  print(i)
  density <- extracting_pos_neg_reads(i)
  densities <- cbind(rbind(data.frame(density[[1]][1:2]), data.frame(density[[2]][1:2])), 
                                        id = rep(c("neg", "pos"), each = length(density[[1]]$x)))
  plot_densities2(densities)
}

Upvotes: 1

Views: 69

Answers (1)

Taher A. Ghaleb
Taher A. Ghaleb

Reputation: 5240

I guess you can do that using rbind and plot all densities after the loop, like this:

all_densities <- data.frame()
groups <- c('21', '24')
k <- 1
for (i in filenames){ 
   print(i)
   density <- extracting_pos_neg_reads(i)
   densities <- cbind(rbind(data.frame(density[[1]][1:2]), data.frame(density[[2]][1:2])), 
                                       id = rep(c("neg", "pos"), each = length(density[[1]]$x)))
   densities$group <- groups[k]
   k <- k + 1
   all_densities <- rbind(all_densities, densities)
}
plot_densities2(all_densities)

You would need to modify your plotting function to fill based on group, like this:

plot_densities2 <- function(densities) {
  print(ggplot(data = densities, aes(x = x, y = y, fill = group)) + 
    theme_bw() + geom_area(alpha = 0.5))
}

Hope it helps.

Upvotes: 1

Related Questions