fridaymeetssunday
fridaymeetssunday

Reputation: 1148

How to create a knitr report looping over folders and returning tables and plots

I am trying to generate a quality control report that loops (sapply) over several folders (each one corresponds to an experiment), and for each load results, create tables and plots (inside a function). The resulting pdf should contain the name of the folder followed by tables and plots in order. I first created the R script (which runs nicely) and then created a rnw file. The plots are indeed generated but there are 2 problems (pdf outpout):

  1. in the chunk loop_n_plots no tables are generated;

  2. after all plots have been created there is an unintended jumbled line which looks like the output of a list.

Q: How do I get the tables in my pdf? The table which is generated in the chunk "table_files" works, but those inside the apply function don't. Why? More generally, is what I am trying to do (amd how am I doing it) ok for knitr reports? Would it be best to add tables and plots in a list and then loop over the list to print them?

I have played for while now with the chunk settings but nothing worked.

Sample code:

\documentclass{report}

\begin{document}
\title{Sequencing Quality Report}
\author{Deep Sequencing Group - SFB655}
\maketitle


<<knitr_option, cache=FALSE, echo=FALSE, results='hide'>>=
library(knitr)
## set global chunk options
opts_chunk$set(fig.align='center', fig.width=14, fig.heigth=8, out.width="1.2\\textwidth",  par=TRUE)
@


<<R_arguments, cache=FALSE, echo=FALSE, include=FALSE>>=

###### Libraries ######
library(reshape)
library(ggplot2)
theme_set(theme_bw(16)) # removes grey grid and increases letter size. Ideal for presentations
library(RColorBrewer)
library(plyr)
library(scales) # for natural numbers in axis
library(xtable)
library(rattle) # needed to generate a table in knitr?
#######################


###### Function definitions ######
## ggplot theme with extra space between legends and axis
gg.axis.space <- theme(axis.title.y=element_text(vjust=0.2), axis.title.x=element_text(vjust=0.2))


ReturnStatsPlotsAndTables <- function(fqc.folder){

   # for(fqc.folder in fq_fastqc.folders){
   ######################################
   ## for each folder in the vector will
   ## plot stats and 
   ## print tables of fastQC results

   ## which library is being analysed?
   fastq.lib <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fqc.folder, perl=T))
   xtable(fastq.lib)

   ## Basic statistics - table ##
   stats.path <- paste(fqc.folder, "/", "Basic_Statistics_fastqc_data.temp", sep="")
   basic.stats <- read.table(stats.path, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
#    basic.stats[ ,1:2]
   xtable(basic.stats[ ,1:2]) 


   ## Summary of filters - table ##
   stats.path <- paste(fqc.folder, "/", "filters_summary_fastqc_data.temp", sep="")
   summary.filters <-  read.table(stats.path, 
      header = TRUE, sep = "\t", stringsAsFactors = FALSE)
#    summary.filters
   xtable(summary.filters)


   ## Per base sequence quality ##
   stats.path <- paste(fqc.folder, "/", "Per_base_sequence_quality_fastqc_data.temp", sep="")
   base.qual <- read.table(stats.path, 
      header = TRUE, sep = "\t", stringsAsFactors = FALSE)


   base.qual$Base <- factor(base.qual$Base, as.character(base.qual$Base)) # re-order the levels by order of appearance in DF

   plot.new()
   base.qual.p <- ggplot(base.qual, aes(x = Base, ymin = X10th.Percentile, lower = Lower.Quartile, middle = Median, upper = Upper.Quartile, ymax = X90th.Percentile, fill = Lower.Quartile)) + geom_boxplot(stat = "identity") + 
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + 
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=0, ymax=20, alpha=0.1, fill="red") +
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=20, ymax=28, alpha=0.1, fill="yellow") +
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf, alpha=0.1, fill="green") +
      ggtitle("Per base sequence quality") + ylab("Quality score (Phred score) ") + xlab("Position of base in read")

   print(base.qual.p)

}
@

\chapter{Preamble}

This an automated quality control report generated for the following fastq files:

<<table_files, echo=FALSE, results="asis">>=
##############################################
## loop over fastQC folder and parse txt files:

## list and read fastqc_data.temp old files
# testing #
# setwd("/projects/seq-work/analysis/martinad/p0196-totalRNA/")
folder <- "./"
filenames <- list.files(path=folder, pattern="fastqc_data.temp", recursive=TRUE) 
fq_fastqc.folders <- unique(dirname(filenames)) # the folders that contain fastQC
fastq.libs <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fq_fastqc.folders, perl=T))
xtable(fastq.libs)
@



\chapter{FastQC}

<<loop_n_plots, echo=FALSE, results='asis'>>=
## do the plotting
sapply(fq_fastqc.folders[1:3], ReturnStatsPlotsAndTables)
@

\end{document}

The function ReturnStatsPlotsAndTables is actually longer this is enough to give an idea of what is happening.

Upvotes: 2

Views: 677

Answers (1)

fridaymeetssunday
fridaymeetssunday

Reputation: 1148

Found the solution which has 2 steps:

  1. replace the sapply with a for loop which contains the instruction of the function ReturnStatsPlotsAndTables;

  2. inside the for loop the tables need to be explicitly printed using:

    print(xtable(fastq.lib))

Here is the final code:

\documentclass{report}

\begin{document}
\title{Sequencing Quality Report}
\author{Deep Sequencing Group - SFB655}
\maketitle


<<knitr_option, cache=FALSE, echo=FALSE, results='hide'>>=
library(knitr)
## set global chunk options
opts_chunk$set(fig.align='center', fig.width=14, fig.heigth=8, out.width="1.2\\textwidth",  par=TRUE)
@


<<R_arguments, cache=FALSE, echo=FALSE, include=FALSE>>=

###### Libraries ######
library(reshape)
library(ggplot2)
theme_set(theme_bw(16)) # removes grey grid and increases letter size. Ideal for presentations
library(RColorBrewer)
library(plyr)
library(scales) # for natural numbers in axis
library(xtable)
library(rattle) # needed to generate a table in knitr?
#######################


###### Function definitions ######
## ggplot theme with extra space between legends and axis
gg.axis.space <- theme(axis.title.y=element_text(vjust=0.2), axis.title.x=element_text(vjust=0.2))

@


\chapter{Preamble}

This an automated quality control report generated for the following fastq files:

<<table_files, echo=FALSE, results="asis">>=
##############################################
## loop over fastQC folder and parse txt files:

## list and read fastqc_data.temp old files
# testing #
# setwd("/projects/seq-work/analysis/martinad/p0196-totalRNA/")
folder <- "./"
filenames <- list.files(path=folder, pattern="fastqc_data.temp", recursive=TRUE) 
fq_fastqc.folders <- unique(dirname(filenames)) # the folders that contain fastQC
fastq.libs <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fq_fastqc.folders, perl=T))
xtable(fastq.libs)
@



\chapter{FastQC}

<<loop_n_plots, echo=FALSE, results="asis">>=
## do the plotting
# sapply(fq_fastqc.folders[1:3], ReturnStatsPlotsAndTables)
for (fqc.folder in fq_fastqc.folders[1:2]){
   # for(fqc.folder in fq_fastqc.folders){
   ######################################
   ## for each folder in the vector will
   ## plot stats and 
   ## print tables of fastQC results
#    print(fqc.folder)
   ## which library is being analysed?
   fastq.lib <- data.frame(Libraries = gsub(".*/(L.*)\\.fq_fastqc", "\\1", fqc.folder, perl=T))
   print(xtable(fastq.lib))

   ## Basic statistics - table ##
   stats.path <- paste(fqc.folder, "/", "Basic_Statistics_fastqc_data.temp", sep="")
   basic.stats <- read.table(stats.path, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
#    basic.stats[ ,1:2]
   print(xtable(basic.stats[ ,1:2])) 


   ## Summary of filters - table ##
   stats.path <- paste(fqc.folder, "/", "filters_summary_fastqc_data.temp", sep="")
   summary.filters <-  read.table(stats.path, 
      header = TRUE, sep = "\t", stringsAsFactors = FALSE)
#    summary.filters
   print(xtable(summary.filters))


   ## Per base sequence quality ##
   stats.path <- paste(fqc.folder, "/", "Per_base_sequence_quality_fastqc_data.temp", sep="")
   base.qual <- read.table(stats.path, 
      header = TRUE, sep = "\t", stringsAsFactors = FALSE)


   base.qual$Base <- factor(base.qual$Base, as.character(base.qual$Base)) # re-order the levels by order of appearance in DF

   plot.new()
   base.qual.p <- ggplot(base.qual, aes(x = Base, ymin = X10th.Percentile, lower = Lower.Quartile, middle = Median, upper = Upper.Quartile, ymax = X90th.Percentile, fill = Lower.Quartile)) + geom_boxplot(stat = "identity") + 
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + 
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=0, ymax=20, alpha=0.1, fill="red") +
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=20, ymax=28, alpha=0.1, fill="yellow") +
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=28, ymax=Inf, alpha=0.1, fill="green") +
      ggtitle("Per base sequence quality") + ylab("Quality score (Phred score) ") + xlab("Position of base in read")

   print(base.qual.p)

}
@

\end{document}

Upvotes: 2

Related Questions