Reputation: 157
I am using the R/Bioconductor package GOstats. I am having a problem that isn't related to the function of the program only my ability to redirect output in Rscripts. I am running a procedure called 'termGraph' and the output is a list of graphNEL graphs which I can plot individually using the code below.
y2 = termGraphs(hgOver2, use.terms=TRUE, pvalue=0.01)
y2.1 = termGraphs(hgCondOver2, use.terms=TRUE, pvalue=0.01)
b1 <- y2$`1`
b2 <- y2.1$`1`
png(paste(dt,"_GO_Tree_hgOver_BP_q_v_all_under_0.01.png", sep=""), width=6*1000, height=4*1000)
par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
plotGOTermGraph(b1, hgOver2, node.colors=c(sig="darkorange", not="deepskyblue1"), node.shape="ellipse", add.counts=TRUE)
dev.off()
png(paste(dt,"_GO_Tree_hgCondOver_BP_q_v_all_0.01.png", sep=""), width=6*1000, height=4*1000)
par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
plotGOTermGraph(b2, hgCondOver2, node.colors=c(sig="darkorange", not="deepskyblue1"), node.shape="ellipse", add.counts=TRUE)
dev.off()
The accessor for each graph is 1
to an unknown amount. There could be as many as 200 graphNEL graphs. I would like to be able to pull out each graphNEL graph from y2 and y2.1 and plot them. Afterward, I would throw out the 1 node or 2 node graphs (unless they are relevant)
Here is the whole Rscript if that helps. I have two csv files of expression data generated by limma or xps.
#!/usr/bin/Rscript --vanilla --slave
today <- Sys.Date()
dt <- format(today, format="%d%b%y")
library(Biobase)
library(org.Rn.eg.db)
library(GO.db)
library(annotate)
library(GOstats)
library(genefilter)
library(xtable)
library(Rgraphviz)
library(ragene20sttranscriptcluster.db)
## Change to data directory
setwd("/data/13Aug15/gostats/")
## input reads for filtering
raw <- read.csv("27Mar15_VSN_expression_values.csv", sep=",", header=T)
colnames(raw) <- c("TranscriptClusterID", "CTR1_Mix1", "CTR2_Mix1", "CTR3_Mix1", "RA1_Mix2", "RA2_Mix2", "RA3_Mix2")
#import annotation
annot <- read.csv("RA-CTR_unfiltered_annotated.csv", sep=",", header=T)
# merge the data and clean it up
data <- merge(raw, annot, by="TranscriptClusterID", all.x=TRUE)
keeps <- c("TranscriptClusterID", "EntrezID", "CTR1_Mix1", "CTR2_Mix1", "CTR3_Mix1", "RA1_Mix2", "RA2_Mix2", "RA3_Mix2", "P.Value", "adj.P.Val", "B", "AveExpr")
data2 <- data[keeps]
x_data <- data2[data2$AveExpr > 6.25, ]
q_data <- x_data[x_data$adj.P.Val < 0.05, ]
up_q <- q_data[q_data$logFC > 0, ]
dn_q <- q_data[q_data$logFC < 0, ]
# filter by possestion of a valid EntrezID and clean
x_haveEntrez <- x_data[x_data$EntrezID > 0, ]
row.names(x_haveEntrez) <- x_haveEntrez$TranscriptClusterID
keeps <- c("RA1_Mix2", "RA2_Mix2", "RA3_Mix2", "CTR1_Mix1", "CTR2_Mix1", "CTR3_Mix1")
x_data2 <- x_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
q_data2 <- q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
up_q2 <- up_q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
dn_q2 <- dn_q_haveEntrez[keeps]
## Convert to ExpressionSet
x_data2 <- as.matrix(x_data2)
x_data3 <- new("ExpressionSet", exprs=x_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
x_data4 <- x_data3[haveGO, ]
## Convert to ExpressionSet
q_data2 <- as.matrix(q_data2)
q_data3 <- new("ExpressionSet", exprs=q_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
q_data4 <- q_data3[haveGO, ]
## Convert to ExpressionSet
up_q2 <- as.matrix(up_q2)
up_q3 <- new("ExpressionSet", exprs=up_q2)
# filter by possestion of a valid EntrezID and clean
q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
q_data2 <- q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
up_q2 <- up_q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
dn_q2 <- dn_q_haveEntrez[keeps]
## Convert to ExpressionSet
x_data2 <- as.matrix(x_data2)
x_data3 <- new("ExpressionSet", exprs=x_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
x_data4 <- x_data3[haveGO, ]
## Convert to ExpressionSet
q_data2 <- as.matrix(q_data2)
q_data3 <- new("ExpressionSet", exprs=q_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
q_data4 <- q_data3[haveGO, ]
## Convert to ExpressionSet
up_q2 <- as.matrix(up_q2)
up_q3 <- new("ExpressionSet", exprs=up_q2)
# filter by possestion of a valid EntrezID and clean
q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
q_data2 <- q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
up_q2 <- up_q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
dn_q2 <- dn_q_haveEntrez[keeps]
## Convert to ExpressionSet
x_data2 <- as.matrix(x_data2)
x_data3 <- new("ExpressionSet", exprs=x_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
x_data4 <- x_data3[haveGO, ]
## Convert to ExpressionSet
q_data2 <- as.matrix(q_data2)
q_data3 <- new("ExpressionSet", exprs=q_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
q_data4 <- q_data3[haveGO, ]
## Convert to ExpressionSet
up_q2 <- as.matrix(up_q2)
up_q3 <- new("ExpressionSet", exprs=up_q2)
# filter by possestion of a valid EntrezID and clean
q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
q_data2 <- q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
up_q2 <- up_q_haveEntrez[keeps]
# filter by possestion of a valid EntrezID and clean
dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
dn_q2 <- dn_q_haveEntrez[keeps]
## Convert to ExpressionSet
x_data2 <- as.matrix(x_data2)
x_data3 <- new("ExpressionSet", exprs=x_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
x_data4 <- x_data3[haveGO, ]
## Convert to ExpressionSet
q_data2 <- as.matrix(q_data2)
q_data3 <- new("ExpressionSet", exprs=q_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
q_data4 <- q_data3[haveGO, ]
## Convert to ExpressionSet
up_q2 <- as.matrix(up_q2)
up_q3 <- new("ExpressionSet", exprs=up_q2)
## Convert to ExpressionSet
dn_q2 <- as.matrix(dn_q2)
dn_q3 <- new("ExpressionSet", exprs=dn_q2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(dn_q3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
dn_q4 <- dn_q3[haveGO, ]
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(up_q3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
up_q4 <- up_q3[haveGO, ]
## Format the results for using with GOstats
chipAffyUniverse <- featureNames(x_data4)
chipEntrezUniverse <- mget(chipAffyUniverse, ragene20sttranscriptclusterENTREZID)
chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))
## Format the results for use with GOstats
q_selectedEntrezIds <- unlist(mget(featureNames(q_data4), ragene20sttranscriptclusterENTREZID))
q_selectedEntrezIds <- unique(unlist(q_selectedEntrezIds))
## Format the results for use with GOstats
up_q_selectedEntrezIds <- unlist(mget(featureNames(up_q4), ragene20sttranscriptclusterENTREZID))
up_q_selectedEntrezIds <- unique(unlist(up_q_selectedEntrezIds))
## Format the results for use with GOstats
dn_q_selectedEntrezIds <- unlist(mget(featureNames(dn_q4), ragene20sttranscriptclusterENTREZID))
dn_q_selectedEntrezIds <- unique(unlist(dn_q_selectedEntrezIds))
##
## Run GOstats
##
hgCutoff <- 0.001
## q selected up/down versus all over 6.25 OVER
params1 <- new("GOHyperGParams", geneIds=q_selectedEntrezIds, universeGeneIds=chipEntrezUniverse, annotation="ragene20sttranscriptcluster.db", ontology="BP", pvalueCutoff=hgCutoff, conditional=FALSE, testDirection="over")
paramsCond1 <- params1
conditional(paramsCond1) <- TRUE
hgOver1 <- hyperGTest(params1)
hgCondOver1 <- hyperGTest(paramsCond1)
## Generate hmtl result files
htmlReport(hgOver1, file=paste(dt,"_hgOver_BP_q_v_all.html", sep=""))
htmlReport(hgCondOver1, file=paste(dt,"_hgCondOver_BP_q_v_all.html", sep=""))
# Get the tables out
dat <- as.data.frame(summary(hgOver1))
dat1 <- as.data.frame(summary(hgCondOver1))
write.table(dat, file=paste(dt, "_hgOver_BP_q_v_all.txt", sep=""), sep="\t", col.names=T, row.names=T)
write.table(dat1, file=paste(dt, "_hgCondOver_BP_q_v_all.txt", sep=""), sep="\t", col.names=T, row.names=T)
# Ok get the graph out
y1 = termGraphs(hgOver1, use.terms=TRUE, pvalue=0.01)
y1.1 = termGraphs(hgCondOver1, use.terms=TRUE, pvalue=0.01)
a1 <- y1$`1`
a2 <- y1.1$`1`
png(paste(dt,"_GO_Tree_hgOver_BP_q_v_all_0.01.png", sep=""), width=6*1000, height=4*1000)
par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
plotGOTermGraph(a1, hgOver1, node.colors=c(sig="darkorange", not="mediumspringgreen"), node.shape="ellipse", add.counts=TRUE)
dev.off()
png(paste(dt,"_GO_Tree_hgCondOver_BP_q_v_all_0.01.png", sep=""), width=6*1000, height=4*1000)
par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
plotGOTermGraph(a2, hgCondOver1, node.colors=c(sig="darkorange", not="mediumsringgreem"), node.shape="ellipse", add.counts=TRUE)
dev.off()
save.image(file=paste(dt,"_1.RData", sep=""))
# clean all
rm(list=ls(all=TRUE))
quit("yes")
If you have a better way to get the different plots out from termGraph. I would like to hear about it. Thanks!
Upvotes: 1
Views: 436
Reputation: 3397
I recommend to filter your data before applying other functions, this way you will have less data to manage. I would also recommend you to write your own functions to reduce same code for different data sets. You could do:
data_with_entrez <- data[data$EntrezID > 0, ]
or create your own function:
filter_clean <- function(data, keeps){
data_haveEntrez <- data[data$EntrezID > 0, ]
row.names(data_haveEntrez) <- data_haveEntrez$TranscriptClusterID
data_out <- data_haveEntrez[keeps]
return(data_out)
}
This way you just need to do:
q_data2 <- filter_clean(q_data, keeps)
up_q2 <- filter_clean(up_q, keeps)
The same with
## Convert to ExpressionSet
q_data2 <- as.matrix(q_data2)
q_data3 <- new("ExpressionSet", exprs=q_data2)
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
q_data4 <- q_data3[haveGO, ]
Could become:
convert_filter_go <- function(data){
expr_set <- new("ExpressionSet", exprs=as.matrix(data2))
# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(expr_set), ragene20sttranscriptclusterGO),
function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
return(expr_set[haveGO, ])
}
You could apply this function to any of your data sets, improving readability and avoiding errors. You could do the same for:
## Format the results for using with GOstats
chipAffyUniverse <- featureNames(x_data4)
chipEntrezUniverse <- mget(chipAffyUniverse, ragene20sttranscriptclusterENTREZID)
chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))
Which would become:
formating_gostats <- function(data){
chipAffyUniverse <- featureNames(data)
chipEntrezUniverse <- mget(chipAffyUniverse, ragene20sttranscriptclusterENTREZID)
chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))
return(chipEntrezUniverse)
}
If you do:
write.table(dat, file=paste(dt, "_hgOver_BP_q_v_all.txt", sep=""), sep="\t", col.names=T, row.names=T)
You are creating a tab separated file (values, and I would recommend to change the filename to "_hgOver_BP_q_v_all.tsv"
Now your question. You want to iterate over an unknown amount of data. I recommend to do the following:
plot_png <- function(df){
len <- length(names(df))
for (i in 1:len){
a1 <- df[len[i]]
# Plot the data for this case using the function you already use...}
}
Your next comment
Afterward, I would throw out the 1 node or 2 node graphs (unless they are relevant)
It is not clear, how you known if it is relevant this node or not, and should probably be another separate question
Upvotes: 1