Matt Thornton
Matt Thornton

Reputation: 157

GOstats and termGraph - Getting all of the plots out of the list

I am using the R/Bioconductor package GOstats. I am having a problem that isn't related to the function of the program only my ability to redirect output in Rscripts. I am running a procedure called 'termGraph' and the output is a list of graphNEL graphs which I can plot individually using the code below.

y2 = termGraphs(hgOver2, use.terms=TRUE, pvalue=0.01)
y2.1 = termGraphs(hgCondOver2, use.terms=TRUE, pvalue=0.01)

b1 <- y2$`1`
b2 <- y2.1$`1`

png(paste(dt,"_GO_Tree_hgOver_BP_q_v_all_under_0.01.png", sep=""), width=6*1000, height=4*1000)
par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
plotGOTermGraph(b1, hgOver2, node.colors=c(sig="darkorange", not="deepskyblue1"), node.shape="ellipse", add.counts=TRUE)
dev.off()

png(paste(dt,"_GO_Tree_hgCondOver_BP_q_v_all_0.01.png", sep=""), width=6*1000, height=4*1000)
par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
plotGOTermGraph(b2, hgCondOver2, node.colors=c(sig="darkorange", not="deepskyblue1"), node.shape="ellipse", add.counts=TRUE)
dev.off()

The accessor for each graph is 1 to an unknown amount. There could be as many as 200 graphNEL graphs. I would like to be able to pull out each graphNEL graph from y2 and y2.1 and plot them. Afterward, I would throw out the 1 node or 2 node graphs (unless they are relevant)

Here is the whole Rscript if that helps. I have two csv files of expression data generated by limma or xps.

   #!/usr/bin/Rscript --vanilla --slave

    today <- Sys.Date()
    dt <- format(today, format="%d%b%y")

    library(Biobase)
    library(org.Rn.eg.db)
    library(GO.db)
    library(annotate)
    library(GOstats)
    library(genefilter)
    library(xtable)
    library(Rgraphviz)
    library(ragene20sttranscriptcluster.db)

    ## Change to data directory
    setwd("/data/13Aug15/gostats/")

    ## input reads for filtering
    raw <- read.csv("27Mar15_VSN_expression_values.csv", sep=",", header=T)
    colnames(raw) <- c("TranscriptClusterID", "CTR1_Mix1", "CTR2_Mix1", "CTR3_Mix1", "RA1_Mix2", "RA2_Mix2", "RA3_Mix2")

    #import annotation
    annot <- read.csv("RA-CTR_unfiltered_annotated.csv", sep=",", header=T)

    # merge the data and clean it up
    data <- merge(raw, annot, by="TranscriptClusterID", all.x=TRUE)
    keeps <- c("TranscriptClusterID", "EntrezID", "CTR1_Mix1", "CTR2_Mix1", "CTR3_Mix1", "RA1_Mix2", "RA2_Mix2", "RA3_Mix2", "P.Value", "adj.P.Val", "B", "AveExpr")
    data2 <- data[keeps]

    x_data <- data2[data2$AveExpr > 6.25, ]
    q_data <- x_data[x_data$adj.P.Val < 0.05, ]
    up_q <- q_data[q_data$logFC > 0, ]
    dn_q <- q_data[q_data$logFC < 0, ]

    # filter by possestion of a valid EntrezID and clean
    x_haveEntrez <- x_data[x_data$EntrezID > 0, ]
    row.names(x_haveEntrez) <- x_haveEntrez$TranscriptClusterID
    keeps <- c("RA1_Mix2", "RA2_Mix2", "RA3_Mix2", "CTR1_Mix1", "CTR2_Mix1", "CTR3_Mix1")
    x_data2 <- x_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
    row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
    q_data2 <- q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
    row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
    up_q2 <- up_q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
    row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
    dn_q2 <- dn_q_haveEntrez[keeps]

    ## Convert to ExpressionSet
    x_data2 <- as.matrix(x_data2)
    x_data3 <- new("ExpressionSet", exprs=x_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    x_data4 <- x_data3[haveGO, ]

    ## Convert to ExpressionSet
    q_data2 <- as.matrix(q_data2)
    q_data3 <- new("ExpressionSet", exprs=q_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    q_data4 <- q_data3[haveGO, ]

    ## Convert to ExpressionSet
    up_q2 <- as.matrix(up_q2)
    up_q3 <- new("ExpressionSet", exprs=up_q2)

    # filter by possestion of a valid EntrezID and clean
    q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
    row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
    q_data2 <- q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
    row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
    up_q2 <- up_q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
    row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
    dn_q2 <- dn_q_haveEntrez[keeps]

    ## Convert to ExpressionSet
    x_data2 <- as.matrix(x_data2)
    x_data3 <- new("ExpressionSet", exprs=x_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    x_data4 <- x_data3[haveGO, ]

    ## Convert to ExpressionSet
    q_data2 <- as.matrix(q_data2)
    q_data3 <- new("ExpressionSet", exprs=q_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    q_data4 <- q_data3[haveGO, ]

    ## Convert to ExpressionSet
    up_q2 <- as.matrix(up_q2)
    up_q3 <- new("ExpressionSet", exprs=up_q2)

    # filter by possestion of a valid EntrezID and clean
    q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
    row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
    q_data2 <- q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
    row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
    up_q2 <- up_q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
    row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
    dn_q2 <- dn_q_haveEntrez[keeps]

    ## Convert to ExpressionSet
    x_data2 <- as.matrix(x_data2)
    x_data3 <- new("ExpressionSet", exprs=x_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    x_data4 <- x_data3[haveGO, ]

    ## Convert to ExpressionSet
    q_data2 <- as.matrix(q_data2)
    q_data3 <- new("ExpressionSet", exprs=q_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    q_data4 <- q_data3[haveGO, ]

    ## Convert to ExpressionSet
    up_q2 <- as.matrix(up_q2)
    up_q3 <- new("ExpressionSet", exprs=up_q2)

    # filter by possestion of a valid EntrezID and clean
    q_haveEntrez <- q_data[q_data$EntrezID > 0, ]
    row.names(q_haveEntrez) <- q_haveEntrez$TranscriptClusterID
    q_data2 <- q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    up_q_haveEntrez <- up_q[up_q$EntrezID > 0, ]
    row.names(up_q_haveEntrez) <- up_q_haveEntrez$TranscriptClusterID
    up_q2 <- up_q_haveEntrez[keeps]

    # filter by possestion of a valid EntrezID and clean
    dn_q_haveEntrez <- dn_q[dn_q$EntrezID > 0, ]
    row.names(dn_q_haveEntrez) <- dn_q_haveEntrez$TranscriptClusterID
    dn_q2 <- dn_q_haveEntrez[keeps]

    ## Convert to ExpressionSet
    x_data2 <- as.matrix(x_data2)
    x_data3 <- new("ExpressionSet", exprs=x_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(x_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    x_data4 <- x_data3[haveGO, ]

    ## Convert to ExpressionSet
    q_data2 <- as.matrix(q_data2)
    q_data3 <- new("ExpressionSet", exprs=q_data2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    q_data4 <- q_data3[haveGO, ]

    ## Convert to ExpressionSet
    up_q2 <- as.matrix(up_q2)
    up_q3 <- new("ExpressionSet", exprs=up_q2)

    ## Convert to ExpressionSet
    dn_q2 <- as.matrix(dn_q2)
    dn_q3 <- new("ExpressionSet", exprs=dn_q2)

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(dn_q3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    dn_q4 <- dn_q3[haveGO, ]

    # filter by posession of a valid GO term
    haveGO <- sapply(mget(featureNames(up_q3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
    up_q4 <- up_q3[haveGO, ]

    ## Format the results for using with GOstats
    chipAffyUniverse <- featureNames(x_data4)
    chipEntrezUniverse <- mget(chipAffyUniverse, ragene20sttranscriptclusterENTREZID)
    chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))

    ## Format the results for use with GOstats
    q_selectedEntrezIds <- unlist(mget(featureNames(q_data4), ragene20sttranscriptclusterENTREZID))
    q_selectedEntrezIds <- unique(unlist(q_selectedEntrezIds))

    ## Format the results for use with GOstats
    up_q_selectedEntrezIds <- unlist(mget(featureNames(up_q4), ragene20sttranscriptclusterENTREZID))
    up_q_selectedEntrezIds <- unique(unlist(up_q_selectedEntrezIds))

    ## Format the results for use with GOstats
    dn_q_selectedEntrezIds <- unlist(mget(featureNames(dn_q4), ragene20sttranscriptclusterENTREZID))
    dn_q_selectedEntrezIds <- unique(unlist(dn_q_selectedEntrezIds))
##
## Run GOstats
##

hgCutoff <- 0.001

## q selected up/down versus all over 6.25 OVER

params1 <- new("GOHyperGParams", geneIds=q_selectedEntrezIds, universeGeneIds=chipEntrezUniverse, annotation="ragene20sttranscriptcluster.db", ontology="BP", pvalueCutoff=hgCutoff, conditional=FALSE, testDirection="over")
paramsCond1 <- params1
conditional(paramsCond1) <- TRUE
hgOver1 <- hyperGTest(params1)
hgCondOver1 <- hyperGTest(paramsCond1)

## Generate hmtl result files
htmlReport(hgOver1, file=paste(dt,"_hgOver_BP_q_v_all.html", sep=""))
htmlReport(hgCondOver1, file=paste(dt,"_hgCondOver_BP_q_v_all.html", sep=""))

# Get the tables out
dat <- as.data.frame(summary(hgOver1))
dat1 <- as.data.frame(summary(hgCondOver1))

write.table(dat, file=paste(dt, "_hgOver_BP_q_v_all.txt", sep=""), sep="\t", col.names=T, row.names=T)
write.table(dat1, file=paste(dt, "_hgCondOver_BP_q_v_all.txt", sep=""), sep="\t", col.names=T, row.names=T)

# Ok get the graph out
y1 = termGraphs(hgOver1, use.terms=TRUE, pvalue=0.01)
y1.1 = termGraphs(hgCondOver1, use.terms=TRUE, pvalue=0.01)

    a1 <- y1$`1`
    a2 <- y1.1$`1`
    png(paste(dt,"_GO_Tree_hgOver_BP_q_v_all_0.01.png", sep=""), width=6*1000, height=4*1000)
    par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
    plotGOTermGraph(a1, hgOver1, node.colors=c(sig="darkorange", not="mediumspringgreen"), node.shape="ellipse", add.counts=TRUE)
    dev.off()

    png(paste(dt,"_GO_Tree_hgCondOver_BP_q_v_all_0.01.png", sep=""), width=6*1000, height=4*1000)
    par(mar=c(5,5,2,2), xaxs ="i", yaxs="i", cex.axis=1.3, cex.lab=1.4)
    plotGOTermGraph(a2, hgCondOver1, node.colors=c(sig="darkorange", not="mediumsringgreem"), node.shape="ellipse", add.counts=TRUE)
    dev.off()

save.image(file=paste(dt,"_1.RData", sep=""))

# clean all
rm(list=ls(all=TRUE))

quit("yes")

If you have a better way to get the different plots out from termGraph. I would like to hear about it. Thanks!

Upvotes: 1

Views: 436

Answers (1)

llrs
llrs

Reputation: 3397

I recommend to filter your data before applying other functions, this way you will have less data to manage. I would also recommend you to write your own functions to reduce same code for different data sets. You could do:

data_with_entrez <- data[data$EntrezID > 0, ]

or create your own function:

filter_clean <- function(data, keeps){
data_haveEntrez <- data[data$EntrezID > 0, ]
row.names(data_haveEntrez) <- data_haveEntrez$TranscriptClusterID
data_out <- data_haveEntrez[keeps]
return(data_out)
}

This way you just need to do:

q_data2 <- filter_clean(q_data, keeps)
up_q2 <- filter_clean(up_q, keeps)

The same with

## Convert to ExpressionSet
q_data2 <- as.matrix(q_data2)
q_data3 <- new("ExpressionSet", exprs=q_data2)

# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(q_data3), ragene20sttranscriptclusterGO), function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
q_data4 <- q_data3[haveGO, ]

Could become:

convert_filter_go <- function(data){
expr_set <- new("ExpressionSet", exprs=as.matrix(data2))

# filter by posession of a valid GO term
haveGO <- sapply(mget(featureNames(expr_set), ragene20sttranscriptclusterGO), 
           function(x) { if (length(x) == 1 && is.na(x)) FALSE else TRUE})
return(expr_set[haveGO, ])
}

You could apply this function to any of your data sets, improving readability and avoiding errors. You could do the same for:

## Format the results for using with GOstats
chipAffyUniverse <- featureNames(x_data4)
chipEntrezUniverse <- mget(chipAffyUniverse, ragene20sttranscriptclusterENTREZID)
chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))

Which would become:

formating_gostats <- function(data){
chipAffyUniverse <- featureNames(data)
chipEntrezUniverse <- mget(chipAffyUniverse, ragene20sttranscriptclusterENTREZID)
chipEntrezUniverse <- unique(unlist(chipEntrezUniverse))
return(chipEntrezUniverse)
}

If you do:

write.table(dat, file=paste(dt, "_hgOver_BP_q_v_all.txt", sep=""), sep="\t", col.names=T, row.names=T)

You are creating a tab separated file (values, and I would recommend to change the filename to "_hgOver_BP_q_v_all.tsv"

Now your question. You want to iterate over an unknown amount of data. I recommend to do the following:

plot_png <- function(df){
    len <-  length(names(df))
    for (i in 1:len){
        a1 <- df[len[i]]
        # Plot the data for this case using the function you already use...}
}

Your next comment

Afterward, I would throw out the 1 node or 2 node graphs (unless they are relevant)

It is not clear, how you known if it is relevant this node or not, and should probably be another separate question

Upvotes: 1

Related Questions