Henri Wathieu
Henri Wathieu

Reputation: 121

R: Automated Survival Analysis

Below is example data where in genomicmatrix, each row corresponds to a gene ("sample"), and each cell corresponds to a value for that gene for a patient after which the column is named (in the format "TCGA-__-____-__"). (Question continues below)

genomicmatrix <- data.frame("sample" =  c("BIX","HEF","TUR","ZOP","VAG"), 
                  "TCGA-K4-6303-01" = runif(5, -1, 1),
                  "TCGA-DM-A28E-01" = runif(5, -1, 1),
                  "TCGA-AY-6197-01" = runif(5, -1, 1),
                  "TCGA-F4-6703-01" = runif(5, -1, 1),
                  "TCGA-HB-KH8H-01" = runif(5, -1, 1),
                  "TCGA-Y7-PIK2-01" = runif(5, -1, 1),
                  "TCGA-A6-5657-01" = runif(5, -1, 1))
colnames(genomicmatrix) <- gsub("[.]", "_",colnames(genomicmatrix))

sample = NULL
sample <- genomicmatrix$sample

genomicmatrix$sample = NULL
means = NULL

for(z in 1:nrow(genomicmatrix)) {
  means[z] <- rowMeans(genomicmatrix[z,])
}

genemeans <- data.frame(sample, means)

So, after finding the mean value for each row (gene) as above, I extract the patient names that have a value for that gene which is GREATER than the mean value for that gene. Those "greater than" patients for each gene go into a list for that gene in an element called high (e.g. for the fourth gene, the "greater than" patients appear in high[[4]]). The same goes for "lesser than" patients, which go to an element called low.

high = NULL
low = NULL
high <- list(list())
low <- list(list())
uplist = NULL
downlist = NULL
for (i in 1:nrow(genomicmatrix)) {
  uplist = NULL
  downlist = NULL
  for (y in seq_along(genomicmatrix)) {
    uplist[y] <- ifelse(genomicmatrix[i,y] > genemeans$means[i], names(genomicmatrix[y]), "")
    downlist[y] <- ifelse(genomicmatrix[i,y] < genemeans$means[i], names(genomicmatrix[y]), "")
    high[[i]] <- uplist
    low[[i]] <- downlist
  }
}

So, for each gene, I split the patients in "high expression" and "low expression" categories. For example, the patients that show up in low[[3]] are those that have an expression for the third gene ("TUR") that is lower than the average for that gene. Below, I have a conversion table for patientID - to survival time in days.

survival = NULL
survival$sampleID <- c("TCGA-K4-6303-01", "TCGA-DM-A28E-01", "TCGA-AY-6197-01", "TCGA-F4-6703-01", "TCGA-HB-KH8H-01", "TCGA-Y7-PIK2-01", "TCGA-A6-5657-01")
survival$X_OS <- c(256, 26, 88, 491, 553, 177, 732)
survival$sampleID <- chartr("-", "_", survival$sampleID)

I'd like to, from that setup, extract log rank test pvalues for each gene. That is, for gene 1 ("BIX") for example, given the Kaplan-Meier survival curves for high expression versus low expression (i.e. high[[1]] vs low[[1]]), I wish to extract the corresponding pvalue coming from a log rank test of those two vectors (answering the question: is there a significant difference in survival outcome for high expression and low expression patients FOR THAT GENE?). Once that pvalue is derived, it should of course move on to the next gene.

Upvotes: 0

Views: 764

Answers (1)

alexis_laz
alexis_laz

Reputation: 13122

(If you're only asking for a tool to perform an operation or for statistical advice, then StackOverflow might not be the right place for this question.)

Nonetheless, I'd suggest some improvements with the format of your data and your code that should be helpful in achieving your goals in R. If you have not significant memory constraints, you could transform your "genomicmatrix" into a long format "data.frame":

longDF = reshape(genomicmatrix, direction = "long", idvar = "sample", 
                 varying = list(2:8), times = colnames(genomicmatrix[-1]), 
                 timevar = "ID", v.names = "value")
row.names(longDF) = NULL
head(longDF)
#  sample              ID      value
#1    BIX TCGA_K4_6303_01 -0.4811441
#2    HEF TCGA_K4_6303_01 -0.2665017
#3    TUR TCGA_K4_6303_01  0.8367469
#4    ZOP TCGA_K4_6303_01 -0.5868480
#5    VAG TCGA_K4_6303_01 -0.0319600
#6    BIX TCGA_DM_A28E_01  0.3435170

Then you could find out which patients have higher and lower than mean expression and create a "data.frame":

exprs =  do.call(rbind, 
                 lapply(split(longDF, longDF$sample), 
                        function(x) { 
                           x$expr = ifelse(findInterval(x$value, mean(x$value)) == 1, 
                                           "high", 
                                           "low") 
                           x 
                        }))
row.names(exprs) = NULL
head(exprs)
#  sample              ID      value expr
#1    BIX TCGA_K4_6303_01 -0.4811441  low
#2    BIX TCGA_DM_A28E_01  0.3435170 high
#3    BIX TCGA_AY_6197_01  0.2269158 high
#4    BIX TCGA_F4_6703_01 -0.8283441  low
#5    BIX TCGA_HB_KH8H_01  0.4024671 high
#6    BIX TCGA_Y7_PIK2_01 -0.2979979  low

Then add "survival$X_OS":

exprs$X_OS = survival$X_OS[match(exprs$ID, survival$sampleID)]
head(exprs)
#  sample              ID      value expr X_OS
#1    BIX TCGA_K4_6303_01 -0.4811441  low  256
#2    BIX TCGA_DM_A28E_01  0.3435170 high   26
#3    BIX TCGA_AY_6197_01  0.2269158 high   88
#4    BIX TCGA_F4_6703_01 -0.8283441  low  491
#5    BIX TCGA_HB_KH8H_01  0.4024671 high  553
#6    BIX TCGA_Y7_PIK2_01 -0.2979979  low  177

Then, assuming you have a function log_rank_test that takes two vectors and outputs a "p.value" you could use something like:

#lapply(split(exprs[c("expr", "X_OS")], exprs$sample), 
#        function(x) log_rank_test(x$X_OS[x$expr == "high"], x$X_OS[x$expr == "low"]))

I'm attempting a "data.table" approach, although it might not be idiomatic or could be improved since I'm not familiar with it:

library(data.table)
library(reshape2)
DT = as.data.table(genomicmatrix)
longDT = melt(DT, "sample", variable.name = "ID")
longDT[, expr := ifelse(findInterval(value, mean(value)) == 1, "high", "low"), by = sample]
longDT[, X_OS := survival$X_OS[match(ID, survival$sampleID)]]
head(longDT)
#   sample              ID      value expr X_OS
#1:    BIX TCGA_K4_6303_01 -0.4811441  low  256
#2:    HEF TCGA_K4_6303_01 -0.2665017  low  256
#3:    TUR TCGA_K4_6303_01  0.8367469 high  256
#4:    ZOP TCGA_K4_6303_01 -0.5868480  low  256
#5:    VAG TCGA_K4_6303_01 -0.0319600  low  256
#6:    BIX TCGA_DM_A28E_01  0.3435170 high   26

And the, run your log_rank_test function like:

#longDT[, log_rank_test(X_OS[expr == "high"], X_OS[expr == "low"]), by = sample]

Upvotes: 4

Related Questions