Reputation: 121
Below is example data where in genomicmatrix
, each row corresponds to a gene ("sample"
), and each cell corresponds to a value for that gene for a patient after which the column is named (in the format "TCGA-__-____-__"
). (Question continues below)
genomicmatrix <- data.frame("sample" = c("BIX","HEF","TUR","ZOP","VAG"),
"TCGA-K4-6303-01" = runif(5, -1, 1),
"TCGA-DM-A28E-01" = runif(5, -1, 1),
"TCGA-AY-6197-01" = runif(5, -1, 1),
"TCGA-F4-6703-01" = runif(5, -1, 1),
"TCGA-HB-KH8H-01" = runif(5, -1, 1),
"TCGA-Y7-PIK2-01" = runif(5, -1, 1),
"TCGA-A6-5657-01" = runif(5, -1, 1))
colnames(genomicmatrix) <- gsub("[.]", "_",colnames(genomicmatrix))
sample = NULL
sample <- genomicmatrix$sample
genomicmatrix$sample = NULL
means = NULL
for(z in 1:nrow(genomicmatrix)) {
means[z] <- rowMeans(genomicmatrix[z,])
}
genemeans <- data.frame(sample, means)
So, after finding the mean value for each row (gene) as above, I extract the patient names that have a value for that gene which is GREATER than the mean value for that gene. Those "greater than" patients for each gene go into a list for that gene in an element called high
(e.g. for the fourth gene, the "greater than" patients appear in high[[4]]
). The same goes for "lesser than" patients, which go to an element called low
.
high = NULL
low = NULL
high <- list(list())
low <- list(list())
uplist = NULL
downlist = NULL
for (i in 1:nrow(genomicmatrix)) {
uplist = NULL
downlist = NULL
for (y in seq_along(genomicmatrix)) {
uplist[y] <- ifelse(genomicmatrix[i,y] > genemeans$means[i], names(genomicmatrix[y]), "")
downlist[y] <- ifelse(genomicmatrix[i,y] < genemeans$means[i], names(genomicmatrix[y]), "")
high[[i]] <- uplist
low[[i]] <- downlist
}
}
So, for each gene, I split the patients in "high expression" and "low expression" categories. For example, the patients that show up in low[[3]]
are those that have an expression for the third gene ("TUR"
) that is lower than the average for that gene. Below, I have a conversion table for patientID - to survival time in days.
survival = NULL
survival$sampleID <- c("TCGA-K4-6303-01", "TCGA-DM-A28E-01", "TCGA-AY-6197-01", "TCGA-F4-6703-01", "TCGA-HB-KH8H-01", "TCGA-Y7-PIK2-01", "TCGA-A6-5657-01")
survival$X_OS <- c(256, 26, 88, 491, 553, 177, 732)
survival$sampleID <- chartr("-", "_", survival$sampleID)
I'd like to, from that setup, extract log rank test pvalues for each gene. That is, for gene 1 ("BIX"
) for example, given the Kaplan-Meier survival curves for high expression versus low expression (i.e. high[[1]]
vs low[[1]]
), I wish to extract the corresponding pvalue coming from a log rank test of those two vectors (answering the question: is there a significant difference in survival outcome for high expression and low expression patients FOR THAT GENE?). Once that pvalue is derived, it should of course move on to the next gene.
Upvotes: 0
Views: 764
Reputation: 13122
(If you're only asking for a tool to perform an operation or for statistical advice, then StackOverflow might not be the right place for this question.)
Nonetheless, I'd suggest some improvements with the format of your data and your code that should be helpful in achieving your goals in R. If you have not significant memory constraints, you could transform your "genomicmatrix" into a long format "data.frame":
longDF = reshape(genomicmatrix, direction = "long", idvar = "sample",
varying = list(2:8), times = colnames(genomicmatrix[-1]),
timevar = "ID", v.names = "value")
row.names(longDF) = NULL
head(longDF)
# sample ID value
#1 BIX TCGA_K4_6303_01 -0.4811441
#2 HEF TCGA_K4_6303_01 -0.2665017
#3 TUR TCGA_K4_6303_01 0.8367469
#4 ZOP TCGA_K4_6303_01 -0.5868480
#5 VAG TCGA_K4_6303_01 -0.0319600
#6 BIX TCGA_DM_A28E_01 0.3435170
Then you could find out which patients have higher and lower than mean expression and create a "data.frame":
exprs = do.call(rbind,
lapply(split(longDF, longDF$sample),
function(x) {
x$expr = ifelse(findInterval(x$value, mean(x$value)) == 1,
"high",
"low")
x
}))
row.names(exprs) = NULL
head(exprs)
# sample ID value expr
#1 BIX TCGA_K4_6303_01 -0.4811441 low
#2 BIX TCGA_DM_A28E_01 0.3435170 high
#3 BIX TCGA_AY_6197_01 0.2269158 high
#4 BIX TCGA_F4_6703_01 -0.8283441 low
#5 BIX TCGA_HB_KH8H_01 0.4024671 high
#6 BIX TCGA_Y7_PIK2_01 -0.2979979 low
Then add "survival$X_OS":
exprs$X_OS = survival$X_OS[match(exprs$ID, survival$sampleID)]
head(exprs)
# sample ID value expr X_OS
#1 BIX TCGA_K4_6303_01 -0.4811441 low 256
#2 BIX TCGA_DM_A28E_01 0.3435170 high 26
#3 BIX TCGA_AY_6197_01 0.2269158 high 88
#4 BIX TCGA_F4_6703_01 -0.8283441 low 491
#5 BIX TCGA_HB_KH8H_01 0.4024671 high 553
#6 BIX TCGA_Y7_PIK2_01 -0.2979979 low 177
Then, assuming you have a function log_rank_test
that takes two vectors and outputs a "p.value" you could use something like:
#lapply(split(exprs[c("expr", "X_OS")], exprs$sample),
# function(x) log_rank_test(x$X_OS[x$expr == "high"], x$X_OS[x$expr == "low"]))
I'm attempting a "data.table" approach, although it might not be idiomatic or could be improved since I'm not familiar with it:
library(data.table)
library(reshape2)
DT = as.data.table(genomicmatrix)
longDT = melt(DT, "sample", variable.name = "ID")
longDT[, expr := ifelse(findInterval(value, mean(value)) == 1, "high", "low"), by = sample]
longDT[, X_OS := survival$X_OS[match(ID, survival$sampleID)]]
head(longDT)
# sample ID value expr X_OS
#1: BIX TCGA_K4_6303_01 -0.4811441 low 256
#2: HEF TCGA_K4_6303_01 -0.2665017 low 256
#3: TUR TCGA_K4_6303_01 0.8367469 high 256
#4: ZOP TCGA_K4_6303_01 -0.5868480 low 256
#5: VAG TCGA_K4_6303_01 -0.0319600 low 256
#6: BIX TCGA_DM_A28E_01 0.3435170 high 26
And the, run your log_rank_test
function like:
#longDT[, log_rank_test(X_OS[expr == "high"], X_OS[expr == "low"]), by = sample]
Upvotes: 4