Reputation: 6314
I have a big matrix of positive values:
set.seed(1)
mat <- matrix(abs(rnorm(130000*1000)),nrow=130000,ncol=1000)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
The rownames of mat are associated with a parent.id:
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))
such that every few rows are associated with the same parent.id.
I need to compute these operations for every row in mat:
- mean of the log of the row elements
- mean proportion of that row out of all rows with the same parent.id
- mean probit of the proportion of that row out of all rows with the same parent.id
- sd of the probit of the proportion of that row out of all rows with the same parent.id
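(For context, and assuming I'm reading VGAM correctly: its probit() is the standard-normal quantile link, so it should agree with qnorm(), which means the last two statistics are just the mean and sd of qnorm() of the per-row proportions. A quick sanity check:)
require(VGAM)
# my assumption, not quoted from VGAM's docs: probit(p) equals qnorm(p) for p in (0, 1)
p <- c(0.1, 0.5, 0.9)
all.equal(probit(p), qnorm(p))  # expected TRUE under that assumption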
Naturally this is the first solution that comes to mind:
require(VGAM)      # probit()
require(parallel)  # mclapply()
res.df <- do.call(rbind, mclapply(1:nrow(mat), function(x) {
  # all rows that share a parent.id with row x
  idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
  data.frame(mean.log        = mean(log(mat[x, ])),
             mean.proportion = mean(mat[x, ] / apply(mat[idx, ], 2, sum)),
             mean.probit     = mean(probit(mat[x, ] / apply(mat[idx, ], 2, sum))),
             sd.probit       = sd(probit(mat[x, ] / apply(mat[idx, ], 2, sum))))
}))
But I'm wondering if there's any way to achieve this faster.
P.S. I don't think using data.table instead of matrix is the way to go:
require(data.table)
require(microbenchmark)
require(VGAM)
set.seed(1)
mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5))
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(as.numeric(log(mat[x,]))),
mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))))
})))
Unit: milliseconds
 expr: [the do.call/lapply expression above]
      min       lq     mean   median       uq     max neval
 65.08929 66.49415 69.78937 67.70534 70.38044 206.017   100
Compared to:
set.seed(1)
mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5)
rownames(mat) <- paste("r",1:nrow(mat),sep="")
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
require(VGAM)
microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
data.frame(mean.log=mean(log(mat[x,])),
mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
})))
Unit: milliseconds
 expr: [the do.call/lapply expression above]
min lq mean median uq max neval
10.15047 10.2894 10.69573 10.428 10.69741 14.56724 100
Unless applying as.numeric each time I want to run an operation on a data.table row is a bad idea.
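For what it's worth, the per-row as.numeric() is needed because subsetting a data.table by row returns a one-row data.table (i.e. a list), not a numeric vector, and coercing it on every iteration is extra work the matrix version doesn't pay. A toy illustration (the object dt below is made up for the example, it is not the benchmark data):
require(data.table)
dt <- data.table(a = 1:3, b = 4:6)
class(dt[1, ])       # "data.table" "data.frame": a one-row table, not a vector
as.numeric(dt[1, ])  # coerces that one-row list to a numeric vector: 1 4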
Upvotes: 0
Views: 97
Reputation: 132989
I don't think using data.table instead of matrix is the way to go
Obviously, you have to actually use data.table. It's not a magic wand that optimizes your code without any effort on your part: you need to use data.table syntax.
I need to compute these operations for every row in mat:
- mean of the log of the row elements
- mean proportion of that row out of all rows with the same parent.id
- mean probit of the proportion of that row out of all rows with the same parent.id
- sd of the probit of the proportion of that row out of all rows with the same parent.id
I think this might do what you need:
library(data.table)
library(VGAM)  # probit()

# bind the row/parent mapping to the matrix and reshape to long format:
# one row per combination of row.id and matrix column
DT <- data.table(row.ids.df, mat)
DT <- melt(DT, id.vars = c("row.id", "parent.id"))

# per-column proportion of each value among rows sharing the same parent.id
DT[, proportion := value / sum(value), by = .(variable, parent.id)]

# summarise per original row
res <- DT[, .(
  mean.log        = mean(log(value)),
  mean.proportion = mean(proportion),
  mean.probit     = mean(probit(proportion)),
  sd.probit       = sd(probit(proportion))), by = row.id]
all.equal(res[["sd.probit"]],
res.df[["sd.probit"]])
#[1] TRUE
#(Tested with 100 rows and 30 columns.)
I expect it to be more efficient, but it's definitely more readable.
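If you want to quantify "more efficient", here is one way the comparison could be run. This is only a sketch: dt_version() is a name I made up wrapping the pipeline above, and it assumes mat and row.ids.df are already in the workspace.
require(microbenchmark)
require(data.table)
require(VGAM)  # probit()

# wrap the data.table pipeline above in a function so it can be timed as a unit
dt_version <- function(mat, row.ids.df) {
  DT <- data.table(row.ids.df, mat)
  DT <- melt(DT, id.vars = c("row.id", "parent.id"))
  DT[, proportion := value / sum(value), by = .(variable, parent.id)]
  DT[, .(mean.log        = mean(log(value)),
         mean.proportion = mean(proportion),
         mean.probit     = mean(probit(proportion)),
         sd.probit       = sd(probit(proportion))), by = row.id]
}

# time it on a small instance first; the full 130000 x 1000 matrix melts to
# 130 million rows, so check you have the memory before scaling up
microbenchmark(dt_version(mat, row.ids.df), times = 10)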
Upvotes: 1