Reputation: 259
Introduction
I've been writing a program that processes some Raw data and then passes it through several statistical processes. All in all it uses several instances of "lapply".
For example: in one part of the script I use a function known as Maxstat (note the function only takes data either as data.frame or data.table - will not accept a matrix).
**
**
Sample data:
The first two columns are neccesary. This is the large data sample seen below --> please read on after the sample
Genedata <- structure(list(time = c(120, 120, 28.96, 119.21, 59.53, 68.81,
82.29, 110.82, 65.88, 84.13, 16.47, 89.75, 76.07, 67.82, 52.24,
64.1, 55.13, 57.79, 50.1, 43.79, 30.27, 3.64, 36.59, 20.02, 118.58,
55.33, 120, 120, 120, 106.94, 11.7, 28.79, 26.82, 52.5, 24.03,
88.99, 102.44, 33.73, 85.28, 26.53, 37.31, 86.43, 55.92, 70.65,
76.04, 79.13, 83.99, 80.25, 40.6, 95.37, 31.06, 37.31, 22.02,
63.25, 34.09, 52.14, 66.04, 59.96, 47.86, 58.45, 45.99, 60.42,
37.67, 15.71, 39.25, 49.87, 25.24, 44.97, 35.9, 26.66, 36.78,
41.52, 22.22, 33.2, 39.68, 25.61, 83.99, 15.05, 8.38, 18.18,
27.15, 24.26, 105.17, 11.76, 70.45, 95.07, 112.33, 27.51, 45.92,
54.04, 103.98, 6.11, 99.51, 20.44, 76.6, 10.02, 41.45, 96.26,
85.61, 78.87, 22.25, 74.53, 59.07, 47.8, 5.68, 79.39, 74.07,
50.76, 67.82, 70.84, 50.59, 34.58, 38.72, 54.9, 67.92, 58.45,
59.34, 54.54, 19.03, 26.26, 52.86, 32.05, 55.95, 56.67, 50.43,
4.24, 49.11, 49.57, 50.49, 63.05, 49.24, 0.92, 31.36, 40.3, 116.64,
31.92, 19.98, 24.62, 18.54, 120, 17.62, 5.32, 2.36, 5.72, 15.28,
95.07, 4.96, 28.89, 3.25, 0.92, 18.77, 57.73, 14.39, 5.12, 4.99,
33.17, 50.53, 5.72, 8.02, 34.02, 1.44, 36.95, 60.75, 37.44, 23.07,
19.85, 31.85, 8.61, 42.27, 15.25, 14.56, 9.96, 8.94, 32.67, 2.07,
22.78, 22.35), cens = c(0L, 0L, NA, 0L, 0L, 1L, 0L, 0L, NA, 0L,
NA, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, NA, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, NA, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, NA, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L,
0L, 1L, 0L, NA, 1L, 1L, 1L, 0L, 0L, 0L, NA, 0L, NA, NA, 1L, 0L,
0L, 1L, 1L, 0L, 0L, 1L, NA, NA, 0L, 0L, 0L, 0L, 1L, NA, 0L, 1L,
0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, NA, 1L, 1L, 1L, 1L, NA, 0L, NA, 1L, 1L, 0L, 1L, 0L, 1L,
1L, NA, 1L, 0L, 1L, 1L, 0L)), .Names = c("time", "cens"), class = "data.frame", row.names = c(NA,
-177L))
To generate a larger set -> First define the sample above as Genedata
then the additional code (to change variable/column length, just change 1000 to 5000 or 50000 etc.
data <- replicate(1000, abs(rnorm(177)))
Genedata <- data.frame(Genedata, data)
library(data.table)
Genedata2 <- data.table(Genedata)
Then to calculate
#Calculating Maxstat
library(maxstat)
maxstat.genes <- as.list( names(Genedata) )[-c(1:3)] #generate a list of column numbers
system.time(maxstat.estimate <-lapply( maxstat.genes, function (x)
{maxstat.test(Surv(time, cens) ~ get (x), data=Genedata2, smethod="LogRank",
pmethod="Lau94")}))
I originally planned on using the for statement, as i'm familiar with it having a background in python. However, research has told me that using apply should be faster.
I also originally used data.frame, but switched to data.table as someone suggested this should be better for performance.
However, it seems there is still a significant problem with processing large datasets. Below is an explanation of why.
Running 25 variables
user system elapsed
0.04 0.00 0.05
Running 1000 variables
user system elapsed
6.54 0.00 6.57
Running 5000 variables
user system elapsed
132.89 0.00 133.02
As you can see, the processing speed is not linear but rather begins to fall off at a staggering rate. Given that the dataset is 43000 variables long, I can only guess that it would take on the order of hours. Interestingly, if i manually divided the tables into small sets, and ran them individually, it would be faster.
I think it has something to do with copying increasingly larger data, but I' not sure how this could be fixed. I already tried the data.table and lapply. Any suggestions?
Update 1: Added sample data. Note to reproduce a larger dataset. You can copy columns 4 onwards and paste. Rows can also be copied down. Should work as far as i know.
Update 2: Rprof on 1000 variables
> Rprof(NULL)
> summaryRprof()
$by.self
self.time self.pct total.time total.pct
terms.formula 0.50 15.82 0.50 15.82
eval 0.34 10.76 1.92 60.76
Surv 0.12 3.80 0.20 6.33
.deparseOpts 0.12 3.80 0.18 5.70
deparse 0.08 2.53 0.32 10.13
na.omit.data.frame 0.08 2.53 0.30 9.49
lapply 0.06 1.90 3.16 100.00
maxstat.test 0.06 1.90 3.16 100.00
maxstat.test.data.frame 0.06 1.90 3.10 98.10
model.frame.default 0.06 1.90 1.68 53.16
cmaxstat 0.06 1.90 0.38 12.03
match.arg 0.06 1.90 0.30 9.49
ifelse 0.06 1.90 0.06 1.90
is.matrix 0.06 1.90 0.06 1.90
length 0.06 1.90 0.06 1.90
match 0.06 1.90 0.06 1.90
pmatch 0.06 1.90 0.06 1.90
FUN 0.04 1.27 3.16 100.00
maxstat 0.04 1.27 0.90 28.48
terms 0.04 1.27 0.62 19.62
[.data.frame 0.04 1.27 0.20 6.33
order 0.04 1.27 0.08 2.53
[.formula 0.04 1.27 0.06 1.90
[.Surv 0.04 1.27 0.06 1.90
attr.all.equal 0.04 1.27 0.06 1.90
makepredictcall 0.04 1.27 0.06 1.90
unique 0.04 1.27 0.06 1.90
unlist 0.04 1.27 0.06 1.90
<Anonymous> 0.04 1.27 0.04 1.27
is.data.frame 0.04 1.27 0.04 1.27
names 0.04 1.27 0.04 1.27
NextMethod 0.04 1.27 0.04 1.27
sum 0.04 1.27 0.04 1.27
model.frame 0.02 0.63 1.70 53.80
na.omit 0.02 0.63 0.32 10.13
[ 0.02 0.63 0.30 9.49
cscores 0.02 0.63 0.28 8.86
cscores.Surv 0.02 0.63 0.26 8.23
all.equal 0.02 0.63 0.14 4.43
all.equal.numeric 0.02 0.63 0.12 3.80
paste 0.02 0.63 0.12 3.80
[[ 0.02 0.63 0.08 2.53
sort 0.02 0.63 0.08 2.53
[[.data.frame 0.02 0.63 0.06 1.90
sort.int 0.02 0.63 0.06 1.90
as.list 0.02 0.63 0.04 1.27
as.vector 0.02 0.63 0.04 1.27
get 0.02 0.63 0.04 1.27
- 0.02 0.63 0.02 0.63
$<- 0.02 0.63 0.02 0.63
/ 0.02 0.63 0.02 0.63
anyDuplicated 0.02 0.63 0.02 0.63
as.environment 0.02 0.63 0.02 0.63
as.list.data.frame 0.02 0.63 0.02 0.63
attr 0.02 0.63 0.02 0.63
attr<- 0.02 0.63 0.02 0.63
c 0.02 0.63 0.02 0.63
cbind 0.02 0.63 0.02 0.63
data.class 0.02 0.63 0.02 0.63
is.array 0.02 0.63 0.02 0.63
makepredictcall.default 0.02 0.63 0.02 0.63
mode 0.02 0.63 0.02 0.63
pLausen94 0.02 0.63 0.02 0.63
sys.function 0.02 0.63 0.02 0.63
$by.total
total.time total.pct self.time self.pct
lapply 3.16 100.00 0.06 1.90
maxstat.test 3.16 100.00 0.06 1.90
FUN 3.16 100.00 0.04 1.27
maxstat.test.data.frame 3.10 98.10 0.06 1.90
eval 1.92 60.76 0.34 10.76
model.frame 1.70 53.80 0.02 0.63
model.frame.default 1.68 53.16 0.06 1.90
maxstat 0.90 28.48 0.04 1.27
do.call 0.90 28.48 0.00 0.00
terms 0.62 19.62 0.04 1.27
terms.formula 0.50 15.82 0.50 15.82
cmaxstat 0.38 12.03 0.06 1.90
sapply 0.34 10.76 0.00 0.00
deparse 0.32 10.13 0.08 2.53
na.omit 0.32 10.13 0.02 0.63
na.omit.data.frame 0.30 9.49 0.08 2.53
match.arg 0.30 9.49 0.06 1.90
[ 0.30 9.49 0.02 0.63
cscores 0.28 8.86 0.02 0.63
cscores.Surv 0.26 8.23 0.02 0.63
Surv 0.20 6.33 0.12 3.80
[.data.frame 0.20 6.33 0.04 1.27
.deparseOpts 0.18 5.70 0.12 3.80
all.equal 0.14 4.43 0.02 0.63
all.equal.numeric 0.12 3.80 0.02 0.63
paste 0.12 3.80 0.02 0.63
order 0.08 2.53 0.04 1.27
[[ 0.08 2.53 0.02 0.63
sort 0.08 2.53 0.02 0.63
ifelse 0.06 1.90 0.06 1.90
is.matrix 0.06 1.90 0.06 1.90
length 0.06 1.90 0.06 1.90
match 0.06 1.90 0.06 1.90
pmatch 0.06 1.90 0.06 1.90
[.formula 0.06 1.90 0.04 1.27
[.Surv 0.06 1.90 0.04 1.27
attr.all.equal 0.06 1.90 0.04 1.27
makepredictcall 0.06 1.90 0.04 1.27
unique 0.06 1.90 0.04 1.27
unlist 0.06 1.90 0.04 1.27
[[.data.frame 0.06 1.90 0.02 0.63
sort.int 0.06 1.90 0.02 0.63
%in% 0.06 1.90 0.00 0.00
simplify2array 0.06 1.90 0.00 0.00
sort.default 0.06 1.90 0.00 0.00
<Anonymous> 0.04 1.27 0.04 1.27
is.data.frame 0.04 1.27 0.04 1.27
names 0.04 1.27 0.04 1.27
NextMethod 0.04 1.27 0.04 1.27
sum 0.04 1.27 0.04 1.27
as.list 0.04 1.27 0.02 0.63
as.vector 0.04 1.27 0.02 0.63
get 0.04 1.27 0.02 0.63
- 0.02 0.63 0.02 0.63
$<- 0.02 0.63 0.02 0.63
/ 0.02 0.63 0.02 0.63
anyDuplicated 0.02 0.63 0.02 0.63
as.environment 0.02 0.63 0.02 0.63
as.list.data.frame 0.02 0.63 0.02 0.63
attr 0.02 0.63 0.02 0.63
attr<- 0.02 0.63 0.02 0.63
c 0.02 0.63 0.02 0.63
cbind 0.02 0.63 0.02 0.63
data.class 0.02 0.63 0.02 0.63
is.array 0.02 0.63 0.02 0.63
makepredictcall.default 0.02 0.63 0.02 0.63
mode 0.02 0.63 0.02 0.63
pLausen94 0.02 0.63 0.02 0.63
sys.function 0.02 0.63 0.02 0.63
formals 0.02 0.63 0.00 0.00
is.na 0.02 0.63 0.00 0.00
is.na.Surv 0.02 0.63 0.00 0.00
rowSums 0.02 0.63 0.00 0.00
$sample.interval
[1] 0.02
$sampling.time
[1] 3.16
Update 3 - Rprof on 5000 variables - as you can see its not linear increase
> Rprof(NULL)
> summaryRprof()
$by.self
self.time self.pct total.time total.pct
eval 11.28 32.83 23.40 68.10
terms.formula 11.00 32.01 11.08 32.25
model.frame.default 0.68 1.98 22.38 65.13
deparse 0.68 1.98 1.70 4.95
cmaxstat 0.46 1.34 2.38 6.93
.deparseOpts 0.44 1.28 0.74 2.15
na.omit.data.frame 0.40 1.16 1.74 5.06
[.data.frame 0.36 1.05 1.06 3.08
maxstat.test.data.frame 0.32 0.93 34.16 99.42
pmatch 0.32 0.93 0.34 0.99
match 0.28 0.81 0.60 1.75
NextMethod 0.28 0.81 0.28 0.81
na.omit 0.26 0.76 2.00 5.82
Surv 0.26 0.76 0.54 1.57
get 0.26 0.76 0.28 0.81
unique 0.24 0.70 0.46 1.34
makepredictcall 0.24 0.70 0.36 1.05
order 0.24 0.70 0.34 0.99
match.arg 0.22 0.64 1.26 3.67
[.Surv 0.22 0.64 0.40 1.16
lapply 0.20 0.58 34.36 100.00
paste 0.20 0.58 0.90 2.62
maxstat 0.18 0.52 4.50 13.10
[ 0.18 0.52 1.54 4.48
cscores 0.18 0.52 1.26 3.67
$<- 0.18 0.52 0.18 0.52
terms 0.16 0.47 11.46 33.35
sort.int 0.16 0.47 0.46 1.34
cbind 0.16 0.47 0.16 0.47
is.matrix 0.16 0.47 0.16 0.47
length 0.16 0.47 0.16 0.47
[[.data.frame 0.14 0.41 0.34 0.99
FUN 0.12 0.35 34.26 99.71
all.equal.numeric 0.12 0.35 0.24 0.70
c 0.12 0.35 0.12 0.35
ifelse 0.12 0.35 0.12 0.35
is.data.frame 0.12 0.35 0.12 0.35
makepredictcall.default 0.12 0.35 0.12 0.35
names 0.12 0.35 0.12 0.35
cscores.Surv 0.10 0.29 1.08 3.14
%in% 0.10 0.29 0.70 2.04
all.equal 0.10 0.29 0.40 1.16
mode 0.10 0.29 0.34 0.99
unlist 0.10 0.29 0.20 0.58
unique.default 0.10 0.29 0.16 0.47
as.list 0.10 0.29 0.10 0.29
dim 0.10 0.29 0.10 0.29
is.factor 0.10 0.29 0.10 0.29
parent.frame 0.10 0.29 0.10 0.29
model.frame 0.08 0.23 22.46 65.37
is.na 0.08 0.23 0.18 0.52
duplicated 0.08 0.23 0.12 0.35
maxstat.test 0.06 0.17 34.22 99.59
sapply 0.06 0.17 1.72 5.01
sort 0.06 0.17 0.52 1.51
pLausen94 0.06 0.17 0.22 0.64
anyDuplicated 0.06 0.17 0.10 0.29
which 0.06 0.17 0.10 0.29
ncol 0.06 0.17 0.08 0.23
rowSums 0.06 0.17 0.08 0.23
* 0.06 0.17 0.06 0.17
/ 0.06 0.17 0.06 0.17
environment 0.06 0.17 0.06 0.17
is.ordered 0.06 0.17 0.06 0.17
list 0.06 0.17 0.06 0.17
do.call 0.04 0.12 4.56 13.27
[[ 0.04 0.12 0.38 1.11
attr.all.equal 0.04 0.12 0.10 0.29
formals 0.04 0.12 0.08 0.23
! 0.04 0.12 0.04 0.12
.Call 0.04 0.12 0.04 0.12
<Anonymous> 0.04 0.12 0.04 0.12
anyDuplicated.default 0.04 0.12 0.04 0.12
duplicated.default 0.04 0.12 0.04 0.12
floor 0.04 0.12 0.04 0.12
is.array 0.04 0.12 0.04 0.12
match.fun 0.04 0.12 0.04 0.12
options 0.04 0.12 0.04 0.12
pnorm 0.04 0.12 0.04 0.12
seq.default 0.04 0.12 0.04 0.12
sys.call 0.04 0.12 0.04 0.12
simplify2array 0.02 0.06 0.52 1.51
[.formula 0.02 0.06 0.14 0.41
is.na.Surv 0.02 0.06 0.10 0.29
sys.function 0.02 0.06 0.04 0.12
& 0.02 0.06 0.02 0.06
.row_names_info 0.02 0.06 0.02 0.06
^ 0.02 0.06 0.02 0.06
< 0.02 0.06 0.02 0.06
<= 0.02 0.06 0.02 0.06
== 0.02 0.06 0.02 0.06
as.environment 0.02 0.06 0.02 0.06
attr<- 0.02 0.06 0.02 0.06
baseenv 0.02 0.06 0.02 0.06
data.class 0.02 0.06 0.02 0.06
is.numeric 0.02 0.06 0.02 0.06
sqrt 0.02 0.06 0.02 0.06
sum 0.02 0.06 0.02 0.06
sys.parent 0.02 0.06 0.02 0.06
$by.total
total.time total.pct self.time self.pct
lapply 34.36 100.00 0.20 0.58
FUN 34.26 99.71 0.12 0.35
maxstat.test 34.22 99.59 0.06 0.17
maxstat.test.data.frame 34.16 99.42 0.32 0.93
eval 23.40 68.10 11.28 32.83
model.frame 22.46 65.37 0.08 0.23
model.frame.default 22.38 65.13 0.68 1.98
terms 11.46 33.35 0.16 0.47
terms.formula 11.08 32.25 11.00 32.01
do.call 4.56 13.27 0.04 0.12
maxstat 4.50 13.10 0.18 0.52
cmaxstat 2.38 6.93 0.46 1.34
na.omit 2.00 5.82 0.26 0.76
na.omit.data.frame 1.74 5.06 0.40 1.16
sapply 1.72 5.01 0.06 0.17
deparse 1.70 4.95 0.68 1.98
[ 1.54 4.48 0.18 0.52
match.arg 1.26 3.67 0.22 0.64
cscores 1.26 3.67 0.18 0.52
cscores.Surv 1.08 3.14 0.10 0.29
[.data.frame 1.06 3.08 0.36 1.05
paste 0.90 2.62 0.20 0.58
.deparseOpts 0.74 2.15 0.44 1.28
%in% 0.70 2.04 0.10 0.29
match 0.60 1.75 0.28 0.81
Surv 0.54 1.57 0.26 0.76
sort 0.52 1.51 0.06 0.17
simplify2array 0.52 1.51 0.02 0.06
unique 0.46 1.34 0.24 0.70
sort.int 0.46 1.34 0.16 0.47
sort.default 0.46 1.34 0.00 0.00
[.Surv 0.40 1.16 0.22 0.64
all.equal 0.40 1.16 0.10 0.29
[[ 0.38 1.11 0.04 0.12
makepredictcall 0.36 1.05 0.24 0.70
pmatch 0.34 0.99 0.32 0.93
order 0.34 0.99 0.24 0.70
[[.data.frame 0.34 0.99 0.14 0.41
mode 0.34 0.99 0.10 0.29
NextMethod 0.28 0.81 0.28 0.81
get 0.28 0.81 0.26 0.76
all.equal.numeric 0.24 0.70 0.12 0.35
pLausen94 0.22 0.64 0.06 0.17
unlist 0.20 0.58 0.10 0.29
$<- 0.18 0.52 0.18 0.52
is.na 0.18 0.52 0.08 0.23
cbind 0.16 0.47 0.16 0.47
is.matrix 0.16 0.47 0.16 0.47
length 0.16 0.47 0.16 0.47
unique.default 0.16 0.47 0.10 0.29
[.formula 0.14 0.41 0.02 0.06
c 0.12 0.35 0.12 0.35
ifelse 0.12 0.35 0.12 0.35
is.data.frame 0.12 0.35 0.12 0.35
makepredictcall.default 0.12 0.35 0.12 0.35
names 0.12 0.35 0.12 0.35
duplicated 0.12 0.35 0.08 0.23
as.list 0.10 0.29 0.10 0.29
dim 0.10 0.29 0.10 0.29
is.factor 0.10 0.29 0.10 0.29
parent.frame 0.10 0.29 0.10 0.29
anyDuplicated 0.10 0.29 0.06 0.17
which 0.10 0.29 0.06 0.17
attr.all.equal 0.10 0.29 0.04 0.12
is.na.Surv 0.10 0.29 0.02 0.06
ncol 0.08 0.23 0.06 0.17
rowSums 0.08 0.23 0.06 0.17
formals 0.08 0.23 0.04 0.12
as.vector 0.08 0.23 0.00 0.00
* 0.06 0.17 0.06 0.17
/ 0.06 0.17 0.06 0.17
environment 0.06 0.17 0.06 0.17
is.ordered 0.06 0.17 0.06 0.17
list 0.06 0.17 0.06 0.17
nrow 0.06 0.17 0.00 0.00
! 0.04 0.12 0.04 0.12
.Call 0.04 0.12 0.04 0.12
<Anonymous> 0.04 0.12 0.04 0.12
anyDuplicated.default 0.04 0.12 0.04 0.12
duplicated.default 0.04 0.12 0.04 0.12
floor 0.04 0.12 0.04 0.12
is.array 0.04 0.12 0.04 0.12
match.fun 0.04 0.12 0.04 0.12
options 0.04 0.12 0.04 0.12
pnorm 0.04 0.12 0.04 0.12
seq.default 0.04 0.12 0.04 0.12
sys.call 0.04 0.12 0.04 0.12
sys.function 0.04 0.12 0.02 0.06
getOption 0.04 0.12 0.00 0.00
irank 0.04 0.12 0.00 0.00
seq 0.04 0.12 0.00 0.00
& 0.02 0.06 0.02 0.06
.row_names_info 0.02 0.06 0.02 0.06
^ 0.02 0.06 0.02 0.06
< 0.02 0.06 0.02 0.06
<= 0.02 0.06 0.02 0.06
== 0.02 0.06 0.02 0.06
as.environment 0.02 0.06 0.02 0.06
attr<- 0.02 0.06 0.02 0.06
baseenv 0.02 0.06 0.02 0.06
data.class 0.02 0.06 0.02 0.06
is.numeric 0.02 0.06 0.02 0.06
sqrt 0.02 0.06 0.02 0.06
sum 0.02 0.06 0.02 0.06
sys.parent 0.02 0.06 0.02 0.06
match.call 0.02 0.06 0.00 0.00
$sample.interval
[1] 0.02
$sampling.time
[1] 34.36
Upvotes: 4
Views: 344
Reputation: 9805
You mentioned switching to data.table
, but it doesn't appear like you took full advantage of it. The key here is to reshape the table to "long" format, and then use data.table
to group by
your variable. Picking up from your example Genedata
...
library(data.table)
library(maxstat)
system.time({
## Convert to a "long" data.table format:
Genedata2 <- data.table(time=rep(Genedata[[1]],ncol(Genedata)-2),
cens=rep(Genedata[[2]],ncol(Genedata)-2),
variable=rep(1:(ncol(Genedata)-2), each=nrow(Genedata)),
value=as.vector(as.matrix(Genedata[3:ncol(Genedata)])))
## Calculate the maxstat.test by each variable:
maxstat.estimate2 <- Genedata2[,list(ans=list(maxstat.test(Surv(time, cens) ~ value,
data=.SD, smethod="LogRank", pmethod="Lau94"))), by="variable"]
})
## 1,000 variables:
## user system elapsed
## 2.01 0.00 2.01
## 5,000 variables:
## user system elapsed
## 12.17 0.00 12.17
## 10,000 variables:
## user system elapsed
## 21.19 0.07 21.25
## 43,000 variables:
## user system elapsed
## 96.91 0.21 97.22
If you are doing other calculations on the same data, you can just add them as new columns in the line of code that creates the maxstat.estimate2 table (list(ans=..., calc2=..., calc3=..., etc.)
).
Upvotes: 0
Reputation: 103898
I don't see any non-linear behaviour on my machine:
library(maxstat)
genes <- names(Genedata)[-(1:3)]
time_it <- function(f, vars = c(25, 100, 500, 999)) {
names(vars) <- vars
sapply(vars, function(x) system.time(lapply(genes[1:x], f))[3]) / vars
}
fit_model <- function(var) {
maxstat.test(Surv(time, cens) ~ get(var), data = Genedata,
smethod = "LogRank", pmethod = "Lau94")
}
time_it(fit_model)
# 25.elapsed 100.elapsed 500.elapsed 999.elapsed
# 1.800000 1.790000 1.812000 1.857858
Upvotes: 1
Reputation: 10437
I don't have a full answer to why the times for lapply are not linear, but notice that you are using lapply as a glorified loop by applying over a list of variable names (and then using these names to index the actual values) rather than applying over a list of values. I'm not sure how to solve this using lapply, but I can offer an alternative that is (roughly) a liner function of the number of variables.
Start by creating the example data:
Genedata <- structure(list(time = c(120, 120, 28.96, 119.21, 59.53, 68.81,
82.29, 110.82, 65.88, 84.13, 16.47, 89.75, 76.07, 67.82, 52.24,
64.1, 55.13, 57.79, 50.1, 43.79, 30.27, 3.64, 36.59, 20.02, 118.58,
55.33, 120, 120, 120, 106.94, 11.7, 28.79, 26.82, 52.5, 24.03,
88.99, 102.44, 33.73, 85.28, 26.53, 37.31, 86.43, 55.92, 70.65,
76.04, 79.13, 83.99, 80.25, 40.6, 95.37, 31.06, 37.31, 22.02,
63.25, 34.09, 52.14, 66.04, 59.96, 47.86, 58.45, 45.99, 60.42,
37.67, 15.71, 39.25, 49.87, 25.24, 44.97, 35.9, 26.66, 36.78,
41.52, 22.22, 33.2, 39.68, 25.61, 83.99, 15.05, 8.38, 18.18,
27.15, 24.26, 105.17, 11.76, 70.45, 95.07, 112.33, 27.51, 45.92,
54.04, 103.98, 6.11, 99.51, 20.44, 76.6, 10.02, 41.45, 96.26,
85.61, 78.87, 22.25, 74.53, 59.07, 47.8, 5.68, 79.39, 74.07,
50.76, 67.82, 70.84, 50.59, 34.58, 38.72, 54.9, 67.92, 58.45,
59.34, 54.54, 19.03, 26.26, 52.86, 32.05, 55.95, 56.67, 50.43,
4.24, 49.11, 49.57, 50.49, 63.05, 49.24, 0.92, 31.36, 40.3, 116.64,
31.92, 19.98, 24.62, 18.54, 120, 17.62, 5.32, 2.36, 5.72, 15.28,
95.07, 4.96, 28.89, 3.25, 0.92, 18.77, 57.73, 14.39, 5.12, 4.99,
33.17, 50.53, 5.72, 8.02, 34.02, 1.44, 36.95, 60.75, 37.44, 23.07,
19.85, 31.85, 8.61, 42.27, 15.25, 14.56, 9.96, 8.94, 32.67, 2.07,
22.78, 22.35), cens = c(0L, 0L, NA, 0L, 0L, 1L, 0L, 0L, NA, 0L,
NA, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, NA, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, NA, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, NA, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L,
0L, 1L, 0L, NA, 1L, 1L, 1L, 0L, 0L, 0L, NA, 0L, NA, NA, 1L, 0L,
0L, 1L, 1L, 0L, 0L, 1L, NA, NA, 0L, 0L, 0L, 0L, 1L, NA, 0L, 1L,
0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, NA, 1L, 1L, 1L, 1L, NA, 0L, NA, 1L, 1L, 0L, 1L, 0L, 1L,
1L, NA, 1L, 0L, 1L, 1L, 0L)), .Names = c("time", "cens"), class = "data.frame", row.names = c(NA,
-177L))
data <- replicate(10000, abs(rnorm(177)))
Genedata <- data.frame(Genedata, data)
My approach is the reshape the data using the melt
function in the reshape2 package, then use dlply
from the plyr package to run the model on each subset.
ibrary(maxstat)
library(plyr)
library(reshape2)
system.time({
dat.m <- melt(Genedata, id.vars=1:3)
maxstat.estimate <- dlply(dat.m, .(variable), function(DF) {
maxstat.test(Surv(time, cens) ~ value, smethod="LogRank", pmethod="Lau94", data=DF)})})
Some timings from this approach are as follows:
## 1000 variables
## user system elapsed
## 11.613 0.090 11.780
## 2000 variables
## user system elapsed
## 23.847 0.060 24.043
## 3000 variables
## user system elapsed
## 35.316 0.093 35.905
## 10000 variables
## user system elapsed
## 120.654 0.594 122.140
showing that system time is roughly a linear function of the number of variables.
The full 43000 variables takes just over 10 minutes, just enough time for a cup of coffee:
## 43000
## user system elapsed
## 612.587 3.134 630.464
Upvotes: 6