PyPer User
PyPer User

Reputation: 259

Lapply slows with increasing loops

Introduction

I've been writing a program that processes some Raw data and then passes it through several statistical processes. All in all it uses several instances of "lapply".

For example: in one part of the script I use a function known as Maxstat (note the function only takes data either as data.frame or data.table - will not accept a matrix).

**

**

Sample data:

The first two columns are neccesary. This is the large data sample seen below --> please read on after the sample

Genedata <- structure(list(time = c(120, 120, 28.96, 119.21, 59.53, 68.81, 
82.29, 110.82, 65.88, 84.13, 16.47, 89.75, 76.07, 67.82, 52.24, 
64.1, 55.13, 57.79, 50.1, 43.79, 30.27, 3.64, 36.59, 20.02, 118.58, 
55.33, 120, 120, 120, 106.94, 11.7, 28.79, 26.82, 52.5, 24.03, 
88.99, 102.44, 33.73, 85.28, 26.53, 37.31, 86.43, 55.92, 70.65, 
76.04, 79.13, 83.99, 80.25, 40.6, 95.37, 31.06, 37.31, 22.02, 
63.25, 34.09, 52.14, 66.04, 59.96, 47.86, 58.45, 45.99, 60.42, 
37.67, 15.71, 39.25, 49.87, 25.24, 44.97, 35.9, 26.66, 36.78, 
41.52, 22.22, 33.2, 39.68, 25.61, 83.99, 15.05, 8.38, 18.18, 
27.15, 24.26, 105.17, 11.76, 70.45, 95.07, 112.33, 27.51, 45.92, 
54.04, 103.98, 6.11, 99.51, 20.44, 76.6, 10.02, 41.45, 96.26, 
85.61, 78.87, 22.25, 74.53, 59.07, 47.8, 5.68, 79.39, 74.07, 
50.76, 67.82, 70.84, 50.59, 34.58, 38.72, 54.9, 67.92, 58.45, 
59.34, 54.54, 19.03, 26.26, 52.86, 32.05, 55.95, 56.67, 50.43, 
4.24, 49.11, 49.57, 50.49, 63.05, 49.24, 0.92, 31.36, 40.3, 116.64, 
31.92, 19.98, 24.62, 18.54, 120, 17.62, 5.32, 2.36, 5.72, 15.28, 
95.07, 4.96, 28.89, 3.25, 0.92, 18.77, 57.73, 14.39, 5.12, 4.99, 
33.17, 50.53, 5.72, 8.02, 34.02, 1.44, 36.95, 60.75, 37.44, 23.07, 
19.85, 31.85, 8.61, 42.27, 15.25, 14.56, 9.96, 8.94, 32.67, 2.07, 
22.78, 22.35), cens = c(0L, 0L, NA, 0L, 0L, 1L, 0L, 0L, NA, 0L, 
NA, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, NA, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, NA, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, NA, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 
0L, 1L, 0L, NA, 1L, 1L, 1L, 0L, 0L, 0L, NA, 0L, NA, NA, 1L, 0L, 
0L, 1L, 1L, 0L, 0L, 1L, NA, NA, 0L, 0L, 0L, 0L, 1L, NA, 0L, 1L, 
0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, NA, 0L, NA, 1L, 1L, 0L, 1L, 0L, 1L, 
1L, NA, 1L, 0L, 1L, 1L, 0L)), .Names = c("time", "cens"), class = "data.frame", row.names = c(NA, 
-177L))

To generate a larger set -> First define the sample above as Genedata

then the additional code (to change variable/column length, just change 1000 to 5000 or 50000 etc.

    data <- replicate(1000, abs(rnorm(177)))
    Genedata <- data.frame(Genedata, data)
library(data.table)
    Genedata2 <- data.table(Genedata)

Then to calculate

    #Calculating Maxstat
    library(maxstat)
    maxstat.genes <- as.list( names(Genedata) )[-c(1:3)] #generate a list of column numbers
system.time(maxstat.estimate <-lapply( maxstat.genes, function (x)
{maxstat.test(Surv(time, cens) ~ get (x), data=Genedata2, smethod="LogRank",
pmethod="Lau94")}))

I originally planned on using the for statement, as i'm familiar with it having a background in python. However, research has told me that using apply should be faster.

I also originally used data.frame, but switched to data.table as someone suggested this should be better for performance.

However, it seems there is still a significant problem with processing large datasets. Below is an explanation of why.

Running 25 variables

   user  system elapsed 
   0.04    0.00    0.05  

Running 1000 variables

  user  system elapsed 
   6.54    0.00    6.57 

Running 5000 variables

  user  system elapsed 
 132.89    0.00  133.02 

As you can see, the processing speed is not linear but rather begins to fall off at a staggering rate. Given that the dataset is 43000 variables long, I can only guess that it would take on the order of hours. Interestingly, if i manually divided the tables into small sets, and ran them individually, it would be faster.

I think it has something to do with copying increasingly larger data, but I' not sure how this could be fixed. I already tried the data.table and lapply. Any suggestions?

Update 1: Added sample data. Note to reproduce a larger dataset. You can copy columns 4 onwards and paste. Rows can also be copied down. Should work as far as i know.

Update 2: Rprof on 1000 variables

> Rprof(NULL)
> summaryRprof()
$by.self
                        self.time self.pct total.time total.pct
terms.formula                0.50    15.82       0.50     15.82
eval                         0.34    10.76       1.92     60.76
Surv                         0.12     3.80       0.20      6.33
.deparseOpts                 0.12     3.80       0.18      5.70
deparse                      0.08     2.53       0.32     10.13
na.omit.data.frame           0.08     2.53       0.30      9.49
lapply                       0.06     1.90       3.16    100.00
maxstat.test                 0.06     1.90       3.16    100.00
maxstat.test.data.frame      0.06     1.90       3.10     98.10
model.frame.default          0.06     1.90       1.68     53.16
cmaxstat                     0.06     1.90       0.38     12.03
match.arg                    0.06     1.90       0.30      9.49
ifelse                       0.06     1.90       0.06      1.90
is.matrix                    0.06     1.90       0.06      1.90
length                       0.06     1.90       0.06      1.90
match                        0.06     1.90       0.06      1.90
pmatch                       0.06     1.90       0.06      1.90
FUN                          0.04     1.27       3.16    100.00
maxstat                      0.04     1.27       0.90     28.48
terms                        0.04     1.27       0.62     19.62
[.data.frame                 0.04     1.27       0.20      6.33
order                        0.04     1.27       0.08      2.53
[.formula                    0.04     1.27       0.06      1.90
[.Surv                       0.04     1.27       0.06      1.90
attr.all.equal               0.04     1.27       0.06      1.90
makepredictcall              0.04     1.27       0.06      1.90
unique                       0.04     1.27       0.06      1.90
unlist                       0.04     1.27       0.06      1.90
<Anonymous>                  0.04     1.27       0.04      1.27
is.data.frame                0.04     1.27       0.04      1.27
names                        0.04     1.27       0.04      1.27
NextMethod                   0.04     1.27       0.04      1.27
sum                          0.04     1.27       0.04      1.27
model.frame                  0.02     0.63       1.70     53.80
na.omit                      0.02     0.63       0.32     10.13
[                            0.02     0.63       0.30      9.49
cscores                      0.02     0.63       0.28      8.86
cscores.Surv                 0.02     0.63       0.26      8.23
all.equal                    0.02     0.63       0.14      4.43
all.equal.numeric            0.02     0.63       0.12      3.80
paste                        0.02     0.63       0.12      3.80
[[                           0.02     0.63       0.08      2.53
sort                         0.02     0.63       0.08      2.53
[[.data.frame                0.02     0.63       0.06      1.90
sort.int                     0.02     0.63       0.06      1.90
as.list                      0.02     0.63       0.04      1.27
as.vector                    0.02     0.63       0.04      1.27
get                          0.02     0.63       0.04      1.27
-                            0.02     0.63       0.02      0.63
$<-                          0.02     0.63       0.02      0.63
/                            0.02     0.63       0.02      0.63
anyDuplicated                0.02     0.63       0.02      0.63
as.environment               0.02     0.63       0.02      0.63
as.list.data.frame           0.02     0.63       0.02      0.63
attr                         0.02     0.63       0.02      0.63
attr<-                       0.02     0.63       0.02      0.63
c                            0.02     0.63       0.02      0.63
cbind                        0.02     0.63       0.02      0.63
data.class                   0.02     0.63       0.02      0.63
is.array                     0.02     0.63       0.02      0.63
makepredictcall.default      0.02     0.63       0.02      0.63
mode                         0.02     0.63       0.02      0.63
pLausen94                    0.02     0.63       0.02      0.63
sys.function                 0.02     0.63       0.02      0.63

$by.total
                        total.time total.pct self.time self.pct
lapply                        3.16    100.00      0.06     1.90
maxstat.test                  3.16    100.00      0.06     1.90
FUN                           3.16    100.00      0.04     1.27
maxstat.test.data.frame       3.10     98.10      0.06     1.90
eval                          1.92     60.76      0.34    10.76
model.frame                   1.70     53.80      0.02     0.63
model.frame.default           1.68     53.16      0.06     1.90
maxstat                       0.90     28.48      0.04     1.27
do.call                       0.90     28.48      0.00     0.00
terms                         0.62     19.62      0.04     1.27
terms.formula                 0.50     15.82      0.50    15.82
cmaxstat                      0.38     12.03      0.06     1.90
sapply                        0.34     10.76      0.00     0.00
deparse                       0.32     10.13      0.08     2.53
na.omit                       0.32     10.13      0.02     0.63
na.omit.data.frame            0.30      9.49      0.08     2.53
match.arg                     0.30      9.49      0.06     1.90
[                             0.30      9.49      0.02     0.63
cscores                       0.28      8.86      0.02     0.63
cscores.Surv                  0.26      8.23      0.02     0.63
Surv                          0.20      6.33      0.12     3.80
[.data.frame                  0.20      6.33      0.04     1.27
.deparseOpts                  0.18      5.70      0.12     3.80
all.equal                     0.14      4.43      0.02     0.63
all.equal.numeric             0.12      3.80      0.02     0.63
paste                         0.12      3.80      0.02     0.63
order                         0.08      2.53      0.04     1.27
[[                            0.08      2.53      0.02     0.63
sort                          0.08      2.53      0.02     0.63
ifelse                        0.06      1.90      0.06     1.90
is.matrix                     0.06      1.90      0.06     1.90
length                        0.06      1.90      0.06     1.90
match                         0.06      1.90      0.06     1.90
pmatch                        0.06      1.90      0.06     1.90
[.formula                     0.06      1.90      0.04     1.27
[.Surv                        0.06      1.90      0.04     1.27
attr.all.equal                0.06      1.90      0.04     1.27
makepredictcall               0.06      1.90      0.04     1.27
unique                        0.06      1.90      0.04     1.27
unlist                        0.06      1.90      0.04     1.27
[[.data.frame                 0.06      1.90      0.02     0.63
sort.int                      0.06      1.90      0.02     0.63
%in%                          0.06      1.90      0.00     0.00
simplify2array                0.06      1.90      0.00     0.00
sort.default                  0.06      1.90      0.00     0.00
<Anonymous>                   0.04      1.27      0.04     1.27
is.data.frame                 0.04      1.27      0.04     1.27
names                         0.04      1.27      0.04     1.27
NextMethod                    0.04      1.27      0.04     1.27
sum                           0.04      1.27      0.04     1.27
as.list                       0.04      1.27      0.02     0.63
as.vector                     0.04      1.27      0.02     0.63
get                           0.04      1.27      0.02     0.63
-                             0.02      0.63      0.02     0.63
$<-                           0.02      0.63      0.02     0.63
/                             0.02      0.63      0.02     0.63
anyDuplicated                 0.02      0.63      0.02     0.63
as.environment                0.02      0.63      0.02     0.63
as.list.data.frame            0.02      0.63      0.02     0.63
attr                          0.02      0.63      0.02     0.63
attr<-                        0.02      0.63      0.02     0.63
c                             0.02      0.63      0.02     0.63
cbind                         0.02      0.63      0.02     0.63
data.class                    0.02      0.63      0.02     0.63
is.array                      0.02      0.63      0.02     0.63
makepredictcall.default       0.02      0.63      0.02     0.63
mode                          0.02      0.63      0.02     0.63
pLausen94                     0.02      0.63      0.02     0.63
sys.function                  0.02      0.63      0.02     0.63
formals                       0.02      0.63      0.00     0.00
is.na                         0.02      0.63      0.00     0.00
is.na.Surv                    0.02      0.63      0.00     0.00
rowSums                       0.02      0.63      0.00     0.00

$sample.interval
[1] 0.02

$sampling.time
[1] 3.16

Update 3 - Rprof on 5000 variables - as you can see its not linear increase

> Rprof(NULL)
> summaryRprof()
$by.self
                        self.time self.pct total.time total.pct
eval                        11.28    32.83      23.40     68.10
terms.formula               11.00    32.01      11.08     32.25
model.frame.default          0.68     1.98      22.38     65.13
deparse                      0.68     1.98       1.70      4.95
cmaxstat                     0.46     1.34       2.38      6.93
.deparseOpts                 0.44     1.28       0.74      2.15
na.omit.data.frame           0.40     1.16       1.74      5.06
[.data.frame                 0.36     1.05       1.06      3.08
maxstat.test.data.frame      0.32     0.93      34.16     99.42
pmatch                       0.32     0.93       0.34      0.99
match                        0.28     0.81       0.60      1.75
NextMethod                   0.28     0.81       0.28      0.81
na.omit                      0.26     0.76       2.00      5.82
Surv                         0.26     0.76       0.54      1.57
get                          0.26     0.76       0.28      0.81
unique                       0.24     0.70       0.46      1.34
makepredictcall              0.24     0.70       0.36      1.05
order                        0.24     0.70       0.34      0.99
match.arg                    0.22     0.64       1.26      3.67
[.Surv                       0.22     0.64       0.40      1.16
lapply                       0.20     0.58      34.36    100.00
paste                        0.20     0.58       0.90      2.62
maxstat                      0.18     0.52       4.50     13.10
[                            0.18     0.52       1.54      4.48
cscores                      0.18     0.52       1.26      3.67
$<-                          0.18     0.52       0.18      0.52
terms                        0.16     0.47      11.46     33.35
sort.int                     0.16     0.47       0.46      1.34
cbind                        0.16     0.47       0.16      0.47
is.matrix                    0.16     0.47       0.16      0.47
length                       0.16     0.47       0.16      0.47
[[.data.frame                0.14     0.41       0.34      0.99
FUN                          0.12     0.35      34.26     99.71
all.equal.numeric            0.12     0.35       0.24      0.70
c                            0.12     0.35       0.12      0.35
ifelse                       0.12     0.35       0.12      0.35
is.data.frame                0.12     0.35       0.12      0.35
makepredictcall.default      0.12     0.35       0.12      0.35
names                        0.12     0.35       0.12      0.35
cscores.Surv                 0.10     0.29       1.08      3.14
%in%                         0.10     0.29       0.70      2.04
all.equal                    0.10     0.29       0.40      1.16
mode                         0.10     0.29       0.34      0.99
unlist                       0.10     0.29       0.20      0.58
unique.default               0.10     0.29       0.16      0.47
as.list                      0.10     0.29       0.10      0.29
dim                          0.10     0.29       0.10      0.29
is.factor                    0.10     0.29       0.10      0.29
parent.frame                 0.10     0.29       0.10      0.29
model.frame                  0.08     0.23      22.46     65.37
is.na                        0.08     0.23       0.18      0.52
duplicated                   0.08     0.23       0.12      0.35
maxstat.test                 0.06     0.17      34.22     99.59
sapply                       0.06     0.17       1.72      5.01
sort                         0.06     0.17       0.52      1.51
pLausen94                    0.06     0.17       0.22      0.64
anyDuplicated                0.06     0.17       0.10      0.29
which                        0.06     0.17       0.10      0.29
ncol                         0.06     0.17       0.08      0.23
rowSums                      0.06     0.17       0.08      0.23
*                            0.06     0.17       0.06      0.17
/                            0.06     0.17       0.06      0.17
environment                  0.06     0.17       0.06      0.17
is.ordered                   0.06     0.17       0.06      0.17
list                         0.06     0.17       0.06      0.17
do.call                      0.04     0.12       4.56     13.27
[[                           0.04     0.12       0.38      1.11
attr.all.equal               0.04     0.12       0.10      0.29
formals                      0.04     0.12       0.08      0.23
!                            0.04     0.12       0.04      0.12
.Call                        0.04     0.12       0.04      0.12
<Anonymous>                  0.04     0.12       0.04      0.12
anyDuplicated.default        0.04     0.12       0.04      0.12
duplicated.default           0.04     0.12       0.04      0.12
floor                        0.04     0.12       0.04      0.12
is.array                     0.04     0.12       0.04      0.12
match.fun                    0.04     0.12       0.04      0.12
options                      0.04     0.12       0.04      0.12
pnorm                        0.04     0.12       0.04      0.12
seq.default                  0.04     0.12       0.04      0.12
sys.call                     0.04     0.12       0.04      0.12
simplify2array               0.02     0.06       0.52      1.51
[.formula                    0.02     0.06       0.14      0.41
is.na.Surv                   0.02     0.06       0.10      0.29
sys.function                 0.02     0.06       0.04      0.12
&                            0.02     0.06       0.02      0.06
.row_names_info              0.02     0.06       0.02      0.06
^                            0.02     0.06       0.02      0.06
<                            0.02     0.06       0.02      0.06
<=                           0.02     0.06       0.02      0.06
==                           0.02     0.06       0.02      0.06
as.environment               0.02     0.06       0.02      0.06
attr<-                       0.02     0.06       0.02      0.06
baseenv                      0.02     0.06       0.02      0.06
data.class                   0.02     0.06       0.02      0.06
is.numeric                   0.02     0.06       0.02      0.06
sqrt                         0.02     0.06       0.02      0.06
sum                          0.02     0.06       0.02      0.06
sys.parent                   0.02     0.06       0.02      0.06

$by.total
                        total.time total.pct self.time self.pct
lapply                       34.36    100.00      0.20     0.58
FUN                          34.26     99.71      0.12     0.35
maxstat.test                 34.22     99.59      0.06     0.17
maxstat.test.data.frame      34.16     99.42      0.32     0.93
eval                         23.40     68.10     11.28    32.83
model.frame                  22.46     65.37      0.08     0.23
model.frame.default          22.38     65.13      0.68     1.98
terms                        11.46     33.35      0.16     0.47
terms.formula                11.08     32.25     11.00    32.01
do.call                       4.56     13.27      0.04     0.12
maxstat                       4.50     13.10      0.18     0.52
cmaxstat                      2.38      6.93      0.46     1.34
na.omit                       2.00      5.82      0.26     0.76
na.omit.data.frame            1.74      5.06      0.40     1.16
sapply                        1.72      5.01      0.06     0.17
deparse                       1.70      4.95      0.68     1.98
[                             1.54      4.48      0.18     0.52
match.arg                     1.26      3.67      0.22     0.64
cscores                       1.26      3.67      0.18     0.52
cscores.Surv                  1.08      3.14      0.10     0.29
[.data.frame                  1.06      3.08      0.36     1.05
paste                         0.90      2.62      0.20     0.58
.deparseOpts                  0.74      2.15      0.44     1.28
%in%                          0.70      2.04      0.10     0.29
match                         0.60      1.75      0.28     0.81
Surv                          0.54      1.57      0.26     0.76
sort                          0.52      1.51      0.06     0.17
simplify2array                0.52      1.51      0.02     0.06
unique                        0.46      1.34      0.24     0.70
sort.int                      0.46      1.34      0.16     0.47
sort.default                  0.46      1.34      0.00     0.00
[.Surv                        0.40      1.16      0.22     0.64
all.equal                     0.40      1.16      0.10     0.29
[[                            0.38      1.11      0.04     0.12
makepredictcall               0.36      1.05      0.24     0.70
pmatch                        0.34      0.99      0.32     0.93
order                         0.34      0.99      0.24     0.70
[[.data.frame                 0.34      0.99      0.14     0.41
mode                          0.34      0.99      0.10     0.29
NextMethod                    0.28      0.81      0.28     0.81
get                           0.28      0.81      0.26     0.76
all.equal.numeric             0.24      0.70      0.12     0.35
pLausen94                     0.22      0.64      0.06     0.17
unlist                        0.20      0.58      0.10     0.29
$<-                           0.18      0.52      0.18     0.52
is.na                         0.18      0.52      0.08     0.23
cbind                         0.16      0.47      0.16     0.47
is.matrix                     0.16      0.47      0.16     0.47
length                        0.16      0.47      0.16     0.47
unique.default                0.16      0.47      0.10     0.29
[.formula                     0.14      0.41      0.02     0.06
c                             0.12      0.35      0.12     0.35
ifelse                        0.12      0.35      0.12     0.35
is.data.frame                 0.12      0.35      0.12     0.35
makepredictcall.default       0.12      0.35      0.12     0.35
names                         0.12      0.35      0.12     0.35
duplicated                    0.12      0.35      0.08     0.23
as.list                       0.10      0.29      0.10     0.29
dim                           0.10      0.29      0.10     0.29
is.factor                     0.10      0.29      0.10     0.29
parent.frame                  0.10      0.29      0.10     0.29
anyDuplicated                 0.10      0.29      0.06     0.17
which                         0.10      0.29      0.06     0.17
attr.all.equal                0.10      0.29      0.04     0.12
is.na.Surv                    0.10      0.29      0.02     0.06
ncol                          0.08      0.23      0.06     0.17
rowSums                       0.08      0.23      0.06     0.17
formals                       0.08      0.23      0.04     0.12
as.vector                     0.08      0.23      0.00     0.00
*                             0.06      0.17      0.06     0.17
/                             0.06      0.17      0.06     0.17
environment                   0.06      0.17      0.06     0.17
is.ordered                    0.06      0.17      0.06     0.17
list                          0.06      0.17      0.06     0.17
nrow                          0.06      0.17      0.00     0.00
!                             0.04      0.12      0.04     0.12
.Call                         0.04      0.12      0.04     0.12
<Anonymous>                   0.04      0.12      0.04     0.12
anyDuplicated.default         0.04      0.12      0.04     0.12
duplicated.default            0.04      0.12      0.04     0.12
floor                         0.04      0.12      0.04     0.12
is.array                      0.04      0.12      0.04     0.12
match.fun                     0.04      0.12      0.04     0.12
options                       0.04      0.12      0.04     0.12
pnorm                         0.04      0.12      0.04     0.12
seq.default                   0.04      0.12      0.04     0.12
sys.call                      0.04      0.12      0.04     0.12
sys.function                  0.04      0.12      0.02     0.06
getOption                     0.04      0.12      0.00     0.00
irank                         0.04      0.12      0.00     0.00
seq                           0.04      0.12      0.00     0.00
&                             0.02      0.06      0.02     0.06
.row_names_info               0.02      0.06      0.02     0.06
^                             0.02      0.06      0.02     0.06
<                             0.02      0.06      0.02     0.06
<=                            0.02      0.06      0.02     0.06
==                            0.02      0.06      0.02     0.06
as.environment                0.02      0.06      0.02     0.06
attr<-                        0.02      0.06      0.02     0.06
baseenv                       0.02      0.06      0.02     0.06
data.class                    0.02      0.06      0.02     0.06
is.numeric                    0.02      0.06      0.02     0.06
sqrt                          0.02      0.06      0.02     0.06
sum                           0.02      0.06      0.02     0.06
sys.parent                    0.02      0.06      0.02     0.06
match.call                    0.02      0.06      0.00     0.00

$sample.interval
[1] 0.02

$sampling.time
[1] 34.36

Upvotes: 4

Views: 344

Answers (3)

dnlbrky
dnlbrky

Reputation: 9805

You mentioned switching to data.table, but it doesn't appear like you took full advantage of it. The key here is to reshape the table to "long" format, and then use data.table to group by your variable. Picking up from your example Genedata...

library(data.table)
library(maxstat)
system.time({

## Convert to a "long" data.table format: 
Genedata2 <- data.table(time=rep(Genedata[[1]],ncol(Genedata)-2),
                        cens=rep(Genedata[[2]],ncol(Genedata)-2),
                        variable=rep(1:(ncol(Genedata)-2), each=nrow(Genedata)),
                        value=as.vector(as.matrix(Genedata[3:ncol(Genedata)])))

## Calculate the maxstat.test by each variable:
maxstat.estimate2 <- Genedata2[,list(ans=list(maxstat.test(Surv(time, cens) ~ value,
                     data=.SD, smethod="LogRank", pmethod="Lau94"))), by="variable"]
})


## 1,000 variables:
##  user  system elapsed 
##  2.01    0.00    2.01

## 5,000 variables:
##  user  system elapsed 
## 12.17    0.00   12.17

## 10,000 variables:
##  user  system elapsed 
## 21.19    0.07   21.25

## 43,000 variables:
##  user  system elapsed 
## 96.91    0.21   97.22

If you are doing other calculations on the same data, you can just add them as new columns in the line of code that creates the maxstat.estimate2 table (list(ans=..., calc2=..., calc3=..., etc.)).

Upvotes: 0

hadley
hadley

Reputation: 103898

I don't see any non-linear behaviour on my machine:

library(maxstat)
genes <- names(Genedata)[-(1:3)]

time_it <- function(f, vars = c(25, 100, 500, 999)) {
  names(vars) <- vars
  sapply(vars, function(x) system.time(lapply(genes[1:x], f))[3]) / vars
}

fit_model <- function(var) {
  maxstat.test(Surv(time, cens) ~ get(var), data = Genedata, 
    smethod = "LogRank", pmethod = "Lau94")
}
time_it(fit_model)
#  25.elapsed 100.elapsed 500.elapsed 999.elapsed 
#    1.800000    1.790000    1.812000    1.857858 

Upvotes: 1

Ista
Ista

Reputation: 10437

I don't have a full answer to why the times for lapply are not linear, but notice that you are using lapply as a glorified loop by applying over a list of variable names (and then using these names to index the actual values) rather than applying over a list of values. I'm not sure how to solve this using lapply, but I can offer an alternative that is (roughly) a liner function of the number of variables.

Start by creating the example data:

Genedata <- structure(list(time = c(120, 120, 28.96, 119.21, 59.53, 68.81, 
82.29, 110.82, 65.88, 84.13, 16.47, 89.75, 76.07, 67.82, 52.24, 
64.1, 55.13, 57.79, 50.1, 43.79, 30.27, 3.64, 36.59, 20.02, 118.58, 
55.33, 120, 120, 120, 106.94, 11.7, 28.79, 26.82, 52.5, 24.03, 
88.99, 102.44, 33.73, 85.28, 26.53, 37.31, 86.43, 55.92, 70.65, 
76.04, 79.13, 83.99, 80.25, 40.6, 95.37, 31.06, 37.31, 22.02, 
63.25, 34.09, 52.14, 66.04, 59.96, 47.86, 58.45, 45.99, 60.42, 
37.67, 15.71, 39.25, 49.87, 25.24, 44.97, 35.9, 26.66, 36.78, 
41.52, 22.22, 33.2, 39.68, 25.61, 83.99, 15.05, 8.38, 18.18, 
27.15, 24.26, 105.17, 11.76, 70.45, 95.07, 112.33, 27.51, 45.92, 
54.04, 103.98, 6.11, 99.51, 20.44, 76.6, 10.02, 41.45, 96.26, 
85.61, 78.87, 22.25, 74.53, 59.07, 47.8, 5.68, 79.39, 74.07, 
50.76, 67.82, 70.84, 50.59, 34.58, 38.72, 54.9, 67.92, 58.45, 
59.34, 54.54, 19.03, 26.26, 52.86, 32.05, 55.95, 56.67, 50.43, 
4.24, 49.11, 49.57, 50.49, 63.05, 49.24, 0.92, 31.36, 40.3, 116.64, 
31.92, 19.98, 24.62, 18.54, 120, 17.62, 5.32, 2.36, 5.72, 15.28, 
95.07, 4.96, 28.89, 3.25, 0.92, 18.77, 57.73, 14.39, 5.12, 4.99, 
33.17, 50.53, 5.72, 8.02, 34.02, 1.44, 36.95, 60.75, 37.44, 23.07, 
19.85, 31.85, 8.61, 42.27, 15.25, 14.56, 9.96, 8.94, 32.67, 2.07, 
22.78, 22.35), cens = c(0L, 0L, NA, 0L, 0L, 1L, 0L, 0L, NA, 0L, 
NA, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L, NA, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, NA, 1L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, NA, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 
0L, 1L, 0L, NA, 1L, 1L, 1L, 0L, 0L, 0L, NA, 0L, NA, NA, 1L, 0L, 
0L, 1L, 1L, 0L, 0L, 1L, NA, NA, 0L, 0L, 0L, 0L, 1L, NA, 0L, 1L, 
0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, NA, 0L, NA, 1L, 1L, 0L, 1L, 0L, 1L, 
1L, NA, 1L, 0L, 1L, 1L, 0L)), .Names = c("time", "cens"), class = "data.frame", row.names = c(NA, 
-177L))

data <- replicate(10000, abs(rnorm(177)))
Genedata <- data.frame(Genedata, data)

My approach is the reshape the data using the melt function in the reshape2 package, then use dlply from the plyr package to run the model on each subset.

ibrary(maxstat)
library(plyr)
library(reshape2)

system.time({
  dat.m <- melt(Genedata, id.vars=1:3)
  maxstat.estimate <- dlply(dat.m, .(variable), function(DF) {
  maxstat.test(Surv(time, cens) ~ value, smethod="LogRank", pmethod="Lau94", data=DF)})})

Some timings from this approach are as follows:

## 1000 variables
##    user  system elapsed 
##  11.613   0.090  11.780 

## 2000 variables
##   user  system elapsed
## 23.847   0.060  24.043

## 3000 variables
##    user  system elapsed 
##  35.316   0.093  35.905

## 10000 variables
##    user  system elapsed 
## 120.654   0.594 122.140 

showing that system time is roughly a linear function of the number of variables.

The full 43000 variables takes just over 10 minutes, just enough time for a cup of coffee:

## 43000
##    user  system elapsed 
## 612.587   3.134 630.464 

Upvotes: 6

Related Questions