Chris
Chris

Reputation: 2256

Speed up vector-creation

I need to create a vector with all numbers within ranges defined in a table. For example the rows 23:25 and 34:39 shall become the single vector c(23, 24, 25, 34, 35, 36, 37, 38, 39)

The MWE below does this, but is way too slow. I need to do it for n.rows of 15,000,000 or higher.

row.references is the input. row.references.long is the wanted output.

What is a better code to do this?

library(data.table)
# Create example data
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)), steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]
row.references.long <- NULL
# The too-slow code
for (i in 1:nrow(row.references)) {
  row.references.long <- rbind(row.references.long, data.table(row.references[i, start.number]:row.references[i, end.number]))
}

I suppose data.table is the way to go.

Upvotes: 2

Views: 153

Answers (4)

Martin Morgan
Martin Morgan

Reputation: 46856

For one way to avoid explicit iteration, create a sequence spanning the entire range (use a numeric vector to avoid integer overflow), then correct elements by the offset required to make the elements corresponding to the start of each sequence, equal to the start of the sequence.

f <- function(start, step) {
    res <- seq(1, sum(step + 1), by=1)
    offset <- start -  c(0, cumsum(step + 1)[-length(step)]) - 1L
    res + rep(offset, step + 1)
}

Upvotes: 1

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193517

For some reason, this still strikes me as overthinking things. I'm not sure if there's a big disadvantage to using by = 1:nrow(indt), but that gives me good performance.

My suggestion for "data.table" would simply be:

row.references[, list(V1 = start.number:end.number), 
               by = 1:nrow(row.references)]$V1

And for base R, would be:

unlist(mapply(":", row.references$start.number, row.references$end.number), 
         use.names = FALSE)

This latter one is similar to Roland's approach, but just uses : and unlist instead of do.call(c, ...)


Benchmarks

Here's your sample data:

library(data.table)
set.seed(1)
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)), 
                             steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]

Here are a few functions to try out:

AM1 <- function() {
  unlist(mapply(":", row.references$start.number, row.references$end.number), 
         use.names = FALSE)
}

AM2 <- function() {
  row.references[, list(V1 = start.number:end.number), 
                 by = 1:nrow(row.references)]$V1
}

roland1 <- function() {
  do.call(c, mapply(seq, 
                    row.references[["start.number"]], 
                    row.references[["end.number"]], 
                    MoreArgs = list(by = 1)))
}

roland2 <- function(indt = copy(row.references)) {
  indt[, lengths := end.number - start.number + 1]
  res <- indt[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
  res[, V1 := V1 + seq_along(V1), 
      by = rep(seq_len(nrow(indt)), indt[["lengths"]])]$V1
}

jaap <- function(indt = copy(row.references)) {
  indt[, `:=` (idx=.I)][, .(var = seq(start.number,end.number)), by = idx]$var
}

Check that they are all equal:

sapply(c(quote(AM2()), quote(roland1()), quote(roland2()), quote(jaap())), 
       function(x) all.equal(AM1(), eval(x)))
# [1] TRUE TRUE TRUE TRUE

Now, make some bigger data:

# Make the data bigger -- 2.5 million rows
row.references <- rbindlist(replicate(2500, row.references, FALSE))
dim(row.references)

Test out the timings:

system.time(AM1())
#    user  system elapsed 
#   6.936   0.000   6.845 

system.time(AM2())
#    user  system elapsed 
#   2.480   0.212   2.800 

system.time(roland1())
#    user  system elapsed 
#  64.932   0.000  63.525 

system.time(roland2())
#    user  system elapsed 
#   3.488   0.000   2.434 

system.time(jaap())
#    user  system elapsed 
#  14.068   0.000  13.643 

It seems like roland2 and AM2 are viable alternatives. Even if this "microbenchmark" is a bit off, I feel AM2 trumps in readability:

library(microbenchmark)
microbenchmark(AM2(), roland2(), times = 20)
# Unit: seconds
#        expr      min       lq     mean   median       uq      max neval
#       AM2() 2.202286 2.236027 2.323602 2.320230 2.394856 2.477074    20
#   roland2() 2.314997 2.428790 2.502338 2.477764 2.589151 2.700195    20

Upvotes: 9

Jaap
Jaap

Reputation: 83215

As @Roland mentioned, there is no need for a for-loop. You can do this entirely in data.table with the help of an index (idx) column:

set.seed(12)
row.ref <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)),
                      steps=floor(runif(n=n.rows, min=1, max=50)))
row.ref[, `:=` (end.number=start.number+steps, idx=.I)]

row.ref.l <- row.ref[, .(var = seq(start.number,end.number)), by = idx][, idx:=NULL]

which results in:

> head(row.ref,3)
   start.number steps end.number idx
1:            7     3         10   1
2:           81     8         89   2
3:           94    40        134   3

> head(row.ref.l,15)
    var
 1:   7
 2:   8
 3:   9
 4:  10
 5:  81
 6:  82
 7:  83
 8:  84
 9:  85
10:  86
11:  87
12:  88
13:  89
14:  94
15:  95

A benchmark of the several proposed solutions:

microbenchmark(jaap = row.references[, .(var = seq(start.number,end.number)), by = idx],
               roland1 = do.call(c, mapply(seq,row.references[["start.number"]],row.references[["end.number"]],MoreArgs = list(by = 1))),
               roland2 = row.references[, lengths := end.number - start.number + 1][, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))][, V1 := V1 + seq_along(V1), by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])],
               ananda = data.table(unlist(mapply(":", row.references$start.number, row.references$end.number), use.names = FALSE)),
               times = 100, unit = "relative")

which gives:

Unit: relative
    expr       min        lq      mean    median        uq      max neval  cld
    jaap  5.517431  5.358356  5.168787  5.183828  5.164292 2.157907   100   c 
 roland1 19.023350 18.029892 16.897831 17.475423 17.111857 5.682705   100    d
 roland2  2.051662  2.015041  1.912261  1.964078  1.957416 1.277075   100  b  
  ananda  1.000000  1.000000  1.000000  1.000000  1.000000 1.000000   100 a   

Upvotes: 4

Roland
Roland

Reputation: 132706

Do not grow an object in a loop. Pre-allocate. Here is a more efficient version of your loop:

res <- do.call(c, mapply(seq, 
                        row.references[["start.number"]], 
                        row.references[["end.number"]], 
                        MoreArgs = list(by = 1)))
all.equal(res, row.references.long[[1]])
#[1] TRUE

Here is another option. Benchmark to see if it is faster.

row.references[, lengths := end.number - start.number + 1]
res <- row.references[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
res[, V1 := V1 + seq_along(V1), 
    by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])]
all.equal(res, row.references.long)
#[1] TRUE

However, I would do this in compiled code, i.e., with Rcpp.

Upvotes: 4

Related Questions