Reputation: 2256
I need to create a vector containing all the numbers within ranges defined in a table. For example, the rows 23:25 and 34:39 should become the single vector c(23, 24, 25, 34, 35, 36, 37, 38, 39).
The MWE below does this, but it is far too slow; I need it to work for an n.rows of 15,000,000 or higher. row.references is the input and row.references.long is the wanted output. What would be a better way to code this?
library(data.table)
# Create example data
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)), steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]
row.references.long <- NULL
# The too-slow code
for (i in 1:nrow(row.references)) {
  row.references.long <- rbind(row.references.long,
                               data.table(row.references[i, start.number]:row.references[i, end.number]))
}
I suppose data.table is the way to go.
Upvotes: 2
Views: 153
Reputation: 46856
One way to avoid explicit iteration: create a single sequence running over the entire output length (as a numeric vector, to avoid integer overflow), then shift each element by the offset needed to make the first element of each sub-sequence equal to that range's start value.
f <- function(start, step) {
    # one running sequence covering the total output length
    res <- seq(1, sum(step + 1), by = 1)
    # per-range shift so that each sub-sequence begins at its start value
    offset <- start - c(0, cumsum(step + 1)[-length(step)]) - 1L
    res + rep(offset, step + 1)
}
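A possible call with the question's data (a sketch, assuming the row.references table from the question), recovering step as end.number minus start.number:
# step is the dropped `steps` column, i.e. end.number - start.number
f(row.references$start.number,
  row.references$end.number - row.references$start.number)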
Upvotes: 1
Reputation: 193517
For some reason, this still strikes me as overthinking things. I'm not sure if there's a big disadvantage to using by = 1:nrow(indt), but that gives me good performance.
My suggestion for "data.table" would simply be:
row.references[, list(V1 = start.number:end.number),
by = 1:nrow(row.references)]$V1
And for base R, would be:
unlist(mapply(":", row.references$start.number, row.references$end.number),
use.names = FALSE)
This latter one is similar to Roland's approach, but just uses : and unlist instead of do.call(c, ...).
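As a tiny illustration (not from the original answer), using the two ranges from the question: both calls flatten the list of ranges into one vector.
x <- mapply(":", c(23, 34), c(25, 39))  # list of two integer ranges
unlist(x, use.names = FALSE)            # 23 24 25 34 35 36 37 38 39
do.call(c, x)                           # same result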
Here's your sample data:
library(data.table)
set.seed(1)
n.rows <- 1000
row.references <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)),
steps=floor(runif(n=n.rows, min=1, max=50)))
row.references[, end.number:=start.number+steps]
row.references[, steps:=NULL]
Here are a few functions to try out:
AM1 <- function() {
  unlist(mapply(":", row.references$start.number, row.references$end.number),
         use.names = FALSE)
}

AM2 <- function() {
  row.references[, list(V1 = start.number:end.number),
                 by = 1:nrow(row.references)]$V1
}

roland1 <- function() {
  do.call(c, mapply(seq,
                    row.references[["start.number"]],
                    row.references[["end.number"]],
                    MoreArgs = list(by = 1)))
}

roland2 <- function(indt = copy(row.references)) {
  indt[, lengths := end.number - start.number + 1]
  res <- indt[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
  res[, V1 := V1 + seq_along(V1),
      by = rep(seq_len(nrow(indt)), indt[["lengths"]])]$V1
}

jaap <- function(indt = copy(row.references)) {
  indt[, `:=` (idx = .I)][, .(var = seq(start.number, end.number)), by = idx]$var
}
Check that they are all equal:
sapply(c(quote(AM2()), quote(roland1()), quote(roland2()), quote(jaap())),
function(x) all.equal(AM1(), eval(x)))
# [1] TRUE TRUE TRUE TRUE
Now, make some bigger data:
# Make the data bigger -- 2.5 million rows
row.references <- rbindlist(replicate(2500, row.references, FALSE))
dim(row.references)
Test out the timings:
system.time(AM1())
# user system elapsed
# 6.936 0.000 6.845
system.time(AM2())
# user system elapsed
# 2.480 0.212 2.800
system.time(roland1())
# user system elapsed
# 64.932 0.000 63.525
system.time(roland2())
# user system elapsed
# 3.488 0.000 2.434
system.time(jaap())
# user system elapsed
# 14.068 0.000 13.643
It seems like roland2 and AM2 are viable alternatives. Even if this "microbenchmark" is a bit off, I feel AM2 comes out ahead on readability:
library(microbenchmark)
microbenchmark(AM2(), roland2(), times = 20)
# Unit: seconds
#      expr      min       lq     mean   median       uq      max neval
#     AM2() 2.202286 2.236027 2.323602 2.320230 2.394856 2.477074    20
# roland2() 2.314997 2.428790 2.502338 2.477764 2.589151 2.700195    20
Upvotes: 9
Reputation: 83215
As @Roland mentioned, there is no need for a for-loop. You can do this entirely in data.table with the help of an index column (idx):
set.seed(12)
row.ref <- data.table(start.number=floor(runif(n=n.rows, min=1, max=100)),
steps=floor(runif(n=n.rows, min=1, max=50)))
row.ref[, `:=` (end.number=start.number+steps, idx=.I)]
row.ref.l <- row.ref[, .(var = seq(start.number,end.number)), by = idx][, idx:=NULL]
which results in:
> head(row.ref,3)
   start.number steps end.number idx
1:            7     3         10   1
2:           81     8         89   2
3:           94    40        134   3
> head(row.ref.l,15)
var
1: 7
2: 8
3: 9
4: 10
5: 81
6: 82
7: 83
8: 84
9: 85
10: 86
11: 87
12: 88
13: 89
14: 94
15: 95
A benchmark of the several proposed solutions:
microbenchmark(
  jaap = row.references[, .(var = seq(start.number, end.number)), by = idx],
  roland1 = do.call(c, mapply(seq,
                              row.references[["start.number"]],
                              row.references[["end.number"]],
                              MoreArgs = list(by = 1))),
  roland2 = row.references[, lengths := end.number - start.number + 1][
    , .(V1 = rep(as.integer(start.number) - 1L, times = lengths))][
    , V1 := V1 + seq_along(V1),
      by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])],
  ananda = data.table(unlist(mapply(":",
                                    row.references$start.number,
                                    row.references$end.number),
                             use.names = FALSE)),
  times = 100, unit = "relative")
which gives:
Unit: relative
    expr       min        lq      mean    median        uq      max neval cld
    jaap  5.517431  5.358356  5.168787  5.183828  5.164292 2.157907   100   c
 roland1 19.023350 18.029892 16.897831 17.475423 17.111857 5.682705   100   d
 roland2  2.051662  2.015041  1.912261  1.964078  1.957416 1.277075   100   b
  ananda  1.000000  1.000000  1.000000  1.000000  1.000000 1.000000   100   a
Upvotes: 4
Reputation: 132706
Do not grow an object in a loop. Pre-allocate. Here is a more efficient version of your loop:
res <- do.call(c, mapply(seq,
row.references[["start.number"]],
row.references[["end.number"]],
MoreArgs = list(by = 1)))
all.equal(res, row.references.long[[1]])
#[1] TRUE
Here is another option. Benchmark to see if it is faster.
row.references[, lengths := end.number - start.number + 1]
res <- row.references[, .(V1 = rep(as.integer(start.number) - 1L, times = lengths))]
res[, V1 := V1 + seq_along(V1),
by = rep(seq_len(nrow(row.references)), row.references[["lengths"]])]
all.equal(res, row.references.long)
#[1] TRUE
However, I would do this in compiled code, i.e., with Rcpp.
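For reference, a minimal Rcpp sketch of that idea (not part of the original answer; the helper name expand_ranges is made up): it pre-computes the output length, allocates the result once, and fills each range in a compiled loop.
library(Rcpp)
cppFunction('
IntegerVector expand_ranges(IntegerVector start, IntegerVector end) {
    R_xlen_t total = 0;
    for (R_xlen_t i = 0; i < start.size(); ++i)
        total += end[i] - start[i] + 1;          // total output length
    IntegerVector out(total);
    R_xlen_t k = 0;
    for (R_xlen_t i = 0; i < start.size(); ++i)
        for (int v = start[i]; v <= end[i]; ++v) // expand one range
            out[k++] = v;
    return out;
}')
# res <- expand_ranges(as.integer(row.references$start.number),
#                      as.integer(row.references$end.number))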
Upvotes: 4