matehorvath
matehorvath

Reputation: 69

How to create large vectors with repeated elements fast?

There is a vector from which I want to make a new vector by taking its elements based on a sequence:

# Reproducible input: n uniform(0, 1) draws (runif() already returns doubles).
set.seed(0)
n <- 1000
x <- runif(n)

# Run lengths n, n - 1, ..., 1 as integers.
ncval1 <- as.integer(n)
ncval2 <- rev(seq_len(ncval1))

# Index vector 1..n, 1..(n - 1), ..., 1.
ncval3 <- sequence(ncval2, from = 1L, by = 1L)

# Concatenate the successively shorter prefixes of x.
y <- x[ncval3]

This takes roughly 2.2 milliseconds. Maybe it could be sped up by exploiting the fact that the elements are repeated.

Upvotes: 2

Views: 85

Answers (1)

jay.sf
jay.sf

Reputation: 73592

You could use Rcpp.

Rcpp::sourceCpp(code='
  #include <Rcpp.h>
  #include <algorithm>

  // Draw n uniform(0, 1) variates r[0..n-1] and return the concatenation
  // r[0..n-1], r[0..n-2], ..., r[0..0], i.e. x[sequence(n:1)] in R terms.
  // The result has n * (n + 1) / 2 elements. Raises an R error for n < 0.
  // [[Rcpp::export]]
  Rcpp::NumericVector foo(int n) {
    if (n < 0) {
      Rcpp::stop("n must be non-negative");
    }
    // draw from standard uniform
    Rcpp::NumericVector r = Rcpp::runif(n);
    // length of result: n + (n - 1) + ... + 1 in closed form; computed in
    // R_xlen_t because the sum no longer fits in an int once n > 65535
    const R_xlen_t m = n;
    const R_xlen_t total = m * (m + 1) / 2;
    // subset and concatenate: each pass appends the first len draws
    Rcpp::NumericVector a(total);
    R_xlen_t p = 0;
    for (R_xlen_t len = m; len > 0; --len) {
      std::copy(r.begin(), r.begin() + len, a.begin() + p);
      p += len;
    }
    return a;
  }
')

Usage for n = 10

> set.seed(0)
> foo(10)
 [1] 0.8966972 0.2655087 0.3721239 0.5728534 0.9082078 0.2016819 0.8983897
 [8] 0.9446753 0.6607978 0.6291140 0.8966972 0.2655087 0.3721239 0.5728534
[15] 0.9082078 0.2016819 0.8983897 0.9446753 0.6607978 0.8966972 0.2655087
[22] 0.3721239 0.5728534 0.9082078 0.2016819 0.8983897 0.9446753 0.8966972
[29] 0.2655087 0.3721239 0.5728534 0.9082078 0.2016819 0.8983897 0.8966972
[36] 0.2655087 0.3721239 0.5728534 0.9082078 0.2016819 0.8966972 0.2655087
[43] 0.3721239 0.5728534 0.9082078 0.8966972 0.2655087 0.3721239 0.5728534
[50] 0.8966972 0.2655087 0.3721239 0.8966972 0.2655087 0.8966972

Benchmark

# Benchmark the pure-R approach from the question against the compiled foo()
# for n = 1000. Both expressions reseed the RNG so they draw the same values.
n <- 1e3
microbenchmark::microbenchmark(
  # OP: build the index vector with sequence() and subset x with it.
  OP={
    set.seed(0)
    ncval1 <- as.integer(n)
    ncval2 <- ncval1:1L
    ncval3 <- sequence(ncval2, from = 1L, by = 1L)
    x <- as.double(runif(n))
    x[ncval3]
  },
  # foo: the Rcpp implementation, seeded identically to OP.
  foo={set.seed(0); foo(n)}, 
  # Ask microbenchmark to verify that both expressions return identical results.
  check='identical'
)

$ Rscript --vanilla foo.R
Unit: milliseconds
 expr      min       lq     mean   median       uq      max neval cld
   OP 2.109090 2.199845 3.119882 2.294714 4.213308 7.297789   100  a 
  foo 1.055756 1.190470 1.983916 1.318557 2.741124 6.850159   100   b

According to median, foo() takes only 57% of the time.

Update

For a given vector x this simplifies to:

Rcpp::sourceCpp(code='
  #include <Rcpp.h>
  #include <algorithm>

  // Return the concatenation x[0..n-1], x[0..n-2], ..., x[0..0] where n is
  // the length of x, i.e. x[sequence(n:1)] in R terms. The result has
  // n * (n + 1) / 2 elements; an empty x yields an empty vector.
  // [[Rcpp::export]]
  Rcpp::NumericVector foo2(Rcpp::NumericVector x) {
    // length of vector (kept as R_xlen_t so it is not truncated to int)
    const R_xlen_t n = x.size();
    // length of result: n + (n - 1) + ... + 1 in closed form; the sum no
    // longer fits in an int once n > 65535
    const R_xlen_t total = n * (n + 1) / 2;
    // subset and concatenate: each pass appends the first len elements of x
    Rcpp::NumericVector a(total);
    R_xlen_t p = 0;
    for (R_xlen_t len = n; len > 0; --len) {
      std::copy(x.begin(), x.begin() + len, a.begin() + p);
      p += len;
    }
    return a;
  }
')

Usage

> set.seed(0)
> x <- runif(10)
> foo2(x)
 [1] 0.8966972 0.2655087 0.3721239 0.5728534 0.9082078 0.2016819 0.8983897
 [8] 0.9446753 0.6607978 0.6291140 0.8966972 0.2655087 0.3721239 0.5728534
[15] 0.9082078 0.2016819 0.8983897 0.9446753 0.6607978 0.8966972 0.2655087
[22] 0.3721239 0.5728534 0.9082078 0.2016819 0.8983897 0.9446753 0.8966972
[29] 0.2655087 0.3721239 0.5728534 0.9082078 0.2016819 0.8983897 0.8966972
[36] 0.2655087 0.3721239 0.5728534 0.9082078 0.2016819 0.8966972 0.2655087
[43] 0.3721239 0.5728534 0.9082078 0.8966972 0.2655087 0.3721239 0.5728534
[50] 0.8966972 0.2655087 0.3721239 0.8966972 0.2655087 0.8966972

Upvotes: 4

Related Questions