Reputation: 498
I have written a recursive function in R and used memoise to speed it up. I have tried to speed it up further by writing it in Rcpp and then memoising the Rcpp function, but the R function is faster. Why is this, and is there any way to speed this up in my usage?
require(microbenchmark)
require(Rcpp)
require(memoise)
Rcpp function:
cppFunction('
double FunCpp (unsigned int i, double d1, double d2,
double p, double s, unsigned int imax,
double n, double k, double r,
double m, double t) {
if (i == 0) return 0;
if (i == 1) return log2(-1*d1);
if (i == 2) return log2(d2*d1 - p*s);
double x = log2(fabs(-(((imax - (n - i))/imax)*k*r + m + (n - i)*t)));
x = x + FunCpp(i-1, d1, d2, p, s, imax, n, k, r, m, t);
double y = log2((n - i + 1)*t*k*r*((imax - ((n - i + 1) - 1))/imax));
y = y + FunCpp(i-2, d1, d2, p, s, imax, n, k, r, m, t);
return x + log2(1 - pow(2,y-x));
}
')
FunCpp = memoise(FunCpp)
R function:
FunR = memoise(function(i, d1, d2, p, s, imax, n, k, r, m, t) {
if(i == 0) 0
else if(i == 1) log2(-1*d1)
else if(i == 2) log2(d2*d1 - p*s)
else {
x = log2(abs(-(((imax - (n - i))/imax)*k*r + m + (n - i)*t)))
x = x + FunR(i-1, d1, d2, p, s, imax, n, k, r, m, t)
y = log2((n - i + 1)*t*k*r*((imax - ((n - i + 1) - 1))/imax))
y = y + FunR(i-2, d1, d2, p, s, imax, n, k, r, m, t)
x + log2(1 - 2^(y-x))
}
})
This speed comparison is realistic for my purposes. The recursive function is used on a range of integers, but afterward, it won't be called with the same inputs again. So for speed comparison, here I'm calling the functions from within other functions, and after I'm done calling the recursive functions I use forget() to reset the cache.
TestFunR = function() {
x = sapply(1:31, function(i) {
FunR(i = 31-i, d1 = -152, d2 = -147.33, p = 150, s = 0.03,
imax = 30, n = 31, k = 1, r = 1, m = 2, t = 5)
})
forget(FunR)
}
TestFunCpp = function() {
x = sapply(1:31, function(i) {
FunCpp(i = 31-i, d1 = -152, d2 = -147.33, p = 150, s = 0.03,
imax = 30, n = 31, k = 1, r = 1, m = 2, t = 5)
})
forget(FunCpp)
}
microbenchmark(TestFunR(), TestFunCpp())
Unit: milliseconds
expr min lq mean median uq max neval cld
TestFunR() 9.979738 10.4910 12.83228 10.91887 11.89264 61.61513 100 a
TestFunCpp() 520.955483 528.6965 547.31103 536.73058 547.66377 729.57631 100 b
Edit: I had gotten a method working from Dirk's book prior to posting this.
includeText = '
#include <algorithm>
#include <vector>
#include <stdexcept>
#include <cmath>
#include <iostream>
class F {
public:
F(unsigned int n = 200, double d1 = 0, double d2 = 0, double p = 0, double s = 0) {
memo.resize(n);
std::fill( memo.begin(), memo.end(), NAN );
memo[0] = 0;
memo[1] = log2(-1*d1);
memo[2] = log2(d2*d1 - p*s);
}
double FunIL(int i, double d1, double d2, double p, double s, double imax,
double n, double k, double r, double m, double t) {
if (i < 0) return((double) NAN);
if (i >= (int) memo.size()) throw std::range_error(\"i too large\");
if (!std::isnan(memo[i])) return(memo[i]);
double x = log2(fabs(-(((imax - (n - i))/imax)*k*r + m + (n - i)*t)));
x = x + FunIL(i-1, d1, d2, p, s, imax, n, k, r, m, t);
double y = log2((n - i + 1)*t*k*r*((imax - ((n - i + 1) - 1))/imax));
y = y + FunIL(i-2, d1, d2, p, s, imax, n, k, r, m, t);
memo[i] = x + log2(1 - pow(2,y-x));
return(memo[i]);
}
private:
std::vector< double > memo;
};
'
bodyText = '
int is = Rcpp::as<int>(i);
double d1s = Rcpp::as<double>(d1);
double d2s = Rcpp::as<double>(d2);
double ps = Rcpp::as<double>(p);
double ss = Rcpp::as<double>(s);
double imaxs = Rcpp::as<double>(imax);
double ns = Rcpp::as<double>(n);
double ks = Rcpp::as<double>(k);
double rs = Rcpp::as<double>(r);
double ms = Rcpp::as<double>(m);
double ts = Rcpp::as<double>(t);
F f(ns, d1s, d2s, ps, ss);
return Rcpp::wrap( f.FunIL(is, d1s, d2s, ps, ss, imaxs, ns, ks, rs, ms, ts) );
'
FunInline = cxxfunction(signature(i = "integer", d1 = "numeric", d2 = "numeric", p = "numeric",
s = "numeric", imax = "numeric", n = "numeric", k = "numeric",
r = "numeric", m = "numeric", t = "numeric"),
plugin = "Rcpp",
verbose = T,
incl = includeText,
body = bodyText)
It works equally well (see TestFunInline):
microbenchmark(TestFunR(), TestFunCpp(), TestFunCpp_Mem(), TestFunInline())
Unit: microseconds
expr min lq mean median uq max neval cld
TestFunR() 8871.251 9067.758 10301.8003 9287.5725 9593.1310 19270.081 100 b
TestFunCpp() 514415.356 517160.251 522431.2980 519321.6130 523811.7640 584812.731 100 c
TestFunCpp_Mem() 245.474 264.291 284.8908 281.6105 292.0885 526.870 100 a
TestFunInline() 279.686 295.723 378.2134 306.8425 316.0370 6621.364 100 a
However, I couldn't get it to work with doParallel. I'm optimizing an objective function on each process using the optim and optimx packages, and when I use %do% it works, but when I use %dopar% all I see is that the objective function cannot be evaluated at the initial parameters. I took Dirk's advice from his many other posts and put Coatless's method into a package, but I'm not sure about how to put the method from Dirk's book into a package. It's just my inexperience in C++.
Edit 2: It finally clicked how to put Dirk's method in a source file in my package. I know there are other discussions about using Rcpp with doParallel, but I'm putting this code here because it is another good way to solve my problem, and by adding this code to a source file in my package, it happened to be much easier for me to get this to work in my parallel approach than inline was.
class F {
public:
F(unsigned int n = 200, double d1 = 0, double d2 = 0, double p = 0, double s = 0) {
memo.resize(n);
std::fill( memo.begin(), memo.end(), NAN );
memo[0] = 0;
memo[1] = log2(-1*d1);
memo[2] = log2(d2*d1 - p*s);
}
double FunIL(int i, double d1, double d2, double p, double s, double imax,
double n, double k, double r, double m, double t) {
if (i < 0) return((double) NAN);
if (i >= (int) memo.size()) throw std::range_error("\"i too large\"");
if (!std::isnan(memo[i])) return(memo[i]);
double x = log2(fabs(-(((imax - (n - i))/imax)*k*r + m + (n - i)*t)));
x = x + FunIL(i-1, d1, d2, p, s, imax, n, k, r, m, t);
double y = log2((n - i + 1)*t*k*r*((imax - ((n - i + 1) - 1))/imax));
y = y + FunIL(i-2, d1, d2, p, s, imax, n, k, r, m, t);
memo[i] = x + log2(1 - pow(2,y-x));
return(memo[i]);
}
private:
std::vector< double > memo;
};
// [[Rcpp::export]]
double FunDirk(int i, double d1, double d2, double p, double s,
double imax, double n, double k, double r,
double m, double t) {
F f(n, d1, d2, p, s);
return f.FunIL(i, d1, d2, p, s, imax, n, k, r, m, t);
}
Upvotes: 5
Views: 485
Reputation: 20746
Well, first let's think about the purpose of memoise
. The goal of memoise
is to cache function results and reuse them. Thus, after one calculation it no longer needs to recalculate the value again for any other sequence in the computation it can just retrieve the value from the cache. This is particularly relevant for the recursive structure set up.
memoise
cache access with respect to R and C++The setup for memoisize is to cache R value function values. In this case, it is caching those values. However, the C++ code cannot access the cached values. Thus, the C++ version recalculates each of those values. In essence, you are essentially using:
x = sapply(1:31, function(i) {
FunCpp(i = 31-i, d1 = -152, d2 = -147.33, p = 150, s = 0.03,
imax = 30, n = 31, k = 1, r = 1, m = 2, t = 5)
})
Disclaimer: There should be a more formal argument next but its been awhile.
To understand algorithms, sometimes we need to use what's called Big O notation that allows us to observe how the code performs asymptotically. Now, the Big O in this case is O(2^N) because of two calls to compute: Fun(i-1)
and FunR(i-2)
. However, memoise
uses a hash map / look up table with probably a Big O of O(n)
at its worst and O(1)
at its best. In essence, we have constant vs. exponential asymptotic results.
However, that does not necessarily mean that the C++ function is garbage. One of the downsides of R to Rcpp and back bridge is the lag time between transferring values between the two domains. Thus, one way we can slightly lower the computation time is by placing the loop completely in C++.
e.g.
// [[Rcpp::export]]
Rcpp::NumericVector FunCpp_loop(unsigned int e,
double d1, double d2,
double p, double s, unsigned int imax,
double n, double k, double r,
double m, double t){
Rcpp::NumericVector o(e);
for(unsigned int i = 0; i < e; i++){
o(i) = FunCpp(31-(i+1), -152, -147.33, 150, 0.03, 30, 31, 1, 1, 2, 5);
}
return o;
}
However, the benchmarks here do not really improve the situation (even by pre-creating the vector 1:31
)
Unit: milliseconds
expr min lq mean median uq max neval
TestFunR(tv) 8.467568 9.077262 9.986837 9.449952 10.60555 14.91243 100
TestFunCpp(tv) 476.678391 482.489094 487.687811 486.351087 490.25346 579.38161 100
TestFunCpp_loop() 478.348070 482.588307 488.234200 486.211347 492.33965 521.10918 100
We can apply the same memoziation techniques given in memoise
in C++. The implementation is not as pretty and nice but it serves to show the same principles are applicable.
To start, we'll make a hashmap.
// Memoization structure to hold the hash map
struct mem_map{
// Initializer to create the static (presistent) map
static std::map<int, double> create_map()
{
std::map<int, double> m;
m.clear();
return m;
}
// Name of the static map for the class
static std::map<int, double> memo;
};
// Actuall instantiate the class in the global scope (I know, bad me...)
std::map<int, double> mem_map::memo = mem_map::create_map();
Now, we probably should make some accessors to work with the map object.
// Reset the map
// [[Rcpp::export]]
void clear_mem(){
mem_map::memo.clear();
}
// Get the values of the map.
// [[Rcpp::export]]
std::map<int, double> get_mem(){
return mem_map::memo;
}
Lastly, let's change around some internal things in your function.
// Users function
// [[Rcpp::export]]
double FunCpp_Mem (int i, double d1, double d2,
double p, double s, unsigned int imax,
double n, double k, double r,
double m, double t) {
// We have already computed the value...
if(mem_map::memo.count(i) > 0)
return mem_map::memo[i];
// Otherwise, let us get ready to compute it!
double res = 0;
if (i <= 2){
if (i <= 0) { // i == 1
res = 0.0;
}else if (i == 1) {
res = log2(-1.0*d1);
}else { // i == 2
res = log2(d2*d1 - p*s);
}
// Store result in hashmap
mem_map::memo[i] = res;
return res;
}
// Calculate if not in special case.
double x = log2(fabs(-(((imax - (n - i))/imax)*k*r + m + (n - i)*t)));
x = x + FunCpp_Mem(i-1, d1, d2, p, s, imax, n, k, r, m, t);
double y = log2((n - i + 1)*t*k*r*((imax - ((n - i + 1) - 1))/imax));
y = y + FunCpp_Mem(i-2, d1, d2, p, s, imax, n, k, r, m, t);
res = x + log2(1 - pow(2,y-x));
// Update the hashmap for uncalculated value
mem_map::memo[i] = res;
return res;
}
Whew Lots of work. Let's test it out to see if it was worth it.
# Benchmark for Rcpp Memoization
TestFunCpp_mem = function(tv) {
x = sapply(tv, function(i) {
FunCpp_Mem(i = 31-i, d1 = -152, d2 = -147.33, p = 150, s = 0.03,
imax = 30, n = 31, k = 1, r = 1, m = 2, t = 5)
})
clear_mem()
}
TestFunR = function(tv) {
x = sapply(tv, function(i) {
FunR(i = 31-i, d1 = -152, d2 = -147.33, p = 150, s = 0.03,
imax = 30, n = 31, k = 1, r = 1, m = 2, t = 5)
})
forget(FunR)
}
# Pre-generate vector
tv = 1:31
microbenchmark(TestFunR(tv),TestFunCpp_mem(tv))
And the results....
microbenchmark(TestFunR(tv),TestFunCpp_mem(tv))
Unit: microseconds
expr min lq mean median uq max neval
TestFunR(tv) 8246.324 8662.694 9345.6947 9009.868 9797.126 13001.995 100
TestFunCpp_mem(tv) 203.832 214.939 253.7931 228.898 240.906 1277.325 100
The Cpp Function with memoization is about 40.5x faster than the R version! Memoization was definitely worth it!
Upvotes: 7