Reputation: 13
I was trying to use rcpp/armadillo with openmp to speed up a loop in R. The loop takes a matrix with each row containing indices of a location vector(or matrix if it's 2D locations) as input(and other matrix/vec to be used). Inside the loop, I extracted each row of input indices matrix and find the corresponding locations, calculate distance matrix, and covariance matrix, do cholesky and backsolve, save the backsolve results to a new matrix. Here is the rcpp code:
`#include <iostream>
#include <RcppArmadillo.h>
#include <omp.h>
#include <Rcpp.h>
// [[Rcpp::plugins(openmp)]]
using namespace Rcpp;
using namespace arma;
using namespace std;
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::export]]
mat NZentries_new2 (int m, int nnp, const mat& locs, const umat& revNNarray, const mat& revCondOnLatent, const vec& nuggets, const vec covparms){
// initialized the output matrix
mat Lentries=zeros(nnp,m+1);
// initialized objects in parallel part
int n0; //number of !is_na elements
uvec inds;//
vec revCon_row;//
uvec inds00;//
vec nug;//
mat covmat;//
vec onevec;//
vec M;//
mat dist;//
int k;//
omp_set_num_threads(2);// selects the number of cores to use.
#pragma omp parallel for shared(locs,revNNarray,revCondOnLatent,nuggets,nnp,m,Lentries) private(k,M,dist,onevec,covmat,nug,n0,inds,revCon_row,inds00) default(none) schedule(static)
for (k = 0; k < nnp; k++) {
// extract a row to work with
inds=revNNarray.row(k).t();
revCon_row=revCondOnLatent.row(k).t();
if (k < m){
n0=k+1;
} else {
n0=m+1;
}
// extract locations
inds00=inds(span(m+1-n0,m))-ones<uvec>(n0);
nug=nuggets.elem(inds00) % (ones(n0)-revCon_row(span(m+1-n0,m))); // vec is vec, cannot convert to mat
dist=calcPWD2(locs.rows(inds00));
#pragma omp critical
{
//calculate covariance matrix
covmat= MaternFun(dist,covparms) + diagmat(nug) ; // summation from arma
}
// get last row of inverse Cholesky
onevec = zeros(n0);
onevec[n0-1] = 1;
M=solve(chol(covmat,"upper"),onevec);
// save the entries to matrix
Lentries(k,span(0,n0-1)) = M.t();
}
return Lentries;
}`
The current version works fine but speed is slow(almost the same as no parallel version), if I take the line in omp critical bracket out, it cause segment fault and R will be crashed. This MaterFun is a function I defined as below with several other small functions. So my question is that why MaternFun has to stay in the critical part.
// [[Rcpp::export]]
mat MaternFun( mat distmat, vec covparms ){
int d1 = distmat.n_rows;
int d2 = distmat.n_cols;
int j1;
int j2;
mat covmat(d1,d2);
double scaledist;
double normcon = covparms(0)/(pow(2.0,covparms(2)-1)*Rf_gammafn(covparms(2)));
for (j1 = 0; j1 < d1; j1++){
for (j2 = 0; j2 < d2; j2++){
if ( distmat(j1,j2) == 0 ){
covmat(j1,j2) = covparms(0);
} else {
scaledist = distmat(j1,j2)/covparms(1);
covmat(j1,j2) = normcon*pow( scaledist, covparms(2) )*
Rf_bessel_k(scaledist,covparms(2),1.0);
}
}
}
return covmat;
}
// [[Rcpp::export]]
double dist2(double lat1,double long1,double lat2,double long2) {
double dist = sqrt(pow(lat1 - lat2, 2) + pow(long1 - long2, 2)) ;
return (dist) ;
}
// [[Rcpp::export]]
mat calcPWD2( mat x) {//Rcpp::NumericMatrix
int outrows = x.n_rows ;
int outcols = x.n_rows ;
mat out(outrows, outcols) ;
for (int arow = 0 ; arow < outrows ; arow++) {
for (int acol = 0 ; acol < outcols ; acol++) {
out(arow, acol) = dist2(x(arow, 0),x(arow, 1),
x(acol, 0),x(acol, 1)) ; //extract element from mat
}
}
return (out) ;
}
Here is some sample inputs for testing the MaterFun
in R:
library(fields)
distmat=rdist(1:5) # distance matrix
covparms=c(1,0.2,1.5)
Upvotes: 1
Views: 430
Reputation: 20746
The issue is there are two calls to R math functions (Rf_bessel_k
and Rf_gammafn
) that require the access to be single threaded instead of parallel.
To get around this, let's add a dependency on boost
via BH
to obtain the cyl_bessel_k
and tgamma
functions. Alternatively, there is always the option of reimplementing R's besselK
and gamma
in C++ so it doesn't use the single-threaded R variant.
This gives:
#include <Rcpp.h>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/gamma.hpp>
// [[Rcpp::depends(BH)]]
// [[Rcpp::export]]
double besselK_boost(double x, double v) {
return boost::math::cyl_bessel_k(v, x);
}
// [[Rcpp::export]]
double gamma_fn_boost(double x) {
return boost::math::tgamma(x);
}
Test Code
x0 = 9.536743e-07
nu = -10
all.equal(besselK(x0, nu), besselK_boost(x0, nu))
# [1] TRUE
x = 2
all.equal(gamma(x), gamma_fn_boost(x))
# [1] TRUE
Note: The order of parameters for boost's variant differs from R's:
cyl_bessel_k(v, x)
Rf_bessel_k(x, v, expon.scaled = FALSE)
From here, we can modify the MaternFun
. Unfortunately, because calcPWD2
is missing, the furthest we can go is switching to use boost and incorporating in OpenMP protections.
#include <RcppArmadillo.h>
#include <boost/math/special_functions/bessel.hpp>
#include <boost/math/special_functions/gamma.hpp>
#ifdef _OPENMP
#include <omp.h>
#endif
// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::depends(BH)]]
// [[Rcpp::plugins(openmp)]]
// [[Rcpp::export]]
arma::mat MaternFun(arma::mat distmat, arma::vec covparms) {
int d1 = distmat.n_rows;
int d2 = distmat.n_cols;
int j1;
int j2;
arma::mat covmat(d1,d2);
double scaledist;
double normcon = covparms(0) /
(pow(2.0, covparms(2) - 1) * boost::math::tgamma(covparms(2)));
for (j1 = 0; j1 < d1; ++j1){
for (j2 = 0; j2 < d2; ++j2){
if ( distmat(j1, j2) == 0 ){
covmat(j1, j2) = covparms(0);
} else {
scaledist = distmat(j1, j2)/covparms(1);
covmat(j1, j2) = normcon * pow( scaledist, covparms(2) ) *
boost::math::cyl_bessel_k(covparms(2), scaledist);
}
}
}
return covmat;
}
Upvotes: 2