Reputation: 7063
I want to have a very quick search and it seems, that using hashes (via environments) is the best way. Now, I got an example to run with environments, but it does not return what I need.
Here is an example:
a <- data.table::data.table(a=c(1, 3, 5), b=c(2, 4, 6), time=c(10, 20, 30))
my_env <- list2env(a)
x <- a[2, .(a, b)] # x=c(3,4)
found <- get("x", envir = my_env)
I would expect found = c(3, 4, 20)
but receive found = c(3, 4)
(I want the whole row to be returned instead of the unknown row subset)
Backround: I have a huge list containing source and destination of routes calculated with osrm, e.g.
lattitude1, longitude1, lattitude2, longitude2, travel-time
46.12, 8.32, 47.87, 9.92, 1036
...
The list contains in a first example about 100000 rows. Using binary search in a data.table speeded up my code by a factor 100, but one search still takes 1 ms. As I have to search for many routes during a simulation (About 2e5 searches) I would like to get even faster.
@Gregor: I am a beginner in R, but I don't think my question is a duplicate:
Summary (Thanks to DigEmAll for his running example below):
Upvotes: 4
Views: 419
Reputation: 57220
Here's an example using enviroment and data.table, the code is pretty self-explanatory :
library(data.table)
# create a big random example (160k rows)
set.seed(123)
fromTo <- expand.grid(1:400,1:400)
colnames(fromTo) <- c('a','b')
DF <- as.data.frame(cbind(fromTo,time=as.integer(runif(nrow(fromTo), min = 1, max=500))))
# setup the environment to use it as hashtable:
# we simply put the times inside an enviroment using
# a|b (concatenation of a with b) as key
timesList <- as.list(DF$time)
names(timesList) <- paste(DF$a,DF$b,sep='|')
timesEnv <- list2env(timesList)
# setup the data.table to use it as hashtable
DT <- setDT(DF,key=c('a','b'))
# create search functions
searchUsingEnv <- function(a,b){
time <- get(paste(a,b,sep='|'),envir=timesEnv,inherits=FALSE)
return(time)
}
searchUsingDataTable <- function(from,to){
time <- DT[.(from,to),time]
return(time)
}
Benchmark :
# benchmark functions
# i.e. we try to search ~16K rows in ourtwo kind of hashtables
benchEnv <- function(){
n <- nrow(fromTo)
s <- as.integer(n * 0.9)
for(i in s:n){
searchUsingEnv(fromTo[i,'a'],fromTo[i,'b'])
}
}
benchDT <- function(){
n <- nrow(fromTo)
s <- as.integer(n * 0.9)
for(i in s:n){
searchUsingDataTable(fromTo[i,'a'],fromTo[i,'b'])
}
}
# let's measure the performances
> system.time(benchEnv(), gcFirst = TRUE)
user system elapsed
2.26 0.00 2.30
> system.time(benchDT(), gcFirst = TRUE)
user system elapsed
42.34 0.00 42.56
Conclusions:
environment seems much faster then data.table for repeated single key access, so you can try to use it.
Enviroments have fast access but they can only have string keys which occupy more memory than doubles. So, I've added an example using Rcpp
and std::map<>
with a multiple values map :
(note: if you are on Windows you need to install RTools in order to make Rcpp work)
library(data.table)
library(Rcpp)
library(inline)
nRows <- 1e7
############# create data.table "DT" containing coordinates and times
generate_routes_dt <- function(nmax) {
set.seed(123)
routes <- data.table(lat1 = numeric(nmax),
lng1 = numeric(nmax),
lat2 = numeric(nmax),
lng2 = numeric(nmax),
time = numeric(nmax))
tmp <- sample(seq(46, 49, length.out = nmax), nmax)
routes$lat1 <- tmp
tmp <- sample(seq(8, 10, length.out = nmax), nmax)
routes$lng1 <- tmp
tmp <- sample(seq(46, 49, length.out = nmax), nmax)
routes$lat2 <- tmp
tmp <- sample(seq(8, 10, length.out = nmax), nmax)
routes$lng2 <- tmp
tmp <- sample(seq(0, 1e7, length.out = nmax), nmax)
routes$time <- as.integer(tmp)
data.table::setkey(routes, lat1, lng1, lat2, lng2)
return(routes)
}
DT <- generate_routes_dt(nRows)
############# create data.table search function
searchUsingDataTable <- function(lat_1,lng_1,lat_2,lng_2){
time <- DT[.(lat_1,lng_1,lat_2,lng_2),time]
return(time)
}
#############
############# create Rcpp search function
# the following code create 2 functions: createMap and getTime
# usage:
# map <- createMap(lat1Vec,lng1Vec,lat2Vec,lng2Vec,timesVec)
# t <- getTime(map,lat1,lng1,lat2,lng2)
sourceCpp(code=
'
#include <Rcpp.h>
class MultiKey {
public:
double lat1;
double lng1;
double lat2;
double lng2;
MultiKey(double la1, double ln1, double la2, double ln2)
: lat1(la1), lng1(ln1), lat2(la2), lng2(ln2) {}
bool operator<(const MultiKey &right) const
{
if ( lat1 == right.lat1 ) {
if ( lng1 == right.lng1 ) {
if ( lat2 == right.lat2 ) {
return lng2 < right.lng2;
}
else {
return lat2 < right.lat2;
}
}
else {
return lng1 < right.lng1;
}
}
else {
return lat1 < right.lat1;
}
}
};
// [[Rcpp::export]]
SEXP createMap(Rcpp::NumericVector lat1,
Rcpp::NumericVector lng1,
Rcpp::NumericVector lat2,
Rcpp::NumericVector lng2,
Rcpp::NumericVector times){
std::map<MultiKey, double>* map = new std::map<MultiKey, double>;
int n1 = lat1.size();
int n2 = lng1.size();
int n3 = lat2.size();
int n4 = lng2.size();
int n5 = times.size();
if(!(n1 == n2 && n2 == n3 && n3 == n4 && n4 == n5)){
throw std::range_error("input vectors lengths are different");
}
for(int i = 0; i < n1; i++){
MultiKey key(lat1[i],lng1[i],lat2[i],lng2[i]);
map->insert(std::pair<MultiKey, double>(key, times[i]));
}
Rcpp::XPtr< std::map<MultiKey, double> > p(map, true);
return( p );
}
// [[Rcpp::export]]
Rcpp::NumericVector getTime(SEXP mapPtr,
double lat1,
double lng1,
double lat2,
double lng2){
Rcpp::XPtr< std::map<MultiKey, double> > ptr(mapPtr);
MultiKey key(lat1,lng1,lat2,lng2);
std::map<MultiKey,double>::iterator it = ptr->find(key);
if(it == ptr->end())
return R_NilValue;
return Rcpp::wrap(it->second);
}
')
map <- createMap(DT$lat1,DT$lng1,DT$lat2,DT$lng2,DT$time)
searchUsingRcpp <- function(lat_1,lng_1,lat_2,lng_2){
time <- getTime(map,lat_1,lng_1,lat_2,lng_2)
return(time)
}
#############
############# benchmark
set.seed(1234)
rowsToSearchOneByOne <- DT[sample.int(nrow(DT),size=nrow(DT),replace=FALSE),]
bench <- function(searchFun2Use){
for(i in nrow(rowsToSearchOneByOne)){
key <- rowsToSearchOneByOne[i,]
searchFun2Use(key$lat1,key$lng1,key$lat2,key$lng2)
}
}
microbenchmark::microbenchmark(
bench(searchUsingRcpp),
bench(searchUsingDataTable),
times=100)
#############
Benchmark result :
Unit: microseconds
expr min lq mean median uq max neval
bench(searchUsingRcpp) 360.959 381.7585 400.4466 391.999 403.9985 665.597 100
bench(searchUsingDataTable) 1103.034 1138.0740 1214.3008 1163.514 1224.9530 2035.828 100
Note:
I really don't think that using double as keys is a good idea... floating point values should be used to search using a certain tolerance or inside a range, not to look up for perfect match inside a map.
Upvotes: 4