Leosar
Leosar

Reputation: 2072

A faster function to lower the resolution of a raster R

I am using the raster package to lower the resolution of big rasters, using the function aggregate like this

require(raster)    
x <- matrix(rpois(1000000, 2),1000)

a <-raster(x)
plot(a)

agg.fun <- function(x,...) 
    if(sum(x)==0){
        return(NA)
    } else {
        which.max(table(x))
    }

a1<-aggregate(a,fact=10,fun=agg.fun)
plot(a1)

the raster images I have to aggregate are much bigger 34000x34000 so I would like to know if there is a faster way to implement the agg.fun function.

Upvotes: 9

Views: 4319

Answers (4)

Robert Hijmans
Robert Hijmans

Reputation: 47591

If your goal is aggregation, wouldn't you want the max function?

library(raster)    
x <- matrix(rpois(1000000, 2),1000)
a <- aggregate(a,fact=10,fun=max)

Upvotes: 1

digEmAll
digEmAll

Reputation: 57220

Just for fun I created also an Rcpp function (not much faster than @JosephWood) :

########### original function 
#(modified to return most frequent value instead of index)
agg.fun <- function(x,...){
    if(sum(x)==0){
        return(NA)
    } else {
        as.integer(names(which.max(table(x))))
    }
}
########### @JosephWood function
fasterAgg.Fun <- function(x,...) {
    myRle.Alt <- function (x1) {
        n1 <- length(x1)
        y1 <- x1[-1L] != x1[-n1]
        i <- c(which(y1), n1)
        x1[i][which.max(diff(c(0L, i)))]
    }

    if (sum(x)==0) {
        return(NA)
    } else {
        myRle.Alt(sort(x, method="quick"))
    }
}
########### Rcpp function
library(Rcpp)
library(inline)

aggrRcpp <- cxxfunction(signature(values='integer'), '
                Rcpp::IntegerVector v(clone(values));
                std::sort(v.begin(),v.end());
                int n = v.size();
                double sum = 0;
                int currentValue = 0, currentCount = 0, maxValue = 0, maxCount = 0;
                for(int i=0; i < n; i++) {
                    int value = v[i];
                    sum += value;
                    if(i==0 || currentValue != value){
                      if(currentCount > maxCount){
                        maxCount = currentCount;
                        maxValue = currentValue;
                      }
                      currentValue = value;
                      currentCount = 0;
                    }else{
                      currentCount++;
                    }
                }
                if(sum == 0){
                  return Rcpp::IntegerVector::create(NA_INTEGER);
                }
                if(currentCount > maxCount){
                  maxCount = currentCount;
                  maxValue = currentValue;
                }
                return wrap( maxValue ) ;
        ', plugin="Rcpp", verbose=FALSE, 
        includes='')
# wrap it to support "..." argument
aggrRcppW <- function(x,...)aggrRcpp(x);

Benchmark :

require(raster)   
set.seed(123)
x <- matrix(rpois(10^8, 2), 10000)
a <- raster(x)

system.time(a1<-aggregate(a,fact=100,fun=agg.fun))
#   user  system elapsed 
#  35.13    0.44   35.87 
system.time(a2<-aggregate(a,fact=100,fun=fasterAgg.Fun))
#   user  system elapsed 
#   8.20    0.34    8.59
system.time(a3<-aggregate(a,fact=100,fun=aggrRcppW))
#   user  system elapsed 
#   5.77    0.39    6.22 

###########  all equal ?
all(TRUE,all.equal(a1,a2),all.equal(a2,a3))
# > [1] TRUE

Upvotes: 4

jbaums
jbaums

Reputation: 27408

You can use gdalUtils::gdalwarp for this. For me, it's less efficient than @JosephWood's fasterAgg.Fun for rasters with 1,000,000 cells, but for Joseph's larger example it's much faster. It requires that the raster exists on disk, so factor writing time into the below if your raster is in memory.

Below, I've used the modification of fasterAgg.Fun that returns the most frequent value, rather than its index in the block.

library(raster)
x <- matrix(rpois(10^8, 2), 10000)
a <- raster(x)

fasterAgg.Fun <- function(x,...) {
    myRle.Alt <- function (x1) {
        n1 <- length(x1)
        y1 <- x1[-1L] != x1[-n1]
        i <- c(which(y1), n1)
        x1[i][which.max(diff(c(0L, i)))]
    }

    if (sum(x)==0) {
        return(NA)
    } else {
        myRle.Alt(sort(x, method="quick"))
    }
}
system.time(a2 <- aggregate(a, fact=10, fun=fasterAgg.Fun))

##   user  system elapsed 
##  67.42    8.82   76.38

library(gdalUtils)
writeRaster(a, f <- tempfile(fileext='.tif'), datatype='INT1U')
system.time(a3 <- gdalwarp(f, f2 <- tempfile(fileext='.tif'), r='mode', 
                           multi=TRUE, tr=res(a)*10, output_Raster=TRUE))

##   user  system elapsed 
##   0.00    0.00    2.93

Note that there is a slight difference in the definition of the mode when there are ties: gdalwarp selects the highest value, while the functions passed to aggregate above (via which.max's behaviour) select the lowest (e.g., see which.max(table(c(1, 1, 2, 2, 3, 4)))).

Also, storing the raster data as integer is important (when applicable). If the data are stored as float (the writeRaster default), for example, the gdalwarp operation above takes ~14 sec on my system. See ?dataType for available types.

Upvotes: 6

Joseph Wood
Joseph Wood

Reputation: 7608

Try this:

fasterAgg.Fun <- function(x,...) {
    myRle.Alt <- function (x1) {
        n1 <- length(x1)
        y1 <- x1[-1L] != x1[-n1]
        i <- c(which(y1), n1)
        which.max(diff(c(0L, i)))
    }

    if (sum(x)==0) {
        return(NA)
    } else {
        myRle.Alt(sort(x, method="quick"))
    }
}

library(rbenchmark)
benchmark(FasterAgg=aggregate(a, fact=10, fun=fasterAgg.Fun),
            AggFun=aggregate(a, fact=10, fun=agg.fun),
            replications=10,
            columns = c("test", "replications", "elapsed", "relative"),
            order = "relative")
       test replications elapsed relative
1 FasterAgg           10  12.896    1.000
2    AggFun           10  30.454    2.362

For a larger test object, we have:

x <- matrix(rpois(10^8,2),10000)
a <- raster(x)
system.time(a2 <- aggregate(a, fact=10, fun=fasterAgg.Fun))
   user  system elapsed 
111.271  22.225 133.943  

system.time(a1 <- aggregate(a, fact=10, fun=agg.fun))
   user  system elapsed 
282.170  24.327 308.112

If you want the actual values as @digEmAll says in the comments above, simply change the return value in myRle.Alt from which.max(diff(c(0L, i))) to x1[i][which.max(diff(c(0L, i)))].

Upvotes: 4

Related Questions