Reputation: 2940
What is the most efficient way to calculate max distance between a set of coordinates by group in R?
Sample data: I have data like this, but rather than the 10,000 rows used in this example, my real data has roughly 25 million entries.
library(data.table)
data <- data.table(latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE))
groupn <- nrow(data)/1000
data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
My current method is pretty slow:
data <- data[order(data$group),]
library(dplyr)
library(sf)
library(foreach)
distlist <- foreach(i = 1:groupn) %do% {
  tempsf <- st_as_sf(filter(data, group == i),
                     coords = c("longitude", "latitude"), crs = 4326)
  max(st_distance(tempsf, tempsf))
}
Can some genius out there help me speed this up?
Upvotes: 3
Views: 1711
Reputation: 520
Here is another way, using data.table and .SD:
> library(data.table)
> data <- data.table(
+ latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
+ longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE)
+ )
> groupn <- nrow(data)/1000
> data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
>
> way1 <- function() {
+ data[,
+ .(maxdist = max(
+ dist(
+ .SD[1:.N, .(latitude, longitude)]
+ )
+ )),
+ keyby = group
+ ]
+ }
>
> way2 <- function() {
+ tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2])))
+ }
>
> system.time(out1 <- way1())
user system elapsed
0.16 0.03 0.18
> out1
group maxdist
1: 1 196.7296
2: 2 195.9555
3: 3 196.0794
4: 4 196.3476
5: 5 195.2577
6: 6 196.0791
7: 7 198.5209
8: 8 196.6944
9: 9 195.2630
10: 10 194.4611
>
> system.time(out2 <- way2())
user system elapsed
0.22 0.10 0.60
> out2
1 2 3 4 5 6 7 8 9 10
196.7296 195.9555 196.0794 196.3476 195.2577 196.0791 198.5209 196.6944 195.2630 194.4611
>
> library(microbenchmark)
> microbenchmark(way1(), way2())
Unit: milliseconds
expr min lq mean median uq max neval cld
way1() 172.3232 231.3411 327.1674 266.9135 370.9586 1569.7742 100 a
way2() 181.7716 228.1266 346.2764 285.8394 444.8963 800.4725 100 a
Upvotes: 0
Reputation: 2940
Thanks to Juan Antonio for the idea to use tapply. I ended up using the sp-based function you built; it is the fastest.
auxF <- function(x) {
  require(sp)
  # great-circle distances (km by default for lon/lat data) between all points in the group
  tempsf <- data[x, 1:2]
  coordinates(tempsf) <- c("longitude", "latitude")
  proj4string(tempsf) <- "+proj=longlat +ellps=WGS84 +no_defs"
  return(max(spDists(tempsf)))
}
out1 <- tapply(1:nrow(data), data$group, auxF)
The dt.haversine function that @SymbolixAU (awesome as usual) built also works:
# Vectorised haversine distance (in metres; r defaults to the Earth's equatorial radius)
dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){
  # convert degrees to radians
  radians <- pi/180
  lat_to <- lat_to * radians
  lat_from <- lat_from * radians
  lon_to <- lon_to * radians
  lon_from <- lon_from * radians
  dLat <- (lat_to - lat_from)
  dLon <- (lon_to - lon_from)
  # haversine formula
  a <- (sin(dLat/2)^2) + (cos(lat_from) * cos(lat_to)) * (sin(dLon/2)^2)
  return(2 * atan2(sqrt(a), sqrt(1 - a)) * r)
}
# dt.haversine is vectorised over lat/lon vectors, so expand all point pairs
# within each group and take the maximum distance
out1 <- tapply(1:nrow(data), data$group, function(x) {
  n <- length(x)
  max(dt.haversine(rep(data$latitude[x], each = n), rep(data$longitude[x], each = n),
                   rep(data$latitude[x], times = n), rep(data$longitude[x], times = n)))
})
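For reference, the same computation can be expressed with data.table grouping instead of tapply. This is only a sketch that assumes the data, latitude, longitude and group columns created above; out_dt and maxdist are illustrative names:

out_dt <- data[, .(maxdist = {
  # expand all point pairs within the group; O(n^2) values, fine for ~1,000 points per group
  n <- .N
  max(dt.haversine(rep(latitude, each = n), rep(longitude, each = n),
                   rep(latitude, times = n), rep(longitude, times = n)))
}), keyby = group]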
Upvotes: 2
Reputation: 2047
Try this:
Euclidean dist:
> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
user system elapsed
0.14 0.00 0.14
> out1
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
WGS84:
> auxF <- function(x) {
+ require(sp)
+
+ tempsf <- data[x, 1:2]
+ coordinates(tempsf) <- c("longitude", "latitude")
+ proj4string(tempsf) = "+proj=longlat +ellps=WGS84 +no_defs"
+ return(max(spDists(tempsf)))
+ }
>
> system.time(out2 <- tapply(1:nrow(data), data$group, auxF))
user system elapsed
4.71 0.00 4.76
> out2
1 2 3 4 5 6 7 8 9 10
19646.04 19217.48 19223.27 19543.99 19318.55 18856.65 19334.11 19679.45 18840.90 19460.14
Haversine method:
> library(geosphere)
> system.time(out3 <- tapply(1:nrow(data), data$group, function(x) max(distm(as.matrix(data[x, .(longitude, latitude)]), fun = distHaversine))))
user system elapsed
13.24 0.01 13.30
> out3
1 2 3 4 5 6 7 8 9 10
19644749 19216989 19223012 19542956 19317958 18856273 19333424 19677917 18840641 19459353
For 7 million records you can either assume a Euclidean distance or project your points to a plane so that the Euclidean distance applies. Since the maximum distance within a group is always between two points of its convex hull, computing the hull first greatly reduces the number of pairwise distances and does not require a lot of RAM:
> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
user system elapsed
0.03 0.00 0.03
> out4
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
With big data:
> data <- data.table(latitude=sample(seq(0,90,by=0.001), 7000000, replace = TRUE),
+ longitude=sample(seq(0,180,by=0.001), 7000000, replace = TRUE))
> groupn <- nrow(data)/700000
> data$group <- sample(seq(1,groupn,by=1),7000000,replace=T)
>
> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
Error: cannot allocate vector of size 1824.9 Gb
Called from: dist(data[x, 1:2])
Browse[1]>
Timing stopped at: 7.81 0.06 7.91
> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
user system elapsed
8.41 0.22 8.64
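In case it is useful, here is a minimal sketch of the "project your points to a plane" variant mentioned above. The CRS (EPSG:32633, UTM zone 33N) and the names xy and out5 are illustrative assumptions; pick a projection suited to the extent of your own data, since the trick only makes sense when each group fits the chosen plane reasonably well.

library(sf)

# project lon/lat points to a planar CRS, then apply the convex-hull + dist() trick in metres
xy <- st_coordinates(
  st_transform(
    st_as_sf(data, coords = c("longitude", "latitude"), crs = 4326),
    32633  # illustrative UTM zone; choose one matching your study area
  )
)
out5 <- tapply(1:nrow(data), data$group, function(x) {
  p <- xy[x, , drop = FALSE]
  max(dist(p[chull(p), , drop = FALSE]))
})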
Upvotes: 2