Reputation: 2940
What is the most efficient way to calculate max distance between a set of coordinates by group in R?
Sample data: I have data like this, but rather than the 10,000 rows used in this example, my real data has roughly 25 million entries.
library(data.table)
data <- data.table(latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE))
groupn <- nrow(data)/1000
data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
My current method is pretty slow:
data <- data[order(data$group),]
library(dplyr)
library(sf)
library(foreach)
distlist <- foreach(i = 1:groupn) %do% {
  tempsf <- st_as_sf(filter(data, group == i),
                     coords = c("longitude", "latitude"), crs = 4326)
  max(st_distance(tempsf, tempsf))
}
Can some genius out there help me speed this up?
Upvotes: 3
Views: 1711
Reputation: 520
Here is another way, using data.table and .SD:
> library(data.table)
> data <- data.table(
+ latitude=sample(seq(0,90,by=0.001), 10000, replace = TRUE),
+ longitude=sample(seq(0,180,by=0.001), 10000, replace = TRUE)
+ )
> groupn <- nrow(data)/1000
> data$group <- sample(seq(1,groupn,by=1),10000,replace=T)
>
> way1 <- function() {
+ data[,
+ .(maxdist = max(
+ dist(
+ .SD[1:.N, .(latitude, longitude)]
+ )
+ )),
+ keyby = group
+ ]
+ }
>
> way2 <- function() {
+ tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2])))
+ }
>
> system.time(out1 <- way1())
user system elapsed
0.16 0.03 0.18
> out1
group maxdist
1: 1 196.7296
2: 2 195.9555
3: 3 196.0794
4: 4 196.3476
5: 5 195.2577
6: 6 196.0791
7: 7 198.5209
8: 8 196.6944
9: 9 195.2630
10: 10 194.4611
>
> system.time(out2 <- way2())
user system elapsed
0.22 0.10 0.60
> out2
1 2 3 4 5 6 7 8 9 10
196.7296 195.9555 196.0794 196.3476 195.2577 196.0791 198.5209 196.6944 195.2630 194.4611
>
> library(microbenchmark)
> microbenchmark(way1(), way2())
Unit: milliseconds
expr min lq mean median uq max neval cld
way1() 172.3232 231.3411 327.1674 266.9135 370.9586 1569.7742 100 a
way2() 181.7716 228.1266 346.2764 285.8394 444.8963 800.4725 100 a
Upvotes: 0
Reputation: 2940
Thanks to Juan Antonio for the idea to use tapply. I ended up using the sp-based function you built; it is the fastest.
auxF <- function(x) {
  require(sp)
  # great-circle distances (km by default for lon/lat data) between all points in the group
  tempsf <- data[x, 1:2]
  coordinates(tempsf) <- c("longitude", "latitude")
  proj4string(tempsf) <- "+proj=longlat +ellps=WGS84 +no_defs"
  return(max(spDists(tempsf)))
}
out1 <- tapply(1:nrow(data), data$group, auxF)
The dt.haversine function that @SymbolixAU (awesome as usual) built also works:
# Vectorised haversine distance (in metres; r defaults to the Earth's equatorial radius)
dt.haversine <- function(lat_from, lon_from, lat_to, lon_to, r = 6378137){
  # convert degrees to radians
  radians <- pi/180
  lat_to <- lat_to * radians
  lat_from <- lat_from * radians
  lon_to <- lon_to * radians
  lon_from <- lon_from * radians
  dLat <- (lat_to - lat_from)
  dLon <- (lon_to - lon_from)
  # haversine formula
  a <- (sin(dLat/2)^2) + (cos(lat_from) * cos(lat_to)) * (sin(dLon/2)^2)
  return(2 * atan2(sqrt(a), sqrt(1 - a)) * r)
}
# dt.haversine is vectorised over lat/lon vectors, so expand all point pairs
# within each group and take the maximum distance
out1 <- tapply(1:nrow(data), data$group, function(x) {
  n <- length(x)
  max(dt.haversine(rep(data$latitude[x], each = n), rep(data$longitude[x], each = n),
                   rep(data$latitude[x], times = n), rep(data$longitude[x], times = n)))
})
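For reference, the same computation can be expressed with data.table grouping instead of tapply. This is only a sketch that assumes the data, latitude, longitude and group columns created above; out_dt and maxdist are illustrative names:

out_dt <- data[, .(maxdist = {
  # expand all point pairs within the group; O(n^2) values, fine for ~1,000 points per group
  n <- .N
  max(dt.haversine(rep(latitude, each = n), rep(longitude, each = n),
                   rep(latitude, times = n), rep(longitude, times = n)))
}), keyby = group]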
Upvotes: 2
Reputation: 2047
Try this:
Euclidean dist:
> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
user system elapsed
0.14 0.00 0.14
> out1
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
WGS84:
> auxF <- function(x) {
+ require(sp)
+
+ tempsf <- data[x, 1:2]
+ coordinates(tempsf) <- c("longitude", "latitude")
+ proj4string(tempsf) = "+proj=longlat +ellps=WGS84 +no_defs"
+ return(max(spDists(tempsf)))
+ }
>
> system.time(out2 <- tapply(1:nrow(data), data$group, auxF))
user system elapsed
4.71 0.00 4.76
> out2
1 2 3 4 5 6 7 8 9 10
19646.04 19217.48 19223.27 19543.99 19318.55 18856.65 19334.11 19679.45 18840.90 19460.14
Haversine method:
> library(geosphere)
> system.time(out3 <- tapply(1:nrow(data), data$group, function(x) max(distm(as.matrix(data[x, .(longitude, latitude)]), fun = distHaversine))))
user system elapsed
13.24 0.01 13.30
> out3
1 2 3 4 5 6 7 8 9 10
19644749 19216989 19223012 19542956 19317958 18856273 19333424 19677917 18840641 19459353
For 7 million records you can either assume a Euclidean distance or project your points to a plane so that the Euclidean distance applies. Since the maximum distance within a group is always between two points of its convex hull, computing the hull first greatly reduces the number of pairwise distances and does not require a lot of RAM:
> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
user system elapsed
0.03 0.00 0.03
> out4
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
With big data:
> data <- data.table(latitude=sample(seq(0,90,by=0.001), 7000000, replace = TRUE),
+ longitude=sample(seq(0,180,by=0.001), 7000000, replace = TRUE))
> groupn <- nrow(data)/700000
> data$group <- sample(seq(1,groupn,by=1),7000000,replace=T)
>
> system.time(out1 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2]))))
Error: cannot allocate vector of size 1824.9 Gb
Called from: dist(data[x, 1:2])
Browse[1]>
Timing stopped at: 7.81 0.06 7.91
> system.time(out4 <- tapply(1:nrow(data), data$group, function(x) max(dist(data[x, 1:2][chull(data[x, 1:2]), ]))))
user system elapsed
8.41 0.22 8.64
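In case it is useful, here is a minimal sketch of the "project your points to a plane" variant mentioned above. The CRS (EPSG:32633, UTM zone 33N) and the names xy and out5 are illustrative assumptions; pick a projection suited to the extent of your own data, since the trick only makes sense when each group fits the chosen plane reasonably well.

library(sf)

# project lon/lat points to a planar CRS, then apply the convex-hull + dist() trick in metres
xy <- st_coordinates(
  st_transform(
    st_as_sf(data, coords = c("longitude", "latitude"), crs = 4326),
    32633  # illustrative UTM zone; choose one matching your study area
  )
)
out5 <- tapply(1:nrow(data), data$group, function(x) {
  p <- xy[x, , drop = FALSE]
  max(dist(p[chull(p), , drop = FALSE]))
})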
Upvotes: 2