Felipe Alvarenga

Reputation: 2652

Faster way of evaluating points inside a list of polygons in R

I have two data sets: one with over 13 million rectangular polygons (each a set of 4 lat/lng points) and another with 10 thousand points, each carrying a price for its location.

> polygons
     id                                 pol_lat                                 pol_lng
 1: 148 -4.250236,-4.250236,-4.254640,-4.254640 -49.94628,-49.94494,-49.94494,-49.94628
 2: 149 -4.254640,-4.254640,-5.361601,-5.361601 -49.94494,-49.07906,-49.07906,-49.94494
 3: 150 -5.361601,-5.361601,-5.212208,-5.212208 -49.07906,-49.04469,-49.04469,-49.07906
 4: 151 -5.212208,-5.212208,-5.002878,-5.002878 -49.04469,-48.48664,-48.48664,-49.04469
 5: 152 -5.002878,-5.002878,-5.080018,-5.080018 -48.48664,-48.43699,-48.43699,-48.48664
 6: 153 -5.080018,-5.080018,-5.079819,-5.079819 -48.43699,-48.42480,-48.42480,-48.43699
 7: 154 -5.079819,-5.079819,-5.155606,-5.155606 -48.42480,-47.53891,-47.53891,-48.42480
 8: 155 -5.155606,-5.155606,-4.954156,-4.954156 -47.53891,-47.50354,-47.50354,-47.53891
 9: 156 -4.954156,-4.954156,-3.675864,-3.675864 -47.50354,-45.39022,-45.39022,-47.50354
10: 157 -3.675864,-3.675864,-3.706356,-3.706356 -45.39022,-45.30724,-45.30724,-45.39022
11: 158 -3.706356,-3.706356,-3.705801,-3.705801 -45.30724,-45.30722,-45.30722,-45.30724
> points
    longitude  latitude  price
 1: -47.50308 -4.953936 3.0616
 2: -47.50308 -4.953936 3.2070
 3: -47.50308 -4.953936 3.0630
 4: -47.50308 -4.953936 3.0603
 5: -47.50308 -4.953936 3.0460
 6: -47.50308 -4.953936 2.9900
 7: -49.07035 -5.283658 3.3130
 8: -49.08054 -5.347284 3.3900
 9: -49.08054 -5.347284 3.3620
10: -49.21726 -5.338270 3.3900
11: -49.08050 -5.347255 3.4000
12: -49.08042 -5.347248 3.3220
13: -49.08190 -5.359508 3.3130
14: -49.08046 -5.347277 3.3560

For each polygon, I want to compute the mean price of all points that fall inside it.

Right now I am using sp::point.in.polygon to get the indices of all points that fall inside a given polygon, and then compute their mean price:

w <- lapply(1:nrow(polygons),
            function(tt) {
              ind <- sp::point.in.polygon(points$latitude, points$longitude,
                                          polygons$pol_lat[[tt]], polygons$pol_lng[[tt]]) > 0
              med <- mean(points$price[ind])
              return(med)
            }
)
> unlist(w)
 [1]      NaN 3.361857 3.313000      NaN      NaN      NaN      NaN      NaN 3.071317      NaN      NaN

However, this is, obviously, slow. Any ideas on how to do it faster, maybe using data.table or dplyr (or any other means)?

Data follows

> dput(polygons)
structure(list(id = 148:158, pol_lat = list(c(-4.2502356, -4.2502356, 
-4.2546403, -4.2546403), c(-4.2546403, -4.2546403, -5.3616014, 
-5.3616014), c(-5.3616014, -5.3616014, -5.2122078, -5.2122078
), c(-5.2122078, -5.2122078, -5.0028781, -5.0028781), c(-5.0028781, 
-5.0028781, -5.0800181, -5.0800181), c(-5.0800181, -5.0800181, 
-5.0798186, -5.0798186), c(-5.0798186, -5.0798186, -5.1556063, 
-5.1556063), c(-5.1556063, -5.1556063, -4.9541564, -4.9541564
), c(-4.9541564, -4.9541564, -3.6758637, -3.6758637), c(-3.6758637, 
-3.6758637, -3.706356, -3.706356), c(-3.706356, -3.706356, -3.7058011, 
-3.7058011)), pol_lng = list(c(-49.9462826, -49.9449427, -49.9449427, 
-49.9462826), c(-49.9449427, -49.0790599, -49.0790599, -49.9449427
), c(-49.0790599, -49.0446868, -49.0446868, -49.0790599), c(-49.0446868, 
-48.4866355, -48.4866355, -49.0446868), c(-48.4866355, -48.436988, 
-48.436988, -48.4866355), c(-48.436988, -48.4247989, -48.4247989, 
-48.436988), c(-48.4247989, -47.5389072, -47.5389072, -48.4247989
), c(-47.5389072, -47.5035404, -47.5035404, -47.5389072), c(-47.5035404, 
-45.3902168, -45.3902168, -47.5035404), c(-45.3902168, -45.3072392, 
-45.3072392, -45.3902168), c(-45.3072392, -45.3072216, -45.3072216, 
-45.3072392))), row.names = c(NA, -11L), class = c("data.table", 
"data.frame"))
> dput(points)
structure(list(longitude = c(-47.5030772, -47.5030772, -47.5030772, 
-47.5030772, -47.5030772, -47.5030772, -49.0703469, -49.0805422, 
-49.0805422, -49.217259, -49.0804978, -49.0804181, -49.0818997, 
-49.0804625), latitude = c(-4.9539357, -4.9539357, -4.9539357, 
-4.9539357, -4.9539357, -4.9539357, -5.283658, -5.3472839, -5.3472839, 
-5.3382696, -5.3472551, -5.347248, -5.3595084, -5.3472768), price = c(3.0616, 
3.207, 3.063, 3.0603, 3.046, 2.99, 3.313, 3.39, 3.362, 3.39, 
3.4, 3.322, 3.313, 3.356)), row.names = c(NA, -14L), class = c("data.table", 
"data.frame"))

Upvotes: 1

Views: 935

Answers (1)

lbusett

Reputation: 5932

If your "polygons" are always rectangles, as in the example, a possibility is to use a QuadTree spatial index, as implemented in package SearchTrees, to speed up identifying which points fall in each polygon.

It could give you a rather big speed boost, because the spatial index greatly reduces the number of "comparisons" that have to be made, and the gain grows with the number of points in your dataset.

For example:

library(SearchTrees)
library(magrittr)

# Create a "beefier" test dataset based on your data: 14000 pts 
# over 45000 polygons

for (i in 1:10) points   <- rbind(points, points + runif(length(points)))
for (i in 1:12) polygons <- rbind(polygons, polygons)


# Compute limits of the polygons
min_xs <- lapply(polygons$pol_lng, min) %>% unlist()
max_xs <- lapply(polygons$pol_lng, max) %>% unlist()
min_ys <- lapply(polygons$pol_lat, min) %>% unlist()
max_ys <- lapply(polygons$pol_lat, max) %>% unlist()
xlims <- cbind(min_xs, max_xs)
ylims <- cbind(min_ys, max_ys)

# Create the quadtree on the point coordinates (x = longitude, y = latitude)
tree <- SearchTrees::createTree(cbind(points$longitude, points$latitude))

# extract averages, looping over polygons ----
t1 <- Sys.time()
w <- lapply(1:nrow(polygons),
            function(tt) {
              # indices of the points falling inside the polygon's rectangle
              ind <- SearchTrees::rectLookup(
                tree,
                xlims = xlims[tt, ],
                ylims = ylims[tt, ])
              mean(points$price[ind])
            })
Sys.time() - t1

Time difference of 2.945789 secs

w1 <- unlist(w)

On my old laptop, this "naive" implementation is more than 10 times faster than your original approach on the test data:

t1 <- Sys.time()
w <- lapply(1:nrow(polygons),
            function(tt) {
              ind <- sp::point.in.polygon(points$latitude, points$longitude,
                                          polygons$pol_lat[[tt]], polygons$pol_lng[[tt]]) > 0
              med <- mean(points$price[ind])
              return(med)
            }
)
Sys.time() - t1
w2 <- unlist(w)

Time difference of 40.36493 secs

The results are the same:

> all.equal(w1, w2)
[1] TRUE

The overall speed improvement will depend on how your points are "clustered" over the spatial extent, and with respect to the polygons.

Note that you could also exploit this approach if your polygons are not rectangular: first extract the points included in the bounding box of each polygon, and then find which of those are actually "inside" the polygon with a more standard point-in-polygon test, as in the sketch below.
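For example, a minimal sketch of that two-step idea could look like this. It reuses the quadtree built above and sp::point.in.polygon for the exact test (w_gen is just an illustrative name, and the snippet is not benchmarked on your full data):

# Two-step lookup for general (non-rectangular) polygons:
# 1) quadtree pre-filter on the polygon's bounding box,
# 2) exact point-in-polygon test only on the candidate points.
w_gen <- lapply(1:nrow(polygons),
                function(tt) {
                  cand <- SearchTrees::rectLookup(
                    tree,
                    xlims = range(polygons$pol_lng[[tt]]),
                    ylims = range(polygons$pol_lat[[tt]]))
                  inside <- sp::point.in.polygon(
                    points$latitude[cand], points$longitude[cand],
                    polygons$pol_lat[[tt]], polygons$pol_lng[[tt]]) > 0
                  mean(points$price[cand][inside])
                })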

Also consider that the task is embarrassingly parallel, so you could easily improve performance further by distributing the loop over the polygons with foreach or parLapply, along the lines of the sketch below.
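For example, a minimal parLapply sketch could look like the following. Since the quadtree object may not serialize cleanly to the workers, it is rebuilt on each worker here, while points, xlims and ylims are the objects created above (again an illustrative, unbenchmarked sketch):

library(parallel)

# Parallel version of the lookup loop: the tree is rebuilt on each worker,
# while the point data and the polygon limits are exported once.
cl <- makeCluster(max(1, detectCores() - 1))
clusterExport(cl, c("points", "xlims", "ylims"))
clusterEvalQ(cl, {
  library(SearchTrees)
  tree <- createTree(cbind(points$longitude, points$latitude))
})
w_par <- parLapply(cl, 1:nrow(polygons), function(tt) {
  ind <- rectLookup(tree, xlims = xlims[tt, ], ylims = ylims[tt, ])
  mean(points$price[ind])
})
stopCluster(cl)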

HTH!

Upvotes: 1
