Reputation: 679
I'm mapping a large number of lat/long coordinate pairs to associated zip codes. The number of records is too large to use an API like Google Maps or geonames because of call limits.
I have a lookup table which contains zip codes and the lat/long centroid of each zip code. You can get the lookup table here:
# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"
# download and unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")
library(readr)
ziplkp <- read_csv("zips/zipcode.csv")
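A quick structural check, since the column names matter below (the boutell CSV is expected to read in with lowercase names such as zip, latitude, and longitude):
# inspect the lookup table; column names should be lowercase
str(ziplkp)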
For each lat/long pair in my data, I'd like to match it to the nearest zip code centroid by finding the minimum absolute difference between that lat/long pair and each centroid in my lookup table.
What is the most efficient way to apply such a "lookup" function row-wise to a large number of records?
Sample Data: A list of lat/long coordinates:
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
print(latlongdata)
dropoff_longitude dropoff_latitude
8807218 -73.98171 40.76056
9760613 -73.99355 40.75635
3175671 -73.97330 40.76265
10878727 -73.98882 40.77750
2025038 -73.93848 40.68469
5345659 -74.01550 40.70988
14474481 -73.95472 40.78337
1650223 -73.95710 40.77666
684883 -73.97130 40.75215
9129975 -73.99794 40.72053
ZipLooker function: finds the nearest zip code centroid to an input coordinate pair (minimum absolute lat + long difference) and returns that zip code.
library(dplyr)
ZipLooker <- function(dropoff_longitude, dropoff_latitude) {
  if (is.na(dropoff_longitude) | is.na(dropoff_latitude)) {
    z <- NA_character_
  } else {
    tryCatch({
      x <- ziplkp
      x$latdiff <- abs(dropoff_latitude - x$latitude)
      x$londiff <- abs(dropoff_longitude - x$longitude)
      x$totdiff <- x$latdiff + x$londiff
      # keep the single row with the smallest combined difference
      z <- head(top_n(x, 1, -totdiff), n = 1)$zip
      return(z)
    }, error = function(e) NA_character_)
  }
}
Apply the ZipLooker function using dplyr's rowwise():
latlongdata %>%
  rowwise() %>%
  mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))
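For reference, a minimal vectorized sketch of the same nearest-centroid lookup that avoids the per-row scan. lookup_all is a hypothetical helper; it assumes the lowercase zip/latitude/longitude columns from the CSV above, and the points-by-centroids matrices mean very large inputs would need chunking:
# vectorized nearest-centroid lookup over all rows at once
lookup_all <- function(lon, lat, lkp) {
  # drop centroids with missing coordinates so the argmin stays well-defined
  lkp <- lkp[!is.na(lkp$latitude) & !is.na(lkp$longitude), ]
  latdiff <- abs(outer(lat, lkp$latitude, "-"))   # n_points x n_zips
  londiff <- abs(outer(lon, lkp$longitude, "-"))
  # per row, the column index with the smallest combined difference
  idx <- max.col(-(latdiff + londiff), ties.method = "first")
  lkp$zip[idx]
}
latlongdata$zipcode <- lookup_all(latlongdata$dropoff_longitude,
                                  latlongdata$dropoff_latitude, ziplkp)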
Upvotes: 0
Views: 3539
Reputation: 679
Comparing the speed of the dplyr rowwise option against hrbrmstr's maptools solution, it looks like dplyr wins out (at least on this small dataset):
# Test lat/long data
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"
# download and unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")
library(readr)
ziplkp <- read_csv("zips/zipcode.csv")
# Method 1: dplyr + ZipLooker function
ZipLooker <- function(dropoff_longitude, dropoff_latitude) {
  if (is.na(dropoff_longitude) | is.na(dropoff_latitude)) {
    z <- NA_character_
  } else {
    tryCatch({
      x <- ziplkp
      x$latdiff <- abs(dropoff_latitude - x$latitude)
      x$londiff <- abs(dropoff_longitude - x$longitude)
      x$totdiff <- x$latdiff + x$londiff
      z <- head(top_n(x, 1, -totdiff), n = 1)$zip
      return(z)
    }, error = function(e) NA_character_)
  }
}
latlongdata %>%
  rowwise() %>%
  mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))
# Method 2: maptools + sp
library(sp)
library(maptools)
# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"
# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")
# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)
# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(ziplkp[ziplkp$state=="NY",]$zip),]
# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))
# figure out where they are (this can take a bit)
dat <- pts %over% ny
# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)
# comparing the two (only the bulkiest parts)
library(microbenchmark)
microbenchmark(
  dat <- pts %over% ny,
  latlongdata %>%
    rowwise() %>%
    mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude)),
  times = 10
)
Output:
Unit: milliseconds
                                                                                          expr       min        lq   median       uq      max neval
                                                                          dat <- pts %over% ny 275.89494 286.38187 297.9254 421.8727 445.7165    10
latlongdata %>% rowwise() %>% mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))  95.18166  97.09873 101.8102 108.8677 122.0515    10
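With only ten points, much of the %over% time is fixed overhead, while ZipLooker scans the full lookup table once per row and so scales linearly with record count. A hypothetical scaling check, replicating the same ten points into a larger batch:
# simulate a larger batch by replicating the ten test points
big <- latlongdata[rep(seq_len(nrow(latlongdata)), 100), ]
big_pts <- SpatialPoints(as.matrix(big[, 1:2]))
microbenchmark(
  big_pts %over% ny,
  big %>% rowwise() %>%
    mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude)),
  times = 3
)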
Upvotes: 0
Reputation: 78792
Here's a complete solution for you:
library(sp)
library(maptools)
library(zipcode)
# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"
# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")
# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)
# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(zipcode[zipcode$state=="NY",]$zip),]
# your points
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))
# figure out where they are (this can take a bit)
dat <- pts %over% ny
# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)
# boom
dat
## ZCTA5CE10 AFFGEOID10 GEOID10 ALAND10 AWATER10 lon lat
## 8807218 10019 8600000US10019 10019 1780742 0 -73.98171 40.76056
## 9760613 10018 8600000US10018 10018 836253 0 -73.99355 40.75635
## 3175671 10022 8600000US10022 10022 1107169 0 -73.97330 40.76265
## 10878727 10069 8600000US10069 10069 249044 0 -73.98882 40.77750
## 2025038 11221 8600000US11221 11221 3582803 0 -73.93848 40.68469
## 5345659 10280 8600000US10280 10280 300652 38759 -74.01550 40.70988
## 14474481 10128 8600000US10128 10128 1206195 0 -73.95472 40.78337
## 1650223 10028 8600000US10028 10028 811363 0 -73.95710 40.77666
## 684883 10017 8600000US10017 10017 820953 0 -73.97130 40.75215
## 9129975 10013 8600000US10013 10013 1425085 0 -73.99794 40.72053
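To keep just the matched zip code as a character column (points that fall outside every NY polygon come back from %over% as NA rows):
# ZCTA5CE10 holds the matched zip; NA means no containing polygon
latlongdata$zipcode <- as.character(dat$ZCTA5CE10)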
Upvotes: 2
Reputation: 34703
I use something along these lines to map lon/lat points to their containing polygons:
library(maptools)  # also attaches sp, which provides %over%
points.file <- readShapePoints("path.to.pts.shp")
poly.file <- readShapePoly("path.to.poly.shp")
points.file %over% poly.file
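One assumption worth making explicit: %over% is only meaningful when both layers share the same CRS, and readShapePoints()/readShapePoly() do not read the .prj file. A sketch tagging both layers, assuming they are in WGS84 lon/lat:
# readShape* leaves the CRS unset, so assign it explicitly
proj4string(points.file) <- CRS("+proj=longlat +datum=WGS84")
proj4string(poly.file) <- CRS("+proj=longlat +datum=WGS84")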
Upvotes: 0