Reputation: 856
I have address data and shapefiles with polygons, and am trying to determine the closest distance (in miles) of each address from each polygon, then create a nested dict containing all the info, with this format:
nested_dict = {poly_1: {address1: distance, address2: distance},
               poly_2: {address1: distance, address2: distance},
               ...}
The full, applicable code I'm using is:
import pandas as pd
from shapely.geometry import mapping, Polygon, LinearRing, Point
import geopandas as gpd
from math import radians, cos, sin, asin, sqrt
# Placeholder mappings: one key per address string and one key per polygon name.
# The empty-list values are never appended to; the loop further down assigns
# straight over them.
address_dict = {k: [] for k in addresses_geo.input_string}
sludge_dtc = {k: [] for k in sf_geo.unique_name}
def haversine(lon1, lat1, lon2, lat2, radius=3956):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters:
        lon1, lat1: longitude and latitude of the first point, in degrees.
        lon2, lat2: longitude and latitude of the second point, in degrees.
        radius: sphere radius; the result is expressed in the same unit.
            Defaults to 3956 (miles). Use 6371 for kilometers.

    Returns:
        The great circle distance as a float, in the unit of ``radius``.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError; clamp to its domain.
    c = 2 * asin(sqrt(min(1.0, a)))
    return c * radius
# Here's the key loop that isn't working correctly
# Outer loop: one pass per polygon. Inner loop: one pass per address.
# NOTE(review): i and pt come from .index (labels) but are used with .iloc
# (positions); this only lines up if both indexes are 0..n-1 -- confirm.
for unique_name, i in zip(sf_geo.unique_name, sf_geo.index):
    for address, pt in zip(addresses_geo.input_string, addresses_geo.index):
        # Rebuild the polygon's exterior boundary as a ring.
        pol_ext = LinearRing(sf_geo.iloc[i].geometry.exterior.coords)
        # Distance along the ring to the spot nearest the address point...
        d = pol_ext.project(addresses_geo.iloc[pt].geometry)
        # ...and the point on the ring at that distance.
        p = pol_ext.interpolate(d)
        closest_point_coords = list(p.coords)[0]
        # print(closest_point_coords)
        # Address point (x, y) against the nearest boundary point.
        dist = haversine(addresses_geo.iloc[pt].geometry.x,
                         addresses_geo.iloc[pt].geometry.y,
                         closest_point_coords[0], closest_point_coords[1])
        address_dict[address] = dist
    # NOTE: address_dict is the one dict created before the loop, so every
    # polygon key is bound to that same object.
    sludge_dtc[unique_name] = address_dict
# Test results on a single address
# Outer keys become columns and inner keys become rows; take the row at
# position 1 and transpose it for printing.
addresses_with_sludge_distance = pd.DataFrame(sludge_dtc)
print(addresses_with_sludge_distance.iloc[[1]].T)
If I break this code out and try and calculate the distances for a single polygon, it seems to work fine. However, when I create the DataFrame and check an address, it lists the same distance for every single polygon.
So, inner-dict-key '123 Main Street' will have 5.25 miles for each of the polygon keys in the outer dict, and '456 South Street' will have 6.13 miles for each of the polygon keys in the outer dict. (Made up examples.)
I realize I must be doing something dumb in the way I have the for loops set up, but I can't figure it out. I've reversed the order of the for statements, messed with indents-- all the same result.
To make it clear, what I want to happen is:
Any ideas what I'm missing?
Upvotes: 1
Views: 164
Reputation: 1971
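The problem is simple: you are always using the same address_dict
instance. Every polygon key in sludge_dtc ends up pointing to that one dict, so each address shows whatever distance was written last, i.e. the distance to the last polygon.
You just need to create a new address_dict at the start of each iteration of the outer (polygon) loop.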
import pandas as pd
from shapely.geometry import mapping, Polygon, LinearRing, Point
import geopandas as gpd
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2, radius=3956):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters:
        lon1, lat1: longitude and latitude of the first point, in degrees.
        lon2, lat2: longitude and latitude of the second point, in degrees.
        radius: sphere radius; the result is expressed in the same unit.
            Defaults to 3956 (miles). Use 6371 for kilometers.

    Returns:
        The great circle distance as a float, in the unit of ``radius``.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make asin() raise ValueError; clamp to its domain.
    c = 2 * asin(sqrt(min(1.0, a)))
    return c * radius
# Corrected loop: a brand-new inner dict is built for every polygon, so the
# outer dict no longer maps every polygon name to one shared object.
sludge_dtc = {}
# Iterate the geometries themselves instead of feeding .index labels to the
# positional .iloc, which only works when the index happens to be 0..n-1.
for unique_name, polygon in zip(sf_geo.unique_name, sf_geo.geometry):
    # The ring depends only on the polygon, so build it once per polygon
    # rather than once per address.
    pol_ext = LinearRing(polygon.exterior.coords)
    address_dict = {}
    for address, address_pt in zip(addresses_geo.input_string,
                                   addresses_geo.geometry):
        # Nearest point on the polygon boundary: project() gives the distance
        # along the ring, interpolate() turns it back into a point.
        # NOTE(review): this search is done in planar lon/lat coordinates,
        # so it may differ slightly from the true great-circle nearest point.
        p = pol_ext.interpolate(pol_ext.project(address_pt))
        address_dict[address] = haversine(address_pt.x, address_pt.y, p.x, p.y)
    sludge_dtc[unique_name] = address_dict

# Test results on a single address
# Outer keys become columns and inner keys become rows; take the row at
# position 1 and transpose it for printing.
addresses_with_sludge_distance = pd.DataFrame(sludge_dtc)
print(addresses_with_sludge_distance.iloc[[1]].T)
Another consideration:
You are creating dictionaries whose values are empty lists, but afterwards you assign values directly, so the empty lists are simply replaced. If you need to collect a list of values, you should append
values to the existing list, e.g.:
address_dict[address].append(dist)
and
sludge_dtc[unique_name].append(address_dict)
Upvotes: 1