Reputation: 153
I have a dataframe like following (cannot show from original due to NDA):
# Sample coordinates (the real data cannot be shown due to an NDA).
points = [(-57.213878612138828, 17.916958304169601),
          (76.392039480378514, 0.060882542482108504),
          (0.12417670682730897, 1.0417670682730924),
          (-64.840321976787706, 21.374279296143762),
          (-48.966302937359913, 81.336323778066188),
          (11.122014925372399, 85.001119402984656),
          (8.6383049769438465, 84.874829066623917),
          (-57.349835526315836, 16.683634868421084),
          (83.051530302006697, 97.450469562867383),
          (8.5405200433369473, 83.566955579631625),
          (81.620435769843965, 48.106831247886376),
          (78.713027357450656, 19.547209139192304),
          (82.926153287322933, 81.026080639302577)]
# Split the (lat, lon) tuples into separate coordinate lists.
x = [p[0] for p in points]
y = [p[1] for p in points]
# A list of 2-tuples can feed the DataFrame directly; this is equivalent
# to pd.DataFrame(list(zip(x, y)), columns=['lat', 'lon']).
df = pd.DataFrame(points, columns=['lat', 'lon'])
df
I want to cluster/group all points whose pairwise distance is <= 400, and afterwards I want to calculate the mean of each group -- PLEASE NO CLUSTERING LIBRARY - ONLY HAND-WRITTEN CODE:
What I have done so far: first I wrote the haversine function:
from math import radians, cos, sin, asin, sqrt
from scipy.spatial.distance import pdist, squareform
import pandas as pd
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters are longitude and latitude of the first point, then
    longitude and latitude of the second point, all in decimal degrees.
    Returns the distance in kilometers.
    """
    # BUG FIX: the original did `lat1, lon1 = lon1, lat1` (and likewise for
    # point 2) before converting, which fed longitudes into the latitude
    # terms of the formula and produced wrong distances. The arguments are
    # now used exactly as named.
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r
Then I wrote a function to combine all the GPS points into pairs - without repetitions - is that a MISTAKE?
def pairs(number):
    """
    Return all unordered index pairs [i, j] with 0 <= i < j < number.

    Equivalent to itertools.combinations(range(number), 2), but returns
    lists to keep the original interface.
    """
    # Start the inner range at i + 1 instead of scanning the full range and
    # filtering with `if j >= i + 1`: same output, roughly half the work.
    return [[i, j] for i in range(number) for j in range(i + 1, number)]
# Enumerate every unique index pair once.
# NOTE(review): 12 is hard-coded although the dataframe above holds 13
# points, so the last point would never be paired here; the code further
# down recomputes pair_list from len(df['lat']) — confirm which is intended.
pair_list = pairs(12)
print(pair_list)
!!!HELP NEEDED HERE!!! In the following code I tried to implement the grouping using a dictionary, but it didn't work out:
# --- Attempted dictionary-based clustering (the part the question asks about) ---
# Sort so nearby points get adjacent positional indices, then re-index.
df = df.sort_values(['lat', 'lon'])
df = df.reset_index(drop=True)
# max_index ends up as the last positional index; the loop is an O(n) way
# of computing what len(df) - 1 already gives.
max_index = 0
for index, row in df.iterrows():
    max_index = index
# All unique index pairs over the actual number of rows.
pair_list = pairs(len(list(df['lat'])))
cluster = -1
cluster_dict = {}
for i, j in pair_list:
    #print(i, j)
    index = i
    #stat_nr = df.loc[i]['vwbhfnrprz']
    lat = df.loc[i]['lat']
    lon = df.loc[i]['lon']
    index_next = j
    #stat_nr_next = df.loc[j]['vwbhfnrprz']
    lat_next = df.loc[j]['lat']
    lon_next = df.loc[j]['lon']
    # Distance in km between point i and point j.
    distance = haversine(lon,lat, lon_next, lat_next)
    if distance <= 400:
        # NOTE(review): a fresh cluster id is created for EVERY close pair,
        # so points that belong together get spread over several ids — this
        # looks like the root cause of the grouping going wrong.
        cluster += 1
        print('cluster ' + str(cluster))
        if len(cluster_dict) == 0:
            # First close pair seeds the dictionary.
            cluster_dict[cluster]=[i,j]
        elif len(cluster_dict) != 0:
            # Iterate a copy because the dict is mutated inside the loop.
            for k in cluster_dict.copy():
                print(cluster_dict)
                print('k '+str(k))
                print(i,j)
                if i not in cluster_dict[k]:
                    # NOTE(review): this stores [i] under the new cluster id
                    # and silently drops j — presumably unintended; confirm.
                    cluster_dict[cluster] = [i]
                    print(cluster_dict)
                    #cluster_dict[k].append(i)
                if j not in cluster_dict[k]:
                    #print(cluster_dict[k])
                    break
                if j in cluster_dict[k]:
                    # Both branches break, so only the first key k is ever
                    # inspected per pair.
                    break
        print(cluster, index, index_next, lat, lon, lat_next, lon_next, distance)
The idea behind this was to collect each cluster's point indices in the dictionary cluster_dict, e.g. starting from a close pair like (5, 6).
Thanks!
Upvotes: 0
Views: 192
Reputation: 369
I took a different approach on some steps by utilizing pandas functionality, but that should do it.
First, from your provided data set, I created a helper dataframe that gives me a point id and the point as a list.
# Helper frame: a sequential id per row plus the coordinates as a
# [lat, lon] list in a single 'point' column.
points = df.assign(
    id=np.arange(len(df)),
    point=df[['lat', 'lon']].values.tolist(),
)
points
looking like this:
lat | lon | id | point |
---|---|---|---|
-57.213879 | 17.916958 | 0 | [-57.21387861213883, 17.9169583041696] |
76.392039 | 0.060883 | 1 | [76.39203948037851, 0.060882542482108504] |
0.124177 | 1.041767 | 2 | [0.12417670682730897, 1.0417670682730924] |
-64.840322 | 21.374279 | 3 | [-64.8403219767877, 21.37427929614376] |
-48.966303 | 81.336324 | 4 | [-48.96630293735991, 81.33632377806619] |
Then the following steps are applied:
distances = (
    # create all from/to id combinations (cartesian product of the ids)
    pd.DataFrame(index =
        pd.MultiIndex
        .from_product(
            [points.id.to_list(), points.id.to_list()],
            names = ["from_id", "to_id"]
        )
    )
    .reset_index()
    # reduce to unique pairs (if you imagine this as a from-to matrix, it
    # takes the upper half in a very simplified way)
    # NOTE(review): the strict `<` drops the self-pairs (i, i) — this
    # contradicts the stated intent of keeping them for single clusters;
    # confirm whether `<=` was meant.
    .query("from_id < to_id")
    # add the coordinates from the helper table (please be aware of the
    # required renaming. This creates extra id columns you may not want in
    # the final output; you would have to .drop() them)
    .merge(
        points.filter(["id", "point"]).rename(columns={"point" : "from_point"}),
        how="left",
        left_on = "from_id",
        right_on = "id"
    )
    .merge(
        points.filter(["id", "point"]).rename(columns={"point" : "to_point"}),
        how="left",
        left_on = "to_id",
        right_on = "id"
    )
    # calculate the haversine distance, then group and aggregate.
    # (I find it confusing that your points are (lat, lon) but your
    # function takes (lon, lat) — better check this for errors!)
    .assign(
        # point lists are [lat, lon], so index 1 is lon and index 0 is lat
        haversine = lambda x : x.apply(lambda x :
            haversine(x.from_point[1], x.from_point[0], x.to_point[1], x.to_point[0])
            , axis = 1),
        # !!!! important change to original: bucket each pair by threshold
        distance_group = lambda x : (x.haversine <= 400).map({True : "< 400", False : ">= 400"}).astype('category'),
        # this avoids points showing up in smaller clusters
        # NOTE(review): this column is computed but never used by the
        # groupby/agg below — confirm whether a filter on it is missing.
        already_clusterd = lambda x : x.apply(lambda y :
            y.from_id in x.query("distance_group == '< 400' and from_id != to_id").to_id.to_list()
            , axis = 1)
    )
    # one row per (from_id, distance bucket) with the matched ids as a list
    .groupby(["from_id", "distance_group"])
    .agg(cluster = ("to_id", lambda x : x.to_list()))
    .reset_index()
)
print(distances)
This gives us the following table (example)
from_id | distance_group | cluster | |
---|---|---|---|
0 | 0 | < 400 | [0, 7] |
1 | 0 | >= 400 | [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12] |
2 | 1 | < 400 | [1] |
3 | 1 | >= 400 | [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] |
4 | 2 | < 400 | [2] |
5 | 2 | >= 400 | [3, 4, 5, 6, 7, 8, 9, 10, 11, 12] |
6 | 3 | < 400 | [3] |
So, from here on, I am somewhat confused about your intended result, but this would give us the closest to what I think you want:
# Keep only the "< 400" groups, then drop the single-member clusters.
(distances
 .query("distance_group == '< 400'")
 # remove the next line to also get the single-length clusters
 .loc[lambda d: d.cluster.map(len) > 1]
).cluster.to_list()
without the line commented the output:
[[0, 7], [5, 6, 9]]
if you comment them out, the output is
[[0, 7], [1], [2], [3], [4], [5, 6, 9], [8], [10], [11], [12]]
Upvotes: 1
Reputation: 153
I used a nested list to do the clustering, but it is inefficient on large datasets:
# --- Nested-list clustering (works, but O(n^2) pairs + O(n) list lookups) ---
# Sort so nearby points get adjacent positional indices, then re-index.
df = df.sort_values(['lat', 'lon'])
df = df.reset_index(drop=True)
# max_index ends up as the last positional index; len(df) - 1 would do.
max_index = 0
for index, row in df.iterrows():
    max_index = index
pair_list = pairs(len(list(df['lat'])))
cluster = -1
# num_exists_list: flat list of all indices already placed in some cluster
# (membership tests on it are O(n); a set would be faster on large data).
num_exists_list = []
# cluster_list: list of clusters, each cluster a list of point indices.
cluster_list = []
for i, j in pair_list:
    #print(i, j)
    index = i
    #stat_nr = df.loc[i]['vwbhfnrprz']
    lat = df.loc[i]['lat']
    lon = df.loc[i]['lon']
    index_next = j
    #stat_nr_next = df.loc[j]['vwbhfnrprz']
    lat_next = df.loc[j]['lat']
    lon_next = df.loc[j]['lon']
    # Distance in km between point i and point j.
    distance = haversine(lon,lat, lon_next, lat_next)
    if distance <= 400:
        if len(cluster_list)==0:
            # First close pair seeds the first cluster.
            num_exists_list.append(i)
            num_exists_list.append(j)
            cluster_list.append([i,j])
            #print('first - ' + str([i,j]))
        else:
            if i not in num_exists_list and j not in num_exists_list:
                # Neither point seen yet: open a new cluster.
                num_exists_list.append(i)
                num_exists_list.append(j)
                cluster_list.append([i,j])
                #print('not exists - ' + str([i,j]))
            elif i in num_exists_list and j not in num_exists_list:
                # i already clustered: attach j to i's cluster.
                # NOTE(review): find_index is not defined in this snippet —
                # presumably it returns the position of the sublist in
                # cluster_list containing the value; confirm its definition.
                index = find_index(cluster_list,i)
                cluster_list[index].append(j)
                num_exists_list.append(j)
                #print('exists - ' + str(i) + ' - missing ' + str(j))
            elif i not in num_exists_list and j in num_exists_list:
                # j already clustered: attach i to j's cluster.
                index = find_index(cluster_list,j)
                cluster_list[index].append(i)
                num_exists_list.append(i)
                #print('exists - ' + str(i) + ' - missing ' + str(j))
print('--------------------')
#print(num_exists_list)
print(cluster_list)
Upvotes: 0