Reputation: 153
I have a dataframe like following (cannot show from original due to NDA):
# Sample coordinates (the real data cannot be shown due to an NDA).
points = [(-57.213878612138828, 17.916958304169601),
          (76.392039480378514, 0.060882542482108504),
          (0.12417670682730897, 1.0417670682730924),
          (-64.840321976787706, 21.374279296143762),
          (-48.966302937359913, 81.336323778066188),
          (11.122014925372399, 85.001119402984656),
          (8.6383049769438465, 84.874829066623917),
          (-57.349835526315836, 16.683634868421084),
          (83.051530302006697, 97.450469562867383),
          (8.5405200433369473, 83.566955579631625),
          (81.620435769843965, 48.106831247886376),
          (78.713027357450656, 19.547209139192304),
          (82.926153287322933, 81.026080639302577)]
# Split the (lat, lon) tuples into separate coordinate lists.
x = [p[0] for p in points]
y = [p[1] for p in points]
# A list of 2-tuples can feed the DataFrame directly; this is equivalent
# to pd.DataFrame(list(zip(x, y)), columns=['lat', 'lon']).
df = pd.DataFrame(points, columns=['lat', 'lon'])
df
I want to cluster/group all points whose pairwise distance is <= 400, and afterwards I want to calculate the mean of each group -- PLEASE NO CLUSTERING LIBRARY - ONLY HAND-WRITTEN CODE:
What I have done so far: first I wrote the haversine function:
from math import radians, cos, sin, asin, sqrt
from scipy.spatial.distance import pdist, squareform
import pandas as pd
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters are longitude and latitude of the first point, then
    longitude and latitude of the second point, all in decimal degrees.
    Returns the distance in kilometers.
    """
    # BUG FIX: the original did `lat1, lon1 = lon1, lat1` (and likewise for
    # point 2) before converting, which fed longitudes into the latitude
    # terms of the formula and produced wrong distances. The arguments are
    # now used exactly as named.
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r
Then I wrote a function to combine all the GPS points into pairs - without repetitions - is that a MISTAKE?
def pairs(number):
    """
    Return all unordered index pairs [i, j] with 0 <= i < j < number.

    Equivalent to itertools.combinations(range(number), 2), but returns
    lists to keep the original interface.
    """
    # Start the inner range at i + 1 instead of scanning the full range and
    # filtering with `if j >= i + 1`: same output, roughly half the work.
    return [[i, j] for i in range(number) for j in range(i + 1, number)]
# Enumerate every unique index pair once.
# NOTE(review): 12 is hard-coded although the dataframe above holds 13
# points, so the last point would never be paired here; the code further
# down recomputes pair_list from len(df['lat']) — confirm which is intended.
pair_list = pairs(12)
print(pair_list)
!!!HELP NEEDED HERE!!! In the following code I tried to implement the grouping using a dictionary, but it didn't work out:
# --- Attempted dictionary-based clustering (the part the question asks about) ---
# Sort so nearby points get adjacent positional indices, then re-index.
df = df.sort_values(['lat', 'lon'])
df = df.reset_index(drop=True)
# max_index ends up as the last positional index; the loop is an O(n) way
# of computing what len(df) - 1 already gives.
max_index = 0
for index, row in df.iterrows():
    max_index = index
# All unique index pairs over the actual number of rows.
pair_list = pairs(len(list(df['lat'])))
cluster = -1
cluster_dict = {}
for i, j in pair_list:
    #print(i, j)
    index = i
    #stat_nr = df.loc[i]['vwbhfnrprz']
    lat = df.loc[i]['lat']
    lon = df.loc[i]['lon']
    index_next = j
    #stat_nr_next = df.loc[j]['vwbhfnrprz']
    lat_next = df.loc[j]['lat']
    lon_next = df.loc[j]['lon']
    # Distance in km between point i and point j.
    distance = haversine(lon,lat, lon_next, lat_next)
    if distance <= 400:
        # NOTE(review): a fresh cluster id is created for EVERY close pair,
        # so points that belong together get spread over several ids — this
        # looks like the root cause of the grouping going wrong.
        cluster += 1
        print('cluster ' + str(cluster))
        if len(cluster_dict) == 0:
            # First close pair seeds the dictionary.
            cluster_dict[cluster]=[i,j]
        elif len(cluster_dict) != 0:
            # Iterate a copy because the dict is mutated inside the loop.
            for k in cluster_dict.copy():
                print(cluster_dict)
                print('k '+str(k))
                print(i,j)
                if i not in cluster_dict[k]:
                    # NOTE(review): this stores [i] under the new cluster id
                    # and silently drops j — presumably unintended; confirm.
                    cluster_dict[cluster] = [i]
                    print(cluster_dict)
                    #cluster_dict[k].append(i)
                if j not in cluster_dict[k]:
                    #print(cluster_dict[k])
                    break
                if j in cluster_dict[k]:
                    # Both branches break, so only the first key k is ever
                    # inspected per pair.
                    break
        print(cluster, index, index_next, lat, lon, lat_next, lon_next, distance)
The idea behind this was to collect each cluster's point indices in the dictionary cluster_dict, e.g. starting from a close pair like (5, 6).
Thanks!
Upvotes: 0
Views: 192
Reputation: 369
I took a different approach on some steps by utilizing pandas functionality, but that should do it.
First, from your provided data set, I created a helper dataframe that gives me a point id and the point as a list.
# Helper frame: a sequential id per row plus the coordinates as a
# [lat, lon] list in a single 'point' column.
points = df.assign(
    id=np.arange(len(df)),
    point=df[['lat', 'lon']].values.tolist(),
)
points
looking like this:
lat | lon | id | point |
---|---|---|---|
-57.213879 | 17.916958 | 0 | [-57.21387861213883, 17.9169583041696] |
76.392039 | 0.060883 | 1 | [76.39203948037851, 0.060882542482108504] |
0.124177 | 1.041767 | 2 | [0.12417670682730897, 1.0417670682730924] |
-64.840322 | 21.374279 | 3 | [-64.8403219767877, 21.37427929614376] |
-48.966303 | 81.336324 | 4 | [-48.96630293735991, 81.33632377806619] |
Then the following steps are applied:
distances = (
    # create all from/to id combinations (cartesian product of the ids)
    pd.DataFrame(index =
        pd.MultiIndex
        .from_product(
            [points.id.to_list(), points.id.to_list()],
            names = ["from_id", "to_id"]
        )
    )
    .reset_index()
    # reduce to unique pairs (if you imagine this as a from-to matrix, it
    # takes the upper half in a very simplified way)
    # NOTE(review): the strict `<` drops the self-pairs (i, i) — this
    # contradicts the stated intent of keeping them for single clusters;
    # confirm whether `<=` was meant.
    .query("from_id < to_id")
    # add the coordinates from the helper table (please be aware of the
    # required renaming. This creates extra id columns you may not want in
    # the final output; you would have to .drop() them)
    .merge(
        points.filter(["id", "point"]).rename(columns={"point" : "from_point"}),
        how="left",
        left_on = "from_id",
        right_on = "id"
    )
    .merge(
        points.filter(["id", "point"]).rename(columns={"point" : "to_point"}),
        how="left",
        left_on = "to_id",
        right_on = "id"
    )
    # calculate the haversine distance, then group and aggregate.
    # (I find it confusing that your points are (lat, lon) but your
    # function takes (lon, lat) — better check this for errors!)
    .assign(
        # point lists are [lat, lon], so index 1 is lon and index 0 is lat
        haversine = lambda x : x.apply(lambda x :
            haversine(x.from_point[1], x.from_point[0], x.to_point[1], x.to_point[0])
            , axis = 1),
        # !!!! important change to original: bucket each pair by threshold
        distance_group = lambda x : (x.haversine <= 400).map({True : "< 400", False : ">= 400"}).astype('category'),
        # this avoids points showing up in smaller clusters
        # NOTE(review): this column is computed but never used by the
        # groupby/agg below — confirm whether a filter on it is missing.
        already_clusterd = lambda x : x.apply(lambda y :
            y.from_id in x.query("distance_group == '< 400' and from_id != to_id").to_id.to_list()
            , axis = 1)
    )
    # one row per (from_id, distance bucket) with the matched ids as a list
    .groupby(["from_id", "distance_group"])
    .agg(cluster = ("to_id", lambda x : x.to_list()))
    .reset_index()
)
print(distances)
This gives us the following table (example)
from_id | distance_group | cluster | |
---|---|---|---|
0 | 0 | < 400 | [0, 7] |
1 | 0 | >= 400 | [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12] |
2 | 1 | < 400 | [1] |
3 | 1 | >= 400 | [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] |
4 | 2 | < 400 | [2] |
5 | 2 | >= 400 | [3, 4, 5, 6, 7, 8, 9, 10, 11, 12] |
6 | 3 | < 400 | [3] |
So, from here on, I am somewhat confused about your intended result, but this would give us the closest to what I think you want:
# Keep only the "< 400" groups, then drop the single-member clusters.
(distances
 .query("distance_group == '< 400'")
 # remove the next line to also get the single-length clusters
 .loc[lambda d: d.cluster.map(len) > 1]
).cluster.to_list()
without the line commented the output:
[[0, 7], [5, 6, 9]]
if you comment them out, the output is
[[0, 7], [1], [2], [3], [4], [5, 6, 9], [8], [10], [11], [12]]
Upvotes: 1
Reputation: 153
I used a nested list to do the clustering, but it is inefficient on large datasets:
# --- Nested-list clustering (works, but O(n^2) pairs + O(n) list lookups) ---
# Sort so nearby points get adjacent positional indices, then re-index.
df = df.sort_values(['lat', 'lon'])
df = df.reset_index(drop=True)
# max_index ends up as the last positional index; len(df) - 1 would do.
max_index = 0
for index, row in df.iterrows():
    max_index = index
pair_list = pairs(len(list(df['lat'])))
cluster = -1
# num_exists_list: flat list of all indices already placed in some cluster
# (membership tests on it are O(n); a set would be faster on large data).
num_exists_list = []
# cluster_list: list of clusters, each cluster a list of point indices.
cluster_list = []
for i, j in pair_list:
    #print(i, j)
    index = i
    #stat_nr = df.loc[i]['vwbhfnrprz']
    lat = df.loc[i]['lat']
    lon = df.loc[i]['lon']
    index_next = j
    #stat_nr_next = df.loc[j]['vwbhfnrprz']
    lat_next = df.loc[j]['lat']
    lon_next = df.loc[j]['lon']
    # Distance in km between point i and point j.
    distance = haversine(lon,lat, lon_next, lat_next)
    if distance <= 400:
        if len(cluster_list)==0:
            # First close pair seeds the first cluster.
            num_exists_list.append(i)
            num_exists_list.append(j)
            cluster_list.append([i,j])
            #print('first - ' + str([i,j]))
        else:
            if i not in num_exists_list and j not in num_exists_list:
                # Neither point seen yet: open a new cluster.
                num_exists_list.append(i)
                num_exists_list.append(j)
                cluster_list.append([i,j])
                #print('not exists - ' + str([i,j]))
            elif i in num_exists_list and j not in num_exists_list:
                # i already clustered: attach j to i's cluster.
                # NOTE(review): find_index is not defined in this snippet —
                # presumably it returns the position of the sublist in
                # cluster_list containing the value; confirm its definition.
                index = find_index(cluster_list,i)
                cluster_list[index].append(j)
                num_exists_list.append(j)
                #print('exists - ' + str(i) + ' - missing ' + str(j))
            elif i not in num_exists_list and j in num_exists_list:
                # j already clustered: attach i to j's cluster.
                index = find_index(cluster_list,j)
                cluster_list[index].append(i)
                num_exists_list.append(i)
                #print('exists - ' + str(i) + ' - missing ' + str(j))
print('--------------------')
#print(num_exists_list)
print(cluster_list)
Upvotes: 0