jonboy
jonboy

Reputation: 368

Return distance to nearest point by group - python

I'm hoping to calculate the distance to the nearest point grouped by specific items. Specifically, using below, calculate_distances measures the distances between each specific id and the remaining points. I'm hoping to return the distance to the nearest point but for each item in Group. So the nearest distance to Red and the nearest distance to Grn.

Note: I have 3 unique items in each Group. I'm hoping to handle multiple unique items that contain varying labels.

import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Sample data: six labelled points (three 'Red', three 'Grn') observed at two
# time steps.  The repeating columns are spelled as repetitions so the layout
# (6 rows per time step, 3 rows per colour) is visible at a glance.
df = pd.DataFrame({
    'Time': [1] * 6 + [2] * 6,
    'ID': list('ABCXUV') * 2,
    'Group': (['Red'] * 3 + ['Grn'] * 3) * 2,
    'X': [2.0, 3.0, 4.0, 2.0, 2.0, 1.0, 1.0, 6.0, 4.0, 2.0, 5.0, 3.0],
    'Y': [3.0, 1.0, 0.0, 0.0, 2.0, 1.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0],
})

def calculate_distances(df):
    """Return the square matrix of pairwise distances between the rows of *df*.

    The 'X' and 'Y' columns are used as coordinates; both the rows and the
    columns of the returned frame are labelled with the 'ID' column.
    """
    coordinates = df[['X', 'Y']].to_numpy()
    labels = df['ID']

    # pdist yields the condensed (upper-triangle) form; squareform expands it
    # into the full symmetric matrix with zeros on the diagonal.
    pairwise = squareform(pdist(coordinates))

    return pd.DataFrame(pairwise, index=labels, columns=labels)

# Build one pairwise-distance matrix per time step, then turn the row index
# of the combined result back into ordinary columns.
df_distances = (
    df.groupby(['Time'])
    .apply(calculate_distances)
    .reset_index()
)

intended output:

ID  Time ID  dist_Red  dist_Grn                                    
0      1  A  2.236068  1.000000  
1      1  B  1.414214  1.414214  
2      1  C  1.414214  2.000000  
3      1  X  1.414214  2.000000  
4      1  U  1.000000  1.414214  
5      1  V  2.236068  1.414214  
6      2  A  3.162278  1.414214  
7      2  B  2.236068  1.000000  
8      2  C  2.236068  1.414214 
9      2  X  1.414214  1.414214
10     2  U  1.000000  2.000000
11     2  V  1.414214  1.414214

Upvotes: 1

Views: 378

Answers (1)

Willem Hendriks
Willem Hendriks

Reputation: 1487

I could not find a nice, straightforward way, as it seems from your example that you don't want to include the point itself. I ended up making groups and calculating distances within them.

Edit: Added variation which (should) include the ID of the nearest point.

from sklearn.neighbors import BallTree
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

# Sample data: six labelled points (three 'Red', three 'Grn') observed at two
# time steps.  The repeating columns are spelled as repetitions so the layout
# (6 rows per time step, 3 rows per colour) is visible at a glance.
df = pd.DataFrame({
    'Time': [1] * 6 + [2] * 6,
    'ID': list('ABCXUV') * 2,
    'Group': (['Red'] * 3 + ['Grn'] * 3) * 2,
    'X': [2.0, 3.0, 4.0, 2.0, 2.0, 1.0, 1.0, 6.0, 4.0, 2.0, 5.0, 3.0],
    'Y': [3.0, 1.0, 0.0, 0.0, 2.0, 1.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0],
})


def subgroup_distance(df, group_column='Group', include_point_itself=True):
    """Add, for every group, the distance from each row to that group's nearest point.

    For each distinct value ``g`` in ``df[group_column]`` a column named
    ``distance_<g>`` is added.  Coordinates are read from the 'X' and 'Y'
    columns and distances come from a ``BallTree`` built with
    ``metric='minkowski'`` (Euclidean with scikit-learn's default ``p=2``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'X', 'Y' and ``group_column``.  It is not modified.
    group_column : str
        Column whose distinct values define the groups.
    include_point_itself : bool
        If True, a row counts as a candidate neighbour within its own group,
        so its distance to its own group is 0.  If False, a row is skipped
        when searching its own group; a row that is the only member of its
        group then gets NaN for that group, because no other point exists.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one ``distance_<group>`` column per group.
    """
    # Work on a copy: this function is handed to ``groupby(...).apply``, and
    # pandas does not support applied functions that mutate the group.
    df = df.copy()

    all_points = df[['X', 'Y']].to_numpy()
    row_numbers = np.arange(len(df))

    for group in df[group_column].unique():
        in_group = (df[group_column] == group).to_numpy()
        tree = BallTree(all_points[in_group], leaf_size=15, metric='minkowski')

        if include_point_itself:
            distance, _ = tree.query(all_points, k=1)
            distances = distance[:, 0]
        elif in_group.sum() < 2:
            # ``k=2`` would raise here (k exceeds the number of tree points).
            # Rows outside the group still have a valid nearest neighbour;
            # the lone member has no *other* point in its own group.
            distance, _ = tree.query(all_points, k=1)
            distances = np.where(in_group, np.nan, distance[:, 0])
        else:
            # For rows inside the group the closest hit is the row itself at
            # distance 0, so take the second neighbour (column 1); rows
            # outside the group take the first (column 0).
            distance, _ = tree.query(all_points, k=2)
            distances = distance[row_numbers, in_group.astype(int)]

        df["distance_{}".format(group)] = distances

    return df

def calculate_distances(df):
    """Add per-group nearest-distance columns to *df*, skipping each point itself.

    Thin adapter with a single-argument signature so it can be passed to
    ``groupby(...).apply``: it forwards *df* to ``subgroup_distance`` with
    ``include_point_itself=False``.
    """
    return subgroup_distance(df, include_point_itself=False)

# Run the distance calculation separately for every time step and stitch the
# pieces back together.  The groups are iterated explicitly instead of using
# ``groupby(...).apply(...)``: the applied function returns frames that still
# contain the 'Time' column, and on pandas 2.x ``apply`` additionally puts the
# group key into the result's index, so the following ``reset_index()`` fails
# with "cannot insert Time, already exists".  ``pd.concat`` keeps the original
# row labels, so ``reset_index()`` only adds the ``index`` column.
df_distances = pd.concat(
    [calculate_distances(time_slice) for _, time_slice in df.groupby(['Time'])]
).reset_index()

this will output

    index  Time ID Group    X    Y  distance_Red  distance_Grn
0       0     1  A   Red  2.0  3.0      2.236068      1.000000
1       1     1  B   Red  3.0  1.0      1.414214      1.414214
2       2     1  C   Red  4.0  0.0      1.414214      2.000000
3       3     1  X   Grn  2.0  0.0      1.414214      1.414214
4       4     1  U   Grn  2.0  2.0      1.000000      1.414214
5       5     1  V   Grn  1.0  1.0      2.000000      1.414214
6       6     2  A   Red  1.0  2.0      3.162278      1.414214
7       7     2  B   Red  6.0  0.0      2.236068      1.000000
8       8     2  C   Red  4.0  1.0      2.236068      1.414214
9       9     2  X   Grn  2.0  1.0      1.414214      1.414214
10     10     2  U   Grn  5.0  0.0      1.000000      2.000000
11     11     2  V   Grn  3.0  0.0      1.414214      1.414214

Variation which will also output the index label (the DataFrame index, not the ID column) of the nearest point in each subgroup

def subgroup_distance_with_nearest(df, group_column='Group', include_point_itself=True):
    """Add, per group, the distance to and the index label of the nearest point.

    For each distinct value ``g`` in ``df[group_column]`` two columns are
    added: ``distance_<g>`` (distance to the nearest point of that group) and
    ``nearest_index_<g>`` (that point's label in ``df.index``).  Coordinates
    are read from the 'X' and 'Y' columns and neighbours are found with a
    ``BallTree`` built with ``metric='minkowski'`` (Euclidean with
    scikit-learn's default ``p=2``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'X', 'Y' and ``group_column``.  It is not modified.
    group_column : str
        Column whose distinct values define the groups.
    include_point_itself : bool
        If True, a row counts as a candidate neighbour within its own group
        (distance 0, nearest index = its own label).  If False, a row is
        skipped when searching its own group; a row that is the only member
        of its group then gets NaN in both columns for that group.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two extra columns per group.
    """
    # Work on a copy: this function is handed to ``groupby(...).apply``, and
    # pandas does not support applied functions that mutate the group.
    df = df.copy()

    all_points = df[['X', 'Y']].to_numpy()
    row_numbers = np.arange(len(df))

    for group in df[group_column].unique():
        in_group = (df[group_column] == group).to_numpy()
        # Tree positions map back to frame labels through this index.
        group_labels = df.index[in_group]
        tree = BallTree(all_points[in_group], leaf_size=15, metric='minkowski')

        if include_point_itself:
            distance, index = tree.query(all_points, k=1)
            distances = distance[:, 0]
            nearest_id = group_labels[index[:, 0]]
        elif in_group.sum() < 2:
            # ``k=2`` would raise here (k exceeds the number of tree points).
            # Rows outside the group still have a valid nearest neighbour;
            # the lone member has no *other* point in its own group.
            distance, index = tree.query(all_points, k=1)
            distances = np.where(in_group, np.nan, distance[:, 0])
            nearest_id = (
                pd.Series(group_labels[index[:, 0]], index=df.index)
                .mask(in_group)
                .to_numpy()
            )
        else:
            distance, index = tree.query(all_points, k=2)
            # Position of each in-group row inside the tree's point array.
            self_position = np.cumsum(in_group) - 1
            # Skip the first hit only when it really is the row itself.  With
            # duplicated coordinates the first hit can be a different point at
            # distance 0; blindly taking the second hit could then report the
            # row as its own nearest neighbour.
            first_hit_is_self = in_group & (index[:, 0] == self_position)
            column = first_hit_is_self.astype(int)
            distances = distance[row_numbers, column]
            nearest_id = group_labels[index[row_numbers, column]]

        df["distance_{}".format(group)] = distances
        df["nearest_index_{}".format(group)] = nearest_id

    return df

def calculate_distances_with_nearest(df):
    """Add per-group distance and nearest-index columns, skipping each point itself.

    Single-argument adapter around ``subgroup_distance_with_nearest`` with
    ``include_point_itself=False``.  It deliberately has its own name: reusing
    ``subgroup_distance_with_nearest`` for the adapter would replace the real
    implementation defined above it.
    """
    return subgroup_distance_with_nearest(df, include_point_itself=False)

# Process every time step separately and stitch the pieces back together.
# The groups are iterated explicitly instead of using
# ``groupby(...).apply(...)``: the returned frames still contain the 'Time'
# column, and on pandas 2.x ``apply`` additionally puts the group key into the
# result's index, so ``reset_index()`` fails with "cannot insert Time, already
# exists".  ``pd.concat`` keeps the original row labels, so ``reset_index()``
# only adds the ``index`` column.
df_distances = pd.concat(
    [
        calculate_distances_with_nearest(time_slice)
        for _, time_slice in df.groupby(['Time'])
    ]
).reset_index()

and it will output

    index  Time ID Group    X    Y  distance_Red  nearest_index_Red  \
0       0     1  A   Red  2.0  3.0      2.236068                  1   
1       1     1  B   Red  3.0  1.0      1.414214                  2   
2       2     1  C   Red  4.0  0.0      1.414214                  1   
3       3     1  X   Grn  2.0  0.0      1.414214                  1   
4       4     1  U   Grn  2.0  2.0      1.000000                  0   
5       5     1  V   Grn  1.0  1.0      2.000000                  1   
6       6     2  A   Red  1.0  2.0      3.162278                  8   
7       7     2  B   Red  6.0  0.0      2.236068                  8   
8       8     2  C   Red  4.0  1.0      2.236068                  7   
9       9     2  X   Grn  2.0  1.0      1.414214                  6   
10     10     2  U   Grn  5.0  0.0      1.000000                  7   
11     11     2  V   Grn  3.0  0.0      1.414214                  8   

    distance_Grn  nearest_index_Grn  
0       1.000000                  4  
1       1.414214                  4  
2       2.000000                  3  
3       1.414214                  5  
4       1.414214                  5  
5       1.414214                  3  
6       1.414214                  9  
7       1.000000                 10  
8       1.414214                 11  
9       1.414214                 11  
10      2.000000                 11  
11      1.414214                  9  

I didn't recalculate and verify the IDs, but the result seems at least correct in that it does return an ID from the subgroup.

Upvotes: 1

Related Questions