Convert dataframe to numpy array

Question

I am somewhat new to Python and I am stuck with this issue. I have the following DataFrame:

import pandas as pd

data = {'id':  ['A', 'A', 'A','B', 'B','B', 'C', 'C'],
        'city':  ['NaN', 'NaN', 'Paris','NaN', 'Berlin','London', 'NaN', 'Rome'],
        'year': [2000, 2001, 2002, 2000, 2001,2001,2000,2001],
        'x': [0,0,1,0,1,1,0,1]}

df = pd.DataFrame(data)

For each year in the DataFrame, I need a matrix where columns and rows are the cities i, and the (i,i) elements are either 0 or 1 according to the value of x. Please note whether an id is located in two cities i and j, (i,j) and (j,i) equal 1 - and NOT (i,i) and (j,j)

The desired output:

year=2000:

year=2001:

year=2002:

JANO · Accepted Answer

Not sure if it is the best solution, but it seems to work fine. The idea is to first find all city/city combinations of your matrix and then check if x should be zero or one.

import itertools
import pandas as pd

# Create your dataframe
data = {'id':  ['A', 'A', 'A','B', 'B','B', 'C', 'C'],
        'city':  ['NaN', 'NaN', 'Paris','NaN', 'Berlin','London', 'NaN', 'Rome'],
        'year': [2000, 2001, 2002, 2000, 2001,2001,2000,2001],
        'x': [0,0,1,0,1,1,0,1]}
df = pd.DataFrame(data)


# Helper function to determine x value in final matrix
def check_if_one_or_zero(df_x, city_a, city_b):
    a = df_x[df_x['city'] == city_a]
    b = df_x[df_x['city'] == city_b]
    
    # if one city has no entry with x==1 => x for both is always zero
    if a.empty or b.empty:
        return 0
    
    a_id = a['id'].iloc[0]
    b_id = b['id'].iloc[0]
    
    # Number of entries with same id
    group_size = len(df_x[df_x['id'] == a_id])
    
    # if city A is city B and it is only city with this label => x is 1
    if group_size == 1 and city_a == city_b:
        return 1
    
    # if city A and B are distinct and have same id => x is 1
    if group_size > 1 and a_id == b_id and city_a != city_b:
        return 1

    return 0


def create_matrix(df, year):
    # Create a data frame where we include every combination of two cities
    all_combinations = list(itertools.product(df[df['city'] != 'NaN']['city'], df[df['city']!='NaN']['city']))
    df_combinations = pd.DataFrame(all_combinations, columns=['City A','City B'])

    # Create helper df for comparison which only has entries with x==1 and correct year
    df_x = df[(df['year']==year) & (df['city']!='NaN') & (df['x']==1)]

    # Set x for each city / city combination with the helper function
    df_combinations['x'] = df_combinations.apply(lambda x: check_if_one_or_zero(df_x, x['City A'], x['City B']), axis=1)

    # Use pivot to create final matrix
    return df_combinations.pivot(index='City A', columns='City B', values='x')


print(create_matrix(df, 2000))
print(create_matrix(df, 2001))
print(create_matrix(df, 2002))

Output year==2000:

City B  Berlin  London  Paris  Rome
City A                             
Berlin       0       0      0     0
London       0       0      0     0
Paris        0       0      0     0
Rome         0       0      0     0

Output year==2001:

City B  Berlin  London  Paris  Rome
City A                             
Berlin       0       1      0     0
London       1       0      0     0
Paris        0       0      0     0
Rome         0       0      0     1

Output year==2002:

City B  Berlin  London  Paris  Rome
City A                             
Berlin       0       0      0     0
London       0       0      0     0
Paris        0       0      1     0
Rome         0       0      0     0

Convert dataframe to numpy array

Answers (1)

Related Questions