b101
b101

Reputation: 297

pandas loop to run multiple cross tabs

I have the following dataset and I want to compute all possible combinations of cross-tabulations in the most efficient way, I have been able to calculate pairs against one master variable, but not for all possibilities (i have popped what i mean in below). Is there a way to get this all in loop that could handle any number of columns? Thanks so much!

data

import pandas as pd
df1 = pd.DataFrame(data={'id': [1,2,3,4,5,6,7,8,9,10], 
                         'a': [1,1,2,2,2,1,1,2,1,1], 
                         'b': [1,2,3,3,3,2,1,2,3,1], 
                         'c': [1,2,2,1,1,1,1,2,1,2],
                         'd': [1,1,2,2,1,1,1,1,1,2],
                        })

d1={1: 'right', 2: 'left'} 
d2={1: '10', 2: '30', 3: '20'} 
d3={1: 'green', 2: 'red'} 
d4={1: 'yes', 2: 'no'} 

df1['a']=df1['a'].map(d1).fillna('Other')
df1['b']=df1['b'].map(d2).fillna('Other')
df1['c']=df1['c'].map(d3).fillna('Other')
df1['d']=df1['d'].map(d4).fillna('Other')

combinations


pd.crosstab(df1.a, df1.b)
pd.crosstab(df1.a, df1.c)
pd.crosstab(df1.a, df1.d)
pd.crosstab(df1.b, df1.c)
pd.crosstab(df1.b, df1.d)
pd.crosstab(df1.c, df1.d)


pd.crosstab(df1.a, [df1.b, df1.c])
pd.crosstab(df1.a, [df1.b, df1.d])
pd.crosstab(df1.a, [df1.c, df1.d])

pd.crosstab(df1.a, [df1.b, df1.c, df1.d])

what I have so far

def cross_tab(data_frame, id_col):
    col_names=['b','c','d']
    datasets = {}
    for i in col_names:
        datasets['crosstab_{}'.format(i)] = pd.crosstab(data_frame[id_col], data_frame[i])
    return datasets
cross_tab(df1, 'a')

EDIT

slightly edited request now separate to cross tabulation - to split output based on whether the table includes a specific value, in this case, dfs (a) with a value of 100 should be stored in a separate list to the rest (b and c)

data

import pandas as pd
import numpy as np

df1 = pd.DataFrame(data={ 
                         'a': [1,1,1,1], 
                         'b': [1,1,2,1], 
                         'c': [1,2,2,1]
                        })

d1={0: 'right', 1: 'left'} 
d2={1: 'yes', 2: 'no'} 
d3={1: 'up', 2: 'down', 3: 'sideways'} 
#d4={1: 'yes', 2: 'no'} 

df1['a']=df1['a'].map(d1).fillna('Other')
df1['b']=df1['b'].map(d2).fillna('Other')
df1['c']=df1['c'].map(d3).fillna('Other')

command solved (i think)

def split_cross_tabs(dataframe, cols, contain_val):
    datasets = defaultdict(dict)
    for x in df1:
        p = df1[x].value_counts(normalize=True)*100
        datasets[
            'y' if p.eq(contain_val).any().any() else'n']['crosstab_{}'.format(x)] = p
    return datasets  

output

defaultdict(dict,
            {'y': {'crosstab_a': left    100.0
              Name: a, dtype: float64},
             'n': {'crosstab_b': yes    75.0
              no     25.0
              Name: b, dtype: float64,
              'crosstab_c': down    50.0
              up      50.0
              Name: c, dtype: float64}})

Upvotes: 1

Views: 803

Answers (1)

Henry Ecker
Henry Ecker

Reputation: 35646

Try with the itertools recipe for a powerset and modify to only keep combinations of length 2 or greater:

from itertools import chain, combinations


def all_cross_tabs(dataframe, cols):
    datasets = {}
    for s in chain.from_iterable(
            combinations(cols, r) for r in range(2, len(cols) + 1)
    ):
        datasets[f'crosstab_{"_".join(s)}'] = pd.crosstab(
            dataframe[s[0]],
            [dataframe[c] for c in s[1:]]
        )
    return datasets

Sample:

d = all_cross_tabs(df1, ['a', 'b', 'c', 'd'])

d.keys():

dict_keys(['crosstab_a_b', 'crosstab_a_c', 'crosstab_a_d', 'crosstab_b_c',
           'crosstab_b_d', 'crosstab_c_d', 'crosstab_a_b_c', 'crosstab_a_b_d',
           'crosstab_a_c_d', 'crosstab_b_c_d', 'crosstab_a_b_c_d'])

d['crosstab_a_b']:

b      10  20  30
a                
left    0   3   1
right   3   1   2

d['crosstab_a_b_c']:

b        10        20        30    
c     green red green red green red
a                                  
left      0   0     2   1     0   1
right     2   1     1   0     1   1

d['crosstab_a_b_c_d']

b        10        20            30    
c     green red green     red green red
d       yes  no    no yes  no   yes yes
a                                      
left      0   0     1   1   1     0   1
right     2   1     0   1   0     1   1

Edit: Split into two sections based on contain_val

def split_cross_tabs(dataframe, cols, contain_val):
    datasets = defaultdict(dict)
    for s in chain.from_iterable(
            combinations(cols, r) for r in range(2, len(cols) + 1)
    ):
        ct_df = pd.crosstab(
            dataframe[s[0]],
            [dataframe[c] for c in s[1:]]
        )
        datasets[
            'y' if ct_df.eq(contain_val).any().any() else 'n'
        ][f'crosstab_{"_".join(s)}'] = ct_df
    return datasets


d = split_cross_tabs(df1, ['a', 'b', 'c', 'd'], 3)

d.keys():

dict_keys(['y', 'n'])

list(map(lambda a: a.keys(), d.values())):

[dict_keys(['crosstab_a_b', 'crosstab_b_c', 'crosstab_b_d']),
 dict_keys(['crosstab_a_c', 'crosstab_a_d', 'crosstab_c_d', 'crosstab_a_b_c',
            'crosstab_a_b_d', 'crosstab_a_c_d', 'crosstab_b_c_d',
            'crosstab_a_b_c_d'])]

Upvotes: 3

Related Questions