Reputation: 297
I have the following dataset and I want to compute all possible combinations of cross-tabulations in the most efficient way, I have been able to calculate pairs against one master variable, but not for all possibilities (i have popped what i mean in below). Is there a way to get this all in loop that could handle any number of columns? Thanks so much!
data
import pandas as pd
df1 = pd.DataFrame(data={'id': [1,2,3,4,5,6,7,8,9,10],
'a': [1,1,2,2,2,1,1,2,1,1],
'b': [1,2,3,3,3,2,1,2,3,1],
'c': [1,2,2,1,1,1,1,2,1,2],
'd': [1,1,2,2,1,1,1,1,1,2],
})
d1={1: 'right', 2: 'left'}
d2={1: '10', 2: '30', 3: '20'}
d3={1: 'green', 2: 'red'}
d4={1: 'yes', 2: 'no'}
df1['a']=df1['a'].map(d1).fillna('Other')
df1['b']=df1['b'].map(d2).fillna('Other')
df1['c']=df1['c'].map(d3).fillna('Other')
df1['d']=df1['d'].map(d4).fillna('Other')
combinations
pd.crosstab(df1.a, df1.b)
pd.crosstab(df1.a, df1.c)
pd.crosstab(df1.a, df1.d)
pd.crosstab(df1.b, df1.c)
pd.crosstab(df1.b, df1.d)
pd.crosstab(df1.c, df1.d)
pd.crosstab(df1.a, [df1.b, df1.c])
pd.crosstab(df1.a, [df1.b, df1.d])
pd.crosstab(df1.a, [df1.c, df1.d])
pd.crosstab(df1.a, [df1.b, df1.c, df1.d])
what I have so far
def cross_tab(data_frame, id_col):
col_names=['b','c','d']
datasets = {}
for i in col_names:
datasets['crosstab_{}'.format(i)] = pd.crosstab(data_frame[id_col], data_frame[i])
return datasets
cross_tab(df1, 'a')
EDIT
slightly edited request now separate to cross tabulation - to split output based on whether the table includes a specific value, in this case, dfs (a) with a value of 100 should be stored in a separate list to the rest (b and c)
data
import pandas as pd
import numpy as np
df1 = pd.DataFrame(data={
'a': [1,1,1,1],
'b': [1,1,2,1],
'c': [1,2,2,1]
})
d1={0: 'right', 1: 'left'}
d2={1: 'yes', 2: 'no'}
d3={1: 'up', 2: 'down', 3: 'sideways'}
#d4={1: 'yes', 2: 'no'}
df1['a']=df1['a'].map(d1).fillna('Other')
df1['b']=df1['b'].map(d2).fillna('Other')
df1['c']=df1['c'].map(d3).fillna('Other')
command solved (i think)
def split_cross_tabs(dataframe, cols, contain_val):
datasets = defaultdict(dict)
for x in df1:
p = df1[x].value_counts(normalize=True)*100
datasets[
'y' if p.eq(contain_val).any().any() else'n']['crosstab_{}'.format(x)] = p
return datasets
output
defaultdict(dict,
{'y': {'crosstab_a': left 100.0
Name: a, dtype: float64},
'n': {'crosstab_b': yes 75.0
no 25.0
Name: b, dtype: float64,
'crosstab_c': down 50.0
up 50.0
Name: c, dtype: float64}})
Upvotes: 1
Views: 803
Reputation: 35646
Try with the itertools recipe for a powerset
and modify to only keep combinations of length 2 or greater:
from itertools import chain, combinations
def all_cross_tabs(dataframe, cols):
datasets = {}
for s in chain.from_iterable(
combinations(cols, r) for r in range(2, len(cols) + 1)
):
datasets[f'crosstab_{"_".join(s)}'] = pd.crosstab(
dataframe[s[0]],
[dataframe[c] for c in s[1:]]
)
return datasets
Sample:
d = all_cross_tabs(df1, ['a', 'b', 'c', 'd'])
d.keys()
:
dict_keys(['crosstab_a_b', 'crosstab_a_c', 'crosstab_a_d', 'crosstab_b_c',
'crosstab_b_d', 'crosstab_c_d', 'crosstab_a_b_c', 'crosstab_a_b_d',
'crosstab_a_c_d', 'crosstab_b_c_d', 'crosstab_a_b_c_d'])
d['crosstab_a_b']
:
b 10 20 30
a
left 0 3 1
right 3 1 2
d['crosstab_a_b_c']
:
b 10 20 30
c green red green red green red
a
left 0 0 2 1 0 1
right 2 1 1 0 1 1
d['crosstab_a_b_c_d']
b 10 20 30
c green red green red green red
d yes no no yes no yes yes
a
left 0 0 1 1 1 0 1
right 2 1 0 1 0 1 1
Edit: Split into two sections based on contain_val
def split_cross_tabs(dataframe, cols, contain_val):
datasets = defaultdict(dict)
for s in chain.from_iterable(
combinations(cols, r) for r in range(2, len(cols) + 1)
):
ct_df = pd.crosstab(
dataframe[s[0]],
[dataframe[c] for c in s[1:]]
)
datasets[
'y' if ct_df.eq(contain_val).any().any() else 'n'
][f'crosstab_{"_".join(s)}'] = ct_df
return datasets
d = split_cross_tabs(df1, ['a', 'b', 'c', 'd'], 3)
d.keys()
:
dict_keys(['y', 'n'])
list(map(lambda a: a.keys(), d.values()))
:
[dict_keys(['crosstab_a_b', 'crosstab_b_c', 'crosstab_b_d']),
dict_keys(['crosstab_a_c', 'crosstab_a_d', 'crosstab_c_d', 'crosstab_a_b_c',
'crosstab_a_b_d', 'crosstab_a_c_d', 'crosstab_b_c_d',
'crosstab_a_b_c_d'])]
Upvotes: 3