user2067687
user2067687

Reputation:

Tukey-Test Grouping and plotting in SciPy

I'm trying to plot results from a Tukey test, but I am struggling with putting data into groups based on a P-Value. This is the equivalent in R which I am trying to replicate. I have been using the SciPy one-way ANOVA tests and the Tukey test statsmodel but can't get these groups done in the same way.

Any help is greatly appreciated

I've also just found this another example in R of what I want to do in python

Upvotes: 1

Views: 3514

Answers (3)

caiohamamura
caiohamamura

Reputation: 2748

I solved this by using more tabular approach, only iterating through the rows because of inevitable iterative approach of labelling:

import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd


df_iris = sm.datasets.get_rdataset('PlantGrowth').data
print(df_iris.columns)
group_column = 'group'
independent_variable = 'weight'

# Tukey test
tukey_result = pairwise_tukeyhsd(endog=df_iris[independent_variable], groups=df_iris[group_column], alpha=0.05)


# Which group is smaller?
res = pd.DataFrame(tukey_result.summary().data[1:], columns=tukey_result.summary().data[0])
print(res)

# Get the means and sort
means = df_iris.groupby(group_column)[independent_variable].mean().sort_values()
means = pd.DataFrame(means)
print(means)

res['lesser_first'] = res['meandiff'] > 0

# Swap groups if the first group is not the smaller one
res.loc[~res['lesser_first'], ['group1', 'group2']] = res.loc[~res['lesser_first'], ['group2', 'group1']].values

# Calculate the means for the groups
res['minMean'] = means.loc[res['group1'].tolist()].values
res['maxMean'] = means.loc[res['group2'].tolist()].values

# Sort the tests by minMean and then maxMean
res.sort_values(['minMean', 'maxMean'], inplace=True)

# Add the column for the group
current_letter = ord('a')
previous_group = ''
last_labeled = ''
means['group'] = chr(current_letter)

for i, row in res.iterrows():
    # If the group has already found a reject==True just skip
    if row['group1'] == previous_group:
        continue
    # If the group is not the same as the last labeled group, increment the letter 
    # and update the last labeled group
    if row['reject']:
        if row['group2'] != last_labeled:
            current_letter += 1
            last_labeled = row['group2']
            means.loc[row['group2'],'group'] = chr(current_letter)
        previous_group = row['group1']
    # If reject == False and group2 is the same as the last labeled group
    # then assign the same letter to group1 from previous group
    elif row['group2'] == last_labeled:
        means.loc[row['group1'],'group'] = chr(current_letter - 1) + means.loc[row['group2'],'group']
    # reject == False and is not the same as the last labeled group just assign the current letter
    else:
        means.loc[row['group2'],'group'] = chr(current_letter)
    

print(means)

Leads to:

    weight  group
group       
trt1    4.661   a
ctrl    5.032   ab
trt2    5.526   b

Upvotes: 0

Daniel Wagenaar
Daniel Wagenaar

Reputation: 131

Here is a function that returns letter labels if you have a symmetric matrix of p-values from a Tukey test:

import numpy as np

def tukeyLetters(pp, means=None, alpha=0.05):
    '''TUKEYLETTERS - Produce list of group labels for TukeyHSD
    letters = TUKEYLETTERS(pp), where PP is a symmetric matrix of 
    probabilities from a Tukey test, returns alphabetic labels
    for each group to indicate clustering. PP may also be a vector
    from PAIRWISE_TUKEYHSD.
    Optional argument MEANS specifies group means, which is used for
    ordering the letters. ("a" gets assigned to the group with lowest
    mean.) Without this argument, ordering is arbitrary.
    Optional argument ALPHA specifies cutoff for treating groups as
    part of the same cluster.'''

    if len(pp.shape)==1:
        # vector
        G = int(3 + np.sqrt(9 - 4*(2-len(pp))))//2
        ppp = .5*np.eye(G)
        ppp[np.triu_indices(G,1)] = pp    
        pp = ppp + ppp.T
    conn = pp>alpha
    G = len(conn)
    if np.all(conn):
        return ['a' for g in range(G)]
    conns = []
    for g1 in range(G):
        for g2 in range(g1+1,G):
            if conn[g1,g2]:
                conns.append((g1,g2))

    letters = [ [] for g in range(G) ]
    nextletter = 0
    for g in range(G):
        if np.sum(conn[g,:])==1:
            letters[g].append(nextletter)
            nextletter += 1
    while len(conns):
        grp = set(conns.pop(0))
        for g in range(G):
            if all(conn[g, np.sort(list(grp))]):
                grp.add(g)
        for g in grp:
            letters[g].append(nextletter)
        for g in grp:
            for h in grp:
                if (g,h) in conns:
                    conns.remove((g,h))
        nextletter += 1

    if means is None:
        means = np.arange(G)
    means = np.array(means)
    groupmeans = []
    for k in range(nextletter):
        ingroup = [g for g in range(G) if k in letters[g]]
        groupmeans.append(means[np.array(ingroup)].mean())
    ordr = np.empty(nextletter, int)
    ordr[np.argsort(groupmeans)] = np.arange(nextletter)
    result = []
    for ltr in letters:
        lst = [chr(97 + ordr[x]) for x in ltr]
        lst.sort()
        result.append(''.join(lst))
    return result

To make that concrete, here is a full example:

from statsmodels.stats.multicomp import pairwise_tukeyhsd

data  = [ 1,2,2,1,4,5,4,5,7,8,7,8,1,3,4,5 ]
group = [ 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 ]
tuk = pairwise_tukeyhsd(data, group) 
letters = tukeyLetters(tuk.pvalues)

This will result in letters containing ['a', 'c', 'b', 'ac']

Upvotes: 1

BrianM
BrianM

Reputation: 48

I have been struggling to do the same thing. I found a paper that tells you how to code the letters.
Hans-Peter Piepho (2004) An Algorithm for a Letter-Based Representation of All-Pairwise Comparisons, Journal of Computational and Graphical Statistics, 13:2, 456-466, DOI: 10.1198/1061860043515

Doing the coding was a little tricky as you need to check and replicate columns and then combine columns. I tried to add some comments to the colde. I figured out a method where you can run tukeyhsd and then from the results compute the letters. It should be possible to turn this into a function. Or hopefully part of tukeyhsd. My data is not posted but it is a column of data and then a column describing the groups. The groups for me are the five boroughs of NYC. You can also just change the comments and use random data the first time.

# Read data.  Comment out the next ones to use random data.  
df=pd.read_excel('anova_test.xlsx')
#n=1000
#df = pd.DataFrame(columns=['Groups','Data'],index=np.arange(n))
#df['Groups']=np.random.randint(1, 4,size=n)
#df['Data']=df['Groups']*np.random.random_sample(size=n)


# define columns for data and then grouping
col_to_group='Groups'
col_for_data='Data'

#Now take teh data and regroup for anova
samples = [cols[1] for cols in df.groupby(col_to_group)[col_for_data]]    #I am not sure how this works but it makes an numpy array for each group 
f_val, p_val = stats.f_oneway(*samples)  # I am not sure what this star does but this passes all the numpy arrays correctly
#print('F value: {:.3f}, p value: {:.3f}\n'.format(f_val, p_val))

# this if statement can be uncommmented if you don't won't to go furhter with out p<0.05
#if p_val<0.05:    #If the p value is less than 0.05 it then does the tukey
mod = MultiComparison(df[col_for_data], df[col_to_group])
thsd=mod.tukeyhsd()
#print(mod.tukeyhsd())

#this is a function to do Piepho method.  AN Alogrithm for a letter based representation of al-pairwise comparisons.  
tot=len(thsd.groupsunique)
#make an empty dataframe that is a square matrix of size of the groups. #set first column to 1
df_ltr=pd.DataFrame(np.nan, index=np.arange(tot),columns=np.arange(tot))
df_ltr.iloc[:,0]=1
count=0
df_nms = pd.DataFrame('', index=np.arange(tot), columns=['names'])  # I make a dummy dataframe to put axis labels into.  sd stands for signifcant difference

for i in np.arange(tot):   #I loop through and make all pairwise comparisons. 
    for j in np.arange(i+1,tot):
        #print('i=',i,'j=',j,thsd.reject[count])
        if thsd.reject[count]==True:
            for cn in np.arange(tot):
                if df_ltr.iloc[i,cn]==1 and df_ltr.iloc[j,cn]==1: #If the column contains both i and j shift and duplicat
                    df_ltr=pd.concat([df_ltr.iloc[:,:cn+1],df_ltr.iloc[:,cn+1:].T.shift().T],axis=1)
                    df_ltr.iloc[:,cn+1]=df_ltr.iloc[:,cn]
                    df_ltr.iloc[i,cn]=0
                    df_ltr.iloc[j,cn+1]=0
                #Now we need to check all columns for abosortpion.
                for cleft in np.arange(len(df_ltr.columns)-1):
                    for cright in np.arange(cleft+1,len(df_ltr.columns)):
                        if (df_ltr.iloc[:,cleft].isna()).all()==False and (df_ltr.iloc[:,cright].isna()).all()==False: 
                            if (df_ltr.iloc[:,cleft]>=df_ltr.iloc[:,cright]).all()==True:  
                                df_ltr.iloc[:,cright]=0
                                df_ltr=pd.concat([df_ltr.iloc[:,:cright],df_ltr.iloc[:,cright:].T.shift(-1).T],axis=1)
                            if (df_ltr.iloc[:,cleft]<=df_ltr.iloc[:,cright]).all()==True:
                                df_ltr.iloc[:,cleft]=0
                                df_ltr=pd.concat([df_ltr.iloc[:,:cleft],df_ltr.iloc[:,cleft:].T.shift(-1).T],axis=1)

        count+=1

#I sort so that the first column becomes A        
df_ltr=df_ltr.sort_values(by=list(df_ltr.columns),axis=1,ascending=False)

# I assign letters to each column
for cn in np.arange(len(df_ltr.columns)):
    df_ltr.iloc[:,cn]=df_ltr.iloc[:,cn].replace(1,chr(97+cn)) 
    df_ltr.iloc[:,cn]=df_ltr.iloc[:,cn].replace(0,'')
    df_ltr.iloc[:,cn]=df_ltr.iloc[:,cn].replace(np.nan,'') 

#I put all the letters into one string
df_ltr=df_ltr.astype(str)
df_ltr.sum(axis=1)
#print(df_ltr)
#print('\n')
#print(df_ltr.sum(axis=1))

#Now to plot like R with a violing plot
fig,ax=plt.subplots()
df.boxplot(column=col_for_data, by=col_to_group,ax=ax,fontsize=16,showmeans=True
                    ,boxprops=dict(linewidth=2.0),whiskerprops=dict(linewidth=2.0))  #This makes the boxplot

ax.set_ylim([-10,20])

grps=pd.unique(df[col_to_group].values)   #Finds the group names
grps.sort() # This is critical!  Puts the groups in alphabeical order to make it match the plotting

props=dict(facecolor='white',alpha=1)
for i,grp in enumerate(grps):   #I loop through the groups to make the scatters and figure out the axis labels. 

    x = np.random.normal(i+1, 0.15, size=len(df[df[col_to_group]==grp][col_for_data]))
    ax.scatter(x,df[df[col_to_group]==grp][col_for_data],alpha=0.5,s=2)
    name="{}\navg={:0.2f}\n(n={})".format(grp
                            ,df[df[col_to_group]==grp][col_for_data].mean()
                            ,df[df[col_to_group]==grp][col_for_data].count())
    df_nms['names'][i]=name 
    ax.text(i+1,ax.get_ylim()[1]*1.1,df_ltr.sum(axis=1)[i],fontsize=10,verticalalignment='top',horizontalalignment='center',bbox=props)


ax.set_xticklabels(df_nms['names'],rotation=0,fontsize=10)
ax.set_title('')
fig.suptitle('')

fig.savefig('anovatest.jpg',dpi=600,bbox_inches='tight')

Results showing the letters above plots using the tukeyhsd

Upvotes: 2

Related Questions