scriptgirl_3000
scriptgirl_3000

Reputation: 303

How to place a median marker on boxplots at the medians from a different group

I made a grouped boxplot with seaborn. I have two subplots that show different types of data. To also compare the types while keeping the groups as they are, I'd like to plot the medians of the data frame for type 2 on the boxplot for type 1, and vice versa. This is my script:

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import netCDF4 as nc
# Global seaborn look: tick-style axes drawn with the pastel palette.
sns.set_theme(style='ticks', palette='pastel')

# One wide figure that holds the two side-by-side panels.
fig = plt.figure(figsize=(15, 5))

# Outer margins and the spacing between panels, set in a single call.
fig.subplots_adjust(
    left=0.12,
    right=0.98,
    bottom=0.1,
    top=0.98,
    wspace=0.15,
    hspace=0.12,
)

# Text settings; they are in place before the axes are created below.
plt.rcParams.update({
    'text.usetex': False,
    'axes.labelsize': 12,
    'font.size': 11,
    'legend.fontsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
})

# Left panel (ax1) and right panel (ax2) of a 1x2 grid.
ax1, ax2 = (fig.add_subplot(1, 2, position) for position in (1, 2))

def _overlay_group_medians(axis, long_frame, models, methods, box_width):
    """Mark every (model, method) median of ``long_frame`` on its own box.

    Parameters:
        axis: Matplotlib axes that already holds the grouped box plot.
        long_frame: Long-format frame with 'Model', 'Method' and 'var'
            columns whose medians are drawn.
        models: Model names in the order used on the x axis.
        methods: Method names in the order used for the hue.
        box_width: Total width of one model group, as passed to the box plot.
    """
    n_methods = len(methods)

    # Median per (model, method), arranged as a models x methods table so the
    # rows and columns line up with the box layout regardless of data order.
    medians = (
        long_frame.groupby(['Model', 'Method'])['var']
        .median()
        .unstack('Method')
        .reindex(index=models, columns=methods)
    )

    # Each model group is centred on its integer tick; the group width is
    # shared equally between the methods, so box centres sit at these offsets.
    slot = box_width / n_methods
    offsets = (np.arange(n_methods) - (n_methods - 1) / 2) * slot
    x_positions = np.arange(len(models))[:, None] + offsets[None, :]

    # zorder lifts the markers above the box artists so they stay visible.
    axis.scatter(x_positions.ravel(), medians.to_numpy().ravel(),
                 s=20, marker='*', color='red', zorder=3)


def grouped_boxplot(axis_type1, axis_type2):
    """Draw grouped box plots for two data types with cross-type medians.

    Random sample data is generated for four models and seven methods, once
    per data type. Type 1 is drawn on ``axis_type1`` and type 2 on
    ``axis_type2`` (x = model, hue = method). Each box is then marked with a
    red star at the median of the same model/method combination from the
    *other* type, so the two types can be compared box by box.

    Parameters:
        axis_type1: Matplotlib axes receiving the type 1 box plot.
        axis_type2: Matplotlib axes receiving the type 2 box plot.
    """
    methods = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
    models = ['model1', 'model2', 'model3', 'model4']
    # Passed to seaborn explicitly so the marker offsets match the boxes.
    box_width = 0.8

    # Ten random integers in [1, 100] per method, model and type.
    frames_type1, frames_type2 = [], []
    for model in models:
        sample_type1, sample_type2 = {}, {}
        for m in methods:
            sample_type1[m] = np.random.randint(1, 101, 10)
            sample_type2[m] = np.random.randint(1, 101, 10)
        frames_type1.append(pd.DataFrame(sample_type1).assign(Model=model))
        frames_type2.append(pd.DataFrame(sample_type2).assign(Model=model))

    # Long format: one row per observation with its model and method.
    df_type1_long = pd.melt(pd.concat(frames_type1), 'Model',
                            var_name='Method', value_name='var')
    df_type2_long = pd.melt(pd.concat(frames_type2), 'Model',
                            var_name='Method', value_name='var')

    # Category orders are fixed explicitly so box positions are known.
    for axis, df_long in ((axis_type1, df_type1_long),
                          (axis_type2, df_type2_long)):
        sns.boxplot(x='Model', hue='Method', y='var', data=df_long,
                    order=models, hue_order=methods, width=box_width,
                    showfliers=False, whis=0, ax=axis)

    # Type 2 medians go on the type 1 panel and vice versa.
    _overlay_group_medians(axis_type1, df_type2_long, models, methods,
                           box_width)
    _overlay_group_medians(axis_type2, df_type1_long, models, methods,
                           box_width)

    # Only the right panel keeps a legend; it is placed below the panels.
    axis_type1.legend([], [], frameon=False)
    axis_type2.legend(loc='lower center', bbox_to_anchor=(-0.2, -0.25),
                      ncol=len(methods))

# Fill both panels: ax1 gets the type 1 box plot, ax2 the type 2 box plot.
grouped_boxplot(ax1, ax2)

# Show the figure on screen; the commented line below is the alternative
# that writes it to a PDF file.
plt.show()
# plt.savefig('the_ultimate_boxplot.pdf')

I managed to plot the medians onto the boxplots, but the markers sit right on the x-ticks.

enter image description here

Is there a way so I can have a symbol for the median of m1 (blue boxplot) for model 1 for type 2 on m1 (blue boxplot) for model 1 for type 1, the median for m2 (orange boxplot) for model 1 for type 2 on m2 (orange boxplot) for model 1 for type 1 [...]?

Upvotes: 0

Views: 631

Answers (1)

JohanC
JohanC

Reputation: 80509

sns.pointplot can be used to calculate and position the medians.

The example code uses the following parameters for pointplot:

  • dodge=.8 - .8 / len(methods): dodge separates out the points per hue. The default dodge width is different for point plots than for box plots. See this GitHub issue.
  • linestyles='': don't draw lines between the points
  • markers='D': use a diamond marker
  • color='black': the color for the marker (by default the color would come from the hue)
  • estimator=np.median: calculate the median of the y-values; note that these are on the same spot as the central line of the box plots
  • ci=None: don't show a confidence interval

The legend has been changed to remove the entries from the pointplot. The x-position of bbox_to_anchor is set to half of wspace in an attempt to center the legend between the two subplots.

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Global seaborn look: tick-style axes drawn with the pastel palette.
sns.set_theme(style='ticks', palette='pastel')

fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(wspace=0.15, right=0.98, left=0.04, bottom=0.14, top=0.98)

axis_type1 = fig.add_subplot(1, 2, 1)
axis_type2 = fig.add_subplot(1, 2, 2)
methods = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
models = ['model1', 'model2', 'model3', 'model4']

# Random long-format sample data: one row per observation, with a randomly
# chosen model and method and an integer value in [1, 100].
df_type1_long = pd.DataFrame({'Model': np.random.choice(models, 500),
                              'Method': np.random.choice(methods, 500),
                              'var': np.random.randint(1, 101, 500)})
df_type2_long = pd.DataFrame({'Model': np.random.choice(models, 800),
                              'Method': np.random.choice(methods, 800),
                              'var': np.random.randint(1, 101, 800)})

for df_long, ax in zip([df_type1_long, df_type2_long], [axis_type1, axis_type2]):
    sns.boxplot(x='Model', hue='Method', y='var', data=df_long,
                showfliers=False, whis=0, ax=ax)
    # Black diamonds at the per-group medians; the dodge value spreads the
    # markers so they line up with the boxes.
    # NOTE(review): seaborn 0.12 replaced `ci` with `errorbar`; on newer
    # releases `errorbar=None` should be the equivalent - confirm against
    # the installed version.
    sns.pointplot(x='Model', hue='Method', y='var', dodge=.8 - .8 / len(methods),
                  linestyles='', markers='D', color='black', estimator=np.median, ci=None,
                  data=df_long, ax=ax)
    # Optional variant: mark the minimum of each group with a triangle.
    # sns.pointplot(x='Model', hue='Method', y='var', dodge=.8 - .8 / len(methods),
    #               linestyles='', markers='v', color='black', estimator=np.min, ci=None,
    #               data=df_long, ax=ax)
axis_type1.set_xlabel('')
axis_type2.set_xlabel('')
axis_type1.legend_.remove()

# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name, so look up whichever attribute exists.
current_legend = axis_type2.legend_
legend_handles = getattr(current_legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = current_legend.legendHandles

# Keep only the first len(methods) handles to drop the point-plot entries,
# and place the legend below the panels.
axis_type2.legend(handles=legend_handles[:len(methods)],
                  loc='upper center', bbox_to_anchor=(-0.075, -0.06), ncol=len(methods))
plt.show()

sns.pointplot to show medians on boxplots

Upvotes: 2

Related Questions