Reputation: 303
I made a grouped boxplot with seaborn. I have two subplots that describe different types of data and in order to also compare the types (I want to keep the groups as they are), I'd like to plot the median of the data frame for type 2 on the boxplot for type 1 and vice versa. This is my script
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import netCDF4 as nc
# Apply the seaborn theme before the manual rcParams overrides below.
sns.set_theme(style='ticks', palette='pastel')

# One wide figure; margins and inter-panel spacing are set in a single call.
fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(left=0.12, right=0.98, bottom=0.1, top=0.98,
                    wspace=0.15, hspace=0.12)

# Font sizes for labels, ticks and legend; LaTeX text rendering stays off.
plt.rcParams.update({
    'text.usetex': False,
    'axes.labelsize': 12,
    'font.size': 11,
    'legend.fontsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
})

# Two side-by-side panels: ax1 on the left, ax2 on the right.
ax1, ax2 = (fig.add_subplot(1, 2, position) for position in (1, 2))
def grouped_boxplot(axis_type1, axis_type2):
    """Draw grouped box plots for two data types and cross-plot their medians.

    Random demo data (10 integers in [1, 100] per model and method) is
    generated for both types.  ``axis_type1`` shows the type-1 boxes, grouped
    by model on the x axis and by method as hue, and on top of every box a
    red star at the type-2 median of the same (model, method) pair.
    ``axis_type2`` shows the mirror image: type-2 boxes with type-1 medians.

    Parameters
    ----------
    axis_type1, axis_type2 : matplotlib.axes.Axes
        Axes that receive the type-1 and type-2 box plots respectively.

    Returns
    -------
    None
        The axes are modified in place.
    """
    methods = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
    models = ['model1', 'model2', 'model3', 'model4']
    # Total width seaborn gives one group of hue boxes by default (width=0.8).
    box_group_width = 0.8

    def random_long_frame():
        """Return random demo data in long form (columns Model, Method, var)."""
        wide = pd.concat([
            pd.DataFrame({m: np.random.randint(1, 101, 10) for m in methods})
            .assign(Model=model)
            for model in models
        ])
        return pd.melt(wide, 'Model', var_name='Method', value_name='var')

    df_type1_long = random_long_frame()
    df_type2_long = random_long_frame()

    # Centre of each hue box relative to its category tick: the group width
    # is split evenly between the hue levels and centred on the tick.
    n_methods = len(methods)
    hue_offsets = ((np.arange(n_methods) - (n_methods - 1) / 2)
                   * (box_group_width / n_methods))
    # x position of every (model, method) box, model-major order.
    marker_x = (np.arange(len(models))[:, None] + hue_offsets[None, :]).ravel()

    panels = ((axis_type1, df_type1_long, df_type2_long),
              (axis_type2, df_type2_long, df_type1_long))
    for ax, own_data, other_data in panels:
        # Explicit order/hue_order pin the box layout that marker_x assumes.
        sns.boxplot(x='Model', hue='Method', y='var', data=own_data,
                    order=models, hue_order=methods,
                    showfliers=False, whis=0, ax=ax)

        # Median per (model, method) of the *other* type.  Grouping the long
        # frame keeps the string 'Model' column out of the aggregation and
        # yields one value per box instead of one per method.
        other_medians = (other_data.groupby(['Model', 'Method'])['var']
                         .median()
                         .unstack('Method')
                         .reindex(index=models, columns=methods))

        # zorder above the box artists so the stars are not hidden by them.
        ax.scatter(marker_x, other_medians.to_numpy().ravel(),
                   s=20, marker='*', color='red', zorder=3)

    # Only one shared legend: blank the left one, place the right one below
    # and between the two panels.
    axis_type1.legend([], [], frameon=False)
    axis_type2.legend(loc='lower center', bbox_to_anchor=(-0.2, -0.25),
                      ncol=len(methods))
# Draw both panels on the two axes created earlier in this script.
grouped_boxplot(ax1, ax2)
# Display the figure.
plt.show()
# Optional: write the figure to a PDF file.
# plt.savefig('the_ultimate_boxplot.pdf')
I managed to plot the medians onto the boxplot, but each marker sits exactly on the x tick.
Is there a way so I can have a symbol for the median of m1 (blue boxplot) for model 1 for type 2 on m1 (blue boxplot) for model 1 for type 1, the median for m2 (orange boxplot) for model 1 for type 2 on m2 (orange boxplot) for model 1 for type 1 [...]?
Upvotes: 0
Views: 631
Reputation: 80509
`sns.pointplot` can be used to calculate and position the medians.
The example code uses the following parameters for `pointplot`:
- `dodge=.8 - .8 / len(methods)`: `dodge` separates out the points per hue. The default dodge width for point plots is different from the one for box plots. See this GitHub issue.
- `linestyles=''`: don't draw lines between the points
- `markers='D'`: use a diamond marker
- `color='black'`: the color for the marker (by default the color would come from the hue)
- `estimator=np.median`: calculate the median of the y-values; note that these are on the same spot as the central line of the box plots
- `ci=None`: don't show a confidence interval

The legend has been changed to remove the entries from the `pointplot`. The x-position of `bbox_to_anchor` is set to half of `wspace` in an attempt to center the legend between the two subplots.
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
sns.set_theme(style='ticks', palette='pastel')

# One wide figure with two side-by-side panels.
fig = plt.figure(figsize=(15, 5))
fig.subplots_adjust(wspace=0.15, right=0.98, left=0.04, bottom=0.14, top=0.98)
axis_type1 = fig.add_subplot(1, 2, 1)
axis_type2 = fig.add_subplot(1, 2, 2)

methods = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7']
models = ['model1', 'model2', 'model3', 'model4']

# Random long-form demo data: one row per observation.
df_type1_long = pd.DataFrame({'Model': np.random.choice(models, 500),
                              'Method': np.random.choice(methods, 500),
                              'var': np.random.randint(1, 101, 500)})
df_type2_long = pd.DataFrame({'Model': np.random.choice(models, 800),
                              'Method': np.random.choice(methods, 800),
                              'var': np.random.randint(1, 101, 800)})

for df_long, ax in zip([df_type1_long, df_type2_long], [axis_type1, axis_type2]):
    # order/hue_order are given explicitly: the rows are randomly sampled, so
    # the order of first appearance can differ between the two frames, and the
    # single legend taken from the right panel must also describe the left.
    sns.boxplot(x='Model', hue='Method', y='var', data=df_long,
                order=models, hue_order=methods,
                showfliers=False, whis=0, ax=ax)
    # Median marker per (model, method); the dodge value spreads the points
    # over the hue levels so they line up with the boxes.
    # NOTE(review): newer seaborn releases deprecate `ci` in favour of
    # `errorbar=None` - confirm against the installed version.
    sns.pointplot(x='Model', hue='Method', y='var', dodge=.8 - .8 / len(methods),
                  linestyles='', markers='D', color='black', estimator=np.median, ci=None,
                  order=models, hue_order=methods,
                  data=df_long, ax=ax)
    # A further statistic can be marked the same way with a second pointplot
    # call, e.g. markers='v' and estimator=np.min.

axis_type1.set_xlabel('')
axis_type2.set_xlabel('')

# Keep a single legend (right panel) and drop the pointplot entries from it:
# only the first len(methods) handles, which come from the box plot, are kept.
axis_type1.legend_.remove()
current_legend = axis_type2.legend_
# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and later
# removed the old name; try the new one and fall back to the old.
legend_handles = getattr(current_legend, 'legend_handles', None)
if legend_handles is None:
    legend_handles = current_legend.legendHandles
axis_type2.legend(handles=legend_handles[:len(methods)],
                  loc='upper center', bbox_to_anchor=(-0.075, -0.06), ncol=len(methods))
plt.show()
Upvotes: 2