Reputation: 23
I am making a grouped bar chart of proficiency levels on a standardized test. Here is my code:
bush_prof_boy = bush.groupby(['BOY Prof'])['BOY Prof'].count()
bush_prof_pct_boy = bush_prof_boy/bush['BOY Prof'].count() * 100
bush_prof_eoy = bush.groupby(['EOY Prof'])['EOY Prof'].count()
bush_prof_pct_eoy = bush_prof_eoy/bush['EOY Prof'].count() * 100
labels = ['Remedial', 'Below Proficient', 'Proficient', 'Advanced']
fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, bush_prof_pct_boy, width, label='BOY',
color='mediumorchid')
rects2 = ax.bar(x + width/2, bush_prof_pct_eoy, width, label='EOY', color='teal')
ax.set_ylabel('% of Students at Proficiency Level', fontsize=18)
ax.set_title('Bushwick Middle Change in Proficiency Levels', fontsize=25)
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=25)
ax.legend(fontsize=25)
plt.yticks(fontsize=15)
plt.figure(figsize=(5,15))
plt.show()
"BOY" stands for "Beginning of Year" and "EOY" "End of Year" so the bar graph is intended to show percent of students who fell into each proficiency level at the beginning and end of the year. The graph looks alright but when I drill into the numbers, I can see that the labels for EOY are incorrect. Here is my graph:
The percentages for BOY are graphed correctly, but the EOY ones are with the wrong labels. Here are the actual percentages, which I am certain are correct:
BOY %
Advanced 14.0
Below Proficient 38.0
Proficient 34.0
Remedial 14.0
EOY %
Advanced 39.0
Below Proficient 18.0
Proficient 32.0
Remedial 11.0
Upvotes: 2
Views: 432
Reputation: 62373
.cut
.melt
, and then use .groupby
to calculate percentage within the 'x of Year'
.pivot
, and plot with pandas.DataFrame.plot
python 3.8
, pandas 1.3.1
, and matplotlib 3.4.2
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
# data
data = {'BOY': [11.0, 11.0, 11.0, 11.0, 11.0, 8.0, 11.0, 14.0, 12.0, 13.0, 11.0, 14.0, 10.0, 9.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 12.0, 11.0, 9.0, 12.0, 16.0, 12.0, 12.0, 12.0, 15.0, 10.0, 10.0, 10.0, 8.0, 11.0, 12.0, 14.0, 10.0, 8.0, 11.0, 12.0, 14.0, 12.0, 13.0, 15.0, 13.0, 8.0, 8.0, 11.0, 10.0, 11.0, 13.0, 11.0, 13.0, 15.0, 10.0, 8.0, 10.0, 9.0, 8.0, 11.0, 13.0, 11.0, 8.0, 11.0, 15.0, 11.0, 12.0, 17.0, 12.0, 11.0, 18.0, 14.0, 15.0, 16.0, 7.0, 11.0, 15.0, 16.0, 13.0, 13.0, 13.0, 0.0, 11.0, 15.0, 14.0, 11.0, 13.0, 16.0, 14.0, 12.0, 8.0, 13.0, 13.0, 14.0, 7.0, 10.0, 16.0, 10.0, 13.0, 10.0, 14.0, 8.0, 16.0, 13.0, 12.0, 14.0, 12.0, 14.0, 16.0, 15.0, 13.0, 13.0, 10.0, 14.0, 8.0, 10.0, 10.0, 11.0, 12.0, 10.0, 12.0, 14.0, 17.0, 13.0, 14.0, 16.0, 15.0, 13.0, 16.0, 9.0, 16.0, 15.0, 11.0, 11.0, 15.0, 14.0, 12.0, 15.0, 11.0, 16.0, 14.0, 14.0, 15.0, 14.0, 14.0, 14.0, 16.0, 15.0, 12.0, 12.0, 14.0, 15.0, 13.0, 14.0, 13.0, 17.0, 14.0, 13.0, 14.0, 13.0, 13.0, 12.0, 10.0, 15.0, 14.0, 12.0, 12.0, 14.0, 12.0, 14.0, 13.0, 15.0, 13.0, 14.0, 14.0, 12.0, 11.0, 15.0, 14.0, 14.0, 10.0], 'EOY': [16.0, 16.0, 16.0, 14.0, 10.0, 14.0, 16.0, 14.0, 15.0, 15.0, 15.0, 11.0, 11.0, 15.0, 10.0, 14.0, 17.0, 14.0, 9.0, 15.0, 14.0, 16.0, 14.0, 13.0, 11.0, 13.0, 12.0, 14.0, 15.0, 13.0, 14.0, 15.0, 12.0, 19.0, 9.0, 13.0, 11.0, 14.0, 17.0, 17.0, 14.0, 13.0, 14.0, 10.0, 16.0, 15.0, 12.0, 11.0, 12.0, 14.0, 15.0, 10.0, 15.0, 14.0, 14.0, 15.0, 18.0, 15.0, 10.0, 10.0, 15.0, 15.0, 13.0, 15.0, 19.0, 13.0, 18.0, 20.0, 21.0, 17.0, 18.0, 17.0, 18.0, 17.0, 12.0, 16.0, 15.0, 18.0, 19.0, 17.0, 20.0, 11.0, 18.0, 19.0, 11.0, 12.0, 17.0, 20.0, 17.0, 15.0, 13.0, 18.0, 14.0, 17.0, 12.0, 12.0, 16.0, 12.0, 14.0, 15.0, 14.0, 10.0, 20.0, 13.0, 18.0, 20.0, 11.0, 20.0, 17.0, 20.0, 13.0, 17.0, 15.0, 18.0, 14.0, 13.0, 13.0, 18.0, 10.0, 13.0, 12.0, 18.0, 20.0, 20.0, 16.0, 18.0, 15.0, 20.0, 22.0, 18.0, 21.0, 18.0, 18.0, 18.0, 17.0, 16.0, 19.0, 16.0, 20.0, 19.0, 19.0, 20.0, 20.0, 14.0, 18.0, 20.0, 20.0, 18.0, 16.0, 21.0, 20.0, 18.0, 15.0, 14.0, 17.0, 19.0, 21.0, 14.0, 18.0, 15.0, 18.0, 21.0, 19.0, 17.0, 16.0, 16.0, 15.0, 20.0, 19.0, 16.0, 21.0, 17.0, 19.0, 15.0, 18.0, 20.0, 18.0, 20.0, 18.0, 16.0, 16.0]}
df = pd.DataFrame(data)
# replace numbers with categorical labels; could also create new columns
labels = ['Remedial', 'Below Proficient', 'Proficient', 'Advanced']
bins = [1, 11, 13, 15, np.inf]
df['BOY'] = pd.cut(x=df.BOY, labels=labels, bins=bins, right=True)
df['EOY'] = pd.cut(x=df.EOY, labels=labels, bins=bins, right=True)
# melt the relevant columns into a long form
dfm = df.melt(var_name='Tested', value_name='Proficiency')
# set the categorical label order, which makes the xaxis labels print in the specific order
dfm['Proficiency'] = pd.Categorical(dfm['Proficiency'], labels, ordered=True)
# groupby and get the value counts
dfg = dfm.groupby('Tested')['Proficiency'].value_counts().reset_index(level=1, name='Size').rename({'level_1': 'Proficiency'}, axis=1)
# divide by the Tested value counts to get the percent
dfg['percent'] = dfg['Size'].div(dfm.Tested.value_counts()).mul(100).round(1)
# reshape to plot
dfp = dfg.reset_index().pivot(index='Proficiency', columns='Tested', values='percent')
# display(dfp)
Tested BOY EOY
Proficiency
Remedial 34.8 9.9
Below Proficient 28.7 12.7
Proficient 27.1 25.4
Advanced 8.8 51.9
ax = dfp.plot(kind='bar', figsize=(15, 5), rot=0, color=['orchid', 'teal'])
# formatting
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_ylabel('Students at Proficiency Level', fontsize=18)
ax.set_xlabel('')
ax.set_title('Bushwick Middle Change in Proficiency Levels', fontsize=25)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=25)
ax.legend(fontsize=25)
_ = plt.yticks(fontsize=15)
# add bar labels
for p in ax.containers:
ax.bar_label(p, fmt='%.1f%%', label_type='edge', fontsize=12)
# pad the spacing between the number and the edge of the figure
ax.margins(y=0.2)
dfp
Upvotes: 1