Reputation: 426
I have two dataframes df
and df_census_final
. df
data can be downloaded here and df_census_final
can be downloaded here and created with:
import pandas as pd
df = pd.read_csv('population_data.csv')
df_census_final = pd.read_csv('census_data.csv')
The df_census_final
is year on year estimate of population by individual age groups and is a look up dataframe to get a specific value, whilst df
is the actual dataframe where the calculation will be applied and population by age groups 15-19.
I have two functions which are below where the first function function_check_age_19
is used in the bottom function split_population_18_19
:
def function_check_age_19(df,df_census_final):
"""
If the sex, year, and age are equal to the values in the function, then return the population
multiplied by the estimated proportion.
:param df: the dataframe that contains the population data
:param df_census_final: a dataframe with the following columns:
:return: the population multiplied by the estimated proportion.
"""
if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2019)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2019)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2020)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2020)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2021)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2021)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Male' and df['Year'] == 2023 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
return df['population']*estimated_proportion
elif df['Sex'] == 'Female' and df['Year'] == 2023 and df['Age'] == '15-19':
estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
return df['population']*estimated_proportion
else:
return df['population']
def split_population_18_19(df):
"""
It takes a dataframe, creates two new columns, one for 18 year olds and one for 19 year olds, then
adds them together and subtracts them from the total population
:param df: The dataframe that you want to split the population for
:return: A dataframe with the columns:
"""
try:
df['population_18'] = df.apply(lambda row: function_check_age_18(row), axis=1)
df['population_19'] = df.apply(lambda row: function_check_age_19(row), axis=1)
df.loc[df['Age'] != '15-19','population_18'] = 0
df.loc[df['Age'] != '15-19','population_19'] = 0
df['total_population_18'] = df['population_18'] + df['population_19']
df['population'] =df['population'] - df['total_population_18']
df['total_population_18'] = df['total_population_18'].shift(1)
df['total_population_18'] = df['total_population_18'].fillna(0.0)
df['population'] = df['total_population_18'] + df['population']
df.drop(['month','population_18','population_19','total_population_18'],axis=1,inplace=True)
return df
except Exception as e:
print(f"Exception on estimating 18 19 year old population with the error: {e}")
The first function has two dataframe which are passed to it, df
and df_census_final
but the second only has one. When using .apply
in the second function as shown below:
df['population_19'] = df.apply(lambda row: function_check_age_19(row), axis=1)
I get a nonetype returned. i.e. (I am only trying to affect df element-wise) What is the best way to use the .apply function or anything else to pass both the dataframes into the function?
I am refactoring some hard-coded values as shown below:
def function_check_age_19(df):
"""
This function is checking the population of the age group 15-19 for the years
2019, 2020, 2021, and 2022 then mapping the proportion for 19 year olds for each Sex
"""
if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '15-19':
return df['population']*(19.851399/100)
elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '15-19':
return df['population']*(20.088195/100)
elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '15-19':
return df['population']*(19.492101/100)
elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '15-19':
return df['population']*(19.745797/100)
elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '15-19':
return df['population']*(19.489112/100)
elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '15-19':
return df['population']*(19.621289/100)
elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '15-19':
return df['population']*(19.442557/100)
elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '15-19':
return df['population']*(19.534988/100)
else:
return df['population']
Upvotes: 0
Views: 89
Reputation: 28565
Ok I think I see what you are trying to do. How I would do it is I would simply merge the percentages by 'Sex'
and 'Year'
. Then just pull out GROUP
18 and 19 and multiply out the columns to get the populations of those.
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/maz2198/data_1/main/population_data.csv')
df['Year'] = pd.to_datetime(df['month of estimation']).dt.year
df_15_19 = df[df['Age'] == '15-19']
df_census_final = pd.read_csv('https://raw.githubusercontent.com/maz2198/data_1/main/census_data.csv')
df_list = []
ages = [18, 19]
for age in ages:
df_census_age = df_census_final[df_census_final['GROUP'] == age]
df_age = df_15_19.merge(df_census_age, how='right', on = ['Year'])
df_age['pop'] = np.where(df_age['Sex'] == 'Female', df_age['Female Pop %'], df_age['Male Pop %'])
df_age['population_calc'] = df_age['population'] * (df_age['pop']/100)
df_list.append(df_age)
final_df = pd.concat(df_list, axis=0)
Output:
print(final_df[['district', 'Sex', 'Year', 'GROUP', 'population_calc']])
district ... population_calc
0 MP - Ehlanzeni District Municipality (DC32) ... 16113.420033
1 MP - Ehlanzeni District Municipality (DC32) ... 15696.491268
2 EC - Alfred Nzo District Municipality (DC44) ... 8754.005492
3 EC - Alfred Nzo District Municipality (DC44) ... 8220.401384
4 EC - Amathole District Municipality (DC12) ... 6713.988338
... ... ...
3947 WC - Overberg District Municipality (DC3) ... 2189.536237
3948 WC - West Coast District Municipality (DC1) ... 3601.165452
3949 WC - West Coast District Municipality (DC1) ... 3579.662585
3950 WC- Garden Route (DC4) ... 5120.187331
3951 WC- Garden Route (DC4) ... 4867.683393
[7904 rows x 5 columns]
Visual:
import matplotlib.pyplot as plt
import seaborn as sns
df_filter = final_df[final_df['district'] == ' MP - Ehlanzeni District Municipality (DC32)']
#set seaborn plotting aesthetics
sns.set(style='white')
#create grouped bar chart
sns.catplot(x='Year',
y='population_calc',
col='GROUP',
hue='Sex',
data=df_filter,
kind='bar',
palette=['#DFA1C1', '#3279BB'])
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel('Population')
Upvotes: 1