Mazil_tov998
Mazil_tov998

Reputation: 426

How use the apply function to one dataframe which has two dataframes passing as parameters?

I have two dataframes df and df_census_final. df data can be downloaded here and df_census_final can be downloaded here and created with:

import pandas as pd
df = pd.read_csv('population_data.csv')
df_census_final = pd.read_csv('census_data.csv')

The df_census_final is year on year estimate of population by individual age groups and is a look up dataframe to get a specific value, whilst df is the actual dataframe where the calculation will be applied and population by age groups 15-19.

I have two functions which are below where the first function function_check_age_19 is used in the bottom function split_population_18_19:

def function_check_age_19(df,df_census_final):
    """
    If the sex, year, and age are equal to the values in the function, then return the population
    multiplied by the estimated proportion.
    
    :param df: the dataframe that contains the population data
    :param df_census_final: a dataframe with the following columns:
    :return: the population multiplied by the estimated proportion.
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2019)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2019)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2020)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2020)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2021)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2021)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Male' and df['Year'] == 2023 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Male Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    elif df['Sex'] == 'Female' and df['Year'] == 2023 and df['Age'] == '15-19':
        estimated_proportion =(df_census_final['Female Pop %'].loc[(df_census_final['GROUP'] == '19') & (df_census_final['Year'] == 2022)].values)/100
        return df['population']*estimated_proportion
    else: 
        return df['population'] 

def split_population_18_19(df):
    """
    It takes a dataframe, creates two new columns, one for 18 year olds and one for 19 year olds, then
    adds them together and subtracts them from the total population
    
    :param df: The dataframe that you want to split the population for
    :return: A dataframe with the columns:
    """
    try:
        df['population_18'] = df.apply(lambda row: function_check_age_18(row), axis=1)
        df['population_19'] = df.apply(lambda row: function_check_age_19(row), axis=1)
        df.loc[df['Age'] != '15-19','population_18'] = 0
        df.loc[df['Age'] != '15-19','population_19'] = 0
        df['total_population_18'] = df['population_18'] + df['population_19']
        df['population'] =df['population'] - df['total_population_18']
        df['total_population_18'] = df['total_population_18'].shift(1)
        df['total_population_18'] = df['total_population_18'].fillna(0.0)
        df['population'] = df['total_population_18'] + df['population']
        df.drop(['month','population_18','population_19','total_population_18'],axis=1,inplace=True)
        return df
    except Exception as e:
        print(f"Exception on estimating 18 19 year old population with the error: {e}")

The first function has two dataframe which are passed to it, df and df_census_final but the second only has one. When using .apply in the second function as shown below:

df['population_19'] = df.apply(lambda row: function_check_age_19(row), axis=1)

I get a nonetype returned. i.e. (I am only trying to affect df element-wise) What is the best way to use the .apply function or anything else to pass both the dataframes into the function?

I am refactoring some hard-coded values as shown below:

def function_check_age_19(df):
    """
    This function is checking the population of the age group 15-19 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 19 year olds for each Sex
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '15-19':
        return df['population']*(19.851399/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '15-19':
        return df['population']*(20.088195/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '15-19':
        return df['population']*(19.492101/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '15-19':
        return df['population']*(19.745797/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '15-19':
        return df['population']*(19.489112/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '15-19':
        return df['population']*(19.621289/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '15-19':
        return df['population']*(19.442557/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '15-19':
        return df['population']*(19.534988/100)
    else: 
        return df['population']

Upvotes: 0

Views: 89

Answers (1)

chitown88
chitown88

Reputation: 28565

Ok I think I see what you are trying to do. How I would do it is I would simply merge the percentages by 'Sex' and 'Year'. Then just pull out GROUP 18 and 19 and multiply out the columns to get the populations of those.

import pandas as pd
import numpy as np

df = pd.read_csv('https://raw.githubusercontent.com/maz2198/data_1/main/population_data.csv')
df['Year'] = pd.to_datetime(df['month of estimation']).dt.year
df_15_19 = df[df['Age'] == '15-19']

df_census_final = pd.read_csv('https://raw.githubusercontent.com/maz2198/data_1/main/census_data.csv')

df_list = []
ages = [18, 19]
for age in ages:
    df_census_age = df_census_final[df_census_final['GROUP'] == age]
    df_age = df_15_19.merge(df_census_age, how='right', on = ['Year'])

    df_age['pop'] = np.where(df_age['Sex'] == 'Female', df_age['Female Pop %'], df_age['Male Pop %'])
    df_age['population_calc'] = df_age['population'] * (df_age['pop']/100)
    
    df_list.append(df_age)
    
final_df = pd.concat(df_list, axis=0)

Output:

print(final_df[['district', 'Sex', 'Year', 'GROUP', 'population_calc']])
                                          district  ... population_calc
0      MP - Ehlanzeni District Municipality (DC32)  ...    16113.420033
1      MP - Ehlanzeni District Municipality (DC32)  ...    15696.491268
2     EC - Alfred Nzo District Municipality (DC44)  ...     8754.005492
3     EC - Alfred Nzo District Municipality (DC44)  ...     8220.401384
4       EC - Amathole District Municipality (DC12)  ...     6713.988338
                                           ...  ...             ...
3947     WC - Overberg District Municipality (DC3)  ...     2189.536237
3948   WC - West Coast District Municipality (DC1)  ...     3601.165452
3949   WC - West Coast District Municipality (DC1)  ...     3579.662585
3950                        WC- Garden Route (DC4)  ...     5120.187331
3951                        WC- Garden Route (DC4)  ...     4867.683393

[7904 rows x 5 columns]

Visual:

import matplotlib.pyplot as plt
import seaborn as sns

df_filter = final_df[final_df['district'] == ' MP - Ehlanzeni District Municipality (DC32)']

#set seaborn plotting aesthetics
sns.set(style='white')

#create grouped bar chart
sns.catplot(x='Year', 
            y='population_calc', 
            col='GROUP',
            hue='Sex', 
            data=df_filter,
            kind='bar',
            palette=['#DFA1C1', '#3279BB']) 
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.ylabel('Population')

Upvotes: 1

Related Questions