Reputation: 4148
I have the below code snippet that loops over all the files in a directory and populates the pandas dataframes. I am new to Python, so just wondering whether any part of this code snippet can be modularized (put it in a function) so that they can be reusable.
df_precision_list_a = []
df_precision_list_b = []
df_precision_list_c = []
##Reading a list of files from a directory
for file in allfiles:
print(file)
xls = pd.ExcelFile(file)
df_sheet1 = pd.read_excel(xls, 'sheet1')
df_overall_data = pd.read_excel(xls, 'overall_data')
list_a = []
list_b = []
list_c = []
##Populating year to all the lists
year = df_overall_data ['date'].iloc[0].year
list_a.append(year)
list_b.append(year)
list_c.append(year)
##Populating month to all the lists
month = df_overall_data ['date'].iloc[0].month
list_a .append(month)
list_b.append(month)
list_c.append(month)
##Populating precision value to all the lists
precision_actual_a = df_sheet1.loc[df_sheet1['Region'] == 'AA', 'Precision'].iloc[0]
list_a.append(precision_actual_a)
precision_actual_b = df_sheet1.loc[df_sheet1['Region'] == 'BB', 'Precision'].iloc[0]
list_b.append(precision_actual_b)
precision_actual_c = df_sheet1.loc[df_sheet1['Region'] == 'CC', 'Precision'].iloc[0]
list_c.append(precision_actual_c)
##Populating Q2 values(fixed values) to all lists
list_a.append(77)
list_b.append(65)
list_c.append(72)
##Adding lists (list_a,list_b,list_c) to respective precision lists
df_precision_list_a.append(list_a)
df_precision_list_b.append(list_b)
df_precision_list_c.append(list_c)
##Populating precision dataframes using their respective lists
df_precision_a=pd.DataFrame(df_precision_list_a,columns=['Year', 'Month', 'Q1', 'Q2'])
df_precision_b=pd.DataFrame(df_precision_list_b,columns=['Year', 'Month', 'Q1', 'Q2'])
df_precision_c=pd.DataFrame(df_precision_list_c,columns=['Year', 'Month', 'Q1', 'Q2'])
Upvotes: 2
Views: 301
Reputation: 961
You should be able to do it in a nice clean way, by using an object and then you can call any method in any order that you want, and the object itself will keep track of the lists and attribute values.
I included an initialize_reset method to initialize or clear all lists and attribute values so that you are able to do calculation repeatedly with the same object instance.
Note: also this object is not tested with actual data, it runs without error, so if you test it with data you might have to customize it even further, but with this template you should be able to get it running.
class PandasFrames:
def __init__(self, directory_path):
self.allfiles = glob.glob(directory_path + '/*')
self.initialize_reset_values()
def initialize_reset_values(self):
self.df_precision_list_a = []
self.df_precision_list_b = []
self.df_precision_list_c = []
self.list_a = []
self.list_b = []
self.list_c = []
self.df_sheet1 = None
self.df_overall_data = None
self.df_precision_a = None
self.df_precision_b = None
self.df_precision_c = None
def read_directory_files(self):
''' Reading a list of files from a directory
'''
for file in self.allfiles:
print(file)
xls = pd.ExcelFile(file)
self.df_sheet1 = pd.read_excel(xls, 'sheet1')
self.df_overall_data = pd.read_excel(xls, 'overall_data')
def populate_year(self):
'''Populating year to all the lists
'''
if self.df_overall_data:
year = self.df_overall_data['date'].iloc[0].year
self.list_a.append(year)
self.list_b.append(year)
self.list_c.append(year)
def populate_month(self):
'''Populating month to all the lists
'''
if self.df_overall_data:
month = self.df_overall_data['date'].iloc[0].month
self.list_a.append(month)
self.list_b.append(month)
self.list_c.append(month)
def populate_precision_value(self):
'''Populating precision value to
all the lists
'''
if self.df_sheet1:
precision_actual_a = self.df_sheet1.loc[self.df_sheet1['Region'] == 'AA', 'Precision'].iloc[0]
self.list_a.append(precision_actual_a)
precision_actual_b = self.df_sheet1.loc[self.df_sheet1['Region'] == 'BB', 'Precision'].iloc[0]
self.list_b.append(precision_actual_b)
precision_actual_c = self.df_sheet1.loc[self.df_sheet1['Region'] == 'CC', 'Precision'].iloc[0]
self.list_c.append(precision_actual_c)
def populate_q2(self, values=[]):
'''Populating Q2 values(fixed values)
to all lists
'''
if len(values) == 3:
value1, value2, value3 = values
self.list_a.append(value1)
self.list_b.append(value2)
self.list_c.append(value3)
def add_lists(self):
'''Adding lists (list_a,list_b,list_c)
to respective precision lists
'''
self.df_precision_list_a.append(self.list_a)
self.df_precision_list_b.append(self.list_b)
self.df_precision_list_c.append(self.list_c)
def populate_precision_dataframes(self):
'''Populating precision dataframes
using their respective lists
'''
self.df_precision_a=pd.DataFrame(self.df_precision_list_a,columns=['Year', 'Month', 'Q1', 'Q2'])
self.df_precision_b=pd.DataFrame(self.df_precision_list_b,columns=['Year', 'Month', 'Q1', 'Q2'])
self.df_precision_c=pd.DataFrame(self.df_precision_list_c,columns=['Year', 'Month', 'Q1', 'Q2'])
Upvotes: 1
Reputation: 9746
You don't have to do things in that order. If you instead to first read all data you need, you'll see another pattern emerge:
for file in allfiles:
print(file)
xls = pd.ExcelFile(file)
df_sheet1 = pd.read_excel(xls, 'sheet1')
df_overall_data = pd.read_excel(xls, 'overall_data')
year = df_overall_data ['date'].iloc[0].year
month = df_overall_data ['date'].iloc[0].month
precision_actual_a = df_sheet1.loc[df_sheet1['Region'] == 'AA', 'Precision'].iloc[0]
precision_actual_b = df_sheet1.loc[df_sheet1['Region'] == 'BB', 'Precision'].iloc[0]
precision_actual_c = df_sheet1.loc[df_sheet1['Region'] == 'CC', 'Precision'].iloc[0]
list_a = []
list_b = []
list_c = []
##Populating year to all the lists
list_a.append(year)
list_b.append(year)
list_c.append(year)
##Populating month to all the lists
list_a .append(month)
list_b.append(month)
list_c.append(month)
##Populating precision value to all the lists
list_a.append(precision_actual_a)
list_b.append(precision_actual_b)
list_c.append(precision_actual_c)
##Populating Q2 values(fixed values) to all lists
list_a.append(77)
list_b.append(65)
list_c.append(72)
##Adding lists (list_a,list_b,list_c) to respective precision lists
df_precision_list_a.append(list_a)
df_precision_list_b.append(list_b)
df_precision_list_c.append(list_c)
You might notice that all the append
's are unnecessary. We can add all values directly into the lists:
for file in allfiles:
print(file)
xls = pd.ExcelFile(file)
df_sheet1 = pd.read_excel(xls, 'sheet1')
df_overall_data = pd.read_excel(xls, 'overall_data')
year = df_overall_data ['date'].iloc[0].year
month = df_overall_data ['date'].iloc[0].month
precision_actual_a = df_sheet1.loc[df_sheet1['Region'] == 'AA', 'Precision'].iloc[0]
precision_actual_b = df_sheet1.loc[df_sheet1['Region'] == 'BB', 'Precision'].iloc[0]
precision_actual_c = df_sheet1.loc[df_sheet1['Region'] == 'CC', 'Precision'].iloc[0]
list_a = [year, month, precision_actual_a, 77]
list_b = [year, month, precision_actual_b, 65]
list_c = [year, month, precision_actual_c, 72]
##Adding lists (list_a,list_b,list_c) to respective precision lists
df_precision_list_a.append(list_a)
df_precision_list_b.append(list_b)
df_precision_list_c.append(list_c)
You might notice now that the temporary lists are unnecessary too:
for file in allfiles:
print(file)
xls = pd.ExcelFile(file)
df_sheet1 = pd.read_excel(xls, 'sheet1')
df_overall_data = pd.read_excel(xls, 'overall_data')
year = df_overall_data ['date'].iloc[0].year
month = df_overall_data ['date'].iloc[0].month
precision_actual_a = df_sheet1.loc[df_sheet1['Region'] == 'AA', 'Precision'].iloc[0]
precision_actual_b = df_sheet1.loc[df_sheet1['Region'] == 'BB', 'Precision'].iloc[0]
precision_actual_c = df_sheet1.loc[df_sheet1['Region'] == 'CC', 'Precision'].iloc[0]
df_precision_list_a.append([year, month, precision_actual_a, 77])
df_precision_list_b.append([year, month, precision_actual_b, 65])
df_precision_list_c.append([year, month, precision_actual_c, 72])
And now we have small enough code to be put in its own function. You could reduce the code a bit more and put it in a function like this:
def create_precision_lists(files):
df_precision_lists = [[], [], []]
for file in files:
xls = pd.ExcelFile(file)
df_sheet1 = pd.read_excel(xls, 'sheet1')
df_overall_data = pd.read_excel(xls, 'overall_data')
year = df_overall_data ['date'].iloc[0].year
month = df_overall_data ['date'].iloc[0].month
for i, region in ('AA', 'BB', 'CC'):
precision_actual = df_sheet1.loc[df_sheet1['Region'] == region, 'Precision'].iloc[0]
df_precision_lists[i] = [year, month, precision_actual, 77]
df_precision = [pd.DataFrame(precision_list, columns=['Year', 'Month', 'Q1', 'Q2']) for precision_list in df_precision_lists]
return df_precision
It's not a great modular function though. To make it modular, you have to remove all the hardcoded values and you shouldn't rely on that specific keys and attributes exist. However, it's hard to do anything about it without knowing the full context of what you're trying to do and what your hard requirements and constraints are.
Upvotes: 0
Reputation: 1441
Only a quick tweak, but, this should make the flow better:
def main_func(file):
print(file)
xls = pd.ExcelFile(file)
df_sheet1 = pd.read_excel(xls, 'sheet1')
df_overall_data = pd.read_excel(xls, 'overall_data')
list_a = []
list_b = []
list_c = []
def append_to_list(inp_lst):
nonlocal list_a, list_b, list_c
list_a.append(inp_lst[0])
list_b.append(inp_lst[1])
list_c.append(inp_lst[2])
##Populating year to all the lists
year = df_overall_data ['date'].iloc[0].year
append_to_list([year]*3)
##Populating month to all the lists
month = df_overall_data ['date'].iloc[0].month
append_to_list([month]*3)
##Populating precision value to all the lists
precision_actual_a = df_sheet1.loc[df_sheet1['Region'] == 'AA', 'Precision'].iloc[0]
precision_actual_b = df_sheet1.loc[df_sheet1['Region'] == 'BB', 'Precision'].iloc[0]
precision_actual_c = df_sheet1.loc[df_sheet1['Region'] == 'CC', 'Precision'].iloc[0]
append_to_list([precision_actual_a,precision_actual_b,precision_actual_c])
##Populating Q2 values(fixed values) to all lists
append_to_list([77,65,72])
# ... some more stuff
# ... returns the lists
and now call this function with in the for
loop you had:
for file in allfiles:
## prep the main lists ...
main_func(file)
## append and create df
Upvotes: 1