Reputation: 1801
I have the following code which takes the file "University2.csv", and writes new csv files "Hours.csv" - "Hours -Stacked.csv" and "Days.csv".
Now I want the code to be able to loop and run on several files (University3.csv, University4.csv etc.) and produce for each of them "Hours3.csv", "Hours - Stacked3.csv" "Days3.csv", "Hours4.csv" etc.
Here is the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Importing the csv file into df
df = pd.read_csv('university2.csv', sep=";", skiprows=1)
#Changing datetime
df['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df['YYYY-MO-DD HH-MI-SS_SSS'],
format='%Y-%m-%d %H:%M:%S:%f')
#Set index from column
df = df.set_index('YYYY-MO-DD HH-MI-SS_SSS')
#Add Magnetic Magnitude Column
df['magnetic_mag'] = np.sqrt(df['MAGNETIC FIELD X (μT)']**2 + df['MAGNETIC FIELD Y (μT)']**2 + df['MAGNETIC FIELD Z (μT)']**2)
#Copy interesting values
df2 = df[[ 'ATMOSPHERIC PRESSURE (hPa)',
'TEMPERATURE (C)', 'magnetic_mag']].copy()
#Hourly Average and Standard Deviation for interesting values
df3 = df2.resample('H').agg(['mean','std'])
df3.columns = [' '.join(col) for col in df3.columns]
#Daily Average and Standard Deviation for interesting values
df4 = df2.resample('D').agg(['mean','std'])
df4.columns = [' '.join(col) for col in df4.columns]
#Write to new csv
df3.to_csv('Hours.csv', index=True)
df4.to_csv('Days.csv', index=True)
#New csv with stacked hour averages
df5 = pd.read_csv('Hours.csv')
df5['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df5['YYYY-MO-DD HH-MI-SS_SSS'])
hour = pd.to_timedelta(df5['YYYY-MO-DD HH-MI-SS_SSS'].dt.hour, unit='H')
df6 = df5.groupby(hour).mean()
df6.to_csv('Hours - stacked.csv', index=True)
Can anyone help ?
Thank you !
Upvotes: 2
Views: 76
Reputation: 862511
I think you can use loop with list files
. I extract numbers from names of files to i
and then add them to output names.
Also, you can get df5
from df3
by reset_index
, is not necessary read_csv
again.
import pandas as pd
files = ['university1.csv','university2.csv','university3.csv']
for f in files:
i = f[-5]
print i
#Importing the csv file into df
df = pd.read_csv(f, sep=";", skiprows=1)
#Changing datetime
df['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df['YYYY-MO-DD HH-MI-SS_SSS'],
format='%Y-%m-%d %H:%M:%S:%f')
#Set index from column
df = df.set_index('YYYY-MO-DD HH-MI-SS_SSS')
#Add Magnetic Magnitude Column
df['magnetic_mag'] = np.sqrt(df['MAGNETIC FIELD X (μT)']**2 + df['MAGNETIC FIELD Y (μT)']**2 + df['MAGNETIC FIELD Z (μT)']**2)
#Copy interesting values
df2 = df[[ 'ATMOSPHERIC PRESSURE (hPa)',
'TEMPERATURE (C)', 'magnetic_mag']].copy()
#Hourly Average and Standard Deviation for interesting values
df3 = df2.resample('H').agg(['mean','std'])
df3.columns = [' '.join(col) for col in df3.columns]
#Daily Average and Standard Deviation for interesting values
df4 = df2.resample('D').agg(['mean','std'])
df4.columns = [' '.join(col) for col in df4.columns]
#Write to new csv
df3.to_csv('Hours'+ i + '.csv')
df4.to_csv('Day'+ i + 's.csv')
#New csv with stacked hour averages
#df5 = pd.read_csv('Hours.csv')
#df5['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df5['YYYY-MO-DD HH-MI-SS_SSS'])
df5 = df3.reset_index()
hour = pd.to_timedelta(df5['YYYY-MO-DD HH-MI-SS_SSS'].dt.hour, unit='H')
df6 = df5.groupby(hour).mean()
df6.to_csv('Hours - stacked'+ i + '.csv')
EDITED:
More general is Yaron solution, I uset it and change only 2,3,4
to range()
:
import pandas as pd
#files = ['university1.csv','university2.csv','university3.csv']
for i in range(1,4):
print i
print 'university'+ str(i) + '.csv'
#Importing the csv file into df
df = pd.read_csv('university'+ str(i) + '.csv', sep=";", skiprows=1)
#Changing datetime
df['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df['YYYY-MO-DD HH-MI-SS_SSS'],
format='%Y-%m-%d %H:%M:%S:%f')
#Set index from column
df = df.set_index('YYYY-MO-DD HH-MI-SS_SSS')
#Add Magnetic Magnitude Column
df['magnetic_mag'] = np.sqrt(df['MAGNETIC FIELD X (μT)']**2 + df['MAGNETIC FIELD Y (μT)']**2 + df['MAGNETIC FIELD Z (μT)']**2)
#Copy interesting values
df2 = df[[ 'ATMOSPHERIC PRESSURE (hPa)',
'TEMPERATURE (C)', 'magnetic_mag']].copy()
#Hourly Average and Standard Deviation for interesting values
df3 = df2.resample('H').agg(['mean','std'])
df3.columns = [' '.join(col) for col in df3.columns]
#Daily Average and Standard Deviation for interesting values
df4 = df2.resample('D').agg(['mean','std'])
df4.columns = [' '.join(col) for col in df4.columns]
#Write to new csv
df3.to_csv('Hours'+ str(i) + '.csv')
df4.to_csv('Day'+ str(i) + 's.csv')
#New df3 with stacked hour averages
df5 = df3.reset_index()
hour = pd.to_timedelta(df5['YYYY-MO-DD HH-MI-SS_SSS'].dt.hour, unit='H')
df6 = df5.groupby(hour).mean()
df6.to_csv('Hours - stacked'+ str(i) + '.csv')
Upvotes: 1
Reputation: 4070
Another option is to use sys module
And this is how you can use it by calling the python program as prog.py one.csv two.csv 1..n.csv
:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
for input in sys.argv:
process(input)
def process(input):
#Importing the csv file into df
df = pd.read_csv(input, sep=";", skiprows=1)
#Changing datetime
df['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df['YYYY-MO-DD HH-MI-SS_SSS'],
format='%Y-%m-%d %H:%M:%S:%f')
#Set index from column
df = df.set_index('YYYY-MO-DD HH-MI-SS_SSS')
#Add Magnetic Magnitude Column
df['magnetic_mag'] = np.sqrt(df['MAGNETIC FIELD X (μT)']**2 + df['MAGNETIC FIELD Y (μT)']**2 + df['MAGNETIC FIELD Z (μT)']**2)
#Copy interesting values
df2 = df[[ 'ATMOSPHERIC PRESSURE (hPa)',
'TEMPERATURE (C)', 'magnetic_mag']].copy()
#Hourly Average and Standard Deviation for interesting values
df3 = df2.resample('H').agg(['mean','std'])
df3.columns = [' '.join(col) for col in df3.columns]
#Daily Average and Standard Deviation for interesting values
df4 = df2.resample('D').agg(['mean','std'])
df4.columns = [' '.join(col) for col in df4.columns]
#Write to new csv
hours = input[:-4]+'_Hours.csv'
df3.to_csv(hours, index=True)
df4.to_csv(input[:-4]+'_Days.csv', index=True)
#New csv with stacked hour averages
df5 = pd.read_csv(hours)
df5['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df5['YYYY-MO-DD HH-MI-SS_SSS'])
hour = pd.to_timedelta(df5['YYYY-MO-DD HH-MI-SS_SSS'].dt.hour, unit='H')
df6 = df5.groupby(hour).mean()
df6.to_csv(input[:-4]+'_Hours - stacked.csv', index=True)
Upvotes: 1
Reputation: 10450
The following code should do the trick.
It runs a for loop using index (idx) which uses the following values (3,4,5)
It use variable filenames, with the idx as parameter. e.g.
uni_name = "university" + str(idx) + ".csv"
Here is the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
for idx in 3,4,5:
#Importing the csv file into df
uni_name = "university" + str(idx) + ".csv"
df = pd.read_csv(uni_name, sep=";", skiprows=1)
#Changing datetime
df['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df['YYYY-MO-DD HH-MI-SS_SSS'],
format='%Y-%m-%d %H:%M:%S:%f')
#Set index from column
df = df.set_index('YYYY-MO-DD HH-MI-SS_SSS')
#Add Magnetic Magnitude Column
df['magnetic_mag'] = np.sqrt(df['MAGNETIC FIELD X (μT)']**2 + df['MAGNETIC FIELD Y (μT)']**2 + df['MAGNETIC FIELD Z (μT)']**2)
#Copy interesting values
df2 = df[[ 'ATMOSPHERIC PRESSURE (hPa)',
'TEMPERATURE (C)', 'magnetic_mag']].copy()
#Hourly Average and Standard Deviation for interesting values
df3 = df2.resample('H').agg(['mean','std'])
df3.columns = [' '.join(col) for col in df3.columns]
#Daily Average and Standard Deviation for interesting values
df4 = df2.resample('D').agg(['mean','std'])
df4.columns = [' '.join(col) for col in df4.columns]
#Write to new csv
hours = "Hours" + str(idx) + ".csv"
days = "Days" + str(idx) + ".csv"
df3.to_csv(hours, index=True)
df4.to_csv(days, index=True)
#New csv with stacked hour averages
df5 = pd.read_csv('Hours.csv')
df5['YYYY-MO-DD HH-MI-SS_SSS'] = pd.to_datetime(df5['YYYY-MO-DD HH-MI-SS_SSS'])
hour = pd.to_timedelta(df5['YYYY-MO-DD HH-MI-SS_SSS'].dt.hour, unit='H')
df6 = df5.groupby(hour).mean()
hours_st = "Hours - stacked" + str(idx) + ".csv"
df6.to_csv('Hours - stacked.csv', index=True)
Upvotes: 1