Reputation: 3145
I am hoping in the following code, the for loop will loop around all the csv in the folder, and the df data frame will append after reading from each csv. However, the df here never appends, but only contain the content of the first csv. Any thoughts? Thanks!
We are in python 3.6 and pandas 0.21
path = "/home/ubuntu/QA/client_" + CLIENT_ID + "_raw_data_" + year + "/_ACTUAL_*_Accrual*.xls"
if CLIENT_ID in ('7')
df_columns=pd.DataFrame(columns=['PropID','PROPERTY_CODE','TreeNodeID','ACCOUNT_CODE','TreeNodeName','ReportYear','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
OUTPUT_CSV="Client_"+CLIENT_ID+"_"+year+"_"+ACCOUNTING_TYPE+"_QA.csv"
df_columns.to_csv(OUTPUT_CSV, header=True, index=False, encoding='utf-8',na_rep="NA", mode='w')
df = pd.DataFrame()
for fname in glob.iglob(path):
print (fname)
df2 = pd.DataFrame()
df2=pd.read_excel(fname,skiprows=4,converters={'TreeNodeCode':np.int64,'PropCode':np.str}).dropna(subset=['TreeNodeCode'],how='any') ## convert the account code in the raw data into strings. dropna drops the raw of the column 4 ,which is the IAM account code, if the column 4 is NA
print (df2)
df=df.append(df2)
df=df.rename(columns={'TreeNodeCode':'ACCOUNT_CODE'})
df=df.rename(columns={'PropCode':'PROPERTY_CODE'})
df['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
df_QA['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df_QA['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
print ("this is df")
print (df)
print ("this is df_QA")
print (df_QA)
df_check=pd.merge(df,df_QA, how='inner',on=['PROPERTY_CODE','ACCOUNT_CODE'])
#print (df_check)
# tricks in this ticket: https://stackoverflow.com/questions/384192823/subtracting-multiple-columns-and-appending-results-in-pandas-dataframe
df_check[['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']] = df_check[['Jan_x','Feb_x','Mar_x','Apr_x','May_x','Jun_x','Jul_x','Aug_x','Sep_x','Oct_x','Nov_x','Dec_x']] - df_check[['Jan_y','Feb_y','Mar_y','Apr_y','May_y','Jun_y','Jul_y','Aug_y','Sep_y','Oct_y','Nov_y','Dec_y']].values
#print (df_check)
df_check2=df_check[['PropID','PROPERTY_CODE','TreeNodeID','ACCOUNT_CODE','TreeNodeName','ReportYear','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']]
#print (df_check2)
# tricks of panda query: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.query.html#pandas-dataframe-query
df_check3=df_check2.query('Jan > 0 | Jan < 0 | Feb > 0 | Feb < 0 | Mar > 0 | Mar < 0 | Apr > 0 | Apr < 0 | May > 0 | May < 0 | Jun > 0 | Jun < 0 | Jul > 0 | Jul < 0 | Aug > 0 | Aug < 0 | Sep > 0 | Sep < 0 | Oct > 0 | Oct < 0 | Nov > 0 | Nov < 0 | Dec > 0 | Dec < 0')
#print (df_check3)
#print (df_check3.info())
df_check3.to_csv(OUTPUT_CSV, header=False, index=False,
na_rep="NA", mode='a')
Upvotes: 0
Views: 76
Reputation: 862851
I think you need append each DataFrame to list first and then concat
:
dfs = []
for fname in glob.iglob(path):
print (fname)
df2=pd.read_excel(fname,skiprows=4,converters={'TreeNodeCode':np.int64,'PropCode':np.str}).dropna(subset=['TreeNodeCode'],how='any') ## convert the account code in the raw data into strings. dropna drops the raw of the column 4 ,which is the IAM account code, if the column 4 is NA
print (df2)
dfs.append(df2)
df = pd.concat(dfs, ignore_index=True)
Also your code:
df=df.rename(columns={'TreeNodeCode':'ACCOUNT_CODE'})
df=df.rename(columns={'PropCode':'PROPERTY_CODE'})
df['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
df_QA['PROPERTY_CODE'] = df_QA['PROPERTY_CODE'].astype(np.str)
df_QA['ACCOUNT_CODE'] = df_QA['ACCOUNT_CODE'].astype(np.str)
should be simplify to:
df=df.rename(columns={'TreeNodeCode':'ACCOUNT_CODE', 'PropCode':'PROPERTY_CODE'})
cols = ['PROPERTY_CODE','ACCOUNT_CODE']
df_QA[cols] = df[cols] = df_QA[cols].astype(str)
Upvotes: 2