Reputation: 1318
I am trying to read a list of 489 csv files into pandas data frame using the following code
symbols = ['489 symbols']
def data(symbols):
dates=pd.date_range(start_date,end_date)
df=pd.DataFrame(index=dates)
for symbol in symbols:
df_temp=pd.read_csv('/home/furqan/Desktop/USSP/{}.csv'.format(str(symbol)),usecols=['Date','Price'],
parse_dates=True,index_col='Date',na_values=['nan'])
df_temp = df_temp.rename(columns={'Price': symbol})
df=df.join(df_temp)
df=df.fillna(method='ffill')
df=df.fillna(method='bfill')
return df
start_date = '2006-01-01'
end_date = '2016-12-31'
data_frame = data(symbols)
test = data_frame.loc['2006-01-01':'2006-12-31',:]
I expect the test data frame to have a shape of (365,489) but the shape turns out to be (9874, 489), can't figure out why?
Suppose instead of using 489 elements I use 1 element, in other words if I read one csv file instead of 489 I get the correct shape of (365,1).
Upvotes: 0
Views: 765
Reputation: 1318
Following code worked
def data(symbols):
dates=pd.date_range(start_date,end_date)
df=pd.DataFrame(index=dates)
for symbol in symbols:
df_temp=pd.read_csv('/home/furqan/Desktop/USSP/{}.csv'.format(str(symbol)),usecols=['Date','Price'],
parse_dates=True,index_col='Date',na_values=['nan'])
df_temp = df_temp.rename(columns={'Price': symbol})
df_temp = df_temp.loc[~df_temp.index.duplicated(keep='first')]
df=df.join(df_temp)
df=df.fillna(method='ffill')
df=df.fillna(method='bfill')
return df
But still can't get what is being duplicated?
Upvotes: 1
Reputation: 863166
I believe you need append to list of Dataframe
s in each loop and last concat
:
symbols = ['489 symbols']
def data(symbols):
dfs = []
for symbol in symbols:
df_temp=pd.read_csv('/home/furqan/Desktop/USSP/{}.csv'.format(str(symbol)),usecols=['Date','Price'],
parse_dates=True,index_col='Date',na_values=['nan'])
df_temp = df_temp.rename(columns={'Price': symbol}).ffill().bfill()
dfs.append(df_temp)
return pd.concat(dfs, axis=1)
start_date = '2006-01-01'
end_date = '2016-12-31'
data_frame = data(symbols)
test = data_frame.loc['2006-01-01':'2006-12-31',:]
Also if necessary add values from date_range
add reindex
:
symbols = ['489 symbols']
def data(symbols):
dfs = []
for symbol in symbols:
df_temp=pd.read_csv('/home/furqan/Desktop/USSP/{}.csv'.format(str(symbol)),usecols=['Date','Price'],
parse_dates=True,index_col='Date',na_values=['nan'])
df_temp = df_temp.rename(columns={'Price': symbol})
dfs.append(df_temp)
dates=pd.date_range(start_date,end_date)
return pd.concat(dfs, axis=1).reindex(dates, method='ffill').bfill()
start_date = '2006-01-01'
end_date = '2016-12-31'
data_frame = data(symbols)
test = data_frame.loc['2006-01-01':'2006-12-31',:]
Upvotes: 0