Reputation: 9
I want to split one big CSV file into multiple new CSV files.
There is a column SYMBOL
I want the new CSV files on that name.
There are around 200 symbols.
List of columns in the big CSV file are:
I have a code where I can do for one single symbol.
import pandas as pd
import glob
zip_file_list=glob.glob('*.zip')
symbol='ACC'
DATE=[]
OPEN=[]
HIGH=[]
LOW=[]
CLOSE=[]
OI=[]
VOLUME=[]
COI=[]
for i in range (0,len(zip_file_list)):
df=pd.read_csv(zip_file_list[i])
Open=df['OPEN'][(df.INSTRUMENT=='FUTSTK') & (df.SYMBOL==symbol)].iloc[0]
High=df['HIGH'][(df.INSTRUMENT=='FUTSTK') & (df.SYMBOL==symbol)].iloc[0]
Low=df['LOW'][(df.INSTRUMENT=='FUTSTK') & (df.SYMBOL==symbol)].iloc[0]
Close=df['CLOSE'][(df.INSTRUMENT=='FUTSTK') & (df.SYMBOL==symbol)].iloc[0]
Volume=df['CONTRACTS'][(df.INSTRUMENT=='FUTSTK') & (df.SYMBOL==symbol)].iloc[0]
oi=df['OPEN_INT'][(df.INSTRUMENT=='FUTSTK') & (df.SYMBOL==symbol)].sum()
coi = df['CHG_IN_OI'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].sum()
date=df['TIMESTAMP'].iloc[0]
DATE.append(date)
OPEN.append(Open)
HIGH.append(High)
LOW.append(Low)
CLOSE.append(Close)
VOLUME.append(Volume)
OI.append(oi)
COI.append(coi)
data={'DATE':DATE,'OPEN':OPEN,'HIGH':HIGH,'LOW':LOW,'CLOSE':CLOSE,'VOLUME':VOLUME,'OI':OI,'COI':COI}
ndf=pd.DataFrame(data)
ndf.to_csv(symbol+'.csv',index=False)
I don't know how to loop through all symbols in that file and create a new file for each distinct symbol.
Then I wish to append daily data to that newly created files.
When I run the code and I have the daily data of 10 days, then I get a file with 10 days data appended for the single symbol mentioned.
But I want that to happen to all the symbols in the big data file. I don't know how to loop through the symbol list and all the file in the dict to append data.
Upvotes: 0
Views: 322
Reputation: 11
First, make a unique list of all the symbols, and iterate through the list using the variable in the code you already have + something else to handle the storage of the data... something like this. Just make sure the data is not that big that can cause a MemoryError reading all of the files from the zip.
import glob
import pandas as pd
from pathlib import Path
def store_data(filename, dataframe):
# Check if file exists to append the data
data_file = Path(filename)
if data_file.is_file():
dataframe.to_csv(filename, index=False, mode='a', header=False)
else:
dataframe.to_csv(filename, index=False)
def extract_symbol_data(dataframes, symbol):
DATE = []
OPEN = []
HIGH = []
LOW = []
CLOSE = []
OI = []
VOLUME = []
COI = []
for df in dataframes:
Open = df['OPEN'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].iloc[0]
High = df['HIGH'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].iloc[0]
Low = df['LOW'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].iloc[0]
Close = df['CLOSE'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].iloc[0]
Volume = df['CONTRACTS'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].iloc[0]
oi = df['OPEN_INT'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].sum()
coi = df['CHG_IN_OI'][(df.INSTRUMENT == 'FUTSTK') & (df.SYMBOL == symbol)].sum()
date = df['TIMESTAMP'].iloc[0]
DATE.append(date)
OPEN.append(Open)
HIGH.append(High)
LOW.append(Low)
CLOSE.append(Close)
VOLUME.append(Volume)
OI.append(oi)
COI.append(coi)
data = {'DATE': DATE, 'OPEN': OPEN, 'HIGH': HIGH, 'LOW': LOW, 'CLOSE': CLOSE, 'VOLUME': VOLUME, 'OI': OI, 'COI': COI}
ndf = pd.DataFrame(data)
store_data(filename=symbol + '.csv', dataframe=ndf)
if __name__ == '__main__':
zip_file_list = glob.glob('*.zip')
# List of all the needed data files
dataframes = [
pd.read_csv(zip_file_list[i])
for i in range(len(zip_file_list))
]
symbols = set()
# Set of all exising symbols in the data
for df in dataframes:
for s in df['SYMBOL'].unique():
symbols.add(s)
for symbol in symbols:
extract_symbol_data(dataframes, symbol)
Upvotes: 1