Reputation: 894
I feel like this should be simple, but I'm still a bit new to Python and am struggling to figure out what I should do. I'm scraping historical stock data and would like to put it all into one Excel spreadsheet, but it currently only writes out the last stock's data.
I know it is essentially writing over the dataframe each time it goes through the loop, but I'm not sure how to fix it so that it appends the dataframes, or writes to the end of the Excel sheet each time it gets to that spot. Any help would be appreciated.
Here is my code:
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
symbols = ['WYNN', 'FL', 'TTWO']
myColumnHeaders = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
for c in range(len(symbols)):
    url = 'https://www.nasdaq.com/symbol/'+symbols[c]+'/historical'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    historicaldata = soup.find('div', {'id': 'quotes_content_left_pnlAJAX'})
    data_rows = historicaldata.findAll('tr')[2:]
    stock_data = [[td.getText().strip() for td in data_rows[a].findAll('td')]
                  for a in range(len(data_rows))]
    df = pd.DataFrame(stock_data, columns=myColumnHeaders)
    df.set_index('Date')
    df['Volume'].str.replace(',','').astype(int)
    for i in range(5):
        if i == 0:
            df[myColumnHeaders[i]] = pd.to_datetime(df[myColumnHeaders[i]], 'coerce')
        else:
            df[myColumnHeaders[i]] = pd.to_numeric(df[myColumnHeaders[i]], errors='coerce')
    df.to_excel('stock data.xlsx',index=False)
Upvotes: 1
Views: 4190
Reputation: 164623
Avoid pd.DataFrame.append in a loop. This is inefficient, as it involves copying data repeatedly. A much better idea is to create a list of dataframes and then concatenate them in a final step outside your loop. Here's some pseudo-code:
symbols = ['WYNN', 'FL', 'TTWO']
cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']

dfs = []  # empty list which will hold your dataframes

for c in range(len(symbols)):
    # some code

    df = pd.DataFrame(stock_data, columns=cols)
    df[cols[0]] = pd.to_datetime(df[cols[0]], errors='coerce')
    df[cols[1:5]] = df[cols[1:5]].apply(pd.to_numeric, errors='coerce')
    df['Volume'] = df['Volume'].str.replace(',', '').astype(int)
    df = df.set_index('Date')

    dfs.append(df)  # append dataframe to list

res = pd.concat(dfs)             # concatenate list of dataframes
res.to_excel('stock data.xlsx')  # the Date index is written as the first column
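For what it's worth, here is a minimal runnable sketch of the same list-and-concat pattern with the scraping stubbed out by a single made-up row; the extra Symbol column is my own addition (it is not in the pseudo-code above) so you can still tell the rows apart after concatenation:

import pandas as pd

# One made-up row standing in for the scraped rows of a single symbol.
fake_rows = [['07/06/2018', '1.0', '2.0', '0.5', '1.5', '1,000']]
cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']

dfs = []
for symbol in ['WYNN', 'FL', 'TTWO']:
    df = pd.DataFrame(fake_rows, columns=cols)
    df['Symbol'] = symbol  # tag each frame so its rows stay identifiable
    dfs.append(df)

res = pd.concat(dfs, ignore_index=True)  # one long frame, three rows here
print(res)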
Note you are performing many operations, e.g. set_index, as if they are in place by default. That's not the case: you should assign back to a variable, e.g. df = df.set_index('Date').
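A quick toy example of that point (the frame and its values here are made up):

import pandas as pd

df = pd.DataFrame({'Date': ['07/06/2018'], 'Close': [1.5]})  # made-up toy frame

df.set_index('Date')        # returns a new DataFrame; df itself is unchanged
print(df.index.name)        # None

df = df.set_index('Date')   # assign the result back so the new index sticks
print(df.index.name)        # Date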
Upvotes: 3
Reputation: 1
I'm a newbie to pandas as well. I think this is a good question and I am sure there is a simple way to do it, but this is what I came up with...
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
symbols = ['WYNN', 'FL', 'TTWO']
stocklist = []
for c in range(len(symbols)):
    url = 'https://www.nasdaq.com/symbol/'+symbols[c]+'/historical'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    historicaldata = soup.find('div', {'id': 'quotes_content_left_pnlAJAX'})
    data_rows = historicaldata.findAll('tr')[2:]
    stock_data = [[td.getText().strip() for td in data_rows[a].findAll('td')] for a in range(len(data_rows))]
    stocklist.append(stock_data)
df0 = pd.DataFrame(stocklist[0], columns = ['Date', 'Open '+symbols[0], 'High '+symbols[0], 'Low '+symbols[0], 'Close '+symbols[0], 'Volume '+symbols[0]])
df0.set_index('Date', inplace=True)
df1 = pd.DataFrame(stocklist[1], columns = ['Date', 'Open '+symbols[1], 'High '+symbols[1], 'Low '+symbols[1], 'Close '+symbols[1], 'Volume '+symbols[1]])
df1.set_index('Date', inplace=True)
df2 = pd.DataFrame(stocklist[2], columns = ['Date', 'Open '+symbols[2], 'High '+symbols[2], 'Low '+symbols[2], 'Close '+symbols[2], 'Volume '+symbols[2]])
df2.set_index('Date', inplace=True)
df3 = df0.merge(df1, left_index=True, right_index=True)
df = df3.merge(df2, left_index=True, right_index=True)
VW = df['Volume WYNN'].str.replace(',','').astype(int)
FL = df['Volume FL'].str.replace(',','').astype(int)
TTWO = df['Volume TTWO'].str.replace(',','').astype(int)
df['Volume WYNN'] = VW
df['Volume FL'] = FL
df['Volume TTWO'] = TTWO
df.to_excel('stock data.xlsx')  # keep the Date index in the output
Upvotes: 0
Reputation: 649
I've updated your code in order to get all the data in a single DataFrame.
import numpy as np
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
symbols = ['WYNN', 'FL', 'TTWO']
myColumnHeaders = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
dfs = []
for c in range(len(symbols)):
    url = 'https://www.nasdaq.com/symbol/'+symbols[c]+'/historical'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    historicaldata = soup.find('div', {'id': 'quotes_content_left_pnlAJAX'})
    data_rows = historicaldata.findAll('tr')[2:]
    stock_data = [[td.getText().strip() for td in data_rows[a].findAll('td')]
                  for a in range(len(data_rows))]
    df = pd.DataFrame(stock_data, columns=myColumnHeaders)
    df['Volume'] = df['Volume'].str.replace(',', '').astype(int)
    for i in range(5):
        if i == 0:
            df[myColumnHeaders[i]] = pd.to_datetime(df[myColumnHeaders[i]], errors='coerce')
        else:
            df[myColumnHeaders[i]] = pd.to_numeric(df[myColumnHeaders[i]], errors='coerce')
    df.index = [symbols[c]]*len(df)
    dfs.append(df)

df = pd.concat(dfs).reset_index()  # the symbol ends up in the 'index' column

with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='data', index=False)
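As a side note (a variation, not what the answer above does): if you would rather keep each symbol on its own sheet of the same workbook, the same ExcelWriter can be reused before it is closed. A minimal sketch, assuming the symbols list and the dfs list built in the loop above:

# One sheet per symbol in the same output.xlsx workbook.
with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    for symbol, frame in zip(symbols, dfs):
        frame.to_excel(writer, sheet_name=symbol, index=False)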
Upvotes: 1