Reputation: 2764
I've built a spider that gets the stock data for a given stock from as many pages as the stock has on Yahoo! Finance (this can be 1 page of stock data, or 20). It scrapes all the pages fine and gathers all of the data as it should. However, for some reason it won't save any of the data into the actual Scrapy item, so I can't export it as a CSV.
See Update at the bottom of this question!
I will now show you the code, followed by a sample of what it prints out:
The Code:
# Imports assumed for this excerpt; initial_ending, iel, counter, data_intervals,
# Risk_free_rate, required_amount_of_returns and the Website item are defined
# elsewhere in the project.
import numpy
from scrapy import Spider, Request
from scrapy.exceptions import CloseSpider

class DmozSpider(Spider):
    name = "dnot"
    allowed_domains = ["finance.yahoo.com", "eoddata.com"]
    start_urls = ['http://finance.yahoo.com/q?s=CAT']

    def stocks1(self, response):
        current_page = response.url
        print current_page
        # If the link is not the same as the first page, i.e. stocks1 was requested through stocks2, get the stock data passed on from stocks2
        if initial_ending not in current_page[-iel:]:
            returns_pages = response.meta.get('returns_pages')
            # Remove the last stock price from the stock list, because it is the same as the first one on the new list
            if not not returns_pages:
                if len(returns_pages) > 2:
                    returns_pages = returns_pages[:-1]
        else:
            # Else, if the link does match that of the first page, create a new list because one does not exist yet
            returns_pages = []
        # This grabs the stock data from the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        print "stocks1"
        print returns_pages
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to returns_pages
                    returns_pages.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue
        print "after"
        print returns_pages
        # exp determines if there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
        # If there is a 'Next Page':
        if not not exp:
            # And this is the first page:
            if initial_ending in current_page[-iel:]:
                # Create the necessary url for the 2nd page
                next_page = current_page + "&z=66&y=66"
            # If this is not the first page
            else:
                # This increases the end of the link by 66, thereby getting the next 66 results for pages 2 and after
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # Then pass the data on to self.stocks2 to get more data from the next page
            yield Request(next_page, self.stocks2, meta={'returns_pages': returns_pages})
        # Else, if there is no 'Next' link
        else:
            # Send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
    def stocks2(self, response):
        # Prints the link of the current url
        current_page = response.url
        print current_page
        # Gets the returns from the previous page
        returns_pages = response.meta.get('returns_pages')
        # Removes the last return from the previous page because it will be a duplicate
        returns_pages = returns_pages[:-1]
        print "stocks2"
        print returns_pages
        # Gets all of the returns on the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to the previous returns
                    returns_pages.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue
        print "after 2"
        print returns_pages
        # exp determines if there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
        # If there is a 'Next Page':
        if not not exp:
            # And somehow, this is the first page (should never be true)
            if initial_ending in current_page[-iel:]:
                # Add the necessary link to go to the second page
                next_page = current_page + "&z=66&y=66"
                print next_page, "66&y not in curr_page"
            # Else, this is not the first page (should always be true)
            else:
                # Add 66 to the last number on the preceding link in order to access the second or later pages
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # Go back to self.stocks1 to get more data from the next page
            yield Request(next_page, self.stocks1, meta={'returns_pages': returns_pages})
        else:
            # If there is no 'Next' link, send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
            print "sending to finalize stock"
    def finalize_stock(self, response):
        unformatted_returns = response.meta.get('returns_pages')
        returns = [float(i) for i in unformatted_returns]
        global required_amount_of_returns, counter
        if counter == 1 and "CAT" in response.url:
            required_amount_of_returns = len(returns)
        elif required_amount_of_returns == 0:
            raise CloseSpider("'Error with initiating required amount of returns'")
        counter += 1
        print counter
        # Iterator to calculate rate of return
        # ====================================
        if data_intervals == "m":
            k = 12
        elif data_intervals == "w":
            k = 4
        else:
            k = 30
        sub_returns_amount = required_amount_of_returns - k
        sub_returns = returns[:sub_returns_amount]
        rate_of_return = []
        RFR = Risk_free_rate
        # Make sure the list is the exact length, otherwise rate_of_return will be inaccurate
        # Returns has not been checked by the pipeline yet, so small lists will be in the variable
        if len(returns) == required_amount_of_returns or "CAT" in response.url:
            for number in sub_returns:
                numerator = number - returns[k]
                rate = numerator / returns[k]
                if rate == '':
                    rate = 0
                rate_of_return.append(rate)
                k += 1
        item = Website()
        items = []
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avg_returns'] = numpy.average(rate_of_return)
        item['var_returns'] = numpy.cov(rate_of_return)
        item['sd_returns'] = numpy.std(rate_of_return)
        item['returns'] = unformatted_returns
        item['rate_of_returns'] = rate_of_return
        item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
        item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
        items.append(item)
        yield item
I tried to comment everything so that it makes sense to anyone reading this.
How it works:
Essentially, it goes to a given stock and takes the stock data presented on the first page. It then checks whether there is a 'Next page' link. If there is, it passes the data to stocks2; if there is yet another next page, that passes it back to stocks1, and it keeps alternating like this until there are no more pages. Once there are no more pages, it sends the data to finalize_stock, where it is supposed to save all of this data, as well as manipulated data that doesn't matter for the purposes of this question.
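To make the hand-off clearer: this is essentially the usual Scrapy pattern of carrying accumulated values from page to page through Request.meta. A stripped-down sketch of just that pattern (with a placeholder start URL, item, and selectors, not the ones from my actual spider) would look like this:

import urlparse  # Python 2 stdlib, consistent with the print statements above
import scrapy

class PriceItem(scrapy.Item):
    values = scrapy.Field()

class MetaPassingSketch(scrapy.Spider):
    name = "meta_passing_sketch"
    start_urls = ["http://example.com/page1"]  # placeholder URL

    def parse(self, response):
        # Reuse the list passed along from the previous page, or start a new one
        collected = response.meta.get("collected", [])
        collected += response.xpath('//td/text()').extract()
        # Follow the 'Next' link, if any, and hand the list to the next callback
        next_href = response.xpath('//a[@rel="next"]/@href').extract()
        if next_href:
            url = urlparse.urljoin(response.url, next_href[0])
            yield scrapy.Request(url, callback=self.parse,
                                 meta={"collected": collected})
        else:
            # Last page reached: emit one item holding everything collected
            yield PriceItem(values=collected)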
The Output: (A small sample)
# print current_page
http://finance.yahoo.com/q/hp?s=PZA.TO&a=04&b=19&c=2005&d=04&e=19&f=2006&g=d&z=66&y=198
print "stocks 2"
stocks2
# print returns_pages | before scraped this page
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85']
# print "after 2"
after 2
# print returns_pages | after scraping this page
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85', u'4.85', u'4.86', u'4.92', u'4.93', u'4.92', u'4.95', u'4.93', u'4.94', u'4.95', u'4.96', u'4.95', u'4.95', u'4.95', u'4.95', u'4.98', u'4.97', u'4.92', u'4.94', u'4.90', u'4.93', u'4.93', u'4.97', u'4.97', u'4.97', u'4.90', u'5.00', u'5.02', u'5.11', u'5.12', u'5.12']
2015-05-25 17:41:46-0700 [dnot] DEBUG: Crawled (200) <GET http://finance.yahoo.com/lookup?s=PVS.PR.D.TO> (referer: http://eoddata.com/stocklist/TSX/P.htm)
2015-05-25 17:41:46-0700 [dnot] DEBUG: Redirecting (301) to <GET http://finance.yahoo.com/lookup?s=PUD.B.TO> from <GET http://finance.yahoo.com/lookup;_ylc=X3oDMTF2cTUxaTdhBGtleXcDUFVELkIuVE8EbWlkA21lZGlhcXVvdGVzc2VhcmNoBHNlYwNnZXRxdW90ZXNidG4Ec2xrA2xvb2t1cA--?s=PUD.B.TO>
sending to finalize stock # See here, it does reach the yield that should call finalize_stock
However, nothing gets saved to the items. Usually Scrapy prints the items as they are scraped, but it doesn't do this here, and I can't figure out why.
If you need any additional information, just ask and I will post it ASAP.
Update: Problem found but not solved:
Before, in def stocks1 and def stocks2, I had:
else:
    # If there is no 'Next' link, send the returns to finalize_stock to be saved in the item
    yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
    print "sending to finalize stock"
at the bottom of each function, which basically means that when there is no next page, the information is sent to finalize_stock to be saved. The line print "sending to finalize stock" does get printed, however:
def finalize_stock(self, response):
    print "====================="
    print "finalize_stock called"
    print "====================="
never gets printed! So for some reason, def finalize_stock never runs, and I have no idea why.
Upvotes: 2
Views: 1635
Reputation: 2594
Your code looks very complicated and hard to debug. I think there is no need for multiple callbacks or for calculating the 'next page' link yourself.
A lot of it can be simplified so that it's easier to debug. Have a look at the following (tested) code and feel free to use any useful parts:
import scrapy

class ValueItem(scrapy.Item):
    value = scrapy.Field()

class StockSpider(scrapy.Spider):
    name = "yahoo_stock_spider"
    allowed_domains = ['finance.yahoo.com']
    start_urls = ['http://finance.yahoo.com/q/hp?s=CAT&a=00&b=1&c=2015&d=04&e=26&f=2015&g=d']

    def parse(self, response):
        if 'item' in response.meta:
            # If the response contains an 'item' from a previous page, unwrap it
            item = response.meta['item']
        else:
            # If it contains no such item, it's the first page, so let's create it
            item = ValueItem()
            item['value'] = ['']
        # Loop over the table rows
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table//tr')
        for row in rows[1:]:
            cell_values = row.xpath('.//td/text()').extract()
            item['value'] = item['value'] + [cell_values[-1]]
        # Check if there is a 'Next' link
        xpath_Next_Page = './/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            # No need to calculate offset values. Just take the link ...
            next_page_href = response.xpath(xpath_Next_Page).extract()[0]
            url_next_page = 'http://finance.yahoo.com' + next_page_href
            # ... build the request ...
            request = scrapy.Request(url_next_page, callback=self.parse)
            # ... and add the item with the collected values to the request
            request.meta['item'] = item
            yield request
        else:
            # No more 'Next' link
            # Here, simply yield the uncleaned values
            yield item
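Since your end goal is a CSV export, you shouldn't need any extra code for that part: Scrapy's built-in feed export can write the yielded items directly, e.g. scrapy crawl yahoo_stock_spider -o values.csv from inside a project, or scrapy runspider with the spider file plus the same -o option if you run it standalone.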
Upvotes: 4