Alex McLean

Reputation: 2764

Scrapy gathers data, but does not save it into the item

I've built a spider that gets the stock data for a given stock from as many pages as the stock has on Yahoo! Finance (this can be 1 page of stock data, or 20). It scrapes all the pages fine and gathers all of the data as it should. However, for some reason it won't save any of the data into the actual Scrapy item, which I need in order to export it as a CSV.

See Update at the bottom of this question!

I will now show you the code, followed by a sample of what it prints out:

The Code:

from scrapy import Spider, Request
from scrapy.exceptions import CloseSpider
import numpy

class DmozSpider(Spider):


    name = "dnot"
    allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
    start_urls = ['http://finance.yahoo.com/q?s=CAT']


    def stocks1(self, response):

        current_page = response.url
        print current_page
        # If the link is not the same as the first page, ie. stocks1 is requested through stocks2, get the stock data from stocks2
        if initial_ending not in current_page[-iel:]:
            returns_pages = response.meta.get('returns_pages')
            # Remove the last stock price from the stock list, because it is the same as the first on the new list
            if not not returns_pages:
                if len(returns_pages) > 2:
                    returns_pages = returns_pages[:-1]
        else:
            # Else, if the link does match that of the first page, create a new list because one does not exist yet
            returns_pages = []

        # This grabs the stock data from the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        print "stocks1"
        print returns_pages
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to returns_pages
                    returns_pages.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue  
        print "after"
        print returns_pages 

        # exp determines if there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
        # If there is a 'Next Page':
        if not not exp: 
            # And this is the first page:
            if initial_ending in current_page[-iel:]:
                #create necessary url for the 2nd page
                next_page = current_page + "&z=66&y=66"
            # If this is not the first page
            else:
                # This increases the end of the link by 66, thereby getting the next 66 results for pages 2 and after
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66 
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # Then go back to self.stocks1 to get more data on the next page
            yield Request(next_page, self.stocks2, meta={'returns_pages': returns_pages})
        # Else, if there is no 'Next Link'
        else: 
            # Send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})

    def stocks2(self, response):

        # Prints the link of the current url
        current_page = response.url
        print current_page

        # Gets the returns from the previous page
        returns_pages = response.meta.get('returns_pages')
        # Removes the last return from the previous page because it will be a duplicate
        returns_pages = returns_pages[:-1]
        print "stocks2"
        print returns_pages
        # Gets all of the returns on the page
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
        for row in rows:
            cells = row.xpath('.//td/text()').extract()
            try:
                values = cells[-1]
                try:
                    float(values)
                    # And adds it to the previous returns
                    returns_pages.append(values)
                except ValueError:
                    continue
            except ValueError:
                continue  

        print "after 2"
        print returns_pages

        # exp determines if there is a 'Next page' or not
        exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract() 
        # If there is a 'Next Page':
        if not not exp:
            # And somehow, this is the first page (should never be true)
            if initial_ending in current_page[-iel:]:
                # Add necessary link to go to the second page
                next_page = current_page + "&z=66&y=66"
                print next_page, "66&y not in curr_page"
            # Else, this is not the first page (should always be true)
            else:
                # Add 66 to the last number on the preceding link in order to access the second or later pages
                u = int(current_page[-6:].split("=",1)[1])
                o = len(str(u))
                u += 66 
                next_page = current_page[:-o] + str(u)
                print next_page, "66&y in curr_page"
            # go back to self.stocks1 to get more data on the next page
            yield Request(next_page, self.stocks1, meta={'returns_pages': returns_pages}) 
        else: 
            # If there is no "Next" link, send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages}) 
            print "sending to finalize stock"

    def finalize_stock(self,response):

        unformatted_returns = response.meta.get('returns_pages')
        returns = [float(i) for i in unformatted_returns]
        global required_amount_of_returns, counter
        if counter == 1 and "CAT" in response.url:
            required_amount_of_returns = len(returns)
        elif required_amount_of_returns == 0:
            raise CloseSpider("'Error with initiating required amount of returns'")

        counter += 1
        print counter

        # Iterator to calculate Rate of return 
        # ====================================
        if data_intervals == "m": 
            k = 12
        elif data_intervals == "w":
            k = 4
        else: 
            k = 30

        sub_returns_amount = required_amount_of_returns - k
        sub_returns = returns[:sub_returns_amount]
        rate_of_return = []
        RFR = Risk_free_rate

        # Make sure list is exact length, otherwise rate_of_return will be inaccurate
        # Returns has not been checked by pipeline yet, so small lists will be in the variable

        if len(returns) == required_amount_of_returns or "CAT" in response.url:
            for number in sub_returns:
                numerator = number - returns[k]
                rate = numerator/returns[k]
                if rate == '': 
                    rate = 0
                rate_of_return.append(rate)
                k += 1

        item = Website()
        items = []
        item['url'] = response.url
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        item['avg_returns'] = numpy.average(rate_of_return)
        item['var_returns'] = numpy.cov(rate_of_return)
        item['sd_returns'] = numpy.std(rate_of_return)
        item['returns'] = unformatted_returns
        item['rate_of_returns'] = rate_of_return
        item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
        item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
        items.append(item)
        yield item

I tried to comment everything so that it makes sense to anyone reading this.
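
For reference, the Website item assigned in finalize_stock comes from my items.py, which isn't shown above; it declares a Field for each key used there, roughly like this (reconstructed from the keys, not the actual file):

    from scrapy.item import Item, Field

    class Website(Item):
        # Reconstructed from the keys assigned in finalize_stock;
        # the real items.py is not shown in this question
        url = Field()
        name = Field()
        avg_returns = Field()
        var_returns = Field()
        sd_returns = Field()
        returns = Field()
        rate_of_returns = Field()
        exchange = Field()
        ind_sharpe = Field()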

How it works:

Essentially, the spider goes to a given stock and takes the stock data presented on the first page. It then checks whether there is a 'Next page' link. If there is, it passes the data on to stocks2; if there is yet another next page, stocks2 passes it back to stocks1, and this continues until there are no more pages. Once there are no more pages, the data is sent to finalize_stock, which is supposed to save all of it into the item, along with some derived values that don't matter for the purposes of this question.
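
In other words, the partial results are carried from one callback to the next through Request.meta, which is the standard Scrapy way to accumulate data across pages. A stripped-down sketch of just that hand-off (placeholder spider name and URLs, not my actual code) looks like this:

    import scrapy

    class CollectedItem(scrapy.Item):
        values = scrapy.Field()

    class MetaChainSpider(scrapy.Spider):
        # Placeholder name and URLs, for illustration only
        name = "meta_chain_demo"
        start_urls = ['http://example.com/page1']

        def parse(self, response):
            # Start an accumulator on the first page and hand it to the
            # next callback through Request.meta
            collected = ['value-from-page-1']
            yield scrapy.Request('http://example.com/page2',
                                 callback=self.parse_next,
                                 meta={'collected': collected})

        def parse_next(self, response):
            # Unwrap the accumulator, extend it, and yield the finished item
            collected = response.meta['collected']
            collected.append('value-from-page-2')
            item = CollectedItem()
            item['values'] = collected
            yield item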

The Output: (A small sample)

# print current_page
http://finance.yahoo.com/q/hp?s=PZA.TO&a=04&b=19&c=2005&d=04&e=19&f=2006&g=d&z=66&y=198
# print "stocks2"
stocks2
# print returns_pages | before this page was scraped
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85']
# print "after 2"
after 2
#Print returns_pages | after scraped this page
[u'4.75', u'4.78', u'4.78', u'4.83', u'4.87', u'4.90', u'4.90', u'4.97', u'4.99', u'4.92', u'4.95', u'4.90', u'4.90', u'4.93', u'4.92', u'4.90', u'4.89', u'4.88', u'4.95', u'4.90', u'4.95', u'4.95', u'4.95', u'4.90', u'4.90', u'4.90', u'4.90', u'4.95', u'4.91', u'4.91', u'4.90', u'4.92', u'4.92', u'4.92', u'4.91', u'4.92', u'4.91', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.85', u'4.83', u'4.91', u'4.90', u'4.92', u'4.95', u'4.95', u'4.92', u'4.92', u'4.92', u'4.87', u'4.88', u'4.92', u'4.90', u'4.94', u'4.90', u'4.97', u'4.97', u'4.94', u'4.90', u'4.92', u'4.83', u'4.83', u'4.83', u'4.83', u'4.75', u'4.75', u'4.78', u'4.80', u'4.81', u'4.81', u'4.84', u'4.80', u'4.78', u'4.83', u'4.80', u'4.83', u'4.81', u'4.87', u'4.97', u'5.02', u'4.97', u'4.91', u'4.91', u'4.92', u'4.90', u'4.90', u'4.83', u'4.79', u'4.83', u'4.83', u'4.83', u'4.83', u'4.77', u'4.78', u'4.81', u'4.83', u'4.83', u'4.85', u'4.80', u'4.87', u'4.87', u'4.87', u'4.87', u'4.83', u'4.83', u'4.83', u'4.87', u'4.85', u'4.83', u'4.83', u'4.87', u'4.83', u'4.83', u'4.85', u'4.83', u'4.80', u'4.80', u'4.78', u'4.71', u'4.72', u'4.71', u'4.73', u'4.71', u'4.54', u'4.29', u'4.25', u'4.23', u'4.25', u'4.27', u'4.25', u'4.30', u'4.32', u'4.30', u'4.30', u'4.27', u'4.27', u'4.22', u'4.13', u'4.12', u'4.15', u'4.15', u'4.22', u'4.22', u'4.27', u'4.27', u'4.27', u'4.25', u'4.46', u'4.22', u'4.39', u'4.37', u'4.36', u'4.57', u'4.63', u'4.68', u'4.68', u'4.67', u'4.73', u'4.80', u'4.83', u'4.84', u'4.84', u'4.85', u'4.85', u'4.81', u'4.78', u'4.83', u'5.06', u'5.09', u'5.02', u'5.12', u'5.09', u'5.12', u'5.14', u'5.07', u'5.06', u'4.99', u'5.00', u'4.97', u'4.98', u'4.98', u'4.95', u'4.92', u'4.98', u'4.92', u'4.93', u'4.93', u'4.95', u'4.94', u'4.92', u'4.90', u'4.85', u'4.85', u'4.86', u'4.92', u'4.93', u'4.92', u'4.95', u'4.93', u'4.94', u'4.95', u'4.96', u'4.95', u'4.95', u'4.95', u'4.95', u'4.98', u'4.97', u'4.92', u'4.94', u'4.90', u'4.93', u'4.93', u'4.97', u'4.97', u'4.97', u'4.90', u'5.00', u'5.02', u'5.11', u'5.12', u'5.12']
2015-05-25 17:41:46-0700 [dnot] DEBUG: Crawled (200) <GET http://finance.yahoo.com/lookup?s=PVS.PR.D.TO> (referer: http://eoddata.com/stocklist/TSX/P.htm)
2015-05-25 17:41:46-0700 [dnot] DEBUG: Redirecting (301) to <GET http://finance.yahoo.com/lookup?s=PUD.B.TO> from <GET http://finance.yahoo.com/lookup;_ylc=X3oDMTF2cTUxaTdhBGtleXcDUFVELkIuVE8EbWlkA21lZGlhcXVvdGVzc2VhcmNoBHNlYwNnZXRxdW90ZXNidG4Ec2xrA2xvb2t1cA--?s=PUD.B.TO>
sending to finalize stock # See here, it does call the def finalize_stock function

However, nothing gets saved to the items. Usually Scrapy prints each item in the log as it is scraped, but it doesn't do that here, and I can't figure out why.
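
For context, the export I'm aiming for is just Scrapy's built-in CSV feed export. Assuming the spider name dnot defined above (the output filename here is just an example), I start the crawl along these lines:

    scrapy crawl dnot -o stocks.csv

so if the items were being populated I would expect rows to appear in stocks.csv.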

If you need any additional information, just ask and I will post it ASAP.

Update: Problem found but not solved:

Before, at the bottom of def stocks1 and def stocks2, I had:

        else: 
            # If there is no "Next" link, send the returns to finalize_stock to be saved in the item
            yield Request(current_page, self.finalize_stock, meta={'returns_pages': returns_pages})
            print "sending to finalize stock"

which basically means that when there is no next page, the data is sent to finalize_stock to be saved. The print "sending to finalize stock" line does get printed. However, this part of finalize_stock:

def finalize_stock(self,response):

    print "====================="
    print "finalize_stock called"
    print "====================="

never gets printed! So for some reason, def finalize_stock never runs, and I have no idea why.
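
The only lead I have (and it is an unverified guess on my part): the Request that should reach finalize_stock re-uses current_page, a URL that has already been crawled, and by default Scrapy's duplicate-request filter silently drops requests for URLs it has already seen. If that is what is happening, marking the request with dont_filter=True would let it through, roughly:

    # Unverified guess: bypass the default duplicate-request filter,
    # which drops a second request to an already-crawled URL
    yield Request(current_page, self.finalize_stock,
                  meta={'returns_pages': returns_pages},
                  dont_filter=True)

but I have not confirmed that this is actually the problem.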

Upvotes: 2

Views: 1635

Answers (1)

Frank Martin

Reputation: 2594

Your code looks very complicated and is hard to debug. I think there is no need for multiple callbacks, nor for calculating the 'Next' page URLs by hand.

A lot of it can be simplified, which makes it easier to debug. Have a look at the following (tested) code and feel free to use any parts you find useful:

import scrapy

class ValueItem(scrapy.Item):
    value = scrapy.Field()

class StockSpider(scrapy.Spider):

    name = "yahoo_stock_spider"
    allowed_domains = ['finance.yahoo.com']
    start_urls = ['http://finance.yahoo.com/q/hp?s=CAT&a=00&b=1&c=2015&d=04&e=26&f=2015&g=d' ]

    def parse(self, response):

        if 'item' in response.meta:
            # If the response carries an 'item' from a previous page, unwrap it
            item = response.meta['item']
        else:
            # if it contains no such item, it's the first page, so let's create it
            item = ValueItem()
            item['value'] = ['']


        # Loop over the table rows
        rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table//tr')
        for row in rows[1:]:

            cell_values = row.xpath('.//td/text()').extract()
            item['value'] = item['value'] + [cell_values[-1]]


        # Check if there is a 'Next' link
        xpath_Next_Page = './/a[contains(.,"Next")]/@href'
        if response.xpath(xpath_Next_Page):
            # No need to calculate offset values. Just take the link ...
            next_page_href = response.xpath(xpath_Next_Page).extract()[0]
            url_next_page = 'http://finance.yahoo.com' + next_page_href
            # ... build the request ...
            request = scrapy.Request(url_next_page, callback=self.parse)
            # ... and add the item with the collected values to the request
            request.meta['item'] = item
            yield request
        else:
            # No more 'Next'
            # here simple output of uncleaned values
            yield item
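
To try it out, you can save the snippet to a file and run it directly with Scrapy's runspider command, exporting the collected values to CSV (the file names here are just examples):

    scrapy runspider yahoo_stock_spider.py -o values.csv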

Upvotes: 4
