Reputation: 21201
Please pardon my limited knowledge of Scrapy; I have been doing data scraping for the past 3 years or so using PHP and Python with BeautifulSoup, but I am new to Scrapy.
I have Python 2.7 and the latest Scrapy.
I need to scrape http://www.dos.ny.gov/corps/bus_entity_search.html, which shows its results in paginated form.
My requirement is that if a search returns more than 500 results (for example, "AME" returns more than 500 results), the code should search for "AMEA" to "AMEZ"; and if "AMEA" still returns more than 500 results, it should search "AMEAA", and so on recursively.
But it is giving me unexpected results. Here is my crawler code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import FormRequest
from scrapy.http.request import Request
import urllib

from appext20.items import Appext20Item
from scrapy.selector import HtmlXPathSelector


class Appext20Spider(CrawlSpider):
    name = "appext20"
    allowed_domains = ["appext20.dos.ny.gov"]
    # p_entity_name means Keyword to search
    payload = {"p_entity_name": '', "p_name_type": 'A', 'p_search_type': 'BEGINS'}
    url = 'https://appext20.dos.ny.gov/corp_public/CORPSEARCH.SELECT_ENTITY'

    search_characters = ["A","B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y","Z"," "]
    construction_keywords = ['Carpenters','Carpentry','Plastering','Roofers','Roofing','plumbing','remodelling','remodeling','Tiling','Painting','Rendering','Electrical','Plumber','contracting ','contractor','construction','Waterproofing','Landscaping','Bricklaying','Cabinet Maker','Flooring','carpenters','electricians','restoration','drywall','renovation','renovating ','remodels ','framing','Masonry','builders','Woodwork','Cabinetry','Millwork','Electric','plastering','painters','painting','HVAC','Labouring','Fencing','Concreting','Glass','AC','Heating','glazier ','air duct','tiles','deck','Guttering','Concrete','Demolition','Debris','Dumpster','Cabinet','Junk','stucco','general contract','home improvement','home repair','home build','homes','building maintenance','masons','siding','kitchens','paving','landscapers','landscapes','design & build','design build','design and build']
    search_keywords = ['']

    def start_requests(self):
        # create keywords combo
        for char in self.search_characters:
            for char2 in self.search_characters:
                for char3 in self.search_characters:
                    self.search_keywords.extend([char + char2 + char3])

        # now start requests
        for keyword in self.search_keywords:
            self.payload['p_entity_name'] = keyword
            print ('this is keyword ' + keyword)
            # parse_data() is my callback func
            yield FormRequest(self.url, formdata=self.payload, callback=self.parse_data)

    def parse_data(self, response):
        ads_on_page = Selector(response).xpath("//td[@headers='c1']")

        # get that message to see how many results this keyword returned.
        # if it returns more than 500, then page shows "More than 500 entities were found. Only the first 500 entities will be displayed."
        try:
            results = Selector(response).xpath("//center/p/text()").extract()[0]
        except Exception, e:
            results = ''

        all_links = []
        for tr in ads_on_page:
            temp_dict = {}
            temp_dict['title'] = tr.xpath('a/text()').extract()
            temp_dict['link'] = tr.xpath('a/@href').extract()
            temp_dict['p_entity_name'] = self.payload['p_entity_name']
            temp_dict['test'] = results
            yield temp_dict

        # check if has next page
        try:
            next_page = Selector(response).xpath("//a[text()='Next Page']/@href").extract()
            next_page = 'https://appext20.dos.ny.gov/corp_public/' + next_page[0]
            next_page_text = Selector(response).xpath("//a[text()='Next Page']/@href/text()").extract()

            # if it has more than 1 page, then do recursive calls to search
            # I.E: "AME" returns more than 500 results, then code should search for "AMEA" to "AMEZ"
            # and for "AMEA" if it still returns more than 500 results then search "AMEAA" and so on recursively
            if next_page_text == 2:
                if "More than 500 entities were found" in results:
                    # search through "A" to "Z"
                    for char3 in self.search_characters:
                        self.payload['p_entity_name'] = self.payload['p_entity_name'] + char3
                        print ('THIS is keyword ' + self.payload['p_entity_name'])
                        yield FormRequest(self.url, formdata=self.payload, callback=self.parse_data)

            # scrape that next page.
            yield Request(url=next_page, callback=self.parse_data)
        except Exception, e:
            # no next page.
            return
Here is a full copy of my project.
I am running my code using the command:
scrapy crawl appext20 -t csv -o app.csv --loglevel=INFO
Upvotes: 0
Views: 134
Reputation: 5210
Well, without having taken a deeper look at scrapy itself, I had a look at the recursion part.
First, you may want to simplify your keyword generation.
import itertools
import random

URL = 'https://appext20.dos.ny.gov/corp_public/CORPSEARCH.SELECT_ENTITY'
ALPHABET = [chr(i) for i in range(65, 65 + 26)]


def keyword_initial_set(n=2):
    '''Generates a list of all n-length combinations of the entire alphabet

    E.g. n=2: ['AA', 'AB', 'AC', ..., 'ZY', 'ZZ']
    E.g. n=5: ['AAAAA', 'AAAAB', 'AAAAC', ..., 'ZZZZY', 'ZZZZZ']
    '''
    cartesian = list(itertools.product(*[ALPHABET for i in range(n)]))
    return map((lambda x: ''.join(x)), cartesian)


def keyword_generator(base):
    '''Generates keywords for an additional level for the given keyword base

    E.g. base='BEZ': ['BEZA', 'BEZB', 'BEZC', ..., 'BEZZ']
    '''
    for c in ALPHABET:
        yield base + c
With these little helpers, it is a lot easier to generate your keyword combinatorics and to generate subsequent keywords for a recursive descent (see their docstrings).
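For instance, a quick interactive check of what the helpers produce (under Python 2, where map() returns a plain list):

>>> kwords = keyword_initial_set(n=2)
>>> len(kwords), kwords[:3], kwords[-1]
(676, ['AA', 'AB', 'AC'], 'ZZ')
>>> list(keyword_generator('BEZ'))[:3]
['BEZA', 'BEZB', 'BEZC']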
Then, for the recursion it is handy (as you did in your own code) to have two separate functions: one for issuing the HTTP request, the other for handling the response.
def keyword_request(kw):
    '''Issues an online search using a keyword

    WARNING: MONKEY-PATCHED CODE INCLUDED
    '''
    payload = {
        'p_entity_name': kw,
        'p_name_type': 'A',
        'p_search_type': 'BEGINS'
    }
    print('R {}'.format(kw))
    FormRequest(URL, formdata=payload, callback=keyword_parse)


def keyword_parse(response):
    '''Parses the response to see how many results were found and performs
    a recursive descent if necessary

    WARNING: MONKEY-PATCHED CODE INCLUDED
    '''
    try:
        n_res = Selector(response).xpath('//center/p/text()').extract()[0]
    except Exception:  # Please put a specific exception type here. Don't be so generic!
        n_res = ''

    if n_res.startswith('More than 500'):
        print('Recursive descent.')
        for kw in keyword_generator(response['p_entity_name']):  # Hacked. If not feasible, get the current kw from somewhere else
            keyword_request(kw)
    else:
        # Parse paginated results here.
        pass
With these functions, your main method (or call to the crawler wherever it is issued) becomes:
if __name__ == '__main__':
    kwords = keyword_initial_set(n=2)
    for kw in kwords:
        keyword_request(kw)
The keyword_initial_set function generates a list of all n-length combinations of the entire alphabet. This serves as a starting point: each of these keywords is requested from the website search and the results are parsed.
In case the website yields more than 500 results, a recursive descent is performed: the current keyword is extended by every letter A-Z, and for each new keyword (of length n+1) a new request is issued and parsed upon completion.
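Wired back into your actual Scrapy spider (rather than my monkey-patched classes below), the same idea could look roughly like the sketch below. I have not run this against the live site; the meta key 'kw' and the keyword_request helper method are names I made up, and the XPaths are taken straight from your spider. The current keyword travels in request.meta instead of being read back from the shared self.payload dict, so the callback always knows which keyword a given response belongs to:

from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import FormRequest
from scrapy.http.request import Request
from scrapy.selector import Selector


class Appext20Spider(CrawlSpider):
    name = "appext20"
    allowed_domains = ["appext20.dos.ny.gov"]
    url = 'https://appext20.dos.ny.gov/corp_public/CORPSEARCH.SELECT_ENTITY'

    def start_requests(self):
        # start with all two-letter prefixes (helpers defined above)
        for kw in keyword_initial_set(n=2):
            yield self.keyword_request(kw)

    def keyword_request(self, kw):
        payload = {'p_entity_name': kw,
                   'p_name_type': 'A',
                   'p_search_type': 'BEGINS'}
        # carry the keyword along in meta so parse_data can read it back
        return FormRequest(self.url, formdata=payload,
                           callback=self.parse_data, meta={'kw': kw})

    def parse_data(self, response):
        kw = response.meta['kw']
        try:
            results = Selector(response).xpath('//center/p/text()').extract()[0]
        except IndexError:
            results = ''

        if results.startswith('More than 500'):
            # recursive descent: refine the keyword by one more letter
            for longer_kw in keyword_generator(kw):
                yield self.keyword_request(longer_kw)
        else:
            # fewer than 500 hits: scrape the rows and follow the pagination
            for tr in Selector(response).xpath("//td[@headers='c1']"):
                yield {'title': tr.xpath('a/text()').extract(),
                       'link': tr.xpath('a/@href').extract(),
                       'p_entity_name': kw}
            next_page = Selector(response).xpath("//a[text()='Next Page']/@href").extract()
            if next_page:
                yield Request('https://appext20.dos.ny.gov/corp_public/' + next_page[0],
                              callback=self.parse_data, meta={'kw': kw})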
Hope this helps.
For my local and offline testing, I monkey-patched the original scrapy classes with these ones:
class FormRequest(object):
    '''Monkey-patch for the original implementation
    '''
    def __init__(self, url, formdata, callback):
        self.url = url
        self.formdata = formdata
        self.callback = callback
        self.callback(formdata)


class Selector(object):
    '''Monkey-patch for the original implementation
    '''
    def __init__(self, response):
        self.response = response

    def xpath(self, xpattern):
        return self

    def extract(self):
        n_res = random.randint(0, 510)
        if n_res > 500:
            return ['More than 500 results found']
        else:
            return ['']
Thus, you may have to adapt the code at the spots where my patches do not match the original behavior. But you'll surely manage that.
Upvotes: 1