Jason Baker

Reputation: 3

Scrapy pagination issues - new to this stuff

I am trying to make a Scrapy bot that uses pagination, but I'm having no success...

The bot crawls through all of the links on the first page, but it never goes on to the next page. I have read a ton of different threads and I can't figure this out at all. I am very new to web scraping, so please feel free to hammer the crap out of my code.

    import time
    from scrapy.spiders import CrawlSpider, Rule
    #from scrapy.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.contrib.linkextractors import LinkExtractor
    from scrapy.selector import Selector
    from scrapy.http.request import Request
    from tutorial.items import TutorialItem


    #from scrapy_tutorial.items import ScrapyTutorialItem

    class raytheonJobsPageSpider(CrawlSpider):

        name = "raytheonJobsStart"
        allowed_domains = ["jobs.raytheon.com"]
        start_urls = [
            "https://jobs.raytheon.com/search-jobs"
        ]

        rules = ( Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)), callback='parse_listings',follow=True), )

        def parse_start_url(self, response):
            '''
            Crawl start URLs
            '''

            return self.parse_listings(response)

        def parse_listings(self, response):
            '''
            Extract data from listing pages
            '''

            sel = Selector(response)
            jobs = response.xpath(
                '//*[@id="search-results-list"]/ul/*/a/@href'
            ).extract()
            nextLink = response.xpath('//a[@class="next"]').extract()
            print "This is just the next page link - ",nextLink

            for job_url in jobs:
                job_url = self.__normalise(job_url)
                job_url = self.__to_absolute_url(response.url, job_url)

                yield Request(job_url, callback=self.parse_details)

        def parse_details(self, response):
            '''
            Extract data from details pages
            '''


            sel = Selector(response)
            job = sel.xpath('//*[@id="content"]')
            item = TutorialItem()
            # Populate job fields
            item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
            jobTitle=job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
            item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
            item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
            item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
            item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
            item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
            #item['page_url'] = response.url
            item = self.__normalise_item(item, response.url)
            time.sleep(1)
            return item

        def __normalise_item(self, item, base_url):
            '''
            Standardise and format item fields
            '''

            # Loop item fields to sanitise data and standardise data types
            for key, value in vars(item).values()[0].iteritems():
                item[key] = self.__normalise(item[key])

            # Convert job URL from relative to absolute URL
            #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])

            return item

        def __normalise(self, value):
            print self,value
            # Convert list to string
            value = value if type(value) is not list else ' '.join(value)
            # Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
            value = value.strip()

            return value

        def __to_absolute_url(self, base_url, link):
            '''
            Convert relative URL to absolute URL
            '''

            import urlparse

            link = urlparse.urljoin(base_url, link)

            return link

        def __to_int(self, value):
            '''
            Convert value to integer type
            '''

            try:
                value = int(value)
            except ValueError:
                value = 0

            return value

        def __to_float(self, value):
            '''
            Convert value to float type
            '''

            try:
                value = float(value)
            except ValueError:
                value = 0.0

            return value

Upvotes: 0

Views: 746

Answers (2)

Umair Ayub

Reputation: 21351

You don't need PhantomJS or Splash.

By inspecting the AJAX calls, I found that the page loads jobs via AJAX calls to the URL shown below.

You can see the CurrentPage parameter at the end of the URL.

The result is returned in JSON format, and all of the jobs are under the key named results.
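
You can sanity-check this outside Scrapy first. Here is a minimal sketch using the requests library (not part of the spider; it just fetches one page from the endpoint and confirms the JSON shape described above):

    import requests

    # Same AJAX endpoint used by the spider below; the page number is appended after CurrentPage=
    ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

    resp = requests.get(ajaxURL + "1").json()
    print(resp.keys())            # 'results' should be among the keys
    print(resp['results'][:200])  # the listings come back as rendered HTML inside the JSON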

I created a project on my side with fully working code for you. Here is a link to it on GitHub; just download and run it ... you don't have to do anything at all :P

Download the whole working project from here: https://github.com/mani619cash/raytheon_pagination

The basic logic is in the spider class:

    import json

    from scrapy import Request
    from scrapy.exceptions import CloseSpider
    from scrapy.selector import Selector
    from scrapy.spiders import CrawlSpider


    class RaytheonspiderSpider(CrawlSpider):

        name = "raytheonJobsStart"
        page = 180
        ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

        def start_requests(self):
            yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)

        def parse_listings(self, response):
            resp = json.loads(response.body)

            # The endpoint returns JSON; the rendered listings HTML lives under 'results'
            response = Selector(text=resp['results'])

            jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
            if jobs:
                for job_url in jobs:
                    # __normalise and parse_details are the helpers from the full project
                    job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
                    yield Request(url=job_url, callback=self.parse_details)
            else:
                raise CloseSpider("No more pages... exiting...")

            # go to next page...
            self.page = self.page + 1
            yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
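
Note the stopping condition: when a page comes back with no job links, CloseSpider ends the crawl; otherwise the spider bumps page and requests the next CurrentPage value. The page = 180 starting value comes from the snippet as written; presumably you would set it to 1 to crawl from the first page.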

Upvotes: 1

Umair Ayub
Umair Ayub

Reputation: 21351

Change

    restrict_xpaths=('//div[@class="next"]',)

to

    restrict_xpaths=('//a[@class="next"]',)
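
In context, the corrected rule in the question's spider would look like this (a minimal sketch; everything else stays the same):

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=('//a[@class="next"]',)),
            callback='parse_listings',
            follow=True,
        ),
    )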

If that does not work, then do a recursive call to the parse_listings function:

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''

        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        nextLink = response.xpath('//a[@class="next"]/@href').extract_first()
        print "This is just the next page link - ", nextLink

        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)

            yield Request(job_url, callback=self.parse_details)

        # Recurse: follow the pagination link back into this same callback
        if nextLink:
            nextLink = self.__to_absolute_url(response.url, nextLink)
            yield Request(nextLink, callback=self.parse_listings)

I am on mobile so I can't type code properly. I hope the logic I told you makes sense.

Upvotes: 0
