Reputation: 3
I am trying to make a scrapy bot that utilizes pagination but having no success...
The bot crawls through all of the links on the first page one but never goes on to the next page. I have read a ton of different threads and I cant figure this out at all. I am very new to web scraping to please feel free to hammer the crap out of my code.
import time
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http.request import Request
from tutorial.items import TutorialItem
#from scrapy_tutorial.items import ScrapyTutorialItem
class raytheonJobsPageSpider(CrawlSpider):
name = "raytheonJobsStart"
allowed_domains = ["jobs.raytheon.com"]
start_urls = [
"https://jobs.raytheon.com/search-jobs"
]
rules = ( Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)), callback='parse_listings',follow=True), )
def parse_start_url(self, response):
'''
Crawl start URLs
'''
return self.parse_listings(response)
def parse_listings(self, response):
'''
Extract data from listing pages
'''
sel = Selector(response)
jobs = response.xpath(
'//*[@id="search-results-list"]/ul/*/a/@href'
).extract()
nextLink = response.xpath('//a[@class="next"]').extract()
print "This is just the next page link - ",nextLink
for job_url in jobs:
job_url = self.__normalise(job_url)
job_url = self.__to_absolute_url(response.url, job_url)
yield Request(job_url, callback=self.parse_details)
def parse_details(self, response):
'''
Extract data from details pages
'''
sel = Selector(response)
job = sel.xpath('//*[@id="content"]')
item = TutorialItem()
# Populate job fields
item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
jobTitle=job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
#item['page_url'] = response.url
item = self.__normalise_item(item, response.url)
time.sleep(1)
return item
def __normalise_item(self, item, base_url):
'''
Standardise and format item fields
'''
# Loop item fields to sanitise data and standardise data types
for key, value in vars(item).values()[0].iteritems():
item[key] = self.__normalise(item[key])
# Convert job URL from relative to absolute URL
#item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
return item
def __normalise(self, value):
print self,value
# Convert list to string
value = value if type(value) is not list else ' '.join(value)
# Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
value = value.strip()
return value
def __to_absolute_url(self, base_url, link):
'''
Convert relative URL to absolute URL
'''
import urlparse
link = urlparse.urljoin(base_url, link)
return link
def __to_int(self, value):
'''
Convert value to integer type
'''
try:
value = int(value)
except ValueError:
value = 0
return value
def __to_float(self, value):
'''
Convert value to float type
'''
try:
value = float(value)
except ValueError:
value = 0.0
return value
Upvotes: 0
Views: 746
Reputation: 21351
You dont need PhantomJS or Splash.
By inspecting the AJAX calls I found that they are loading jobs via AJAX calls to this URL
You can see CurrentPage
parameter at the end of URL.
And the result is returned in JSON format, and all jobs are on the key named results
I created a project on my side and I created fully 100% working code for you. Here is link to that in github, just download and run it ... you dont have to do anything at all :P
Download whole working project fomr here https://github.com/mani619cash/raytheon_pagination
Basic logic is here class RaytheonspiderSpider(CrawlSpider):
name = "raytheonJobsStart"
page = 180
ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="
def start_requests(self):
yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
def parse_listings(self, response):
resp = json.loads(response.body)
response = Selector(text = resp['results'])
jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
if jobs:
for job_url in jobs:
job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
#job_url = self.__to_absolute_url(response.url, job_url)
yield Request(url=job_url, callback=self.parse_details)
else:
raise CloseSpider("No more pages... exiting...")
# go to next page...
self.page = self.page + 1
yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
Upvotes: 1
Reputation: 21351
Change
restrict_xpaths=('//div[@class="next"]',))
to
restrict_xpaths=('//a[@class="next"]',))
If this not working then do a recursive call to parse_listings function
def parse_listings(self, response):
'''
Extract data from listing pages
'''
sel = Selector(response)
jobs = response.xpath(
'//*[@id="search-results-list"]/ul/*/a/@href'
).extract()
nextLink = response.xpath('//a[@class="next"]').extract()
print "This is just the next page link - ",nextLink
for job_url in jobs:
job_url = self.__normalise(job_url)
job_url = self.__to_absolute_url(response.url, job_url)
yield Request(job_url, callback=self.parse_details)
yield Request(pagination link here, callback=self.parse_listings)
I am on mobile so cant type code. I hope the logic i told you makes sense
Upvotes: 0