Reputation: 37
I modified an old script and got it running. However, the old values weren't outputting anything to JSON. I am pretty new to scraping, and I am practicing on scraping indeed.com. Also, how would I pull the keyword I am searching for ("remote") and list it as "Job Type"? I am also unsure whether I have the correct URL and rule. Thanks
I know the script runs, but I need help with the response.css or response.xpath expressions. I can find all the XPath values, but some don't work. When I XPath "jobtitle", I get a bunch of code, like an onmouseclick handler — although it gets the title as well. Providing the code below.
class IndeedSpider(CrawlSpider):
    """Question's original spider (deprecated ``Selector.select`` API).

    Follows indeed.com search-result/pagination links via CrawlSpider
    ``rules`` and scrapes one item per result row in ``parse_item``.
    """
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = [
        "https://www.indeed.com/jobs?q=remote&l=",
    ]
    # Follow result/pagination URLs matching `allow`; account-related
    # pages (saved searches, preferences, ...) are denied.
    rules = (
        Rule(LinkExtractor(allow=('/jobs.q=linux&l=remote&l$','q=linux&l=remote&sort=l&start=[0-9]+$',),deny=('/my/mysearches', '/preferences', '/advanced_search','/my/myjobs')), callback='parse_item', follow=True),
    )
    def parse_next_site(self, response):
        # Detail-page callback: copies crawl metadata onto the item that was
        # attached to the request in parse_item via request.meta.
        # NOTE(review): the item is never yielded here, so it never reaches
        # the feed export -- presumably why the JSON output is empty; confirm
        # by adding `yield item` at the end.
        item = response.request.meta['item']
        item['source_url'] = response.url
        item['source_page_body'] = response.body
        item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
    def parse_item(self, response):
        # Search-results callback: builds one IndeedItem per result row and
        # hands it to parse_next_site through the follow-up request's meta.
        self.log('\n Crawling %s\n' % response.url)
        hxs = Selector(response)
        sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
        #sites = hxs.select("//div[@class='row ']")
        items = []
        for site in sites:
            item = IndeedItem(company='none')
            # Producing output with onmouse click. etc. Gets title as well.
            # NOTE(review): this absolute XPath (//a...) matches across the
            # WHOLE page on every iteration, not just this row -- a relative
            # path (.//a...) is what was intended.  Same applies to the
            # location/salary/company/summary selectors below.
            item['job_title'] = site.select("//a[contains(concat(' ', normalize-space(@class), ' '),' jobtitle ')]").extract()
            # link not working
            link_url= site.select('h2/a/@href').extract()
            item['link_url'] = link_url
            item['crawl_url'] = response.url
            item['location'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' location ')]/text()").extract()
            # salary returns ''
            item['salary'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' salaryText ')]").extract()
            # Not all entries have a company. got a lot of , '\n
            if site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract() == []:
                item['company'] = [u'']
            else:
                item['company'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract()
            # Summary seems to work
            item['summary'] = site.select("//div[contains(concat(' ', normalize-space(@class), ' '),' summary ')]").extract()
            item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
            item['found_date'] = site.select("table/tr/td/span[@class='date']/text()").extract()
            #item['source_url'] = self.get_source(link_url)
            # Carry the partially-filled item to the detail page.
            request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
            items.append(item)
        return
SPIDER=IndeedSpider()
Perhaps someone can test the existing code to see some of the output, as well as tell me what I need to do to fix what's not working. It would really help me to move forward, figure out what I'm doing wrong, and understand the workings of these things better. Again, thanks.
Upvotes: 0
Views: 100
Reputation: 306
When iterating over Scrapy selectors with XPath, use './/myxpath' to make the path relative to the current selector. You can look at the code example below — hope it helps :)
from scrapy.spiders import CrawlSpider
from scrapy.http import Request, Response
from scrapy.linkextractors import LinkExtractor
import time
class IndeedSpider(CrawlSpider):
    """Crawl indeed.com search results and scrape one item per job row.

    Flow: start_requests -> parse_site (extract result/pagination links)
    -> parse_item (scrape each row, request the detail page)
    -> parse_next_site (attach detail-page metadata and yield the item).
    """
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = [
        "https://www.indeed.com/jobs?q=remote&l=",
    ]

    def start_requests(self):
        # Issue the initial search-page requests explicitly so link
        # extraction happens in parse_site instead of CrawlSpider rules.
        for link in IndeedSpider.start_urls:
            yield Request(url=link, callback=self.parse_site)

    def parse_site(self, response: Response):
        # Extract result/pagination links; deny account-related pages.
        extracted_links = LinkExtractor(
            allow=['/jobs.q=linux&l=remote&l$', 'q=linux&l=remote&sort=l&start=[0-9]+$'],
            deny=['/my/mysearches', '/preferences', '/advanced_search', '/my/myjobs']) \
            .extract_links(response)
        for link in extracted_links:
            yield Request(url=link.url, callback=self.parse_item)

    def parse_item(self, response: Response):
        self.log('\n Crawling %s\n' % response.url)
        sites = response.xpath("//div[@class='row ' or @class='row lastRow']")
        for site in sites:
            item = IndeedItem(company='none')
            # Relative XPaths (.//) keep the match inside this result row;
            # absolute ones (//) would re-match the whole page on every row.
            item['job_title'] = site.xpath(".//a[has-class('jobtitle')]").get()
            link_url = site.xpath('.//h2/a/@href').get()
            item['link_url'] = link_url
            item['crawl_url'] = response.url
            item['location'] = site.xpath(".//span[has-class('location')]/text()").get()
            item['salary'] = site.xpath(".//span[has-class('salaryText')]").get()
            # Not every listing names a company; fall back to empty string.
            # (Fixed: the predicate was missing its closing bracket, which
            # raised a ValueError on every row.)
            company = site.xpath(".//span[has-class('company')]/text()").get()
            item['company'] = company if company is not None else u''
            # Fixed: was an absolute //div path, which grabbed the first
            # summary on the page for every row.
            item['summary'] = site.xpath(".//div[has-class('summary')]").get()
            item['source'] = site.xpath(".//table/tr/td/span[@class='source']/text()").get()
            item['found_date'] = site.xpath(".//table/tr/td/span[@class='date']/text()").get()
            if link_url:
                # .get() returns a plain string (not a list like .extract()),
                # so use it directly -- indexing with [0] would take only the
                # first character of the URL.
                request = Request("http://www.indeed.com" + link_url,
                                  callback=self.parse_next_site)
                request.meta['item'] = item
                yield request

    def parse_next_site(self, response: Response):
        # Enrich the item carried over via request.meta with detail-page data,
        # then yield it so it actually reaches the feed export (the original
        # never yielded the item, which is why the JSON output was empty).
        item = response.request.meta['item']
        item['source_url'] = response.url
        item['source_page_body'] = response.body
        item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
        yield item
Upvotes: 1