Marshall

Reputation: 37

Scrapy JSON output - values empty - trying xpath

I modified an old script and got it running. However, the old values aren't outputting anything to JSON. I am pretty new to scraping; I am practicing on scraping indeed.com. Also, how would I pull the keyword I am searching for, "remote", and list it as "Job Type"? I am also unsure if I have the correct URL and rule. Thanks.

I know the script runs, but I need help on the response.css or response.xpath. I can find all the XPath values, but some don't work. XPathing "jobtitle", I get a bunch of markup back, like onmouse-click handlers. Providing the code:

class IndeedSpider(CrawlSpider):
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = [
        "https://www.indeed.com/jobs?q=remote&l=",
    ]

    rules = (
        Rule(LinkExtractor(allow=('/jobs.q=linux&l=remote&l$', 'q=linux&l=remote&sort=l&start=[0-9]+$',),
                           deny=('/my/mysearches', '/preferences', '/advanced_search', '/my/myjobs')),
             callback='parse_item', follow=True),
    )

    def parse_next_site(self, response):
        item = response.request.meta['item']
        item['source_url'] = response.url
        item['source_page_body'] = response.body
        item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')

    def parse_item(self, response):
        self.log('\n Crawling  %s\n' % response.url)
        hxs = Selector(response)
        sites = hxs.select("//div[@class='row ' or @class='row lastRow']")
        #sites = hxs.select("//div[@class='row ']")
        items = []
        for site in sites:
            item = IndeedItem(company='none')
            # Producing output with onmouse click. etc. Gets title as well.
            item['job_title'] = site.select("//a[contains(concat(' ', normalize-space(@class), ' '),' jobtitle ')]").extract()
            # link not working
            link_url = site.select('h2/a/@href').extract()
            item['link_url'] = link_url
            item['crawl_url'] = response.url
            item['location'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' location ')]/text()").extract()
            # salary returns ''
            item['salary'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' salaryText ')]").extract()
            # Not all entries have a company. got a lot of , '\n
            if site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract() == []:
                item['company'] = [u'']
            else:
                item['company'] = site.select("//span[contains(concat(' ', normalize-space(@class), ' '),' company ')]/text()").extract()
                # Summary seems to work
                item['summary'] = site.select("//div[contains(concat(' ', normalize-space(@class), ' '),' summary ')]").extract()
            item['source'] = site.select("table/tr/td/span[@class='source']/text()").extract()
            item['found_date'] = site.select("table/tr/td/span[@class='date']/text()").extract()
            #item['source_url'] = self.get_source(link_url)
            request = Request("http://www.indeed.com" + item['link_url'][0], callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
            items.append(item)
        return

    
SPIDER = IndeedSpider()

Perhaps someone can test the existing code to see some of the output, as well as tell me what I need to do to fix what's not working. It would really help me move forward, figure out what I'm doing wrong, and understand the workings of these things better. Again, thanks.

Upvotes: 0

Views: 100

Answers (1)

Tal Leibman

Reputation: 306

When iterating over Scrapy selectors with XPath, use './/myxpath' so the path is relative to the current selector rather than the whole document. You can look at the full code example below; hope it helps :)
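As a quick standalone illustration of that point first (a minimal sketch using parsel, the library Scrapy's selectors are built on): an absolute //... path used inside a per-row loop still searches the whole document, so every row sees every match, while a relative .//... path only looks under that row.

from parsel import Selector

html = '''
<div class="row"><span class="company">Acme</span></div>
<div class="row"><span class="company">Globex</span></div>
'''
sel = Selector(text=html)

for row in sel.xpath("//div[@class='row']"):
    # Absolute path: searches the WHOLE document from every row.
    print(row.xpath("//span[@class='company']/text()").getall())
    # -> ['Acme', 'Globex'] both times
    # Relative path: searches only under the current row.
    print(row.xpath(".//span[@class='company']/text()").getall())
    # -> ['Acme'], then ['Globex']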

from scrapy.spiders import CrawlSpider
from scrapy.http import Request, Response
from scrapy.linkextractors import LinkExtractor
import time

# IndeedItem comes from your project's items.py; adjust this import to match.
from myproject.items import IndeedItem


class IndeedSpider(CrawlSpider):
    name = "indeed"
    allowed_domains = ["indeed.com"]
    start_urls = [
        "https://www.indeed.com/jobs?q=remote&l=",
    ]

    def start_requests(self):
        for link in IndeedSpider.start_urls:
            yield Request(url=link, callback=self.parse_site)

    def parse_site(self, response: Response):
        extracted_links = LinkExtractor(
            allow=['/jobs.q=linux&l=remote&l$', 'q=linux&l=remote&sort=l&start=[0-9]+$'],
            deny=['/my/mysearches', '/preferences', '/advanced_search', '/my/myjobs']) \
            .extract_links(response)

        for link in extracted_links:
            yield Request(url=link.url, callback=self.parse_item)

    def parse_item(self, response: Response):
        self.log('\n Crawling  %s\n' % response.url)
        sites = response.xpath("//div[@class='row ' or @class='row lastRow']")
        # sites = response.xpath("//div[@class='row ']")
        items = []
        for site in sites:
            item = IndeedItem(company='none')
            # when iterating over selectors, use .// for a relative xpath;
            # normalize-space() returns the link text instead of the whole <a>
            # element, which is what was pulling in the onmouse-click markup.
            item['job_title'] = site.xpath("normalize-space(.//a[has-class('jobtitle')])").get()
            # link not working
            link_url = site.xpath('.//h2/a/@href').get()
            item['link_url'] = link_url
            item['crawl_url'] = response.url
            item['location'] = site.xpath(".//span[has-class('location')]/text()").get()
            # salary returns ''
            item['salary'] = site.xpath(".//span[has-class('salaryText')]").get()
            # Not all entries have a company. got a lot of , '\n
            if not site.xpath(".//span[has-class('company')]/text()").getall():
                item['company'] = [u'']
            else:
                item['company'] = site.xpath(".//span[has-class('company')]/text()").get()
            # Summary seems to work; set it for every row, not only rows with a company.
            item['summary'] = site.xpath(".//div[has-class('summary')]").get()
            item['source'] = site.xpath(".//table/tr/td/span[@class='source']/text()").get()
            item['found_date'] = site.xpath(".//table/tr/td/span[@class='date']/text()").get()
            # item['source_url'] = self.get_source(link_url)
            # .get() returns a single string (or None), so no [0] indexing here
            request = Request("http://www.indeed.com" + item['link_url'], callback=self.parse_next_site)
            request.meta['item'] = item
            yield request
            items.append(item)

    def parse_next_site(self, response: Response):
        item = response.request.meta['item']
        item['source_url'] = response.url
        item['source_page_body'] = response.body
        item['crawl_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
        # The item must be yielded here, or it never reaches the JSON feed export.
        yield item
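For the "Job Type" part of the original question: one option (a minimal sketch, not part of the spider above) is to read the q= search keyword back out of the results URL with the standard library. The job_type field is an assumption here; add it to IndeedItem if you want to store it.

from urllib.parse import parse_qs, urlparse

def job_type_from_url(url):
    # Return the q= search keyword from an indeed.com results URL, or ''.
    query = parse_qs(urlparse(url).query)
    return query.get('q', [''])[0]

# e.g. inside parse_item:
#     item['job_type'] = job_type_from_url(response.url)
print(job_type_from_url("https://www.indeed.com/jobs?q=remote&l="))  # -> 'remote'

Also note that nothing reaches the JSON feed unless the final callback yields the item, which is why parse_next_site above ends with yield item; with that in place, scrapy crawl indeed -o items.json should produce non-empty output.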

Upvotes: 1
