In Scrapy, how to use JSON loaded items to populate new fields?

Question

I am trying to produce columns of data from the LD+JSON metadata found in the HTML of a job listings site. I've used Scrapy Item Loaders to clean the HTML string and convert this metadata to a JSON object. I would then like to use the information contained within that JSON to populate further fields within my crawler.

Here is the spider so far, which crawls the 100 most recent job listings:

import scrapy, json
from ..items import EthjobsScrapyItem, EthJobsLoader

class EthioJobsSpider(scrapy.Spider):
    """Follow every link in the listings table and load each page's LD+JSON."""

    name = "EthioJobs"
    allowed_domains = ["ethiojobs.net"]
    start_urls = ["http://www.ethiojobs.net/latest-jobs-in-ethiopia/?searchId=1573067457.6526&action=search&page=1&listings_per_page=100&view=list"]

    def parse(self, response):
        """Yield one follow-up request per href found under the listings table body."""
        hrefs = response.xpath(
            '/html/body/div[4]/section/div/div/div/div[4]/div/div[1]/div[4]/div/div/div/table/tbody//@href'
        ).getall()
        for href in hrefs:
            yield response.follow(href, callback=self.parse_listing)

    def parse_listing(self, response):
        """Yield a single item whose JSON_LD field comes from the ld+json script text."""
        blank_item = EthjobsScrapyItem()
        item_loader = EthJobsLoader(item=blank_item, response=response)
        item_loader.add_xpath('JSON_LD', '//script[@type="application/ld+json"]/text()')

        yield item_loader.load_item()

where items.py is:

import scrapy, re, json
from scrapy.loader import ItemLoader

class EthjobsScrapyItem(scrapy.Item):
    """Fields collected for a single job listing."""

    # Filled by the loader from the page's LD+JSON script text.
    JSON_LD = scrapy.Field()
    # An example of a field that would be populated from the JSON data.
    datePosted = scrapy.Field()


# First pass: every match is replaced by a single space.
# NOTE(review): this literal was mangled in the source (raw line breaks, a raw
# tab and an empty trailing alternative, which would match at every position).
# It is reconstructed here as CR/LF/TAB runs, double spaces, non-breaking
# spaces, a stray "amp;" and HTML tags -- confirm against the intended pattern.
_JSON_NOISE = re.compile(r"\r+|\n+|\t+|  |&nbsp;|\xa0|amp;|</?[a-zA-Z][^<>]*>")

# Follow-up substitutions, applied in this exact order after the first pass.
_JSON_FOLLOW_UPS = (
    (re.compile(r"Job\sDescription"), ""),
    (re.compile(r"\A\s+"), ""),
    (re.compile(r"( ){2,}"), " "),
    (re.compile(r"\u2019"), " '"),   # right single quotation mark
    (re.compile(r"\u25cf"), " -"),   # black circle bullet
    # NOTE(review): the source literal here was unterminated; a single
    # backslash -> "/" replacement is assumed.
    (re.compile(r"\\"), "/"),
)


def cleanJsonVar(self, jsonvar): # Clean HTML markup
    """Return the first entry of *jsonvar* with markup and noise removed.

    Only the first entry is processed; any further entries are ignored.

    Args:
        self: Unused. Present because the function is assigned as a class
            attribute (``JSON_LD_in``) and is therefore called as a method.
        jsonvar: Iterable of extracted strings, or ``None``.

    Returns:
        The cleaned string, or ``None`` when *jsonvar* is empty/``None`` or
        its first entry is not a string.
    """
    if not jsonvar:
        return None

    first = next(iter(jsonvar), None)
    try:
        cleaned = _JSON_NOISE.sub(" ", first).strip()
    except TypeError as e:
        # Non-string entry: report it and yield nothing rather than crash.
        print("ERROR: ", str(e))
        return None

    for pattern, replacement in _JSON_FOLLOW_UPS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned

def intoJsonVar(self, jsonvar): # Convert from string to JSON
    """Decode the first entry of *jsonvar* with ``json.loads``.

    Returns ``None`` when *jsonvar* yields no entries; later entries are
    never inspected.
    """
    entries = iter(jsonvar)
    try:
        head = next(entries)
    except StopIteration:
        return None
    return json.loads(head)


class EthJobsLoader(ItemLoader):
    """Item loader that attaches the JSON_LD input and output processors."""

    # Input processor: cleans the extracted script text.
    JSON_LD_in = cleanJsonVar
    # Output processor: converts the cleaned string to JSON.
    JSON_LD_out = intoJsonVar

The crawler outputs JSON_LD like so:

{'JSON_LD': ["{
    '@context': 'http://schema.org/',
    '@type': 'JobPosting',
    'title': 'Terms of Reference',
    'description': ' Terms of Reference for developing General Management Plan...,'
    'identifier': {
        '@type': 'PropertyValue',
        'name': 'Population Health and Environment â€“ Ethiopia Consortium (PHE EC)',
        'value': '65264'
    },
    'datePosted': '2019-12-10 04:13:31',
    'validThrough': '2019-12-20 23:59:59',
    'employmentType': 'Full Time',
    'hiringOrganization': {
        '@type': 'Organization',
        'name': 'Population Health and Envir...'
    },
    'jobLocation': {
        '@type': 'Place',
        'address': {
            '@type': 'PostalAddress',
            'addressLocality': 'ETH Region',
            'addressRegion': ' Addis Ababa ',
            'addressCountry': 'ETH'
        }
    }
}"]
}

My question is this: how would I take information from the above JSON and use it to populate new fields in my crawler?

Any and all input/critique is beyond welcome!

In Scrapy, how to use JSON loaded items to populate new fields?

Answers (1)

Related Questions