me.limes
me.limes

Reputation: 471

Scrapy won't follow next page it gives an error

I cannot get any information on the next page and do not understand where I went wrong. I get the following error for the next page follow:

DEBUG: Crawled (204) <GET https://www.cv-library.co.uk/data-jobs?page=2&us=1.html> (referer: https://www.cv-library.co.uk/data-jobs?us=1.html)

Which suggests it has the correct next page, but I get a response 204 for some reason.

Here's my script:

from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

headers = {
    'authority': 'www.cv-library.co.uk',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'tracestate': '2836143@nr=0-1-2836143-225765981-8ed5e144475f5fd9----1641467711944',
    'traceparent': '00-2299c077ba709dbe9305cba0626c6ca0-8ed5e144475f5fd9-01',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjI4MzYxNDMiLCJhcCI6IjIyNTc2NTk4MSIsImlkIjoiOGVkNWUxNDQ0NzVmNWZkOSIsInRyIjoiMjI5OWMwNzdiYTcwOWRiZTkzMDVjYmEwNjI2YzZjYTAiLCJ0aSI6MTY0MTQ2NzcxMTk0NH19',
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://www.cv-library.co.uk/degree-statistics-jobs?page=2&us=1',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cookie': 'referral_affiliate_id_persistent=102595; referral_clickthrough_id=745498790; _gid=GA1.3.262155324.1641395920; _gac_UA-23741307-1=1.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_aw=GCL.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_au=1.1.1766125280.1641395920; _gaexp=GAX1.3.nAurI6Y-TM6B_OWpvrCV0Q.19075.0; gdpr-auditId=ab7a11f410b546fca190d1878f18f3d2; geo-location={"country":"GB","region":"ENG"}; _lr_geo_location=GB; r3sess6876=cbc722437f09cd32971f8123baf6f760; R3_SESSID_JS=cbc722437f09cd32971f8123baf6f760; gdpr-dau-log-sent=true; gdpr-last-interaction=1641395937.728; gdpr-config-version=66; euconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAH_AAAAAAAAMZYDyFyIgkKC4NCSAQYIgAgsgiAgAsAAEAACAAAAAAAAQQAAAABAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAIAABAAAAAAgBAAAAAAQAA.YAAAAagAA4Dk; cconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAOAAAAAAAAAAAAGhAAonpRPTyepJ60T14n4wwAYTjAnpRPTyepJ60T14n4xAAUT0onp5PVE9aJ68T8YoACJ6sYADE4wJ6UT8Y4AIJxgT08nqSetE9eQAAicYJAAROMFAAInGGQAiTixOMiccE74J50T0knpietk9eT2Qn42gAeTixOMiccE74J52T0hPWyevJ7IT8bYAPJxYnGROOCd8E87J6QnrZPXk9kJ-JwAHk4sTjInHBO-Cedk9IT1snryeyE_G.YAAAHIAAAAA; gdpr-dau=true; __Host-session=c86bc25441c240f9a30bbac8daffb0f6; recently_viewed_jobs=215368291; r3engage=true; tempbasket=1641467697195181162186; cookiecheck=1; PSGI-XSRF-Token=f45f1f1dada8d70820ad717ae9a88d5830b5b17e; _sp_ses.4b8c=*; _gat_UA-23741307-1=1; cto_bundle=skLHfl9MazFaR1lmYVZUaFdBOE8zVE5jbUNGZ200U3AwY3JvcVJBTEJEZFkwdWczVzFuVkIxUFEzQzNZbzliaXZrMVp4M05FVmxlNzdzMTQyRkR2SmpZUERNQVJZRFhZSFowU1lMdTVPOWdLMGltR1J2UENLV1BGJTJCREU5SG9wRzh4MFQ5NzNMZGZEc3BmMGhpbjElMkJUTDByVXpBJTNEJTNE; datadome=.9o_rHEVgkEOG7FFOM0MYLi3DzvqPsgg_l.OVD0sqEJoRSY4kXEfneqL_YOTaLxLRD9fVQidW_BGt7O3V1bd2v0p6OtCgqV5IzXVFu5EWNuNvqXckK~n7Hn8L_fX4vYc; session=1641467710.76267%3ABQsDAAAAAA%3D%3D%3A991145cf3c65c2e16fe7eed48adf30d7f869f4bc; _sp_id.4b8c=84ded5d4-a8e7-46e4-aa9a-bdcdebdbf52e.1641395920.4.1641467712.1641408229.ff300996-fc3a-4f77-b2c7-435969eeec12; _ga_8R43H4HVGM=GS1.1.1641467699.4.1.1641467711.48; _ga=GA1.1.1969297745.1641395920',
}

class CvItem(scrapy.Item):
    category = Field(output_processor = TakeFirst())
    salary = Field(output_processor = TakeFirst())
    title =  Field(output_processor = TakeFirst())
    organisation = Field(output_processor = TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    degree_data = pd.read_csv('/Users/indeed/indeed/degree_names2.csv')
    start_urls = defaultdict(list)

    custom_settings = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY':2,
        }

    def start_requests(self):
        yield scrapy.Request(
            url = 'https://www.cv-library.co.uk/data-jobs?us=1.html', 
            callback = self.parse,
            headers=headers,
            dont_filter = True,
            cb_kwargs = {
                'page_counter':0,
            })

    def parse(self, response, page_counter):
        if page_counter > 300:
            return
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector = lists)
            loader.add_xpath('title', '//li/article[@id]//a[@title]/@title')
            loader.add_xpath('salary', '//li/article[@id]//dl//dd[@class="job__details-value salary"]//text()')
            loader.add_xpath('organisation', '//li/article[@id]/div//div/p/a//text()')
            yield loader.load_item()

        next_page = response.xpath('//a[@class="pagination__next"]/@href').get()
        page_counter += 1
        if next_page:
            yield response.follow(
            next_page,
            callback = self.parse,
            cb_kwargs = {
                'page_counter':page_counter
                })
process = CrawlerProcess(
    settings = {
        'FEED_URI':'cv_jobs2.jl',
        'FEED_FORMAT':'jsonlines'
    })
process.crawl(CvSpider)
process.start()

Upvotes: 0

Views: 125

Answers (1)

SuperUser
SuperUser

Reputation: 4822

You also need the headers in response.follow

yield response.follow(
next_page,
headers=headers,
callback = self.parse,
cb_kwargs = {
    'page_counter':page_counter
    })

EDIT:

from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict

headers = {
    'authority': 'www.cv-library.co.uk',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'tracestate': '2836143@nr=0-1-2836143-225765981-8ed5e144475f5fd9----1641467711944',
    'traceparent': '00-2299c077ba709dbe9305cba0626c6ca0-8ed5e144475f5fd9-01',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjI4MzYxNDMiLCJhcCI6IjIyNTc2NTk4MSIsImlkIjoiOGVkNWUxNDQ0NzVmNWZkOSIsInRyIjoiMjI5OWMwNzdiYTcwOWRiZTkzMDVjYmEwNjI2YzZjYTAiLCJ0aSI6MTY0MTQ2NzcxMTk0NH19',
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://www.cv-library.co.uk/degree-statistics-jobs?page=2&us=1',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cookie': 'referral_affiliate_id_persistent=102595; referral_clickthrough_id=745498790; _gid=GA1.3.262155324.1641395920; _gac_UA-23741307-1=1.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_aw=GCL.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_au=1.1.1766125280.1641395920; _gaexp=GAX1.3.nAurI6Y-TM6B_OWpvrCV0Q.19075.0; gdpr-auditId=ab7a11f410b546fca190d1878f18f3d2; geo-location={"country":"GB","region":"ENG"}; _lr_geo_location=GB; r3sess6876=cbc722437f09cd32971f8123baf6f760; R3_SESSID_JS=cbc722437f09cd32971f8123baf6f760; gdpr-dau-log-sent=true; gdpr-last-interaction=1641395937.728; gdpr-config-version=66; euconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAH_AAAAAAAAMZYDyFyIgkKC4NCSAQYIgAgsgiAgAsAAEAACAAAAAAAAQQAAAABAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAIAABAAAAAAgBAAAAAAQAA.YAAAAagAA4Dk; cconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAOAAAAAAAAAAAAGhAAonpRPTyepJ60T14n4wwAYTjAnpRPTyepJ60T14n4xAAUT0onp5PVE9aJ68T8YoACJ6sYADE4wJ6UT8Y4AIJxgT08nqSetE9eQAAicYJAAROMFAAInGGQAiTixOMiccE74J50T0knpietk9eT2Qn42gAeTixOMiccE74J52T0hPWyevJ7IT8bYAPJxYnGROOCd8E87J6QnrZPXk9kJ-JwAHk4sTjInHBO-Cedk9IT1snryeyE_G.YAAAHIAAAAA; gdpr-dau=true; __Host-session=c86bc25441c240f9a30bbac8daffb0f6; recently_viewed_jobs=215368291; r3engage=true; tempbasket=1641467697195181162186; cookiecheck=1; PSGI-XSRF-Token=f45f1f1dada8d70820ad717ae9a88d5830b5b17e; _sp_ses.4b8c=*; _gat_UA-23741307-1=1; cto_bundle=skLHfl9MazFaR1lmYVZUaFdBOE8zVE5jbUNGZ200U3AwY3JvcVJBTEJEZFkwdWczVzFuVkIxUFEzQzNZbzliaXZrMVp4M05FVmxlNzdzMTQyRkR2SmpZUERNQVJZRFhZSFowU1lMdTVPOWdLMGltR1J2UENLV1BGJTJCREU5SG9wRzh4MFQ5NzNMZGZEc3BmMGhpbjElMkJUTDByVXpBJTNEJTNE; datadome=.9o_rHEVgkEOG7FFOM0MYLi3DzvqPsgg_l.OVD0sqEJoRSY4kXEfneqL_YOTaLxLRD9fVQidW_BGt7O3V1bd2v0p6OtCgqV5IzXVFu5EWNuNvqXckK~n7Hn8L_fX4vYc; session=1641467710.76267%3ABQsDAAAAAA%3D%3D%3A991145cf3c65c2e16fe7eed48adf30d7f869f4bc; _sp_id.4b8c=84ded5d4-a8e7-46e4-aa9a-bdcdebdbf52e.1641395920.4.1641467712.1641408229.ff300996-fc3a-4f77-b2c7-435969eeec12; _ga_8R43H4HVGM=GS1.1.1641467699.4.1.1641467711.48; _ga=GA1.1.1969297745.1641395920',
}

class CvItem(scrapy.Item):
    category = Field(output_processor = TakeFirst())
    salary = Field(output_processor = TakeFirst())
    title =  Field(output_processor = TakeFirst())
    organisation = Field(output_processor = TakeFirst())

class CvSpider(scrapy.Spider):
    name = 'cv'
    degree_data = pd.read_csv('/Users/indeed/indeed/degree_names2.csv')
    start_urls = defaultdict(list)

    custom_settings = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY':2,
    }

    def start_requests(self):
        yield scrapy.Request(
            url = 'https://www.cv-library.co.uk/data-jobs?us=1.html',
            callback = self.parse,
            headers=headers,
            dont_filter = True,
            cb_kwargs = {
                'page_counter':0,
            })

    def parse(self, response, page_counter):
        if page_counter > 300:
            return
        container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
        for lists in container:
            loader = ItemLoader(CvItem(), selector=lists)
            loader.add_xpath('title', './/article//a[@title]/@title')
            loader.add_xpath('salary', './/dd[contains(@class, "salary")]/text()')
            loader.add_xpath('organisation', './/a[@class="job__company-link"]/text()')
            yield loader.load_item()

        next_page = response.xpath('//a[@class="pagination__next"]/@href').get()
        page_counter += 1
        if next_page:
            yield response.follow(
                next_page,
                headers=headers,
                callback = self.parse,
                cb_kwargs = {
                    'page_counter':page_counter
                })
process = CrawlerProcess(
    settings = {
        'FEED_URI':'cv_jobs2.jl',
        'FEED_FORMAT':'jsonlines'
    })
process.crawl(CvSpider)
process.start()

Upvotes: 1

Related Questions