Reputation: 471
I cannot get any information on the next page and do not understand where I went wrong. I get the following error for the next page follow:
DEBUG: Crawled (204) <GET https://www.cv-library.co.uk/data-jobs?page=2&us=1.html> (referer: https://www.cv-library.co.uk/data-jobs?us=1.html)
Which suggests it has the correct next page, but I get a response 204 for some reason.
Here's my script:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict
headers = {
'authority': 'www.cv-library.co.uk',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
'tracestate': '2836143@nr=0-1-2836143-225765981-8ed5e144475f5fd9----1641467711944',
'traceparent': '00-2299c077ba709dbe9305cba0626c6ca0-8ed5e144475f5fd9-01',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjI4MzYxNDMiLCJhcCI6IjIyNTc2NTk4MSIsImlkIjoiOGVkNWUxNDQ0NzVmNWZkOSIsInRyIjoiMjI5OWMwNzdiYTcwOWRiZTkzMDVjYmEwNjI2YzZjYTAiLCJ0aSI6MTY0MTQ2NzcxMTk0NH19',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.cv-library.co.uk/degree-statistics-jobs?page=2&us=1',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cookie': 'referral_affiliate_id_persistent=102595; referral_clickthrough_id=745498790; _gid=GA1.3.262155324.1641395920; _gac_UA-23741307-1=1.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_aw=GCL.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_au=1.1.1766125280.1641395920; _gaexp=GAX1.3.nAurI6Y-TM6B_OWpvrCV0Q.19075.0; gdpr-auditId=ab7a11f410b546fca190d1878f18f3d2; geo-location={"country":"GB","region":"ENG"}; _lr_geo_location=GB; r3sess6876=cbc722437f09cd32971f8123baf6f760; R3_SESSID_JS=cbc722437f09cd32971f8123baf6f760; gdpr-dau-log-sent=true; gdpr-last-interaction=1641395937.728; gdpr-config-version=66; euconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAH_AAAAAAAAMZYDyFyIgkKC4NCSAQYIgAgsgiAgAsAAEAACAAAAAAAAQQAAAABAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAIAABAAAAAAgBAAAAAAQAA.YAAAAagAA4Dk; cconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAOAAAAAAAAAAAAGhAAonpRPTyepJ60T14n4wwAYTjAnpRPTyepJ60T14n4xAAUT0onp5PVE9aJ68T8YoACJ6sYADE4wJ6UT8Y4AIJxgT08nqSetE9eQAAicYJAAROMFAAInGGQAiTixOMiccE74J50T0knpietk9eT2Qn42gAeTixOMiccE74J52T0hPWyevJ7IT8bYAPJxYnGROOCd8E87J6QnrZPXk9kJ-JwAHk4sTjInHBO-Cedk9IT1snryeyE_G.YAAAHIAAAAA; gdpr-dau=true; __Host-session=c86bc25441c240f9a30bbac8daffb0f6; recently_viewed_jobs=215368291; r3engage=true; tempbasket=1641467697195181162186; cookiecheck=1; PSGI-XSRF-Token=f45f1f1dada8d70820ad717ae9a88d5830b5b17e; _sp_ses.4b8c=*; _gat_UA-23741307-1=1; cto_bundle=skLHfl9MazFaR1lmYVZUaFdBOE8zVE5jbUNGZ200U3AwY3JvcVJBTEJEZFkwdWczVzFuVkIxUFEzQzNZbzliaXZrMVp4M05FVmxlNzdzMTQyRkR2SmpZUERNQVJZRFhZSFowU1lMdTVPOWdLMGltR1J2UENLV1BGJTJCREU5SG9wRzh4MFQ5NzNMZGZEc3BmMGhpbjElMkJUTDByVXpBJTNEJTNE; datadome=.9o_rHEVgkEOG7FFOM0MYLi3DzvqPsgg_l.OVD0sqEJoRSY4kXEfneqL_YOTaLxLRD9fVQidW_BGt7O3V1bd2v0p6OtCgqV5IzXVFu5EWNuNvqXckK~n7Hn8L_fX4vYc; session=1641467710.76267%3ABQsDAAAAAA%3D%3D%3A991145cf3c65c2e16fe7eed48adf30d7f869f4bc; _sp_id.4b8c=84ded5d4-a8e7-46e4-aa9a-bdcdebdbf52e.1641395920.4.1641467712.1641408229.ff300996-fc3a-4f77-b2c7-435969eeec12; _ga_8R43H4HVGM=GS1.1.1641467699.4.1.1641467711.48; _ga=GA1.1.1969297745.1641395920',
}
class CvItem(scrapy.Item):
category = Field(output_processor = TakeFirst())
salary = Field(output_processor = TakeFirst())
title = Field(output_processor = TakeFirst())
organisation = Field(output_processor = TakeFirst())
class CvSpider(scrapy.Spider):
name = 'cv'
degree_data = pd.read_csv('/Users/indeed/indeed/degree_names2.csv')
start_urls = defaultdict(list)
custom_settings = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'DOWNLOAD_DELAY':2,
}
def start_requests(self):
yield scrapy.Request(
url = 'https://www.cv-library.co.uk/data-jobs?us=1.html',
callback = self.parse,
headers=headers,
dont_filter = True,
cb_kwargs = {
'page_counter':0,
})
def parse(self, response, page_counter):
if page_counter > 300:
return
container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
for lists in container:
loader = ItemLoader(CvItem(), selector = lists)
loader.add_xpath('title', '//li/article[@id]//a[@title]/@title')
loader.add_xpath('salary', '//li/article[@id]//dl//dd[@class="job__details-value salary"]//text()')
loader.add_xpath('organisation', '//li/article[@id]/div//div/p/a//text()')
yield loader.load_item()
next_page = response.xpath('//a[@class="pagination__next"]/@href').get()
page_counter += 1
if next_page:
yield response.follow(
next_page,
callback = self.parse,
cb_kwargs = {
'page_counter':page_counter
})
process = CrawlerProcess(
settings = {
'FEED_URI':'cv_jobs2.jl',
'FEED_FORMAT':'jsonlines'
})
process.crawl(CvSpider)
process.start()
Upvotes: 0
Views: 125
Reputation: 4822
You also need the headers in response.follow
yield response.follow(
next_page,
headers=headers,
callback = self.parse,
cb_kwargs = {
'page_counter':page_counter
})
EDIT:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst
import pandas as pd
from collections import defaultdict
headers = {
'authority': 'www.cv-library.co.uk',
'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
'tracestate': '2836143@nr=0-1-2836143-225765981-8ed5e144475f5fd9----1641467711944',
'traceparent': '00-2299c077ba709dbe9305cba0626c6ca0-8ed5e144475f5fd9-01',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'newrelic': 'eyJ2IjpbMCwxXSwiZCI6eyJ0eSI6IkJyb3dzZXIiLCJhYyI6IjI4MzYxNDMiLCJhcCI6IjIyNTc2NTk4MSIsImlkIjoiOGVkNWUxNDQ0NzVmNWZkOSIsInRyIjoiMjI5OWMwNzdiYTcwOWRiZTkzMDVjYmEwNjI2YzZjYTAiLCJ0aSI6MTY0MTQ2NzcxMTk0NH19',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-platform': '"macOS"',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.cv-library.co.uk/degree-statistics-jobs?page=2&us=1',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'cookie': 'referral_affiliate_id_persistent=102595; referral_clickthrough_id=745498790; _gid=GA1.3.262155324.1641395920; _gac_UA-23741307-1=1.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_aw=GCL.1641395920.Cj0KCQiAoNWOBhCwARIsAAiHnEi237VKWkFgXeDITTNqjqyhKJ9nWemzxLGMWGQ-Z5fFW98E51O-F9UaAnQYEALw_wcB; _gcl_au=1.1.1766125280.1641395920; _gaexp=GAX1.3.nAurI6Y-TM6B_OWpvrCV0Q.19075.0; gdpr-auditId=ab7a11f410b546fca190d1878f18f3d2; geo-location={"country":"GB","region":"ENG"}; _lr_geo_location=GB; r3sess6876=cbc722437f09cd32971f8123baf6f760; R3_SESSID_JS=cbc722437f09cd32971f8123baf6f760; gdpr-dau-log-sent=true; gdpr-last-interaction=1641395937.728; gdpr-config-version=66; euconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAH_AAAAAAAAMZYDyFyIgkKC4NCSAQYIgAgsgiAgAsAAEAACAAAAAAAAQQAAAABAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAIAABAAAAAAgBAAAAAAQAA.YAAAAagAA4Dk; cconsent-v2=CPSWSQfPSWSTRADABCENB8CgAAAAAOAAAAAAAAAAAAGhAAonpRPTyepJ60T14n4wwAYTjAnpRPTyepJ60T14n4xAAUT0onp5PVE9aJ68T8YoACJ6sYADE4wJ6UT8Y4AIJxgT08nqSetE9eQAAicYJAAROMFAAInGGQAiTixOMiccE74J50T0knpietk9eT2Qn42gAeTixOMiccE74J52T0hPWyevJ7IT8bYAPJxYnGROOCd8E87J6QnrZPXk9kJ-JwAHk4sTjInHBO-Cedk9IT1snryeyE_G.YAAAHIAAAAA; gdpr-dau=true; __Host-session=c86bc25441c240f9a30bbac8daffb0f6; recently_viewed_jobs=215368291; r3engage=true; tempbasket=1641467697195181162186; cookiecheck=1; PSGI-XSRF-Token=f45f1f1dada8d70820ad717ae9a88d5830b5b17e; _sp_ses.4b8c=*; _gat_UA-23741307-1=1; cto_bundle=skLHfl9MazFaR1lmYVZUaFdBOE8zVE5jbUNGZ200U3AwY3JvcVJBTEJEZFkwdWczVzFuVkIxUFEzQzNZbzliaXZrMVp4M05FVmxlNzdzMTQyRkR2SmpZUERNQVJZRFhZSFowU1lMdTVPOWdLMGltR1J2UENLV1BGJTJCREU5SG9wRzh4MFQ5NzNMZGZEc3BmMGhpbjElMkJUTDByVXpBJTNEJTNE; datadome=.9o_rHEVgkEOG7FFOM0MYLi3DzvqPsgg_l.OVD0sqEJoRSY4kXEfneqL_YOTaLxLRD9fVQidW_BGt7O3V1bd2v0p6OtCgqV5IzXVFu5EWNuNvqXckK~n7Hn8L_fX4vYc; session=1641467710.76267%3ABQsDAAAAAA%3D%3D%3A991145cf3c65c2e16fe7eed48adf30d7f869f4bc; _sp_id.4b8c=84ded5d4-a8e7-46e4-aa9a-bdcdebdbf52e.1641395920.4.1641467712.1641408229.ff300996-fc3a-4f77-b2c7-435969eeec12; _ga_8R43H4HVGM=GS1.1.1641467699.4.1.1641467711.48; _ga=GA1.1.1969297745.1641395920',
}
class CvItem(scrapy.Item):
category = Field(output_processor = TakeFirst())
salary = Field(output_processor = TakeFirst())
title = Field(output_processor = TakeFirst())
organisation = Field(output_processor = TakeFirst())
class CvSpider(scrapy.Spider):
name = 'cv'
degree_data = pd.read_csv('/Users/indeed/indeed/degree_names2.csv')
start_urls = defaultdict(list)
custom_settings = {
'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'DOWNLOAD_DELAY':2,
}
def start_requests(self):
yield scrapy.Request(
url = 'https://www.cv-library.co.uk/data-jobs?us=1.html',
callback = self.parse,
headers=headers,
dont_filter = True,
cb_kwargs = {
'page_counter':0,
})
def parse(self, response, page_counter):
if page_counter > 300:
return
container = response.xpath('//ol[@id="searchResults"]//li[@class="results__item"]')
for lists in container:
loader = ItemLoader(CvItem(), selector=lists)
loader.add_xpath('title', './/article//a[@title]/@title')
loader.add_xpath('salary', './/dd[contains(@class, "salary")]/text()')
loader.add_xpath('organisation', './/a[@class="job__company-link"]/text()')
yield loader.load_item()
next_page = response.xpath('//a[@class="pagination__next"]/@href').get()
page_counter += 1
if next_page:
yield response.follow(
next_page,
headers=headers,
callback = self.parse,
cb_kwargs = {
'page_counter':page_counter
})
process = CrawlerProcess(
settings = {
'FEED_URI':'cv_jobs2.jl',
'FEED_FORMAT':'jsonlines'
})
process.crawl(CvSpider)
process.start()
Upvotes: 1