Reputation: 1
I am currently building a scraper that collects all takeaway information from JustEat.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

from ..items import JusteatItem
class JustEat(scrapy.Spider):
    name = 'completeJE'
    start_urls = ['https://www.just-eat.co.uk/takeaway']

    def parse(self, response):
        # Follow every area link in the first "groups" block.
        location_urls = response.css('div.groups')[0]
        for href in location_urls.css('a.link::attr(href)').extract():
            yield scrapy.Request('https://www.just-eat.co.uk' + href,
                                 callback=self.parse_postcodes)

    def parse_postcodes(self, response):
        # Each area page lists absolute links to individual postcode pages.
        for href in response.css('ul.group-links a::attr(href)').extract():
            yield scrapy.Request(href, callback=self.parse_takeaway_links)

    def parse_takeaway_links(self, response):
        # The listing page lazy-loads results, so open it in a real browser
        # and press End repeatedly to force every takeaway to render.
        self.driver = webdriver.Chrome()
        self.driver.get(response.url)
        for _ in range(1, 100):
            page = self.driver.find_element(By.TAG_NAME, 'html')
            page.send_keys(Keys.END)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        url_all = soup.find_all('a', class_='c-listing-item-link u-clearfix', href=True)
        self.driver.close()
        for a in url_all:
            yield scrapy.Request('https://www.just-eat.co.uk' + a['href'],
                                 callback=self.parse_info)

    def parse_info(self, response):
        # Pull the takeaway's details off its own page.
        takeaway = JusteatItem()
        takeaway['title'] = response.css('h1.name::text').extract()
        takeaway['street'] = response.css('#street::text').extract()
        takeaway['city'] = response.css('#city::text').extract()
        takeaway['postcode'] = response.css('#postcode::text').extract()
        takeaway['num_rating'] = response.css('.rating a::text').extract()
        takeaway['cuisine'] = response.css('.cuisines span::text').extract()
        yield takeaway
However, my internet connection is sometimes unreliable, so I would like to add a fail-safe in case the network cuts out while the script is running.
These are the ideas I have:
But preferably the first one. Thanks in advance.
Upvotes: 0
Views: 91
Reputation: 10666
There is a built-in Scrapy setting that works almost exactly like you want: RETRY_TIMES. It tells Scrapy's built-in RetryMiddleware how many extra times to retry a request that failed with a network error (or a retryable HTTP status) before giving up on it.
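A minimal sketch of how it could be set on your spider via custom_settings (the values are illustrative, tune them to taste):

class JustEat(scrapy.Spider):
    name = 'completeJE'

    custom_settings = {
        'RETRY_ENABLED': True,   # retries are handled by the built-in RetryMiddleware
        'RETRY_TIMES': 5,        # retry each failed request up to 5 extra times
        # Retry these HTTP codes in addition to network-level errors:
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 522, 524, 408, 429],
        'DOWNLOAD_TIMEOUT': 30,  # treat very slow responses as failures so they get retried
    }

Bear in mind this only covers requests that go through Scrapy's downloader; the pages you fetch with the Selenium driver in parse_takeaway_links would need their own error handling.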
Upvotes: 1