Aman Praveen

Reputation: 1

Fail-safe for when I have a network error whilst scraping

I am currently making a scraper which scrapes all takeaway information from JustEat.

import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from ..items import JusteatItem


class JustEat(scrapy.Spider):
    name = 'completeJE'
    start_urls = ['https://www.just-eat.co.uk/takeaway']

    def parse(self, response):
        # Collect the per-location links from the first "groups" block.
        location_urls = response.css('div.groups')[0]
        location_url = location_urls.css('a.link::attr(href)').extract()
        lop = ['https://www.just-eat.co.uk' + lurls for lurls in location_url]

        for url in lop:
            yield scrapy.Request(url, callback=self.parse_postcodes)

    def parse_postcodes(self, response):
        # Each location page links out to one page per postcode area.
        loolink = response.css('ul.group-links a::attr(href)').extract()

        for loo in loolink:
            yield scrapy.Request(loo, callback=self.parse_takeaway_links)

    def parse_takeaway_links(self, response):
        # The listing lazy-loads as you scroll, so drive a real browser
        # to the bottom of the page before reading its source.
        self.driver = webdriver.Chrome()
        self.driver.get(response.url)
        for i in range(1, 100):
            page = self.driver.find_element_by_tag_name('html')
            page.send_keys(Keys.END)

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        url_all = soup.find_all('a', class_='c-listing-item-link u-clearfix', href=True)
        url_list = ['https://www.just-eat.co.uk' + a['href'] for a in url_all]
        self.driver.quit()  # quit() ends the session; close() only closes the window
        for ls in url_list:
            yield scrapy.Request(ls, callback=self.parse_info)

    def parse_info(self, response):
        # One takeaway per page: copy each field into the item.
        takeaway = JusteatItem()
        takeaway['title'] = response.css('h1.name::text').extract()
        takeaway['street'] = response.css('#street::text').extract()
        takeaway['city'] = response.css('#city::text').extract()
        takeaway['postcode'] = response.css('#postcode::text').extract()
        takeaway['num_rating'] = response.css('.rating a::text').extract()
        takeaway['cuisine'] = response.css('.cuisines span::text').extract()
        yield takeaway

However, my internet connection is sometimes unreliable, so I would like to add a fail-safe in case the network cuts out whilst the script is running.

These are the ideas I have:

But preferably the first one; a rough sketch of the kind of fail-safe I mean is below. Thanks in advance.
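This is only an untested sketch of the errback idea; the handler name errback_retry and the set of exceptions to retry on are just my own assumptions:

from twisted.internet.error import TimeoutError, DNSLookupError, ConnectionRefusedError

# Inside the spider class; requests would be yielded with an errback, e.g.
#   yield scrapy.Request(url, callback=self.parse_postcodes,
#                        errback=self.errback_retry)

def errback_retry(self, failure):
    # Hypothetical handler: log the failure and, if it was a
    # network-level error, re-yield the same request so the crawl
    # can pick up again once the connection comes back.
    # (In practice you would cap how many times this can happen.)
    self.logger.error(repr(failure))
    if failure.check(TimeoutError, DNSLookupError, ConnectionRefusedError):
        # dont_filter=True stops the duplicate filter eating the retry
        yield failure.request.replace(dont_filter=True)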

Upvotes: 0

Views: 91

Answers (1)

gangabass

Reputation: 10666

There is a built-in Scrapy setting that does almost exactly what you want: RETRY_TIMES
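For example, in your project's settings.py (the values here are illustrative, not a recommendation):

# settings.py -- retry failed requests rather than dropping them
RETRY_ENABLED = True   # already the default
RETRY_TIMES = 10       # extra attempts after the first failure
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

The retry middleware also retries connection-level failures (timeouts, DNS lookup errors and the like), which is exactly your network-dropout case, and the same keys can be set per spider via custom_settings.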

Upvotes: 1
