Reputation: 1
I am currently building a scraper that collects all takeaway information from JustEat.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

from ..items import JusteatItem
class JustEat(scrapy.Spider):
    name = 'completeJE'
    start_urls = ['https://www.just-eat.co.uk/takeaway']

    def parse(self, response):
        # Follow every area link in the first "groups" block.
        location_urls = response.css('div.groups')[0]
        for href in location_urls.css('a.link::attr(href)').extract():
            yield scrapy.Request('https://www.just-eat.co.uk' + href,
                                 callback=self.parse_postcodes)

    def parse_postcodes(self, response):
        # Each area page lists absolute links to individual postcode pages.
        for href in response.css('ul.group-links a::attr(href)').extract():
            yield scrapy.Request(href, callback=self.parse_takeaway_links)

    def parse_takeaway_links(self, response):
        # The listing page lazy-loads results, so open it in a real browser
        # and press End repeatedly to force every takeaway to render.
        self.driver = webdriver.Chrome()
        self.driver.get(response.url)
        for _ in range(1, 100):
            page = self.driver.find_element(By.TAG_NAME, 'html')
            page.send_keys(Keys.END)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        url_all = soup.find_all('a', class_='c-listing-item-link u-clearfix', href=True)
        self.driver.close()
        for a in url_all:
            yield scrapy.Request('https://www.just-eat.co.uk' + a['href'],
                                 callback=self.parse_info)

    def parse_info(self, response):
        # Pull the takeaway's details off its own page.
        takeaway = JusteatItem()
        takeaway['title'] = response.css('h1.name::text').extract()
        takeaway['street'] = response.css('#street::text').extract()
        takeaway['city'] = response.css('#city::text').extract()
        takeaway['postcode'] = response.css('#postcode::text').extract()
        takeaway['num_rating'] = response.css('.rating a::text').extract()
        takeaway['cuisine'] = response.css('.cuisines span::text').extract()
        yield takeaway
However, my internet connection is sometimes unreliable, so I would like to add a fail-safe in case the network cuts out while the script is running.
These are the ideas I have:
But preferably the first one. Thanks in advance.
Upvotes: 0
Views: 91
Reputation: 10666
There is a built-in Scrapy setting that works almost exactly like you want: RETRY_TIMES. It tells Scrapy's built-in RetryMiddleware how many extra times to retry a request that failed with a network error (or a retryable HTTP status) before giving up on it.
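A minimal sketch of how it could be set on your spider via custom_settings (the values are illustrative, tune them to taste):

class JustEat(scrapy.Spider):
    name = 'completeJE'

    custom_settings = {
        'RETRY_ENABLED': True,   # retries are handled by the built-in RetryMiddleware
        'RETRY_TIMES': 5,        # retry each failed request up to 5 extra times
        # Retry these HTTP codes in addition to network-level errors:
        'RETRY_HTTP_CODES': [500, 502, 503, 504, 522, 524, 408, 429],
        'DOWNLOAD_TIMEOUT': 30,  # treat very slow responses as failures so they get retried
    }

Bear in mind this only covers requests that go through Scrapy's downloader; the pages you fetch with the Selenium driver in parse_takeaway_links would need their own error handling.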
Upvotes: 1