Reputation: 1
I have been trying to implement a Yelp scraper to collect restaurant reviews, but I keep getting download and timeout errors. I feed in the restaurant's base URL and read the number of reviews so that I can calculate how many requests I need to make, since Yelp paginates its reviews, to fetch all of them. I run the spider with the command scrapy crawl yelp -o reviews.csv
to get a CSV output of the reviews. I have tried decreasing CONCURRENT_REQUESTS
in settings.py
, and I have even tried serializing the requests rather than running them concurrently, but had no luck. I also tried changing the timeout delays and it still failed. Any suggestions as to how I could fix this?
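For reference, the throttling-related values I have been experimenting with in settings.py look roughly like the sketch below. The exact numbers varied between runs, so treat them as an illustration rather than my final configuration; the scrapy-playwright handler settings are the ones implied by the playwright meta keys used in the spider.

# settings.py -- approximate values I have been adjusting (illustrative only)
CONCURRENT_REQUESTS = 1                # lowered from the default 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1     # effectively serializes requests to yelp.com
DOWNLOAD_DELAY = 5                     # spread requests out in time
DOWNLOAD_TIMEOUT = 60                  # raised the per-request timeout
RETRY_TIMES = 5                        # retry failed / timed-out requests more often

# scrapy-playwright wiring (assumed, since the spider sets 'playwright' in request meta)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000  # in milliseconds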
Here is my spider implementation:
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class YelpSpider(scrapy.Spider):
    name = 'yelp'

    HEADERS = {
        "user-agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
        ),
        "referer": None,
    }
    page_number = 0

    def start_requests(self):
        # Entry point: request the restaurant page sorted by newest reviews first.
        urls = [
            'https://www.yelp.com/biz/tulsi-indian-eatery-westwood-los-angeles?sort_by=date_desc',
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.page_breakdown,
                meta={
                    'playwright': True,
                    'playwright_include_page': True,
                },
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def page_breakdown(self, response):
        # Work out how many paginated review pages exist (10 reviews per page)
        # and queue a request for each page offset.
        number_of_reviews = 158  # response.xpath("//div[@class='arrange-unit__09f24__rqHTg css-73a431']/div/span/div[2]/p/text()").get().split(" ")[0]
        number_of_pages = round(int(number_of_reviews) / 10)
        for i in range(0, number_of_pages):
            yield scrapy.Request(
                url=response.url + f'&start={i * 10}',
                callback=self.parse_reviews,
                meta={
                    'playwright': True,
                    'playwright_include_page': True,
                },
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def errback_httpbin(self, failure):
        # Log the failure type; on a timeout, re-queue the same URL.
        self.logger.error(repr(failure))
        print("Reached Error function: ", repr(failure))
        if failure.check(HttpError):
            response = failure.value.response
            print("a_debug: HttpError on %s" % response.url)
            self.logger.error("HttpError on %s", response.url)
        elif failure.check(DNSLookupError):
            request = failure.request
            print("a_debug: DNSLookupError on %s" % request.url)
            self.logger.error("DNSLookupError on %s", request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            yield scrapy.Request(
                url=request.url,
                callback=self.parse_reviews,
                meta={
                    'playwright': True,
                    'playwright_include_page': True,
                },
                errback=self.errback_httpbin,
                dont_filter=True,
            )
            print("a_debug: TimeoutError on %s" % request.url)
            self.logger.error("a_debug: TimeoutError on %s", request.url)

    def parse_reviews(self, response):
        # Extract reviewer, review text, rating, and date from each review on the page.
        print("Reached Parsed Reviews")
        all_reviews = response.xpath("//*[@id='reviews']/section/div[2]/ul/li")
        print("a_debug: url: ", response.url)
        for review in all_reviews:
            yield {
                "reviewer": review.xpath(
                    ".//*[starts-with(@class, 'user-passport-info')]/span/a/text()"
                ).get(),
                "descr": review.css("span.raw__09f24__T4Ezm::text").extract_first(),
                "rating": review.xpath(
                    ".//div[@class='css-14g69b3']/@aria-label"
                ).get().split(" ")[0],
                "date": review.css("span.css-chan6m::text").extract_first(),
            }
Here is the link to the log file: https://drive.google.com/file/d/1PHhTxiG-bCzgMjo5iMy9I-oOqdWmRVZ3/view?usp=sharing
Upvotes: 0
Views: 173