Reputation: 1
I have been trying to implement a Yelp scraper to collect restaurant reviews, but I keep getting download and timeout errors. I feed in the restaurant's base URL and read the number of reviews so that I can calculate how many requests I need to make, since Yelp paginates its reviews, to fetch all of them. I run the spider with the command scrapy crawl yelp -o reviews.csv
to get a CSV output of the reviews. I have tried decreasing CONCURRENT_REQUESTS
in settings.py
, and I have even tried serializing the requests rather than running them concurrently, but had no luck. I also tried changing the timeout delays and it still failed. Any suggestions as to how I could fix this?
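For reference, the throttling-related values I have been experimenting with in settings.py look roughly like the sketch below. The exact numbers varied between runs, so treat them as an illustration rather than my final configuration; the scrapy-playwright handler settings are the ones implied by the playwright meta keys used in the spider.

# settings.py -- approximate values I have been adjusting (illustrative only)
CONCURRENT_REQUESTS = 1                # lowered from the default 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1     # effectively serializes requests to yelp.com
DOWNLOAD_DELAY = 5                     # spread requests out in time
DOWNLOAD_TIMEOUT = 60                  # raised the per-request timeout
RETRY_TIMES = 5                        # retry failed / timed-out requests more often

# scrapy-playwright wiring (assumed, since the spider sets 'playwright' in request meta)
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 60 * 1000  # in milliseconds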
Here is my spider implementation:
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class YelpSpider(scrapy.Spider):
    name = 'yelp'

    HEADERS = {
        "user-agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
        ),
        "referer": None,
    }
    page_number = 0

    def start_requests(self):
        # Entry point: request the restaurant page sorted by newest reviews first.
        urls = [
            'https://www.yelp.com/biz/tulsi-indian-eatery-westwood-los-angeles?sort_by=date_desc',
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.page_breakdown,
                meta={
                    'playwright': True,
                    'playwright_include_page': True,
                },
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def page_breakdown(self, response):
        # Work out how many paginated review pages exist (10 reviews per page)
        # and queue a request for each page offset.
        number_of_reviews = 158  # response.xpath("//div[@class='arrange-unit__09f24__rqHTg css-73a431']/div/span/div[2]/p/text()").get().split(" ")[0]
        number_of_pages = round(int(number_of_reviews) / 10)
        for i in range(0, number_of_pages):
            yield scrapy.Request(
                url=response.url + f'&start={i * 10}',
                callback=self.parse_reviews,
                meta={
                    'playwright': True,
                    'playwright_include_page': True,
                },
                errback=self.errback_httpbin,
                dont_filter=True,
            )

    def errback_httpbin(self, failure):
        # Log the failure type; on a timeout, re-queue the same URL.
        self.logger.error(repr(failure))
        print("Reached Error function: ", repr(failure))
        if failure.check(HttpError):
            response = failure.value.response
            print("a_debug: HttpError on %s" % response.url)
            self.logger.error("HttpError on %s", response.url)
        elif failure.check(DNSLookupError):
            request = failure.request
            print("a_debug: DNSLookupError on %s" % request.url)
            self.logger.error("DNSLookupError on %s", request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            yield scrapy.Request(
                url=request.url,
                callback=self.parse_reviews,
                meta={
                    'playwright': True,
                    'playwright_include_page': True,
                },
                errback=self.errback_httpbin,
                dont_filter=True,
            )
            print("a_debug: TimeoutError on %s" % request.url)
            self.logger.error("a_debug: TimeoutError on %s", request.url)

    def parse_reviews(self, response):
        # Extract reviewer, review text, rating, and date from each review on the page.
        print("Reached Parsed Reviews")
        all_reviews = response.xpath("//*[@id='reviews']/section/div[2]/ul/li")
        print("a_debug: url: ", response.url)
        for review in all_reviews:
            yield {
                "reviewer": review.xpath(
                    ".//*[starts-with(@class, 'user-passport-info')]/span/a/text()"
                ).get(),
                "descr": review.css("span.raw__09f24__T4Ezm::text").extract_first(),
                "rating": review.xpath(
                    ".//div[@class='css-14g69b3']/@aria-label"
                ).get().split(" ")[0],
                "date": review.css("span.css-chan6m::text").extract_first(),
            }
Here is the link to the log file: https://drive.google.com/file/d/1PHhTxiG-bCzgMjo5iMy9I-oOqdWmRVZ3/view?usp=sharing
Upvotes: 0
Views: 173