Reputation: 95
I want to iterate through all the category URLs and scrape the content from each page. With
urls = [response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()[0]]
I have so far only fetched the first category URL, but my goal is to fetch every URL and the content inside each one.
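For reference, what I eventually want is roughly the same selector without the [0] index, so that every category href comes back (a rough sketch):

# All category links instead of only the first one (same XPath as above)
urls = response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()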
I'm using the scrapy_selenium library. The Selenium page source is not being passed to the scrape_it function. Please review my code and let me know if there's anything wrong with it; I'm new to the Scrapy framework.
Below is my spider code -
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy import Selector
from scrapy_selenium import SeleniumRequest
from ..items import CouponcollectItem


class Couponsite6SpiderSpider(scrapy.Spider):
    name = 'couponSite6_spider'
    allowed_domains = ['www.couponcodesme.com']
    start_urls = ['https://www.couponcodesme.com/ae/categories']

    def parse(self, response):
        urls = [response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()[0]]
        for url in urls:
            yield SeleniumRequest(
                url=response.urljoin(url),
                wait_time=3,
                callback=self.parse_urls
            )

    def parse_urls(self, response):
        driver = response.meta['driver']
        while True:
            next_page = driver.find_element_by_xpath('//a[@class="category_pagination_btn next_btn bottom_page_btn"]')
            try:
                html = driver.page_source
                response_obj = Selector(text=html)
                self.scrape_it(response_obj)
                next_page.click()
            except:
                break
        driver.close()

    def scrape_it(self, response):
        items = CouponcollectItem()
        print('Hi there')
        items['store_img_src'] = response.css('#temp1 > div > div.voucher_col_left.flexbox.spaceBetween > div.vouchercont.offerImg.flexbox.column1 > div.column.column1 > div > div > a > img::attr(src)').extract()
        yield items
I have added the following code inside the settings.py file -
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

#SELENIUM
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # '--headless' if using chrome instead of firefox
I'm attaching a terminal_output screenshot. Thank you for your time! Please help me solve this.
Upvotes: 0
Views: 331
Reputation: 54992
The problem is that you can't share the driver among asynchronously running threads, and you also can't run more than one driver in parallel. You can take the yield out and it will do the categories one at a time:
At the top:
from selenium import webdriver
import time
driver = webdriver.Chrome()
and then in your class:
def parse(self, response):
    urls = response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()
    for url in urls:
        # visit the categories one at a time with the single shared driver
        self.do_category(url)

def do_page(self):
    # give the page a moment to render, then parse the rendered HTML
    time.sleep(1)
    html = driver.page_source
    response_obj = Selector(text=html)
    self.scrape_it(response_obj)

def do_category(self, url):
    driver.get(url)
    self.do_page()
    # keep clicking the next button until there isn't one
    next_links = driver.find_elements_by_css_selector('a.next_btn')
    while len(next_links) > 0:
        next_links[0].click()
        self.do_page()
        next_links = driver.find_elements_by_css_selector('a.next_btn')
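One caveat worth flagging: scrape_it in the question is a generator (it ends with yield items), so calling self.scrape_it(response_obj) without consuming the result produces nothing and the items never reach Scrapy's pipelines. A rough sketch of one way to pass them back up through parse, assuming the structure stays as above and only yield from is added:

def parse(self, response):
    for url in response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract():
        # re-yield every item produced while walking this category so Scrapy sees them
        yield from self.do_category(url)

def do_page(self):
    time.sleep(1)
    # scrape_it is a generator, so its items must be re-yielded, not just called
    yield from self.scrape_it(Selector(text=driver.page_source))

def do_category(self, url):
    driver.get(url)
    yield from self.do_page()
    next_links = driver.find_elements_by_css_selector('a.next_btn')
    while len(next_links) > 0:
        next_links[0].click()
        yield from self.do_page()
        next_links = driver.find_elements_by_css_selector('a.next_btn')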
And if that's too slow for you, I recommend switching to Puppeteer.
Upvotes: 1