Reputation: 93
I am trying to scrape links to product pages from a listing page using a scrapy spider. The page shows the first 10 machines and has a button for 'show all machines' that calls some javascript. The javascript is reasonably complicated (i.e. I can't just look at the function and see the url that the button points to). I'm trying to use the selenium webdriver to simulate a click on the button but it isn't working for some reason. When I scrape the product links I only get the first 10, not the complete list.
Can anybody tell me why it doesn't work?
The page I'm trying to scrape is http://www.ncservice.com/en/second-hand-milling-machines
The spider is
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request, FormRequest
from scrapy import log
from scrapy.exceptions import DropItem
from scrapy import signals
from mtispider.items import MachineItem
import urlparse
import time
import MySQLdb
import unicodedata
import re
from mtispider import tools
from selenium import webdriver
class MachineSpider(CrawlSpider):
    """Scrape second-hand milling machine links from ncservice.com.

    The listing page shows only the first 10 machines; a JavaScript
    'show all machines' button reveals the rest, so Selenium is used to
    click it and the post-click DOM is parsed for product links.
    """
    name = 'nc-spider'
    allowed_domains = ['ncservice.com']

    def start_requests(self):
        # Keep whatever CrawlSpider generates from start_urls, then add
        # the milling-machine listing page with its dedicated callback.
        requests = list(super(MachineSpider, self).start_requests())
        requests.append(Request('http://www.ncservice.com/en/second-hand-milling-machines',
                                callback=self.parsencmilllist))
        return requests

    def parsencmilllist(self, response):
        # Render the page in a real browser so the JavaScript-driven
        # 'show all machines' button can actually be clicked.
        driver = webdriver.Firefox()
        try:
            driver.get(response.url)
            try:
                # BUG FIX: the original used C#-style names
                # (FindElement / By.Id / Click) that do not exist in the
                # Python bindings; the Python API at this Selenium version
                # is find_element_by_id(...).click().
                driver.find_element_by_id("mas-resultados-fresadoras").click()
            except Exception:
                # Narrowed from a bare except; on failure fall back to the
                # initially visible (first 10) machines.
                log.msg("Couldn't get all the machines", level=log.INFO)
            # BUG FIX: build the selector from the webdriver's page_source
            # *after* the click -- the original parsed the Scrapy response,
            # which never contains the JavaScript-added links.
            hxs = HtmlXPathSelector(text=driver.page_source)
        finally:
            # Always release the browser, even if an error occurs above.
            driver.quit()
        ncmachs = hxs.select('//div[@id="resultados"]//a/@href').extract()
        for ncmach in ncmachs:
            yield Request(ncmach,
                          meta={'type': 'Milling'},
                          callback=self.parsencmachine)

    def parsencmachine(self, response):
        # scrape the machine -- body elided in the question; 'item' is
        # presumably a MachineItem built from the response (TODO confirm).
        return item
Thanks!
Upvotes: 2
Views: 2760
Reputation: 864
If you are seeing the error AttributeError: 'WebDriver' object has no attribute 'find_element_by_name', you are using Selenium >= 4.3.0, in which the old find_element_by_name and find_element_by_id helper methods were removed.
Replace those calls with the find_element method, e.g. driver.find_element(By.ID, "...") or driver.find_element(By.NAME, "...").
Upvotes: 0
Reputation: 473863
The main problem is that you need to initialize your Selector
from the webdriver's page_source
and not the response
passed into the callback:
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy import Selector
from selenium import webdriver
class MachineSpider(CrawlSpider):
name = 'nc-spider'
allowed_domains = ['ncservice.com']
def start_requests(self):
yield Request('http://www.ncservice.com/en/second-hand-milling-machines',
callback=self.parsencmilllist)
def parsencmilllist(self, response):
driver = webdriver.Firefox()
driver.get(response.url)
driver.find_element_by_id("mas-resultados-fresadoras").click()
sel = Selector(text=driver.page_source)
driver.quit()
links = sel.xpath('//div[@id="resultados"]//a/@href').extract()
for link in links:
yield Request(link,
meta={'type': 'Milling'},
callback=self.parsencmachine)
def parsencmachine(self, response):
print response.url
Upvotes: 1