Reputation: 829
I am having difficulty getting my scraper to work. I took the initial example code from selenium with scrapy for dynamic page by @alecxe and extended it to collect some results, but although the scraper appears to launch (I can watch it simulate clicking the next button), it shuts down a second later without printing or storing anything in the items.
Here is the code:
from scrapy.spider import BaseSpider
from selenium import webdriver

class product_spiderItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    pass

class ProductSpider(BaseSpider):
    name = "product_spider"
    allowed_domains = ['ebay.com']
    start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            next = self.driver.find_element_by_xpath('//td[@class="pagn-next"]/a')

            try:
                next.click()

                # get the data and write it to scrapy items
                response = TextResponse(url=response.url, body=self.driver.page_source, encoding='utf-8')
                print response.url
                for prod in response.xpath('//ul[@id="GalleryViewInner"]/li/div/div'):
                    item = product_spiderItem()
                    item['title'] = prod.xpath('.//div[@class="gvtitle"]/h3/a/text()').extract()[0]
                    item['price'] = prid.xpath('.//div[@class="prices"]/span[@class="bold"]/text()').extract()[0]
                    print item['price']
                    yield item
            except:
                break

        self.driver.close()
I use scrapy crawl product_spider -o products.json to store the results. What am I missing?
Upvotes: 0
Views: 2124
Reputation: 2594
While trying to understand what's wrong with your code, I did some editing and came up with the following (tested) code, which should be closer to your goal:
import scrapy
from selenium import webdriver

class product_spiderItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()
    pass

class ProductSpider(scrapy.Spider):
    name = "product_spider"
    allowed_domains = ['ebay.com']
    start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            # build a Selector from whatever page the browser is currently showing
            sel = scrapy.Selector(text=self.driver.page_source)
            for prod in sel.xpath('//ul[@id="GalleryViewInner"]/li/div/div'):
                item = product_spiderItem()
                item['title'] = prod.xpath('.//div[@class="gvtitle"]/h3/a/text()').extract()
                item['price'] = prod.xpath('.//div[@class="prices"]//span[@class=" bold"]/text()').extract()
                yield item

            # move on to the next page; stop when the link can no longer be clicked
            next = self.driver.find_element_by_xpath('//td[@class="pagn-next"]/a')
            try:
                next.click()
            except:
                break

    def closed(self, reason):
        # Scrapy calls this once the spider finishes, so the browser is shut down exactly once
        self.driver.close()
Please try this version. The main changes: the page is parsed with scrapy.Selector(text=self.driver.page_source) instead of rebuilding a TextResponse (which your code never imported), the prid typo is fixed, the items are yielded before the click on the next button rather than inside its try/except (where the first extraction error silently broke the loop), and the driver is closed in the closed() callback that Scrapy invokes when the spider finishes (self.driver.quit() there would also terminate the browser process).
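One more thing to watch out for: right after next.click() the loop re-reads self.driver.page_source, and if the browser hasn't finished navigating yet you can scrape the same page twice or skip one. Below is a minimal, untested sketch of how an explicit Selenium wait could guard against that; the 10-second timeout is an arbitrary choice:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# inside the while loop, in place of the bare next.click():
next = self.driver.find_element_by_xpath('//td[@class="pagn-next"]/a')
try:
    next.click()
    # block until the old "next" link is detached from the DOM,
    # i.e. the browser has actually navigated to the following page
    WebDriverWait(self.driver, 10).until(EC.staleness_of(next))
except:
    break

EC.staleness_of waits for the element grabbed from the old page to go stale, which only happens once the new document has replaced it.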
Upvotes: 2