Reputation: 570
I have learned that, at some point, you need a browser-automation toolkit such as Selenium to automate the scraping.
How can I click the "next" button on the Google Play Store in order to scrape the reviews? This is for a college project only.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time
class Product(scrapy.Item):
    """Scrapy item holding a single scraped value under the ``title`` key."""

    # Filled by the spider with the list of strings extracted for one review.
    title = scrapy.Field()
class FooSpider(CrawlSpider):
    """Collect reviewer names from a Google Play app page via a Selenium browser.

    The review list is paged by JavaScript, so the HTML Scrapy downloads never
    changes when the "next" button is clicked. Each page of reviews is
    therefore read from the live browser DOM (``browser.page_source``) rather
    than from the original Scrapy response.
    """

    name = 'foo'
    start_urls = ["https://play.google.com/store/apps/details?id=com.gaana&hl=en"]

    def __init__(self, *args, **kwargs):
        """Create the spider and start the Chrome browser it drives."""
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        # Raw string: backslashes in the Windows path must stay literal.
        self.browser = webdriver.Chrome(executable_path=r"C:\chrm\chromedriver.exe")
        # Let element lookups retry for up to 60 seconds before failing.
        self.browser.implicitly_wait(60)

    def parse(self, response):
        """Yield one ``Product`` per review shown on each page of reviews.

        Args:
            response: the Scrapy response for the start URL; only its ``url``
                is used, to open the same page in the Selenium browser.

        Yields:
            Product: ``title`` is the list of text nodes extracted from the
            review's author link.
        """
        self.browser.get(response.url)
        for _ in range(200):
            # Fixed pause so the browser can render the reviews after the
            # initial load or after the previous click.
            time.sleep(20)

            # Re-read the DOM on every pass; parsing the static response here
            # would return the first page of reviews over and over.
            page = Selector(text=self.browser.page_source)
            headers = page.xpath('//div[@class="single-review"]/div[@class="review-header"]')
            for header in headers:
                item = Product()
                item['title'] = header.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
                yield item
            # NOTE(review): if the page appends new reviews instead of
            # replacing the visible ones, earlier reviews are yielded again
            # on later passes - confirm and de-duplicate if needed.

            # Advance to the next page of reviews.
            button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
            button.click()
I have updated my code, and it only gives me the same 40 items again and again. What's wrong with my for loop?
It seems that the updated page source is not being passed to the XPath query, which is why it keeps returning the same 40 items.
Upvotes: 2
Views: 8758
Reputation: 510
I'd do something like this:
from scrapy import CrawlSpider
from selenium import webdriver
import time
class FooSpider(CrawlSpider):
    """Skeleton spider that clicks an element with Selenium, then parses the result.

    NOTE(review): ``CrawlSpider`` is provided by ``scrapy.spiders``
    (``scrapy.contrib.spiders`` in old releases), not by the top-level
    ``scrapy`` package - check the import used with this class.
    """

    name = 'foo'
    # Scrapy reads ``allowed_domains`` (a list); ``allow_domains`` is ignored.
    allowed_domains = ['foo.com']
    # Start URLs need a scheme, otherwise the request cannot be built.
    start_urls = ['http://foo.com']

    def __init__(self, *args, **kwargs):
        """Create the spider and start the Firefox browser it drives."""
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        # Let element lookups retry for up to 60 seconds before failing.
        self.browser.implicitly_wait(60)

    def parse_foo(self, response):
        """Open ``response.url`` in the browser, click, and select data from the new DOM.

        NOTE(review): nothing in this class registers ``parse_foo`` as a
        callback; it has to be wired up by a rule or a request to be called.

        Args:
            response: the Scrapy response whose ``url`` is loaded in the browser.
        """
        # Imported here so the method does not depend on a module-level import.
        from scrapy.selector import Selector

        self.browser.get(response.url)  # load the page in the browser
        button = self.browser.find_element_by_xpath("path")  # element to click
        button.click()
        time.sleep(1)  # crude fixed wait for the page to finish updating
        source = self.browser.page_source  # HTML of the DOM after the click
        sel = Selector(text=source)  # wrap it so Scrapy XPath queries work
        data = sel.xpath('path/to/the/data')  # select data
        ...  # placeholder: build and yield items from ``data`` here
It's better not to wait for a fixed amount of time, though. So instead of time.sleep(1), you can use one of the approaches described here: http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html.
Upvotes: 5