Reputation: 2225
I want to run a Scrapy spider from my script, but it works only for one request. I cannot get the callback self.parse_product from scrapy.http.Request(product_url, callback=self.parse_product) to execute. I guess it is due to the line crawler.signals.connect(callback, signal=signals.spider_closed). Please advise how to correctly go over all links and sub-links. The whole script is shown below.
import json
import scrapy
from scrapy.crawler import Crawler
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose, TakeFirst
from scrapy import log, signals, Spider, Item, Field
from scrapy.settings import Settings
from twisted.internet import reactor
# https://gist.github.com/alecxe/fc1527d6d9492b59c610
# define an item class
class WebStoreItem(Item):
    name = Field()
    price = Field()
    developer = Field()
    date_added = Field()
    date_modified = Field()
    votes = Field()
    views = Field()
    sales = Field()
    avg_rating = Field()
    comments = Field()
# define an item loader with input and output processors
class WebStoreItemLoader(ItemLoader):
    default_input_processor = MapCompose(unicode.strip)
    default_output_processor = TakeFirst()
    desc_out = Join()
# define a pipeline
class JsonWriterPipeline(object):
    def __init__(self):
        self.file = open('items.json', 'wb')

    def __del__(self):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
# define a spider
class WebStoreSpider(Spider):
    name = "WebStore"
    allowed_domains = ["http://www.WebStore.com"]
    start_urls = [
        "http://www.WebStore.com/index.php"
    ]

    def parse(self, response):
        for meta in response.xpath('//div[@class="extension-grid"]'):
            for product_block in meta.xpath('//div[@class="image-holder image"]'):
                item = WebStoreItem()
                avg_rating = meta.xpath('//div[@class="rating"]/text()').extract()[0]
                item['avg_rating'] = avg_rating[avg_rating.find(': ') + 1:].strip()
                comment = meta.xpath('//div[@class="comment"]/text()').extract()[0]
                item['comments'] = comment[comment.find(': ') + 1:].strip()
                print 'product_block: ', product_block
                product_url = product_block.xpath('a[1]/@href').extract()[0]
                print 'product_url: ', product_url
                # pass the partially filled item to the next callback via meta
                request = scrapy.http.Request(product_url, callback=self.parse_product)
                request.meta['item'] = item
                yield request
    def parse_product(self, response):
        item = response.meta['item']
        product_meta_block = response.xpath('//div[@class="name"]')
        print 'product_meta_block: ', product_meta_block
        product_rows = product_meta_block.xpath('//tr')
        print 'product_rows: ', product_rows
        i = 0
        for row in product_rows:
            if i == 1:
                item['name'] = row.select('td/text()').extract()
            elif i == 3:
                item['votes'] = row.select('td/text()').extract()
            i += 1
        return item
# callback fired when the spider is closed
def callback(spider, reason):
    stats = spider.crawler.stats.get_stats()  # collect/log stats?
    # stop the reactor
    reactor.stop()

def stop_reactor():
    reactor.stop()
if __name__ == '__main__':
    # instantiate settings and provide a custom configuration
    settings = Settings()
    settings.set('ITEM_PIPELINES', {
        '__main__.JsonWriterPipeline': 100
    })
    # instantiate a crawler passing in settings
    crawler = Crawler(settings)
    # instantiate a spider
    spider = WebStoreSpider()
    # configure signals
    crawler.signals.connect(callback, signal=signals.spider_closed)
    # configure and start the crawler
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    # start logging
    log.start()
    # start the reactor (blocks execution)
    reactor.run()
Upvotes: 0
Views: 356
Reputation: 1202
Your spider is being blocked from visiting pages after the start page by your allowed_domains specification. The value should include just the domain, not the protocol. Try

allowed_domains = ["www.WebStore.com"]
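With the protocol included, Scrapy's OffsiteMiddleware treats every product URL as off-site and drops the request, which is why only the first request runs; with logging enabled you should see "Filtered offsite request" debug messages. A minimal sketch of the corrected spider attributes (same names as in your script):

class WebStoreSpider(Spider):
    name = "WebStore"
    # domain only -- no scheme, otherwise OffsiteMiddleware filters every request
    allowed_domains = ["www.WebStore.com"]
    start_urls = [
        "http://www.WebStore.com/index.php"
    ]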
Also, the line desc_out = Join() in your WebStoreItemLoader definition may give an error, as you have no desc field.
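If you do want a joined description, one sketch of a fix (the desc field name is an assumption, chosen to match the desc_out processor) is to declare the field on the item; otherwise simply delete the desc_out = Join() line:

class WebStoreItem(Item):
    # ... existing fields ...
    desc = Field()  # hypothetical field; only needed if you keep desc_out = Join()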
Upvotes: 1