Reputation: 49
This is a scrapy code and I want to scrape data from mouthshut.com and it includes the strong tag in between. I am able to run it and have title coming but they are blank. Why it isn't extracting any data?
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem
class criticspider(scrapy.Spider):
name ="shut"
allowed_domains =["mouthshut.com"]
start_urls =["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]
def parse(self,response):
hxs = Selector(response)
sites = hxs.select('//li[@class="profile"]')
items = []
for site in sites:
item = ShutItem()
item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
#item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
#item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
items.append(item)
return items
Upvotes: 0
Views: 351
Reputation: 4685
You should use a pipeline to extract data from your spider! Here is a sample that extract data to json files:
# -*- coding: utf-8 -*-
# python import
from scrapy import signals, log
from scrapy.contrib.exporter import JsonItemExporter
from datetime import datetime
import os
# project import
from items import tgju
from pymongo import MongoClient
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
def get_items(module):
md = module.__dict__
return (str(md[c].__name__) for c in md if (isinstance(md[c], type) and md[c].__module__ == module.__name__))
class JsonPipeline(object):
def __init__(self):
self.files = dict()
self.exporter = dict()
@classmethod
def from_crawler(cls, crawler):
pipeline = cls()
crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
return pipeline
def spider_opened(self, spider):
for key in get_items(tgju):
path = os.path.join('temp', key)
if not os.path.exists(path):
os.makedirs(path)
self.files[key] = open(os.path.join(path,
'%s_%s_%s.json' % (spider.name,
key.lower(),
datetime.now().strftime('%Y%m%dT%H%M%S'))),
'w+b')
self.exporter[key] = JsonItemExporter(self.files[key])
self.exporter[key].start_exporting()
def spider_closed(self, spider):
for key in get_items(tgju):
self.exporter[key].finish_exporting()
self.files.pop(key).close()
def process_item(self, item, spider):
try:
log.msg('-----------------%s------------------' % item.__class__.__name__)
self.exporter[item.__class__.__name__].export_item(item)
except KeyError:
pass
return item
Add this line to your settings files:
ITEM_PIPELINES = {
'pipelines.JsonPipeline': 800,
}
And try yield
each item instead of return
.
Update: Also change your spider to this one...
import scrapy
from scrapy.selector import Selector
from shut.items import ShutItem
class criticspider(scrapy.Spider):
name ="shut"
allowed_domains =["mouthshut.com"]
start_urls =["http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930"]
def parse(self,response):
hxs = Selector(response)
sites = hxs.select('//li[@class="profile"]')
for site in sites:
item = ShutItem()
item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]//a/text()').extract()
#item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
#item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
yield item
Upvotes: 2
Reputation: 1
def parse(self,response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[@class="reviewtitle fl"]')
for site in sites:
item = ShutItem()
item['title'] = site.select('//strong[@style=" font-size: 15px;font-weight: 700;"]/a/text()').extract()
#item['date'] = site.select('div[@class="review_stats"]//div[@class="date"]/text()').extract()
#item['desc'] = site.select('div[@class="review_body"]//span[@class="blurb blurb_expanded"]/text()').extract()
yield item
this is work well.
2015-01-21 19:06:33+0800 [shut] DEBUG: Scraped from <200 http://www.mouthshut.com/mobile-operators/vodafone-mobile-operator-reviews-925020930>
{'title': [u'Vodafone 3G - Useless in Bangalore',
u'Worst Mobile Operator Ever',
u'Worst 3g connectivity of vodafone in bangalore',
u'Pathetic Network 3G',
u'HOW DO THEY STILL DO BUSINESS WITH SUCH SERVICES!!',
u'Bad customer service',
u'Vodafone Kolkata \u2013 My worst ever experience.',
u'Network connectivity - permanent nemesis',
u'VODAFONE MOBILE OPERATOR',
u'Beware of Vodafone billing plans',
u'Vodafone changed my billing plan without my notice',
u'Pathetic service. They deduct balance unnecessari',
u'Worst service from Vodafone',
u'Forget Vodafone',
u'Vodafone Data Services sucks',
u'Outgoing calls has been barred',
u'Vodafone Sucks',
u'Worst Customer satisfaction I have ever Faced',
u'Untrained Customer Care... Seems like headline de',
u'3rd Party downloads - shameless way to make money!']}
here you should know: 1. yield is much better then list in scrapy. 2. li node is not the parent of strong. 3. the value of strong stype has some blank.
Upvotes: 0