Reputation: 3025
I've created the scrapy spider below and I want to improve its functionality so that I can capture more data.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from TOSpider.items import TOSpiderItem
class MySpider(BaseSpider):
    """Scrape concert listings from the NOW Toronto music listings page."""
    name = "TOSpider"
    # allowed_domains entries must be bare domains, not URLs; a scheme/path
    # here makes Scrapy's offsite middleware filter out every request.
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    def parse(self, response):
        """Parse the index page and return one TOSpiderItem per listing."""
        hxs = HtmlXPathSelector(response)
        listings = hxs.select("//span[@class='listing-body']")
        items = []
        # Distinct loop variable: the original `for listings in listings`
        # shadowed the selector list it was iterating.
        for listing in listings:
            # The file imports TOSpiderItem; NowTorontoItem was undefined here.
            item = TOSpiderItem()
            # Class name is 'List-Body'; the original 'List- Body' (with a
            # space) could never match.
            item["eventArtist"] = listing.select("div[@class='List-Body']/span[@class='List-Name']/text()").extract()
            item["eventTitle"] = listing.select("div[@class='List-Body']/span[@class='List-Body-Emphasis']/text()").extract()
            item["eventHolder"] = listing.select("div[@class='List-Body']/span[@class='List-Body-Strong']/text()").extract()
            item["eventDetails"] = listing.select("div[@class='List-Body']/text()").extract()
            items.append(item)
        return items
Specifically, I want it to be able to follow a link and then capture data on the linked page. If you look at the source code for the site it scrapes from (http://www.nowtoronto.com/music/listings/), you will see that each <div>
that is being scraped also contains an internal link within a <div>
called <div class="listing-readmore">
. I would like to follow each internal link so I can then scrape the data from the internal page, specifically the data contained within the <span>
tags named <span property="v:location">
, <span property="v:organization">
, <span property="v:name">
, <span property="v:street-address">
, <span property="v:locality">
, <span property="v:postal-code">
, <span property="v:tel">
and <span property="v:url">
and the <td>
tags named <td class="small-txt medgrey-txt leftLabelTD">ADMISSION</td>
, <td class="small-txt medgrey-txt leftLabelTD">TICKETS AT</td>
and <td class="small-txt medgrey-txt leftLabelTD">WHEN</td>
.
I have read the Scrapy manual on how to follow internal links but everything I have tried has produced an error. It hasn't seemed to click for me yet and I was hoping someone could point me in the right direction. Thanks.
Note: The items file referenced in the import statement is
from scrapy.item import Item, Field
class TOSpiderItem(Item):
    """Container for one scraped event listing.

    Includes the detail-page fields as well, so that a parse_details
    callback can assign them without raising KeyError (scrapy Items reject
    assignment to undeclared fields).
    """
    # Fields filled from the listings index page.
    eventArtist = Field()
    eventTitle = Field()
    eventHolder = Field()
    eventDetails = Field()
    # Fields filled from each event's detail page.
    eventLocation = Field()
    eventOrganization = Field()
    eventName = Field()
    eventAddress = Field()
    eventLocality = Field()
    eventPostalCode = Field()
    eventPhone = Field()
    eventURL = Field()
Edit
After playing around with the code so graciously provided by paul t., I have come up with the following spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from NT.items import NowTorontoItem
class MySpider(BaseSpider):
name = "NTSpider"
# BUG: allowed_domains must hold bare domain names ("nowtoronto.com");
# including the scheme makes the offsite middleware drop every request.
allowed_domains = ["http://nowtoronto.com/"]
# NOTE(review): stray trailing backtick on this line is a syntax error.
start_urls = ["http://www.nowtoronto.com/music/listings/"]`
def parse(self, response):
selector = Selector(response)
listings = selector.css("div.listing-item0, div.listing-item1")
for listing in listings:
item = NowTorontoItem()
for body in listing.css('span.listing-body > div.List-Body'):
item ["eventArtist"] = body.css("span.List-Name::text").extract()
item ["eventTitle"] = body.css("span.List-Body-Emphasis::text").extract()
item ["eventHolder"] = body.css("span.List-Body-Strong::text").extract()
item ["eventDetails"] = body.css("::text").extract()
# instead of returning all items at the end,
# you can use yield with an item in each iteration
yield item
# and you can also yield a Request()
# so that scrapy enqueues a new page to fetch
# BUG: .select() takes XPath, not CSS; passing "... > a::attr(href)" raises
# "Invalid XPath" -- use listing.css(...) for CSS selectors.
detail_url = listing.select("div.listing-readmore > a::attr(href)")
if detail_url:
# BUG: Request is never imported (NameError); needs
# `from scrapy.http import Request`. The extracted href is also
# relative and must be joined against response.url.
yield Request(detail_url.extract()[0], callback=self.parse_details)
def parse_details(self, response):
selector = Selector(response)
# NOTE(review): missing element/class syntax -- should be "div.whenwhereContent".
listings = selector.css("whenwhereContent")
for listing in listings:
# BUG: TOSpiderItem is not imported in this file (it imports NowTorontoItem).
item = TOSpiderItem()
# NOTE(review): malformed CSS -- multiple classes need dots
# ("td.small-txt.medgrey-txt.leftLabelTD"), and a label cell is a
# sibling of the info cell, not its parent.
for body in listing.css('td.small-txt medgrey-txt leftLabelTD > td.small-txt dkgrey-txt rightInfoTD'):
# NOTE(review): "span.v:location" selects a CLASS named "v:location";
# these spans carry a property attribute, so the selector should be
# span[property='v:location'] (and likewise below).
item ["eventLocation"] = body.css("span.v:location::text").extract()
item ["eventOrganization"] = body.css("span.v:organization::text").extract()
item ["eventName"] = body.css("span.v:name::text").extract()
item ["eventAddress"] = body.css("span.v:street-address::text").extract()
item ["eventLocality"] = body.css("span.v:locality::text").extract()
item ["eventPostalCode"] = body.css("span.v:postal-code::text").extract()
item ["eventPhone"] = body.css("span.v:tel::text").extract()
item ["eventURL"] = body.css("span.v:url::text").extract()
# BUG: an Item has no append(); this was presumably meant to be `yield item`.
item.append(item)
# BUG: `return item` inside the loop exits after the first listing, and
# mixing return-with-value and yield is invalid in Python 2 generators.
return item
However, when I run it, I receive the error exceptions.ValueError: Invalid XPath: div.listing-readmore > a::attr(href)
. Can anyone help me out with this? I am not that familiar with the new CSS selectors.
Edit 2
I changed detail_url = listing.select("div.listing-readmore > a::attr(href)")
to detail_url = listing.css("div.listing-readmore > a::attr(href)")
and I am now receiving the error yield Request(detail_url.extract()[0], callback=self.parse_details) exceptions.NameError: global name 'Request' is not defined
. Does the Request function need to be imported?
Edit 3
I still can't get the second block of code to run. I've attached the entire code block below. Note that I have changed the name of TOSpiderItem to NowTorontoItem. When I run this code, the numbers 1 and 2 will print to screen with each iteration but 3 and 4 will never print, suggesting that the second code block is not running. At this point, it doesn't even seem to be the second big for
loop that is the problem since it doesn't even have a chance to execute.
import urlparse
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from NT.items import NowTorontoItem
class MySpider(BaseSpider):
name = "NTSpider"
# BUG: this is why '3' and '4' never print. allowed_domains must be a bare
# domain ("nowtoronto.com"); with a scheme/path the OffsiteMiddleware
# rejects every detail-page Request, so parse_details is never called.
allowed_domains = ["http://nowtoronto.com/"]
start_urls = ["http://www.nowtoronto.com/music/listings/"]
def parse(self, response):
selector = Selector(response)
listings = selector.css("div.listing-item0, div.listing-item1")
for listing in listings:
item = NowTorontoItem()
for body in listing.css('span.listing-body > div.List-Body'):
item ["eventArtist"] = body.css("span.List-Name::text").extract()
item ["eventTitle"] = body.css("span.List-Body-Emphasis::text").extract()
item ["eventHolder"] = body.css("span.List-Body-Strong::text").extract()
item ["eventDetails"] = body.css("::text").extract()
#for body in listing.css('div.listing-readmore'):
# item ["eventURL"] = body.css("a::attr(href)").extract()
# instead of returning all items at the end,
# you can use yield with an item in each iteration
yield item
# and you can also yield a Request()
# so that scrapy enqueues a new page to fetch
# BUG: this selects the whole <div> (extract() yields its HTML, not a URL);
# it should be listing.css("div.listing-readmore > a::attr(href)").
detail_url = listing.css("div.listing-readmore")
print '1'
if detail_url:
yield Request(urlparse.urljoin(response.url,
detail_url.extract()[0]),
callback=self.parse_details)
print '2'
def parse_details(self, response):
print '3'
selector = Selector(response)
listings = selector.css("div.whenwhereContent")
print '4'
for listing in listings:
# NOTE(review): multi-class CSS needs dots -- 'td.small-txt.dkgrey-txt.rightInfoTD'.
for body in listing.css('td.small-txt dkgrey-txt rightInfoTD'):
item = NowTorontoItem()
# NOTE(review): "span.v:location" matches a CLASS "v:location"; these spans
# use a property attribute, so span[property='v:location'] is required
# (and likewise for the selectors below).
item ["eventLocation"] = body.css("span.v:location::text").extract()
item ["eventOrganization"] = body.css("span.v:organization::text").extract()
item ["eventName"] = body.css("span.v:name::text").extract()
item ["eventAddress"] = body.css("span.v:street-address::text").extract()
item ["eventLocality"] = body.css("span.v:locality::text").extract()
item ["eventPostalCode"] = body.css("span.v:postal-code::text").extract()
item ["eventPhone"] = body.css("span.v:tel::text").extract()
item ["eventURL"] = body.css("span.v:url::text").extract()
yield item
Upvotes: 0
Views: 335
Reputation: 20748
I think what you're after is something like:
import urlparse
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from TOSpider.items import TOSpiderItem
class MySpider(BaseSpider):
    """Scrape the listings index and follow each "read more" detail link."""
    name = "TOSpider"
    # Bare domain only: a scheme/path here makes the offsite middleware
    # filter every request, including the detail-page Requests below.
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    def parse(self, response):
        """Yield one item per listing plus a Request for its detail page."""
        hxs = HtmlXPathSelector(response)
        listings = hxs.select("//div[starts-with(@class, 'listing-item')]")
        for listing in listings:
            # The snippet imports TOSpiderItem; NowTorontoItem was undefined.
            item = TOSpiderItem()
            item["eventArtist"] = listing.select("span[@class='listing-body']/div[@class='List-Body']/span[@class='List-Name']/text()").extract()
            item["eventTitle"] = listing.select("span[@class='listing-body']/div[@class='List-Body']/span[@class='List-Body-Emphasis']/text()").extract()
            item["eventHolder"] = listing.select("span[@class='listing-body']/div[@class='List-Body']/span[@class='List-Body-Strong']/text()").extract()
            item["eventDetails"] = listing.select("span[@class='listing-body']/div[@class='List-Body']/text()").extract()
            # instead of returning all items at the end,
            # you can use yield with an item in each iteration
            yield item
            # and you can also yield a Request()
            # so that scrapy enqueues a new page to fetch
            detail_url = listing.select("div[@class='listing-readmore']/a/@href")
            if detail_url:
                # The href is relative; resolve it against the page URL
                # (this is why urlparse is imported).
                yield Request(urlparse.urljoin(response.url,
                                               detail_url.extract()[0]),
                              callback=self.parse_details)

    def parse_details(self, response):
        hxs = HtmlXPathSelector(response)
        # and here you parse
        # <span property="v:location">,
        # <span property="v:organization">,
        # <span property="v:name">,
        # <span property="v:street-address">,
        # <span property="v:locality">,
        # <span property="v:postal-code">,
        # <span property="v:tel"> and <span property="v:url"> ...
        #
        # and return a item
Equivalent but using the new CSS selectors:
import urlparse
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import Selector
from TOSpider.items import TOSpiderItem
class MySpider(BaseSpider):
    """CSS-selector variant: scrape the index and follow detail links."""
    name = "TOSpider"
    # Bare domain only: a scheme/path here makes the offsite middleware
    # filter every request, including the detail-page Requests below.
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    def parse(self, response):
        """Yield one item per listing plus a Request for its detail page."""
        selector = Selector(response)
        listings = selector.css("div.listing-item0, div.listing-item1")
        for listing in listings:
            # The snippet imports TOSpiderItem; NowTorontoItem was undefined.
            item = TOSpiderItem()
            for body in listing.css('span.listing-body > div.List-Body'):
                item["eventArtist"] = body.css("span.List-Name::text").extract()
                item["eventTitle"] = body.css("span.List-Body-Emphasis::text").extract()
                item["eventHolder"] = body.css("span.List-Body-Strong::text").extract()
                item["eventDetails"] = body.css("::text").extract()
            # instead of returning all items at the end,
            # you can use yield with an item in each iteration
            yield item
            # and you can also yield a Request()
            # so that scrapy enqueues a new page to fetch
            detail_url = listing.css("div.listing-readmore > a::attr(href)")
            if detail_url:
                # The href is relative; resolve it against the page URL.
                yield Request(urlparse.urljoin(response.url,
                                               detail_url.extract()[0]),
                              callback=self.parse_details)

    def parse_details(self, response):
        selector = Selector(response)
        # and here you parse
        # <span property="v:location">,
        # <span property="v:organization">,
        # <span property="v:name">,
        # <span property="v:street-address">,
        # <span property="v:locality">,
        # <span property="v:postal-code">,
        # <span property="v:tel"> and <span property="v:url"> ...
        #
        # and return a item
Edited spider to get OP's started:
import urlparse
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
#from NT.items import NowTorontoItem
from scrapy.item import Item, Field
# Item holding one event: summary fields from the index page plus the
# venue/contact fields scraped from the event's detail page.
class NowTorontoItem(Item):
# Fields filled by parse() from the listings index page.
eventArtist = Field()
eventTitle = Field()
eventHolder = Field()
eventDetails = Field()
# Fields filled by parse_details() from each event's detail page.
eventLocation = Field()
eventOrganization = Field()
eventName = Field()
eventAddress = Field()
eventLocality = Field()
eventPostalCode = Field()
eventPhone = Field()
eventURL = Field()
class MySpider(BaseSpider):
    """Crawl the NOW Toronto listings index and scrape each event's detail page."""
    name = "NTSpider"
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    def parse(self, response):
        """Collect summary fields for each listing, then enqueue its detail page."""
        sel = Selector(response)
        for card in sel.css("div.listing-item0, div.listing-item1"):
            item = NowTorontoItem()
            for summary in card.css('span.listing-body > div.List-Body'):
                item["eventArtist"] = summary.css("span.List-Name::text").extract()
                item["eventTitle"] = summary.css("span.List-Body-Emphasis::text").extract()
                item["eventHolder"] = summary.css("span.List-Body-Strong::text").extract()
                item["eventDetails"] = summary.css("::text").extract()
            # Summary items are intentionally NOT yielded here; only the
            # detail-page items produced by parse_details are emitted.
            href = card.css("div.listing-readmore > a::attr(href)")
            if href:
                # The extracted href is relative, so resolve it against the
                # current page URL before handing it to the scheduler.
                yield Request(urlparse.urljoin(response.url, href.extract()[0]),
                              callback=self.parse_details)

    def parse_details(self, response):
        """Extract venue and contact fields from a single event's detail page."""
        self.log("parse_details: %r" % response.url)
        sel = Selector(response)
        for section in sel.css("div.whenwhereContent"):
            for cell in section.css('td.small-txt.dkgrey-txt.rightInfoTD'):
                item = NowTorontoItem()
                item["eventLocation"] = cell.css("span[property='v:location']::text").extract()
                item["eventOrganization"] = cell.css("span[property='v:organization'] span[property='v:name']::text").extract()
                item["eventName"] = cell.css("span[property='v:name']::text").extract()
                item["eventAddress"] = cell.css("span[property='v:street-address']::text").extract()
                item["eventLocality"] = cell.css("span[property='v:locality']::text").extract()
                item["eventPostalCode"] = cell.css("span[property='v:postal-code']::text").extract()
                item["eventPhone"] = cell.css("span[property='v:tel']::text").extract()
                item["eventURL"] = cell.css("span[property='v:url']::text").extract()
                yield item
Upvotes: 1