When scraping a website using Scrapy, how do I make sure all characters scrape properly?

Question

I am using Scrapy to scrape a website, but some characters, such as apostrophes, do not scrape correctly, and they are not even replaced by the same wrong character consistently; for example, the same apostrophe has shown up as several different odd character sequences in my result set. How do I ensure that all characters scrape properly?

Edit

I am trying to scrape http://www.nowtoronto.com/music/listings/ with the following scraper:

import urlparse
import time
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
#from NT.items import NowTorontoItem
from scrapy.item import Item, Field

class NowTorontoItem(Item):
    """Scrapy item holding the data collected for one event listing.

    Every field is assigned the result of a selector ``extract()`` call by
    ``MySpider``; no field is given a default or any processing here.
    """

    # Filled in by MySpider.parse from the listings page.
    eventArtist = Field()
    eventTitle = Field()
    eventHolder = Field()
    eventDetails = Field()
    # Disabled field; the matching assignment in MySpider.parse_details is
    # commented out as well.
    #venueName = Field()
    # Filled in by MySpider.parse_details from the event's detail page.
    eventAddress = Field()
    eventLocality = Field()
    eventPostalCode = Field()
    eventPhone = Field()
    eventURL = Field()
    eventPrice = Field()
    eventDate = Field()
    # href of the listing's "read more" link, set by MySpider.parse.
    internalURL = Field()

class MySpider(BaseSpider):
    """Spider that collects music event listings from nowtoronto.com.

    ``parse`` reads the summary of every event on the listings page. When a
    listing has a "read more" link, the link is followed and
    ``parse_details`` adds the venue, price and date information to the same
    item; otherwise the summary-only item is yielded directly.

    Field values are stored exactly as returned by the selectors'
    ``extract()`` calls.
    """

    name = "NTSpider"
    allowed_domains = ["nowtoronto.com"]
    start_urls = ["http://www.nowtoronto.com/music/listings/"]

    # Throttle requests through Scrapy's downloader. Calling time.sleep()
    # inside a callback blocks the whole crawler (callbacks run in the
    # reactor thread), so politeness is configured here instead.
    download_delay = 1

    def parse(self, response):
        """Yield an item or a detail-page request for each event listing.

        Args:
            response: the downloaded listings page.

        Yields:
            A ``Request`` for the listing's detail page, carrying the
            partially filled item in ``meta['item']``, when the listing has
            a "read more" link; otherwise the ``NowTorontoItem`` itself.
        """
        selector = Selector(response)
        listings = selector.css("div.listing-item0, div.listing-item1")

        for listing in listings:
            item = NowTorontoItem()
            for body in listing.css('span.listing-body > div.List-Body'):
                item["eventArtist"] = body.css("span.List-Name::text").extract()
                item["eventTitle"] = body.css("span.List-Body-Emphasis::text").extract()
                item["eventHolder"] = body.css("span.List-Body-Strong::text").extract()
                item["eventDetails"] = body.css("::text").extract()
            for body in listing.css('div.listing-readmore'):
                item["internalURL"] = body.css("a::attr(href)").extract()

            # Yield a Request() so that Scrapy enqueues the detail page; the
            # item travels with the request and is completed in the callback.
            detail_hrefs = listing.css("div.listing-readmore > a::attr(href)").extract()

            if detail_hrefs:
                yield Request(urlparse.urljoin(response.url, detail_hrefs[0]),
                              meta={'item': item},
                              callback=self.parse_details)
            else:
                yield item

    def parse_details(self, response):
        """Complete the item passed in ``response.meta`` from a detail page.

        Args:
            response: the downloaded detail page; ``response.meta['item']``
                must hold the item created by ``parse``.

        Yields:
            The item, once for every ``div.whenwhereContent`` block found on
            the page.
        """
        self.log("parse_details: %r" % response.url)
        # Bind the item before any loop: it used to be bound only inside the
        # inner loop, so a page whose first block had no matching cell raised
        # NameError at the eventPrice assignment.
        item = response.meta['item']
        selector = Selector(response)
        listings = selector.css("div.whenwhereContent")

        for listing in listings:
            for body in listing.css('tr:nth-child(1) td.small-txt.dkgrey-txt.rightInfoTD'):
                # Disabled selectors; the item declares no such fields.
                #item ["eventLocation"] = body.css("span[property='v:location']::text").extract()
                #item ["eventOrganization"] = body.css("span[property='v:organization'] span[property='v:name']::text").extract()
                #item ["venueName"] = body.css("span[property='v:name']::text").extract()
                item["eventAddress"] = body.css("span[property='v:street-address']::text").extract()
                item["eventLocality"] = body.css("span[property='v:locality']::text").extract()
                item["eventPostalCode"] = body.css("span[property='v:postal-code']::text").extract()
                item["eventPhone"] = body.css("span[property='v:tel']::text").extract()
                item["eventURL"] = body.css("span[property='v:url']::text").extract()

            item["eventPrice"] = listing.css('tr:nth-child(2) td.small-txt.dkgrey-txt.rightInfoTD::text').extract()

            item["eventDate"] = listing.css('span[content*="201"]::attr(content)').extract()

            yield item

Edit 2

I am not sure the issue is simply related to the file viewer I am using. When I open my first scrape in a text editor, an apostrophe is formatted as â€™ whereas in my second scrape, the same apostrophe (from the same text string) is formatted as —È.

When scraping a website using Scrapy, how do I make sure all characters scrape properly?

Answers (1)

Related Questions