Arkan Kalu
Arkan Kalu

Reputation: 403

Scrapy not collecting emails properly

I'm using Scrapy to collect some data, and everything works fine except the email extraction part. For some reason the email column in the .csv file is blank, or only a few emails are extracted. I've tried limiting download_delay and CLOSESPIDER_ITEMCOUNT, but it's not working. Any help is much appreciated.

import re
import scrapy


class DmozItem(scrapy.Item):
    """Record holding the values DmozSpider scrapes for a single listing."""

    # define the fields for your item here like:
    # URL of the listing page (DmozSpider.parse_attr stores response.url here).
    link = scrapy.Field()
    # Joined text of the reply page's "anonemail" div, set by
    # DmozSpider.parse_contact -- presumably the anonymized contact email.
    attr = scrapy.Field()
    # Joined text found under the listing's "postingtitletext" span.
    title = scrapy.Field()
    # First bold text node inside the listing's "attrgroup" paragraph.
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    """Three-step spider: search results -> listing page -> reply page.

    ``parse`` follows every listing link on the start URL, ``parse_attr``
    fills a ``DmozItem`` from the listing and requests its reply page, and
    ``parse_contact`` adds the reply page's "anonemail" text to the item.
    """

    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]

    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        """Yield one request per ``a.hdrlnk`` link on the search results page."""
        for href in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            # The href is appended to BASE_URL verbatim.
            yield scrapy.Request(self.BASE_URL + href, callback=self.parse_attr)

    def parse_attr(self, response):
        """Build the item from a listing page and request its reply page.

        Returns ``None`` when the listing URL does not end in ``<id>.html``;
        otherwise returns a request whose ``meta['item']`` carries the
        partially filled item on to ``parse_contact``.
        """
        found = re.search(r"(\w+)\.html", response.url)
        if not found:
            return None

        # The reply URL is derived from the id captured out of the listing URL.
        reply_url = self.BASE_URL + "reply/sdo/cto/" + found.group(1)

        title_parts = response.xpath("//span[@class='postingtitletext']//text()").extract()
        # Indexing with [0] raises IndexError if the listing has no such node.
        first_tag = response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0]

        listing = DmozItem()
        listing["link"] = response.url
        listing["title"] = "".join(title_parts)
        listing["tag"] = "".join(first_tag)
        return scrapy.Request(reply_url, callback=self.parse_contact, meta={'item': listing})

    def parse_contact(self, response):
        """Store the reply page's "anonemail" text on the item and return it."""
        listing = response.meta['item']
        email_text = response.xpath("//div[@class='anonemail']//text()").extract()
        listing["attr"] = "".join(email_text)
        return listing

Upvotes: 1

Views: 992

Answers (1)

alecxe
alecxe

Reputation: 474001

First of all, a quote from the Terms of Use as a warning:

USE. You agree not to use or provide software (except for general purpose web browsers and email clients, or software expressly licensed by us) or services that interact or interoperate with CL, e.g. for downloading, uploading, posting, flagging, emailing, search, or mobile use. Robots, spiders, scripts, scrapers, crawlers, etc. are prohibited, as are misleading, unsolicited, unlawful, and/or spam postings/email. You agree not to collect users' personal and/or contact information ("PI").

Several things to fix here:

  • the contact information is under reply/hnf/cto/ instead of reply/sdo/cto/
  • specify User-Agent and X-Requested-With headers

The complete code that works for me:

import re
from urlparse import urljoin

import scrapy


class DmozItem(scrapy.Item):
    """Record holding the values DmozSpider scrapes for a single listing."""

    # define the fields for your item here like:
    # URL of the listing page (DmozSpider.parse_attr stores response.url here).
    link = scrapy.Field()
    # Joined text of the reply page's "anonemail" div, set by
    # DmozSpider.parse_contact -- presumably the anonymized contact email.
    attr = scrapy.Field()
    # Joined text found under the listing's "postingtitletext" span.
    title = scrapy.Field()
    # First bold text node inside the listing's "attrgroup" paragraph.
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    """Crawl a Craigslist search page, each listing, and each listing's reply page.

    Flow: ``parse`` (search results) -> ``parse_attr`` (listing page) ->
    ``parse_contact`` (reply page). The item is handed from ``parse_attr``
    to ``parse_contact`` through ``Request.meta['item']``.
    """

    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]

    BASE_URL = 'http://hanford.craigslist.org/'

    # Headers sent with the reply-page request: a browser User-Agent plus the
    # X-Requested-With marker that flags the request as an XMLHttpRequest.
    REPLY_HEADERS = {
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36",
    }

    def parse(self, response):
        """Yield one request per ``a.hdrlnk`` link on the search results page."""
        for link in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            # urljoin copes with hrefs that already start with "/".
            yield scrapy.Request(urljoin(self.BASE_URL, link),
                                 callback=self.parse_attr)

    def parse_attr(self, response):
        """Fill an item from a listing page and request the listing's reply page.

        Returns ``None`` when the listing URL does not end in ``<id>.html``;
        otherwise returns the reply-page request carrying the item in
        ``meta['item']``.
        """
        match = re.search(r"(\w+)\.html", response.url)
        if not match:
            return None

        item_id = match.group(1)
        url = urljoin(self.BASE_URL, "reply/hnf/cto/" + item_id)

        item = DmozItem()
        item["link"] = response.url
        item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())

        # A listing without attribute tags used to raise IndexError here and
        # lose the whole item; fall back to an empty tag instead.
        tags = response.xpath("//p[@class='attrgroup']/span/b/text()").extract()
        item["tag"] = tags[0] if tags else ""

        return scrapy.Request(url,
                              meta={'item': item},
                              callback=self.parse_contact,
                              # Copy so a request never shares the class-level dict.
                              headers=dict(self.REPLY_HEADERS))

    def parse_contact(self, response):
        """Add the reply page's "anonemail" text to the item and return it."""
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item

Upvotes: 1

Related Questions