Scrapy not collecting emails properly

Question

I'm using Scrapy to collect some data, and everything works fine except the email extraction part. For some reason the email column in the .csv file is either blank or contains only a few of the emails. I've tried limiting download_delay and CLOSESPIDER_ITEMCOUNT, but that didn't help. Any help is much appreciated.

import re
import scrapy


class DmozItem(scrapy.Item):
    """Container for the data the spider scrapes for a single posting."""
    # URL of the posting page (assigned from response.url in parse_attr).
    link = scrapy.Field()
    # Joined text of the 'anonemail' div on the reply page (assigned in
    # parse_contact); this is the field that carries the contact e-mail.
    attr = scrapy.Field()
    # Joined text nodes of the posting's 'postingtitletext' span.
    title = scrapy.Field()
    # First bold value found inside the posting's 'attrgroup' paragraph.
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    """Scrape postings from a Craigslist search page, including contact e-mail.

    The crawl has three steps: the search results page (``parse``), each
    posting page (``parse_attr``) and the posting's reply page
    (``parse_contact``). The item is built on the posting page, carried to
    the reply request through ``meta`` and emitted once the reply page has
    been parsed.
    """

    name = "dmoz"
    allowed_domains = ["hanford.craigslist.org"]
    start_urls = [
        "http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
    ]

    BASE_URL = 'http://hanford.craigslist.org/'

    def parse(self, response):
        """Yield one request per posting link found on the search page."""
        for href in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            # Resolve the href against the page URL instead of concatenating
            # it to BASE_URL: BASE_URL already ends with "/", so an href that
            # starts with "/" produced a doubled slash, and an absolute href
            # produced a malformed URL.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_attr)

    def parse_attr(self, response):
        """Build the item from a posting page and request its reply page.

        Returns ``None`` (nothing is scraped) when the posting id cannot be
        read from the page URL.
        """
        match = re.search(r"(\w+)\.html", response.url)
        if not match:
            return None

        item_id = match.group(1)
        # NOTE(review): the "sdo" segment of the reply path is hard-coded;
        # confirm it is the correct segment for this site's reply endpoint.
        url = self.BASE_URL + "reply/sdo/cto/" + item_id

        item = DmozItem()
        item["link"] = response.url
        item["title"] = "".join(
            response.xpath("//span[@class='postingtitletext']//text()").extract()
        )
        # A posting without an attrgroup value used to raise IndexError here
        # and lose the whole item; fall back to an empty tag instead.
        tags = response.xpath("//p[@class='attrgroup']/span/b/text()").extract()
        item["tag"] = tags[0] if tags else ""
        return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        """Store the reply page's 'anonemail' text on the item and emit it."""
        item = response.meta['item']
        item["attr"] = "".join(
            response.xpath("//div[@class='anonemail']//text()").extract()
        )
        return item

Scrapy not collecting emails properly

Answers (1)

Related Questions