Reputation: 403
I'm using Scrapy to collect some data and everything works fine except the email extraction part. For some reason email row in .csv file is blank or there is only a few emails extracted. I've tried limiting download_delay and CLOSESPIDER_ITEMCOUNT but it's not working. Any help is much appreciated.
import re
import scrapy
class DmozItem(scrapy.Item):
# define the fields for your item here like:
link = scrapy.Field()
attr = scrapy.Field()
title = scrapy.Field()
tag = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["hanford.craigslist.org"]
start_urls = [
"http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
]
BASE_URL = 'http://hanford.craigslist.org/'
def parse(self, response):
links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
for link in links:
absolute_url = self.BASE_URL + link
yield scrapy.Request(absolute_url, callback=self.parse_attr)
def parse_attr(self, response):
match = re.search(r"(\w+)\.html", response.url)
if match:
item_id = match.group(1)
url = self.BASE_URL + "reply/sdo/cto/" + item_id
item = DmozItem()
item["link"] = response.url
item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)
def parse_contact(self, response):
item = response.meta['item']
item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
return item
Upvotes: 1
Views: 992
Reputation: 474001
First of all, a quote from Terms of Use as a warning:
USE. You agree not to use or provide software (except for general purpose web browsers and email clients, or software expressly licensed by us) or services that interact or interoperate with CL, e.g. for downloading, uploading, posting, flagging, emailing, search, or mobile use. Robots, spiders, scripts, scrapers, crawlers, etc. are prohibited, as are misleading, unsolicited, unlawful, and/or spam postings/email. You agree not to collect users' personal and/or contact information ("PI").
Several things to fix here:
reply/hnf/cto/
instead of reply/sdo/cto/
User-Agent
and X-Requested-With
headersThe complete code that works for me:
import re
from urlparse import urljoin
import scrapy
class DmozItem(scrapy.Item):
# define the fields for your item here like:
link = scrapy.Field()
attr = scrapy.Field()
title = scrapy.Field()
tag = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["hanford.craigslist.org"]
start_urls = [
"http://hanford.craigslist.org/search/cto?min_auto_year=1980&min_price=3000"
]
BASE_URL = 'http://hanford.craigslist.org/'
def parse(self, response):
links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
for link in links:
absolute_url = urljoin(self.BASE_URL, link)
yield scrapy.Request(absolute_url,
callback=self.parse_attr)
def parse_attr(self, response):
match = re.search(r"(\w+)\.html", response.url)
if match:
item_id = match.group(1)
url = urljoin(self.BASE_URL, "reply/hnf/cto/" + item_id)
item = DmozItem()
item["link"] = response.url
item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
return scrapy.Request(url,
meta={'item': item},
callback=self.parse_contact,
headers={"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36"})
def parse_contact(self, response):
item = response.meta['item']
item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
return item
Upvotes: 1