robinma
robinma

Reputation: 43

scrapy Redirect 302

i am just crawl to a websit.but redirecting anthor page. in spider i added

handle_httpstatus_list = [302,301]

and overwrite the start_requests method. but problem is

AttributeError: 'Response' object has no attribute 'xpath'

spider code:

# -*- coding=utf-8 -*-
from __future__ import absolute_import
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider,Rule,Spider
from car.items import Car58Item
import scrapy
import time

class Car51Spider (CrawlSpider):
    name = 'car51'
    allowed_domains = ['51auto.com']
    start_urls = ['http://www.51auto.com/quanguo/pabmdcigf?searchtype=searcarlist&curentPage=1&isNewsCar=0&isSaleCar=0&isQa=0&orderValue=record_time']
    rules = [Rule(LinkExtractor(allow=('/pabmdcigf?searchtype=searcarlist&curentPage=\d+\&isNewsCar\=0\&isSaleCar\=0\&isQa\=0\&orderValue\=record_time')),callback='parse_item',follow=True)] #//页面读取策略
    handle_httpstatus_list = [302,301]
    items = {}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, dont_filter=True, callback=self.parse_item)

    def parse_item(self,response):
        trs = response.xpath("//div[@class='view-grid-overflow']/a").extract()
        for tr in trs:
            sales_1 = u''
            item = Car58Item()
            urls = tr.xpath("a/@href").extract_first()
            item['url'] = tr.xpath("a/@href").extract_first()
            item['tip'] = tr.xpath("a/ul/li[@class='title']/text()").extract_first()
            item['name'] = tr.xpath("a/ul/li[@class='title']/text()").extract_first()
            sales_times = tr.xpath("a/ul/li[@class='info']/span/text()").extract()
            for x in sales_times:
                sales_1 = sales_1 + x
            item['sales_time'] = sales_1
            item['region'] = tr.xpath("a/ul/li[@class='info']/span[@class='font-color-red']/text()").extract_first()
            item['amt'] = tr.xpath("a/ul/li[@class='price']/div[1]/text()").extract_first()
            yield scrapy.Request(url=urls,callback=self.parse_netsted_item,meta={'item':item})

    def parse_netsted_item(self,response):
        dh = u''
        dha = u''

        mode = response.xpath("//body")
        item = Car58Item(response.meta['item'])


        dhs = mode.xpath("//div[@id='contact-tel1']/p/text()").extract()
        for x in dhs:
            dh = dh + x
        item['lianxiren_dh'] = dh

        lianxiren = mode.xpath("//div[@class='section-contact']/text()").extract()
        item['lianxiren'] = lianxiren[1]
        item['lianxiren_dz'] = lianxiren[2]


        item['details'] = mode.xpath("//div[@id='car-dangan']").extract()
        desc = mode.xpath("//div[@class='car-detail-container']/p/text()").extract()
        for d in desc:
            dha = dha + d
        item['description'] = dha

        item['image_urls'] = mode.xpath("//div[@class='car-pic']/img/@src").extract()
        item['collection_dt'] = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))

        return item

settting.py

# -*- coding: utf-8 -*-

# Scrapy settings for car project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'car'
SPIDER_MODULES = ['car.spiders.car51']
#NEWSPIDER_MODULE = 'car.spiders.zhaoming'
DEFAULT_ITEM_CLASS = 'car.items.Car58Item'
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1,
                  'car.pipelines.MongoDBPipeline': 300,
                  'car.pipelines.Car58ImagesPipeline': 301
                  }
MONGODB_SERVER ="localhost"
MONGODB_PORT=27017
MONGODB_DB="car"
MONGODB_COLLECTION_CAR="car"
MONGODB_COLLECTION_ZHAOMING="zhaoming"
IMAGES_STORE = "img/"
DOWNLOAD_DELAY = 0.25    # 250 ms of delay
IMAGES_EXPIRES = 90

DOWNLOAD_TIMEOUT=10


LOG_ENABLED=True
LOG_ENCODING='utf-8'
LOG_LEVEL="DEBUG"
LOGSTATS_INTERVAL=5
# LOG_FILE='/tmp/scrapy.log'

CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

scrapy log:

 $scrapy crawl car51
2016-06-14 14:18:38 [scrapy] INFO: Scrapy 1.1.0 started (bot: car)
2016-06-14 14:18:38 [scrapy] INFO: Overridden settings: {'CONCURRENT_REQUESTS_PER_DOMAIN': 16, 'SPIDER_MODULES': ['car.spiders.car51'], 'BOT_NAME': 'car', 'DOWNLOAD_TIMEOUT': 10, 'LOGSTATS_INTERVAL': 5, 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0', 'DEFAULT_ITEM_CLASS': 'car.items.Car58Item', 'DOWNLOAD_DELAY': 0.25}
2016-06-14 14:18:38 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2016-06-14 14:18:38 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-06-14 14:18:38 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-06-14 14:18:38 [py.warnings] WARNING: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/deprecate.py:156: ScrapyDeprecationWarning: `scrapy.contrib.pipeline.images.ImagesPipeline` class is deprecated, use `scrapy.pipelines.images.ImagesPipeline` instead
  ScrapyDeprecationWarning)

2016-06-14 14:18:38 [py.warnings] WARNING: /Users/mayuping/PycharmProjects/car/car/pipelines.py:13: ScrapyDeprecationWarning: Module `scrapy.log` has been deprecated, Scrapy now relies on the builtin Python library for logging. Read the updated logging entry in the documentation to learn more.
  from scrapy import log

2016-06-14 14:18:38 [scrapy] INFO: Enabled item pipelines:
['scrapy.pipelines.images.ImagesPipeline',
 'car.pipelines.MongoDBPipeline',
 'car.pipelines.Car58ImagesPipeline']
2016-06-14 14:18:38 [scrapy] INFO: Spider opened
2016-06-14 14:18:38 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-06-14 14:18:38 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-06-14 14:18:38 [scrapy] DEBUG: Crawled (302) <GET http://www.51auto.com/quanguo/pabmdcigf?searchtype=searcarlist&curentPage=1&isNewsCar=0&isSaleCar=0&isQa=0&orderValue=record_time> (referer: None)
**2016-06-14 14:18:39 [scrapy] ERROR: Spider error processing <GET http://www.51auto.com/quanguo/pabmdcigf?searchtype=searcarlist&curentPage=1&isNewsCar=0&isSaleCar=0&isQa=0&orderValue=record_time> (referer: None)**
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
    for x in result:
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Users/mayuping/PycharmProjects/car/car/spiders/car51.py", line 22, in parse_item
    trs = response.xpath("//div[@class='view-grid-overflow']/a").extract()
AttributeError: 'Response' object has no attribute 'xpath'
2016-06-14 14:18:39 [scrapy] INFO: Closing spider (finished)
2016-06-14 14:18:39 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 351,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 420,
 'downloader/response_count': 1,
 'downloader/response_status_count/302': 1,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2016, 6, 14, 6, 18, 39, 56461),
 'log_count/DEBUG': 2,
 'log_count/ERROR': 1,
 'log_count/INFO': 7,
 'log_count/WARNING': 2,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'spider_exceptions/AttributeError': 1,
 'start_time': datetime.datetime(2016, 6, 14, 6, 18, 38, 437336)}
2016-06-14 14:18:39 [scrapy] INFO: Spider closed (finished)

Upvotes: 0

Views: 1586

Answers (1)

paul trmbrth
paul trmbrth

Reputation: 20748

When you add handle_httpstatus_list = [302,301] you're telling Scrapy to call your callback even for HTTP redirection, instead of letting the framework handle the redirection transparently for you (which is the default).

Some HTTP responses for redirections do NOT have bodies nor content headers, so in those cases, in your callback, Scrapy hands you the response as-is, i.e. a plain Response object, and not an HtmlResponse for which you have .xpath() and .css() shortcuts.

Either you really need to handle HTTP 301 and 302 responses, and you need to write your callback so it tests the status code (response.status), extracting data only in the non-3xx cases,

Or, you let Scrapy handle HTTP redirections for you and you need to remove handle_httpstatus_list in your spider.

Upvotes: 4

Related Questions