vchslv13

Reputation: 73

Scrapy doesn't make POST requests

I'm writing a Scrapy spider that should handle a site that uses AJAX. In theory it should work fine, and indeed it works fine when I drive it manually with fetch() in the Scrapy shell, but when I run "scrapy crawl ..." I don't see any POST requests in the log and no items are scraped. What could be the cause of the problem?
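The manual check in the shell looked roughly like this (a sketch; jsonDict and heads stand for the same JSON payload and headers that parseProdPage builds below):

    $ scrapy shell 'http://www.expert.fi/'
    >>> from scrapy import Request
    >>> req = Request(url='https://www.expert.fi/Umbraco/Api/Product/ProductsByCategory',
    ...               method='POST', body=jsonDict, headers=heads)
    >>> fetch(req)  # the POST goes through and the product JSON comes back

The spider: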

    import scrapy
    from scrapy import Request, FormRequest
    import json


    class ExpertSpider(scrapy.Spider):
        name = "expert"
        allowed_domains = ["expert.fi"]
        start_urls = (
            'http://www.expert.fi/',
        )

        def parse(self, response):
            categories = response.xpath('//div[@id="categories-navigation"]//a/@href').extract()
            for cat in categories:
                yield Request(response.urljoin(cat), callback=self.parseCat)

        def parseCat(self, response):
            catMenu = response.xpath('//div[@id="category-left-menu"]')
            if catMenu:
                subCats = catMenu.xpath('.//a[@class="category"]/@href').extract()
                for subCat in subCats:
                    yield Request(response.urljoin(subCat), callback=self.parseCat)
            else:
                self.parseProdPage(response)
                print("I've reached this point")  # debug

        def parseProdPage(self, response):
            catId = response.css...
            url = 'https://www.expert.fi/Umbraco/Api/Product/ProductsByCategory'

            data = dict()
            ...
            jsonDict = json.dumps(data)

            heads = dict()
            heads['Content-Type'] = 'application/json;charset=utf-8'
            heads['Content-Length'] = len(jsonDict)
            heads['Accept'] = 'application/json, text/plain, */*'
            heads['Referer'] = response.url

            return Request(url=url, method="POST", body=jsonDict, headers=heads, callback=self.startItemProc)

        def startItemProc(self, response):
            resDict = json.loads(response.body)

            item = dict()
            for it in resDict['Products']:
                # Product data
                ...
                item['Category Path'] = it['Breadcrumb'][-1]['Name'] + ''.join([' > ' + crumb['Name']
                                                                                for crumb in it['Breadcrumb'][-2::-1]])
                # Make the new request for delivery price
                url = 'https://www.expert.fi/Umbraco/Api/Cart/GetFreightOptionsForProduct'
                data = dict()
                ...
                jsonDict = json.dumps(data)

                heads = dict()
                heads['Content-Type'] = 'application/json;charset=utf-8'
                heads['Content-Length'] = len(jsonDict)
                heads['Accept'] = 'application/json, text/plain, */*'
                heads['Referer'] = item['Product URL']

                req = Request(url=url, method="POST", body=jsonDict, headers=heads, callback=self.finishItemProc)
                req.meta['item'] = item
                yield req

        def finishItemProc(self, response):
            item = response.meta['item']
            ansList = json.loads(response.body)
            for delivery in ansList:
                if delivery['Name'] == ...:
                    item['Delivery price'] = delivery['Price']
            return item

The log is:

2016-10-09 01:11:16 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 9,
 'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 1,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 8,
 'downloader/request_bytes': 106652,
 'downloader/request_count': 263,
 'downloader/request_method_count/GET': 263,
 'downloader/response_bytes': 5644786,
 'downloader/response_count': 254,
 'downloader/response_status_count/200': 252,
 'downloader/response_status_count/301': 1,
 'downloader/response_status_count/302': 1,
 'dupefilter/filtered': 19,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2016, 10, 8, 22, 11, 16, 949472),
 'log_count/DEBUG': 265,
 'log_count/INFO': 11,
 'request_depth_max': 3,
 'response_received_count': 252,
 'scheduler/dequeued': 263,
 'scheduler/dequeued/memory': 263,
 'scheduler/enqueued': 263,
 'scheduler/enqueued/memory': 263,
 'start_time': datetime.datetime(2016, 10, 8, 22, 7, 7, 811163)}
2016-10-09 01:11:16 [scrapy] INFO: Spider closed (finished)

Upvotes: 0

Views: 656

Answers (1)

elacuesta

Reputation: 911

The request returned by the parseProdPage method is not used inside the parseCat method. You should start by yielding that: yield self.parseProdPage(response)
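A minimal sketch of that change, keeping the question's method names:

    def parseCat(self, response):
        catMenu = response.xpath('//div[@id="category-left-menu"]')
        if catMenu:
            subCats = catMenu.xpath('.//a[@class="category"]/@href').extract()
            for subCat in subCats:
                yield Request(response.urljoin(subCat), callback=self.parseCat)
        else:
            # parseProdPage returns a Request; it has to be yielded,
            # otherwise Scrapy never schedules it
            yield self.parseProdPage(response)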

Also, you probably want to set dont_filter=True in that same request, otherwise most of them will be filtered out (because all of them have the same URL).
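For example, again reusing the question's names, the POST request built in parseProdPage would become:

    return Request(url=url, method="POST", body=jsonDict, headers=heads,
                   callback=self.startItemProc, dont_filter=True)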

Upvotes: 1
