Reputation: 73
I write my Scrapy spider which should handle some site with AJAX. In theory it should work OK and moreover it works OK when I use it manually with fetch() in Scrapy shell, but when I run "scrapy crawl ..." I don`t see any POST requests in log and no items are scraped. What can it be and what is the source of the problem?
import scrapy
from scrapy import Request, FormRequest
import json
class ExpertSpider(scrapy.Spider):
name = "expert"
allowed_domains = ["expert.fi"]
start_urls = (
'http://www.expert.fi/',
)
def parse(self, response):
categories = response.xpath('//div[@id="categories-navigation"]//a/@href').extract()
for cat in categories:
yield Request(response.urljoin(cat), callback=self.parseCat)
def parseCat(self, response):
catMenu = response.xpath('//div[@id="category-left-menu"]')
if catMenu:
subCats = catMenu.xpath('.//a[@class="category"]/@href').extract()
for subCat in subCats:
yield Request(response.urljoin(subCat), callback=self.parseCat)
else:
self.parseProdPage(response)
print "I`ve reached this point" # debug
def parseProdPage(self, response):
catId = response.css...
url = 'https://www.expert.fi/Umbraco/Api/Product/ProductsByCategory'
data = dict()
...
jsonDict = json.dumps(data)
heads = dict()
heads['Content-Type'] = 'application/json;charset=utf-8'
heads['Content-Length'] = len(jsonDict)
heads['Accept'] = 'application/json, text/plain, */*'
heads['Referer'] = response.url
return Request(url=url, method="POST", body=jsonDict, headers=heads, callback=self.startItemProc)
def startItemProc(self, response):
resDict = json.loads(response.body)
item = dict()
for it in resDict['Products']:
# Product data
...
item['Category Path'] = it['Breadcrumb'][-1]['Name'] + ''.join([' > ' + crumb['Name']
for crumb in it['Breadcrumb'][-2::-1]])
# Make the new request for delivery price
url = 'https://www.expert.fi/Umbraco/Api/Cart/GetFreightOptionsForProduct'
data = dict()
...
jsonDict = json.dumps(data)
heads = dict()
heads['Content-Type'] = 'application/json;charset=utf-8'
heads['Content-Length'] = len(jsonDict)
heads['Accept'] = 'application/json, text/plain, */*'
heads['Referer'] = item['Product URL']
req = Request(url=url, method="POST", body=jsonDict, headers=heads, callback=self.finishItemProc)
req.meta['item'] = item
yield req
def finishItemProc(self, response):
item = response.meta['item']
ansList = json.loads(response.body)
for delivery in ansList:
if delivery['Name'] == ...
item['Delivery price'] = delivery['Price']
return item
The log is:
2016-10-09 01:11:16 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 9,
'downloader/exception_type_count/twisted.internet.error.DNSLookupError': 1,
'downloader/exception_type_count/twisted.internet.error.TimeoutError': 8,
'downloader/request_bytes': 106652,
'downloader/request_count': 263,
'downloader/request_method_count/GET': 263,
'downloader/response_bytes': 5644786,
'downloader/response_count': 254,
'downloader/response_status_count/200': 252,
'downloader/response_status_count/301': 1,
'downloader/response_status_count/302': 1,
'dupefilter/filtered': 19,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 8, 22, 11, 16, 949472),
'log_count/DEBUG': 265,
'log_count/INFO': 11,
'request_depth_max': 3,
'response_received_count': 252,
'scheduler/dequeued': 263,
'scheduler/dequeued/memory': 263,
'scheduler/enqueued': 263,
'scheduler/enqueued/memory': 263,
'start_time': datetime.datetime(2016, 10, 8, 22, 7, 7, 811163)}
2016-10-09 01:11:16 [scrapy] INFO: Spider closed (finished)
Upvotes: 0
Views: 656
Reputation: 911
The request returned by the parseProdPage
method is not used inside the parseCat
method. You should start by yielding that: yield self.parseProdPage(response)
Also, you probably want to set dont_filter=True
in that same request, otherwise most of them will be filtered out (because all of them have the same URL).
Upvotes: 1