Reputation: 407
I am scraping data from a URL (product name, price, etc.), but there is a JSON file in the backend which I also want to scrape, as it has relevant information. In a nutshell, I want to change my request URL to the JSON one and then return to the original URL so the crawling can continue.
It would be very important to have the availability data in a variable before the actual yield as I have to return to the original url before checking the colors at the end of the code:
import scrapy
import re
class HMSpider(scrapy.Spider):
    """Spider that walks the listing page in ``start_urls`` and yields discounted products."""

    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Schedule a ``parse_item`` request for every product link on the listing page."""
        for href in response.css('h3.item-heading a::attr(href)').getall():
            # urljoin resolves the site-relative href against the page URL.
            yield scrapy.Request(response.urljoin(href), self.parse_item)

    def parse_item(self, response, request=None):
        """Extract one product page and follow its colour variants.

        Yields an item dict when the current price differs from the original
        price, then yields a request for each colour-variant link when the
        page lists more than one.

        ``request`` is unused. It defaults to ``None`` because Scrapy invokes
        callbacks with the response only; a required second positional
        parameter would make every callback fail with ``TypeError``.
        """
        page_source_data = response.xpath(
            '//div[@class= "tealiumProductviewtag productview parbase"]//text()'
        )[0].get()
        data = response.css('div.product.parbase script::text').get()

        base_product_code = re.findall(r"ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall(r"articleCode':'(.*)', 'baseProductCode", data)[0]

        # Raw strings keep ``\[`` and ``\d`` as regex escapes rather than
        # (invalid) Python string escapes. The first run of digits in the
        # matched list is taken as the price.
        current_price = int(re.findall(
            r'\d+',
            re.findall(r'product_list_price : \["(.*?)\],', page_source_data)[0],
        )[0])
        original_price = int(re.findall(
            r'\d+',
            re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0],
        )[0])

        # Built here but not requested yet; these are the JSON endpoints the
        # spider is meant to query next.
        availability_url = (
            'https://www2.hm.com/hmwebservices/service/product/hu/availability/'
            + base_product_code + ".json"
        )
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"

        if current_price != original_price:
            headline = response.css(
                'section.name-price h1.primary.product-item-headline::text'
            ).get()
            yield {
                # Everything after the first space of the headline text.
                'product_name': re.findall(r'(?<= ).*$', headline)[0],
                'vendor': 'H&M',
                'current_price': current_price,
                'original_price': original_price,
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # TODO: rating
                # TODO: reviews
            }

        # Follow the colour variants only when the slider lists more than one.
        colour_links = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(colour_links) > 1:
            for href in colour_links:
                yield scrapy.Request(response.urljoin(href), self.parse_item)
So to sum up: I want to change the scraped url from https://www2.hm.com/hu_hu/productpage.0906822002.html to https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json then return back to https://www2.hm.com/hu_hu/productpage.0906822002.html so my scraper can continue work.
Upvotes: 0
Views: 231
Reputation: 3720
You can do something like this: if you make the JSON request after extracting all the item data, you don't have to return to the original function. (The requests for the colour variations will still be created, since we are yielding requests rather than returning.) See if this works for you:
import json
import scrapy
import re
class HMSpider(scrapy.Spider):
    """Spider that yields discounted products after fetching their availability JSON."""

    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Schedule a ``parse_item`` request for every product link on the listing page."""
        for href in response.css('h3.item-heading a::attr(href)').getall():
            # urljoin resolves the site-relative href against the page URL.
            yield scrapy.Request(response.urljoin(href), self.parse_item)

    def parse_item(self, response, request=None):
        """Extract one product page, request its availability JSON, follow colour variants.

        For a discounted product the partially built item is attached to a
        request for the availability endpoint via ``meta`` and completed in
        ``parse_availability``. Because requests are yielded rather than
        returned, the colour-variant requests below are still produced.

        ``request`` is unused. It defaults to ``None`` because Scrapy invokes
        callbacks with the response only; a required second positional
        parameter would make every callback fail with ``TypeError``.
        """
        page_source_data = response.xpath(
            '//div[@class= "tealiumProductviewtag productview parbase"]//text()'
        )[0].get()
        data = response.css('div.product.parbase script::text').get()

        base_product_code = re.findall(r"ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall(r"articleCode':'(.*)', 'baseProductCode", data)[0]

        # Raw strings keep ``\[`` and ``\d`` as regex escapes rather than
        # (invalid) Python string escapes. The first run of digits in the
        # matched list is taken as the price.
        current_price = int(re.findall(
            r'\d+',
            re.findall(r'product_list_price : \["(.*?)\],', page_source_data)[0],
        )[0])
        original_price = int(re.findall(
            r'\d+',
            re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0],
        )[0])

        availability_url = (
            'https://www2.hm.com/hmwebservices/service/product/hu/availability/'
            + base_product_code + ".json"
        )
        # Built but not requested yet.
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"

        if current_price != original_price:
            headline = response.css(
                'section.name-price h1.primary.product-item-headline::text'
            ).get()
            item = {
                # Everything after the first space of the headline text.
                'product_name': re.findall(r'(?<= ).*$', headline)[0],
                'vendor': 'H&M',
                'current_price': current_price,
                'original_price': original_price,
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # TODO: rating
                # TODO: reviews
            }
            # The request is issued inside the discount branch because ``item``
            # only exists here; the URL is always a non-empty string, so no
            # separate truthiness check is needed.
            yield scrapy.Request(
                url=availability_url,
                callback=self.parse_availability,
                meta={'item': item},
            )

        # Follow the colour variants only when the slider lists more than one.
        colour_links = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(colour_links) > 1:
            for href in colour_links:
                yield scrapy.Request(response.urljoin(href), self.parse_item)

    def parse_availability(self, response):
        """Decode the availability JSON and yield the item carried in ``response.meta``."""
        item = response.meta.get('item')
        json_data = json.loads(response.body)
        # Placeholder: copy the needed fields from ``json_data`` into ``item`` here.
        yield item
Upvotes: 1