Change request.url in Scrapy to crawl a json file then return to the original request

Question

I am scraping data from a URL (product name, price, etc.), but there is a JSON file in the backend which I also want to scrape, as it has relevant information. In a nutshell, I want to change my request URL to the JSON one and then return to the original URL so the crawling can continue.

Product URL: https://www2.hm.com/hu_hu/productpage.0906822002.html
Related JSON URL (this can be found in the browser's Network tab, and I store it in a variable named availability_url): https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json

It would be very important to have the availability data in a variable before the actual yield as I have to return to the original url before checking the colors at the end of the code:

import scrapy
import re

class HMSpider(scrapy.Spider):
    """Spider that walks the listing page in ``start_urls`` and emits discounted products."""

    name = 'hm'

    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Schedule one product-page request per product link on the listing page.

        Args:
            response: Response for a listing page from ``start_urls``.

        Yields:
            A ``scrapy.Request`` per product link, handled by ``parse_item``.
        """
        for href in response.css('h3.item-heading  a::attr(href)').getall():
            # urljoin resolves the site-relative href against the page URL.
            yield scrapy.Request(response.urljoin(href), self.parse_item)

    def parse_item(self, response, request=None):
        """Extract one product page and emit it when it is discounted.

        Args:
            response: Response for a product page.
            request: Unused. Scrapy invokes callbacks with the response only,
                so a required second positional argument would raise
                ``TypeError``; it is kept optional for callers that pass it.

        Yields:
            The product dict when the current price differs from the original
            price, followed by one request per color-variant link when the
            page lists more than one.

        Raises:
            IndexError: If one of the expected page fragments is not found.
        """
        page_source_data = response.xpath(
            '//div[@class= "tealiumProductviewtag productview parbase"]//text()'
        )[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall("ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall("articleCode':'(.*)', 'baseProductCode", data)[0]
        # The prices are embedded as bracketed lists in the page text; the
        # brackets must be escaped to be matched literally.
        current_price = int(re.findall(
            r'\d+',
            re.findall(r'product_list_price :  \["(.*?)\],', page_source_data)[0],
        )[0])
        original_price = int(re.findall(
            r'\d+',
            re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0],
        )[0])
        # Built here but not requested yet: fetching these is the open question.
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"

        # Only discounted products are of interest.
        if current_price == original_price:
            return

        headline = response.css('section.name-price h1.primary.product-item-headline::text').get()
        yield {
            'product_name': re.findall('(?<=  ).*$', headline)[0],
            'vendor': 'H&M',
            'current_price': current_price,
            'original_price': original_price,
            'discount_percent': 100 - round((current_price / original_price) * 100),
            'colors': response.css('li.list-item a::attr(title)').getall(),
            'link': response.request.url,
            #rating
            #reviews
        }

        # Follow the color variants only when the page lists more than one.
        variant_links = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(variant_links) > 1:
            for href in variant_links:
                yield scrapy.Request(response.urljoin(href), self.parse_item)

So to sum up: I want to change the scraped URL from https://www2.hm.com/hu_hu/productpage.0906822002.html to https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json, then return to https://www2.hm.com/hu_hu/productpage.0906822002.html so my scraper can continue its work.

Felix Eklöf · Accepted Answer

You can do something like this: if you issue the JSON request after extracting all the item data, you don't have to return to the original function. (The colour-variation requests will still be created, since we are yielding requests rather than returning.) See if this works for you:

import json
import scrapy
import re

class HMSpider(scrapy.Spider):
    """Spider that emits discounted products after fetching their availability JSON."""

    name = 'hm'

    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Schedule one product-page request per product link on the listing page.

        Args:
            response: Response for a listing page from ``start_urls``.

        Yields:
            A ``scrapy.Request`` per product link, handled by ``parse_item``.
        """
        for href in response.css('h3.item-heading  a::attr(href)').getall():
            # urljoin resolves the site-relative href against the page URL.
            yield scrapy.Request(response.urljoin(href), self.parse_item)

    def parse_item(self, response, request=None):
        """Extract one product page and hand the item to the availability request.

        The item is not yielded here. It travels in the ``meta`` of the
        availability request and is emitted by ``parse_availability``.

        Args:
            response: Response for a product page.
            request: Unused. Scrapy invokes callbacks with the response only,
                so a required second positional argument would raise
                ``TypeError``; it is kept optional for callers that pass it.

        Yields:
            For a discounted product, the availability request carrying the
            item, followed by one request per color-variant link when the
            page lists more than one.

        Raises:
            IndexError: If one of the expected page fragments is not found.
        """
        page_source_data = response.xpath(
            '//div[@class= "tealiumProductviewtag productview parbase"]//text()'
        )[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall("ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall("articleCode':'(.*)', 'baseProductCode", data)[0]
        # The prices are embedded as bracketed lists in the page text; the
        # brackets must be escaped to be matched literally.
        current_price = int(re.findall(
            r'\d+',
            re.findall(r'product_list_price :  \["(.*?)\],', page_source_data)[0],
        )[0])
        original_price = int(re.findall(
            r'\d+',
            re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0],
        )[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        # Built but not requested in this snippet.
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"

        # Only discounted products are of interest.
        if current_price == original_price:
            return

        headline = response.css('section.name-price h1.primary.product-item-headline::text').get()
        item = {
            'product_name': re.findall('(?<=  ).*$', headline)[0],
            'vendor': 'H&M',
            'current_price': current_price,
            'original_price': original_price,
            'discount_percent': 100 - round((current_price / original_price) * 100),
            'colors': response.css('li.list-item a::attr(title)').getall(),
            'link': response.request.url,
            #rating
            #reviews
        }

        # The URL is keyed on the base product code, which color variants
        # presumably share. Without dont_filter, Scrapy's duplicate filter
        # would drop the repeated request and the item riding in its meta
        # would never be emitted.
        yield scrapy.Request(
            url=availability_url,
            callback=self.parse_availability,
            meta={'item': item},
            dont_filter=True,
        )

        # Follow the color variants only when the page lists more than one.
        variant_links = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(variant_links) > 1:
            for href in variant_links:
                yield scrapy.Request(response.urljoin(href), self.parse_item)

    def parse_availability(self, response):
        """Decode the availability JSON and emit the item carried in the request meta.

        Args:
            response: Response for the availability JSON URL; its ``meta``
                holds the item built by ``parse_item`` under ``'item'``.

        Yields:
            The item taken from ``response.meta``.
        """
        item = response.meta.get('item')
        json_data = json.loads(response.body)
        #do something with json data here and add it to item
        yield item

Change request.url in Scrapy to crawl a json file then return to the original request

Answers (1)

Related Questions