Naiky

Reputation: 23

I need help scraping an aspx site

I am currently trying to scrape the main product information (name, price, and image URL) from the different categories of a supermarket, but I am struggling with the page: it seems I can't access a category URL directly, it always redirects me to the home page.

The page I'm trying to scrape is: https://www.veadigital.com.ar/ (this is the main page). But I would like to access the different subcategory pages of the 'Bebidas' category. The URL of a subcategory looks like this: https://www.veadigital.com.ar/Comprar/Home.aspx#_atCategory=false&_atGrilla=true&_id=141446

Only the id changes, but when I run my spider on a subcategory URL, I get the main page as the response. Sorry if I am not being clear enough; any help would be much appreciated.

Here is my spider:

from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from ..items import ProductoGenericoItem


# Spider (not CrawlSpider) is the right base class here: CrawlSpider is only
# needed when using link-extraction rules, and overriding parse() on it is
# explicitly discouraged by Scrapy.
class VeaSpider(Spider):
    name = "vea"

    pos = 1
    # NOTE: everything after '#' is a URL fragment; browsers (and Scrapy)
    # never send it to the server, so the server only ever sees
    # /Comprar/Home.aspx regardless of the id.
    base_url = "https://www.veadigital.com.ar/Comprar/Home.aspx#_atCategory=false&_atGrilla=true&_id={0}"
    c = 0

    cat = [
        141446, # a base de hierbas
        446126, # aguas sin gas
        446127, # aguas con gas
        446128, # aguas saborizadas
        141231, # aperitivos
        141236, # gaseosas cola
    ]

    start_urls = [
        base_url.format(cat[c])
    ]

    def parse(self, response):
        item = ProductoGenericoItem()

        product_info = response.xpath("//li[@class='grilla-producto-container full-layout']").getall()
        for p in product_info:
            sel = Selector(text=p)

            item['repetido'] = False
            item['superMercado'] = 'Vea Argentina'
            item['sucursal'] = 'NO'
            item['marca'] = ''
            item['empresa'] = ''
            item['ean'] = ''
            item['sku'] = ''
            item['idArticulo'] = ''
            item['nombre'] = sel.xpath(
                "normalize-space(/html/body/li/div[2]/div/div[2]/div/div//text())"
            ).get()
            item['descripcion'] = ''
            precio = sel.xpath(
                "normalize-space(/html/body/li/div[2]/div/div[2]/div/div[2]/text())"
            ).get()
            centavos = sel.xpath(
                "normalize-space(/html/body/li/div[2]/div/div[2]/div/div[2]/span/text())"
            ).get()
            # Guard against None so a missing node doesn't raise a TypeError
            item['precio'] = (precio or '') + ',' + (centavos or '')
            item['precioPromocional'] = ''
            item['condicion'] = ''
            item['precioPorMedida'] = sel.xpath(
                "normalize-space(/html/body/li/div[2]/div/div[2]/div/div[3]/text())"
            ).get()
            item['stock'] = ''
            item['categoria'] = 'Bebidas'
            item['subcategoria'] = response.xpath(
                "normalize-space(//div[@class='category-breadcrumbs']/a//text())"
            ).get()
            item['segmento'] = response.xpath(
                "normalize-space(//span[@class='selected']//text())"
            ).get()
            item['imagen'] = sel.xpath(
                "/html/body/li/div[2]/div/div/img[1]/@src"
            ).get()
            item['promocion'] = sel.xpath(
                "normalize-space(/html/body/li/div/div/p)"
            ).get()
            # if 'Oferta' in item['promocion']:
            #     item['precioPromocional'] = item['promocion'].replace('Oferta', '')
            if item['segmento'] != '':
                item['posicionSegmento'] = self.pos
            else:
                item['posicionSubcategoria'] = self.pos

            self.pos += 1

            yield item


        if self.c < len(self.cat) - 1:
            self.c += 1
            self.pos = 1
            yield Request(
                self.base_url.format(self.cat[self.c]),
                callback=self.parse,
            )
        else:
            self.logger.info('finished')

Upvotes: 0

Views: 76

Answers (1)

Albert D. Kallal

Reputation: 49089

Well, you assume that only the URL parameters are used. Session state and internal logic could still exist in the code behind, and the session could contain the referring URL or the page it was called from.

I often have to redirect a page: while I might have some parameters, I also have some session vars set up and previous code that has to run. So if an incoming page is missing those internal session values? Then I redirect, since I need the previous page's code to run to load up information and required values.

In a way, this is not a lot different than desktop code. You might be on a customer page, and then hit "add invoice". Code will run to get and set up things like invoice payment terms and a gazillion other things, and THEN launch the actual form to enter the invoice. Such code patterns carry over to asp.net.

And then there is the simple issue of the referring URL. I have a user feedback page. It is one of the few places in the web site that allows non-logged-in users to enter stuff. But some spam bots were abusing this (they did not have to be logged in to use the feedback page).

So, now the feedback page's code behind checks the referring URL (the URL that launched the page). If the referring URL is not from my web site, then I redirect back to the main page. From the user's point of view, it just seems like the URL you entered did not work. So, often for reasons of security, one will check the referring URL, and if the page was not launched by the web site, then we know to reject the request.
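The kind of check described here can be sketched in a few lines (Python here rather than the actual asp.net code behind, just to illustrate the idea; the host name is made up):

```python
from urllib.parse import urlparse

SITE_HOST = "www.example.com"  # hypothetical host standing in for the real site

def referrer_is_trusted(referer_header):
    """Return True only if the Referer header points back at our own site.

    A missing or foreign Referer means the page was entered directly
    (or hit by a scraper), so the server would redirect to the home page.
    """
    if not referer_header:
        return False
    return urlparse(referer_header).netloc == SITE_HOST

# A request launched by clicking a link on the site passes:
assert referrer_is_trusted("https://www.example.com/Comprar/Home.aspx")
# A direct hit, or a scraper sending no Referer, is bounced:
assert not referrer_is_trusted(None)
assert not referrer_is_trusted("https://scraper.example.net/")
```

Note that the Referer header is supplied by the client, so this is a speed bump against casual scraping, not real security.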

This means that many of my URLs ONLY work if they were launched from my web site. If you try to enter the URL directly, or from a web scraper? Then the referring URL is not from my site anymore.

I thus redirect to previous pages to ensure that all kinds of setup code and values are correct before you actually get to the page in question.

I mean, for display of a project page? Well, the user has to search, and then find the project. Clicking on that project row will set up QUITE A LOT of stuff before we jump to the project viewing page and THEN display that one project.

In this case I use quite a few Session() variables as opposed to parameters in the URL. But it doesn't matter: the simple fact is I need a whole lot of things set up JUST right before you jump to that project URL. If you type in the project URL directly, I jump you back to the project selection page, since I need all that info set up before the page loads.

And often a mix of parameters and Session() is used, so JUST parameters in the URL will not work in a lot of cases. Really huge, scalable web sites (Amazon, Facebook, etc.) can't afford to use Session(), since it does not scale well on server farms (each web server can't rely on, say, in-memory sessions).

However, for a smaller web site? Then developers are far more free to use Session() to set up a page (internal values in code), and thus more often freely do so. The extra load and server requirements of keeping usable state outside of URL parameters can be accepted (and thus often are).

So, tons and tons of asp.net applications don't JUST use parameters in URLs. This is especially the case if they allow logged-on users: the code behind will hold values and information restricted to the one given user, so both URL parameters and internal Session() variables are required for the web site to work correctly.

So the smaller the site, and the less it is a huge scalable farm type of web site? The more freely developers use internal Session() values. This lets them write more complex business code with less effort (and not clutter up the URL with all kinds of ugly junk, I might add).

The other issue? In many cases, while parameters are used in the URL, I load up data beforehand, and thus ONLY parameters in the range for the one logged-on user will work. If I did not do this, then you could enter an ID or some parameter value that belongs to other users: a big security hole. In the early days, I recall one credit card company used your ID in the URL; if you typed in another ID, you could look at other people's credit card information! So this approach is less used, but MORE important, it means that often just parameters in the URL are not sufficient anymore.
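The fix for that hole is to check every incoming ID against what the session says the logged-on user may see. A minimal sketch, with a hypothetical session dict standing in for server-side session state:

```python
def can_view(session, requested_id):
    """Allow a record ID from the URL only if the session lists it.

    The server records the user's allowed IDs at login/search time,
    so typing someone else's ID into the URL simply gets rejected.
    """
    return requested_id in session.get("allowed_ids", set())

session = {"user": "naiky", "allowed_ids": {141446, 446126}}
assert can_view(session, 141446)       # own record: allowed
assert not can_view(session, 999999)   # someone else's ID: rejected
assert not can_view({}, 141446)        # no session at all: rejected
```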

So often code will simply check the referring URL, and this adds additional security to the web site. So your scrape code will have to launch the main page, and THEN jump to the page with the URL parameters. And it may have to click a button from that main page or the page before, since the code checks the referring URL, and it has to be from THEIR web site, not a URL typed in by you (or your scraper). You can't start at the page from scratch without having come from one of their web pages: the referring URL is checked.
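On the scraper side, that means fetching the main page first and presenting it as the Referer on the follow-up request. A stdlib sketch of building such a request (no network involved, just the header; the query-string form of the subcategory URL is hypothetical, since everything after `#` in the original URL is a fragment the browser never sends to the server; in Scrapy you would pass the same header via `Request(url, headers={...})`):

```python
from urllib.request import Request

MAIN_PAGE = "https://www.veadigital.com.ar/"
# Hypothetical query-string form of the subcategory URL:
SUBCAT = "https://www.veadigital.com.ar/Comprar/Home.aspx?_id=141446"

# Step 1 would be fetching MAIN_PAGE (to pick up cookies / session state);
# step 2 requests the subcategory while naming the main page as referrer.
req = Request(SUBCAT, headers={"Referer": MAIN_PAGE})

assert req.get_header("Referer") == MAIN_PAGE
```

Note this only covers the Referer part; if the site also relies on session state, the scraper additionally needs to carry the cookies from step 1 into step 2 (Scrapy does this automatically within one crawl).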

In your example, even if the 2nd page with parameters did work without having to hit the main page first? You still need a means to obtain the correct parameters, and I don't see how it is practical to guess or make up those parameters in the first place.

Upvotes: 1
