Scrapy .css in page using attribute

Question

I am trying to use scrapy on this page: http://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/

But I can't retrieve the image of the product; the selector doesn't find anything. What might I be missing?

I tried selecting by attribute, by ID, and by class, and nothing worked.

import scrapy
from scrapy import Request
import random



class BrickSetSpider(scrapy.Spider):
    """Spider that scrapes key details and image links from a product page.

    For every ``.content-left`` element found in a response, one item is
    yielded containing the page URL, the text fragments of the key-value
    spans, and the ``src`` attributes of the ``img[itemprop="image"]``
    elements.
    """

    name = 'spider'
    USER_AGENT_LIST = [
        'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0',
    ]
    start_urls = [
        'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/',
    ]
    download_delay = 5
    FEED_EXPORT_ENCODING = 'utf-8'
    # Scrapy does not read settings from bare class attributes; per-spider
    # settings have to go through ``custom_settings`` to take effect.
    custom_settings = {'FEED_EXPORT_ENCODING': FEED_EXPORT_ENCODING}

    def start_requests(self):
        """Yield one request per start URL with a randomly chosen User-Agent."""
        for url in self.start_urls:
            headers = {'User-Agent': random.choice(self.USER_AGENT_LIST)}
            yield Request(url, headers=headers)

    def parse(self, response):
        """Yield one item per ``.content-left`` element of *response*.

        Each item is a dict with the keys:

        * ``search`` -- the URL of the response,
        * ``name`` -- list of text runs (split on tabs and newlines) found
          under ``span.keyValue span``,
        * ``link`` -- list of ``src`` values of ``img[itemprop="image"]``.
        """
        # The selectors do not depend on the loop variable, so define them once.
        set_selector = '.content-left'
        name_selector = 'span.keyValue span  ::text'
        image_selector = 'img[itemprop="image"] ::attr(src)'

        for brickset in response.css(set_selector):
            yield {
                'search': response.url,
                # Raw string with escaped \t and \n: a literal tab/newline
                # inside a single-quoted string is a syntax error.
                'name': brickset.css(name_selector).re(r'[^\t\n]+'),
                'link': brickset.css(image_selector).extract(),
            }

dabingsou · Accepted Answer

The image is generated dynamically by JS. Try the following code.

from simplified_scrapy.spider import Spider, SimplifiedDoc
import re
class MySpider(Spider):
  """Spider that extracts large-image URLs and key details from a product page."""

  name = 'rs-online.com'
  # allowed_domains = ['example.com']
  start_urls = [
    'https://it.rs-online.com/web/p/sensori-di-prossimita-induttivi/7858468/'
  ]
  # refresh_urls = True # For debug. If refresh_urls = True, start_urls will be crawled again.

  def extract(self, url, html, models, modelNames):
    """Parse *html* and return the image URLs and key details found in it.

    The image URLs are read from the ``largeImageURL: "..."`` entries in the
    script inside the ``content-left`` element; the key details are read from
    the pairs of spans in each list item of the ``keyDetailsLL`` element.

    Returns a dict with an empty ``"Urls"`` list and a ``"Data"`` list holding
    one record of the form ``{'imgs': [...], 'names': {...}}``.
    """
    doc = SimplifiedDoc(html)
    # print (doc.html)
    div = doc.getElementByClass('content-left')
    # Capture only the quoted value. ``[^"]*`` stops at the first closing
    # quote, whereas a greedy ``.*`` would run to the last quote on the line.
    paths = re.findall(r'largeImageURL: "([^"]*)"', div.script.html)
    # The captured values are prefixed with the scheme to form full URLs.
    imgs = ['https:' + path for path in paths]
    lis = doc.getElementByClass('keyDetailsLL').lis
    names = {}
    for li in lis:
      spans = li.spans
      # First span is used as the label, second span as its value.
      names[spans[0].text] = spans[1].text
    data = [{'imgs': imgs, 'names': names}]
    print (data)
    return {"Urls": [], "Data": data} # Return data to framework

from simplified_scrapy.simplified_main import SimplifiedMain
# Create a MySpider instance and pass it to SimplifiedMain.startThread.
SimplifiedMain.startThread(MySpider()) # Start crawling

Result:

[{'imgs': ['https://media.rs-online.com/t_large/F7858468-01.jpg', 'https://media.rs-online.com/t_large/F7858468-02.jpg'], 'names': {'Codice RS': '785-8468', 'Codice costruttore': 'E2E-S05S12-WC-B1 2M', 'Costruttore': 'Omron'}}]

Scrapy .css in page using attribute

Answers (2)

Related Questions