How to create separate variables in Scrapy for multiple URLs?

I want to scrape data from multiple URLs and retrieve all the information, but I can only scrape one URL; with more than one URL I get an error (list index out of range). I was told to separate the yield into several different variables. What should the syntax for that look like?

import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            # 'https://jdih.kaltimprov.go.id/produk_hukum/detail/9ef7f994-9db4'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        yield {
            'Kategori': response.xpath('//*[@class="text-left"]/text()')[0].extract(),
            'Nomor': response.xpath('//*[@class="text-left"]/text()')[1].extract(),
            'Judul': response.xpath('//*[@class="text-left"]/text()')[2].extract().strip(),
            'Tanggal Diterapkan': response.xpath('//*[@class="text-left"]/text()')[3].extract(),
            'Tanggal Diundangkan': response.xpath('//*[@class="text-left"]/text()')[4].extract(),
            'Keterangan Status': response.xpath('//*[@class="text-left"]/p/text()')[0].extract(),
            'Statistik View': response.xpath('//*[@class="text-left"]/text()')[5].extract(),
            'Statistik Download': response.xpath('//*[@class="text-left"]/text()')[6].extract(),
            'Katalog': response.xpath('//*[@class="text-left"]/p/span/text()').extract(),  # text() was missing its parentheses
            'Abstraksi': response.xpath('//*[@class="text-left"]/p/text()')[1].extract(),
            'Lampiran': response.css('body > section > div > div > div > div.row > div.col-3 > a::attr(href)').extract()
        }

and the error is:

File "C:\Users\Prihantoro Tri N\OneDrive\Documents\file toro\MSIB\Magang\Hukumonline\Project\list_url\test1.py", line 28, in parse
    'kategori' : response.css('body > section > div > div > div > div.row > div.col-9 > table > tr > td::text')[0].extract(),
  File "C:\Users\Prihantoro Tri N\AppData\Local\Programs\Python\Python310\lib\site-packages\parsel\selector.py", line 70, in __getitem__
    o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
2022-03-24 15:11:01 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 15:11:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 380,
 'downloader/request_count': 1,
 'downloader/request_method_count/GET': 1,
 'downloader/response_bytes': 1761,
 'downloader/response_count': 1,
 'downloader/response_status_count/200': 1,
 'elapsed_time_seconds': 0.888989,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 3, 24, 8, 11, 1, 373605),
 'httpcompression/response_bytes': 1606,
 'httpcompression/response_count': 1,
 'log_count/DEBUG': 10,
 'log_count/ERROR': 1,
 'log_count/INFO': 10,
 'response_received_count': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'spider_exceptions/IndexError': 1,
 'start_time': datetime.datetime(2022, 3, 24, 8, 11, 0, 484616)}
2022-03-24 15:11:01 [scrapy.core.engine] INFO: Spider closed (finished)


Answers (1)

Geomario

Are you trying to use pagination with that spider? If so, check the code snippet below; it is taken from the official Scrapy tutorial in the documentation.

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'https://quotes.toscrape.com/page/1/',
            'https://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            # Each URL is scheduled independently; parse() runs once per response.
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Derive a filename from the page number in the URL and save the raw HTML.
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
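Note that the tutorial snippet above only saves each response to an HTML file; it does not address the IndexError directly. That error is raised when one of the indexed selector lists (e.g. [...][5]) is empty on a given page, so a single page with a different layout kills the callback. A more defensive version of the question's parse method might look like the sketch below (assuming the same page structure as in the question; the small field() helper is purely illustrative):

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            # add your detail-page URLs here
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Extract each selector list once as a plain list of strings.
        cells = response.xpath('//*[@class="text-left"]/text()').getall()
        paragraphs = response.xpath('//*[@class="text-left"]/p/text()').getall()

        def field(values, i):
            # Return the i-th value stripped, or None if the list is too
            # short, so a sparse page no longer raises IndexError.
            return values[i].strip() if len(values) > i else None

        yield {
            'Kategori': field(cells, 0),
            'Nomor': field(cells, 1),
            'Judul': field(cells, 2),
            'Tanggal Diterapkan': field(cells, 3),
            'Tanggal Diundangkan': field(cells, 4),
            'Keterangan Status': field(paragraphs, 0),
            'Statistik View': field(cells, 5),
            'Statistik Download': field(cells, 6),
            'Katalog': response.xpath('//*[@class="text-left"]/p/span/text()').getall(),
            'Abstraksi': field(paragraphs, 1),
            'Lampiran': response.css('div.col-3 > a::attr(href)').getall(),
        }

With this approach a page that lacks a field yields None for that key instead of aborting the whole callback, so every URL in start_requests produces an item.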

