Reputation: 1
I want to scrape data from multiple URLs and retrieve all the information, but I can only scrape from one URL — with more than one URL I get an error (`list index out of range`). I was told to separate the yield into several different variables. What should the syntax look like?
import scrapy
class QuotesSpider(scrapy.Spider):
    """Scrape legal-document metadata from jdih.kaltimprov.go.id detail pages.

    Pages do not all share an identical layout, so every positional lookup
    is done through a safe helper that returns None instead of raising
    IndexError when a node is missing.
    """

    name = "quotes"

    def start_requests(self):
        urls = [
            # 'https://jdih.kaltimprov.go.id/produk_hukum/detail/9ef7f994-9db4'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    @staticmethod
    def _at(items, index):
        """Return items[index], or None when the list is too short.

        This is what prevents the 'list index out of range' crash on
        pages whose layout yields fewer matching nodes.
        """
        return items[index] if index < len(items) else None

    def parse(self, response):
        # Evaluate each XPath once and reuse the resulting lists,
        # instead of re-running the same query for every field.
        cells = response.xpath('//*[@class="text-left"]/text()').getall()
        paras = response.xpath('//*[@class="text-left"]/p/text()').getall()

        yield {
            'Kategori': self._at(cells, 0),
            'Nomor': self._at(cells, 1),
            # Guard against a missing node before stripping whitespace.
            'Judul': (self._at(cells, 2) or '').strip(),
            'Tanggal Diterapkan': self._at(cells, 3),
            'Tanggal Diundangkan': self._at(cells, 4),
            'Keterangan Status': self._at(paras, 0),
            'Statistik View': self._at(cells, 5),
            'Statistik Download': self._at(cells, 6),
            # NOTE(review): original had 'text' (no parentheses) — kept as-is;
            # presumably 'text()' was intended. Verify against the page markup.
            'Katalog': response.xpath('//*[@class="text-left"]/p/span/text').getall(),
            'Abstraksi': self._at(paras, 1),
            'Lampiran': response.css(
                'body > section > div > div > div > div.row > div.col-3 > a::attr(href)'
            ).getall(),
        }
The error is:
File "C:\Users\Prihantoro Tri N\OneDrive\Documents\file toro\MSIB\Magang\Hukumonline\Project\list_url\test1.py", line 28, in parse
'kategori' : response.css('body > section > div > div > div > div.row > div.col-9 > table > tr > td::text')[0].extract(),
File "C:\Users\Prihantoro Tri N\AppData\Local\Programs\Python\Python310\lib\site-packages\parsel\selector.py", line 70, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
2022-03-24 15:11:01 [scrapy.core.engine] INFO: Closing spider (finished)
2022-03-24 15:11:01 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 380,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 1761,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 0.888989,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 3, 24, 8, 11, 1, 373605),
'httpcompression/response_bytes': 1606,
'httpcompression/response_count': 1,
'log_count/DEBUG': 10,
'log_count/ERROR': 1,
'log_count/INFO': 10,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'spider_exceptions/IndexError': 1,
'start_time': datetime.datetime(2022, 3, 24, 8, 11, 0, 484616)}
2022-03-24 15:11:01 [scrapy.core.engine] INFO: Spider closed (finished)
Upvotes: 0
Views: 253
Reputation: 212
Are you trying to use pagination with that spider? If so, check the code snippet below. It is taken from the documentation here:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Tutorial spider from the Scrapy docs: fetch pages and save each
    response body to a local HTML file named after its page number."""

    name = "quotes"

    def start_requests(self):
        urls = [
            'https://quotes.toscrape.com/page/1/',
            'https://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # e.g. '.../page/1/' -> '1'
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.html'
        # Binary mode: write the raw response bytes unmodified.
        with open(filename, 'wb') as f:
            f.write(response.body)
        # Fixed: the filename placeholder was lost in the original paste.
        self.log(f'Saved file {filename}')
Upvotes: 0