Reputation: 65
Can anyone help me? I'm practicing and can't figure out what I did wrong with the pagination. Sometimes an error comes up, and when the spider does run, it only returns the first page:
"The source list for the Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net' It will be ignored", source: https://naturaldaterra.com.br/hortifruti.html?page=2"
import scrapy
from scrapy_selenium import SeleniumRequest


class ComputerdealsSpider(scrapy.Spider):
    name = 'produtos'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            wait_time=3,
            callback=self.parse
        )

    def parse(self, response):
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
        if next_page:
            absolute_url = f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}"
            yield SeleniumRequest(
                url=absolute_url,
                wait_time=3,
                callback=self.parse
            )
Upvotes: 0
Views: 465
Reputation: 2120
The problem is that your XPath selector returns None instead of the next page number, so the spider never advances past the first page. Consider changing it from
next_page = response.xpath("//button[@class='tile-root-1uO'][1]/text()").get()
to
next_page = response.xpath("//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
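For completeness, here is a minimal sketch of how the corrected selector slots into the pagination block at the end of your existing SeleniumRequest-based parse method (the class names in the XPath are taken from your post and assumed to still match the live page):

        # The button following the currently active page button holds the next page number.
        next_page = response.xpath(
            "//button[@class='tile-root_active-TUl tile-root-1uO']"
            "/following-sibling::button[1]/text()").get()
        if next_page:  # None on the last page, so pagination stops cleanly
            yield SeleniumRequest(
                url=f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}",
                wait_time=3,
                callback=self.parse,
            )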
For your future projects, consider using scrapy-playwright to scrape JS-rendered websites; it is fast and simple to use (install it with pip install scrapy-playwright, then run playwright install to fetch the browser binaries). Below is a sample implementation of your scraper using scrapy-playwright.
import scrapy
from scrapy.crawler import CrawlerProcess


class ComputerdealsSpider(scrapy.Spider):
    name = 'produtos'

    def start_requests(self):
        yield scrapy.Request(
            url='https://naturaldaterra.com.br/hortifruti.html?page=1',
            meta={"playwright": True}
        )

    def parse(self, response):
        for produto in response.xpath("//div[@class='gallery-items-1IC']/div"):
            yield {
                'nome_produto': produto.xpath(".//div[@class='item-nameContainer-1kz']/span/text()").get(),
                'valor_produto': produto.xpath(".//span[@class='itemPrice-price-1R-']/text()").getall(),
            }

        # scrape next page: the button after the active page button holds the next page number
        next_page = response.xpath(
            "//button[@class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
        if next_page:  # None on the last page, so stop instead of failing on the URL concatenation
            yield scrapy.Request(
                url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
                meta={"playwright": True}
            )


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    })
    process.crawl(ComputerdealsSpider)
    process.start()
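To try it, save the spider to a file and run it directly with Python (e.g. python produtos_spider.py, a hypothetical filename); the CrawlerProcess block at the bottom starts the crawl. Note that only the https download handler is overridden here, which is enough for this site; if you also need Playwright for plain-HTTP pages, register the same handler under an "http" key as well.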
Upvotes: 2