Scrapy Playwright Click and loop through Virtual Javascript page

Question

I'm working on a scrapy bot that can get specific details for optics. I need to click on a javascript button to show a virtual page, so that my scrapy bot can scrape the Optic details.

This is what I need the playwright to click on show in a red rectangle. Details tab highlighted in red

On certain pages, the first item details page is already showing. Example:

Virtual page details tab open

I probably need to create some sort of If else statement for this? I would to work on it but I've been stuck on the prior issue.

import scrapy


class UpperSpider(scrapy.Spider):
    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url,
                                    meta={'playwright': True})

    # Issue Here is I'm not sure if this is working I want to click on the Details Tab
    def virtualpage(self, response, page):
        # Virtual Page button
        vpButton = response.css('div[id="wrap"]')
        for page in vpButton:
            page.click('#detailTab')

    # Also some pages for instance https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/carbine-optic-aco--prod73112.aspx
    # Already have there virtual pages showing. I think I would need a if .. statement to make sure it didn't close the page.

    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }

Okay, So I tried to make scrapy crawler work. I'm pretty sure I know the problem is at my start_request for URL in self.start_urls: I believe it's telling the playwright to start at Start_URLS. How do I tell the playwright to also start on each crawled page? So that "Clickallbtns" can run?

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine


class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']



    le_item_details = LinkExtractor(restrict_css='.listing')

    rule_product_detail = Rule(le_item_details,
                                callback='parse_item',
                                follow=True,
                                )
    rules = (
        rule_product_detail,
    )


    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                    meta={'playwright': True,
                                        'playwright_page_coroutines': {
                                            #"waitforload": PageCoroutine("waitforNavagation", 'url'),
                                            "clickallbtns": PageCoroutine("evaluate", 'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                                        }
                                        }
                                    )

    def parse_item(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
                }

Scrapy Playwright Click and loop through Virtual Javascript page

Answers (1)

Related Questions