user3395284

Reputation: 51

Scrapy Playwright: click and loop through a virtual JavaScript page

I'm working on a Scrapy bot that collects specific details for optics. I need to click a JavaScript button to open a virtual page so that the bot can scrape the optic details.

This is what I need Playwright to click on, shown in a red rectangle: [Image: Details tab highlighted in red]

On certain pages, the first item's details page is already showing. Example:

[Image: Virtual page with the Details tab already open]

I probably need to create some sort of if/else statement for this? I would like to work on it, but I've been stuck on the prior issue.
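Something like this Playwright helper is roughly what I have in mind: only click the Details tab when it isn't already open. It's untested, and the "selected" class used to detect the open state is just a guess about the site's markup:

from playwright.async_api import Page


# Untested sketch: click the Details tab only if it isn't already open.
async def open_details_tab(page: Page) -> None:
    details = await page.query_selector('#detailTab')
    if details is None:
        return
    # "selected" as the open-state class is a guess about the markup.
    css_classes = (await details.get_attribute('class')) or ''
    if 'selected' not in css_classes:
        await details.click()

Here's my spider so far: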

import scrapy


class UpperSpider(scrapy.Spider):
    name = 'Optic'
    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url,
                                    meta={'playwright': True})

    # The issue is here: I'm not sure if this is working. I want to click on the Details tab.
    # (As written this can't work: Scrapy never calls this method, response.css() returns
    # Selector objects, which have no click(), and no Playwright page is actually passed in.)
    def virtualpage(self, response, page):
        # Virtual page button
        vpButton = response.css('div[id="wrap"]')
        for page in vpButton:
            page.click('#detailTab')

    # Also, some pages, for instance
    # https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/carbine-optic-aco--prod73112.aspx,
    # already have their virtual pages showing. I think I would need an if statement
    # to make sure it doesn't close the page.

    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }

Okay, so I tried to make the Scrapy crawler work. I'm pretty sure the problem is in my start_requests: the for url in self.start_urls loop only attaches the Playwright meta to the start URLs, so the pages the crawler follows never go through Playwright. How do I tell Playwright to also run on each crawled page, so that "clickallbtns" can run? (My untested idea is sketched after the code below.)

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine


class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']



    le_item_details = LinkExtractor(restrict_css='.listing')

    rule_product_detail = Rule(le_item_details,
                                callback='parse_item',
                                follow=True,
                                )
    rules = (
        rule_product_detail,
    )


    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                meta={
                    'playwright': True,
                    'playwright_page_coroutines': {
                        # "waitforload": PageCoroutine("waitforNavagation", 'url'),
                        "clickallbtns": PageCoroutine(
                            "evaluate",
                            'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                    },
                },
            )

    def parse_item(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
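One idea I've sketched (untested) is to attach the Playwright meta in the Rule's process_request hook, which Scrapy runs for every request the rule extracts; since Scrapy 2.0 the hook receives both the request and the response:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_playwright.page import PageCoroutine


class UpperSpider(CrawlSpider):
    name = 'Upper'
    allowed_domains = ['brownells.com']
    start_urls = ['https://www.brownells.com/optics-mounting/electronic-sights/red-dot-sights/index.htm']

    rules = (
        Rule(LinkExtractor(restrict_css='.listing'),
             callback='parse_item',
             follow=True,
             # Runs for every request this rule extracts, so followed
             # product pages go through Playwright, not just the start URLs.
             process_request='use_playwright'),
    )

    def use_playwright(self, request, response):
        request.meta['playwright'] = True
        request.meta['playwright_page_coroutines'] = {
            'clickallbtns': PageCoroutine(
                'evaluate',
                'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
        }
        return request

    # parse_item stays exactly as defined above.

I haven't verified this against the site, but process_request is the documented hook for modifying the requests a Rule generates, so it seemed like the right place to add the meta.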

Upvotes: 5

Views: 4924

Answers (1)

msenior_

Reputation: 2120

You need to include the clicking logic in the playwright_page_coroutines dictionary so that the buttons are clicked before the response is returned to your callback.

See the sample code below. If you are defining the scrapy-playwright values in settings.py, you can remove the custom_settings variable (the settings.py equivalent is shown after this paragraph). Otherwise, if you are running the spider from a script, the code below is sufficient (tested with Scrapy 2.6.1).
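For reference, the settings.py equivalent of those custom_settings would be:

# settings.py: equivalent of the custom_settings dict in the spider below
DOWNLOAD_HANDLERS = {
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"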

import scrapy
from scrapy_playwright.page import PageCoroutine


class UpperSpider(scrapy.Spider):
    name = 'Optic'
    custom_settings = dict(
        DOWNLOAD_HANDLERS={
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        TWISTED_REACTOR="twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    )

    start_urls = [
        'https://www.brownells.com/optics-mounting/scopes/rifle-scopes/strike-eagle-1-6x24mm-rifle-scope-prod135114.aspx']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                meta={
                    'playwright': True,
                    'playwright_page_coroutines': {
                        # Click every Details tab before the response is returned.
                        "clickallbtns": PageCoroutine(
                            "evaluate",
                            'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                    },
                },
            )

    def parse(self, response):
        container = response.css('div[id="wrap"]')
        for products in container:
            yield {
                'ProductName': products.css('span[itemprop="name"]::text').get(),
                'Stock': products.css('span[itemprop="availability"]::text').get(),
                'Brand': response.css('#listMain .wrap .mbm a::text').get(),
                'Name': response.css('#listMain span+ span::text').get(),
                'Price': products.css('#priceContainer > span > p > span::text').get(),
                'Image': products.css('#lnkImgSku img::attr(src)').get(),
                'Battery': products.css('section:nth-child(1) p:contains("Battery")::text').get(),
                'Length': products.css('section:nth-child(1) p:contains("Length")::text').get(),
                'Weight': products.css('section:nth-child(1) p:contains("Weight")::text').get(),
                'URL': response.url,
                'Reticle': products.css('#detailWrap p:contains("Reticle")::text').get()
            }
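Note that more recent scrapy-playwright releases renamed PageCoroutine to PageMethod and the meta key playwright_page_coroutines to playwright_page_methods. If you are on a newer version, a drop-in replacement for the start_requests above would look roughly like this (same clicking logic, untested):

from scrapy_playwright.page import PageMethod  # newer name for PageCoroutine

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                meta={
                    'playwright': True,
                    # Newer meta key; replaces 'playwright_page_coroutines'.
                    'playwright_page_methods': {
                        "clickallbtns": PageMethod(
                            "evaluate",
                            'document.querySelectorAll("#detailTab").forEach(x=>x.click())'),
                    },
                },
            )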

Upvotes: 3
