sigma5563

Reputation: 65

How to open new links in the same context and tab in Scrapy-Playwright

I'm facing an issue that I haven't been able to fix. I'm using Scrapy-Playwright to scrape some sites. With Chromium, the script opens new tabs in the same BrowserContext to access the list of pages I extracted earlier, but these sites don't work properly in Chromium due to JS limitations, so I switched to Firefox, where they work fine. The problem is that instead of opening a new tab, the script now launches a whole new browser for each page, increasing resource consumption. I couldn't find any information about how to avoid this behavior. I'd also like to keep just two tabs, using the second one to open all the links I need to load (or, if possible, reuse the first tab instead). Could you help me? This is my code:

spider.py

import scrapy
from lxml import html
from scrapy_playwright.page import PageMethod
from MyScrapy.items import MyScrapyItem



class MySpider(scrapy.Spider):
    name = 'MySpider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cookie = {}  # PUT YOUR COOKIES HERE AS A DICT, e.g. {"name": "value"}

    def start_requests(self):
        query = "News"
        kiwifarms_url = f"https://kiwifarms.st/search/24953346/?q={query}&o=date"
        yield scrapy.Request(kiwifarms_url, callback=self.extract_urls, errback=self.error_request,
                             cookies=self.cookie, meta={'playwright': True, 'playwright_include_page': True,
                                                        "playwright_page_methods": [
                                                            PageMethod("wait_for_load_state", "networkidle"),
                                                            PageMethod("wait_for_timeout", 60000)]
                                                        })

    async def extract_urls(self, response):
        # With playwright_include_page=True the page is not closed automatically,
        # so close it here to avoid leaking pages (and browser resources).
        page = response.meta["playwright_page"]
        await page.close()
        list_of_urls_to_visit = response.xpath('.//h3[@class="contentRow-title"]/a//@href').getall()
        if not list_of_urls_to_visit:
            self.logger.warning("No URLs found")
        for url in list_of_urls_to_visit:
            complete_url = response.urljoin(url)
            yield scrapy.Request(complete_url, callback=self.parse_posts, errback=self.error_request,
                                 cookies=self.cookie, meta={'playwright': True, 'playwright_include_page': True,
                                                            "playwright_page_methods": [
                                                                PageMethod("wait_for_load_state", "networkidle"),
                                                                PageMethod("wait_for_timeout", 60000)]})

    async def parse_posts(self, response):
        # Close the Playwright page before parsing; the HTML is already in response.text.
        page = response.meta["playwright_page"]
        await page.close()
        tree = html.fromstring(response.text)
        individual_posts = tree.xpath('//div[@class="block-container lbContainer"]/div/article')
        for post in individual_posts:
            item = MyScrapyItem()
            id_post = post.xpath('...')
            # Parsing HTML
        next_page_href = tree.xpath(
            './/div[@class="pageNav pageNav--skipStart "]/a[@class="pageNav-jump pageNav-jump--next"]//@href')
        if next_page_href:  # on the last page there is no "next" link
            next_page = "https://kiwifarms.st" + str(next_page_href[0])
            yield scrapy.Request(next_page, callback=self.parse_posts, errback=self.error_request,
                                 cookies=self.cookie, meta={'playwright': True, 'playwright_include_page': True,
                                                            "playwright_page_methods": [
                                                                PageMethod("wait_for_load_state", "networkidle"),
                                                                PageMethod("wait_for_timeout", 60000)]})

    async def error_request(self, failure):
        # failure.value is an exception, not a response, so it has no reliable
        # .status attribute; just log the failure and clean up the page.
        self.logger.error(f"Request failed: {failure!r}")
        self.logger.info("Closing playwright page and context.")
        page = failure.request.meta["playwright_page"]
        await page.close()
        await page.context.close()
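
For reference, scrapy-playwright exposes a playwright_context meta key: requests that share the same context name are served from the same BrowserContext instead of each getting a fresh one. A minimal, untested sketch of pinning every request to a single named context (the name "main" is an arbitrary choice, not something from the code above):

# Sketch: pin all requests to one shared, named BrowserContext.
yield scrapy.Request(
    complete_url,
    callback=self.parse_posts,
    errback=self.error_request,
    cookies=self.cookie,
    meta={
        "playwright": True,
        "playwright_include_page": True,
        # Requests carrying the same context name reuse one BrowserContext.
        "playwright_context": "main",
    },
)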

My settings.py:

BOT_NAME = "MySpider"

SPIDER_MODULES = ["MySpider.spiders"]
NEWSPIDER_MODULE = "MySpider.spiders"

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
CONCURRENT_REQUESTS = 2
PLAYWRIGHT_BROWSER_TYPE = "firefox"
PLAYWRIGHT_LAUNCH_OPTIONS = {
    "headless": False,
    "timeout": 40 * 1000,  # 40 seconds
}
USER_AGENT = None
ROBOTSTXT_OBEY = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
FEED_EXPORT_ENCODING = "utf-8"
DOWNLOAD_DELAY = 3
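
For the "just two tabs" part, scrapy-playwright also ships settings that cap how many contexts and pages may be open concurrently. A hedged sketch of what could be added to settings.py (the exact values are an assumption based on the goal described above):

# Sketch: limit the crawl to one BrowserContext with at most two open pages.
PLAYWRIGHT_MAX_CONTEXTS = 1            # never create more than one context
PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 2   # at most two concurrent pages (tabs)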

Upvotes: 0

Views: 216

Answers (0)
