Reputation: 65
I'm facing an issue I haven't been able to fix. I'm using Scrapy-Playwright to scrape some sites. With Chromium, it opens new tabs in the same BrowserContext to visit a list of pages I loaded previously. But due to JS limitations in Chromium, these sites don't work properly, so I switched to Firefox, where they work fine, except that instead of opening a new tab, the script launches a whole new browser, increasing resource consumption. I couldn't find any related info about how to avoid this behavior. I'd also like to keep just two tabs open, using the second one to open all the links I need to load, or, if possible, reusing the first tab. Could you help me?
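For reference, this is how I would expect the tab and context limits to be configured, going by the settings documented in the scrapy-playwright README (I'm not sure whether they behave the same under Firefox):

PLAYWRIGHT_MAX_CONTEXTS = 1            # sketch: never create more than one BrowserContext
PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 2   # sketch: at most two open tabs in that context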
This is my code:
spider.py
import scrapy
from lxml import html
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy_playwright.page import PageMethod
from MyScrapy.items import MyScrapyItem


class MySpider(scrapy.Spider):
    name = 'MySpider'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cookie = ""  # PUT YOUR COOKIE IN JSON FORMAT HERE
    def start_requests(self):
        query = "News"
        kiwifarms_url = f"https://kiwifarms.st/search/24953346/?q={query}&o=date"
        yield scrapy.Request(kiwifarms_url, callback=self.extract_urls, errback=self.error_request,
                             cookies=self.cookie,
                             meta={'playwright': True, 'playwright_include_page': True,
                                   'playwright_page_methods': [
                                       PageMethod('wait_for_load_state', 'networkidle'),
                                       PageMethod('wait_for_timeout', 60000)]})
    async def extract_urls(self, response):
        try:
            list_of_urls_to_visit = response.xpath('.//h3[@class="contentRow-title"]/a//@href').extract()
        except AttributeError:
            self.logger.warning("No URLs found")
            list_of_urls_to_visit = []
        start_url = "https://kiwifarms.st"
        for url in list_of_urls_to_visit:
            complete_url = start_url + url
            yield scrapy.Request(complete_url, callback=self.parse_posts, errback=self.error_request,
                                 cookies=self.cookie,
                                 meta={'playwright': True, 'playwright_include_page': True,
                                       'playwright_page_methods': [
                                           PageMethod('wait_for_load_state', 'networkidle'),
                                           PageMethod('wait_for_timeout', 60000)]})
    async def parse_posts(self, response):
        tree = html.fromstring(response.text)
        individual_posts = tree.xpath('//div[@class="block-container lbContainer"]/div/article')
        for post in individual_posts:
            item = MyScrapyItem()
            id_post = post.xpath('...')
            # Parsing HTML
        # There is no "next" link on the last page, so check before indexing
        next_links = tree.xpath(
            './/div[@class="pageNav pageNav--skipStart "]/a[@class="pageNav-jump pageNav-jump--next"]//@href')
        if next_links:
            next_page = "https://kiwifarms.st" + str(next_links[0])
            yield scrapy.Request(next_page, callback=self.parse_posts, errback=self.error_request,
                                 cookies=self.cookie,
                                 meta={'playwright': True, 'playwright_include_page': True,
                                       'playwright_page_methods': [
                                           PageMethod('wait_for_load_state', 'networkidle'),
                                           PageMethod('wait_for_timeout', 60000)]})
    async def error_request(self, failure):
        # failure.value is the raised exception; only HttpError carries a response
        if failure.check(HttpError) and failure.value.response.status != 200:
            self.logger.error("Error: non-200 status code")
        self.logger.info("Closing spider and playwright context.")
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
            await page.context.close()
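For what it's worth, the scrapy-playwright docs say that pages received via playwright_include_page have to be closed manually, otherwise they stay open. A minimal sketch of how I understand that would look in a callback (parse_with_cleanup is a made-up name, and I haven't verified it changes the Firefox behavior):

    async def parse_with_cleanup(self, response):
        # the page object handed over because playwright_include_page was True
        page = response.meta["playwright_page"]
        urls = response.xpath('.//h3[@class="contentRow-title"]/a//@href').getall()
        await page.close()  # release the tab so it does not accumulate
        for url in urls:
            yield scrapy.Request("https://kiwifarms.st" + url, meta={"playwright": True})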
My settings.py:
BOT_NAME = "MySpider"
SPIDER_MODULES = ["MySpider.spiders"]
NEWSPIDER_MODULE = "MySpider.spiders"
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
CONCURRENT_REQUESTS = 2
PLAYWRIGHT_BROWSER_TYPE = "firefox"
PLAYWRIGHT_LAUNCH_OPTIONS = {
    "headless": False,
    "timeout": 40 * 1000,  # 40 seconds
}
USER_AGENT = None
ROBOTSTXT_OBEY = False
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
FEED_EXPORT_ENCODING = "utf-8"
DOWNLOAD_DELAY = 3
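I also wondered whether pinning every request to a single named context would help keep everything in one browser. A hypothetical fragment using the playwright_context meta key and the PLAYWRIGHT_CONTEXTS setting (the name "main" is my own):

# settings.py (sketch): define a single named context for the whole crawl
PLAYWRIGHT_CONTEXTS = {"main": {}}

# in the spider (sketch): route each request through that context
yield scrapy.Request(complete_url, callback=self.parse_posts,
                     meta={'playwright': True, 'playwright_context': 'main'})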
Upvotes: 0
Views: 216