Reputation: 31
I am trying to log into a webpage using scrapy-playwright, as I want the nice integration with scrapy. I can't log in using scrapy-playwright, as it redirects to a page that does not exist. I have also tried doing a post request instead of clicking, that doesn't work either.
However, if I try the same thing using only Playwright, it works perfectly... Is there a difference between websites opened with scrapy-playwright compared to only Playwright? And does anyone know how to solve this using scrapy-playwright?
scrapy-playwright code:
def start_requests(self):
    """Open the landing page through Playwright and hand the page to sign_in."""
    page_methods = [PageMethod('wait_for_selector', 'a[data-toggle=dropdown]')]
    request_meta = {
        'playwright': True,
        'playwright_include_page': True,   # sign_in needs the live page object
        'playwright_page_methods': page_methods,
    }
    yield scrapy.Request(url=self.url, meta=request_meta, callback=self.sign_in)
async def sign_in(self, response):
    """Dismiss blocking popups on the loaded page, then submit the login form."""
    pw_page = response.meta['playwright_page']
    news_popup = "button[class='close close-news']"
    # Close every news popup that appears before interacting with the form.
    while await pw_page.is_visible(news_popup):
        await pw_page.click(news_popup)
    await pw_page.click('button#declineAllConsentSummary')
    await pw_page.click('div.my-account-sub > a[data-toggle=dropdown]', timeout=10000)
    # Credentials are read from the environment; key names live on the spider.
    await pw_page.fill('input#j_username_header', os.getenv(self.usernameKey), timeout=10000)
    await pw_page.fill('input#j_password_header', os.getenv(self.passwordKey), timeout=10000)
    await pw_page.click('button#responsiveMyAccLoginGA')
Playwright code:
async def test_async_playwright(self):
    """Same login flow driven by plain Playwright (works, unlike scrapy-playwright)."""
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=False)
        context = await browser.new_context(base_url=self.url)
        page = await context.new_page()
        await page.goto(self.url, wait_until='commit')
        news_popup = "button[class='close close-news']"
        # Clear any stacked news popups before touching the form.
        while await page.is_visible(news_popup):
            await page.click(news_popup)
        await page.click('button#declineAllConsentSummary')
        await page.wait_for_selector('a[data-toggle=dropdown]')
        await page.click('div.my-account-sub > a[data-toggle=dropdown]', timeout=5000)
        await page.fill('input#j_username_header', os.getenv(self.usernameKey), timeout=5000)
        await page.fill('input#j_password_header', os.getenv(self.passwordKey), timeout=5000)
        await page.click('button#responsiveMyAccLoginGA')
Upvotes: 3
Views: 2816
Reputation: 86
I used to have the same problem as you. Here's how I solved it.
In more detail: imagine that I am trying to log into my favorite site, babypips. The code below logs me in and saves the browser storage state. Because my internet is a bit slow, I also add one extra navigation step before saving. This is login.py:
from playwright.sync_api import Playwright, sync_playwright
from configparser import ConfigParser
from pathlib import Path
current_folder = Path(__file__).parent.resolve()
def run(playwright: Playwright) -> None:
    """Log into babypips interactively and persist the browser storage state.

    Reads credentials for the prompted user from accounts.ini (next to this
    script), drives the sign-in form in a visible Chromium window, then saves
    the cookies/local storage to babystate.json for later reuse by
    scrapy-playwright.
    """
    # Loading user info
    config = ConfigParser()
    config.read(f"{current_folder}/accounts.ini")
    user = input("What is your name?: ")
    try:
        config_data = config[user]
    except KeyError:  # was a bare `except:`; only a missing section means an unknown user
        print("User not found!")
        exit(1)  # was exit(0): a failed lookup must not report success to the shell
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    # Open new page
    page = context.new_page()
    # Go to https://www.babypips.com/account/sign-in
    page.goto("https://www.babypips.com/account/sign-in")
    page.wait_for_timeout(500)
    # Click [placeholder="Email or Username"]
    page.click("[placeholder=\"Email or Username\"]")
    page.wait_for_timeout(500)
    # Fill [placeholder="Email or Username"]
    page.fill("[placeholder=\"Email or Username\"]", config_data['username'])
    page.wait_for_timeout(500)
    # Click [placeholder="Password"]
    page.click("[placeholder=\"Password\"]")
    page.wait_for_timeout(500)
    # Fill [placeholder="Password"]
    page.fill("[placeholder=\"Password\"]", config_data['password'])
    page.wait_for_timeout(500)
    # with page.expect_navigation(url="https://www.babypips.com/"):
    with page.expect_navigation():
        page.click("button:has-text(\"Sign In\")")
    page.wait_for_timeout(500)
    # Extra navigation step so the auth cookie is fully settled before saving
    # (the answer author added this to cope with a slow connection).
    page.get_by_role("navigation").get_by_role("link", name="MarketMilk™").click()
    page.wait_for_timeout(1000)
    # Save storage state into the file.
    context.storage_state(path="babystate.json")
    # ---------------------
    context.close()
    browser.close()
# Script entry point: start Playwright synchronously and run the login flow.
with sync_playwright() as playwright:
    run(playwright)
Define your state file in settings.py like below
# --- Scrapy settings.py additions ---
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
# scrapy-playwright requires the asyncio-based Twisted reactor.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
FEED_EXPORT_ENCODING = 'utf-8'
# Playwright storage-state file produced by login.py; imported by the spider.
COOKIES_FILE = 'babystate.json'
# Let 403 responses through to the spider instead of dropping them.
HTTPERROR_ALLOWED_CODES = [403]
# Route both schemes through the scrapy-playwright download handler.
DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
After that, use the state file when Scrapy calls start_requests in your crawler. Put the file somewhere Scrapy can find it:
from crawldata.settings import COOKIES_FILE
......
def start_requests(self):
    # (spider setup elided by the answer author)
    ....
    # Reuse the saved login state: 'playwright_context_kwargs' passes
    # storage_state to Browser.new_context, so the request arrives already
    # signed in with the cookies captured by login.py.
    yield Request('https://marketmilk.babypips.com', meta={
        'playwright': True,
        'playwright_include_page': True,
        'playwright_context': 'new',
        'playwright_context_kwargs': {
            'storage_state': COOKIES_FILE,
        },
        'playwright_page_methods': [
            *tz_methods,  # NOTE(review): tz_methods is defined elsewhere in the author's spider — not shown here
            PageMethod('keyboard.down', 'End'),
            PageMethod('wait_for_timeout', 500),
        ],
        'errback': self.errback,  # close the Playwright page on request failure
    },)
    ....
async def errback(self, failure):
    """Close the Playwright page attached to a failed request so it is not leaked."""
    await failure.request.meta["playwright_page"].close()
Upvotes: 3
Reputation: 113
As a possible workaround: if you are redirected (to the broken page) only after the token/cookie has been granted, you can simply navigate to a normal site URL afterwards, and you should find yourself logged in.
Upvotes: 0