Reputation: 629
I am trying to disable javascript so that websites know that the javascript is disabled on puppeteer (ie: <noscript>
tags) in a base class made to crawl websites however my script fail to so as it's not disabling javascript when I go to any websites.
Here is my code:
// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"
/**
* Base class for all crawler
*/
abstract class BaseCrawler {
public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
public static readonly TORRC_PATH = process.env.TORRC_PATH;
public static headless = false
public readonly browser: Promise<puppeteer.Browser>;
private readonly jsEnabled: boolean;
/**
* get the active page
* @returns null if it couldn't get the active
*/
public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
const browser = await this.browser;
var start = new Date().getTime();
while (new Date().getTime() - start < timeout) {
var pages = await browser.pages();
var arr = [];
for (const p of pages) {
if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
arr.push(p);
}
}
if (arr.length == 1)
return arr[0];
}
return null;
}
constructor(jsEnabled = false, website = "https://google.com") {
console.log(Browser)
this.browser = puppeteer.launch({
headless: BaseCrawler.headless,
//args: ["--proxy-server=socks5://127.0.0.1:9050"],
userDataDir: "./.headless-data"
});
this.jsEnabled = jsEnabled;
this.browser.then(async (b) => {
b.on("targetcreated", async (e: Target) => {
const page = await e.page();
// set a tor useragent
page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);
// disable script if it's aksed
if (page?.url()) {
// console.log(page.url().search("chrome://"))
if (page.url().search("chrome://") < 0)
page?.setJavaScriptEnabled(this.jsEnabled)
}
page?.on('request', request => {
if (request.resourceType() === 'script')
request.abort();
else
request.continue();
})
})
})
this.browser.then(async (b) => {
b.newPage()
const page = await b.newPage();
page.goto(website);
})
}
}
/** Bland tor window just made to browse tor */
export class TorWindow extends BaseCrawler {
};
I tried to hook the newPage
function however it doesn't work as it's giving me the following error:
PS C:\Users\vince\project\js\crawler-project> yarn run browser
yarn run v1.22.10
$ node . --tor-window
undefined
C:\Users\vince\project\js\crawler-project\\dist\Crawler.js:35
const old_newpage = puppeteer_1.default.Browser.prototype.newPage;
^
TypeError: Cannot read property 'prototype' of undefined
at new BaseCrawler (C:\Users\vince\project\js\crawler-project\dist\Crawler.js:35:57)
at new TorWindow (C:\Users\vince\project\js\crawler-project\dist\Crawler.js:98:1)
at Object.<anonymous> (C:\Users\vince\project\js\crawler-project\dist\index.js:27:5)
at Module._compile (node:internal/modules/cjs/loader:1092:14)
at Object.Module._extensions..js (node:internal/modules/cjs/loader:1121:10)
at Module.load (node:internal/modules/cjs/loader:972:32)
at Function.Module._load (node:internal/modules/cjs/loader:813:14)
at Function.executeUserEntryPoint [as runMain] (node:internal/modules/run_main:76:12)
at node:internal/main/run_main_module:17:47
error Command failed with exit code 1.
info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.
// https://stackoverflow.com/questions/39134419/run-tor-browser-with-selenium-webdriver
import puppeteer, { puppeteerErrors, Target, Browser } from "puppeteer";
import { readFileSync } from "fs"
import { helpers } from "./helpers";
import _ from "lodash"
/**
* Base class for all crawler
*/
abstract class BaseCrawler {
public static readonly TOR_PATH = process.env.TOR_PATH ?? "";
public static readonly TOR_PROFILE_PATH = process.env.TOR_PROFILE_PATH ?? "";
public static readonly TORRC_PATH = process.env.TORRC_PATH;
public static headless = false
public readonly browser: Promise<puppeteer.Browser>;
private readonly jsEnabled: boolean;
/**
* get the active page
* @returns null if it couldn't get the active
*/
public async activePage(timeout = 30_000): Promise<puppeteer.Page | null> {
const browser = await this.browser;
var start = new Date().getTime();
while (new Date().getTime() - start < timeout) {
var pages = await browser.pages();
var arr = [];
for (const p of pages) {
if (await p.evaluate(() => { return document.visibilityState == 'visible' })) {
arr.push(p);
}
}
if (arr.length == 1)
return arr[0];
}
return null;
}
constructor(jsEnabled = false, website = "https://check.torproject.org") {
console.log(Browser)
const old_newpage = puppeteer.Browser.prototype.newPage
puppeteer.Browser.prototype.newPage = async () => {
const page = await old_newpage()
page.setJavaScriptEnabled(this.jsEnabled)
return page;
}
this.browser = puppeteer.launch({
headless: BaseCrawler.headless,
args: ["--proxy-server=socks5://127.0.0.1:9050"],
userDataDir: "./.headless-data"
});
this.jsEnabled = jsEnabled;
this.browser.then(async (b) => {
b.on("targetcreated", async (e: Target) => {
const page = await e.page();
// set a tor useragent
page?.setUserAgent(`Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/${_.random(60, 100)}.0`);
// disable script if it's aksed
if (page?.url()) {
// console.log(page.url().search("chrome://"))
if (page.url().search("chrome://") < 0)
page?.setJavaScriptEnabled(this.jsEnabled)
}
page?.on('request', request => {
if (request.resourceType() === 'script')
request.abort();
else
request.continue();
})
})
})
this.browser.then(async (b) => {
b.newPage()
const page = await b.newPage();
page.goto(website);
})
}
}
/** Bland tor window just made to browse tor */
export class TorWindow extends BaseCrawler {
};
Upvotes: 5
Views: 5159
Reputation: 459
Try the following:
page.setJavaScriptEnabled(false)
Set this before navigating to the website.
More information to read on this you can find at: page.setJavaScriptEnabled(enabled)
Upvotes: 6
Reputation: 81653
Page.setJavaScriptEnabled(value)
returns a Promise
so you need to await
it. Also according to the docs:
NOTE changing this value won't affect scripts that have already been run. It will take full effect on the next navigation.
await page.setJavaScriptEnabled(false);
Upvotes: 2
Reputation: 825
To disable javascript, we need to monitor all the requests/responses flowing. Then based on the type, we can decide to terminate the request/response.
In the below example, we will load flipkart.com without using the javascript files.
const puppeteer = require('puppeteer');
(async() => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on('request', request => {
if (request.resourceType() === 'script')
request.abort();
else
request.continue();
});
await page.goto('https://www.flipkart.com');
await browser.close();
})();
source: https://chercher.tech/puppeteer/disable-javascript-puppeteer
If you know that website will serve page without js for certain crawlers, you can try change headers.
Upvotes: 1