Reputation: 5459
I'm using puppeteer-extra
and Node.js to iterate across multiple URLs.
I'm trying to intercept requests for certain resource types on each iteration, and I'm getting the following error.
PS C:\Users\someuser\Desktop\Project> node temp.js
-- running
C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\assert.js:26
throw new Error(message);
^
Error: Request is already handled!
at Object.exports.assert (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\assert.js:26:15)
at HTTPRequest.continue (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\HTTPRequest.js:217:21)
at PuppeteerBlocker.onRequest (C:\Users\someuser\node_modules\@cliqz\adblocker-puppeteer\dist\cjs\adblocker.js:225:33)
at BlockingContext.onRequest (C:\Users\someuser\node_modules\@cliqz\adblocker-puppeteer\dist\cjs\adblocker.js:64:47)
at C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:62
at Array.map (<anonymous>)
at Object.emit (C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:43)
at Page.emit (C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\EventEmitter.js:72:22)
at C:\Users\someuser\node_modules\puppeteer\lib\cjs\puppeteer\common\Page.js:143:100
at C:\Users\someuser\node_modules\puppeteer\lib\cjs\vendor\mitt\src\index.js:51:62
I'm having trouble understanding why the request would be already handled as the actual request page.goto
is done while in the for
loop. Would anyone have any hints?
Here is the full project
const puppeteer = require( 'puppeteer-extra' );
const StealthPlugin = require( 'puppeteer-extra-plugin-stealth' );
puppeteer.use( StealthPlugin() );
const AdblockerPlugin = require( 'puppeteer-extra-plugin-adblocker' );
puppeteer.use( AdblockerPlugin( { blockTrackers: true } ) );
/**
 * Launches a headless browser, visits shop pages 1 through 20 sequentially on
 * a single page, and logs each page title (or `false` when the title contains
 * "404"). Requests for images, stylesheets, fonts and scripts are aborted.
 *
 * Timers are printed for the whole process and for every iteration. The
 * browser is always closed, even when a navigation fails.
 */
puppeteer.launch( { headless: true } ).then( async ( browser ) => {
  // Resource types whose requests are aborted instead of loaded.
  const blockedResourceTypes = [ 'image', 'stylesheet', 'font', 'script' ];

  try {
    console.log( '--\xa0running' );
    console.time( '--\xa0process' );

    const page = await browser.newPage();
    await page.setRequestInterception( true );

    page.on( 'request', ( request ) => {
      // Another 'request' listener on this page (the adblocker plugin registers
      // one) may already have resolved this request; resolving it a second
      // time is what raises "Request is already handled!".
      if ( request.isInterceptResolutionHandled() ) {
        return;
      }

      // `includes` yields a real boolean. The previous bare `indexOf(...)`
      // check was falsy for index 0 ('image') and truthy for -1 (not listed),
      // so it let images through and aborted everything that was not listed.
      if ( blockedResourceTypes.includes( request.resourceType() ) ) {
        request.abort();
      } else {
        request.continue();
      }
    } );

    for ( let i = 1; i <= 20; i++ ) {
      const timerLabel = `--\xa0iteration\xa0${ i }`;
      console.time( timerLabel ); // ... timer start
      await page.goto( `https://www.someurl.it/shop/s%2D${ i }`, { waitUntil: 'load' } );
      const title = await page.title();
      console.log( title.includes( '404' ) ? false : title );
      console.timeEnd( timerLabel ); // ... timer end
    }
  } finally {
    // Release the browser process even if a navigation threw.
    await browser.close();
  }

  console.timeEnd( '--\xa0process' );
  console.log( '--\xa0ending' );
} ).catch( ( error ) => {
  // Report launch/navigation failures instead of leaving the promise unhandled.
  console.error( error );
  process.exitCode = 1;
} );
Upvotes: 6
Views: 11707
Reputation: 7
// Request interceptor: leaves requests that were already resolved untouched,
// aborts those whose resource type or URL matches the block lists, and lets
// everything else continue.
page.on("request", (request) => {
  const url = request.url();

  // Some other handler has already aborted/continued/responded: do nothing.
  if (request.isInterceptResolutionHandled()) {
    return;
  }

  const isBlocked =
    blockResourceType.includes(request.resourceType()) ||
    blockResourceName.some((name) => url.includes(name));

  if (!isBlocked) {
    request.continue();
    return;
  }

  request.abort();
});
You can try this to avoid this error message
Upvotes: 0
Reputation: 5459
Resource interception must be set up on each new page.
Here is the full list of resource types you can intercept: stylesheet
, image
, media
, font
, script
, texttrack
, xhr
, fetch
, eventsource
, websocket
, manifest
, other
.
Note:
Most of the time, intercepting ALL resources might negatively impact your scraper.
I would advise intercepting ONLY image
, media
and font
. (In some cases intercepting stylesheet
might impact puppeteer click action).
/**
* Puppeteer, Headless Chrome Node.js API
*
* @link https://github.com/puppeteer/puppeteer
*
* @package npm install puppeteer
*/
const puppeteer = require( 'puppeteer' );
/**
 * Enables request interception on a page and registers a handler that aborts
 * image, media and font requests while letting every other request continue.
 *
 * @param {object} page - Page-like object exposing `setRequestInterception` and `on`.
 * @returns {Promise<void>} Resolves once interception is enabled and the handler is registered.
 * @see https://stackoverflow.com/a/47166637/3645650
 */
const brewery = async ( page ) => {
  // 'stylesheet' is deliberately left out of the list.
  const abortedTypes = [ 'image', 'media', 'font' ];

  await page.setRequestInterception( true );

  page.on( 'request', ( request ) => {
    const type = request.resourceType();

    if ( abortedTypes.includes( type ) ) {
      request.abort();
      return;
    }

    request.continue();
  } );
};
/**
 * Entry point: launches a headless browser, applies the `brewery` request
 * interception to a new page, opens the GitHub login page, saves a screenshot
 * named after the current timestamp, and logs the total runtime in seconds.
 *
 * The browser is closed in a `finally` block so it does not leak when the
 * navigation or the screenshot fails, and any failure is reported instead of
 * being left as an unhandled rejection.
 */
( async () => {
  // ... start
  const start = new Date();
  console.log( '--\xa0process:\xa0start' );

  const browser = await puppeteer.launch( {
    headless: true
  } );

  try {
    const page = await browser.newPage();
    await brewery( page );
    await page.goto( 'https://github.com/login' );
    await page.screenshot( { path: `${ Date.now() }.png` } );
    console.log( '--\xa0process:\xa0screenshot' );
  } finally {
    // Always release the browser process, even when a step above threw.
    await browser.close();
  }

  // ... end
  const end = ( new Date() - start ) / 1000;
  console.log( `--\xa0process:\xa0end,\xa0runtime\xa0${ end }\xa0seconds` );
} )().catch( ( error ) => {
  console.error( error );
  process.exitCode = 1;
} );
Upvotes: 3
Reputation: 309
Adding a return statement solved the issue on my end.
// Request interceptor: aborts image, stylesheet, font and script requests and
// lets every other request continue.
page.on( 'request', ( request ) => {
  const abortedTypes = [ 'image', 'stylesheet', 'font', 'script' ];
  const shouldAbort = abortedTypes.includes( request.resourceType() );

  if ( !shouldAbort ) {
    request.continue();
    return;
  }

  // Hand back the abort() result, as the early `return` did.
  return request.abort();
} );
Upvotes: 6