Reputation: 161
I'm using puppeteer-cluster to crawl web pages.
If I open many pages at a time for a single website (8-10 pages), the connection slows down and many timeout errors come up, like this:
TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
I need to access only the HTML code of each page. I don't need to wait for domcontentloaded and so on.
Is there a way to tell page.goto() to wait only for the first response from the web server? Or do I need to use another technology instead of puppeteer?
Upvotes: 3
Views: 5100
Reputation: 907
I can see two other ways to achieve what you want: using page.waitForResponse
and page.waitForFunction
. Let's see both.
With page.waitForResponse you can do something as simple as this:
// Start the navigation without awaiting it; the empty catch discards any rejection from goto().
page.goto('https://www.google.com/').catch(() => {});
// Continue as soon as a response for this URL is received.
await page.waitForResponse('https://www.google.com/'); // don't forget to put the final slash
Pretty simple, huh? If you don't like it, try page.waitForFunction
and wait until the document
is created:
// Start the navigation without awaiting it; the empty catch discards any rejection from goto().
page.goto('https://www.google.com/').catch(() => {});
// Continue once the in-page function returns a truthy value, i.e. once `document` exists.
await page.waitForFunction(() => document); // you can use `window` too. It is almost the same
This code will wait until the document
exists. This happens when the first bit of HTML arrives and the browser starts to create the DOM tree representation of the document.
But be aware that although these two solutions are simple, neither of them waits until the entire HTML page/document is downloaded. If you want that, you should modify the waitForEvent
function from my other answer to accept the specific URL that you want to be completely downloaded. Example:
/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified `requestUrl` resource has finished downloading, or `timeout` elapses.
 *
 * URLs are compared after WHATWG normalization, so `https://example.com` and
 * `https://example.com/` are treated as the same resource. A string that cannot be
 * parsed as a URL is compared verbatim.
 *
 * @param {puppeteer.Page} page
 * @param {string} requestUrl url of the resource you want to wait for.
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified, waits forever.
 * @returns {Promise<void>} resolves (never rejects) when the request finishes or the timeout elapses.
 */
function waitForRequestToFinish(page, requestUrl, timeout) {
  // Normalize so that a missing trailing slash on a bare origin does not prevent a match.
  const normalize = (url) => {
    try {
      return new URL(url).href;
    } catch (err) {
      return url;
    }
  };
  const wantedUrl = normalize(requestUrl);

  return new Promise((resolve) => {
    let timeoutId;

    const done = () => {
      // `off` exists on both Node's EventEmitter and Puppeteer's own emitter.
      page.off('requestfinished', onRequestFinished);
      clearTimeout(timeoutId);
      resolve();
    };

    const onRequestFinished = (req) => {
      if (normalize(req.url()) === wantedUrl) done();
    };

    page.on('requestfinished', onRequestFinished);
    if (typeof timeout === 'number' && timeout >= 0) {
      timeoutId = setTimeout(done, timeout);
    }
  });
}
How to use it:
// Start the navigation without awaiting it; the empty catch discards any rejection from goto().
page.goto('https://www.amazon.com/').catch(() => {});
// Wait until the main document has finished downloading, or give up after 3000 ms.
await waitForRequestToFinish(page, 'https://www.amazon.com/', 3000);
Complete example showing neat console.logs:
const puppeteer = require('puppeteer');
/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified `requestUrl` resource has finished downloading, or `timeout` elapses.
 *
 * URLs are compared after WHATWG normalization, so `https://example.com` and
 * `https://example.com/` are treated as the same resource. A string that cannot be
 * parsed as a URL is compared verbatim.
 *
 * @param {puppeteer.Page} page
 * @param {string} requestUrl url of the resource you want to wait for.
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified, waits forever.
 * @returns {Promise<void>} resolves (never rejects) when the request finishes or the timeout elapses.
 */
function waitForRequestToFinish(page, requestUrl, timeout) {
  // Normalize so that a missing trailing slash on a bare origin does not prevent a match.
  const normalize = (url) => {
    try {
      return new URL(url).href;
    } catch (err) {
      return url;
    }
  };
  const wantedUrl = normalize(requestUrl);

  return new Promise((resolve) => {
    let timeoutId;

    const done = () => {
      // `off` exists on both Node's EventEmitter and Puppeteer's own emitter.
      page.off('requestfinished', onRequestFinished);
      clearTimeout(timeoutId);
      resolve();
    };

    const onRequestFinished = (req) => {
      if (normalize(req.url()) === wantedUrl) done();
    };

    page.on('requestfinished', onRequestFinished);
    if (typeof timeout === 'number' && timeout >= 0) {
      timeoutId = setTimeout(done, timeout);
    }
  });
}
// Demo: log every network event through a raw CDP session, then stop as soon as
// the main document has finished downloading (or after 3 seconds).
(async () => {
  // requestId -> url, so later CDP events (which only carry the id) can print the url.
  const netMap = new Map();
  const browser = await puppeteer.launch();

  // try/finally guarantees the browser process is closed even if a step below throws.
  try {
    const page = await browser.newPage();
    const cdp = await page.target().createCDPSession();
    await cdp.send('Network.enable');
    await cdp.send('Page.enable');

    const t0 = Date.now();

    cdp.on('Network.requestWillBeSent', ({ requestId, request: { url: requestUrl } }) => {
      netMap.set(requestId, requestUrl);
      console.log(`> ${Date.now() - t0}ms\t requestWillBeSent:\t${requestUrl}`);
    });
    cdp.on('Network.responseReceived', ({ requestId }) => console.log(`< ${Date.now() - t0}ms\t responseReceived:\t${netMap.get(requestId)}`));
    cdp.on('Network.dataReceived', ({ requestId, dataLength }) => console.log(`< ${Date.now() - t0}ms\t dataReceived:\t\t${netMap.get(requestId)} ${dataLength} bytes`));
    cdp.on('Network.loadingFinished', ({ requestId }) => console.log(`. ${Date.now() - t0}ms\t loadingFinished:\t${netMap.get(requestId)}`));
    cdp.on('Network.loadingFailed', ({ requestId }) => console.log(`E ${Date.now() - t0}ms\t loadingFailed:\t${netMap.get(requestId)}`));

    // The magic happens here
    // goto() is deliberately not awaited; its rejection is discarded because the page
    // is closed below before the navigation itself completes.
    page.goto('https://www.amazon.com').catch(() => { });
    await waitForRequestToFinish(page, 'https://www.amazon.com/', 3000);

    console.log(`\nThe page was released after ${Date.now() - t0}ms\n`);
    await page.close();
  } finally {
    await browser.close();
  }
})();
/* OUTPUT EXAMPLE
[... lots of logs removed ...]
> 574ms requestWillBeSent: https://images-na.ssl-images-amazon.com/images/I/71vvXGmdKWL._AC_SY200_.jpg
< 574ms dataReceived: https://www.amazon.com/ 65536 bytes
< 624ms responseReceived: https://images-na.ssl-images-amazon.com/images/G/01/AmazonExports/Fuji/2019/February/Dashboard/computer120x._CB468850970_SY85_.jpg
> 628ms requestWillBeSent: https://images-na.ssl-images-amazon.com/images/I/81Hhc9zh37L._AC_SY200_.jpg
> 629ms requestWillBeSent: https://images-na.ssl-images-amazon.com/images/G/01/personalization/ybh/loading-4x-gray._CB317976265_.gif
< 631ms dataReceived: https://www.amazon.com/ 58150 bytes
. 631ms loadingFinished: https://www.amazon.com/
*/
This code shows lots of requests and responses, but it stops as soon as "https://www.amazon.com/" has been completely downloaded.
Upvotes: 1
Reputation: 907
@user3817605, I have the perfect code for you. :)
/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified page `event` happens, or `timeout` elapses.
 *
 * @param {puppeteer.Page} page
 * @param {string} event Can be any event accepted by the method `page.on()`. E.g.: "requestfinished" or "framenavigated".
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified, waits forever.
 * @returns {Promise<void>} resolves (never rejects) when the event fires or the timeout elapses.
 */
function waitForEvent(page, event, timeout) {
  return new Promise((resolve) => {
    let timeoutId;

    const done = () => {
      // Always detach: if the timeout wins, the listener would otherwise stay
      // registered on the page until the event eventually fires (if ever).
      page.off(event, done);
      clearTimeout(timeoutId);
      resolve();
    };

    // Registered with `on` (not `once`) so that `off` receives the exact function
    // that was attached, regardless of how the emitter wraps one-shot listeners.
    page.on(event, done);
    if (typeof timeout === 'number' && timeout >= 0) {
      timeoutId = setTimeout(done, timeout);
    }
  });
}
You asked for a function that waits only for the first response, so you can use this function like this:
page.goto(<URL>); // append .catch(() => {}) if you may close the page early, to avoid errors being thrown to the console
await waitForEvent(page, 'response'); // after this line the first response has already been received
This is exactly what you've asked for. But be aware that "response received" is not the same as "complete html response received". The former is the start of the response, and the latter is the end of it. So, maybe you want to use the event "requestfinished" instead of "response". In fact you can use any event accepted by the puppeteer Page. They are: close, console, dialog, domcontentloaded, error, frameattached, framedetached, framenavigated, load, metrics, pageerror, popup, request, requestfailed, requestfinished, response, workercreated, workerdestroyed.
Try to use these: requestfinished or framenavigated. Maybe they will fit well for you.
To help you decide which one is perfect for you, you could setup a test code like this:
const puppeteer = require('puppeteer');
/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified page `event` happens, or `timeout` elapses.
 *
 * @param {puppeteer.Page} page
 * @param {string} event Can be any event accepted by the method `page.on()`. E.g.: "requestfinished" or "framenavigated".
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified, waits forever.
 * @returns {Promise<void>} resolves (never rejects) when the event fires or the timeout elapses.
 */
function waitForEvent(page, event, timeout) {
  return new Promise((resolve) => {
    let timeoutId;

    const done = () => {
      // Always detach: if the timeout wins, the listener would otherwise stay
      // registered on the page until the event eventually fires (if ever).
      page.off(event, done);
      clearTimeout(timeoutId);
      resolve();
    };

    // Registered with `on` (not `once`) so that `off` receives the exact function
    // that was attached, regardless of how the emitter wraps one-shot listeners.
    page.on(event, done);
    if (typeof timeout === 'number' && timeout >= 0) {
      timeoutId = setTimeout(done, timeout);
    }
  });
}
// Demo: log the page-level request lifecycle events and stop at the first
// `requestfinished` event.
(async () => {
  const browser = await puppeteer.launch();

  // try/finally guarantees the browser process is closed even if a step below throws.
  try {
    const page = await browser.newPage();
    // NOTE(review): the CDP session is not referenced after these two calls; the
    // listeners below are attached to `page`, not `cdp`. Confirm whether it is needed.
    const cdp = await page.target().createCDPSession();
    await cdp.send('Network.enable');
    await cdp.send('Page.enable');

    const t0 = Date.now();

    page.on('request', (req) => console.log(`> ${Date.now() - t0} request start: ${req.url()}`));
    page.on('response', (res) => console.log(`< ${Date.now() - t0} response: ${res.url()}`));
    page.on('requestfinished', (req) => console.log(`. ${Date.now() - t0} request finished: ${req.url()}`));
    page.on('requestfailed', (req) => console.log(`E ${Date.now() - t0} request failed: ${req.url()}`));

    // goto() is deliberately not awaited; its rejection is discarded because the page
    // is closed below before the navigation itself completes.
    page.goto('https://www.google.com').catch(() => { });
    await waitForEvent(page, 'requestfinished');

    console.log(`\nThe page was released after ${Date.now() - t0}ms\n`);
    await page.close();
  } finally {
    await browser.close();
  }
})();
/* The output should be something like this:
> 2 request start: https://www.google.com/
< 355 response: https://www.google.com/
> 387 request start: https://www.google.com/tia/tia.png
> 387 request start: https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png
. 389 request finished: https://www.google.com/
The page was released after 389ms
*/
Upvotes: 3
Reputation: 18826
The domcontentloaded is the event for first html content.
The DOMContentLoaded event fires when the initial HTML document has been completely loaded and parsed, without waiting for stylesheets, images, and subframes to finish loading.
The following will finish loading just when the initial HTML document is loaded.
await page.goto(url, {waitUntil: 'domcontentloaded'})
However, you can block images or stylesheets to save your bandwidth and load even faster in case you are loading 10 pages at once.
Put the code below in the right place (before navigating using page.goto
) and it will stop loading image, stylesheet, font and scripts.
// Resource types to drop; everything else is allowed through.
const blockedResourceTypes = ['image', 'stylesheet', 'font', 'script'];

// Interception must be enabled before the 'request' handler can abort/continue requests.
await page.setRequestInterception(true);
page.on('request', (request) => {
  const isBlocked = blockedResourceTypes.includes(request.resourceType());
  if (isBlocked) {
    request.abort();
    return;
  }
  request.continue();
});
Upvotes: 5