user3817605
user3817605

Reputation: 161

Puppeteer: how to wait only first response (HTML)

I'm using puppeteer-cluster to crawl web pages.

If I open many pages at a time for a single website (8-10 pages), the connection slows down and many timeout errors come up, like this:

TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded

I need to access only the HTML code of each page. I don't need to wait for domcontentloaded and so on.

Is there a way to tell page.goto() to wait only for the first response from the web server? Or do I need to use another technology instead of puppeteer?

Upvotes: 3

Views: 5100

Answers (3)

lcrespilho
lcrespilho

Reputation: 907

I can see two other ways to achieve what you want: using page.waitForResponse and page.waitForFunction. Let's see both.

With page.waitForResponse you can do something as simple as this:

// Start the navigation but do not await it; the empty catch swallows any rejection from goto.
page.goto('https://www.google.com/').catch(() => {});
// Continue as soon as a response for exactly this URL has been received.
await page.waitForResponse('https://www.google.com/'); // don't forget to put the final slash

Pretty simple, huh? If you don't like it, try page.waitForFunction and wait until the document is created:

// Start the navigation but do not await it; the empty catch swallows any rejection from goto.
page.goto('https://www.google.com/').catch(() => {});
// Continue once the callback, evaluated inside the page, returns a truthy value (i.e. `document` exists).
await page.waitForFunction(() => document); // you can use `window` too. It is almost the same

This code will wait until the document exists. This happens when the first bit of HTML arrives and the browser starts to create the DOM tree representation of the document.

But be aware that although these two solutions are simple, neither of them waits until the entire HTML page/document is downloaded. If you want that, you should modify the waitForEvent function from my other answer to accept the specific URL that you want to be completely downloaded. Example:

/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified `requestUrl` resource has finished downloading, or `timeout` elapses.
 *
 * `requestUrl` is normalized with the WHATWG `URL` parser before it is compared, so
 * `https://www.amazon.com` and `https://www.amazon.com/` refer to the same resource.
 * A string that cannot be parsed as an absolute URL is compared verbatim.
 *
 * Only the `requestfinished` page event is observed. If the request fails instead of
 * finishing and no `timeout` is given, the returned promise never settles.
 *
 * @param {puppeteer.Page} page
 * @param {string} requestUrl url of the resource you want to wait for.
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified (or not a non-negative number), waits forever.
 * @returns {Promise<void>} resolves (never rejects) with no value on completion or timeout.
 */
function waitForRequestToFinish(page, requestUrl, timeout) {
  // Normalize so a missing trailing slash on a bare origin does not make the
  // comparison below fail silently.
  let expectedUrl = requestUrl;
  try {
    expectedUrl = new URL(requestUrl).href;
  } catch {
    // Not an absolute URL: fall back to comparing the raw string.
  }

  return new Promise((resolve) => {
    const hasTimeout = typeof timeout === 'number' && timeout >= 0;
    const timeoutId = hasTimeout ? setTimeout(done, timeout) : undefined;
    page.on('requestfinished', onRequestFinished);

    // Single cleanup path shared by the timeout and the matching request.
    function done() {
      page.removeListener('requestfinished', onRequestFinished);
      clearTimeout(timeoutId);
      resolve();
    }

    function onRequestFinished(req) {
      const finishedUrl = req.url();
      // The raw string is still accepted so existing callers keep working.
      if (finishedUrl === expectedUrl || finishedUrl === requestUrl) done();
    }
  });
}

How to use it:

// Start the navigation but do not await it; the empty catch swallows any rejection from goto.
page.goto('https://www.amazon.com/').catch(() => {});
// Wait until the main document has finished downloading, or stop waiting after 3000 ms.
await waitForRequestToFinish(page, 'https://www.amazon.com/', 3000);

Complete example showing neat console.logs:

const puppeteer = require('puppeteer');

/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified `requestUrl` resource has finished downloading, or `timeout` elapses.
 *
 * `requestUrl` is normalized with the WHATWG `URL` parser before it is compared, so
 * `https://www.amazon.com` and `https://www.amazon.com/` refer to the same resource.
 * A string that cannot be parsed as an absolute URL is compared verbatim.
 *
 * Only the `requestfinished` page event is observed. If the request fails instead of
 * finishing and no `timeout` is given, the returned promise never settles.
 *
 * @param {puppeteer.Page} page
 * @param {string} requestUrl url of the resource you want to wait for.
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified (or not a non-negative number), waits forever.
 * @returns {Promise<void>} resolves (never rejects) with no value on completion or timeout.
 */
function waitForRequestToFinish(page, requestUrl, timeout) {
  // Normalize so a missing trailing slash on a bare origin does not make the
  // comparison below fail silently.
  let expectedUrl = requestUrl;
  try {
    expectedUrl = new URL(requestUrl).href;
  } catch {
    // Not an absolute URL: fall back to comparing the raw string.
  }

  return new Promise((resolve) => {
    const hasTimeout = typeof timeout === 'number' && timeout >= 0;
    const timeoutId = hasTimeout ? setTimeout(done, timeout) : undefined;
    page.on('requestfinished', onRequestFinished);

    // Single cleanup path shared by the timeout and the matching request.
    function done() {
      page.removeListener('requestfinished', onRequestFinished);
      clearTimeout(timeoutId);
      resolve();
    }

    function onRequestFinished(req) {
      const finishedUrl = req.url();
      // The raw string is still accepted so existing callers keep working.
      if (finishedUrl === expectedUrl || finishedUrl === requestUrl) done();
    }
  });
}

// Demo: log every CDP network event with a timestamp, then release the page as soon
// as the main document has finished downloading (or after 3000 ms).
(async () => {
  // requestId -> request URL, so later CDP events (which only carry the id) can be labelled.
  const netMap = new Map();
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const cdp = await page.target().createCDPSession();
    await cdp.send('Network.enable');
    await cdp.send('Page.enable');
    const t0 = Date.now();
    cdp.on('Network.requestWillBeSent', ({ requestId, request: { url: requestUrl } }) => {
      netMap.set(requestId, requestUrl);
      console.log(`> ${Date.now() - t0}ms\t requestWillBeSent:\t${requestUrl}`);
    });
    cdp.on('Network.responseReceived', ({ requestId }) => console.log(`< ${Date.now() - t0}ms\t responseReceived:\t${netMap.get(requestId)}`));
    cdp.on('Network.dataReceived', ({ requestId, dataLength }) => console.log(`< ${Date.now() - t0}ms\t dataReceived:\t\t${netMap.get(requestId)} ${dataLength} bytes`));
    cdp.on('Network.loadingFinished', ({ requestId }) => console.log(`. ${Date.now() - t0}ms\t loadingFinished:\t${netMap.get(requestId)}`));
    cdp.on('Network.loadingFailed', ({ requestId }) => console.log(`E ${Date.now() - t0}ms\t loadingFailed:\t${netMap.get(requestId)}`));

    // The magic happens here: goto is deliberately not awaited, and its rejection
    // (expected once the page is closed mid-navigation) is swallowed.
    page.goto('https://www.amazon.com').catch(() => { });
    await waitForRequestToFinish(page, 'https://www.amazon.com/', 3000);

    console.log(`\nThe page was released after ${Date.now() - t0}ms\n`);
    await page.close();
  } finally {
    // Always shut the browser down, even if a step above throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface failures instead of leaving the promise floating.
  console.error(err);
  process.exitCode = 1;
});

/* OUTPUT EXAMPLE
[... lots of logs removed ...]
> 574ms  requestWillBeSent:     https://images-na.ssl-images-amazon.com/images/I/71vvXGmdKWL._AC_SY200_.jpg
< 574ms  dataReceived:          https://www.amazon.com/ 65536 bytes
< 624ms  responseReceived:      https://images-na.ssl-images-amazon.com/images/G/01/AmazonExports/Fuji/2019/February/Dashboard/computer120x._CB468850970_SY85_.jpg
> 628ms  requestWillBeSent:     https://images-na.ssl-images-amazon.com/images/I/81Hhc9zh37L._AC_SY200_.jpg
> 629ms  requestWillBeSent:     https://images-na.ssl-images-amazon.com/images/G/01/personalization/ybh/loading-4x-gray._CB317976265_.gif
< 631ms  dataReceived:          https://www.amazon.com/ 58150 bytes
. 631ms  loadingFinished:       https://www.amazon.com/

*/

This code shows lots of requests and responses, but the code stops as soon as "https://www.amazon.com/" has been completely downloaded.

Upvotes: 1

lcrespilho
lcrespilho

Reputation: 907

@user3817605, I have the perfect code for you. :)

/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified page `event` happens, or `timeout` elapses.
 *
 * The listener is always detached when the promise resolves, including when the
 * timeout wins, so repeated calls do not accumulate listeners on `page`.
 *
 * @param {puppeteer.Page} page
 * @param {string} event Can be any event accepted by the method `page.on()`. E.g.: "requestfinished" or "framenavigated".
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified (or not a non-negative number), waits forever.
 * @returns {Promise<void>} resolves (never rejects) with no value on the first occurrence of `event` or on timeout.
 */
function waitForEvent(page, event, timeout) {
  return new Promise((resolve) => {
    const hasTimeout = typeof timeout === 'number' && timeout >= 0;
    const timeoutId = hasTimeout ? setTimeout(done, timeout) : undefined;
    // Registered with `on` and removed explicitly in `done`, so the timeout path
    // cleans up the listener as well.
    page.on(event, done);

    function done() {
      page.removeListener(event, done);
      clearTimeout(timeoutId);
      resolve();
    }
  });
}

You asked for a function that waits only for the first response, so you use this function like this:

// `<URL>` is a placeholder for the address to load. goto is intentionally not awaited here.
page.goto(<URL>); // use .catch(() => {}) if you kill the page too soon, to avoid errors being thrown on the console
await waitForEvent(page, 'response'); // after this line the first response has already been received

This is exactly what you've asked for. But be aware that "response received" is not the same as "complete html response received". The former is the start of the response, and the latter is the end of it. So, maybe you want to use the event "requestfinished" instead of "response". In fact you can use any event accepted by the puppeteer Page. They are: close, console, dialog, domcontentloaded, error, frameattached, framedetached, framenavigated, load, metrics, pageerror, popup, request, requestfailed, requestfinished, response, workercreated, workerdestroyed.

Try to use these: requestfinished or framenavigated. Maybe they will fit well for you.

To help you decide which one is perfect for you, you could setup a test code like this:

const puppeteer = require('puppeteer');

/**
 * The methods `page.waitForNavigation` and `frame.waitForNavigation` wait for the page
 * event `domcontentloaded` at minimum. This function returns a promise that resolves as
 * soon as the specified page `event` happens, or `timeout` elapses.
 *
 * The listener is always detached when the promise resolves, including when the
 * timeout wins, so repeated calls do not accumulate listeners on `page`.
 *
 * @param {puppeteer.Page} page
 * @param {string} event Can be any event accepted by the method `page.on()`. E.g.: "requestfinished" or "framenavigated".
 * @param {number} [timeout] optional time to wait, in milliseconds. If not specified (or not a non-negative number), waits forever.
 * @returns {Promise<void>} resolves (never rejects) with no value on the first occurrence of `event` or on timeout.
 */
function waitForEvent(page, event, timeout) {
  return new Promise((resolve) => {
    const hasTimeout = typeof timeout === 'number' && timeout >= 0;
    const timeoutId = hasTimeout ? setTimeout(done, timeout) : undefined;
    // Registered with `on` and removed explicitly in `done`, so the timeout path
    // cleans up the listener as well.
    page.on(event, done);

    function done() {
      page.removeListener(event, done);
      clearTimeout(timeoutId);
      resolve();
    }
  });
}

// Demo: log the page's request lifecycle events with timestamps and release the page
// as soon as the first `requestfinished` event fires.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // NOTE(review): this CDP session is not read by the listeners below; presumably
    // carried over from the CDP-based example. Confirm before removing.
    const cdp = await page.target().createCDPSession();
    await cdp.send('Network.enable');
    await cdp.send('Page.enable');
    const t0 = Date.now();
    page.on('request', req => console.log(`> ${Date.now() - t0} request start: ${req.url()}`));
    page.on('response', res => console.log(`< ${Date.now() - t0} response: ${res.url()}`));
    page.on('requestfinished', req => console.log(`. ${Date.now() - t0} request finished: ${req.url()}`));
    page.on('requestfailed', req => console.log(`E ${Date.now() - t0} request failed: ${req.url()}`));

    // goto is deliberately not awaited; its rejection (expected once the page is
    // closed mid-navigation) is swallowed.
    page.goto('https://www.google.com').catch(() => { });
    await waitForEvent(page, 'requestfinished');
    console.log(`\nThe page was released after ${Date.now() - t0}ms\n`);
    await page.close();
  } finally {
    // Always shut the browser down, even if a step above throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface failures instead of leaving the promise floating.
  console.error(err);
  process.exitCode = 1;
});

/* The output should be something like this:

> 2 request start: https://www.google.com/
< 355 response: https://www.google.com/
> 387 request start: https://www.google.com/tia/tia.png
> 387 request start: https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png
. 389 request finished: https://www.google.com/

The page was released after 389ms

*/

Upvotes: 3

Md. Abu Taher
Md. Abu Taher

Reputation: 18826

The domcontentloaded is the event for first html content.

The DOMContentLoaded event fires when the initial HTML document has been completely loaded and parsed, without waiting for stylesheets, images, and subframes to finish loading.

The following will finish loading just when the initial HTML document is loaded.

// Resolve the navigation at `domcontentloaded` instead of waiting for the full `load` event.
await page.goto(url, {waitUntil: 'domcontentloaded'})

However, you can block images or stylesheets to save your bandwidth and load even faster in case you are loading 10 pages at once.

Put the code below in the right place (before navigating using page.goto) and it will stop loading images, stylesheets, fonts and scripts.

// Resource types that are aborted instead of being fetched.
const blockedResourceTypes = new Set(['image', 'stylesheet', 'font', 'script']);

// Interception must be enabled before any request can be aborted or continued.
await page.setRequestInterception(true);
page.on('request', (request) => {
    // Anything that is not on the block list goes through untouched.
    if (!blockedResourceTypes.has(request.resourceType())) {
        request.continue();
        return;
    }
    request.abort();
});

Upvotes: 5

Related Questions