Chakra

Reputation: 68

Taking a screenshot of each tab with puppeteer-cluster

I am using puppeteer-cluster and ImageMagick's import / the xwd command to take a screenshot of the complete desktop.

I need the screenshot to show the browser with the viewable part of the page, the navigation buttons, and the URL bar. I can get the screenshot most of the time, but it fails at other times.

The error message says the tab is closed before the screenshot is done. Please suggest what I am doing wrong.

The code runs on Linux with an X server on DISPLAY :0.3, where I can see the browser windows.

Below is the code I have; I tried blockingWait and also the promise-based wait.

const {
  Cluster
} = require('puppeteer-cluster');
const execSync = require('child_process').execSync;

process.env['DISPLAY'] = ':0.3';
let i = 0;

function wait(time) {
  return new Promise((resolve) => setTimeout(resolve, time));
}

function blockingWait(seconds) {
  //simple blocking technique (wait...)
  var waitTill = new Date(new Date().getTime() + seconds * 1000);
  while (waitTill > new Date()) {}
}

function getscreenshot(url, page) {
  page.bringToFront(); // Get the tab to focus 
  wait(200);
  i = i + 1; // For now get screenshot as number will add image named based on URL 
  path = i + '.jpg';
  var r = execSync('import -window root ' + path);
  console.log('Taken screenshot: ' + path);
  console.log(url);
  blockingWait(1);
}

(async () => {
  // Create a cluster with 6 workers or 6 tabs which loads all the url
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 6,
    timeout: 120000,
    puppeteerOptions: {
      executablePath: 'google-chrome-stable',
      args: [
        '--ignore-certificate-errors',
        '--no-sandbox',
        '--incognito',
        '--disable-infobars',
        '--disable-setuid-sandbox',
        '--window-size=1600,1200',
        '--start-maximized',
        '--disable-gpu'
      ],
      headless: false, //headless:false so we can watch the browser as it works
    },
  });
  console.log('cluster launched');

  // We don't define a task and instead use own functions
  const screenshot = async ({
    page,
    data: url
  }) => {
    console.log('screenshot entered ');
    await page.setExtraHTTPHeaders({
      'CUSTOMER-ID': "66840"
    }, ); // use same customer id as header
    await page.setViewport({
      width: 1600,
      height: 1200
    });
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3419.0 Safari/537.36');
    await page.goto(url, {
      waitUntil: 'domcontentloaded'
    }, {
      waitUntil: 'networkidle0'
    }, {
      waitUntil: 'load'
    });
    // Since we wait the page to fully load

    await page.waitForResponse(response => response.ok()) // ok page is ready .. will deal here for other HTTP error beside 200, 404,500 etc 

    await page.waitForNavigation({
      waitUntil: 'domcontentloaded'
    }, {
      waitUntil: 'networkidle0'
    }, ); // Wait for page to load before screenshot
    await page.bringToFront(); // Get the tab to focus 
    wait(100); // Blocking wait
    console.log('Waiting 5 sec');
    blockingWait(5); // different kind of wait
    getscreenshot(url, page);
    console.log('screenshot exited');
  };

  const extractTitle = async ({
    page,
    data: url
  }) => {
    console.log('scrapelinks entered');
    await page.setExtraHTTPHeaders({
      'CUSTOMER-ID': "66840"
    }, );
    await page.setViewport({
      width: 1600,
      height: 1200
    });
    await page.goto(url);
    const pageTitle = await page.evaluate(() => document.title); // will later used to confirm the page matches with client details.
    // get all Links on the page
    const hrefs = await page.$$eval('a', hrefs => hrefs.map((a) => {
      return {
        href: a.href,
        text: a.textContent,
      };
    }));
    // get 1st links matching text or link value having bioanalyzer-systems/instrument-2100.xhtml
    for (let postUrl of hrefs) {
      if (postUrl.text.indexOf("Client-s") > -1) {
        cluster.execute(postUrl.href, screenshot); // add this link also to queue
      } else if (postUrl.href.indexOf("bioanalyzer-systems/instrument-2100.xhtml") > -1) {
        cluster.execute(postUrl.href, screenshot); // add this url to queue
        break;
      }
    }
    console.log('scrapelinks exited');
  };

  // Make screenshots
  cluster.execute('http://www.internal-site.int/en/product/66840?product=NEW&CodeList=bio&Id=66840', screenshot);
  cluster.execute('http://www.internal-site.int/en/product/66840?product=USED&CodeList=nonbio&Id=66840', screenshot);

  // But also do some other stuff
  cluster.execute('http://www.internal-site.int/en/product/66840?product=NEW&CodeList=bio&Id=66840', extractTitle);
  cluster.execute('http://www.internal-site.int/en/product/66840?product=USED&CodeList=nonbio&Id=66840', extractTitle);

  await cluster.idle();
  await cluster.close();
})();

I expect the screenshot to be taken once the page/tab has finished loading.

Upvotes: 0

Views: 1771

Answers (1)

Thomas Dondorf

Reputation: 25280

The page is being closed as soon as the function is finished executing (or the Promise is resolved). You are not using await to wait for the asynchronous action to finish.

For example, in your screenshot function, there is the following code:

wait(100);
console.log('Waiting 5 sec');
blockingWait(5);
getscreenshot(url, page);
console.log('screenshot exited');

The first line calls the wait function (which is async), but as you are not awaiting it, the function will be executed in the background and Node.js will continue to execute your script.
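For example (a minimal sketch, assuming you keep your wait helper), prefixing the call with await makes the task actually pause before moving on:

await wait(100); // pauses this task for 100 ms without blocking the event loop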

The blockingWait function is not the idiomatic JavaScript way to wait; it completely blocks the execution.

The getscreenshot function should again be async so that you can await it. Also, some of the puppeteer function calls should have await in front of them (e.g. page.bringToFront) to wait until they are finished.
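Putting these points together, a sketch of how getscreenshot could be rewritten as an async function (keeping the execSync/import call and the shared counter i from the question) might look like this:

async function getscreenshot(url, page) {
  await page.bringToFront();  // actually wait until the tab has focus
  await wait(200);            // non-blocking delay instead of blockingWait
  i = i + 1;                  // screenshots are numbered for now
  const path = i + '.jpg';
  execSync('import -window root ' + path); // capture the whole desktop via ImageMagick
  console.log('Taken screenshot: ' + path);
  console.log(url);
}

Inside the screenshot task it then has to be called with await getscreenshot(url, page); so that the page is not closed before the capture finishes.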

In general, you should check out the concept of async/await and Promises to understand where and why you should be using these keywords.

Upvotes: 1
