diesel94
diesel94

Reputation: 159

Requests for multiple pages with puppeteer

I am trying to collect information (company names and emails) with Puppeteer from many sites that load their content dynamically. The links are stored in an array. I iterate over the array with a `for` loop, call `page.goto(...)` for each site, wait until the page has loaded, wait several more seconds for the dynamic content, and then query the page. However, only the first and last requests complete (their promises resolve); the other promises do not return the dynamic content. What should I do to fix this? Thanks.

let puppeteer = require('puppeteer');

// Visits `${url}${name}` for every entry of arrayNames using one shared
// Puppeteer page. After each page's `load` event it waits 7 seconds and then
// reads the innerHTML of the e-mail element inside the page.
// NOTE(review): nothing in this script calls browser.close().
(async() => {
const browser = await puppeteer.launch();
let page = await browser.newPage();
const url = 'https://abcdsite.com/';
let arrayNames = ['first','second','third','abcd'];
// The loop has no update clause: `i` is only advanced in the final `.then`
// below, i.e. after a page has been handled successfully.
// `arrayNames.length` is a plain number, so the `await` does not change it.
for(let i=0;i<await arrayNames.length;){
    // `arrayNames[i]` is a plain string; the `await` does not change it either.
    let nameUrl = await arrayNames[i];
    // With four entries in arrayNames this repeats the loop condition, so the
    // `else { break }` branch is never reached as written.
    if (i<4){
      // Receives the Promise returned by page.evaluate (see below), not the e-mail string.
      let temp1;
      console.log(`begin for ${nameUrl}`);
      await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' })
          .then(()=>{
            return new Promise(res=>{
              // Wait 7 seconds so dynamically loaded content can appear.
              setTimeout(()=>{
                // page.evaluate returns a Promise; it is assigned without
                // `await`, so temp1 is that Promise object.
                temp1 = page.evaluate(() => {
                  return new Promise(resolve => { // <-- return the data to node.js from browser
                    // `name` is read but never used afterwards.
                    let name = document.querySelector('h1').innerHTML;
                    let email = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0]
                        .children[2].children[0].children[0].innerHTML;
                    resolve(email);
                  });
                });
                // Resolving with a Promise makes the outer chain wait for the
                // evaluation to settle; its value is handed to the next
                // `.then` as the `res` argument.
                res(temp1);
              },7000);

            })
      })
          // `res` carries the evaluated e-mail but is not used; the log prints
          // temp1, which is still the Promise object rather than the string.
          .then((res)=>{
            i++;
            console.log(`https://abcdsite.com/${nameUrl}`,temp1);
          });
    }
    else{
      break
    }
  }
})();

Upvotes: 0

Views: 6908

Answers (2)

charly rl
charly rl

Reputation: 885

Puppeteer's page.goto function accepts several options you can use to ensure that the page is fully loaded. See the documentation here. In addition, you can use the page.waitFor method to wait for a few seconds. See documentation here.

Here you have a simple example that I think may work for you:

const puppeteer = require('puppeteer');

const url = 'https://stackoverflow.com/';
const arrayNames = ['tags', 'users', 'jobs', 'questions'];

/**
 * Visits every `${url}${name}` page one after another with a single Puppeteer
 * page, collects the first <h1> text and the document title of each, and
 * prints the result keyed by full URL.
 *
 * The browser is always closed, even when a navigation or evaluation fails,
 * so the Node process can exit and no Chromium process is left behind.
 */
(async () => {
  const browser = await puppeteer.launch();
  const data = {};

  try {
    const page = await browser.newPage();

    for (const nameUrl of arrayNames) {
      const fullUrl = `${url}${nameUrl}`;
      console.log(`begin for ${fullUrl}`);
      // check networkidle0 parameter and others here:
      // https://pptr.dev/#?product=Puppeteer&version=v2.1.1&show=api-pagegotourl-options
      await page.goto(fullUrl, { waitUntil: 'networkidle0' });
      // Optional: wait 2 more seconds so late-rendered content can settle.
      // NOTE(review): page.waitFor matches the Puppeteer v2.1.1 API linked
      // above; later releases deprecate it - confirm against the installed version.
      await page.waitFor(2000);
      const pageData = await page.evaluate(() => {
        const name = document.querySelector('h1').innerText;
        const pageTitle = document.querySelector('title').innerText;
        // get whatever data you need to get from the page.
        return { name, title: pageTitle };
      });
      console.log('\t Data from page: ', pageData);
      data[fullUrl] = pageData;
    }
  } finally {
    // Release the browser regardless of success or failure.
    await browser.close();
  }

  console.log(data);
})().catch((err) => {
  // Surface failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

This does not load the sites in parallel, but you can experiment with the example. To run them in parallel, give each URL its own page (a single page can only perform one navigation at a time), collect the resulting promises in an array instead of awaiting each page.evaluate call in turn, and then use await Promise.all(listOfPromises)

Upvotes: 1

Saeed
Saeed

Reputation: 5488

I think this helps you.

1) make an async function to request and parse your data

2) create an array of parallel tasks.

let puppeteer = require('puppeteer');

/**
 * Loads `${url}${nameUrl}` in the given page, pauses so dynamically rendered
 * content can appear, and returns the contact e-mail markup from the sidebar.
 *
 * The pause is awaited directly: returning a value from inside a `setTimeout`
 * callback would never reach the caller, and `await` is not allowed in a
 * non-async callback.
 *
 * @param {object} page - Puppeteer page (any object exposing `goto` and `evaluate`).
 * @param {string} url - Base URL; `nameUrl` is appended to it verbatim.
 * @param {string} nameUrl - Path segment identifying the page to load.
 * @param {number} [delayMs=7000] - Milliseconds to wait after the `load` event
 *   before reading the page.
 * @returns {Promise<string>} innerHTML of the e-mail element.
 */
async function makeRequest(page, url, nameUrl, delayMs = 7000) {
  await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' });

  // Give the page time to fill in its dynamic content.
  await new Promise((resolve) => setTimeout(resolve, delayMs));

  // The callback runs inside the browser; its return value is sent back to Node.
  return page.evaluate(() => {
    const card = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0];
    return card.children[2].children[0].children[0].innerHTML;
  });
}

/**
 * Fetches every name in `arrayNames` in parallel and prints each URL together
 * with the value `makeRequest` produced for it.
 *
 * Every task opens its own page: a single page can only follow one navigation
 * at a time, so sharing one page between parallel tasks would make the
 * navigations overwrite each other. (For a sequential variant, create one
 * page with `browser.newPage()` and reuse it across calls.)
 */
(async () => {
  const browser = await puppeteer.launch();
  const url = 'https://abcdsite.com/';
  const arrayNames = ['first', 'second', 'third', 'abcd'];

  try {
    const tasks = arrayNames.map(async (nameUrl) => {
      const page = await browser.newPage();
      try {
        return await makeRequest(page, url, nameUrl);
      } finally {
        // Close the tab whether or not the request succeeded.
        await page.close();
      }
    });

    // Promise.all keeps results in the same order as arrayNames.
    const results = await Promise.all(tasks);
    results.forEach((result, i) => {
      console.log(`https://abcdsite.com/${arrayNames[i]}`, result);
    });
  } finally {
    // Always release the browser so the Node process can exit.
    await browser.close();
  }
})().catch((err) => {
  // Surface failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

Series solution

For more information read this.

// Sequential variant: each request finishes before the next one starts.
// Expects `page`, `url`, `arrayNames` and `makeRequest` to be in scope, with
// `page` being one Puppeteer page that is reused for every URL.
for (const [index, nameUrl] of arrayNames.entries()) {
  const email = await makeRequest(page, url, nameUrl);
  console.log(`https://abcdsite.com/${arrayNames[index]}`, email);
}

Upvotes: 2

Related Questions