Reputation: 159
I am trying to collect information (emails and company names) from many sites whose content is loaded dynamically, using Puppeteer. The links are stored in an array. I use a "for" loop to iterate over the array: for each link I call page.goto...,
wait until the site has loaded, wait several more seconds for the dynamic content, and then start querying the page. However, only the first and last requests complete (their promises resolve); the other promises do not return the dynamic content. What should I do to fix this? Thanks
let puppeteer = require('puppeteer');
// Crawl script: for each entry of arrayNames, navigates one shared page to
// `${url}${name}`, pauses 7 s, then tries to log an email read from the DOM.
// NOTE(review): browser.close() is never called in this block.
(async() => {
const browser = await puppeteer.launch();
let page = await browser.newPage();
const url = 'https://abcdsite.com/';
let arrayNames = ['first','second','third','abcd'];
// The loop has no increment clause: `i` only advances inside the final .then()
// callback below. arrayNames.length is a plain number, so this `await` is redundant.
for(let i=0;i<await arrayNames.length;){
// Array elements are plain strings, so this `await` is redundant as well.
let nameUrl = await arrayNames[i];
// Same bound as the loop condition for this 4-element array, so the else/break
// branch below is never reached.
if (i<4){
// Assigned inside the timer callback; ends up holding the Promise returned by
// page.evaluate, not the scraped string.
let temp1;
console.log(`begin for ${nameUrl}`);
await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' })
.then(()=>{
return new Promise(res=>{
// Fixed 7-second pause intended to let dynamic content finish loading.
setTimeout(()=>{
// page.evaluate returns a Promise and is not awaited here, so temp1 is a
// Promise object rather than the email text.
temp1 = page.evaluate(() => {
return new Promise(resolve => { // <-- return the data to node.js from browser
// `name` is read but never used; only `email` is sent back.
let name = document.querySelector('h1').innerHTML;
let email = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0]
.children[2].children[0].children[0].innerHTML;
resolve(email);
});
});
// Resolving with a promise makes the outer promise follow it, so the chain
// continues only once page.evaluate has settled.
res(temp1);
},7000);
})
})
.then((res)=>{
i++;
// `res` carries the value page.evaluate resolved with, but it is ignored:
// the log prints temp1 (the Promise object) instead.
console.log(`https://abcdsite.com/${nameUrl}`,temp1);
});
}
else{
break
}
}
})();
Upvotes: 0
Views: 6908
Reputation: 885
puppeteer's page.goto
function has multiple parameters you can use to ensure that the page is fully loaded. See the documentation here.
In addition, you can use the page.waitFor
method to wait for a few seconds. See documentation here.
Here you have a simple example that I think may work for you:
const puppeteer = require('puppeteer')

const url = 'https://stackoverflow.com/'
const arrayNames = ['tags', 'users', 'jobs', 'questions']
// Extra time (ms) granted after navigation for content that renders late. Optional.
const SETTLE_MS = 2000

/**
 * Resolves after `ms` milliseconds.
 * Plain timer used instead of page.waitFor(), which newer Puppeteer releases
 * deprecated and later removed.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function delay (ms) {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

/**
 * Visits `${url}${name}` for every entry of arrayNames, one after another on a
 * single page, and collects the <h1> text and document title of each page.
 * Logs the per-page result and finally the whole map keyed by full URL.
 * @returns {Promise<void>}
 */
async function main () {
  const browser = await puppeteer.launch()
  try {
    const page = await browser.newPage()
    const data = {}
    for (const nameUrl of arrayNames) {
      const fullUrl = `${url}${nameUrl}`
      console.log(`begin for ${fullUrl}`)
      await page.goto(fullUrl, { waitUntil: 'networkidle0' }) // check networkidle0 parameter and others here: https://pptr.dev/#?product=Puppeteer&version=v2.1.1&show=api-pagegotourl-options
      await delay(SETTLE_MS)
      const pageData = await page.evaluate(() => {
        // Not every page has an <h1>; report null instead of throwing so one
        // odd page does not abort the whole crawl.
        const heading = document.querySelector('h1')
        const name = heading ? heading.innerText : null
        const pageTitle = document.querySelector('title').innerText
        // get whatever data you need to get from the page.
        return { name: name, title: pageTitle }
      })
      console.log('\t Data from page: ', pageData)
      data[fullUrl] = pageData
    }
    console.log(data)
  } finally {
    // Always shut Chromium down; otherwise the child process keeps Node alive
    // after the crawl (or after an error).
    await browser.close()
  }
}

main().catch((err) => {
  // Surface failures instead of leaving an unhandled rejection.
  console.error(err)
  process.exitCode = 1
})
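This visits the sites one after another rather than in parallel, but you can use the example as a starting point.
To run them in parallel, open a separate page for each URL (a single page can only perform one navigation at a time) and, instead of awaiting each page.evaluate
call in turn, collect the promises in an array and then use await Promise.all(listOfPromises)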
Upvotes: 1
Reputation: 5488
I think this helps you.
1) make an async function to request and parse your data
2) create an array of parallel tasks.
let puppeteer = require('puppeteer');
/**
 * Navigates `page` to `${url}${nameUrl}`, waits for dynamic content, and
 * returns the contact email read from the rendered DOM.
 *
 * @param {object} page - Puppeteer page (anything exposing `goto` and `evaluate`).
 * @param {string} url - Base URL, including the trailing slash.
 * @param {string} nameUrl - Path segment appended to `url`.
 * @param {number} [waitMs=7000] - Pause after the `load` event, in milliseconds,
 *   to give dynamically loaded content time to appear.
 * @returns {Promise<string>} innerHTML of the email element.
 * @throws If navigation fails or the expected contact-card markup is missing.
 */
async function makeRequest(page, url, nameUrl, waitMs = 7000) {
  await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' });

  // The delay has to be awaited: a value returned from inside a bare
  // setTimeout callback is discarded, and `await` is not allowed in a
  // non-async callback.
  await new Promise((resolve) => setTimeout(resolve, waitMs));

  // The callback runs in the browser context; only its return value is sent
  // back to Node.
  return page.evaluate(() => {
    const card = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0];
    return card.children[2].children[0].children[0].innerHTML;
  });
}
// Parallel driver: scrapes every entry of arrayNames concurrently and logs
// `<full url> <email>` for each, in the order of arrayNames.
(async () => {
  const browser = await puppeteer.launch();
  const url = 'https://abcdsite.com/';
  const arrayNames = ['first', 'second', 'third', 'abcd'];

  try {
    // A page can only perform one navigation at a time, so each parallel task
    // opens (and afterwards closes) its own page. Sharing one page would make
    // the concurrent goto() calls interrupt each other. (A sequential crawl
    // could reuse a single page instead.)
    const tasks = arrayNames.map(async (nameUrl) => {
      const page = await browser.newPage();
      try {
        return await makeRequest(page, url, nameUrl);
      } finally {
        await page.close();
      }
    });

    // Promise.all keeps results in the same order as `tasks`.
    const results = await Promise.all(tasks);
    results.forEach((email, i) => {
      console.log(`https://abcdsite.com/${arrayNames[i]}`, email);
    });
  } finally {
    // Always shut Chromium down so the Node process can exit.
    await browser.close();
  }
})().catch((err) => {
  // Report failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
Series solution
For more information read this.
// Sequential variant: each request finishes before the next one starts,
// so a single shared page is safe here.
for (const nameUrl of arrayNames) {
  const email = await makeRequest(page, url, nameUrl);
  console.log(`https://abcdsite.com/${nameUrl}`, email);
}
Upvotes: 2