Reputation: 1924
I need help to undestand how timeout works, especially with node/puppeteer
I read all stack questions and github issues about this, but i can figure it out what is wrong
Probably my code...
When i run this file, i receive the error from image. You can see the ways i tryied to fix it, nothing works
Can someone explain why this happens and the best approach to avoid this? Is there a better way to get these Projects?
//vou até os seeds em x tempo
var https = require('https');
var Q = require('q');
var fs = require('fs');
var puppeteer = require('puppeteer');
var Projeto = require('./Projeto.js');
const url = 'https://www.99freelas.com.br/projects?categoria=web-e-desenvolvimento'
/*const idToScrape;
deverá receber qual a url e os parametros específicos de cada seed */
async function genScraper() {
const browser = await puppeteer.launch();
const page = await browser.newPage();
//page.setDefaultNavigationTimeout(60000);
page.waitForNavigation( { timeout: 60000, waitUntil: 'domcontentloaded' });
await page.goto(url);
var projetos = await page.evaluate(() => {
let qtProjs = document.querySelectorAll('.result-list li').length;
let listaDeProjs = Array.from(document.querySelectorAll('.result-list li'));
let tempProjetos = [];
for( var i=0; i<=listaDeProjs.length; i++ ) {
let titulo = listaDeProjs[i].children[1].children[0].textContent;
let descricao = listaDeProjs[i].children[2].textContent;
let habilidades = listaDeProjs[i].children[3].textContent;
let publicado = listaDeProjs[i].children[1].children[1].children[0].textContent;
let tempoRestante = listaDeProjs[i].children[1].children[1].children[1].textContent;
//let infoCliente;
proj = new Projeto(titulo, descricao, habilidades, publicado, tempoRestante);
tempProjetos.push(proj);
}
return tempProjetos;
});
console.log(projetos);
browser.close();
}
genScraper();
Upvotes: 3
Views: 3370
Reputation: 957
I recommend you to avoid using the method waitForNavigation
before the goTo
call.
Basically, It would be better to use the method gotTo
with the default value, that is 30000
. In my opinion, if the website takes more than 30 seconds to work or respond, there should be something wrong.
Instead, I would do something like this:
await page.goto(url, {
waitUntil: 'networkidle0'
});
Depending on the version of puppeteer that you're using, you will have different behaviours. I am using version 1.4.0 and it is working good so far.
Inside the documentation states the following:
The page.goto will throw an error if:
- there's an SSL error (e.g. in case of self-signed certificates).
- target URL is invalid.
- the timeout is exceeded during navigation.
- the main resource failed to load.
So, check that none of the previous scenarios is happening.
Also, you can curl the URL from your terminal to see if the URL respond to outside calls, cross origin problems are common too.
Sincerely, there is no way to say what can be triggering your timeout, but that checklist should help. I had a problem with timeout recently and the problem was my server configuration, so I suggest you to see also if the machine in which you are running this code, has the necessary memory to execute.
Upvotes: 1
Reputation: 8617
In your for loop,
for( var i=0; i<=listaDeProjs; i++ ) {
...
}
listaDeProjs
should be listaDeProjs.length
Your evaluation script will fail in several places, if anywhere along this path is undefined: (E.g., if children[1]
is undefined or children[0]
is undefined.)
listaDeProjs[i].children[1].children[0].textContent;
You can do the following with lodash
:
_.get(listaDeProjs[i],"children[1].children[0].textContent","")
That will default to ""
if there is no such value.
Additionally, the following works perfectly fine with your code in 1.7 via https://try-puppeteer.appspot.com/
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: '5000'
});
Upvotes: 1