Reputation: 107
I'm trying to scrape several elements from a website that spans several pages. I'm currently using PhantomJS for this, and my code almost works; the problem is that it scrapes the first page twice, even though (according to the log) it seems to have already moved on to the second page.
Here's the code:
// Scrape link hrefs from a paginated site with PhantomJS, following the "next" link between pages.
var page = require('webpage').create();
page.viewportSize = { width: 1061, height: 1000 }; // Size of the virtual browser window
page.open("website", function () {
// Log the href of every matching link on the current page, take a screenshot,
// then schedule the move to the next page.
function fetch_names(){
var name = page.evaluate(function () {
// Runs inside the page: map the matched anchors to their href attributes.
return [].map.call(document.querySelectorAll('div.pepitesteasermain h2 a'), function(name){
return name.getAttribute('href');
});
});
console.log(name.join('\n'));
page.render('1.png'); // Same file name on every call
// NOTE(review): this 5 s delay happens before the click in goto_next_page(), not after it.
window.setTimeout(function (){
goto_next_page();
}, 5000);
}
// Click the "next" pagination link by dispatching a synthetic mouse event inside the page.
function goto_next_page(){
page.evaluate(function () {
var a = document.querySelector('#block-system-main .next a');
var e = document.createEvent('MouseEvents');
e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
a.dispatchEvent(e);
waitforload = true; // Assigned without a declaration; nothing in this script reads it
});
// NOTE(review): fetch_names() runs immediately after the click is dispatched, with no
// wait for the new page to load - likely why the first page gets scraped twice.
fetch_names();
}
// Start with the page that was just opened.
fetch_names();
});
You can try it yourself to see how it all works.
Upvotes: 4
Views: 1003
Reputation: 61892
You need to wait for the page to load after you click, not before you click. To do that, move the setTimeout() call from fetch_names to goto_next_page:
/**
 * Scrape the current page: log the href of every teaser link, save a
 * screenshot, then move on to the next page.
 *
 * Relies on the surrounding script's `page` (PhantomJS webpage object) and
 * on `goto_next_page()`.
 */
function fetch_names(){
    // The callback runs inside the page. Fall back to an empty list so that
    // a missing result from evaluate() does not make join() throw.
    var hrefs = page.evaluate(function () {
        var links = document.querySelectorAll('div.pepitesteasermain h2 a');
        return [].map.call(links, function (link) {
            return link.getAttribute('href');
        });
    }) || [];
    console.log(hrefs.join('\n'));

    // Number the screenshots (1.png, 2.png, ...) so each page keeps its own
    // file instead of overwriting the previous one.
    fetch_names.pageCount = (fetch_names.pageCount || 0) + 1;
    page.render(fetch_names.pageCount + '.png');

    goto_next_page();
}
/**
 * Click the pager's "next" link and, after giving the new page time to load,
 * scrape it with fetch_names(). When no "next" link is present (last page),
 * end the PhantomJS run instead of scraping the same page again.
 *
 * Relies on the surrounding script's `page` (PhantomJS webpage object) and
 * on `fetch_names()`.
 */
function goto_next_page(){
    // The callback runs inside the page and reports whether a "next" link
    // was found and clicked.
    var clicked = page.evaluate(function () {
        var a = document.querySelector('#block-system-main .next a');
        if (!a) {
            return false;
        }
        var e = document.createEvent('MouseEvents');
        e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        a.dispatchEvent(e);
        return true;
    });

    if (!clicked) {
        // Nothing left to follow: stop rather than re-scrape every 5 seconds.
        phantom.exit();
        return;
    }

    // Wait AFTER the click so the next page can load before it is scraped.
    window.setTimeout(function (){
        fetch_names();
    }, 5000);
}
Note that there are better ways to wait for something than a static timeout. Instead, you can register a handler for the page.onLoadFinished event:
// Alternative to the fixed delay: have fetch_names run when a page load finishes.
page.onLoadFinished = fetch_names;
Alternatively, you can wait for a specific selector to appear, using the waitFor() function from the PhantomJS examples.
Upvotes: 3