Jeremy Dana
Jeremy Dana

Reputation: 107

How to go to the next page for scraping in PhantomJS

I'm trying to get several elements from a website that spans several pages. I'm currently using PhantomJS for this, and my code almost works. The issue is that it scrapes the first page twice, even though (according to the log) it seems I have already moved on to the second one.

Here's the code:

// Create the PhantomJS web page object used for all navigation and scraping below.
var page = require('webpage').create();
page.viewportSize = { width: 1061, height: 1000 }; //To specify the window size
// Open the start URL; the callback defines two helpers that call each other
// and kicks off the scrape loop with the first fetch_names() call.
page.open("website", function () {

    // Logs the href of every matching link on the current page, renders a
    // screenshot, then schedules goto_next_page() to run 5 seconds later.
    function fetch_names(){
        // The inner function runs inside the page and returns an array with
        // the href attribute of each element matched by the selector.
        var name = page.evaluate(function () {
            return [].map.call(document.querySelectorAll('div.pepitesteasermain h2 a'), function(name){
                return name.getAttribute('href');
            });
        });
        // One href per line.
        console.log(name.join('\n'));
        // The same file name is used on every call.
        page.render('1.png');
        // The 5 second delay here happens BEFORE the click in goto_next_page().
        window.setTimeout(function (){
            goto_next_page();
        }, 5000);
    }

    // Dispatches a synthetic mouse click on the "next" link inside the page and
    // then calls fetch_names() right away, with no delay after the click.
    function goto_next_page(){
        page.evaluate(function () {
            var a = document.querySelector('#block-system-main .next a');
            var e = document.createEvent('MouseEvents');
            e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
            a.dispatchEvent(e);
            // Assigned without `var`; nothing in this snippet reads it.
            waitforload = true;

        });
        // Invoked immediately after the click has been dispatched.
        fetch_names();
    }

    // Start with the page that was just opened.
    fetch_names();
});

You can try it yourself to see how it all works.

Upvotes: 4

Views: 1003

Answers (1)

Artjom B.
Artjom B.

Reputation: 61892

You need to wait for the page to load after you click, not before you click. To do that, move setTimeout() from fetch_names to goto_next_page:

/**
 * Scrapes the page that is currently loaded.
 *
 * Collects the `href` of every link matched by 'div.pepitesteasermain h2 a'
 * inside the page, prints them one per line, renders a screenshot to
 * '1.png', and then hands control to goto_next_page().
 */
function fetch_names(){
    // The callback is executed inside the page, so it may only touch the DOM.
    var hrefs = page.evaluate(function () {
        var anchors = document.querySelectorAll('div.pepitesteasermain h2 a');
        return Array.prototype.map.call(anchors, function (anchor) {
            return anchor.getAttribute('href');
        });
    });

    var listing = hrefs.join('\n');
    console.log(listing);

    page.render('1.png');

    // No waiting here: the delay belongs after the click, in goto_next_page().
    goto_next_page();
}

/**
 * Clicks the pager's "next" link and schedules the next scrape.
 *
 * If the current page has no "next" link (i.e. the last page was reached),
 * nothing is clicked and PhantomJS is shut down instead of re-scraping the
 * same page forever. Otherwise fetch_names() is called again after a fixed
 * 5 second delay, giving the next page time to load.
 */
function goto_next_page(){
    // Runs inside the page. Returns true when a "next" link was found and a
    // synthetic click was dispatched on it, false when there is no such link.
    var clicked = page.evaluate(function () {
        var a = document.querySelector('#block-system-main .next a');
        if (!a) {
            return false;
        }
        var e = document.createEvent('MouseEvents');
        e.initMouseEvent('click', true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null);
        a.dispatchEvent(e);
        return true;
    });

    if (!clicked) {
        // No further page to visit: end the script so the process terminates.
        phantom.exit();
        return;
    }

    // Wait AFTER the click so the new page is loaded before it is scraped.
    window.setTimeout(function (){
        fetch_names();
    }, 5000);
}

Note that there are many other ways to wait for something besides a static timeout. Instead, you can:

  • register a handler for the page.onLoadFinished event:

    // Makes fetch_names the page's onLoadFinished handler, as an alternative to the fixed delay.
    // NOTE(review): if the setTimeout in goto_next_page is kept as well, fetch_names
    // would presumably be invoked by both the timer and this handler — confirm and drop one.
    page.onLoadFinished = fetch_names;
    
  • wait for a specific selector to appear with the waitFor() function from the examples.

Upvotes: 3

Related Questions