André
André

Reputation: 25554

How to navigate a paginated website with PhantomJS?

I need to navigate a paginated website. Clicking a pagination link fires an Ajax request that loads the new data into the page.

For now I have working example code that waits 20 seconds and then clicks on the link ("ul.pageNavi li.next").

// Address of the first results page.
var url = 'https://www.somewebsite.com';

// Create the PhantomJS page object and open the url.
var page = require('webpage').create();
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0';
page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to load the address!');
        phantom.exit();
        return; // nothing else to do once the exit has been requested
    }

    // Fixed 20 second wait so the initial page content can load.
    window.setTimeout(function () {
        page.render('1.png');

        // Begin - click on the pagination
        // The callback runs inside the loaded page. It reports whether the
        // "next" link was found so a missing link does not throw in the page.
        var clicked = page.evaluate(function () {
            // find element to send click to
            var element = document.querySelector('ul.pageNavi li.next');
            if (!element) {
                return false;
            }

            // create a mouse click event
            var event = document.createEvent('MouseEvents');
            event.initMouseEvent('click', true, true, window, 1, 0, 0);

            // send click to element
            element.dispatchEvent(event);
            return true;
        });
        // End - click on the pagination

        if (!clicked) {
            console.log('No next-page link found.');
        }

        page.render('2.png');

        phantom.exit();
    }, 20000); // Change timeout as required to allow sufficient time
});

The code above works and I successfully get to page 2. Now I'm stuck implementing a loop so that I can navigate to the following pages.

I need to wait a few seconds between clicks. I've implemented the code below, but it is not working.

// Attempt to walk through every results page: after the initial load, keep
// clicking the "next" pagination link and save one screenshot per page.
// Note: `url` is assigned without a declaration keyword.
url = 'https://www.somewebsite.com';

// open the url
var page = require('webpage').create();
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0';
page.open(url, function (status) {


    if (status !== 'success') {
        console.log('Unable to load the address!');
        phantom.exit();
    } else {
        window.setTimeout(function () { // Wait 20 seconds so the page loads

            // moreLinks() is defined further down in this script; it returns
            // the number of "next" links found in the page.
            var morelinks = moreLinks();
            // Counter used to name the screenshot files (0.png, 1.png, ...).
            var i = 0;

            page.render(i + '.png');
            console.log('1: ' + morelinks); 

            // Each pass through this loop only schedules a callback 20 seconds
            // ahead; the callback body does not run while the loop is running.
            // `morelinks` is reassigned only inside that callback.
            while (morelinks != 0) {
                window.setTimeout(function () { // Wait 20 seconds so the page loads

                    i++;

                    // Begin
                    // Runs inside the page: dispatch a synthetic click on the "next" link.
                    page.evaluate( function() {
                        // find element to send click to
                        var element = document.querySelector( 'ul.pageNavi li.next' );

                        // create a mouse click event
                        var event = document.createEvent( 'MouseEvents' );
                        event.initMouseEvent( 'click', true, true, window, 1, 0, 0 );

                        // send click to element
                        element.dispatchEvent( event );                     
                    });         
                    // End

                    // Screenshot is taken right after the click is dispatched.
                    page.render(i + '.png');

                    // Re-check whether a "next" link is still present.
                    morelinks = moreLinks();
                    console.log('2: ' + morelinks);

                }, 20000); // Change timeout as required to allow sufficient time  
            }           

            // Reached synchronously once the loop above ends, i.e. before any
            // callback scheduled inside the loop has had a chance to run.
            phantom.exit();
        }, 20000); // Change timeout as required to allow sufficient time 
    }

});

/**
 * Counts the "next" pagination items currently present in the loaded page.
 *
 * The lookup runs inside the page through page.evaluate and uses `$`, which
 * is presumably jQuery supplied by the target site (not loaded by this script).
 *
 * @returns {number} length of the `ul.pageNavi li.next` match.
 */
function moreLinks() {
    return page.evaluate(function () {
        var nextItems = $('ul.pageNavi li.next');
        return nextItems.length;
    });
}

/**
 * Collects the link targets of the search results in the loaded page.
 *
 * page.evaluate can only hand back JSON-serializable values, so the hrefs are
 * extracted inside the page and returned as plain strings; a collection of
 * DOM nodes would not survive the trip back to the PhantomJS script.
 *
 * @returns {string[]} href of every `#ulSearchResults li a` element
 *     (empty array when nothing matches).
 */
function getHref() {
    return page.evaluate(function () {
        var anchors = document.querySelectorAll('#ulSearchResults li a');
        var hrefs = [];
        for (var index = 0; index < anchors.length; index++) {
            hrefs.push(anchors[index].href);
        }
        return hrefs;
    });
}

Can somebody give me a clue on how to implement the navigation to the following pages?

Upvotes: 0

Views: 2068

Answers (1)

Artjom B.
Artjom B.

Reputation: 61892

You have two problems.

Premature exit

You are handling asynchronous functions (in a loop). After the loop finishes, you immediately exit (phantom.exit()). At this point, none of the asynchronous functions have even begun executing.

Loop of asynchronous functions

You should ask yourself how asynchronous functions are evaluated if they are called from a loop. After the loop finishes, none of the functions have been executed yet. After the first timeout triggers, all the other timeouts also trigger, because setTimeout was called at basically the same time for each of them.

There are many ways to solve this. Here are two:

1. Static timeout delay

Schedule the timeouts so that each callback is called with a delay relative to the previous one.

// Schedules one callback per iteration, each 20 seconds later than the
// previous one (delay = 20000 * i), instead of all at the same time.
// `morelinks` and `i` come from the surrounding code.
// NOTE(review): `morelinks` is not modified inside this loop as shown, so the
// number of iterations must be bounded some other way — confirm before use.
while (morelinks != 0) {
    // IIFE to keep a proper reference to `i`
    (function(i){
        setTimeout(function () {
            // do your stuff
        }, 20000 * i);
    })(i);
    i++;
}

2. Recursion (recommended)

Mix between pseudo-code and real code.

// Index of the page being captured; used to name the screenshot files.
var i = 0;

/**
 * Renders the current page to "<i>.png", then moves on: if a ".next" control
 * exists it is clicked and the function schedules itself again after a
 * delay; otherwise PhantomJS exits.
 *
 * exists() and click() are pseudo-code helpers that are not defined here.
 */
function scrapePage(){
    page.render(i + '.png');
    i++; // advance so the next screenshot gets its own file name
    if (exists(".next")) {
        click(".next");
        // Wait before the next pass so the page has time to update.
        setTimeout(function (){
            scrapePage();
        }, 5000);
    } else {
        phantom.exit();
    }
}

page.open(url, function(){
    scrapePage();
});

The key is to check whether the next button exists (or is visible or enabled) and only then click it. If it does not, you know that you're on the last page and can safely exit.

I'm sure you can implement the exists() and click() functions yourself.

Use waitFor

Instead of waiting a static amount of time as in the second suggestion, use the waitFor function from the PhantomJS examples to wait until the page is fully loaded, by checking for an element (matched by a suitable selector) that is loaded last.

Upvotes: 1

Related Questions