Pierre
Pierre

Reputation: 1114

Phantomjs opening many pages

I have been having some issues with opening multiple webpages in PhantomJS. I first open a website which contains a few links, which I want to open as well, and save a piece of text from each URL to my jobs_list, which has many objects inside of it. After all the URLs have been visited, I want to exit PhantomJS. But as it is right now it never exits, and I have trouble receiving data from the second function.

var webPage = require('webpage');
var page = webPage.create();
// Jobs scraped from the listing page; next_page() later fills in each entry's `location`.
var jobs_list = [];

// Load the listing page, collect one {title, url, location} record per link,
// then hand over to next_page() to visit the collected URLs.
page.open('url', function (status) {
    // The open callback receives 'success' or 'fail'; on a failed load there is
    // nothing to scrape, so stop instead of leaving PhantomJS running forever.
    if (status !== 'success') {
        console.log('Unable to load the job listing page');
        phantom.exit(1);
        return;
    }

    page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
        // Relay console output produced inside the page to the PhantomJS console.
        page.onConsoleMessage = function(msg) {
            console.log(msg);
        };

        // The function below runs inside the page context, so it cannot see
        // any variable of this script; it only returns plain data.
        var list = page.evaluate(function() {
            var jobs = [];

            $('.test').each(function(){
                $(this).find('span').each(function(){
                    var job_link = $(this).find('a');

                    jobs.push({title : job_link.text(), url : job_link.attr("href"), location : ""});
                });
            });
            return jobs;
        });

        // Guard against a missing result (e.g. the in-page function failed) so
        // that next_page() can always read jobs_list.length.
        jobs_list = list || [];
        next_page(0);
    });
});


// Question's original attempt: tries to visit jobs_list[i].url and every
// following entry, storing the scraped location text on each job.
// NOTE: this version is the one that misbehaves (see the comments near the
// bottom of the function); it is kept unchanged for reference.
function next_page(i){
    if (i <= (jobs_list.length-1)) {
        var current_job = jobs_list[i];
        var url = current_job.url;

        // Starts loading `url` in the single shared `page`; the callback runs later.
        page.open(url, function (status) {
            page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {


                // Runs inside the page: keeps the text of the last <li> found
                // under any `.job-location` element.
                var location = page.evaluate(function() {
                    var job_location;
                    $('.job-location').each(function(){
                        $(this).find('li').each(function(){
                            job_location = $(this).text();
                        })
                    })
                    console.log(job_location);
                    return job_location;
                });
                // `i` is the same variable that `++i` below increments, so by
                // the time this callback runs it no longer holds the index
                // this call was started with.
                jobs_list[i].location = location;

                // Exit is only attempted from inside this callback.
                if(i == (jobs_list.length-1)) {
                    phantom.exit(0);
                }
            });
        });
        console.log(i, current_job.title);

        // Executed right after page.open() has been issued, i.e. without
        // waiting for the load callback above; the next page.open() is
        // therefore requested on the same `page` immediately.
        next_page(++i);
    }
}

Upvotes: 0

Views: 857

Answers (1)

Artjom B.
Artjom B.

Reputation: 61892

The problem is that the page.open call is asynchronous. If you look closely at your next_page function, it can be shortened to this:

// Abbreviated view of the question's next_page(): the body of the page.open
// callback is elided ("...") to make the order of the calls visible.
function next_page(i){
    if (i <= (jobs_list.length-1)) {
        var current_job = jobs_list[i];
        var url = current_job.url;

        // Only schedules the load; the callback has not run when open() returns.
        page.open(url, function (status) {
            ...
        });
        console.log(i, current_job.title);

        // Reached immediately after page.open() was issued.
        next_page(++i);
    }
}

This means that next_page(++i); is executed before page.open(url, ...) has even managed to load the first HTML content. That call immediately triggers the next page.open(url, ...), which overrides the previous request, so you are never going to get any data this way.

You have to do two things:

  • move the next_page(++i); call to where the processing of one page is finished
  • reduce the number of condition checks

I propose:

/**
 * Visits the job pages strictly one after another, starting with jobs_list[i].
 *
 * For each job the page at `url` is opened in the shared `page`, jQuery is
 * injected, and the text of the last <li> under `.job-location` is stored in
 * the job's `location`. The next page is requested only from inside the
 * callbacks of the current one, so requests never overlap. Once `i` is past
 * the end of jobs_list, PhantomJS exits with code 0.
 *
 * @param {number} i index into jobs_list of the job to process next
 */
function next_page(i){
    // Past the last job (or empty list): everything is processed, so stop.
    if (i >= jobs_list.length) {
        phantom.exit(0);
        return;
    }

    var current_job = jobs_list[i];

    page.open(current_job.url, function (status) {
        // A page that failed to load has nothing to scrape; report it and
        // continue with the next job instead of injecting jQuery into it.
        if (status !== 'success') {
            console.log(i, current_job.title, 'failed to load');
            next_page(i + 1);
            return;
        }

        page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
            // Runs inside the page context; only the returned value crosses
            // back into this script.
            current_job.location = page.evaluate(function() {
                var job_location;
                $('.job-location').each(function(){
                    $(this).find('li').each(function(){
                        job_location = $(this).text();
                    });
                });
                console.log(job_location);
                return job_location;
            });

            console.log(i, current_job.title);
            // `i` is never mutated, so every callback above sees its own index.
            next_page(i + 1);
        });
    });
}

That's quite an old version of jQuery. Perhaps you want to load a newer version. If the page already has jQuery included, you will likely break the page by loading another jQuery into it. Don't load an additional jQuery version at all in this case.

Upvotes: 2

Related Questions