Adrian
Adrian

Reputation: 355

CasperJS - How to save some data for each page from an array?

I'm trying to crawl a website using CasperJS but I ran into a problem.

On the first page I collect the links I want to crawl and save them to an array using the getLinks() function - this works well.

Then I want to crawl each page from this array (I got this part working) and I want to grab some details from each of these pages.

My code is as follows (I trimmed the working parts, such as casper.start and the login steps):

// Casper start here, and some login stuff, all these are working fine so I removed them to have a light example

// ....
// ....
// ....

// Function for saving members to an array
/**
 * Collects one URL per ".member_name_and_title" element on the current page.
 * It is handed to casper.evaluate(), so `document` here is the page's DOM.
 *
 * @returns {Array} the `href` of the node found at childNodes[1].childNodes[1]
 *                  under each matched element, in document order
 */
function getLinks() {
    var containers = document.querySelectorAll(".member_name_and_title");
    var hrefs = [];
    for (var i = 0; i < containers.length; i++) {
        // The link sits two levels down, at index 1 of each child list.
        var anchor = containers[i].childNodes[1].childNodes[1];
        hrefs.push(anchor.href);
    }
    return hrefs;
}

// Step: collect the member links from the current page, then queue one visit per link.
casper.then(function() {
    // Aggregate results: run getLinks in the page context and keep the returned hrefs
    // (`links` is assigned without `var` here).
    links = this.evaluate(getLinks);

    // For every collected link, schedule a step that opens it and records one entry
    casper.each(links, function (self, link) {
        self.thenOpen(link, function () {

            // Runs inside the opened page. Note: the expression below is evaluated,
            // but its value is not returned from this callback.
            var details = this.evaluate(function(){
                document.getElementsByClassName('member_name')[0].textContent;
            });

            // Grab details for each member
            var data = details + " - " + link;

            // Save data: write the JSON-encoded string to results/output.txt using mode 'aw'
            var fs = require('fs');
            fs.write('results/output.txt', JSON.stringify(data, null, '  '), 'aw');

        });
    });

});

// Casper run
casper.run(function exitWhenDone() {
    // Callback handed to run(): shut the Casper instance down
    this.exit();
});

The problem is that the details var will return null, so the final output.txt would be something like:

"null - domain.com/link1"
"null - domain.com/link2"
"null - domain.com/link3"
"null - domain.com/link4"
"null - domain.com/link5"

The link var is working fine but the details var is returning null.

When I go to any of the urls from the array (example: domain.com/link1) and run document.getElementsByClassName('member_name')[0].textContent in the browser console it returns the value correctly so I'm sure the targeting is fine.

I'm not sure what I'm missing or what I'm doing wrong. Any help would be much appreciated. Thanks!

Upvotes: 1

Views: 671

Answers (2)

Adrian
Adrian

Reputation: 355

Ok, I figured this out in the end; it was a total rookie mistake. The query was correct. The problem was caused by the page load, or, better said, the pages linked from the array had not finished loading before the actual query was run.

To test this I used captureSelector() inside the self.thenOpen callback to capture the state of the page once it is open, but right before the data is collected.

// Capture the '#page' element into 1.jpg to inspect what the page looks like at this point
this.captureSelector('1.jpg', '#page');

I immediately noticed that the page was not fully loaded, which is why return document.querySelector('.member_name').textContent; was returning null.

To fix this I've added a 1.5s wait time, as follows:

// Pause for 1.5 s before querying the opened page
casper.wait(1500, function onWaitDone() {
  // Executed in the page context: read the text of the first .member_name element
  var readMemberName = function () {
    return document.querySelector('.member_name').textContent;
  };
  var details = this.evaluate(readMemberName);
});

Rookie mistake but might help someone else in the future.

Upvotes: 1

Kalpana
Kalpana

Reputation: 198

Try adding a return as mentioned below:

// The page-context callback has to `return` the value so that evaluate() can hand it back
var details = this.evaluate(function readMemberName() {
    return document.getElementsByClassName('member_name')[0].textContent;
});

EDIT:

This worked for me. My code setup is as follows:

var casper = require('casper').create();

/**
 * Returns the `href` of every element carrying the "member_name_and_title" class.
 * It is passed to casper.evaluate(), so `document` is the DOM of the loaded page.
 *
 * @returns {Array} one href per matched element, in document order
 */
function getLinks() {
    // Copy the query result into a real array so that .map() is available.
    var anchors = Array.prototype.slice.call(
        document.querySelectorAll(".member_name_and_title")
    );
    return anchors.map(function toHref(anchor) {
        return anchor.href;
    });
}

// Start the run by opening the root URL of the local HTTP server (port 8080)
casper.start('http://localhost:8080');

// Step: collect the links from the start page, then visit each one and record
// "<member name> - <link>" both on the console and in results/output.txt.
casper.then(function() {
    // Loaded once for the whole step instead of once per visited link.
    var fs = require('fs');

    // Aggregate results: getLinks runs in the page context and returns the hrefs.
    // Declared with `var` so the list does not leak into the global scope.
    var links = this.evaluate(getLinks);

    casper.each(links, function (self, link) {

        // INSPECT: Check if it shows the correct link here.
        self.echo('Opening link:' + link);

        self.thenOpen(link, function () {

            var details = this.evaluate(function(){
                // INSPECT: Make sure to 'return' the text content.
                var member = document.getElementsByClassName('member_name')[0];
                // If the page has no .member_name element, return null
                // rather than throwing inside the page.
                return member ? member.textContent : null;
            });

            // Grab details for each member
            var data = details + " - " + link;

            // INSPECT: Check if the data is correct.
            self.echo(data);

            // Save data
            fs.write('results/output.txt', JSON.stringify(data, null, '  '), 'aw');
        });
    });
});


casper.run(function exitWhenDone() {
    // Callback handed to run(): shut the Casper instance down
    this.exit();
});

My html files are as below:

index.html

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hello CasperJs</title>
</head>
<body>

<!-- Each anchor carries the member_name_and_title class that the crawler's getLinks() selects -->
<a href="page1.html" class="member_name_and_title">Page 1</a>
<a href="page2.html" class="member_name_and_title">Page 2</a>
<a href="page3.html" class="member_name_and_title">Page 3</a>
<a href="page4.html" class="member_name_and_title">Page 4</a>

</body>
</html>

page1.html

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Page 1 Title</title>
</head>
<body>

<!-- The crawler reads the text of the first element with the member_name class -->
<p class="member_name">Page 1 Text</p>

</body>
</html>

Similar HTML markup for page2.html, page3.html and page4.html. My http server was running at port 8080.

My console output is as follows:

Opening link:http://localhost:8080/page1.html
Opening link:http://localhost:8080/page2.html
Opening link:http://localhost:8080/page3.html
Opening link:http://localhost:8080/page4.html
Page 1 Text - http://localhost:8080/page1.html
Page 2 Text - http://localhost:8080/page2.html
Page 3 Text - http://localhost:8080/page3.html
Page 4 Text - http://localhost:8080/page4.html

I am using casperjs 1.1.3 with phantomjs 2.1.1.

Can you update your code and share your console output and package versions?

Upvotes: 0

Related Questions