Reputation: 355
I'm trying to crawl a website using CasperJS but I ran into a problem.
On the first page I collect the links I want to crawl and save them to an array using the getLinks()
function - this works well.
Then I want to crawl each page from this array (I got this part working) and I want to grab some details from each of these pages.
My code is as follows (I trimmed the working parts such as the casper start and login steps):
// Casper start here, and some login stuff, all these are working fine so I removed them to have a light example
// ....
// ....
// ....
/**
 * Collects the member links from the current document.
 *
 * For every element matching ".member_name_and_title", reads the href of
 * the node at childNodes[1].childNodes[1].
 *
 * @returns {Array} the collected href values, in the order the elements were matched.
 */
function getLinks() {
    var containers = document.querySelectorAll(".member_name_and_title");
    var hrefs = [];
    for (var i = 0; i < containers.length; i++) {
        var anchor = containers[i].childNodes[1].childNodes[1];
        hrefs.push(anchor.href);
    }
    return hrefs;
}
// Step queued after the (trimmed) start/login steps: collect the member links
// from the current page, then queue one page visit per link.
casper.then(function() {
// Aggregate results
// NOTE(review): `links` is assigned without `var`, so it becomes an implicit global.
links = this.evaluate(getLinks);
casper.each(links, function (self, link) {
self.thenOpen(link, function () {
// NOTE(review): the page-side function below evaluates the expression but
// never `return`s it, so no value is handed back to `details` — consistent
// with the "null - <link>" lines reported in the output.
var details = this.evaluate(function(){
document.getElementsByClassName('member_name')[0].textContent;
});
// Grab details for each member
var data = details + " - " + link;
// Save data
// The 'aw' mode string is presumably meant to append to the file — confirm against the PhantomJS fs docs.
var fs = require('fs');
fs.write('results/output.txt', JSON.stringify(data, null, ' '), 'aw');
});
});
});
// Casper run
// The callback passed to run() calls exit() on the casper instance.
casper.run(function() {
this.exit();
});
The problem is that the details
var will return null, so the final output.txt
would be something like:
"null - domain.com/link1"
"null - domain.com/link2"
"null - domain.com/link3"
"null - domain.com/link4"
"null - domain.com/link5"
The link
var is working fine but the details
var is returning null.
When I go to any of the urls from the array (example: domain.com/link1) and run document.getElementsByClassName('member_name')[0].textContent
in the browser console it returns the value correctly so I'm sure the targeting is fine.
I'm not sure what I'm missing or what I'm doing wrong. Any help would be much appreciated. Thanks!
Upvotes: 1
Views: 671
Reputation: 355
OK, I figured this out in the end; it was a total rookie mistake. The query was correct; the problem was caused by the page load, or more precisely, the pages for the links in the array had not finished loading before the actual query was run.
To test this I used captureSelector() inside the self.thenOpen
function to capture the state of the page once it is open, but right before the data is collected.
this.captureSelector('1.jpg', '#page');
I immediately noticed that the page was not fully loaded, hence the return document.querySelector('.member_name').textContent;
returning null.
To fix this I've added a 1.5s wait time, as follows:
// Pause for 1500 ms, then read the member name from the page.
casper.wait(1500, function() {
    var details = this.evaluate(function() {
        // Runs inside the page: look the element up, then hand its text back.
        var nameNode = document.querySelector('.member_name');
        return nameNode.textContent;
    });
});
Rookie mistake but might help someone else in the future.
Upvotes: 1
Reputation: 198
Try adding a return as mentioned below:
// The page-side function must `return` its result for evaluate() to hand it back.
var details = this.evaluate(function() {
    var matches = document.getElementsByClassName('member_name');
    return matches[0].textContent;
});
EDIT:
This worked for me. My code setup is as follows:
var casper = require('casper').create();
/**
 * Collects the href of every element matching ".member_name_and_title"
 * in the current document.
 *
 * @returns {Array} the href values, in the order the elements were matched.
 */
function getLinks() {
    var matchedLinks = document.querySelectorAll(".member_name_and_title");
    var hrefs = [];
    for (var i = 0; i < matchedLinks.length; i++) {
        hrefs.push(matchedLinks[i].href);
    }
    return hrefs;
}
// Load the file-system module once instead of on every visited page.
var fs = require('fs');

casper.start('http://localhost:8080');

// Once the start page is loaded: collect the member links, then queue one
// visit per link and write "<member name> - <link>" to results/output.txt.
casper.then(function() {
    // Aggregate results
    // `var` keeps the list local; a bare assignment would create an implicit global.
    var links = this.evaluate(getLinks);

    // Nothing to crawl if the page-side lookup failed or matched no elements;
    // say so explicitly instead of finishing silently.
    if (!links || links.length === 0) {
        this.echo('No links found for .member_name_and_title');
        return;
    }

    casper.each(links, function (self, link) {
        // INSPECT: Check if it shows the correct link here.
        self.echo('Opening link:' + link);
        self.thenOpen(link, function () {
            var details = this.evaluate(function(){
                // INSPECT: Make sure to 'return' the text content.
                // Guard the lookup so a page without the element yields null
                // instead of raising an error inside the page.
                var matches = document.getElementsByClassName('member_name');
                return matches.length > 0 ? matches[0].textContent : null;
            });
            // Grab details for each member
            var data = details + " - " + link;
            // INSPECT: Check if the data is correct.
            self.echo(data);
            // Save data
            fs.write('results/output.txt', JSON.stringify(data, null, ' '), 'aw');
        });
    });
});

// The callback passed to run() calls exit() on the casper instance.
casper.run(function(){
    this.exit();
});
My html files are as below:
index.html
<!DOCTYPE html>
<html>
<head>
<title>Hello CasperJs</title>
</head>
<body>
<a href="page1.html" class="member_name_and_title">Page 1</a>
<a href="page2.html" class="member_name_and_title">Page 2</a>
<a href="page3.html" class="member_name_and_title">Page 3</a>
<a href="page4.html" class="member_name_and_title">Page 4</a>
</body>
</html>
page1.html
<!DOCTYPE html>
<html>
<head>
<title>Page 1 Title</title>
</head>
<body>
<p class="member_name">Page 1 Text</p>
</body>
</html>
Similar HTML markup for page2.html, page3.html and page4.html. My http server was running at port 8080.
My console output is as follows:
Opening link:http://localhost:8080/page1.html
Opening link:http://localhost:8080/page2.html
Opening link:http://localhost:8080/page3.html
Opening link:http://localhost:8080/page4.html
Page 1 Text - http://localhost:8080/page1.html
Page 2 Text - http://localhost:8080/page2.html
Page 3 Text - http://localhost:8080/page3.html
Page 4 Text - http://localhost:8080/page4.html
I am using casperjs 1.1.3 with phantomjs 2.1.1.
Can you update your code and share your console output and package versions?
Upvotes: 0