Reputation: 113
I am trying to open this page and, for each thesis link on it, scrape the 'Title' and 'Authors' of the thesis. So far I have this (the issues I need help with are noted in the comments within the code):
// Script setup: load the helper module, create the Casper instance and
// declare the state shared by the functions below.
var utils = require('utils');
var casper = require('casper').create({
verbose: true,
logLevel: 'error',
pageSettings: {
loadImages: false,
loadPlugins: false,
userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
},
// NOTE(review): jQuery is injected here, but nothing in the visible script uses it - confirm it is needed.
clientScripts: ['lib/jquery.min.js']
});
// Index of the next entry in `links` to visit.
var i = 0;
// Thesis page URLs; assigned in the casper.start callback.
var links = [];
// One { title, author } record per visited thesis page.
var thesis_data = [];
/**
 * Collects the href attribute of every thesis link on the browse page.
 *
 * This function is handed to casper.evaluate, so it executes inside the
 * loaded page and may only use what the page itself provides (`document`).
 *
 * @returns {Array} the href value of each matched anchor
 */
function getThesisLinks() {
    // An empty selector string is invalid; select the anchor found in the
    // third cell of each row of the "misctable" listing table instead.
    var anchors = document.querySelectorAll(
        'table.misctable > tbody > tr > td:nth-of-type(3) > a');
    return Array.prototype.map.call(anchors, function (anchor) {
        return anchor.getAttribute('href');
    });
}
/**
 * Visits the thesis links one at a time.
 *
 * Each call handles the link at the shared index `i`, advances the index and
 * re-registers itself through `run`. Once every link has been handled it
 * dumps the collected data and exits. Must be called with `this` bound to
 * the Casper instance.
 */
function loopThroughThesisLinks() {
    // Nothing left to visit: print what was collected and quit.
    if (!(i < links.length)) {
        utils.dump(thesis_data);
        this.exit();
        return;
    }

    this.echo('[LINK #' + i + '] ' + links[i]);
    getThesisData.call(this, links[i]);
    i += 1;
    // Come back here for the next link.
    this.run(loopThroughThesisLinks);
}
/**
 * Opens a single thesis page and appends its title and author text to the
 * shared `thesis_data` array. Must be called with `this` bound to the
 * Casper instance.
 *
 * @param {string} link - URL of the thesis page to open
 */
function getThesisData(link) {
    this.start(link, function () {
        // An empty selector matches nothing. Use XPath selector objects that
        // find the row whose first cell contains the label and read the
        // second cell of that row.
        var title = this.fetchText({
            type: 'xpath',
            path: '//table//tr/td[1][contains(text(),"Title:")]/../td[2]'
        });
        var author = this.fetchText({
            type: 'xpath',
            path: '//table//tr/td[1][contains(text(),"Authors:")]/../td[2]'
        });
        // Add the title & author data to the thesis_data array
        thesis_data.push({
            title: title,
            author: author
        });
    });
}
// Entry point: load the browse page, collect the thesis links from it, then
// let loopThroughThesisLinks take over once this first run has finished.
casper.start('http://ses.library.usyd.edu.au/handle/2123/345/browse?type=dateissued&sort_by=2&order=DESC&rpp=1495&etal=0&submit_browse=Update', function () {
    var hrefs = this.evaluate(getThesisLinks);
    // Turn each collected href into an absolute URL.
    // NOTE(review): assumes the hrefs are relative to /handle/ - confirm against the page.
    links = hrefs.map(function (href) {
        return "http://ses.library.usyd.edu.au/handle/" + href;
    });
    utils.dump(links);
});
casper.run(loopThroughThesisLinks);
Any assistance would be appreciated.
Upvotes: 0
Views: 218
Reputation: 61952
This is a simple CSS selector for all links:
// Selects the anchor inside the third cell of each row of the table with class "misctable".
var links = document.querySelectorAll(
'table.misctable > tbody > tr > td:nth-of-type(3) > a');
You can also use XPath like this:
var x = require('casper').selectXPath; // goes to the beginning of the file
// Finds the row whose first cell's text contains "Title:" and reads the text of that row's second cell.
var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));
I think you can figure out the authors query. I would probably have done the crawling differently, using `casper.thenOpen` in a loop, because the current version is rather hard to read with the additional `start` and `run` calls spread across different functions.
With `casper.thenOpen` it would look like this:
var x = require('casper').selectXPath; // goes to the beginning of the file
/**
 * Visits the thesis links one at a time.
 *
 * Each call handles the link at the shared index `i`, advances the index and
 * schedules itself again as a further step via `then`. Once every link has
 * been handled it dumps the collected data and exits. Must be called with
 * `this` bound to the Casper instance.
 */
function loopThroughThesisLinks() {
    // Nothing left to visit: print what was collected and quit.
    if (!(i < links.length)) {
        utils.dump(thesis_data);
        this.exit();
        return;
    }

    this.echo('[LINK #' + i + '] ' + links[i]);
    getThesisData.call(this, links[i]);
    i += 1;
    // Schedule the next iteration after the step queued above.
    this.then(loopThroughThesisLinks);
}
/**
 * Queues a step that opens one thesis page and appends its title and author
 * text to the shared `thesis_data` array. Must be called with `this` bound
 * to the Casper instance.
 *
 * @param {string} link - URL of the thesis page to open
 */
function getThesisData(link) {
    // Each XPath finds the row whose first cell contains the label and
    // selects the second cell of that row.
    var titlePath = '//table//tr/td[1][contains(text(),"Title:")]/../td[2]';
    var authorPath = '//table//tr/td[1][contains(text(),"Authors:")]/../td[2]';

    this.thenOpen(link, function () {
        var record = {};
        record.title = this.fetchText(x(titlePath));
        record.author = this.fetchText(x(authorPath));
        thesis_data.push(record);
    });
}
Upvotes: 1