user1222447
user1222447

Reputation: 113

CasperJS scraping assistance required

I am trying to open this page, follow each thesis link on it, and scrape the 'Title' and 'Authors' of each thesis. This is what I have so far (the parts I need help with are marked in the comments within the code):

// Helper module; utils.dump is used below to print the collected data.
var utils = require('utils');
// The casper instance that drives the whole scrape.
var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  // Settings applied to the pages casper loads: images and plugins are
  // switched off and a desktop Chrome user-agent string is sent.
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  },
  // Script file registered for injection into loaded pages.
  // NOTE(review): nothing in the visible script calls jQuery — confirm it is needed.
  clientScripts: ['lib/jquery.min.js']
});

// Shared state for the link-by-link crawl.
// i: index of the next entry of `links` to visit (advanced by loopThroughThesisLinks).
var i = 0;
// links: thesis URLs, filled in by the casper.start callback.
var links = [];
// thesis_data: one {title, author} record per visited thesis page.
var thesis_data = [];

/**
 * Collects the href attribute of every element matched on the current
 * document. It is passed to casper's evaluate(), so it reads `document`.
 *
 * @returns {Array} one href value per matched element, in document order
 */
function getThesisLinks () {
  // Open question: which selector matches the thesis links?
  // The empty string below is only a placeholder.
  var anchors = document.querySelectorAll('');
  return Array.prototype.map.call(anchors, function (anchor) {
    return anchor.getAttribute('href');
  });
}

/**
 * Visits the thesis links one at a time. Must be called with the casper
 * instance as `this`. Each call handles links[i], advances the shared
 * counter, and re-registers itself through run(); once every link has
 * been handled it dumps thesis_data and exits.
 */
function loopThroughThesisLinks() {
  // Nothing left to visit: print what was collected and stop.
  if (!(i < links.length)) {
    utils.dump(thesis_data);
    this.exit();
    return;
  }

  var currentLink = links[i];
  this.echo('[LINK #' + i + '] ' + currentLink);
  getThesisData.call(this, currentLink);
  i += 1;
  // Come back here for the next link.
  this.run(loopThroughThesisLinks);
}

/**
 * Opens one thesis page and appends a {title, author} record for it to
 * thesis_data. Must be called with the casper instance as `this`.
 *
 * @param {string} link - URL of the thesis page to open
 */
function getThesisData(link) {
  // Callback handed to start(); it relies on `this` exposing fetchText.
  function collectThesis() {
    // Open question: which selector holds the thesis title?
    // ('' is only a placeholder.)
    var thesisTitle = this.fetchText('');

    // Open question: which selector holds the author names?
    // ('' is only a placeholder.)
    var thesisAuthor = this.fetchText('');

    // Store the pair for the final dump.
    thesis_data.push({
      title: thesisTitle,
      author: thesisAuthor
    });
  }

  this.start(link, collectThesis);
}

// Entry point: open the browse listing and collect the thesis links.
casper.start('http://ses.library.usyd.edu.au/handle/2123/345/browse?type=dateissued&sort_by=2&order=DESC&rpp=1495&etal=0&submit_browse=Update', function() {
  // getThesisLinks is executed through evaluate() and its return value
  // (the href strings) replaces the shared `links` array.
  // NOTE(review): if evaluate() yields null (e.g. the page function throws),
  // the loop below fails on links.length — confirm and guard if needed.
  links = this.evaluate(getThesisLinks);

  // Convert relative links to absolute URLs
  // NOTE(review): assumes each href is relative to /handle/; if the page's
  // hrefs already start with "/handle/", this prefix duplicates it — confirm.
  // This `i` is declared with var inside the callback, so it is local and
  // does not touch the shared counter used by loopThroughThesisLinks.
  for (var i = 0; i < links.length; i++) {
    links[i] = "http://ses.library.usyd.edu.au/handle/" + links[i];
  }

  // Print the final list of URLs for inspection.
  utils.dump(links);
});

// Run the queued steps; loopThroughThesisLinks is passed as the callback
// that takes over afterwards and walks through `links`.
casper.run(loopThroughThesisLinks);

Any assistance would be appreciated.

Upvotes: 0

Views: 218

Answers (1)

Artjom B.
Artjom B.

Reputation: 61952

This is a simple CSS selector for all links:

// Matches the <a> directly inside the third <td> of every row in the
// body of table.misctable.
var links = document.querySelectorAll(
           'table.misctable > tbody > tr > td:nth-of-type(3) > a');

To read the title you can use an XPath expression instead of a CSS selector, like this:

var x = require('casper').selectXPath; // goes to the beginning of the file
// The XPath finds the table row whose first cell contains "Title:" and
// selects that row's second cell; fetchText returns its text.
var title = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));

I think you can figure out the query for the authors from this. I would probably have done the crawling differently, using casper.thenOpen in a loop, because the current code is rather hard to read with the additional start and run calls spread across different functions.

With casper.thenOpen it would look like this:

var x = require('casper').selectXPath; // goes to the beginning of the file

/**
 * Visits the thesis links one at a time. Must be called with the casper
 * instance as `this`. Each call queues the page for links[i], advances the
 * shared counter, and schedules itself again with then(); once every link
 * has been handled it dumps thesis_data and exits.
 */
function loopThroughThesisLinks() {
  // Every link has been queued and processed: print the results and stop.
  if (!(i < links.length)) {
    utils.dump(thesis_data);
    this.exit();
    return;
  }

  var currentLink = links[i];
  this.echo('[LINK #' + i + '] ' + currentLink);
  getThesisData.call(this, currentLink);
  i += 1;
  // Schedule the next iteration as a further step.
  this.then(loopThroughThesisLinks);
}

/**
 * Queues a visit to one thesis page and, once it is open, appends a
 * {title, author} record for it to thesis_data. Must be called with the
 * casper instance as `this`; `x` is the selectXPath helper.
 *
 * @param {string} link - URL of the thesis page to open
 */
function getThesisData(link) {
  // Callback handed to thenOpen(); it relies on `this` exposing fetchText.
  function collectThesis() {
    // Each XPath picks the second cell of the row whose first cell
    // contains the given label.
    var thesisTitle = this.fetchText(x('//table//tr/td[1][contains(text(),"Title:")]/../td[2]'));
    var thesisAuthor = this.fetchText(x('//table//tr/td[1][contains(text(),"Authors:")]/../td[2]'));

    // Store the pair for the final dump.
    thesis_data.push({
      title: thesisTitle,
      author: thesisAuthor
    });
  }

  this.thenOpen(link, collectThesis);
}

Upvotes: 1

Related Questions