Mohit Athwani
Mohit Athwani

Reputation: 893

Scraping with phantomJS and NodeJS

I'm following the tutorial listed here :

http://code.tutsplus.com/tutorials/screen-scraping-with-nodejs--net-25560

When I run the code:

  // Script from the question: opens the SHOUTcast listing page through the
  // `phantom` bridge, injects jQuery, waits 5 seconds, then runs a function
  // inside the page and logs whatever that function hands back.
  var host = 'http://www.shoutcast.com/?action=sub&cat=Hindi#134';
  var phantom = require('phantom');
 // Each step is asynchronous and delivers its result to a nested callback:
 // create the PhantomJS handle (ph) -> create a page -> open the URL.
 phantom.create(function(ph) {
 return ph.createPage(function(page) {
 return page.open(host, function(status) {
  // Only logs the load status; the script continues regardless of its value.
  console.log("opened site? ", status);         

        // NOTE(review): injectJs is handed a remote URL here. PhantomJS appears to
        // document injectJs for local script files and includeJs for remote URLs —
        // confirm; if so, jQuery is never actually loaded by this call.
        // The evaluated function below uses only plain DOM calls, not jQuery.
        page.injectJs('http://ajax.googleapis.com/ajax/libs/jquery/1.11.0/jquery.min.js', function() {
            //jQuery Loaded.
            //Wait for a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
            setTimeout(function() {
                // First function: executed inside the page. Second function: receives
                // its return value back in this script.
                // NOTE(review): the page function returns a DOM element; presumably only
                // serializable data survives the trip back from evaluate — confirm.
                return page.evaluate(function() {

                    //Get what you want from the page using jQuery. A good way is to populate an object with all the jQuery commands that you need and then return the object.
                    console.log(document.getElementsByClassName('transition')[0]);

                    return document.getElementsByClassName('transition')[0];



                }, function(result) {
                    console.log(result);
                    // Finished with the page; exit through the phantom handle.
                    ph.exit();
                });
            }, 5000);

        });
});
});
});

I get the following error :

phantom stdout: ReferenceError: Can't find variable: $


phantom stdout:   phantomjs://webpage.evaluate():7
phantomjs://webpage.evaluate():10
phantomjs://webpage.evaluate():10

I have no idea what this means and there's no help on how to resolve it ... How can this be solved ?

Basically I want all the 'a' tags with class transition from the site I'm scraping. All these tags are loaded asynchronously on the site.

Upvotes: 1

Views: 3624

Answers (1)

sudipto
sudipto

Reputation: 2482

The $ is due to jQuery and possible conflicts. You hardly need to inject jQuery just to scrape 'a' tags with class transition; you always have document.querySelector or document.querySelectorAll.

// Scrape every <a class="transition"> link from the SHOUTcast listing page.
//
// Flow: create a PhantomJS handle, open the page, give its AJAX content
// 5 seconds to load, then collect the links inside the page and log them
// here. No jQuery is needed; the DOM selector API is enough.
var host = 'http://www.shoutcast.com/?action=sub&cat=Hindi#134';
var phantom = require('phantom');

phantom.create(function(ph) {
    ph.createPage(function(page) {

        page.open(host, function(status) {

            console.log("opened site? ", status);

            // Nothing to scrape if the page failed to load; exit right away so
            // the PhantomJS process is not left running.
            if (status !== 'success') {
                ph.exit();
                return;
            }

            //Wait for a bit for AJAX content to load on the page. Here, we are waiting 5 seconds.
            setTimeout(function() {

                // The first function runs inside the page and cannot see
                // variables from this script. Its return value is serialized
                // on the way back, so it must be plain data: DOM elements
                // cannot be returned, only values read from them.
                page.evaluate(function() {
                    var anchors = document.querySelectorAll('a.transition');
                    // querySelectorAll returns a NodeList, not an Array, so
                    // borrow Array.prototype.map to build a plain array.
                    return Array.prototype.map.call(anchors, function(anchor) {
                        return {
                            href: anchor.href,
                            text: anchor.textContent
                        };
                    });
                },

                // The second function runs in this script and receives the
                // array of { href, text } objects built in the page.
                function(result) {
                    console.log(result);
                    ph.exit();
                });

            }, 5000);

        });
    });
});

However, I don't understand how function (result) { console.log(result); ...} is meant to work. I don't know whether page.evaluate takes a callback function as its second parameter. Please check that against the documentation.

Upvotes: 3

Related Questions