Web Scraping Using Nodejs

Question

I have created a simple web scraper that pulls the article titles and URLs from this website: http://espn.go.com/college-football/. However, the scraper only returns 46-50 articles instead of all the articles on the site. I've tried changing the CSS selector that cheerio uses, but the number of articles it scrapes does not change. Here is the code I'm using:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var mongo = require('mongoskin');
var db = mongo.db("mongodb://localhost:27017/test", { native_parser: true });


// Page to scrape. Declared explicitly so it is not an implicit global
// (an undeclared assignment throws a ReferenceError in strict mode).
var url = 'http://espn.go.com/college-football/';

/**
 * Record for one scraped headline. Must be called with `new`.
 *
 * @param {string} title - Visible text of the headline link; stored as `Title`.
 * @param {string} link - The link's href value; stored as `link`.
 */
function Headline(title, link) {
    // Property names (including the capital "T") are the keys that end up
    // in the serialized output, so they are kept exactly as they are.
    Object.assign(this, { Title: title, link: link });
}

/**
 * Downloads the page at `url`, collects every headline link found inside
 * the news-feed container, then saves the list to espn_articles.json and to
 * the `articles` collection.
 *
 * NOTE(review): cheerio only parses the markup contained in this single
 * response and does not execute page scripts. If the site appends further
 * stories client-side (presumably why only ~50 are found - confirm in the
 * browser's network tab), those stories are not present in `html`.
 */
request(url, function (error, response, html) {
    // Without a successfully fetched page there is nothing to parse or
    // persist, so stop before touching the file system or the database.
    if (error) {
        console.error('Request for ' + url + ' failed:', error);
        return;
    }
    if (!response || response.statusCode !== 200) {
        console.error('Unexpected HTTP status for ' + url + ':', response && response.statusCode);
        return;
    }

    var $ = cheerio.load(html);
    var result = [];

    // Grab the article titles/URLs from the news-feed container.
    $('.text-container h1 a.realStory', '#news-feed-content').each(function (i, elem) {
        var anchor = $(elem);
        var title = anchor.text();
        var href = anchor.attr('href');
        console.log(title, href);
        result.push(new Headline(title, href));
    });

    fs.writeFile('espn_articles.json', JSON.stringify(result, null, 4), function (err) {
        // Only report success when the write actually succeeded.
        if (err) {
            console.error('Could not write espn_articles.json:', err);
            return;
        }
        console.log('File successfully written! - Check your project directory for the espn_articles.json file');
    });

    // An empty batch has nothing to store.
    // NOTE(review): the MongoDB driver is expected to reject an insert with
    // no documents - confirm against the driver version in use.
    if (result.length === 0) {
        console.log('No articles matched the selector; nothing to save.');
        return;
    }

    db.collection('articles').insert(result, function (insertError, record) {
        if (insertError) throw insertError;
        console.log("data saved");
    });
});

rchipka · Accepted Answer

Here's an example using Osmosis.

// Fields read from each news-feed entry on the listing page
// (field name -> selector).
var listingFields = {
    author:   '.author',
    category: '.category-link',
    title:    '.realStory',
    link:     '.realStory@href',
    blurb:    'p'
};

// Fields read from the article page reached through the headline link.
var articleFields = {
    date:    '.article-meta @data-date',
    images:  [ 'picture @srcset' ],
    content: '.article-body'
};

/**
 * Receives one scraped article and inserts it into the `articles`
 * collection. Example of the object passed in:
 *
 * { author: '...',
 *   category: '...',
 *   title: 'Harbaugh, Michigan reel in Florida OL Herbert',
 *   link: '...',
 *   blurb: 'Jim Harbaugh and Michigan have landed another recruit from SEC country in Kai-Leon Herbert of Florida.',
 *   date: '2016-07-06T17:25:09Z',
 *   images: [ '...', '...' ],
 *   content: '...'
 * }
 */
function saveArticle(article) {
    db.collection('articles').insert(article, function (error, record) {
        // ...
    });
}

// Listing page -> per-entry fields -> follow headline link -> article fields -> save.
osmosis('http://espn.go.com/college-football/')
    .find('#news-feed-content .text-container')
    .set(listingFields)
    .follow('.realStory@href')
    .set(articleFields)
    .data(saveArticle)
    .log(console.log)
    .error(console.log)
    .debug(console.log);

Web Scraping Using Nodejs

Answers (2)

Related Questions