Keon Cummings

Reputation: 1811

Having trouble looping through DOM elements with node web scraper

I was able to get the scraper to do what I want, but I'm having a lot of trouble getting it to loop through all the pages I need. I think the issue is with the placement of my for loop and how it's executed.

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : []};

//initialize the /scrape route that runs the scraper
app.get('/scrape', function(req, res){
    //set the scraper url

This is the problem area. How do I set this up so it doesn't just request the last page, but loops through all 100 pages?

    for(var i = 1; i < 101; i++){
          url = 'http://www.goodreads.com/quotes?page=' + i;
    }


    request(url, function(error, response, html){
        if(!error){
            //use cheerio to use jquery to select DOM elements
            var $ = cheerio.load(html);

            //select DOM elements using jquery selectors
            $('.quoteText > a').filter(function(){
                var data = $(this);
                author = data.text();

                json.author.push(author);
                // all.push(data.text());
            })
            //select DOM elements using jquery selectors
            $('.quoteText').filter(function(){
                var data = $(this);
                quote = data.text();

                json.quote.push(quote);
            })
        }
        //loop through the json object to clean up strings
        for(var i = 0; i < json.quote.length; i++) {
            //find the index where the quote ends
            var endQuote = json.quote[i].indexOf("―");
            //keep only the part of the string that contains the quote
            json.quote[i] = json.quote[i].substring(0, endQuote - 1);
            //remove line breaks from the string
            json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
        }
        //write the json object to output.json
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
            console.log('File successfully written! - Check your project directory for the output.json file');
        })

        res.send('Check your console!')
    })
})

app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;

****edit****

Changed the code so res.send('Check your console!') runs at the end of the route handler; the app throws an error if res is called more than once. Also included changes based on the accepted answer.

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
//set object to be populated by scraped DOM elements
var author, quote;
var json = { author : [], quote : []};
var url = [];

//initialize the /scrape route that runs the scraper
app.get('/scrape', function(req, res){
    //set the scraper url
    for(var i = 1; i < 101; i++){
          url.push('http://www.goodreads.com/quotes?page=' + i);
    }

    for(var i = 0; i < url.length; i++){
        request(url[i], function(error, response, html){
            if(!error){
                //use cheerio to use jquery to select DOM elements
                var $ = cheerio.load(html);

                //select DOM elements using jquery selectors
                $('.quoteText > a').filter(function(){
                    var data = $(this);
                    author = data.text();

                    json.author.push(author);
                    // all.push(data.text());
                })
                //select DOM elements using jquery selectors
                $('.quoteText').filter(function(){
                    var data = $(this);
                    quote = data.text();

                    json.quote.push(quote);
                })
            }
        })
    }

    res.send('Check your console!')
})

function cleanUp(){
    //loop through the json object to clean up strings
    for(var i = 0; i < json.quote.length; i++) {
        //find the index where the quote ends
        var endQuote = json.quote[i].indexOf("―");
        //keep only the part of the string that contains the quote
        json.quote[i] = json.quote[i].substring(0, endQuote - 1);
        //remove line breaks from the string
        json.quote[i] = json.quote[i].replace(/(\r\n|\n|\r)/gm,"");
    }
    //write the json object to output.json
    fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){
        console.log('File successfully written! - Check your project directory for the output.json file');
    })
}


app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;

Upvotes: 0

Views: 512

Answers (1)

Ethan Gardner

Reputation: 96

In the example code you provided:

for(var i = 1; i < 101; i++){
      url = 'http://www.goodreads.com/quotes?page=' + i;
}

The for loop overwrites the url variable on each pass, so by the time the loop finishes, url holds only the last page's address.

You can make it work with a few small changes to your code. The easiest way is to make url an array and push onto it each time through the loop, so the list of URLs accumulates, as in the code below:

var url = [];
for(var i = 1; i < 101; i++){
    url.push('http://www.goodreads.com/quotes?page=' + i);
}

You would then need to call your request function for each item in the array, since url now contains 100 entries. You should also change your fs.writeFile call to fs.appendFile so the results of each request are added to the output.json file instead of overwriting it.
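A minimal sketch of those two changes might look like the following (the cheerio parsing from your question is elided, and pageJson is a hypothetical stand-in for whatever you scrape from one page):

var fs = require('fs');
var request = require('request');

var url = [];
for(var i = 1; i < 101; i++){
    url.push('http://www.goodreads.com/quotes?page=' + i);
}

url.forEach(function(pageUrl){
    request(pageUrl, function(error, response, html){
        if(error) return;
        //...parse html with cheerio here, as in your question...
        var pageJson = { author: [], quote: [] }; //hypothetical per-page result
        //appendFile adds to output.json instead of replacing it,
        //so the file accumulates one JSON blob per page
        fs.appendFile('output.json', JSON.stringify(pageJson, null, 4), function(err){
            if(err) console.log(err);
        });
    });
});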

Finally, you should also consider throttling the requests so you aren't hammering the server of the site you are scraping.
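One simple way to throttle (just a sketch; the one-second delay is an arbitrary choice) is to issue the requests one at a time and pause between them:

var request = require('request');

function scrapeNext(urls, index){
    if(index >= urls.length) return; //all pages done
    request(urls[index], function(error, response, html){
        //...handle the response here...
        //wait one second before requesting the next page
        setTimeout(function(){
            scrapeNext(urls, index + 1);
        }, 1000);
    });
}
scrapeNext(url, 0);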

Upvotes: 1
