Maximilian Fuchs

Reputation: 441

iterate node.js request function

This question is about a crawler in Node.js. It is given a start_url, from which it collects URLs and "pushes" them to a .json file (output.json). At the moment it runs the request function only with the start_url and saves the collected URLs in output.json. I want it to use the saved URLs instead, i.e. replace the start_url with the first collected URL, collect links again, and so on ...

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"]

var req = function(url){
    request(url, function(error, response, html){
        var $ = cheerio.load(html);

        var data = [];

        $("a").each(function() {
            var link = $(this);
                var exurls = {exurl: new Array(link.attr("href"))}

                data.push(exurls);

                // Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
                // save to "output.json" from time to time, so you can stop it anytime
        });

        fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err){
            if(err){
                console.log(err);
            } else {
                console.log("File successfully written!");
            }
        });
    });
}
for (var i = 0; i < start_url.length; i++){
    req(start_url[i]);
}

Upvotes: 1

Views: 828

Answers (1)

aray12

Reputation: 1843

So what you can do is have the function call itself recursively. The example below should work:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];

// Shared index of the next URL to crawl, so each recursive call
// picks up where the previous one left off (a counter local to req
// would reset to 0 on every call and keep requesting the same URL).
var count = 0;

var req = function(url){
  request(url, function(error, response, html){
    // Only parse the page if the request actually returned HTML.
    if (!error && html) {
      var $ = cheerio.load(html);

      $("a").each(function() {
        var href = $(this).attr("href");

        // Queue every discovered link so it gets crawled later
        // (endless by design; stop the process whenever you like).
        // Note: relative links would need to be resolved against
        // the page URL before they can be requested.
        if (href) {
          start_url.push(href);
        }
      });

      // Rewrite the whole list of collected URLs on every request,
      // so you can stop the crawler at any time.
      try {
        fs.writeFileSync("output.json", JSON.stringify(start_url, null, 4));
        console.log("File successfully written!");
      } catch (err) {
        console.log(err);
      }
    }

    ++count;

    if (start_url.length > count) {
      req(start_url[count]);
    }
  });
};

req(start_url[0]);

The problem with this is that you rewrite the whole file on every request, and the full list of URLs has to stay in memory, so if the crawl goes on for a while you are going to run out of memory. Another option is to create a write stream:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var start_url = ["http://stackoverflow.com/"];

var wstream = fs.createWriteStream("output.json");

var req = function(url){
  request(url, function(error, response, html){
    // Skip pages that failed to load, but keep the queue moving.
    if (!error && html) {
      var $ = cheerio.load(html);

      $("a").each(function() {
        var href = $(this).attr("href");
        if (!href) return;

        // Queue the link for a later crawl and append it to the
        // output file right away, so you can stop at any time.
        start_url.push(href);
        wstream.write(JSON.stringify(href) + ",\n");
      });
    }

    // Basic queue: drop the URL that was just processed and move on.
    start_url.shift();
    if (start_url.length > 0) {
      return req(start_url[0]);
    }

    wstream.end();
  });
};

req(start_url[0]);

Edit: switched to a basic queue to combat the memory problems.
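
One further safeguard, since the queue above will happily revisit pages it has already crawled: keep a record of every URL that has ever been queued and skip duplicates. Below is a minimal sketch of that idea on top of the same request/cheerio setup; the seen set and the crawl helper are illustrative names, not something from the code above.

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var queue = ["http://stackoverflow.com/"];
var seen = new Set(queue);                     // every URL ever queued
var wstream = fs.createWriteStream("output.json");

function crawl(url) {
  request(url, function (error, response, html) {
    if (!error && html) {
      var $ = cheerio.load(html);

      $("a").each(function () {
        var href = $(this).attr("href");

        // Only queue absolute links that have not been seen before.
        if (href && /^https?:\/\//.test(href) && !seen.has(href)) {
          seen.add(href);
          queue.push(href);
          wstream.write(JSON.stringify(href) + "\n");
        }
      });
    }

    // Same basic queue as above: drop the finished URL, take the next.
    queue.shift();
    if (queue.length > 0) {
      return crawl(queue[0]);
    }
    wstream.end();
  });
}

crawl(queue[0]);

Note that the seen set itself grows without bound, so a long-running crawl would eventually have to move that bookkeeping out of memory (for example into a database).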



Upvotes: 2
