Sai Krishna

Reputation: 557

How to download a large number of files in Node.js

I have a JSON file with about 20,000 to 100,000 links in it. It looks like this:

[{
    "file_name": "Blessed_Titles.jpg",
    "url": "https://i.imgur.com/FRDibHa.jpg",
    "downloadId": "6r44r4k340rvvr"
}]

Is there any way to download them in parallel, about 100 at a time? Will I get any warnings or errors while downloading thousands of links? Right now I'm using sequential download, but I'm not sure it's suitable for such a large number of links.

Here's how I'm downloading them currently:

async function downloadALL(ultUrls) {
  let i = 1;
  const len = ultUrls.length;
  // Download one file at a time, in order.
  for (const ult of ultUrls) {
    await downloadFile(ult, i, len)
      .then(() => i++)
      .catch(err => console.log(err));
  }
}



function downloadFile(ult, i, len) {
  // http/https are required at module scope; mainWindow, utility and dir
  // come from the surrounding (Electron) module.
  return new Promise((resolve, reject) => {
    console.log('Downloading file:', ult.file_name);
    const download = {
      file: {},
    };

    let percentage = 0;
    const overallPercentage = ((i / len) * 100).toFixed(0);

    download.file.name = ult.file_name;
    download.file.percentage = percentage;
    download.file.downloadId = ult.downloadId;
    download.percentage = overallPercentage;

    (ult.url.substr(0, 5) === 'https' ? https : http)
      .get(ult.url, function(response) {
        const total = parseInt(response.headers['content-length'], 10);
        let cur = 0;

        response.on('data', function(chunk) {
          // Only track progress here; the data itself is written by the pipe below.
          cur += chunk.length;
          percentage = ((100.0 * cur) / total).toFixed(0);
          download.file.percentage = percentage;
          mainWindow.webContents.send('download-info', download);
        });

        const file = utility.writeFile(ult.file_name, dir);
        response.pipe(file);
        file.on('error', function(err) {
          console.log(`ERROR: ${err}`);
          return reject(err);
        });
        file.on('finish', function() {
          console.log('File downloaded');
          // close() is async; resolve once the file handle is released.
          file.close(() => resolve());
        });
      })
      .on('error', function(err) {
        return reject(err);
      });
  });
}

Upvotes: 1

Views: 3267

Answers (2)

Rami Loiferman

Reputation: 912

I recommend using Bluebird. This Promise library has a built-in option for limiting promise concurrency.

Here is the link to their documentation: http://bluebirdjs.com/docs/api/promise.map.html

And here is a code solution with bluebird for your case:

// don't forget to run `npm install bluebird` first
const Promise = require('bluebird');

async function downloadAll(ultUrls) {
  // The concurrency option is the number of promises allowed to run at the same time.
  // You can wrap this line in a try/catch block if you want to handle failures.
  await Promise.map(ultUrls, downloadFile, { concurrency: 100 });
}

// Here you no longer need the i and len parameters
function downloadFile(ult) {
  // Change your existing download code so it no longer uses i and len.
}
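If you still want the overall progress figure from the original code, a shared counter of completed downloads is a better fit than the loop index, since concurrent downloads finish out of order. A minimal sketch, assuming downloadFile resolves once a file is fully written:

const Promise = require('bluebird');

async function downloadAll(ultUrls) {
  const len = ultUrls.length;
  let done = 0; // shared count of finished downloads

  await Promise.map(ultUrls, async (ult) => {
    await downloadFile(ult);
    done++;
    // Progress is based on completed files rather than the loop index,
    // because downloads finish out of order when run concurrently.
    console.log(`Overall progress: ${((done / len) * 100).toFixed(0)}%`);
  }, { concurrency: 100 });
}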

Upvotes: 2

damitj07

Reputation: 2919

Since you mentioned parallel: the usual way in Node.js is to use child processes, spawning multiple parallel workers based on the number of CPU cores available.

Here is pseudo-code you can refer to when creating a solution.

// parent.js
const child_process = require('child_process');
const _ = require('lodash');

const numchild = require('os').cpus().length;
let done = 0;
const filesListJSON = [{
    "file_name": "Blessed_Titles.jpg",
    "url": "https://i.imgur.com/FRDibHa.jpg",
    "downloadId": "6r44r4k340rvvr"
}, {
    "file_name": "Blessed_Titles2.jpg",
    "url": "https://i.imgur.com/FRDibHa2.jpg",
    "downloadId": "6r44r4k340rvvr"
}, {
    "file_name": "Blessed_Titles3.jpg",
    "url": "https://i.imgur.com/FRDibHa3.jpg",
    "downloadId": "6r44r4k340rvvr"
}];

// Split the list into one chunk per available child process.
// Note: _.chunk takes a chunk *size*, so divide the list length by numchild.
const chunks = _.chunk(filesListJSON, Math.ceil(filesListJSON.length / numchild));

for (let i = 0; i < chunks.length; i++) {
    const child = child_process.fork('./child');
    // Send this chunk of the list to the respective child process.
    child.send(chunks[i]);
    // PS: please check the count and logic for yourself; I have not tested this.
    child.on('message', function (message) {
        console.log('[parent] received message from child:', message);
        done++;
        if (done === chunks.length) {
            console.log('[parent] received all results');
            // ...
        }
    });
}

// child.js
process.on('message', function (list) {
    console.log('[child] received list from parent:', list);
    downloadFiles(list, function (done) {
        console.log('Done downloading files: ' + list.length);
        process.send({
            child: process.pid,
            result: done
        });
        process.disconnect();
    });
});

function downloadFiles(list, cb) {
    // loop over the list
    // logic to download the files
    // cb(true)
}
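As one possible way to fill in downloadFiles, here is a rough sketch that starts every download in the chunk and calls back once each file has finished or failed. The use of fs, path and https here, and writing into the script's directory, are my assumptions, not part of the original answer:

const fs = require('fs');
const path = require('path');
const https = require('https');

// Start every download in the chunk and invoke cb(true)
// once each file has either finished writing or failed.
function downloadFiles(list, cb) {
    let remaining = list.length;
    if (remaining === 0) return cb(true);

    list.forEach(function (item) {
        const dest = fs.createWriteStream(path.join(__dirname, item.file_name));
        https.get(item.url, function (response) {
            response.pipe(dest);
            dest.on('finish', function () {
                dest.close();
                if (--remaining === 0) cb(true);
            });
        }).on('error', function (err) {
            console.log('[child] download failed:', item.file_name, err.message);
            if (--remaining === 0) cb(true);
        });
    });
}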

Refer to this link for more details about the logic used.

Also, I have used the chunk function from the lodash library to split the array for processing: https://lodash.com/docs/3.10.1#chunk
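For reference, _.chunk splits an array into pieces of a given size, so the last chunk may be shorter:

const _ = require('lodash');

// _.chunk(array, size) returns sub-arrays of at most `size` elements.
_.chunk(['a', 'b', 'c', 'd', 'e'], 2);
// => [['a', 'b'], ['c', 'd'], ['e']]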

Upvotes: 0
