Reputation: 93
I need to download a large number of files (say 100k, each 0.2 - 1 MB) from AWS S3 to a Node.js server. The code I am using is
app.get('/api/download-all', function(req, res) {
  res.json({status: 'download initiated'});
  downloadFromS3(getDocs());
});
The function that downloads the audio files is
function downloadFromS3(docs) {
  docs.forEach((doc, fileIndex) => {
    var s3FilePath = doc.wav;
    var fileName = s3FilePath.split('/').pop();
    var s3Params = {Bucket: 'zzzzz', Key: s3FilePath};
    var file = fs.createWriteStream(dir + '/' + fileName);
    console.log(downloadSession);
    s3.getObject(s3Params)
      .on('httpData', function (chunk) {
        console.log("file writing happening", fileName);
        file.write(chunk);
      })
      .send();
  });
}
1) Here the download function fires an s3.getObject call for every file to download without waiting for any of them to finish, so in my case some 100k s3.getObject calls are made before a single file has downloaded. Is this the right way, or should I wait for one file to finish downloading and only then invoke the next S3 call? What would be the right approach?
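What I mean by waiting for each file is roughly something like this (just a sketch, assuming I can use the SDK v2 .promise() helper and the same s3, dir and doc.wav as above):

async function downloadFromS3Sequential(docs) {
  for (const doc of docs) {
    const s3FilePath = doc.wav;
    const fileName = s3FilePath.split('/').pop();
    // wait for the whole object before moving on to the next one
    const data = await s3.getObject({Bucket: 'zzzzz', Key: s3FilePath}).promise();
    await fs.promises.writeFile(dir + '/' + fileName, data.Body);
  }
}

But I suspect this would be very slow for 100k files.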
2) There is one other issue I am facing with this code. Once I make the download API call from the UI, the server gets busy with the download and stops responding to other requests from the UI; they all stay pending. Is there any way to do the download in the background? I have looked at approaches like forking a child process or using a worker thread to handle this, but I am not sure which one to use. What is the best way to handle this?
Upvotes: 1
Views: 5563
Reputation: 2982
I'd advise an in-between approach. Kicking off 100k downloads in parallel is really not a good idea, but waiting for each download to fully complete before starting the next won't utilise your full bandwidth either. I'd suggest a solution that "pools" jobs: you create a pool of promises, each of which downloads one file at a time and, as soon as it finishes, starts the next.
I've been using a function like this:
Promise.pool = function pool(funcs, inParallel, progressCallback) {
  const promises = [];
  const results = [];

  function getNext() {
    if (funcs.length) {
      return funcs.pop()()
        .catch(() => {})
        .then((res) => {
          results.push(res);
          if (progressCallback) {
            progressCallback(results);
          }
          return getNext();
        });
    }
  }

  for (let i = 0; i < Math.min(inParallel, funcs.length); i++) {
    promises.push(getNext());
  }

  return Promise.all(promises)
    .then(() => results);
};
Then you'd define an array of functions, each of which downloads one file and returns a promise that resolves on completion:
const funcs = docs.map((doc) => {
  return () => {
    return new Promise((resolve) => {
      var s3FilePath = doc.wav;
      var fileName = s3FilePath.split('/').pop();
      var s3Params = {Bucket: 'zzzzz', Key: s3FilePath};
      var file = fs.createWriteStream(dir + '/' + fileName);
      console.log(downloadSession);
      s3.getObject(s3Params)
        .on('httpData', function (chunk) {
          console.log("file writing happening", fileName);
          file.write(chunk);
        })
        .on("end", () => resolve())
        .send();
    });
  };
});
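Note that in the snippet above the promise resolves when the request ends, which is not necessarily when the write stream has flushed everything to disk. A sketch of a per-file function that instead pipes the S3 object to disk and resolves only once the file is fully written (assuming the same s3 client, dir and doc shape; rejections just surface errors to the pool's .catch) would be:

const funcs = docs.map((doc) => {
  return () => {
    return new Promise((resolve, reject) => {
      const s3FilePath = doc.wav;
      const fileName = s3FilePath.split('/').pop();
      const file = fs.createWriteStream(dir + '/' + fileName);
      s3.getObject({Bucket: 'zzzzz', Key: s3FilePath})
        .createReadStream()
        .on('error', reject)                    // S3/network error
        .pipe(file)
        .on('error', reject)                    // file system error
        .on('finish', () => resolve(fileName)); // file fully flushed to disk
    });
  };
});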
Finally, you'd use it like this:
const inParallel = 32;
function callback(partialResults) {
  // console.log progress, whatever
}
Promise.pool(funcs, inParallel, callback)
  .then(() => console.log("all done!"));
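For your second question: as long as you don't await the pool inside the route handler, the response goes out immediately and the downloads continue in the background on the same event loop, since the streams themselves are asynchronous. Roughly like this (buildFuncs here is just a hypothetical wrapper around the docs.map(...) above):

app.get('/api/download-all', function (req, res) {
  res.json({status: 'download initiated'});
  // fire and forget: the pool keeps at most 32 downloads in flight,
  // so the event loop stays free to serve other requests
  const funcs = buildFuncs(getDocs());
  Promise.pool(funcs, 32, callback)
    .then(() => console.log('all done!'))
    .catch((err) => console.error('downloads failed', err));
});

If the work ever becomes CPU-bound you could still move it into a forked child process, but for pure I/O like this a capped pool in the same process is usually enough.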
Upvotes: 2