Reputation: 99
My task is to parse a large JSON file (about 1 million URLs) and download the images to a local server, but while downloading we are losing a large number of records (around 50k) that are neither downloaded nor captured in the log files.
We split the large JSON file into several smaller JSON files. We wrote a Node.js program that reads the image URLs from the JSON and downloads the images to the local server, but in this process a large amount of data is lost: it is neither written to the log files nor downloaded.
What modifications need to be made to this code?
My JSON
[{
    "url": "http://example1.com",
    "id": "jweewewii833388efje.jpg",
    "class": 2
},
{
    "url": "http://example2.com",
    "id": "987c2732561cb6bbc151cb6e184d019b.jpg",
    "class": 24
},
{
    "url": "http://example3.com",
    "id": "141cbe32ed1f0071431677a2c7754a70.jpg",
    "class": 22
},
{
    "url": "http://example4.com",
    "id": "291a09e237b43d9fa1bf64481a679eaf.png",
    "class": 1
}]
Node.js code
var fs = require('fs'),
    JSONStream = require('JSONStream'),
    es = require('event-stream');
const download = require('image-downloader')
var util = require('util');
var log_file = fs.createWriteStream('logfiles/log13.log', {flags : 'w'});

var getStream = function () {
    var jsonData = 'train.json',
        stream = fs.createReadStream(jsonData, {encoding: 'utf8'}),
        parser = JSONStream.parse('*');
    return stream.pipe(parser);
};

getStream()
    .pipe(es.mapSync(function (data) {
        //console.log(data.url);
        const options = {
            url: data.url,
            dest: './uploads/'+data.id // Save to /path/to/dest/image.jpg
        }
        download.image(options)
            .then(({ filename, image }) => {
                console.log('Saved to', filename) // Saved to /path/to/dest/image.jpg
            })
            .catch((err) => {
                //errorlog.error(`Error Message : $(data.id)${err}`);
                const logdata = function(d) { //
                    log_file.write(util.format(d) + '\n');
                    //log_stdout.write(util.format(d) + '\n');
                };
                console.error = logdata
                console.error(" Error URL:: "+data.url+" ID::"+data.id+"::"+err)
            })
    }));
I want to process the full data set without losing any records: every URL that fails should be written to the log file. For example, if I process 10k records from the JSON file and around 8k images download successfully, the other 2k should be written to the log file.
Upvotes: 1
Views: 2166
Reputation: 529
I think you missed calling the function responsible for logging failed operations. I revised your code and added "NOTE" comments. I hope it helps.
const fs = require('fs');
const JSONStream = require('JSONStream');
const es = require('event-stream');
const download = require('image-downloader')
const util = require('util');
const log_file = fs.createWriteStream('logfiles/log13.log', { flags: 'w' });

const getStream = function () {
    const jsonData = 'train.json',
        stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
        parser = JSONStream.parse('*');
    return stream.pipe(parser);
};

getStream()
    .pipe(es.mapSync(function (data) {
        //console.log(data.url);
        const options = {
            url: data.url,
            dest: `./uploads/${data.id}` // Save to /path/to/dest/image.jpg
        }
        download.image(options)
            .then(({ filename, image }) => {
                console.log('Saved to', filename) // Saved to /path/to/dest/image.jpg
            }).catch((err) => {
                //errorlog.error(`Error Message : $(data.id)${err}`);
                /*
                  NOTE: I am assuming the function below logs the affected data
                  in some file that you can access at a later time(?)
                */
                const logdata = function (d) {
                    log_file.write(util.format(d) + '\n');
                    //log_stdout.write(util.format(d) + '\n');
                };
                // NOTE: Call the function and pass the data to log it for later review
                logdata(data);
                // NOTE: I commented the below line out
                // console.error = logdata
                // Output to console as you have done before
                console.error(" Error URL:: " + data.url + " ID::" + data.id + "::" + err)
            });
    }));
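One more thing worth noting: es.mapSync does not wait for the download.image() promise, so the stream keeps emitting items while thousands of downloads are still in flight. Under that load, requests can fail or the process can exit before everything settles, which would also show up as missing records. Below is a minimal sketch of one way to throttle this, assuming the same train.json / JSONStream / event-stream / image-downloader setup; pausing the stream and handling one item at a time is illustrative here, not something I have tested against your data set.
const fs = require('fs');
const JSONStream = require('JSONStream');
const es = require('event-stream');
const download = require('image-downloader');

const log_file = fs.createWriteStream('logfiles/log13.log', { flags: 'w' });

fs.createReadStream('train.json', { encoding: 'utf8' })
    .pipe(JSONStream.parse('*'))
    .pipe(es.through(function (data) {
        const stream = this;
        stream.pause(); // hold the stream until this item is fully handled

        download.image({ url: data.url, dest: `./uploads/${data.id}` })
            .then(({ filename }) => {
                console.log('Saved to', filename);
            })
            .catch((err) => {
                // Failed downloads always end up in the log file
                log_file.write(`Error URL:: ${data.url} ID:: ${data.id} :: ${err}\n`);
            })
            .then(() => {
                stream.resume(); // move on to the next item, success or failure
            });
    }));
Handling items strictly one at a time is the simplest way to guarantee that every record is either saved or logged; if that turns out to be too slow for a million URLs, a bounded pool of N parallel downloads is the usual middle ground.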
Upvotes: 1