developerExecutive

Reputation: 99

Download a huge number of images by reading a JSON file

My task is to parse a large JSON file (containing about 1 million URLs) and download the images to a local server, but while downloading we are losing a large number of records (around 50k) that are not captured in the log files.

We split the large JSON file into several smaller JSON files. We have written a Node.js program that reads the image URLs from the JSON and downloads the images to the local server, but in the process a large number of records are neither downloaded nor written to the log files.

What modifications need to be made to this code?

My JSON

    [{
        "url": "http://example1.com", 
        "id": "jweewewii833388efje.jpg", 
        "class": 2
    }, 
    {
        "url": "http://example2.com", 
        "id": "987c2732561cb6bbc151cb6e184d019b.jpg", 
        "class": 24
    }, 
    {
        "url": "http://example3.com", 
        "id": "141cbe32ed1f0071431677a2c7754a70.jpg", 
        "class": 22
    }, 
    {
        "url": "http://example4.com", 
        "id": "291a09e237b43d9fa1bf64481a679eaf.png", 
        "class": 1
    }]

Node.js code

var fs = require('fs'),
JSONStream = require('JSONStream'),
es = require('event-stream');
const download = require('image-downloader')
var util = require('util');
var log_file = fs.createWriteStream('logfiles/log13.log', {flags : 'w'});

var getStream = function () {
    var jsonData = 'train.json',
    stream = fs.createReadStream(jsonData, {encoding: 'utf8'}),
    parser = JSONStream.parse('*');
    return stream.pipe(parser);
};

getStream()
  .pipe(es.mapSync(function (data) {
        //console.log(data.url);
        const options = {
         url: data.url,
         dest: './uploads/'+data.id             // Save to /path/to/dest/image.jpg
        }
        download.image(options)
         .then(({ filename, image }) => {
        console.log('Saved to', filename)  // Saved to /path/to/dest/image.jpg
         })
         .catch((err) => {
            //errorlog.error(`Error Message : $(data.id)${err}`);
            const logdata = function(d) { //
              log_file.write(util.format(d) + '\n');
              //log_stdout.write(util.format(d) + '\n');
            };
            console.error =logdata
             console.error(" Error URL:: "+data.url+" ID::"+data.id+"::"+err)
         })


}));

I want to download all the data without losing any; any URL that errors should be written to the log file. For example, if I process 10k records from the JSON file and around 8k images are downloaded, the other 2k should be written to the log file.

Upvotes: 1

Views: 2166

Answers (1)

Grey

Reputation: 529

I think you missed calling the function responsible for logging failed operations. I revised your code and added "NOTE" comments. I hope it helps.

const fs = require('fs');
const JSONStream = require('JSONStream');
const es = require('event-stream');
const download = require('image-downloader')
const util = require('util');
const log_file = fs.createWriteStream('logfiles/log13.log', { flags: 'w' });

const getStream = function () {
    const jsonData = 'train.json',
        stream = fs.createReadStream(jsonData, { encoding: 'utf8' }),
        parser = JSONStream.parse('*');
    return stream.pipe(parser);
};

getStream()
    .pipe(es.mapSync(function (data) {
        //console.log(data.url);
        const options = {
            url: data.url,
            dest: `./uploads/${data.id}`             // Save to /path/to/dest/image.jpg
        }
        download.image(options)
            .then(({ filename, image }) => {
                console.log('Saved to', filename)  // Saved to /path/to/dest/image.jpg
            }).catch((err) => {

                //errorlog.error(`Error Message : $(data.id)${err}`);

                /* 
                     NOTE: I am assuming the function below logs the affected data
                      in some file that you can access at a later time(?)
                 */

                const logdata = function (d) { //
                    log_file.write(util.format(d) + '\n');
                    //log_stdout.write(util.format(d) + '\n');
                };

                // NOTE: Call the function and pass the data to log it for later review
                logdata(data);

                // NOTE: I commented the below line out
                // console.error =logdata

                // Output to console as you have done before
                console.error(" Error URL:: " + data.url + " ID::" + data.id + "::" + err)
            });
    }));
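
If records are still going missing after the logging fix, it may be because es.mapSync does not wait for the download promises to settle, so every download in the file is fired at once; with files this large that can exhaust sockets or memory and drop work silently. As a sanity check, here is a minimal sketch that processes one of your smaller split files strictly one record at a time with async/await. It reuses the same image-downloader options you already have; the file name part1.json is just a placeholder for one of your split files.

const fs = require('fs');
const download = require('image-downloader');

const logFile = fs.createWriteStream('logfiles/failed.log', { flags: 'a' });

// NOTE: 'part1.json' is a placeholder for one of your smaller split files
const items = JSON.parse(fs.readFileSync('part1.json', 'utf8'));

(async () => {
    for (const data of items) {
        try {
            // Same options shape as in your original code
            const { filename } = await download.image({
                url: data.url,
                dest: `./uploads/${data.id}`
            });
            console.log('Saved to', filename);
        } catch (err) {
            // Every failure is written to the log, nothing is dropped silently
            logFile.write(`Error URL:: ${data.url} ID:: ${data.id} :: ${err}\n`);
        }
    }
    logFile.end();
})();

Strictly sequential downloads will be slow for a million URLs, but they make it easy to verify that every record is either saved or logged; once the counts add up you can reintroduce concurrency with a bounded pool.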

Upvotes: 1
