krisaulie

Reputation: 13

Nodejs createReadStream only reads one data chunk of large JSON file

I am using Node.js to read JSON objects from a really large JSON file (1 GB+). The file has the format [{field1: x, field2: x, field3: x},{...},...,{...}], with no line separation between objects. To avoid memory problems, I am using fs.createReadStream and processing each chunk of data in sequence. This works and I get valid JSON objects, but the reader stops after reading only one data chunk. Why is it not reading the rest of the file?

My solution was inspired by the accepted answer in this question: Parse large JSON file in Nodejs

Here is the code:

// Get the JSON file
var fs = require('fs');
var stream = fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'});
var buf = '';
var count = 0;

stream.on('data', function(chunk) {
    console.log("Stream on data!"); // ONLY EXECUTED ONCE
    buf += chunk.toString(); // when data is read, stash it in a string buffer
    process(); // process the buffer
});
stream.on('error', function(err) {
    // NEVER EXECUTED
    console.log(err);
});
stream.on('end', function() {
    // NEVER EXECUTED
    console.log("Count: " + count);
});

function process() {
    var posStart = buf.indexOf('{');
    var posEnd = buf.indexOf('}');

    while (posStart >= 0 || posEnd >= 0) { // keep going until the start or end of the json object in the string
        // IF the start bracket is before the end, skip to the start
        if((posStart < posEnd || posEnd < 0) && posStart >= 0){ 
            buf = buf.slice(posStart);
        }
        if(posStart == 0 && posEnd >= 0){ // IF the end bracket is next
            processObjectString(buf.slice(0, posEnd+1)); // Process the complete object string
            buf = buf.slice(posEnd+1); // Remove the processed string from the buffer
        }
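        // NOTE: if buf ends with an incomplete object (a '{' with no matching
        // '}' yet), neither branch above changes buf, so this loop never
        // terminates; this is the infinite loop fixed in the EDIT below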
        // Update the positions
        posStart = buf.indexOf('{');
        posEnd = buf.indexOf('}');
    }
}

function processObjectString(objectString) {
    count++;
    var obj = JSON.parse(objectString); // parse the JSON
    console.log(obj.id); // Print object ID (works)
}

EDIT: After fixing the errors that caused the infinite while loop, the following is a working solution that iterates through all the objects in the JSON file. It may not be very elegant, but at least it works (for anyone who might have a similar problem).

// Get the JSON file
var fs = require('fs');
var stream = fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'});
var buf = '';
var count = 0;

stream.on('data', function(chunk) {
    buf += chunk.toString(); // when data is read, stash it in a string buffer
    process(); // process the buffer (process() takes no arguments; it reads the shared buf)
});
stream.on('error', function(err) {
    console.log(err);
});
stream.on('end', function() {
    console.log("Count: " + count);
});

function process() {
    var posStart = buf.indexOf('{');
    var posEnd = buf.indexOf('}');

    while (posStart >= 0 || posEnd >= 0) { // keep going until the start or end of the json object in the string
        // IF the start bracket is before the end, skip to the start
        if((posStart < posEnd || posEnd < 0) && posStart >= 0){ 
            buf = buf.slice(posStart);
        }
        if(posStart == 0 && posEnd >= 0){ // IF the end bracket is next
            processObjectString(buf.slice(0, posEnd+1)); // Process the complete object string
            buf = buf.slice(posEnd+1); // Remove the processed string from the buffer
        }else if(posStart < 0 || posEnd < 0){ // Return to get a new chunk
            return;
        }
        // Update the positions
        posStart = buf.indexOf('{');
        posEnd = buf.indexOf('}');
    }
}

function processObjectString(objectString) {
    count++;
    var obj = JSON.parse(objectString); // parse the JSON
    console.log(obj.id); // Print object ID (works)
}
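For something more elegant, a streaming JSON parser avoids the manual buffer bookkeeping entirely. A minimal sketch using the third-party JSONStream module (an assumption on my part: it must be installed with npm install JSONStream, and the file must be the flat top-level array described above):

// Pipe the file through a streaming parser; '*' emits each element of the root array
var fs = require('fs');
var JSONStream = require('JSONStream');

fs.createReadStream('Results.json', {flags: 'r', encoding: 'utf-8'})
    .pipe(JSONStream.parse('*'))
    .on('data', function(obj) {
        console.log(obj.id); // obj is an already-parsed object from the array
    })
    .on('end', function() {
        console.log('Done');
    });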

Upvotes: 1

Views: 4523

Answers (1)

alandarev

Reputation: 8635

Some Theory

Node.js is asynchronous, but your JavaScript runs on a single thread. If that thread gets stuck processing the data it has received, the event loop is blocked and the stream can never deliver a second chunk: the sender waits for your handler to return before it can do anything else.
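You can demonstrate the effect with a minimal sketch (any large input file will do): a 'data' handler that never returns receives exactly one chunk.

var fs = require('fs');
var stream = fs.createReadStream('Results.json', {encoding: 'utf-8'});

stream.on('data', function(chunk) {
    console.log('got a chunk of ' + chunk.length + ' chars'); // printed exactly once
    while (true) {} // blocks the event loop: no further 'data' or 'end' events fire
});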

Meaning

If the call to process() inside the 'data' handler enters an infinite loop, you will never receive a second chunk, so it may look as though the sender has gone quiet.


For the future: always try to isolate the problem first, so you are sure you are looking in the right place.
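Here, for example, isolating would mean testing the stream with the processing disabled (a hypothetical check, reusing the question's setup): if many chunks arrive, the stream is fine and the bug must be in process().

var fs = require('fs');
var stream = fs.createReadStream('Results.json', {encoding: 'utf-8'});
var chunks = 0;

stream.on('data', function(chunk) {
    chunks++; // just count; processing disabled for the test
});
stream.on('end', function() {
    console.log('Received ' + chunks + ' chunks'); // a 1GB+ file arrives in many chunks
});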

P.S. It is actually easy to get yourself into an infinite loop while processing text; I feel your pain here.

Upvotes: 2
