ttyagi3963
ttyagi3963

Reputation: 17

Reading very large data files using nodejs? Interview question

I recently wrote a solution for an interview question which was rejected. Please analyze my solution and advice? is it not efficient to handle large files?

Problem Given a data file containing scored records, in your favorite programming language, write a program to output the N highest record IDs ordered by descending score. The output should be well-formed JSON. Consider giving thought to the resource efficiency of your solution.

DATA File The input data file is a series of key-value pairs (one per line) with the format :

3830591998918656: {"id":"2208ef95-355c-53a6-96bc-206a4831f2fe","data":"Tu"} 548113328635904: {"id":"d5887987-bf5d-5813-b524-722ffff11882","data":"Vubiteh hone na dabupkof tetnut."} 4322085113430016: {"id":"8f5330b9-0279-5a67-aee3-d6acf221a53a","data":"Losu nihpe upaveitpse teblujnis."} 6348702421614592: {"id":"1fef0dbc-c75b-5959-835f-80e5f15b6da1","data":"Riliv kaliku laiza zen daze ."}

can be upto 100k lines or more

Success Conditions Upon successful running, your solution should exit with exit code 0. If the input data file is not valid, your solution should exit with code 2, while if the input file is not found, your solution should exit with code 1. Empty lines in the input file should be ignored rather than treated as invalid input.

My solution

highest.js the intry point

{

let argv = require("yargs").argv;

const yargs = require("yargs");
const handler = require("./handler");

const fileName = argv._[0];
const numberOfEntries = argv._[1];

if (!fileName || !numberOfEntries) {
  console.log("Command Incorrect : --use node highest <filename> <count>");
} else {
  handler.getHighestScore(fileName, numberOfEntries);
}

}

Handler.js {

const path = require("path");
const fs = require("fs");
const es = require("event-stream");

let ln = 0;
const scoreArray = [];

exports.getHighestScore = (fileName, numberOfEntries) => {
  const filePath = path.join(path.dirname(require.main.filename), fileName);

  fs.stat(filePath, function (err, stat) {
    if (stat && stat.isFile()) {
      let s = fs
        .createReadStream(filePath)
        .pipe(es.split())
        .pipe(
          es
            .mapSync(function (line) {
              s.pause();
              if (ln < parseInt(numberOfEntries)) {
                if (line.trim().length > 0) {
                  let score = line.slice(0, line.indexOf(":"));
                  let secondPart = line.slice(line.indexOf(":") + 1);
                  let jsonObject = JSON.parse(secondPart);

                  if (
                    jsonObject.hasOwnProperty("id") &&
                    jsonObject.id.trim().length > 0
                  ) {
                    let outputObject = { score: score, id: jsonObject.id };
                    scoreArray.push(outputObject);
                  } else {
                    process.stdout.write(
                      "***********File Invalid - ID Mssing************* code(2)"
                    );
                    process.exit(2);
                  }
                }
              }
              if (ln == parseInt(numberOfEntries)) {
                s.end();
              } else {
                ln += 1;
              }
              s.resume();
            })
            .on("error", function (err) {
              process.stdout.write(
                "***********Error while reading file************* code(2)"
              );

              process.exit(2);
            })
            .on("end", function () {
              let arr = scoreArray.sort((a, b) => (b.score > a.score ? 1 : -1));
              process.stdout.write(
                "TOTAL LINES READ = " +
                  ln +
                  " TOTAL OUTPUT = " +
                  arr.length +
                  "\n"
              );
              process.stdout.write(JSON.stringify(arr));
              process.stdout.write("\n");
              process.exit(0);
            })
        );
    } else {
      process.stdout.write("***********FILE NOT FOUND************* code(1)");
      process.exit(1);
    }

  });
};

}

Any advice and criticism about solution is appreciated

Upvotes: 0

Views: 242

Answers (1)

jfriend00
jfriend00

Reputation: 707318

You are reading the entire input file into memory and storing everything. A space efficient way to do things is to stream the input one line at a time, parse it, check to see if it contains an N-highest score. If so, add it to the N-highest data structure. If not, skip it. Only retain in memory the N-highest data as you go through the whole file. This seems to be the main point that your code misses.

For efficiency reasons, this does an insertion sort into the highest array so it isn't constantly resorting the entire array every time you add a value.

Here's an implementation of that in nodejs:

const readline = require('node:readline');
const fs = require('node:fs');

// sample input data per line
// 3830591998918656: { "id": "2208ef95-355c-53a6-96bc-206a4831f2fe", "data": "Tu" }
// 548113328635904: { "id": "d5887987-bf5d-5813-b524-722ffff11882", "data": "Vubiteh hone na dabupkof tetnut." }
// 4322085113430016: { "id": "8f5330b9-0279-5a67-aee3-d6acf221a53a", "data": "Losu nihpe upaveitpse teblujnis." }
// 6348702421614592: { "id": "1fef0dbc-c75b-5959-835f-80e5f15b6da1", "data": "Riliv kaliku laiza zen daze ." }

const scoreRegex = /^\s*(\d+):\s*/;

function parseLine(line) {
    // ok to skip empty lines
    if (line.trim() === "") return null;

    const result = {};
    // parse leading digits
    const scoreMatch = line.match(scoreRegex);
    if (!scoreMatch) throw new Error("Missing score at beginning of line");
    result.score = BigInt(scoreMatch[1], 10);
    const remainingLine = line.slice(scoreMatch[0].length);
    result.info = JSON.parse(remainingLine);
    if (typeof result.info.id !== "string" || result.info.id === "") {
        throw new Error("Missing valid id value");
    }
    return result;
}

// input and output files
const fileInput = "input.txt";
const fileOutput = "output.txt";

const howManyHighest = 2;
const highestScores = [];

function getLowest() {
    return highestScores[highestScores.length - 1].score;
}

// do an insertion sort into the highestScores array
// highest score record first
// maintain length at no more than howManyHighest
function insertHighest(val) {
    let inserted = false;
    // for performance reasons, only search through the highestScores
    // list if this new score is higher than the lowest score we have in the list so far
    if (highestScores.length && val.score > getLowest()) {
        for (let [index, item] of highestScores.entries()) {
            if (val.score > item.score) {
                // insert it into this position in the array, pushing the others up
                highestScores.splice(index, 0, val);
                inserted = true;
                break;
            }
        }
    }
    if (inserted) {
        // trim overflow, if any
        if (highestScores.length > howManyHighest) {
            highestScores.pop();
        }
    } else {
        // didn't insert it, see if we aren't full yet
        if (highestScores.length < howManyHighest) {
            highestScores.push(val);
        }
    }
}

const rl = readline.createInterface({
    input: fs.createReadStream(fileInput),
    crlfDelay: Infinity
});

rl.on('error', err => {
    if (err.code === 'ENOENT') {
        process.exit(1);
    } else {
        console.log(err);
        // some sort of read error
        process.exit(2);
    }
}).on('line', line => {
    try {
        const data = parseLine(line);
        if (data) {
            insertHighest(data);
        }
    } catch (e) {
        console.log(e);
        console.log(`Invalid line: ${line}`);
        process.exit(2);
    }

}).on('close', () => {
    // generate array of highest scores
    const output = highestScores.map(item => item.info.id)
    console.log(output);
    fs.writeFile(fileOutput, JSON.stringify(output), err => {
        if (err) {
            console.log(err);
            // output write error
            process.exit(3);
        } else {
            // all done, successfully
            process.exit(0);
        }
    });
});

I ran into a few unspecified implementation questions, which if you bring up during your answer will show you've fully understood the problem.

  1. What is the max score value that can exist in this data? This determines if we can use a Javascript number type or whether BigInt needs to be used to handle arbitrarily large numbers (at a cost of some run-time performance). Since this was not specified, I've used BigInt parsing and comparisons here to make the score values limitless integers.

  2. Is the output data supposed to be only an array of ids (sorted by highest score) that belong to the highest scores. Or is it supposed to be some sorted data structure that includes the score and the id and the other data? Your question does not make that entirely clear. Showing an example of the output data for N = 3 would make this clear. This code produces an output array of id values, sorted by id with highest score first.

  3. It is not specified what the valid structure of an id property is. This code just tests for a non-empty string value.

  4. It is not specified what should be output if there are high tie scores or if the highest N+1 scores are all the same (e.g. there's no unique N highest scores). In case of ties at the end of the high score list, this retains only the first ones encountered in the input file (up to the N we're outputing).

Upvotes: 1

Related Questions