Reputation: 101
I am currently trying to implement the SPIMI index construction method in Node and I have run into an issue.
The code is the following:
let fs = require("fs");
let path = require("path");
module.exports = {
fileStream: function (dirPath, fileStream) {
return buildFileStream(dirPath, fileStream);
},
buildSpimi: function (fileStream, outDir) {
let invIndex = {};
let sortedInvIndex = {};
let fileNameCount = 1;
let outputTXT = "";
let entryCounter = 0;
let resString = "";
fileStream.forEach((filePath, fileIndex) => {
let data = fs.readFileSync(filePath).toString('utf-8');
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(function (ch) { return ch.length != 0; });
data.forEach(token => {
//CHANGE THE SIZE IF NECESSARY (4e+?)
if (entryCounter > 100000) {
Object.keys(invIndex).sort().forEach((key) => {
sortedInvIndex[key] = invIndex[key];
});
outputTXT = outDir + "block" + fileNameCount;
for (let SItoken in sortedInvIndex) {
resString += SItoken + "," + sortedInvIndex[SItoken].toString();
};
fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(error); });
resString = "";
entryCounter = 0;
sortedInvIndex = {};
invIndex = {};
console.log(outputTXT + " - written;");
fileNameCount++;
};
if (invIndex[token] == undefined) {
invIndex[token] = [];
entryCounter++;
};
if (!invIndex[token].includes(fileIndex)) {
invIndex[token].push(fileIndex);
entryCounter++;
};
});
});
Object.keys(invIndex).sort().forEach((key) => {
sortedInvIndex[key] = invIndex[key];
});
outputTXT = outDir + "block" + fileNameCount;
for (let SItoken in sortedInvIndex) {
resString += SItoken + "," + sortedInvIndex[SItoken].toString();
};
fs.writeFile(outputTXT, resString, (err) => { if (err) console.log(error); });
console.log(outputTXT + " - written;");
}
}
/**
 * Recursively collects the paths of all non-directory entries under
 * `dirPath`.
 *
 * @param {string} dirPath - Directory to walk.
 * @param {string[]} [fileStream] - Optional array to append to. When it is
 *   omitted (or otherwise falsy) a new array is created.
 * @returns {string[]} The array that received the paths; this is the same
 *   array instance as `fileStream` when one was supplied.
 */
function buildFileStream(dirPath, fileStream) {
  // The fallback must be an array: a numeric fallback would make the first
  // push below throw when the caller does not pass an accumulator.
  const collected = fileStream || [];

  fs.readdirSync(dirPath).forEach(function (file) {
    const filepath = path.join(dirPath, file);
    if (fs.statSync(filepath).isDirectory()) {
      // Sub-directories append into the same accumulator.
      buildFileStream(filepath, collected);
    } else {
      collected.push(filepath);
    }
  });

  return collected;
}
I am using the exported functions in a separate file:
// Driver script: collects the input file list, builds the SPIMI blocks and
// prints how many milliseconds the buildSpimi call took.
const spimi = require("./spimi");

const outputDir = "/Users/me/Desktop/SPIMI_OUT/";
const inputDir = "/Users/me/Desktop/gutenberg/2/2";

// Pass a fresh array directly instead of assigning to an undeclared
// variable, which would create an implicit global.
const result = spimi.fileStream(inputDir, []);
console.table(result);
console.log("Finished building the filestream");

const t0 = new Date();
spimi.buildSpimi(result, outputDir);
const t1 = new Date();
console.log(t1 - t0);
While this code kind of works on relatively small volumes of data (I tested up to 1.5 GB), there is obviously a memory leak somewhere: when monitoring the RAM usage, I can see it going up as far as 4-5 GB.
I spent quite a lot of time trying to figure out what might be the cause, but I still couldn't find the issue.
I would appreciate any hints on this! Thanks!
Upvotes: 1
Views: 2554
Reputation: 441
Mykhailo, adding on to what jfriend said, it's actually not a memory leak. It's working as intended.
Something to consider is that readFile buffers the entire file! This will cause the huge memory bloat. A better alternative is to use fs.createReadStream(), which only buffers the part of the file you're currently reading. Unfortunately, implementing that solution may require a full rewrite of your code, as it returns an fs.ReadStream, which won't behave the way you're currently handling files. Check out this link and read the bottom of the section to see what I'm referencing.
Upvotes: 0
Reputation: 707158
Something to understand about the language and garbage collection in general is that this:
data = data.toUpperCase().split(/[^a-zA-Z]/).filter(...)
creates three additional copies of your data. First, an uppercase copy. Then, a split array copy. Then, a filtered copy of the split array.
So, at this point, you have four copies of your data all in memory. All but the filtered array are now eligible for garbage collection when the GC gets a chance to run, but if this data was initially large, you're going to be using at least 3x-4x as much memory as the file size (depending upon how many array items are removed in your .filter() operation).
None of this is a leak, but it's a very big peak memory usage which can be a problem.
A more memory efficient way to process large files is to process them as a stream (not read them all into memory at once). You read a small size chunk (say 1024 bytes), process it, read a chunk, process it while being careful about chunk boundaries. If your file naturally has line boundaries, there are already pre-built solutions for processing line by line. If not, you can create your own chunk processing mechanism. We would have to see a sample of your data to make more specific chunk processing suggestions.
As another point, if you end up with a lot of keys in invIndex, then this line of code starts to become inefficient, and you're doing it in your loop:
Object.keys(invIndex).sort()
This takes your object and gets all the keys in a temporary array, which you use only for the purpose of updating sortedInvIndex, which is yet another copy of your data. So, right there alone, this set of code makes three copies of all your keys and two copies of all the values. And it does it every time through your loop. Again, lots of peak memory usage that the GC won't normally clean up until your function is done.
A redesign of the way you process this data could probably reduce the peak memory usage by a factor of 100x. For memory efficiency, you want only the initial data, the final data representation and then just a little more used for temporary transformations to ever be in use at the same time. You don't want to EVER be processing all the data multiple times, because each time you do that, it creates yet another entire copy of all the data that contributes to peak memory usage.
If you show what the data input looks like and what data structure you're trying to end up with, I could probably take a crack at a much more efficient implementation.
Upvotes: 2