I have a folder with about 120k HTML pages (each file is about 70 KB) that I need to open, parse some data out of with XPath, and append that data to a .csv file.
Below is my code:
It should read a list of files from the parseFolder, iterate through each file name, open the file with fs.readFile, parse the data out with jsdom and xpath, and save it to the CSV file using fs.appendFile.
It seems to do well for the first 100 or so files, but after that it gradually slows down, consumes memory and CPU, and eventually stalls. I have 16 GB of memory and it seems to hit some limit when my memory usage reaches about 7 GB.
I'm new to JS and Node, so any help pointing out what I'm missing would be very much appreciated.
var fs = require('fs');
var jsdom = require('jsdom').jsdom;
var xpath = require('xpath');
var S = require('string');
var os = require('os');

ParserRules = {
    saveFile: 'output.csv',
    parseFolder: '/a/folder/with/120k/HTML/files',
    fields: {
        "field1": "//div[@class='field1']/text()",
    }
};

start();

function start() {
    console.log('Starting...');
    fs.readdir(ParserRules.parseFolder, iterateFiles);
}

function iterateFiles(err, filesToParse) {
    for (var i = 0; i < filesToParse.length; i++) {
        file = ParserRules.parseFolder + '/' + filesToParse[i];
        console.log('Beginning read of ' + file);
        fs.readFile(file, {encoding: 'utf8'}, parseFile);
    }
}

function parseFile(err, data) {
    if (err == null) {
        var jsdomDocument = jsdom(data);
        var document = jsdomDocument.parentWindow.document;
        getContent(document);
    }
}

function getContent(document) {
    fields = ParserRules.fields;
    var csvRow = [];
    for (var field in fields) {
        try {
            console.log('Looking for ' + field);
            var nodes = xpath.select(fields[field], document);
            for (var i = 0; i < nodes.length; i++) {
                csvRow.push(getValue(nodes[i]));
            }
        } catch (err) {
            console.log(err);
        }
    }
    saveToCsv(csvRow, ParserRules.saveFile);
}

function getValue(node) {
    if (node.nodeValue != null) {
        toReturn = node.nodeValue;
    } else {
        newNode = $(node);
        toReturn = newNode.html();
    }
    return toReturn;
}

function saveToCsv(object, filePath) {
    console.log('Saving...');
    if (object.length > 0) {
        console.log('Row Exists, Saving...');
        toString = S(object).toCSV().s + os.EOL;
        fs.appendFile(filePath, toString, {encoding: 'utf8', flag: 'a'}, function(err){
            if (err) {
                console.log('Write Error: ' + err);
            } else {
                console.log('Saved ' + object);
            }
        });
    }
}
Upvotes: 1
Views: 2398
Node.js works asynchronously.
So the way your code is structured, this happens:
1. The function iterateFiles issues 120k fs.readFile calls in a row, which causes Node.js to queue 120k filesystem read operations.
2. When the read operations are complete, Node.js calls the 120k callbacks for fs.readFile, and each of these issues an fs.appendFile operation, which causes Node.js to queue 120k filesystem write operations.
3. Eventually Node.js calls the 120k callbacks that were passed to fs.appendFile. Until these write operations are completed, Node.js must hang onto the data that is to be written.
For a task like this I would suggest using the synchronous versions of the fs calls: fs.readFileSync and fs.appendFileSync.
When writing code for a web server or that is somehow event-driven, you don't want to use the synchronous version of these calls because they will cause your application to block. But if you are writing code which is doing batch processing of data (for instance, code that operates like a shell script would), it is simpler to use the synchronous version of these calls.
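As a rough illustration only, a minimal sketch of the synchronous approach could look like the following. This is not your code: parseRow here is a hypothetical stand-in for the jsdom/XPath extraction and CSV formatting you already have.

var fs = require('fs');
var os = require('os');

function processAllFiles(parseFolder, saveFile) {
    var files = fs.readdirSync(parseFolder);
    for (var i = 0; i < files.length; i++) {
        // Read one file completely...
        var data = fs.readFileSync(parseFolder + '/' + files[i], {encoding: 'utf8'});
        // ...turn it into one CSV row (hypothetical helper)...
        var row = parseRow(data);
        // ...and append it before touching the next file.
        if (row) {
            fs.appendFileSync(saveFile, row + os.EOL, {encoding: 'utf8'});
        }
    }
}

Because each file is read, parsed and appended before the next one is opened, at most one file's data is held in memory at a time.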
The following code is a simplified model of your code and illustrates the problem. It is set to read from /tmp because that's as good a source of files as any. I've also set it to avoid doing any further work than parseFile if a file is empty.
var fs = require('fs');

var ParserRules = {
    saveFile: 'output.csv',
    parseFolder: '/tmp'
};

start();

function start() {
    console.log('Starting...');
    fs.readdir(ParserRules.parseFolder, iterateFiles);
}

function iterateFiles(err, filesToParse) {
    for (var i = 0; i < filesToParse.length; i++) {
        var file = ParserRules.parseFolder + '/' + filesToParse[i];
        console.log('Beginning read of file number ' + i);
        fs.readFile(file, {encoding: 'utf8'}, parseFile);
    }
}

var parse_count = 0;
function parseFile(err, data) {
    if (err)
        return;

    if (data.length) {
        console.log("Parse: " + parse_count++);
        getContent(data);
    }
}

function getContent(data) {
    saveToCsv(data, ParserRules.saveFile);
}

var save_count = 0;
function saveToCsv(data, filePath) {
    fs.appendFile(filePath, data, {encoding: 'utf8', flag: 'a'}, function(err){
        if (err) {
            console.log('Write Error: ' + err);
        } else {
            console.log('Saved: ' + save_count++);
        }
    });
}
If you run this code you'll see that all the Parse: messages appear contiguously, and only after all the Parse: messages are output do you get the Saved: messages. So you'd see something like:
Beginning read of file number N
Beginning read of file number N+1
Parse: 0
Parse: 1
... more parse messages ...
Parse: 18
Parse: 19
Saved: 0
Saved: 1
... more saved messages...
Saved: 18
Saved: 19
What this tells you is that Node does not start to save until all files are parsed. Since Node cannot release the data associated with a file until it knows that data won't be used again (in this case, until the file's row has been saved), at some point Node will be holding a minimum of 120,000 * 70 KB, roughly 8 GB, of memory for the data from all the files.
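If you would rather keep the asynchronous calls, the same reasoning suggests the fix: don't issue the next read until the previous file's append has finished. A rough sketch of that idea (again with a hypothetical parseRow helper in place of the jsdom/XPath work, so not a drop-in replacement) would be:

var fs = require('fs');

function processSequentially(files, index, parseFolder, saveFile) {
    if (index >= files.length)
        return;

    var file = parseFolder + '/' + files[index];
    fs.readFile(file, {encoding: 'utf8'}, function (err, data) {
        if (err)
            return processSequentially(files, index + 1, parseFolder, saveFile);

        var row = parseRow(data); // hypothetical extraction helper
        fs.appendFile(saveFile, row + '\n', {encoding: 'utf8'}, function () {
            // Only now, after the write has completed and the data can be
            // released, move on to the next file.
            processSequentially(files, index + 1, parseFolder, saveFile);
        });
    });
}

This keeps at most one file's contents in flight at any moment, so memory use stays flat no matter how many files there are. The recursion is safe because each call happens from a fresh I/O callback, so the call stack does not grow.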
Upvotes: 4