Reputation: 24850
I have already learned that readline can be used to read the file line by line, e.g.
readline
.createInterface({input: fs.createReadStream('xxx')})
.on('line', (line) => { apply_regexp_on_line })
.on('close', () => { report_all_regexps });
However, this is pretty slow. I compared the performance of grep
and JavaScript regexps, and the latter performed better on the regexps I tested (see benchmark), so I think the Node async readline is to blame.
In my situation, I do not care about async at all; I just need to exploit JavaScript's fast regexps to process very large log files (typically 1-2GB, sometimes up to 10GB). What is the best way of doing this? My only concern is speed.
Bonus points: some of the log files are gzipped, so I need to uncompress them. If someone can recommend a fast line-by-line reader that handles both plain text and gzipped text, I would really appreciate it.
Upvotes: 13
Views: 3166
Reputation: 338208
How does this hold up against your data?
// module linegrep.js
'use strict';
var through2 = require('through2');
var StringDecoder = require('string_decoder').StringDecoder
/**
 * Create a transform stream that splits incoming UTF-8 bytes into lines and
 * emits only the lines that match `regex`.
 *
 * The writable side accepts raw chunks; the readable side is in object mode
 * and yields one string per matching line, without its line terminator
 * ("\n" or "\r\n").
 *
 * @param {RegExp} regex - Pattern tested against every line.
 * @returns {Object} The through2 transform stream.
 */
function grep(regex) {
  var decoder = new StringDecoder('utf8'),
      last = "",          // unterminated tail of the previous chunk
      lineEnd = /\r?\n/;

  // test() on a /g or /y regex resumes from lastIndex, which would make a
  // match on one line skip the beginning of the next. Always start at 0.
  function matches(line) {
    regex.lastIndex = 0;
    return regex.test(line);
  }

  // readableObjectMode is the documented way to get an object-mode readable
  // side; no need to modify the private _readableState after construction.
  return through2({readableObjectMode: true}, function transform(chunk, enc, cb) {
    // Decode the raw bytes through the StringDecoder *before* concatenating.
    // `last + chunk` would stringify the Buffer directly and corrupt any
    // multi-byte character that straddles a chunk boundary.
    var lines = (last + decoder.write(chunk)).split(lineEnd), i;
    // The final piece has no terminator yet; keep it for the next chunk.
    last = lines.pop();
    for (i = 0; i < lines.length; i++) {
      if (matches(lines[i])) this.push(lines[i]);
    }
    cb();
  }, function flush(cb) {
    // Drain any bytes the decoder is still holding back.
    last += decoder.end();
    // An empty remainder only means the input ended with a newline; it is
    // not an additional line and must not be reported as a match.
    if (last !== "" && matches(last)) this.push(last);
    cb();
  });
}
module.exports = grep;
and
// index.js
'use strict';
var fs = require('fs');
var zlib = require('zlib');
var grep = require('./linegrep');
/**
 * Stream the lines of a file that match `regex`.
 *
 * Files whose name ends in ".gz" are gunzipped on the fly. Errors raised by
 * the file reader or the gunzip stage are re-emitted on the returned stream,
 * because `.pipe()` does not propagate errors downstream; a single 'error'
 * listener on the result therefore covers the whole chain.
 *
 * @param {string} filename - Path of the plain-text or gzipped file to scan.
 * @param {RegExp} regex - Pattern handed to grep() for line matching.
 * @returns {Object} The stream returned by grep(), fed from the file.
 */
function grepFile(filename, regex) {
  var matches = grep(regex);

  function forwardError(err) {
    matches.emit('error', err);
  }

  // NOTE(review): the 172 KiB read buffer looks like a tuned value; the
  // rationale is not visible here, so it is kept unchanged.
  var source = fs.createReadStream(filename, {highWaterMark: 172 * 1024});
  source.on('error', forwardError);

  if (/\.gz$/.test(filename)) {
    var gunzip = zlib.createGunzip();
    gunzip.on('error', forwardError);
    source = source.pipe(gunzip);
  }

  return source.pipe(matches);
}
// -------------------------------------------------------------------------
// Demo driver: print every matching line of input.txt, then a summary with
// the number of matches and the elapsed wall-clock time.
var startedAt = Date.now();
var matchCount = 0;

// Echo a matching line and add it to the running tally.
function onMatch(line) {
  matchCount += 1;
  console.log(line);
}

// Report the totals once the stream has been fully consumed.
function onDone() {
  var elapsed = Date.now() - startedAt;
  console.log( matchCount + " matches, " + elapsed + " ms" );
}

grepFile('input.txt', /boot\.([a-z]+)_head\./)
  .on('data', onMatch)
  .on('end', onDone);
This turns a file stream into an object stream of lines, filters them through your regex, and emits only the matching lines.
Upvotes: 1