Reputation: 3503
I have a very large dataset that I want to save in CouchDB for searchability.
I want the records to look like this:
{
  "type": "first",
  "name": "ryan",
  "count": 447980
}
Since the text files are too large to hold in memory, I am setting up a streaming readline reader, like so:
var db = require('./db'),
    readline = require('readline'),
    path = require('path'),
    fs = require('fs');

// simple callback after cradle save
function saveHandler(er, doc) {
  if (er) return console.log('Error: ', er);
  console.log(doc);
}

// save a record of the given type, based on a "<count> <name>" line
function handleCountedLine(type) {
  return function (line) {
    var record = { type: type };
    var i = line.trim().split(' ');
    record.name = i[1].trim();
    record.count = Number(i[0]);
    db.save(record, saveHandler);
  };
}

var handleFirst = handleCountedLine('first');

readline.createInterface({
  input: fs.createReadStream('data/facebook-firstnames-withcount.txt'),
  terminal: false
})
.on('line', handleFirst);
db is a cradle db.
After 40 records or so, it slows to a total crawl, then eventually runs out of memory. I tried poolr and node-rate-limiter, using "only run this many at a time" and "only allow this many to run in a minute" strategies. Both work a little better, but it still runs out of memory. Is there a good way to accomplish this, or am I stuck writing it in Python?
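For what it's worth, the "only run this many at a time" strategy boils down to something like this hand-rolled limiter (a rough sketch, not the actual poolr or node-rate-limiter code; the names are made up):

// Rough sketch of the "only run this many at a time" idea -- not the
// actual poolr / node-rate-limiter API, just a hand-rolled equivalent.
function limitConcurrency(limit, task) {
  var active = 0, queue = [];
  function next() {
    if (active >= limit || queue.length === 0) return;
    active++;
    task(queue.shift(), function done() {
      active--;
      next();
    });
  }
  return function (item) {
    queue.push(item);
    next();
  };
}

// At most 5 saves in flight at once; everything else waits in the queue
// (which is, presumably, exactly where the memory goes).
var limitedSave = limitConcurrency(5, function (record, done) {
  db.save(record, function (er, doc) {
    saveHandler(er, doc);
    done();
  });
});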
Upvotes: 2
Views: 780
Reputation: 475
I suspect CouchDB is the bottleneck here. Have a look at CouchDB's bulk document API, which lets you insert documents en masse. (You should probably not try to commit all your data at once; instead, accumulate a batch of docs in an array and push that to the database, using stream.pause() and stream.resume() to throttle the text stream.) CouchDB will reward you with efficiency gains if you use the bulk API.
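As a rough sketch of that batching approach, posting straight to the _bulk_docs endpoint over HTTP -- the host, port, database name ('names') and batch size below are placeholders you would adapt (or swap in whatever bulk interface cradle exposes):

var http = require('http'),
    readline = require('readline'),
    fs = require('fs');

var BATCH_SIZE = 500;   // placeholder -- tune to taste
var batch = [];

// POST a batch of docs to CouchDB's _bulk_docs endpoint.
function bulkSave(docs, done) {
  var body = JSON.stringify({ docs: docs });
  var req = http.request({
    host: 'localhost',                   // placeholder host/port/db name
    port: 5984,
    method: 'POST',
    path: '/names/_bulk_docs',
    headers: { 'Content-Type': 'application/json' }
  }, function (res) {
    res.on('data', function () {});      // drain the response
    res.on('end', done);
  });
  req.end(body);
}

var input = fs.createReadStream('data/facebook-firstnames-withcount.txt');

readline.createInterface({ input: input, terminal: false })
  .on('line', function (line) {
    var parts = line.trim().split(' ');
    batch.push({ type: 'first', name: parts[1], count: Number(parts[0]) });
    if (batch.length >= BATCH_SIZE) {
      input.pause();                     // throttle the text stream
      bulkSave(batch, function () {
        batch = [];
        input.resume();
      });
    }
  })
  .on('close', function () {
    if (batch.length) bulkSave(batch, function () {});  // flush the tail
  });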
Upvotes: -1
Reputation: 3503
With awesome help from Paulo Machado in Google Hangouts, I made an answer using line-by-line, a simple wrapper that uses stream.pause() and stream.resume() to only allow a single line to be processed at a time. I'd like to give him the credit, but he hasn't come over here to post an answer, so I will just put this here. It has parsed 34039 records so far. I will update the answer if it crashes.
var LineByLineReader = require('line-by-line'),
    path = require('path'),
    db = require('./db');

// read a file line by line and turn each line into a couch record
function processFile(type) {
  var fname = path.join('data', types[type] + '.txt');
  var lr = new LineByLineReader(fname, { skipEmptyLines: true });

  lr.on('error', function (err) {
    console.log('Error:');
    console.log(err);
  });

  lr.on('record', function (record) {
    console.log('Saved:');
    console.log(record);
  });

  lr.on('line', function (line) {
    lr.pause();
    var record = { type: type };
    if (type == 'full') {
      record.name = line.trim().split(' ');
    } else {
      var i = line.trim().split(' ');
      record.name = i[1].trim();
      record.count = Number(i[0]);
    }
    db.save(record, function (er, res) {
      if (er) lr.emit('error', er, record);
      if (res) lr.emit('record', record);
      lr.resume();
    });
  });
}

var types = {
  'first': 'facebook-firstnames-withcount',
  'last': 'facebook-lastnames-withcount',
  'full': 'facebook-names-unique'
};

for (var type in types) {
  processFile(type);
}

// views for looking things up
db.save('_design/views', require('./views'));
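For reference, ./views just exports a plain CouchDB design document, roughly like the sketch below; the view names and map functions here are only an illustration, not the real file:

// Roughly what ./views.js exports: a design document with one view
// per record type, keyed on name. (Illustrative only.)
module.exports = {
  views: {
    firstNames: {
      map: function (doc) {
        if (doc.type === 'first') emit(doc.name, doc.count);
      }
    },
    lastNames: {
      map: function (doc) {
        if (doc.type === 'last') emit(doc.name, doc.count);
      }
    },
    fullNames: {
      map: function (doc) {
        if (doc.type === 'full') emit(doc.name, null);
      }
    }
  }
};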
Upvotes: 2