konsumer

Reputation: 3503

Save many records to CouchDB in Node.js

I have a very large dataset that I want to save in CouchDB for searchability.

I want the records to look like this:

{
  "type": "first",
  "name": "ryan",
  "count": 447980
}

Since the text files are too large to hold in memory, I am setting up a streaming readline reader, like so:

var db = require('./db'),
    readline = require('readline'),
    path = require('path'),
    fs = require('fs');

// simple callback after cradle save
function saveHandler(er, doc){
    if (er) return console.log('Error: ', er);
    console.log(doc);
}

// save record of type, based on line with count & name
function handleCountedLine(type){
    return function(line){
        var record = {type:type};
        var i = line.trim().split(' ');
        record.name = i[1].trim();
        record.count = Number(i[0]);
        db.save(record, saveHandler);
    }
}

var handleFirst = handleCountedLine('first');
readline.createInterface({
    input: fs.createReadStream('data/facebook-firstnames-withcount.txt'),
    terminal: false
})
.on('line', handleFirst);

Here, db is a cradle database handle.
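
For reference, a minimal sketch of what ./db.js might contain, assuming a local CouchDB instance and a database called "names" (both are assumptions, not taken from the question):

// db.js -- hypothetical cradle connection module
var cradle = require('cradle');

// connect to a local couchdb and export a single database handle
module.exports = new (cradle.Connection)('http://127.0.0.1', 5984, {
    cache: false,
    raw: false
}).database('names');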

After 40 records or so, it slows to a crawl and eventually runs out of memory. I tried poolr and node-rate-limiter, using "only run this many at a time" and "only allow this many per minute" strategies. Both work a little better, but it still runs out of memory. Is there a good way to accomplish this, or am I stuck writing it in Python?

Upvotes: 2

Views: 780

Answers (2)

skiqh

Reputation: 475

I guess CouchDB is the bottleneck here. Have a look at CouchDB's bulk document API, which lets you insert documents en masse. You should probably not try to commit all your data at once; instead, accumulate a batch of docs in an array and push that to the database, using stream.pause() and stream.resume() to throttle the text stream. CouchDB will reward you with noticeable efficiency gains if you use the bulk API.
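
Roughly like this (a sketch only -- it assumes cradle's save() accepts an array of documents and forwards it to CouchDB's _bulk_docs endpoint; the batch size of 500 is arbitrary):

var readline = require('readline'),
    fs = require('fs'),
    db = require('./db');

var BATCH_SIZE = 500;   // arbitrary; tune to taste
var batch = [];

var rl = readline.createInterface({
    input: fs.createReadStream('data/facebook-firstnames-withcount.txt'),
    terminal: false
});

// push the current batch to couchdb in one bulk request, then call done()
function flush(done){
    if (!batch.length) return done();
    var docs = batch;
    batch = [];
    db.save(docs, function(er, res){
        if (er) console.log('Bulk save error: ', er);
        done();
    });
}

rl.on('line', function(line){
    var i = line.trim().split(' ');
    batch.push({type: 'first', name: i[1], count: Number(i[0])});

    if (batch.length >= BATCH_SIZE){
        rl.pause();                          // throttle the text stream
        flush(function(){ rl.resume(); });
    }
});

// flush whatever is left over when the file ends
rl.on('close', function(){
    flush(function(){ console.log('done'); });
});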

Upvotes: -1

konsumer

Reputation: 3503

With awesome help from Paulo Machado in Google Hangouts, I made an answer using line-by-line, a simple wrapper that uses stream.pause() and stream.resume() to allow only a single line to be processed at a time. I'd like to give him the credit, but he hasn't come over here to post an answer, so I will just put this here. It has parsed 34,039 records so far; I will update the answer if it crashes.

var LineByLineReader = require('line-by-line'),
  path = require('path'),
  db = require('./db');

// line-by-line read file, turn into a couch record
function processFile(type){
  var fname = path.join('data', types[type] + '.txt');
  var lr = new LineByLineReader(fname, {skipEmptyLines: true});

  lr.on('error', function (err) {
    console.log('Error:');
    console.log(err);
  });

  lr.on('record', function (record) {
    console.log('Saved:');
    console.log(record);
  });

  lr.on('line', function (line) {
    lr.pause();
    var record = { type: type };

    if (type === 'full'){
      // the full-name file has no count column; keep the name parts as an array
      record.name = line.trim().split(' ');
    }else{
      var i = line.trim().split(' ');
      record.name = i[1].trim();
      record.count = Number(i[0]);
    }

    db.save(record, function(er, res){
      if (er) lr.emit('error', er, record);
      if (res) lr.emit('record', record);
      lr.resume();
    })
  });
}

var types = {
  'first':'facebook-firstnames-withcount',
  'last':'facebook-lastnames-withcount',
  'full':'facebook-names-unique'
};

for (var type in types){
  processFile(type);
}

// views for looking things up
db.save('_design/views', require('./views'));
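
For completeness, a sketch of what ./views.js might export. The view names and map functions below are illustrative assumptions, not part of the original answer; the map functions are written as strings so the module is a plain CouchDB design-document body:

// views.js -- hypothetical design document body
// (if your cradle version wraps the top-level keys in "views" for you,
// export only the inner object instead)
module.exports = {
  language: 'javascript',
  views: {
    byName: {
      map: "function (doc) { if (doc.name) emit(doc.name, doc.count); }"
    },
    byType: {
      map: "function (doc) { if (doc.type) emit(doc.type, 1); }",
      reduce: '_count'
    }
  }
};

You could then query with something like db.view('views/byName', {key: 'ryan'}, callback) to look records up by name.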

Upvotes: 2
