Reputation: 549
There are a number of questions and answers about randomly ordering results or randomly getting a single record. The answers recommend adding a random field, creating an index on that field, and then doing a random draw. It looks like:
db.myindex.find().forEach(function(doc) {
    db.myindex.update({ _id: doc._id }, { $set: { rand: Math.random() } });
});
This works great, but it takes several hours (lots and lots of data). It looks like it is limited by write locking, which makes sense since a separate update is issued for each record. How do I do this in bulk? I tried:
var bulk = db.myindex.initializeUnorderedBulkOp();
bulk.find({}).update({ $set: { rand: Math.random() } });
bulk.execute();
But it sets the rand field to the same value for every record! How do I fix this?
Edit: By the way, the reason I need to do this is that I get a huge BSON file from someone else and need to import it frequently, so I can't wait multiple hours for the update to finish.
Upvotes: 1
Views: 505
Reputation: 11
If the collection is just static data and you're getting a BSON file from someone else, it might be quicker to stream the BSON file through a filter that adds the random field, generating a new BSON file that you can then load with mongorestore (mongoimport only handles JSON/CSV/TSV, not BSON).
Here is a filter I wrote using Node.js that can process a BSON file at around 1 GB/min.
var bson = require('bson');
var BSON = new bson.BSONPure.BSON();
var BSONStream = require('bson-stream');
var fs = require('fs');
var sb = require('stream-buffers');

var rs = fs.createReadStream('tweets.bson');
var ws = fs.createWriteStream('tweets_random.bson', { flags: 'a' });

// Accumulate serialized documents here so disk writes happen in large chunks.
var writeBuffer = new sb.WritableStreamBuffer({
    initialSize: (1024 * 1024),
    incrementAmount: (10 * 1024)
});

rs.pipe(new BSONStream()).on('data', function(obj) {
    // Add the random field, then re-serialize the document.
    obj.rand = Math.random();
    writeBuffer.write(BSON.serialize(obj));

    // Flush to the output file once the buffer passes 1 MB.
    if (writeBuffer.size() > (1024 * 1024)) {
        var size = writeBuffer.size();
        ws.write(writeBuffer.getContents(), function() {
            console.log("Wrote", size, "bytes");
            console.log("Buffer has:", writeBuffer.size(), "bytes left");
        });
    }
}).on('end', function() {
    // Flush whatever remains in the buffer; without this the last
    // (up to 1 MB of) documents never make it into the output file.
    if (writeBuffer.size() > 0) {
        ws.write(writeBuffer.getContents());
    }
    ws.end();
});
It might go faster if you modify the buffer size/increment parameters.
This is of course assuming that you have the luxury of reimporting your data.
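Assuming the BSON file came from mongodump, the filtered output can be loaded back with a command along these lines (the database and collection names here are placeholders for your own):
mongorestore --db mydb --collection myindex tweets_random.bson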
Upvotes: 0
Reputation: 103375
Introduce a loop and send the bulk operations to the server once per 1000 documents, or as many modifications as you can fit under the 16 MB maximum BSON document size:
var bulk = db.myindex.initializeOrderedBulkOp();
var counter = 0;

db.myindex.find().forEach(function(doc) {
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "rand": Math.random() }
    });

    counter++;
    if (counter % 1000 == 0) {
        // Send this batch to the server and start a fresh one.
        bulk.execute();
        bulk = db.myindex.initializeOrderedBulkOp();
    }
});

// Execute any remaining operations that didn't fill a full batch.
if (counter % 1000 != 0) {
    bulk.execute();
}
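For what it's worth, on newer deployments (shell/server 3.2+) the same batching pattern can be written with db.collection.bulkWrite(); a minimal sketch of the equivalent:
var ops = [];
db.myindex.find({}, { _id: 1 }).forEach(function(doc) {
    ops.push({
        updateOne: {
            filter: { "_id": doc._id },
            update: { "$set": { "rand": Math.random() } }
        }
    });
    if (ops.length === 1000) {
        db.myindex.bulkWrite(ops, { ordered: false });
        ops = [];
    }
});
if (ops.length > 0) {
    db.myindex.bulkWrite(ops, { ordered: false });
}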
Upvotes: 1