Reputation: 9121
I want to update large numbers (> 100,000) of documents most efficiently.
My first naive approach was doing it on the JS level, writing scripts that fetch _ids first, then loop through _ids and invoke updates by _id (full docs or $set patches).
I ran into memory issues, also sharding the data into chunks of max. 500 documents (with opening and closing the connection) doesn't seem to work well.
So how can i solve this on the MongoDB level?
Best practice?
I have 3 common use cases, typically maintenance work flows:
1. Change type of value of property, without changing the value.
// before
{
timestamp : '1446987395'
}
// after
{
timestamp : 1446987395
}
2. Add new property based on value of existing property.
// before
{
firstname : 'John',
lastname : 'Doe'
}
// after
{
firstname : 'John',
lastname : 'Doe',
name : 'John Doe'
}
3. Simply adding removing properties from documents.
// before
{
street : 'Whatever Ave',
street_no : '1025'
}
// after
{
street : 'Whatever Ave',
no : '1025'
}
Thanks for helping out.
Upvotes: 8
Views: 13039
Reputation: 103365
If your MongoDB server is 2.6 or newer, it would be better to take advantage of using a write commands Bulk API that allow for the execution of bulk update
operations which are simply abstractions on top of the server to make it easy to build bulk operations. These bulk operations come mainly in two flavours:
Note, for older servers than 2.6 the API will downconvert the operations. However it's not possible to downconvert 100% so there might be some edge cases where it cannot correctly report the right numbers.
For your three common use cases, you could implement the Bulk API like this:
Case 1. Change type of value of property, without changing the value:
var MongoClient = require('mongodb').MongoClient;

// Case 1. Change type of value of property, without changing the value:
// every string-typed `timestamp` is rewritten as a number, sent to the server
// in ordered bulk batches of 1000 update operations.
MongoClient.connect("mongodb://localhost:27017/test", function(err, db) {
    // Handle error
    if (err) throw err;

    // Get the collection and bulk api artefacts
    var col = db.collection('users'),
        bulk = col.initializeOrderedBulkOp(), // batch currently being filled
        counter = 0,        // update operations queued so far
        pending = 0,        // batches sent to the server and not yet acknowledged
        exhausted = false;  // true once the cursor has handed over every document

    // Close the connection only when the cursor is finished AND every batch
    // that was sent has been acknowledged.
    function closeWhenDone() {
        if (exhausted && pending === 0) db.close();
    }

    // Send the current batch. The fresh batch is swapped in BEFORE execute()
    // is called, so documents that arrive while the write is in flight are
    // never queued on a batch that has already been executed.
    function flush() {
        var batch = bulk;
        bulk = col.initializeOrderedBulkOp();
        pending++;
        batch.execute(function(err, result) {
            if (err) { db.close(); throw err; }
            pending--;
            closeWhenDone();
        });
    }

    // "$type": 2 restricts the query to values stored as strings, so
    // documents that were already converted are not selected again.
    col.find({"timestamp": {"$exists": true, "$type": 2} }).each(function (err, doc) {
        if (err) { db.close(); throw err; }

        // each() signals the end of the cursor with a null document. The
        // final flush has to happen here, inside the callback: code placed
        // after the each() call runs before any document has been delivered.
        if (doc === null) {
            exhausted = true;
            // Only flush a partially filled batch; an empty batch must not be executed.
            if (counter % 1000 !== 0) flush();
            closeWhenDone();
            return;
        }

        // Always pass the radix so the string is read as a decimal number.
        var newTimestamp = parseInt(doc.timestamp, 10);

        // Leave non-numeric strings untouched instead of overwriting them with NaN.
        if (isNaN(newTimestamp)) return;

        bulk.find({ "_id": doc._id }).updateOne({
            "$set": { "timestamp": newTimestamp }
        });

        counter++;
        if (counter % 1000 === 0) flush();
    });
});
Case 2. Add new property based on value of existing property:
// Case 2. Add new property based on value of existing property:
// every document without a `name` gets one built from `firstname` and
// `lastname`, sent to the server in ordered bulk batches of 1000 updates.
MongoClient.connect("mongodb://localhost:27017/test", function(err, db) {
    // Handle error
    if (err) throw err;

    // Get the collection and bulk api artefacts
    var col = db.collection('users'),
        bulk = col.initializeOrderedBulkOp(), // batch currently being filled
        counter = 0,        // update operations queued so far
        pending = 0,        // batches sent to the server and not yet acknowledged
        exhausted = false;  // true once the cursor has handed over every document

    // Close the connection only when the cursor is finished AND every batch
    // that was sent has been acknowledged.
    function closeWhenDone() {
        if (exhausted && pending === 0) db.close();
    }

    // Send the current batch. The fresh batch is swapped in BEFORE execute()
    // is called, so documents that arrive while the write is in flight are
    // never queued on a batch that has already been executed.
    function flush() {
        var batch = bulk;
        bulk = col.initializeOrderedBulkOp();
        pending++;
        batch.execute(function(err, result) {
            if (err) { db.close(); throw err; }
            pending--;
            closeWhenDone();
        });
    }

    col.find({"name": {"$exists": false } }).each(function (err, doc) {
        if (err) { db.close(); throw err; }

        // each() signals the end of the cursor with a null document. The
        // final flush has to happen here, inside the callback: code placed
        // after the each() call runs before any document has been delivered.
        if (doc === null) {
            exhausted = true;
            // Only flush a partially filled batch; an empty batch must not be executed.
            if (counter % 1000 !== 0) flush();
            closeWhenDone();
            return;
        }

        var fullName = doc.firstname + " " + doc.lastname;
        bulk.find({ "_id": doc._id }).updateOne({
            "$set": { "name": fullName }
        });

        counter++;
        if (counter % 1000 === 0) flush();
    });
});
Case 3. Simply adding removing properties from documents.
// Case 3. Simply adding/removing properties from documents:
// the value of `street_no` is copied to a new `no` property and `street_no`
// is removed, sent to the server in ordered bulk batches of 1000 updates.
MongoClient.connect("mongodb://localhost:27017/test", function(err, db) {
    // Handle error
    if (err) throw err;

    // Get the collection and bulk api artefacts
    var col = db.collection('users'),
        bulk = col.initializeOrderedBulkOp(), // batch currently being filled
        counter = 0,        // update operations queued so far
        pending = 0,        // batches sent to the server and not yet acknowledged
        exhausted = false;  // true once the cursor has handed over every document

    // Close the connection only when the cursor is finished AND every batch
    // that was sent has been acknowledged.
    function closeWhenDone() {
        if (exhausted && pending === 0) db.close();
    }

    // Send the current batch. The fresh batch is swapped in BEFORE execute()
    // is called, so documents that arrive while the write is in flight are
    // never queued on a batch that has already been executed.
    function flush() {
        var batch = bulk;
        bulk = col.initializeOrderedBulkOp();
        pending++;
        batch.execute(function(err, result) {
            if (err) { db.close(); throw err; }
            pending--;
            closeWhenDone();
        });
    }

    col.find({"street_no": {"$exists": true } }).each(function (err, doc) {
        if (err) { db.close(); throw err; }

        // each() signals the end of the cursor with a null document. The
        // final flush has to happen here, inside the callback: code placed
        // after the each() call runs before any document has been delivered.
        if (doc === null) {
            exhausted = true;
            // Only flush a partially filled batch; an empty batch must not be executed.
            if (counter % 1000 !== 0) flush();
            closeWhenDone();
            return;
        }

        // NOTE(review): a pure rename like this one can presumably also be
        // expressed with the $rename update operator — confirm before switching.
        bulk.find({ "_id": doc._id }).updateOne({
            "$set": { "no": doc.street_no },
            "$unset": { "street_no": "" }
        });

        counter++;
        if (counter % 1000 === 0) flush();
    });
});
Upvotes: 12