Reputation: 223
For example, I have the following documents in my collection:
{
    "_id" : "GuqXmAkkARqhBDqhy",
    "beatmapset_id" : "342537",
    "version" : "MX",
    "diff_approach" : "5",
    "artist" : "Yousei Teikoku",
    "title" : "Kokou no Sousei",
    "difficultyrating" : "3.5552737712860107"
}
{
    "_id" : "oHLT7KqsB7bztBGvu",
    "beatmapset_id" : "342537",
    "version" : "HD",
    "diff_approach" : "5",
    "artist" : "Yousei Teikoku",
    "title" : "Kokou no Sousei",
    "difficultyrating" : "2.7515676021575928"
}
{
    "_id" : "GbotZfrPEwW69FkGD",
    "beatmapset_id" : "342537",
    "version" : "NM",
    "diff_approach" : "5",
    "artist" : "Yousei Teikoku",
    "title" : "Kokou no Sousei",
    "difficultyrating" : "0"
}
These documents share the same beatmapset_id value.
I want to delete all the duplicates but keep the document with the highest difficultyrating.
I tried db.collection.ensureIndex({beatmapset_id: 1}, {unique: true, dropDups: true}), but it keeps an arbitrary document rather than the one matching that condition.
How can I do that?
Upvotes: 2
Views: 2253
Reputation: 61225
First you need to update your documents and convert difficultyrating
and beatmapset_id
to floating-point numbers; otherwise they are compared and sorted as strings (lexicographically) rather than numerically. To do that, loop over each document using the .forEach
method and update each one with "Bulk" operations for maximum efficiency.
var bulk = db.collection.initializeOrderedBulkOp();
var count = 0;

db.collection.find().forEach(function(doc) {
    // Queue an update that casts both fields to floating-point numbers.
    bulk.find({ '_id': doc._id }).update({
        '$set': {
            'beatmapset_id': parseFloat(doc.beatmapset_id),
            'difficultyrating': parseFloat(doc.difficultyrating)
        }
    });
    count++;
    if(count % 100 === 0) {
        // Execute every 100 operations and re-initialise the bulk builder.
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});

// Flush any remaining queued operations.
if(count % 100 !== 0) {
    bulk.execute();
}
Now, since the "dropDups" option for index creation was deprecated in MongoDB 2.6 and removed in MongoDB 3.0, this is how you can remove the dups instead.
The main idea is to first sort your documents by difficultyrating
in descending order, so that within each group the first _id belongs to the document you want to keep.
bulk = db.collection.initializeUnorderedBulkOp();
count = 0;

db.collection.aggregate([
    // Sort so the highest difficultyrating comes first in each group.
    { '$sort': { 'difficultyrating': -1 }},
    { '$group': { '_id': '$beatmapset_id', 'ids': { '$push': '$_id' }, 'count': { '$sum': 1 }}},
    { '$match': { 'count': { '$gt': 1 }}}
]).forEach(function(doc) {
    // Drop the first _id (the highest rating) from the list, then remove the rest.
    doc.ids.shift();
    bulk.find({ '_id': { '$in': doc.ids }}).remove();
    count++;
    if(count % 100 === 0) {
        bulk.execute();
        bulk = db.collection.initializeUnorderedBulkOp();
    }
});

// Flush any remaining queued operations.
if(count % 100 !== 0) {
    bulk.execute();
}
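With the duplicates gone, you can then create the unique index the question originally attempted, this time without the removed dropDups option. A minimal sketch, using createIndex (the current replacement for ensureIndex):

// Assumes the duplicates have already been removed; the build fails otherwise.
db.collection.createIndex({ 'beatmapset_id': 1 }, { unique: true });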
This answer covers the topic in more detail.
Upvotes: 3
Reputation: 103305
One approach you can take is to get a list of the unique ids of the documents with duplicate beatmapset_id
values via the aggregation framework:
db.collection.aggregate([
    {
        "$group": {
            "_id": "$beatmapset_id",
            "count": { "$sum": 1 },
            "uniqueIds": { "$addToSet": "$_id" },
            "maxRating": { "$max": "$difficultyrating" }
        }
    },
    {
        "$match": {
            "count": { "$gte": 2 }
        }
    },
    {
        "$sort" : { "count" : -1 }
    }
]);
In the first stage of this example pipeline, we use the $group
operator to aggregate documents by the desired index key values and record (in the uniqueIds field) each _id
value of the grouped documents. We also count the number of grouped documents by using the $sum
operator which adds up the values of the fields passed to it, in this case the constant 1 - thereby counting the number of grouped records into the count field. We also get the maximum difficultyrating
value of the group by using the $max
operator.
In the second stage of this example pipeline, we use the $match
operator to filter out all documents with a count of 1. The filtered-out documents represent unique index keys.
The remaining documents identify documents in the collection that contain duplicate keys.
Sample Output:
/* 0 */
{
    "result" : [
        {
            "_id" : "342537",
            "count" : 3,
            "uniqueIds" : [
                "GbotZfrPEwW69FkGD",
                "oHLT7KqsB7bztBGvu",
                "GuqXmAkkARqhBDqhy"
            ],
            "maxRating" : "3.5552737712860107"
        }
    ],
    "ok" : 1
}
Since the db.collection.aggregate()
method returns a cursor and can return result sets of any size, use the cursor's forEach()
method to iterate over the result documents and feed them into Bulk API remove
operations:
var pipeline = [
    {
        "$group": {
            "_id": "$beatmapset_id",
            "count": { "$sum": 1 },
            "uniqueIds": { "$addToSet": "$_id" },
            "maxRating": { "$max": "$difficultyrating" }
        }
    },
    {
        "$match": {
            "count": { "$gte": 2 }
        }
    },
    {
        "$sort" : { "count" : -1 }
    }
],
counter = 0,
bulk = db.collection.initializeOrderedBulkOp();

db.collection.aggregate(pipeline).forEach(function(doc) {
    // Remove every document in the group whose rating is below the group's
    // maximum, leaving only the highest-rated document behind.
    bulk.find({
        "_id": { "$in": doc.uniqueIds },
        "difficultyrating": { "$lt": doc.maxRating }
    }).remove();
    counter++;

    if ( counter % 500 == 0 ) {
        // Execute per 500 operations and re-init.
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});

// Catch any under or over the 500's and clean up queues
if (counter % 500 != 0)
    bulk.execute();
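As a quick sanity check, you can query one of the previously duplicated keys and confirm that only the highest-rated document is left. A hypothetical check against the sample data from the question:

// Only the "MX" document (difficultyrating 3.5552...) should remain.
db.collection.find({ "beatmapset_id": "342537" })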
Upvotes: 1