Reputation: 6263
I've seen a number of solutions on this, however they are all for Mongo v2 and are not suitable for V3.
My document looks like this:
{
"_id" : ObjectId("582c98667d81e1d0270cb3e9"),
"asin" : "B01MTKPJT1",
"url" : "https://www.amazon.com/Trump-President-Presidential-Victory-T-Shirt/dp/B01MTKPJT1%3FSubscriptionId%3DAKIAIVCW62S7NTZ2U2AQ%26tag%3Dselfbalancingscooters-21%26linkCode%3Dxm2%26camp%3D2025%26creative%3D165953%26creativeASIN%3DB01MTKPJT1",
"image" : "http://ecx.images-amazon.com/images/I/41RvN8ud6UL.jpg",
"salesRank" : NumberInt(442137),
"title" : "Trump Wins 45th President Presidential Victory T-Shirt",
"brand" : "\"Getting Political On Me\"",
"favourite" : false,
"createdAt" : ISODate("2016-11-16T17:33:26.763+0000"),
"updatedAt" : ISODate("2016-11-16T17:33:26.763+0000")
}
and my collection contains around 500k documents. I want to remove all duplicate documents (except for 1) where the ASIN is the same
How can I achieve this?
Upvotes: 10
Views: 15062
Reputation: 7665
Use a for loop, it will take time but will do the work
db.amazon_sales.find({}, {asin:1}).sort({_id:1}).forEach(function(doc){
db.amazon_sales.remove({_id:{$gt:doc._id}, asin:doc.asin});
})
Then and this index
db.amazon_sales.createIndex( { "asin": 1 }, { unique: true } )
Upvotes: 3
Reputation: 61225
This is something we can actually do using the aggregation framework and without client side processing.
db.collection.aggregate(
[
{ "$sort": { "_id": 1 } },
{ "$group": {
"_id": "$asin",
"doc": { "$first": "$$ROOT" }
}},
{ "$replaceRoot": { "newRoot": "$doc" } },
{ "$out": "collection" }
]
)
db.collection.aggregate(
[
{ "$sort": { "_id": 1 } },
{ "$group": {
"_id": "$asin",
"doc": { "$first": "$$ROOT" }
}},
{ "$project": {
"asin": "$doc.asin",
"url": "$doc.url",
"image": "$doc.image",
"salesRank": "$doc.salesRank",
"title": "$doc.salesRank",
"brand": "$doc.brand",
"favourite": "$doc.favourite",
"createdAt": "$doc.createdAt",
"updatedAt": "$doc.updatedAt"
}},
{ "$out": "collection" }
]
)
Upvotes: 28