Reputation: 36227
I'm working with mongo and node. I have a collection with a large number of records, an unknown number of which are duplicates. I'm trying to remove the duplicates following Remove duplicate records from mongodb 4.0 and https://docs.mongodb.com/manual/aggregation/ .
I am using the MongoDB Compass tool and am able to run code in the MongoDB shell at the bottom of the tool.
So far I have:
db.hayes.aggregate([
    { $group: { _id: "$PropertyId", count: { $sum: 1 } } }
]);
{ "_id" : "R135418", "count" : 10 }
{ "_id" : "R47410", "count" : 17 }
{ "_id" : "R130794", "count" : 10 }
{ "_id" : "R92923", "count" : 18 }
{ "_id" : "R107811", "count" : 11 }
{ "_id" : "R91389", "count" : 15 }
{ "_id" : "R22047", "count" : 12 }
{ "_id" : "R103664", "count" : 10 }
{ "_id" : "R121349", "count" : 12 }
{ "_id" : "R143168", "count" : 8 }
{ "_id" : "R85918", "count" : 13 }
{ "_id" : "R41641", "count" : 13 }
{ "_id" : "R160910", "count" : 11 }
{ "_id" : "R48919", "count" : 11 }
{ "_id" : "M119387", "count" : 10 }
{ "_id" : "R161734", "count" : 12 }
{ "_id" : "R41259", "count" : 13 }
{ "_id" : "R156538", "count" : 7 }
{ "_id" : "R60868", "count" : 10 }
How do I now select one document from each group, to get rid of the duplicates? (I can see that loading the result into a new collection will likely involve {$out: "theCollectionWithoutDuplicates"}.)
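My guess, assuming MongoDB 3.4+ for $replaceRoot, is something along these lines, keeping one (arbitrary) document per PropertyId and writing the result to a new collection (untested):

db.hayes.aggregate([
    // pick one arbitrary document per PropertyId
    { $group: { _id: "$PropertyId", doc: { $first: "$$ROOT" } } },
    // restore the original document shape
    { $replaceRoot: { newRoot: "$doc" } },
    // write the deduplicated set to a new collection
    { $out: "theCollectionWithoutDuplicates" }
])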
edit:
The output from

db.hayes.aggregate([
    { $group: { _id: "$PropertyId", count: { $sum: 1 }, ids: { $addToSet: "$_id" } } },
    { $match: { count: { $gt: 1 } } }
])

looks like:
{ "_id" : "M118975", "count" : 8, "ids" : [ ObjectId("60147f84e9fdd41da73272d6"), ObjectId("601427ac432deb152a70b8fd"), ObjectId("6014639be210571a70d1118f"), ObjectId("60145e9ae210571a70d0062f"), ObjectId("60145545b6f7a917817e9519"), ObjectId("6014619be210571a70d0a091"), ObjectId("60145dc3d5a2811a459b4e07"), ObjectId("60146641e210571a70d1a3cd") ] }
{ "_id" : "R88986", "count" : 10, "ids" : [ ObjectId("60131752de3d3a09bc1eb04b"), ObjectId("6013348385dcda0eb5b8d40c"), ObjectId("60145297b6f7a917817e1928"), ObjectId("601458eeb08c4919df85f63d"), ObjectId("601462f4e210571a70d0e961"), ObjectId("60142ad9c0db1716068a612e"), ObjectId("601425263df18a145b2fd0a8"), ObjectId("60145be5d5a2811a459aea7e"), ObjectId("6014634ce210571a70d0fe5c"), ObjectId("60131a1ab7335806a1816b95") ] }
{ "_id" : "P119977", "count" : 11, "ids" : [ ObjectId("601468b9597abd1bfd0798a4"), ObjectId("60144b7dbfa28016887b0e8f"), ObjectId("60147094c4bca31cfdb12d1d"), ObjectId("60144de7bfa28016887b698b"), ObjectId("60135aa63674d90dffec3759"), ObjectId("60135f552441920e97e858a3"), ObjectId("601428b3432deb152a70f32e"), ObjectId("60141b222ac11f13055725a5"), ObjectId("60145326b6f7a917817e38b6"), ObjectId("6014882c5322582035e83f63"), ObjectId("6014741ae9fdd41da7313a44") ] }
However, when I run the forEach loop it runs for several minutes and then crashes.
Originally the database mydb was 0.173 GB, but it is now 0.368 GB.
Any idea what is wrong?
edit 2:
I rebooted, then reran your entire script. This time it completed in 3-4 minutes with no errors.
> show dbs
admin 0.000GB
config 0.000GB
local 0.000GB
myNewDatabase 0.000GB
mydb 0.396GB
> db.hayes.aggregate([ {"$group" : {_id:"$PropertyId", count:{$sum:1}}},{$count:"total"} ]);
{ "total" : 103296 }
> db.hayes.aggregate([ {"$group" : {_id:"$PropertyId", count:{$sum:1}}} ]);
{ "_id" : "R96274", "count" : 1 }
{ "_id" : "R106186", "count" : 1 }
{ "_id" : "R169417", "count" : 1 }
{ "_id" : "R140542", "count" : 1 }
So it looks like it worked this time, but why is 'mydb' getting larger?
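I suppose a direct way to double-check would be to count the groups that still contain more than one document; if the cleanup worked, this should return no result:

db.hayes.aggregate([
    { $group: { _id: "$PropertyId", count: { $sum: 1 } } },
    // only groups that still have duplicates
    { $match: { count: { $gt: 1 } } },
    { $count: "duplicateGroups" }
]);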
Upvotes: 1
Views: 563
Reputation: 10707
Here is how to keep a single document from every group of duplicates and remove the rest:
db.test.aggregate([
    {
        // group by PropertyId, count the documents in each group,
        // and collect their _id values
        $group: {
            _id: "$PropertyId",
            count: { $sum: 1 },
            ids: { $addToSet: "$_id" }
        }
    },
    {
        // keep only the groups that actually contain duplicates
        $match: {
            count: { $gt: 1 }
        }
    }
]).forEach(function(d) {
    // drop the first _id from the list: that document survives
    d.ids.shift();
    printjson(d.ids);
    // delete the remaining documents of this group by _id
    db.test.remove({
        _id: {
            $in: d.ids
        }
    });
});
Explained:
1. $group groups the documents by PropertyId, counts them, and collects all of their _id values into the ids array.
2. $match keeps only the groups that actually contain duplicates (count greater than 1).
3. For every such group, shift() removes the first _id from the array (that document is kept), and remove() deletes the rest by _id.

As for mydb growing: deleting documents does not shrink the data files. WiredTiger keeps the freed space and reuses it for new writes, so the reported size will not go down unless you run compact on the collection or rebuild the database.
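On a large collection, issuing one remove() per duplicated group means a lot of round trips, which may be why your first run crawled and crashed. Here is a sketch of the same cleanup with the deletions batched through bulkWrite (assumes MongoDB 3.2+; not tested against your data):

var toDelete = [];
db.test.aggregate(
    [
        { $group: { _id: "$PropertyId", count: { $sum: 1 }, ids: { $addToSet: "$_id" } } },
        { $match: { count: { $gt: 1 } } }
    ],
    { allowDiskUse: true }  // let a large $group spill to disk
).forEach(function(d) {
    d.ids.shift();                        // keep one document per PropertyId
    toDelete = toDelete.concat(d.ids);    // queue the rest for deletion
});

// delete in chunks of 1000 ids instead of one remove() per group
while (toDelete.length) {
    db.test.bulkWrite([
        { deleteMany: { filter: { _id: { $in: toDelete.splice(0, 1000) } } } }
    ]);
}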
Upvotes: 1