Reputation: 493
My data looks like this:
{
"_id":ObjectId("516fbf68067323ce2ea5b4b8"),
"title":"GVPKFlFIXdLUaLM",
"release_year":1913,
"country_of_origin":"sWdXLXUfun",
"length_in_minutes":147,
"plot_summary":"bmwYkyyiSymHJYoXEPauPNjdKoFANDgcDImVelDGPuPJmLhyWOuNXjurNyGp",
"director":"rNDFhhxGIo",
"language":"oYeWskT",
"popularity":5.2,
"genre":"jDwdaMhuT",
"actors":[
{
"id":2740,
"name":"actor2740",
"dob":1989,
"alt_name":"PBpXPqJwmftpfcR",
"pob":"DFoxETDuhAdDGNE"
},
{
"id":3143,
"name":"actor3143",
"dob":1953,
"alt_name":"AHnVvTviSKuvNZO",
"pob":"KBUdvbnvNkXmddk"
}
]
}
At first I thought there was a bug in Mongo. I attempted to solve a hypothetical business problem using the aggregate function. (edit: I didn't mean to say that I resolved a mongo issue or that I want people to help me create an algorithm, only to confirm a potential bug with MongoDB)
db.movies.aggregate([{$match:{popularity:{$gte:7.3}}},
{$project:{actors:1,popularity:1}},
{$unwind:"$actors"},
{$group:{_id:"$actors.id",avgPop:{$avg:"$popularity"},
docsByTag : { $sum : 1 }, popSum:{$sum:"$popularity"}}},
{$match:{avgPop:{$gte:7.5}}}]);
result I focused on (edit $sum: 1 not 0)
{
"_id" : 1383,
"avgPop" : 8.772857142857141,
"docsByTag" : 28,
"popSum" : 245.63999999999996
},
But when I verified the result by hand.
db.movies.find({"actors.name":"actor1383"},{title:1,popularity:1,_id: 0})
{ "title" : "kZFfBwtAfVNobEq", "popularity" : 8.54 }
{ "title" : "kyOeSorYUWyJmjK", "popularity" : 8.11 }
{ "title" : "rvSdJCgEkkpYgFB", "popularity" : 8.36 }
{ "title" : "SwcgHTgZqqcYJja", "popularity" : 8.68 }
{ "title" : "XmcidmdwtDlNoKw", "popularity" : 7.33 }
{ "title" : "gwThvrWifoKCvyG", "popularity" : 7.94 }
{ "title" : "RdUsAFIxTnntTZR", "popularity" : 6.91 }
{ "title" : "RwhJlORFdvtDtpO", "popularity" : 5.13 }
{ "title" : "TuDfcWhNkQFeycl", "popularity" : 9.93 }
{ "title" : "xTVkwnyvftKQraC", "popularity" : 7.27 }
{ "title" : "HYMjUFlSXgnWVTx", "popularity" : 6.94 }
{ "title" : "ZPPyAUdGMeVQhbK", "popularity" : 8.48 }
{ "title" : "kEITAiMMrWTECGM", "popularity" : 9.42 }
{ "title" : "asNsLYKjvHlihXZ", "popularity" : 9.86 }
{ "title" : "ctEmciXPhbMtspt", "popularity" : 8.85 }
{ "title" : "DHjFtctccwDHtlf", "popularity" : 5.5 }
{ "title" : "ElUqbLqkoKrJPVl", "popularity" : 8.26 }
{ "title" : "XdTCieKsWtTbfZa", "popularity" : 5.72 }
{ "title" : "EeNqOPSuKiHuWRs", "popularity" : 5.91 }
{ "title" : "YgysqxcesvPryMY", "popularity" : 6.05 }
{ "title" : "eARvpGydsWilquc", "popularity" : 7.34 }
{ "title" : "NDpdkhSUfePDYjH", "popularity" : 7.28 }
{ "title" : "wUGKLBwijftQKgU", "popularity" : 8.97 }
{ "title" : "UHVGUmAcjBgAPBp", "popularity" : 7.44 }
{ "title" : "NKTKEKfbxFrudVi", "popularity" : 9.4 }
{ "title" : "AeByTKwsEQuQBYG", "popularity" : 8.97 }
{ "title" : "nZskARfGbhYRxdY", "popularity" : 9.16 }
{ "title" : "nBenZrikXFFrrnq", "popularity" : 7.58 }
{ "title" : "GdEFwoKgqjhHvjM", "popularity" : 6.3 }
{ "title" : "grpKTHgnYcDNyXH", "popularity" : 7.16 }
{ "title" : "hXhOqknvjIYJIaT", "popularity" : 5.24 }
{ "title" : "rggTJENnVeuqQVI", "popularity" : 9.95 }
{ "title" : "ABvGVFHkgOumMPO", "popularity" : 9.56 }
{ "title" : "baVkepHniIURUFH", "popularity" : 9.28 }
{ "title" : "PUYXlhPwbanMDmT", "popularity" : 9.6 }
{ "title" : "IJbqonvsVeorDMv", "popularity" : 7.82 }
{ "title" : "iAhyATKYpCVjtMw", "popularity" : 5.88 }
{ "title" : "uDECLFQGTOVnyvC", "popularity" : 6.25 }
{ "title" : "rTwfCYLfLwgPcbH", "popularity" : 8.38 }
{ "title" : "GRyKjecBHQhvYJk", "popularity" : 9.11 }
{ "title" : "GyEaSHoprUvGmZM", "popularity" : 9.92 }
which gives a subset of 27 elements greater or equal to 7.3
{ "title" : "kZFfBwtAfVNobEq", "popularity" : 8.54 }
{ "title" : "kyOeSorYUWyJmjK", "popularity" : 8.11 }
{ "title" : "rvSdJCgEkkpYgFB", "popularity" : 8.36 }
{ "title" : "SwcgHTgZqqcYJja", "popularity" : 8.68 }
{ "title" : "XmcidmdwtDlNoKw", "popularity" : 7.33 }
{ "title" : "gwThvrWifoKCvyG", "popularity" : 7.94 }
{ "title" : "TuDfcWhNkQFeycl", "popularity" : 9.93 }
{ "title" : "ZPPyAUdGMeVQhbK", "popularity" : 8.48 }
{ "title" : "kEITAiMMrWTECGM", "popularity" : 9.42 }
{ "title" : "asNsLYKjvHlihXZ", "popularity" : 9.86 }
{ "title" : "ctEmciXPhbMtspt", "popularity" : 8.85 }
{ "title" : "ElUqbLqkoKrJPVl", "popularity" : 8.26 }
{ "title" : "eARvpGydsWilquc", "popularity" : 7.34 }
{ "title" : "wUGKLBwijftQKgU", "popularity" : 8.97 }
{ "title" : "UHVGUmAcjBgAPBp", "popularity" : 7.44 }
{ "title" : "NKTKEKfbxFrudVi", "popularity" : 9.4 }
{ "title" : "AeByTKwsEQuQBYG", "popularity" : 8.97 }
{ "title" : "nZskARfGbhYRxdY", "popularity" : 9.16 }
{ "title" : "nBenZrikXFFrrnq", "popularity" : 7.58 }
{ "title" : "rggTJENnVeuqQVI", "popularity" : 9.95 }
{ "title" : "ABvGVFHkgOumMPO", "popularity" : 9.56 }
{ "title" : "baVkepHniIURUFH", "popularity" : 9.28 }
{ "title" : "PUYXlhPwbanMDmT", "popularity" : 9.6 }
{ "title" : "IJbqonvsVeorDMv", "popularity" : 7.82 }
{ "title" : "rTwfCYLfLwgPcbH", "popularity" : 8.38 }
{ "title" : "GRyKjecBHQhvYJk", "popularity" : 9.11 }
{ "title" : "GyEaSHoprUvGmZM", "popularity" : 9.92 }
one less than the aggregate function.
So I thought may be aggregate is broken and rewrote this as mapReduce
// make sure we're using the right db; this is the same as "use aggdb;" in shell
db = db.getSiblingDB("recommendations"); //Put your MongoLab database name here.
var mapFunc2 = function() {
for (var idx = 0; idx < this.actors.length; idx++) {
var key = this.actors[idx].id;
var value = {
count: 1,
pop: this.popularity
};
emit(key, value);
}
};
var reduceFunc2 = function(keyActor, countObjVals) {
reducedVal = { actor: keyActor, count: 0, pop: 0, pop_list : [] };
for (var idx = 0; idx < countObjVals.length; idx++) {
reducedVal.count += countObjVals[idx].count;
reducedVal.pop += countObjVals[idx].pop;
reducedVal.pop_list = reducedVal.pop_list.concat(countObjVals[idx].pop);
}
return reducedVal;
};
var finalizeFunc2 = function (key, reducedVal) {
reducedVal.avg = reducedVal.pop/reducedVal.count;
return reducedVal;
};
result = db.movies.mapReduce( mapFunc2,
reduceFunc2,
{
out: { merge: "mre" },
query: { popularity:
{ $gte: 7.3 }
},
finalize: finalizeFunc2
}
)
cursor = db.map_reduce_example.find()
while(cursor.hasNext()){
printjson(cursor.next());
}
The result is again off by one
{
"_id" : 1383,
"value" : {
"actor" : 1383,
"count" : 28,
"pop" : 245.63999999999996,
"avg" : 8.772857142857141
}
}
So I started debugging and I see some strange things when it comes to saving the popularity of each individual movie in the array.
{ "_id" : 1, "value" : { "actor" : 1, "count" : 13, "pop" : 114.97, "pop_list" : [ 7.47, 8.52, 9.95, 17.4, 7.4, 19.43, 8.46, 17.21, 9.24, 9.89 ], "avg" : 8.843846153846155 } }
Here, it's strange that the count is 13 but the number of elements is 10. This is because of
7.4 7.4
7.47 7.47
8.07 1
8.14 2
8.46 8.46
8.52 8.52
9.14 1
9.24 9.24
9.26 2
9.57 3
9.86 3
9.89 9.89
9.95 9.95
where 1,2,3 correspond to
1 17.21=9.14+8.07
2 17.4=8.14+9.26
3 19.43=9.57+9.86
{ "_id" : 2, "value" : { "actor" : 2, "count" : 14, "pop" : 120.91999999999999, "pop_list" : [ 35.239999999999995, 7.58, 35.56, 9.35, 25.839999999999996, 7.35 ], "avg" : 8.637142857142857 } } However, the above is totally cryptic since all my averages have only 2 decimals place of precision.
Really confused at this point. I'm sure this post could be helpful to others who are stuck figuring the same type of counting problems.
Upvotes: 1
Views: 2336
Reputation: 42362
It's highly unlikely that aggregation framework and mapreduce are both making a "mistake" so I would ask you to verify how you are comparing their results with your expectation.
In your aggregations, you group on "actors.id"
field. But your query to verify things manually is:
db.movies.find({"actors.name":"actor1383"},{title:1,popularity:1,_id: 0})
Is there evidence that your "actors.name" and "actors.id" fields match up 100%?
The precision of higher than 2 digits is normal when you do floating point arithmetic and nothing to worry about. It's no different than asking for average of 5 and 10 and getting 7.5 even though 5 and 10 both had no digits after the decimal point.
There is another place where the "difference" may come from. If you have a document like this:
{ popularity: 7.6, actors: [ { id: 1383, ... ... }, { id: 1383, ... ... } }
You will now have only one top level document that this contributes to, but when you unwind the actors array you now have TWO document that result from this where both have actor.id 1383. Can you verify that each actor only appears once per top level document? If not that would lead to the discrepancy you see.
Upvotes: 1