Reputation: 689
I am trying to use the aggregation framework in MongoDB for some data stats. The query below takes barely a minute when run locally, but when I run the same query on the server it never returns a response; after waiting far too long I had to cancel it. Can anyone please suggest why this is happening?
var orderIds = db.delivery.find({ "status": "DELIVERED" }).map(function(o) {
    return o.order;
});

var userIds = db.order.aggregate([
    { $match: { _id: { $in: orderIds } } },
    { $group: { _id: "$customer" } }
]).map(function(u) { return u._id; });

var userstats = db.order.aggregate([
    { $sort: { customer: 1, dateCreated: 1 } },
    { $match: { status: "DELIVERED", customer: { $in: userIds } } },
    {
        $group: {
            _id: "$customer",
            orders: { $sum: 1 },
            firstOrderDate: { $first: "$dateCreated" },
            lastOrderDate: { $last: "$dateCreated" }
        }
    }
]);

userstats.forEach(function(x) {
    db.user.update({ _id: x._id }, {
        $set: {
            totalOrders: x.orders,
            firstOrderDate: x.firstOrderDate,
            lastOrderDate: x.lastOrderDate
        }
    });
});
I am not sure, but shouldn't it be faster on the server? Instead it is not able to produce any output.
Upvotes: 3
Views: 1493
Reputation: 103375
To speed up the process you could refactor your operations in a couple of ways.
The first would be to eliminate unnecessary pipeline stages: the $sort operator can be replaced by the $max and $min operators within the $group pipeline.
Secondly, use the bulk() API, which will increase performance on update operations, especially when dealing with large collections, since it sends the operations to the server in batches (for example, a batch size of 500) instead of sending every request individually (as you are currently doing with the update statement within the forEach() loop).
Consider the following refactored operations:
var orderIds = db.delivery.find({ "status": "DELIVERED" }).map(function(d) { return d.order; }),
    counter = 0,
    bulk = db.user.initializeUnorderedBulkOp();

var userstatsCursor = db.order.aggregate([
    { "$match": { "_id": { "$in": orderIds } } },
    {
        "$group": {
            "_id": "$customer",
            "orders": { "$sum": 1 },
            "firstOrderDate": { "$min": "$dateCreated" },
            "lastOrderDate": { "$max": "$dateCreated" }
        }
    }
]);
userstatsCursor.forEach(function(x) {
    bulk.find({ "_id": x._id }).updateOne({
        "$set": {
            "totalOrders": x.orders,
            "firstOrderDate": x.firstOrderDate,
            "lastOrderDate": x.lastOrderDate
        }
    });
    counter++;

    if (counter % 500 === 0) {
        // Execute per 500 operations and re-initialize
        // the builder for the next batch of update statements
        bulk.execute();
        bulk = db.user.initializeUnorderedBulkOp();
    }
});

// Clean up remaining operations in the queue
if (counter % 500 !== 0) { bulk.execute(); }
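As a side note, if your shell targets MongoDB 3.2 or newer, you can achieve the same batching with db.collection.bulkWrite(). A minimal sketch of the equivalent updates, assuming a freshly created userstatsCursor (a cursor can only be iterated once):
var ops = [];
userstatsCursor.forEach(function(x) {
    ops.push({
        updateOne: {
            filter: { "_id": x._id },
            update: {
                "$set": {
                    "totalOrders": x.orders,
                    "firstOrderDate": x.firstOrderDate,
                    "lastOrderDate": x.lastOrderDate
                }
            }
        }
    });
    // Flush every 500 queued operations
    if (ops.length === 500) {
        db.user.bulkWrite(ops, { ordered: false });
        ops = [];
    }
});
// Flush whatever is left in the queue
if (ops.length > 0) { db.user.bulkWrite(ops, { ordered: false }); }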
Upvotes: 3
Reputation: 21766
I recommend you make $match the first operation in your pipeline, as the $match operator can only use an index when it is the first stage in the aggregation pipeline:
var userstats = db.order.aggregate([
    {
        $match: {
            status: "DELIVERED",
            customer: { $in: userIds }
        }
    },
    {
        $sort: {
            customer: 1,
            dateCreated: 1
        }
    },
    {
        $group: {
            _id: "$customer",
            orders: { $sum: 1 },
            firstOrderDate: { $first: "$dateCreated" },
            lastOrderDate: { $last: "$dateCreated" }
        }
    }
]);
You should also add a compound index on status and customer on the order collection (where the $match runs) if you have not already defined one:
db.order.createIndex({ status: 1, customer: 1 })
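To confirm that the pipeline actually uses the index, you can run it with the explain option; a quick sanity check, assuming the index above has been created:
db.order.aggregate([
    { $match: { status: "DELIVERED", customer: { $in: userIds } } },
    { $sort: { customer: 1, dateCreated: 1 } }
], { explain: true })
// Look for an IXSCAN stage (rather than a COLLSCAN) in the
// winningPlan of the query planner output.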
Upvotes: 3