Reputation: 1513
I'm new to MongoDB, so forgive me if I missed something in docs. I have a collection like this
{ date: "2015-12-01", status: "resolved", parentId: 1 }
{ date: "2015-12-01", status: "resolved", parentId: 2 }
{ date: "2015-12-01", status: "resolved", parentId: 2 }
{ date: "2015-12-01", status: "waiting", parentId: 2 }
{ date: "2015-12-02", status: "resolved", parentId: 1 }
{ date: "2015-12-02", status: "waiting", parentId: 2 }
{ date: "2015-12-02", status: "waiting", parentId: 2 }
{ date: "2015-12-03", status: "resolved", parentId: 1 }
and I'm expecting to sum the output grouped by
date -> parentId -> status
so that would be
{
"2015-12-01": {
"1": {
"resolved": 1
},
"2": {
"resolved": 2,
"waiting": 1
}
},
"2015-12-02": {
"1": {
"resolved": 1
},
"2": {
"waiting": 2
}
},
"2015-12-03": {
"1": {
"resolved": 1
}
}
}
Any advice on how I can achieve that? I already got this using the aggregation framework:
// The $group stage the question already has: emits one document per unique
// (date, status, parentId) combination, with "total" counting the matching
// input documents.
{
'$group': {
'_id': {
'date': '$date',
'status': '$status',
'parentId': '$parentId'
},
'total': {
'$sum': 1
}
}
}
Upvotes: 0
Views: 2272
Reputation: 50406
Not a fan of using "data" as "keys" in output, as it is generally better to keep "data" as "data"; this is also more consistent with object-oriented design patterns, where the keys are consistent between objects rather than varying in every result. After all, someone had the good sense to design the initial data that way in the first place.
So all you really need here is a multiple level grouping which is quite simple to do by taking the output of one $group
stage and feeding it to another:
// Three-stage pipeline: group at the finest grain first, then roll the
// results up into nested arrays, one nesting level per pass.
db.collection.aggregate([
// Stage 1: count documents per (date, parentId, status).
{ "$group": {
"_id": {
"date": "$date",
"parentId": "$parentId",
"status": "$status"
},
"total": { "$sum": 1 }
}},
// Stage 2: collect { status, total } pairs into an array per (date, parentId).
{ "$group": {
"_id": {
"date": "$_id.date",
"parentId": "$_id.parentId"
},
"data": { "$push": {
"status": "$_id.status",
"total": "$total"
}}
}},
// Stage 3: collect { parentId, data } entries into an array per date.
{ "$group": {
"_id": "$_id.date",
"parents": { "$push": {
"parentId": "$_id.parentId",
"data": "$data"
}}
}}
])
This would progressively nest the data into arrays under each "date" key after following the initial group to accumulate per the finest level of detail. The result is basically "rolling-up" the structure into a single document per key by compacting into arrays via $push
:
[
{
"_id": "2015-12-01",
"parents": [
{
"parentId": 1,
"data": [
{ "status": "resolved", "total": 1 }
]
},
{
"parentId": 2,
"data": [
{ "status": "resolved", "total": 2 },
{ "status": "waiting", "total": 1 }
]
}
]
},
{
"_id": "2015-12-02",
"parents": [
{
"parentId": 1,
"data": [
{ "status": "resolved", "total": 1 }
]
},
{
"parentId": 2,
"data": [
{ "status": "waiting", "total": 2 }
]
}
]
},
{
"_id": "2015-12-03",
"parents": [
{
"parentId": 1,
"data": [
{ "status": "resolved", "total": 1 }
]
}
]
}
]
Or if you can live with it, then you can go even flatter with all related sub data in a single array rather than a nested one:
// Flatter alternative: one grouping pass to count, then one pass to collect
// all (parentId, status, total) triples into a single array per date.
db.collection.aggregate([
// Count documents per (date, parentId, status).
{ "$group": {
"_id": {
"date": "$date",
"parentId": "$parentId",
"status": "$status"
},
"total": { "$sum": 1 }
}},
// One flat "data" array per date, keeping parentId and status as fields.
{ "$group": {
"_id": "$_id.date",
"data": { "$push": {
"parentId": "$_id.parentId",
"status": "$_id.status",
"total": "$total"
}}
}}
])
Which has a single array child, just keeping all data keyed:
[
{
"_id": "2015-12-01",
"data": [
{
"parentId": 1,
"status": "resolved",
"total": 1
},
{
"parentId": 2,
"status": "resolved",
"total": 2
},
{
"parentId": 2,
"status": "waiting",
"total": 1
}
]
},
{
"_id": "2015-12-02",
"data": [
{
"parentId": 1,
"status": "resolved",
"total": 1
},
{
"parentId": 2,
"status": "waiting",
"total": 2
}
]
},
{
"_id": "2015-12-03",
"data": [
{
"parentId": 1,
"status": "resolved",
"total": 1
}
]
}
]
The main essence here is that "lists of things" are kept as arrays as a child of the thing they are related to in either form, just with varying degrees. This is quite arguably easier to process and more logical than determining the "keys" from an object and iterating them, when you can just basically iterate a natural list anyway.
The aggregation framework does not support ( quite deliberately ) trying to massage keys from data in any way, and most MongoDB query operations also agree with this philosophy as it makes a good deal of sense for what is essentially a "database".
If you really must massage as keys, it would be recommended to do this in client side processing after retrieving the aggregated result. You can even do this in stream processing while passing through to a remote client, but as a basic transformation example:
// Same three-stage pipeline as shown earlier, but materialized to an array
// so the client can reshape the documents into keyed objects afterwards.
var out = db.collection.aggregate([
// Stage 1: count per (date, parentId, status).
{ "$group": {
"_id": {
"date": "$date",
"parentId": "$parentId",
"status": "$status"
},
"total": { "$sum": 1 }
}},
// Stage 2: array of { status, total } per (date, parentId).
{ "$group": {
"_id": {
"date": "$_id.date",
"parentId": "$_id.parentId"
},
"data": { "$push": {
"status": "$_id.status",
"total": "$total"
}}
}},
// Stage 3: array of { parentId, data } per date.
{ "$group": {
"_id": "$_id.date",
"parents": { "$push": {
"parentId": "$_id.parentId",
"data": "$data"
}}
}}
]).toArray();
// Reshape each aggregated document into { date: { parentId: { status: total } } }
// and print it — one object per date, exactly as the question requested.
for (var i = 0; i < out.length; i++) {
  var doc = out[i];
  var byParent = {};
  doc.parents.forEach(function(parent) {
    var byStatus = {};
    parent.data.forEach(function(entry) {
      byStatus[entry.status] = entry.total;
    });
    byParent[parent.parentId] = byStatus;
  });
  var obj = {};
  obj[doc._id] = byParent;
  printjson(obj);
}
Which basically produces the output as you have it structured, but of course as individual documents as explained later:
{
"2015-12-01": {
"1": {
"resolved": 1
},
"2": {
"resolved": 2,
"waiting": 1
}
}
},
{
"2015-12-02": {
"1": {
"resolved": 1
},
"2": {
"waiting": 2
}
}
},
{
"2015-12-03": {
"1": {
"resolved": 1
}
}
}
Or you can force this on the server using mapReduce and JavaScript-based processing, but this is again ill-advised, since its overall efficiency is not as good as that of aggregation processing:
// Server-side mapReduce equivalent: the mapper emits one nested
// { parentId: { status: 1 } } object per input document, keyed by date;
// the reducer merges those nested objects by summing per-status counts.
db.collection.mapReduce(
  function() {
    // Map: key by date; value is a nested { parentId: { status: count } }.
    var obj = {};
    obj[this.parentId] = {};
    obj[this.parentId][this.status] = 1;
    emit(this.date,obj);
  },
  function(key,values) {
    // Reduce: merge the nested objects, summing counts. Sums (rather than
    // increments) are required because reducer output may be fed back
    // through the reducer again before a final result is reached.
    var result = {};
    values.forEach(function(value) {
      Object.keys(value).forEach(function(parent) {
        if (!result.hasOwnProperty(parent))
          result[parent] = {};
        // Fix: iterate the statuses of this parent's sub-document.
        // The original `Object.keys(parent)` enumerated the characters of
        // the key string (e.g. Object.keys("1") === ["0"]), so the totals
        // were never summed correctly.
        Object.keys(value[parent]).forEach(function(status) {
          if (!result[parent].hasOwnProperty(status))
            result[parent][status] = 0;
          result[parent][status] += value[parent][status];
        });
      });
    });
    return result;
  },
  { "out": { "inline": 1 } }
);
Much the same result, but with the specific output format mapReduce always produces:
{
"_id": "2015-12-01",
"value": {
"1": {
"resolved": 1
},
"2": {
"resolved": 2,
"waiting": 1
}
}
},
{
"_id": "2015-12-02",
"value": {
"1": {
"resolved": 1
},
"2": {
"waiting": 2
}
}
},
{
"_id": "2015-12-03",
"value": {
"1": {
"resolved": 1
}
}
}
Taking care there that especially if you are not familiar with how mapReduce works, there is a very important reason why the structures are emitted and traversed consistently between mapper and reducer as well as summing the emitted values for status rather than simply incrementing. This is a property of mapReduce where output from the reducer can end up going back through the reducer again until a single result is reached.
Also as mentioned earlier and a great big caveat of "new to this" as you state yourself, is that you really never want to compact results into a single object for response as shown in your question.
Not only is it another property of bad design ( covered earlier ), but also there are realistic "hard limits" on the size of output from MongoDB and many sensible systems. Single documents have a BSON size limit of 16MB, which will almost certainly be exceeded in any real world case when trying to do so.
Furthermore, "lists as lists" just make sense, and trying to artificially represent that using unique keys within a single document object makes little sense. Things process and stream far more easily when you use the correct data structure types for the intended purpose.
So those are the approaches to handling your output. It really is just basic data manipulation on aggregation, whatever the approach taken. But hopefully you can see the common sense in keeping it as efficient and simple as possible, as can be directly handled by aggregation and makes a lot more sense for the eventual code that processes the received results.
Upvotes: 5