Irfan
Irfan

Reputation: 156

mongo $sum compounded when doing $unwind and then $group on multiple fields

I have the following document structure

{
    "app_id": "DHJFK67JDSJjdasj909",
    "date": ISODate("2014-08-07T00:00:00.000Z"),
    "event_count": 100,
    "events": [
        { "type": 0,  "value": 12  },
        { "type": 10, "value": 24 },
        { "type": 20, "value": 36  },
        { "type": 30, "value": 43 }
    ],
    "unique_events": [
        { "type": 0,  "value": 5  },
        { "type": 10, "value": 8 },
        { "type": 20, "value": 12  },
        { "type": 30, "value": 56 }
    ]
}

I am trying to get a sum of event_counts and also the values for unique_events and events per type. This is the kind of output I am expecting where the event_count and each events and unique_events value has been summed per type.

{
    "app_id": "DHJFK67JDSJjdasj909",
    "date": ISODate("2014-08-07T00:00:00.000Z"),
    "event_count": 4345,
    "events": [
        { "type": 0,  "value": 624  },
        { "type": 10, "value": 234 },
        { "type": 20, "value": 353 },
        { "type": 30, "value": 472 }
    ],
    "unique_events": [
        { "type": 0,  "value": 433  },
        { "type": 10, "value": 554 },
        { "type": 20, "value": 645  },
        { "type": 30, "value": 732 }
    ]
}

This is my query

db.events.aggregate([
    { "$unwind": "$events" },
    { "$group": {
        "_id": { 
            "app_id": "$app_id",
            "type": "$events.type"
            "unique_type": "$unique_events.type"
        },
        "event_count": { "$sum": "$event_count" },
        "event_value": { "$sum": "$events.value" },
        "unique_event_value": { "$sum": "$unique_events.value" }
    }},
    { "$group": {
        "_id": "$_id.app_id",
        "event_count": { "$sum": "$event_count" },
        "events": { "$push": { "type": "$_id.type", "value": "$event_value" } }
        "unique_events": { "$push": { "type": "$_id.unique_type", "value": "$unique_event_value" } }
    }}
]) 

The problem is that using two $unwinds and then grouping by both events and unique_events is causing the $sum to be compounded and much too large. Is there some way I can fix this using mongo or do I have to run two queries and then merge the two result sets in code.

Thanks

Irfan

Upvotes: 5

Views: 4139

Answers (2)

Harsh Patel
Harsh Patel

Reputation: 573

You can perform Following Map reduce:
Its not a dynamic solution, i have created variable for each events and unique_events.
I have saved output in different collection using out: "session_stat" in mapReduce Function.

var mapFunction = function() {
                      var key = this.app_id;
                      var value = {                                 
                                    event_count: this.event_count,
                                    events: this.events,
                                    unique_events: this.unique_events
                                   };

                      emit( key, value );
                  };

var reduceFunction = function(key, values) {

                        var reducedObject = {
                                              app_id: key,
                                              events_wise_total: 0,
                                              unique_events_wise_total:0,
                                              total_event_count:0
                                            };

                        var events_wise_total = [];
                        var events_0_total = { type:0, value :0};
                        var events_10_total = { type:10, value :0};
                        var events_20_total = { type:20, value :0};
                        var events_30_total = { type:30, value :0};

                        var unique_events_wise_total = [];
                        var unique_events_0_total = { type:0, value :0};
                        var unique_events_10_total = { type:10, value :0};
                        var unique_events_20_total = { type:20, value :0};
                        var unique_events_30_total = { type:30, value :0};

                        var total_event_count = 0;
                        values.forEach( function(value) {
                                total_event_count += value.event_count;
                                var events = value.events;

                                events.forEach(function(event){
                                                if(event.type == 0){events_0_total.value += event.value;}
                                                if(event.type == 10){events_10_total.value += event.value;}
                                                if(event.type == 20){events_20_total.value += event.value;}
                                                if(event.type == 30){events_30_total.value += event.value;}
                                        });

                                var unique_events = value.unique_events;

                                unique_events.forEach(function(unique_event){
                                                if(unique_event.type == 0){unique_events_0_total.value += unique_event.value;}
                                                if(unique_event.type == 10){unique_events_10_total.value += unique_event.value;}
                                                if(unique_event.type == 20){unique_events_20_total.value += unique_event.value;}
                                                if(unique_event.type == 30){unique_events_30_total.value += unique_event.value;}
                                        }); 
                            }
                          );
                        events_wise_total.push(events_0_total);
                        events_wise_total.push(events_10_total);
                        events_wise_total.push(events_20_total);
                        events_wise_total.push(events_30_total);

                        unique_events_wise_total.push(unique_events_0_total);
                        unique_events_wise_total.push(unique_events_10_total);
                        unique_events_wise_total.push(unique_events_20_total);
                        unique_events_wise_total.push(unique_events_30_total);

                        reducedObject.events_wise_total = events_wise_total;
                        reducedObject.unique_events_wise_total = unique_events_wise_total;
                        reducedObject.total_event_count = total_event_count;

                        return reducedObject;
                     };

var finalizeFunction = function (key, reducedValue) {
                          return reducedValue;
                       };                    

db.GroupBy.mapReduce(
                       mapFunction,
                       reduceFunction,
                       {
                         out: "session_stat",
                         finalize: finalizeFunction
                       });

Hope it may be useful

Upvotes: 2

Neil Lunn
Neil Lunn

Reputation: 151132

This is simple enough really, to sum the results for each array it's just a matter of discerning between which is which and "combining the elements". In short you should probably be doing this in you documents anyway as should be evident from the first pipeline stage(s).

So for MongoDB 2.6 and greater there are some helper methods:

db.events.aggregate([
    { "$project": {
        "app_id": 1,
        "event_count": 1,
        "all_events": {
            "$setUnion": [
                { "$map": {
                    "input": "$events",
                    "as": "el",
                    "in": {
                        "type": "$$el.type",
                        "value": "$$el.value",
                        "class": { "$literal": "A" }
                    }
                }},
                { "$map": {
                    "input": "$unique_events",
                    "as": "el",
                    "in": {
                        "type": "$$el.type",
                        "value": "$$el.value",
                        "class": { "$literal": "B" }
                    }
                }}
            ]
        }
    }},
    { "$unwind": "$all_events" },
    { "$group": {
        "_id": {
            "app_id": "$app_id",
            "class": "$all_events.class",
            "type": "$all_events.type"
        },
        "event_count": { "$sum": "$event_count" },
        "value": { "$sum": "$all_events.value" }
    }},
    { "$group": {
        "_id": "$_id.app_id",
        "event_count": { "$sum": "$event_count" },
        "events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "A" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        },
        "unique_events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "B" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        }
    }},
    { "$project": {
        "event_count": 1,
        "events": { "$setDifference": [ "$events", [false] ] },
        "unique_events": {
            "$setDifference": [ "$unique_events", [false] ]
        }
    }}
])

Mostly in the $setUnion and $setDifference operators. The other ccase is $map, which processes arrays in place. The whole thing there is doing operations on arrays without the use of $unwind. But those can of course be done in prior versions, it just takes a bit more work:

db.events.aggregate([
    { "$unwind": "$events" },
    { "$group": {
        "_id": "$_id",
        "app_id": { "$first": "$app_id" },
        "event_count": { "$first": "$event_count" },
        "events": {
            "$push": {
                "type": "$events.type",
                "value": "$events.value",
                "class": { "$const": "A" }
            }
        },
        "unique_events": { "$first": "$unique_events" }            
    }},
    { "$unwind": "$unique_events" },
    { "$group": {
        "_id": "$_id",
        "app_id": { "$first": "$app_id" },
        "event_count": { "$first": "$event_count" },
        "events": { "$first": "$events" },
        "unique_events": {
            "$push": {
                "type": "$unique_events.type",
                "value": "$unique_events.value",
                "class": { "$const": "B" }
            }
        }
    }},
    { "$project": {
        "app_id": 1,
        "event_count": 1,
        "events": 1,
        "unique_events": 1,
        "type": { "$const": [ "A","B" ] }
    }},
    { "$unwind": "$type" },
    { "$unwind": "$events" },
    { "$unwind": "$unique_events" },
    { "$group": {
        "_id": "$_id",
        "app_id": { "$first": "$app_id" },
        "event_count": { "$first": "$event_count" },
        "all_events": {
            "$addToSet": {
                "$cond": [
                     { "$eq": [ "$events.class", "$type" ] },
                     {
                         "type": "$events.type",
                         "value": "$events.value",
                         "class": "$events.class"
                     },
                     {
                         "type": "$unique_events.type",
                         "value": "$unique_events.value",
                         "class": "$unique_events.class"
                     }
                ]
            }
        }
    }},
    { "$unwind": "$all_events" },
   { "$group": {
        "_id": {
            "app_id": "$app_id",
            "class": "$all_events.class",
            "type": "$all_events.type"
        },
        "event_count": { "$sum": "$event_count" },
        "value": { "$sum": "$all_events.value" }
    }},
    { "$group": {
        "_id": "$_id.app_id",
        "event_count": { "$sum": "$event_count" },
        "events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "A" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        },
        "unique_events": {
            "$push": {
                "$cond": [
                    { "$eq": [ "$_id.class", "B" ] },
                    { "type": "$_id.type", "value": "$value" },
                    false
                ]
            }
        }
    }},
    { "$unwind": "$events" },
    { "$match": { "events": { "$ne": false } } },
    { "$group": {
        "_id": "$_id",
        "event_count": { "$first": "$event_count" },
        "events": { "$push": "$events" },
        "unique_events": { "$first": "$unique_events" }
    }},
    { "$unwind": "$unique_events" },
    { "$match": { "unique_events": { "$ne": false } } },
    { "$group": {
       "_id": "$_id",
        "event_count": { "$first": "$event_count" },
        "events": { "$first": "$events" },
        "unique_events": { "$push": "$unique_events" }
    }}
])

That gets you the results you want with each array being "summed" together as well as the master "event_count" with the correct result.

You probably should consider combining both of those arrays with a similar identifier to what has been used in the pipelines as demonstrated. This part is half of the work. The other half is considering that you probably should store pre-aggregated results in a collection somewhere for the best application performance.

Upvotes: 9

Related Questions