Redsandro
Redsandro

Reputation: 11356

Counting Occurrences of Values for Keys

I have a lot of documents with many attributes. After a specific $match pass, I end up with a subsection. Here it is simplified:

[
    {"name": "foo", "code": "bbb"},
    {"name": "foo", "code": "aaa"},
    {"name": "foo", "code": "aaa"},
    {"name": "foo", "code": "aaa"},
    {"name": "bar", "code": "aaa"},
    {"name": "bar", "code": "aaa"},
    {"name": "bar", "code": "aaa"},
    {"name": "baz", "code": "aaa"},
    {"name": "baz", "code": "aaa"}
]

I would like to count the occurances of certain attributes so I end up with this (simplified):

{
    "name": {
        "foo": 4, 
        "bar": 3,
        "baz": 2
    },
    "code": {
        "bbb": 1,
        "aaa": 8
    }
}

(Or something close that I can 'translate' afterwards with Node.js)

I already do a $group stage to count other attributes (differently). Ideally I would $addToSet and also count how many times a similar value was added to the set. But I cannot figure out how.

Alternatively I was thinking to $push to end up with this (simplified):

{
    "name": ["foo", "foo", "foo", "foo", "bar", "bar", "bar", "baz", "baz"],
    "code": ["bbb", "aaa", "aaa", "aaa", "aaa", "aaa", "aaa", "aaa", "aaa", ]
}

But I can't figure out how to turn it into (something close to) the above hypothetical result either.

For single fields alone, the closest I can come is by using the above $push and then I can use $group:

"$group": {
    "_id": {"_id": "$_id", "name": "$name"},
    "nameCount": {"$sum": 1}
}

Now I have _id.name and nameCount. But I have lost all the previously counted attributes, 20 or so.

Is there a way to do (something close to) what I want?

Note: Using MongoDB 3.2

Upvotes: 1

Views: 86

Answers (1)

Neil Lunn
Neil Lunn

Reputation: 151132

For MongoDB 3.2 you are pretty much limited to mapReduce if you want to return the "data" values as "keys" in a returned document. There is however the case to consider that you actually "do not need" MongoDB to do that part for you. But to consider the approaches:

Map Reduce

db.stuff.mapReduce(
  function() { 
    emit(null, {
     name: { [this.name]: 1 },
     code: { [this.code]: 1 }
    })
   },
  function(key,values) {
     let obj = { name: {}, code: {} };
     values.forEach(value => {
       ['name','code'].forEach(key => {
         Object.keys(value[key]).forEach(k => {
           if (!obj[key].hasOwnProperty(k))
             obj[key][k] = 0;
           obj[key][k] += value[key][k];
         })    
       })
     });
     return obj;    
  },
  { "out": { "inline": 1 } }
)

Returns:

    {
        "_id" : null,
        "value" : {
            "name" : {
                "foo" : 4.0,
                "bar" : 3.0,
                "baz" : 2.0
            },
            "code" : {
                "bbb" : 1.0,
                "aaa" : 8.0
            }
        }
    }

Aggregate

For MongoDB 3.4 and upwards, you can use $arrayToObject to reshape as "key/value" objects. And a bit more efficiently than simply using $push to make two large arrays which would almost certainly break the BSON limit in real world cases.

This "more or less" mirrors the mapReduce() operations:

db.stuff.aggregate([
  { "$project": {
    "_id": 0,
    "data": [
      { "k": "name", "v": { "k": "$name", "count": 1 } },
      { "k": "code", "v": { "k": "$code", "count": 1 } }
    ]
  }},
  { "$unwind": "$data" },
  { "$group": {
    "_id": { "k": "$data.k",  "v": "$data.v.k" },
    "count": { "$sum": "$data.v.count" }
  }},
  { "$group": {
    "_id": "$_id.k",
    "v": { "$push": { "k": "$_id.v", "v": "$count" } }
  }},
  { "$group": {
    "_id": null,
    "data": { "$push": { "k": "$_id", "v": "$v" } }  
  }},
  { "$replaceRoot": {
    "newRoot": {
      "$arrayToObject": {
        "$map": {
          "input": "$data",
          "in": { 
            "k": "$$this.k",
            "v": { "$arrayToObject": "$$this.v" }
          }
        }    
      }
    }  
  }}
])

Which has similar output ( without forcing ordering of keys by applying $sort ):

{
    "code" : {
        "bbb" : 1.0,
        "aaa" : 8.0
    },
    "name" : {
        "baz" : 2.0,
        "foo" : 4.0,
        "bar" : 3.0
    }
}

So it's only really in the final stage where we actually use the new features, and the output up to that point is pretty similar, and would be easy to reshape in code:

{
    "_id" : null,
    "data" : [ 
        {
            "k" : "code",
            "v" : [ 
                {
                    "k" : "bbb",
                    "v" : 1.0
                }, 
                {
                    "k" : "aaa",
                    "v" : 8.0
                }
            ]
        }, 
        {
            "k" : "name",
            "v" : [ 
                {
                    "k" : "baz",
                    "v" : 2.0
                }, 
                {
                    "k" : "foo",
                    "v" : 4.0
                }, 
                {
                    "k" : "bar",
                    "v" : 3.0
                }
            ]
        }
    ]
}

So in fact we can do just that:

db.stuff.aggregate([
  { "$project": {
    "_id": 0,
    "data": [
      { "k": "name", "v": { "k": "$name", "count": 1 } },
      { "k": "code", "v": { "k": "$code", "count": 1 } }
    ]
  }},
  { "$unwind": "$data" },
  { "$group": {
    "_id": { "k": "$data.k",  "v": "$data.v.k" },
    "count": { "$sum": "$data.v.count" }
  }},
  { "$group": {
    "_id": "$_id.k",
    "v": { "$push": { "k": "$_id.v", "v": "$count" } }
  }},
  { "$group": {
    "_id": null,
    "data": { "$push": { "k": "$_id", "v": "$v" } }  
  }},
  /*
  { "$replaceRoot": {
    "newRoot": {
      "$arrayToObject": {
        "$map": {
          "input": "$data",
          "in": { 
            "k": "$$this.k",
            "v": { "$arrayToObject": "$$this.v" }
          }
        }    
      }
    }  
  }}
  */
]).map( doc =>
  doc.data.map( d => ({
     k: d.k,
     v: d.v.reduce((acc,curr) => 
      Object.assign(acc,{ [curr.k]: curr.v })
      ,{}
     )
  })).reduce((acc,curr) => 
    Object.assign(acc,{ [curr.k]: curr.v })
    ,{}
  )
)

Which just goes to show that simply because the aggregation framework does not have the features to use "named keys" in output for earlier versions, you generally do not need them. Since the only place we actually used the new features was in the "final" stage, but we can easily do the same by simply reshaping the final output in client code.

And of course, it's the same result:

[
    {
        "code" : {
            "bbb" : 1.0,
            "aaa" : 8.0
        },
        "name" : {
            "baz" : 2.0,
            "foo" : 4.0,
            "bar" : 3.0
        }
    }
]

So it helps to learn the lesson of exactly "where" you actually need to apply such transformations. Here it's at the "end" since we do not need that during any "aggregation" stage, and thus you simply reshape the results that can be optimally provided from the aggregation framework itself.


The Bad Ways

As noted, your attempt so far may be fine for small data, but in most real world cases "pushing" all the items in a collection into a single document without reduction is going to break the 16MB BSON Limit.

Where it would actually stay under, then you can use something like this monster with $reduce:

db.stuff.aggregate([
  { "$group": {
    "_id": null,
    "name": { "$push": "$name" },
    "code": { "$push": "$code" }
  }},
  { "$replaceRoot": {
    "newRoot": { 
      "$arrayToObject": {
        "$map": {
          "input": [
            { "k": "name", "v": "$name" },
            { "k": "code", "v": "$code" }
          ],
          "as": "m",
          "in": {
            "k": "$$m.k",
            "v": {
              "$arrayToObject": {
                "$reduce": {
                  "input": "$$m.v",
                  "initialValue": [],
                  "in": {
                    "$cond": {
                      "if": { 
                        "$in": [
                          "$$this",
                          { "$map": {
                            "input": "$$value",
                            "as": "v",
                            "in": "$$v.k"
                          }}
                        ]
                      },
                      "then": {
                        "$concatArrays": [
                          { "$filter": {
                            "input": "$$value",
                            "as": "v",
                            "cond": { "$ne": [ "$$v.k", "$$this" ] }
                          }},
                          [{
                            "k": "$$this",
                            "v": {
                              "$sum": [
                                { "$arrayElemAt": [
                                  "$$value.v",
                                  { "$indexOfArray": [ "$$value.k", "$$this" ] }
                                ]},
                                1
                              ]
                            }    
                          }]
                        ]    
                      },
                      "else": {
                        "$concatArrays": [
                          "$$value",
                          [{ "k": "$$this", "v": 1 }]
                        ]    
                      }
                    } 
                  }
                }
              }
            }
          }
        }
      }
    }
  }}
])

Which produces:

{
    "name" : {
        "foo" : 4.0,
        "bar" : 3.0,
        "baz" : 2.0
    },
    "code" : {
        "bbb" : 1.0,
        "aaa" : 8.0
    }
}

Or indeed the same reduction process in client code:

db.stuff.aggregate([
  { "$group": {
    "_id": null,
    "name": { "$push": "$name" },
    "code": { "$push": "$code" }
  }},
]).map( doc => 
  ["name","code"].reduce((acc,curr) =>
    Object.assign(
      acc,
      { [curr]: doc[curr].reduce((acc,curr) =>
          Object.assign(acc,
            (acc.hasOwnProperty(curr))
              ? { [curr]: acc[curr] += 1 }
              : { [curr]: 1 }
          ),{}
        )
      }
    ),
    {}
  )
)

Which again has the same result:

{
    "name" : {
        "foo" : 4.0,
        "bar" : 3.0,
        "baz" : 2.0
    },
    "code" : {
        "bbb" : 1.0,
        "aaa" : 8.0
    }
}

Upvotes: 1

Related Questions