Partition of Data with MongoDB

Question

I have following collection

[
  {
    "setting": "Volume",
    "_id": ObjectId("5a934e000102030405000000"),
    "counting": 1
  },
  {
    "setting": "Brightness",
    "_id": ObjectId("5a934e000102030405000001"),
    "counting": 1
  },
  {
    "setting": "Contrast",
    "_id": ObjectId("5a934e000102030405000002"),
    "counting": 1
  },
  {
    "setting": "Contrast",
    "_id": ObjectId("5a934e000102030405000003"),
    "counting": 1
  },
  {
    "setting": "Contrast",
    "_id": ObjectId("5a934e000102030405000004"),
    "counting": 0
  },
  {
    "setting": "Sharpness",
    "_id": ObjectId("5a934e000102030405000005"),
    "counting": 1
  },
  {
    "setting": "Sharpness",
    "_id": ObjectId("5a934e000102030405000006"),
    "counting": 1
  },
  {
    "setting": "Language",
    "_id": ObjectId("5a934e000102030405000007"),
    "counting": 1
  },
  {
    "setting": "Language",
    "_id": ObjectId("5a934e000102030405000008"),
    "counting": 0
  }
]

Now I want to group by setting and want only top most two data in result rest in useless

So my output should be after sort by counting

[
  {
    "setting": "Contrast",
    "counting": 2
  },
  {
    "setting": "Sharpness",
    "counting": 2
  },
  {
    "setting": "Useless",
    "counting": 3
  }
]

Neil Lunn · Accepted Answer

If you can get away with it, then it's probably best to "stuff" the reduced results into a single document and then $slice the top two and $sum the rest:

Model.aggregate([
  { "$group": {
    "_id": "$setting",
    "counting": { "$sum": "$counting" }
  }},
  { "$sort": { "counting": -1 } },
  { "$group": {
    "_id": null,
    "data": { "$push": "$$ROOT" }
  }},
  { "$addFields": {
     "data": {
       "$let": {
         "vars": { "top": { "$slice": ["$data", 0, 2 ] } },
         "in": {
           "$concatArrays": [
             "$$top",
             { "$cond": {
               "if": { "$gt": [{ "$size": "$data" }, 2] },
               "then": 
                 [{ 
                   "_id": "Useless",
                   "counting": {
                     "$sum": {
                       "$map": {
                         "input": {
                           "$filter": {
                             "input": "$data",
                             "cond": { "$not": { "$in": [ "$$this._id", "$$top._id" ] } }
                           }
                         },
                         "in": "$$this.counting"
                       }
                     }
                   }
                 }],
               "else": []
             }}
           ]
         }
       }
     }
  }},
  { "$unwind": "$data" },
  { "$replaceRoot": { "newRoot": "$data" } }
])

If it's potentially a very "large" result even reduced, then $limit use a $facet for the "rest":

Model.aggregate([
  { "$facet": {
    "top": [
      { "$group": {
        "_id": "$setting",
        "counting": { "$sum": "$counting" }
      }},
      { "$sort": { "counting": -1 } },
      { "$limit": 2 }
    ],
    "rest": [
      { "$group": {
        "_id": "$setting",
        "counting": { "$sum": "$counting" }
      }},
      { "$sort": { "counting": -1 } },
      { "$skip": 2 },
      { "$group": {
        "_id": "Useless",
        "counting": { "$sum": "$counting" }
      }}
    ]
  }},
  { "$project": {
    "data": {
      "$concatArrays": [
        "$top","$rest"
      ]
    }
  }},
  { "$unwind": "$data" },
  { "$replaceRoot": { "newRoot": "$data" } }
])

Or even $lookup with MongoDB 3.6:

Model.aggregate([
  { "$group": {
    "_id": "$setting",
    "counting": { "$sum": "$counting" }
  }},
  { "$sort": { "counting": -1 } },
  { "$limit": 2 },
  { "$group": {
    "_id": null,
    "top": { "$push": "$$ROOT" }   
  }},
  { "$lookup": {
    "from": "colllection",
    "let": { "settings": "$top._id" },
    "pipeline": [
      { "$match": {
        "$expr": {
          "$not": { "$in": [ "$setting", "$$settings" ] }
        }
      }},
      { "$group": {
        "_id": "Useless",
        "counting": { "$sum": "$counting" }
      }}
    ],
    "as": "rest"
  }},
  { "$project": {
    "data": {
      "$concatArrays": [ "$top", "$rest" ]
    }
  }},
  { "$unwind": "$data" },
  { "$replaceRoot": { "newRoot": "$data" } }
])

All pretty much the same really, and all return the same result:

{ "_id" : "Contrast", "counting" : 2 }
{ "_id" : "Sharpness", "counting" : 2 }
{ "_id" : "Useless", "counting" : 3 }

Optionally $project right at the end of each instead of the $replaceRoot if control over the field names is really important to you. Generally I just stick with the $group defaults

In the event that your MongoDB predates 3.4 and the resulting "Useless" remainder is actually too large to use any variant of the first approach, then simple Promise resolution is basically the answer, being one for the aggregate and the other for a basic count and simply do the math:

let [docs, count] = await Promise.all([
  Model.aggregate([
    { "$group": {
      "_id": "$setting",
      "counting": { "$sum": "$counting" }
    }},
    { "$sort": { "counting": -1 } },
    { "$limit": 2 },
  ]),
  Model.count().exec()
]);

docs = [ 
  ...docs,
  { 
    "_id": "Useless",
    "counting": count - docs.reduce((o,e) => o + e.counting, 0)
  }
];

Or without the async/await:

Promise.all([
  Model.aggregate([
    { "$group": {
      "_id": "$setting",
      "counting": { "$sum": "$counting" }
    }},
    { "$sort": { "counting": -1 } },
    { "$limit": 2 },
  ]),
  Model.count().exec()
]).then(([docs, count]) => ([ 
  ...docs,
  { 
    "_id": "Useless",
    "counting": count - docs.reduce((o,e) => o + e.counting, 0)
  }
]).then( result => /* do something */ )

Which is basically a variation on the age old "total pages" approach by simply running the separate query to count the collection items.

Running separate requests is generally the age old way of doing this and it often performs best. The rest of the solutions are essentially aimed at "aggregation tricks" since that was what you were asking for, and that's the answer you got by showing different variations on the same thing.

One variant put's all results into a single document ( where possible, due to the BSON limit of course ) and the others basically vary on the "age old" approach by running the query again in a different form. $facet in parallel and $lookup in series.

Partition of Data with MongoDB

Answers (1)

Related Questions