How do I return multiple counts for the same field in mongoose?

Question

Basically I want to combine two separate calls to mongo into 1 single call, but I'm not entirely sure how to do it. If anyone could give me some guidance I would appreciate it! Thank you!

RatingSchema.statics.getPostRating = function(post, callback)
{
    this.count({ post: post, positiveReview: true }, function(err, posCount){
    if(err)
    {
            callback(err);
            return;
    }
    this.count({ post: , positiveReview: false }, function(err, negCount){
        if(err)
        {
            callback(err);
            return
        }
        callback(err, posCount, negCount)
    }

}

Neil Lunn · Accepted Answer

As noted already, you can possibly do this in one actual query with the aggregation framework, and there are even a couple of ways to achieve that result, depending on what you want. But actually there is a little performance issue concerning .count() as a general method that is best demonstrated with an example.

First I'll just set up some data in the shell for convenience:

var bulk = db.testcol.initializeOrderedBulkOp();

for ( var x=1; x <= 100000; x++ ) {
    bulk.insert({ value: Math.floor(Math.random(2)*2) });
    if ( x % 1000 == 0 ) {
        bulk.execute();
        bulk = db.testcol.initializeOrderedBulkOp();
    }
}

So simply just a 100,000 document collection, with little data and no index since that won't really make a difference in this case. The distribution should be fairly even and random enough to make the point.

Then some basic code to sample the different approaches:

var async = require('async'),
    mongoose = require('mongoose'),
    Schema = mongoose.Schema;


var testSchema = new Schema({
  value: Number
});

mongoose.connect('mongodb://localhost/test');

var Test = mongoose.model( 'Test', testSchema, 'testcol' );


async.series(
  [
    // Time aggregation two results
    function(callback) {
      var start = new Date();
      Test.aggregate(
        [{ "$group": { "_id": "$value", "count": { "$sum": 1 } } }],
        function(err,result) {
          var obj = {
            "start": start,
            "end": new Date()
          };
          obj.time = obj.end.valueOf() - obj.start.valueOf();
          obj.result = result;
          callback(err,obj);
        }
      );
    },

    // Time aggregation conditional
    function(callback) {
      var start = new Date();
      Test.aggregate(
        [
          { "$group": {
            "_id": null,
            "positive": {
              "$sum": {
                "$cond": [
                  { "$eq": [ "$value", 1 ] },
                  1,
                  0
                ]
              }
            },
            "negative": {
              "$sum": {
                "$cond": [
                  { "$eq": [ "$value", 0 ] },
                  1,
                  0
                ]
              }
            }
          }}
        ],
        function(err,result) {
          var obj = {
            "start": start,
            "end": new Date()
          };
          obj.time = obj.end.valueOf() - obj.start.valueOf();
          obj.result = result;
          callback(err,obj);
        }
      );
    },

    // Time query parallel
    function(callback) {
      var start = new Date();
      async.parallel(
        [
          function(callback) {
            Test.count({ value: 1 },callback);
          },
          function(callback) {
            Test.count({ value: 0 },callback);
          }
        ],
        function(err,results) {
          var obj = {
            "start": start,
            "end": new Date()
          };
          obj.time = obj.end.valueOf() - obj.start.valueOf();
          obj.result = results;
          callback(err,obj);
        }
      );
    }
  ],
  function(err,results) {
    if (err) throw err;
    console.log( JSON.stringify( results, undefined, 2 ) );
  }
);

And of course the results, which are the most important point:

[
  {
    "start": "2014-10-01T08:18:28.059Z",
    "end": "2014-10-01T08:18:28.263Z",
    "time": 204,
    "result": [
      {
        "_id": 1,
        "count": 49965
      },
      {
        "_id": 0,
        "count": 50035
      }
    ]
  },
  {
    "start": "2014-10-01T08:18:28.264Z",
    "end": "2014-10-01T08:18:28.404Z",
    "time": 140,
    "result": [
      {
        "_id": null,
        "positive": 49965,
        "negative": 50035
      }
    ]
  },
  {
    "start": "2014-10-01T08:18:28.405Z",
    "end": "2014-10-01T08:18:28.491Z",
    "time": 86,
    "result": [
      49965,
      50035
    ]
  }
]

So, without any further manipulation, the results show (and being fair, this is after a few iterations to make use that sure that the data is "warmed" and loaded to memory ) that there is a significant difference in each form.

The "first" result is a basic aggregation statement, which returns two rows containing the counts for each "value" that is present. These can only be 1 or 0 by the insertion conditions, but you can see the time for this is 204ms.

The "second" result is a single document result with aggregation. This uses the $cond operator in order to "split" each result into it's own property in the one document. The time taken here is signifcantly less at 140ms.

And finally, for the "third" result, the response is combined from two queries executed at the same time using "async.parallel" to manage the parallel running and sequencing of results. The time taken is 86ms which is less than half of the original aggregation statement and still significantly less than the other aggregation option that was faster.

Why is this? Well, MongoDB itself holds some specific information in the returned "cursor" that comes from the query engine when executing a regular query. Part of that information is the "count" of results returned. Since the query engine already did it's work in scanning and accumulating this "matched" total this number is present and there is no more work required in order to obtain the "count".

By contrast, though useful for many things, the aggregation framework and during the $group does this in a very different way. This is partly apparent in the performance difference between each of the two aggregation approaches, but the main thing is that the basic "query engine" "counts matches" things in a far more efficient way.

Depending on actual data, and especially for this kind of true/false matching, then indexing on that property should even produce "faster" results as well.

But the main point here, is that for simply "counting" the matched values for a property, where it is viable to do so ( and true/false is a good case ) then the most performant option is to run "parallel queries" as has been shown in this example. The performance improvement will typically be a factor of the number of different property values you are "counting".

So aggregation is great, but in this case it is not the winner. The node native driver ( as with many good driver implementations ) which is used by mongoose utilizes a "connection pool" by default. While this is generally a good idea for event driven applications as there are connections available for other concurrent operations, it is also a valid use to actually run multiple concurrent operations to obtain your results.

The optimizations in the general query engine combined with issuing both .count() statements at effectively "the same time" and then making sure that you wait on the combined results, gives you the best performance result for this type of operation. Won't be true for anything much outside of basic counts in general, but it all depends on what you are actually trying to do.

Part of test driven development should generally be to "test the alternate cases". This will steer you in the right direction based on the results obtained.

How do I return multiple counts for the same field in mongoose?

Answers (2)

Related Questions