Sarodh Uggalla
Sarodh Uggalla

Reputation: 303

How to match and sort documents based on array elements in common

var UserSchema = Schema (
{
         android_id: String,
         created: {type: Date, default:Date.now},
         interests: [{ type: Schema.Types.ObjectId, ref: 'Interests' }],

});

 Users.aggregate([
        { $match: {android_id: {$ne: userID}, interests: {$elemMatch: {$in: ids}} }},
        { $group: { _id: { android_id: '$android_id'},count: {$sum: 1}}},
        { $sort: {count: -1}},
        { $limit: 5 }], 

I need the to find the top 5 android_ids of the users with the most interests in common with me (ids array). I can work with the array of only matched elements from the interests array too.

Upvotes: 1

Views: 593

Answers (1)

Blakes Seven
Blakes Seven

Reputation: 50416

You seemed to be going along the right lines here but you do need to consider that arrays have special considerations for comparisons.

Your basic start here is to find all users that are not the current user, and that you also need at least the "interests" array of the current user as well. You seem to be doing that already, but for here let us consider that you have the whole user object for the current user which will be used in the listing.

This makes your "top 5" basically a product of "Not me, and the most interests in common", which means you basically need to count the "overlap" of interests on each user compared to the current user.

This is basically the $setIntersection of the two arrays or "sets" where the elements in common are returned. In order to count how many are in common, there is also the $size operator. So you apply like this:

Users.aggregate(
    [
        { "$match": {
            "android_id": { "$ne": user.android_id },
            "interests": { "$in": user.interests }
        }},
        { "$project": {
            "android_id": 1,
            "interests": 1,
            "common": {
                "$size": {
                    "$setIntersection": [ "$interests", user.interests ]
                }
            }
        }},
        { "$sort": { "common": -1 } },
        { "$limit": 5 }
    ],
    function(err,result) {

    }
);

The result returned in "common" is the count of common interests between the current user and the user being examined in the data. This data is then processed by $sort in order to put the largest number of common interests on top, and then $limit returns only the top 5.

If for some reason your MongoDB version is presently lower than MongoDB 2.6 where both the $setIntersection and $size operators are introduced, then you can still do this, but it just takes a longer form of processing the arrays.

Mainly you need to $unwind the arrays and process each match individually:

        { "$match": {
            "android_id": { "$ne": user.android_id },
            "interests": { "$in": user.interests }
        }},
        { "$unwind": "$interests" },
        { "$group": {
            "_id": "$_id",
            "android_id": { "$first": "$android_id" },
            "interests": { "$push": "$interests" },
            "common": {
              "$sum": {
                "$add": [
                  { "$cond": [{ "$eq": [ "$interests", user.interests[0] ] },1,0 ] },
                  { "$cond": [{ "$eq": [ "$interests", user.interests[1] ] },1,0 ] },
                  { "$cond": [{ "$eq": [ "$interests", user.interests[2] ] },1,0 ] }
                ]
              }
            }
        }},
        { "$sort": { "common": -1 }},
        { "$limit": 5 }

Which is more practically coded to generate the condtional matches in the pipeline:

    var pipeline = [
        { "$match": {
            "android_id": { "$ne": user.android_id },
            "interests": { "$in": user.interests }
        }},
        { "$unwind": "$interests" }
    ];

    var group = 
        { "$group": {
            "_id": "$_id",
            "android_id": { "$first": "$android_id" },
            "interests": { "$push": "$interests" },
            "common": {
              "$sum": {
                "$add": []
              }
            }
        }};

    user.interests.forEach(function(interest) {
      group.$group.common.$sum.$add.push(
        { "$cond": [{ "$eq": [ "$interests", interest ] }, 1, 0 ] }
      );
    });

    pipeline.push(group);

    pipeline = pipeline.concat([
        { "$sort": { "common": -1 }},
        { "$limit": 5 }
    ])

    User.aggregate(pipeline,function(err,result) {

    });

The key elements there being that "both" the current user and the user being inspected have their "interests" separated out for comparison to see if they are "equal". The result from $cond attributes a 1 where this is true or 0 where false.

Any returns ( and only ever expected to be 1 at best, per pair ) are passed to the $sum accumulator which counts the matches in common. You can alternately $match with an $in condition again:

        { "$unwind": "$interests" },
        { "$match": { "interests": { "$in": user.interests } },
        { "$group": {
            "_id": "$_id",
            "android_id": { "$first": "$android_id" },
            "common": { "$sum": 1 }
        }}

But this is naturally destructive of the array content as non matches are filtered out. So it depends on what you would rather have in the response.

That is the basic process for getting the "common" counts for use in further processing like $sort and $limit in order to get your "top 5".

Just for fun, here is a basic node.js listing to show the effects of common matches: var async = require('async'), mongoose = require('mongoose'), Schema = mongoose.Schema;

mongoose.connect('mongodb://localhost/sample');

var interestSchema = new Schema({
  name: String
});

var userSchema = new Schema({
  name: String,
  interests: [{ type: Schema.Types.ObjectId, ref: 'Interest' }]
});

var Interest = mongoose.model( 'Interest', interestSchema );
var User = mongoose.model( 'User', userSchema );

var interestHash = {};

async.series(
  [
    function(callback) {
      async.each([Interest,User],function(model,callback) {
        model.remove({},callback);
      },callback);
    },

    function(callback) {
      async.each(
        [
          "Tennis",
          "Football",
          "Gaming",
          "Cooking",
          "Yoga"
        ],
        function(interest,callback) {
          Interest.create({ name: interest},function(err,obj) {
            if (err) callback(err);
            interestHash[obj.name] = obj._id;
            callback();
          });
        },
        callback
      );
    },

    function(callback) {
      async.each(
        [
          { name: "Bob", interests: ["Tennis","Football","Gaming"] },
          { name: "Tom", interests: ["Football","Cooking","Yoga"] },
          { name: "Sue", interests: ["Tennis","Gaming","Yoga","Cooking"] }
        ],
        function(data,callback) {
          data.interests = data.interests.map(function(interest) {
            return interestHash[interest];
          });
          User.create(data,function(err,user) {
            //console.log(user);
            callback(err);
          })
        },
        callback
      );
    },

    function(callback) {
      async.waterfall(
        [
          function(callback) {
            User.findOne({ name: "Bob" },callback);
          },
          function(user,callback) {
            console.log(user);
            User.aggregate(
              [
                { "$match": {
                  "_id": { "$ne": user._id },
                  "interests": { "$in": user.interests }
                }},
                { "$project": {
                  "name": 1,
                  "interests": 1,
                  "common": {
                    "$size": {
                      "$setIntersection": [ "$interests", user.interests ]
                    }
                  }
                }},
                { "$sort": { "common": -1 } }
              ],
              function(err,result) {
                if (err) callback(err);
                Interest.populate(result,'interests',function(err,result) {
                  console.log(result);
                  callback(err);
                });
              }
            );
          }
        ],
        callback
      );
    }

  ],
  function(err) {
    if (err) throw err;
    //console.dir(interestHash);
    mongoose.disconnect();
  }
);

Which will output:

{ _id: 55dbd7be0e5516ac16ea62d1,
  name: 'Bob',
  __v: 0,
  interests:
   [ 55dbd7be0e5516ac16ea62cc,
     55dbd7be0e5516ac16ea62cd,
     55dbd7be0e5516ac16ea62ce ] }
[ { _id: 55dbd7be0e5516ac16ea62d3,
    name: 'Sue',
    interests:
     [ { _id: 55dbd7be0e5516ac16ea62cc, name: 'Tennis', __v: 0 },
       { _id: 55dbd7be0e5516ac16ea62ce, name: 'Gaming', __v: 0 },
       { _id: 55dbd7be0e5516ac16ea62d0, name: 'Yoga', __v: 0 },
       { _id: 55dbd7be0e5516ac16ea62cf, name: 'Cooking', __v: 0 } ],
    common: 2 },
  { _id: 55dbd7be0e5516ac16ea62d2,
    name: 'Tom',
    interests:
     [ { _id: 55dbd7be0e5516ac16ea62cd, name: 'Football', __v: 0 },
       { _id: 55dbd7be0e5516ac16ea62cf, name: 'Cooking', __v: 0 },
       { _id: 55dbd7be0e5516ac16ea62d0, name: 'Yoga', __v: 0 } ],
    common: 1 } ]

Upvotes: 3

Related Questions