Find documents with limits from multiple MongoDB collections and return them as a sorted list using Mongoose

Question

If I have different types of documents, each in their own collections, is there a way to search for posts from all collections and return them as a single list ordered by something like a datestamp?

Further, I need:

To be able to decide how many posts I need in total from all collections
The posts should be ordered by the same criteria - which means the number of posts will be different from each collection
To be able to start collecting with an offset (say, give me 100 posts starting at post no. 201).

If I saved all documents in the same collection this task would be rather easy but would also require a dynamic, largely undocumented schema since each document will be very different except for a few parameters such as the date.

So, is there a way to keep my documents in well defined schemas, each in separate collections but still being able to accomplish the above?

For argument's sake, here's how the schemas could look divided up:

// Schema for Instagram posts: a post date plus the image URL.
// The `...` line is the author's placeholder for further fields, not real syntax.
var InstagramPostSchema = new Schema({
   date: Date,
   imageUrl: String,
   ...
})

// Schema for Twitter posts: a post date plus the message text.
// The `...` line is the author's placeholder for further fields, not real syntax.
var TwitterPostSchema = new Schema({
   date: Date,
   message: String,
   ...
})

And if I made one universal schema it could look like this:

// Single "universal" schema alternative: every post shares `date` and a
// `type` string, and everything specific to the post type goes into
// `postData`, which is declared without any structure.
var SocialPostSchema = new Schema({
   date: Date,
   type: String,
   postData: {}
})

What's the preferred way to do this?

The ideal way would be if I could write separate schemas that inherit from a common base schema, but I'm not familiar enough with Mongoose and MongoDB to know if there's a native way to do this.

Blakes Seven · Accepted Answer

There is a good way to do this which is also a bit nicer and has some benefits over your final suggestion, and that is to use discriminators.

The basic idea is that there is a base schema, with common properties or even no properties at all, from which you define the model for your main collection. Each other schema then inherits from that and also shares the same collection.

As a basic demonstration:

var async = require('async'),
    util = require('util'),
    mongoose = require('mongoose'),
    Schema = mongoose.Schema;

// Connect Mongoose to the `test` database on the local MongoDB server.
mongoose.connect('mongodb://localhost/test');

/**
 * Schema constructor shared by every post type.
 *
 * Forwards all received arguments to the Schema constructor, then adds the
 * two fields every post carries: `date` (defaults to Date.now) and a
 * required `name`.
 */
function BaseSchema() {
  var commonFields = {
    date: { type: Date, default: Date.now },
    name: { type: String, required: true }
  };

  Schema.apply(this, arguments);
  this.add(commonFields);
}

// BaseSchema calls `this.add(...)`, so its instances must inherit from
// Schema's prototype.
util.inherits(BaseSchema, Schema);

// One schema per post type. Each is a BaseSchema, so each carries the common
// fields in addition to whatever is declared here.
var socialPostSchema = new BaseSchema();
var instagramPostSchema = new BaseSchema({ imageUrl: { type: String, required: true } });
var twitterPostSchema = new BaseSchema({ message: { type: String, required: true } });

// The base model, followed by one discriminator model per post type
// registered on that base model.
var SocialPost = mongoose.model('SocialPost', socialPostSchema);
var InstagramPost = SocialPost.discriminator('InstagramPost', instagramPostSchema);
var TwitterPost = SocialPost.discriminator('TwitterPost', twitterPostSchema);

// Demo run: the steps execute strictly one after another, and the final
// callback receives the per-step results in order.
async.series(
  [
    // Step 1: empty the collection so each run starts clean.
    function clearPosts(next) {
      SocialPost.remove({}, next);
    },

    // Step 2: store one Instagram post.
    function createInstagramPost(next) {
      var instagramData = {
        name: 'My instagram pic',
        imageUrl: '/myphoto.png'
      };
      InstagramPost.create(instagramData, next);
    },

    // Step 3: store one tweet a second later (presumably so the two posts
    // get clearly different `date` values for the sort below).
    function createTweetLater(next) {
      var insertTweet = function() {
        var tweetData = {
          name: "My tweet",
          message: "ham and cheese panini #livingthedream"
        };
        TwitterPost.create(tweetData, next);
      };
      setTimeout(insertTweet, 1000);
    },

    // Step 4: query through the base model, newest first.
    function listAllPosts(next) {
      var query = SocialPost.find({});
      query.sort({ "date": -1 }).exec(next);
    }
  ],
  function(failure, outcomes) {
    if (failure) throw failure;
    // Drop the result of the initial remove; only the posts are of interest.
    outcomes.shift();
    console.dir(outcomes);
    mongoose.disconnect();
  }
);

With output:

[ { __v: 0,
    name: 'My instagram pic',
    imageUrl: '/myphoto.png',
    __t: 'InstagramPost',
    date: Wed Aug 19 2015 22:53:23 GMT+1000 (AEST),
    _id: 55d47c43122e5fe5063e01bc },
  { __v: 0,
    name: 'My tweet',
    message: 'ham and cheese panini #livingthedream',
    __t: 'TwitterPost',
    date: Wed Aug 19 2015 22:53:24 GMT+1000 (AEST),
    _id: 55d47c44122e5fe5063e01bd },
  [ { _id: 55d47c44122e5fe5063e01bd,
      name: 'My tweet',
      message: 'ham and cheese panini #livingthedream',
      __v: 0,
      __t: 'TwitterPost',
      date: Wed Aug 19 2015 22:53:24 GMT+1000 (AEST) },
    { _id: 55d47c43122e5fe5063e01bc,
      name: 'My instagram pic',
      imageUrl: '/myphoto.png',
      __v: 0,
      __t: 'InstagramPost',
      date: Wed Aug 19 2015 22:53:23 GMT+1000 (AEST) } ] ]

So the things to notice there are that even though we defined separate models and even separate schemas, all items are in fact in the same collection. As part of the discriminator, each document stored has a __t field indicating its type.

So the really nice things here are:

You can store everything in one collection and query all objects together
You can separate validation rules per schema and/or define things in a "base" so you don't need to write it out multiple times.
The objects "explode" into their own class definitions according to the schema attached to the model for each type. This includes any attached methods. So these are first class objects when you create or retrieve the data.
If you wanted to work with just a specific type such as "TwitterPost", then using that model "automatically" filters out anything else but the "twitter" posts from any query operations performed, just by using that model.

Keeping things in the one collection makes a lot of sense, especially if you want to try and aggregate data across the information for different types.

A word of caution is that though you can have completely different objects using this pattern, it is generally wise to have as much in common as makes sense to your operations. This is particularly useful in querying or aggregating across different types.

So where possible, try to convert "legacy imported" data to a more "common" format of fields, and just keep the unique properties that are really required for each object type.

As to the first part of your question where you wanted to query "each collection" with something like different limits and then sort the overall results from each, well you can do that too.

There are various techniques, but keeping in the MongoDB form, there is nedb, which you can use to both store the combined results and "sort" them as well. And it is all done in a manner you are used to:

var async = require('async'),
    util = require('util'),
    mongoose = require('mongoose'),
    DataStore = require('nedb'),
    Schema = mongoose.Schema;

// Connect Mongoose to the `test` database on the local MongoDB server.
mongoose.connect('mongodb://localhost/test');

/**
 * Schema constructor shared by every post type.
 *
 * Forwards all received arguments to the Schema constructor, then adds the
 * two fields every post carries: `date` (defaults to Date.now) and a
 * required `name`.
 */
function BaseSchema() {
  var commonFields = {
    date: { type: Date, default: Date.now },
    name: { type: String, required: true }
  };

  Schema.apply(this, arguments);
  this.add(commonFields);
}

// BaseSchema calls `this.add(...)`, so its instances must inherit from
// Schema's prototype.
util.inherits(BaseSchema, Schema);

// One schema per post type. Each is a BaseSchema, so each carries the common
// fields in addition to whatever is declared here.
var socialPostSchema = new BaseSchema();
var instagramPostSchema = new BaseSchema({ imageUrl: { type: String, required: true } });
var twitterPostSchema = new BaseSchema({ message: { type: String, required: true } });

// The base model, followed by one discriminator model per post type
// registered on that base model.
var SocialPost = mongoose.model('SocialPost', socialPostSchema);
var InstagramPost = SocialPost.discriminator('InstagramPost', instagramPostSchema);
var TwitterPost = SocialPost.discriminator('TwitterPost', twitterPostSchema);

// Demo run: seed two posts, then query each model separately with its own
// limit, merge the results in an nedb DataStore created with no options,
// and read them back sorted by date. Steps run strictly one after another.
async.series(
  [
    // Step 1: empty the collection so each run starts clean.
    function(callback) {
      SocialPost.remove({},callback);
    },
    // Step 2: store one Instagram post.
    function(callback) {
      InstagramPost.create({
        name: 'My instagram pic',
        imageUrl: '/myphoto.png'
      },callback);
    },
    // Step 3: store one tweet a second later (presumably so the two posts
    // get clearly different `date` values for the final sort).
    function(callback) {
      setTimeout(
        function() {
          TwitterPost.create({
            name: "My tweet",
            message: "ham and cheese panini #livingthedream"
          },callback);
        },
        1000
      );
    },
    // Step 4: per-model limited queries, merged and sorted via nedb.
    function(callback) {
      var ds = new DataStore();

      /**
       * Build a task that fetches at most `limit` documents from `Model`
       * and copies each one into `ds`.
       *
       * @param {Object} Model - model to query (e.g. InstagramPost)
       * @param {number} limit - maximum number of documents to take from it
       * @returns {Function} task of the form function(done) for async.parallel
       */
      function copyInto(Model, limit) {
        return function(done) {
          Model.find({}).limit(limit).exec(function(err,posts) {
            // A failed query must abort this task instead of passing an
            // undefined `posts` on to async.each.
            if (err) return done(err);
            async.each(posts,function(post,inserted) {
              var plain = post.toObject();
              // Keep the original id as a string under `id` and drop `_id`
              // before inserting (presumably so nedb assigns its own `_id`
              // - TODO confirm).
              plain.id = plain._id.toString();
              delete plain._id;
              ds.insert(plain,inserted);
            },done);
          });
        };
      }

      async.parallel(
        [
          copyInto(InstagramPost, 1),
          copyInto(TwitterPost, 1)
        ],
        function(err) {
          // Return here: falling through would run the find below and
          // invoke `callback` a second time.
          if (err) return callback(err);
          ds.find({}).sort({ "date": -1 }).exec(callback);
        }
      );
    }
  ],
  function(err,results) {
    if (err) throw err;
    // Drop the result of the initial remove; only the posts are of interest.
    results.shift();
    console.dir(results);
    mongoose.disconnect();
  }
);

Same output as before with the latest post sorted first, except that this time a query was sent to each model and we just got results from each and combined them.

If you change the query output and writes to the combined model to use "stream" processing, then you even have basically the same memory consumption and likely faster processing of results from parallel queries.

Find documents with limits from multiple MongoDB collections and return them as a sorted list using Mongoose

Answers (1)

Related Questions