caiuspb
caiuspb

Reputation: 974

MongoDB query - Return sliced arrays

I am trying to specify a specific query on the following Schema:

executionSchema = new Schema(
  timestamp: {type: Number},
  components: [{
    uid: { type: String },
    type: { type: String },
    control_ports: [ { name: String, values: [type: Number] } ]
    input_samples: [ { name: String, values: [type: Number] } ]
    output_samples: [ { name: String, values: [type: Number] } ]
    execution_times: [type: Number]
  }]
)

I'd like to return a component with sliced arrays (input_samples and output_samples), specified by a timestamp. This is what I have done so far:

exports.getSamples = (req, res) ->
    timestamp = req.param("timestamp")
    uid = req.param("uid")
    skip = req.param("skip")
    amount = req.param("amount")
    #query = Execution.findOne({'timestamp':timestamp}, 'components.input_samples components.output_samples')
    query = Execution.findOne({'timestamp':timestamp})
    query.where('components.uid').equals(uid)
    query.slice('components.input_samples.values', 5)
    query.slice('components.output_samples.values', 5)
    query.select('components.$.')
    #query.slice('values', 5)
    query.exec ( err, samples )->
        if err
            console.log "Error: "
            console.log err
            res.json err
        else
            console.dir samples
            res.json samples
        return
    return

It actually returns the correct component bit it includes every single element inside of the arrays. Somehow I managed to slice the arrays with another query but the result contained every available component. I guess I still have to get used to MongoDb ..

Thank you.

Edit This is what I get:

{
    "_id": "5326ca6558f41c510a2659ad",
    "components": [
      {
        "uid": "sine#0",
        "type": "SW",
        "_id": "5326ca6558f41c510a2659b5",
        "execution_times": [
          500,
          450,
          700
        ],
        "output_samples": [
          {
            "name": "Output_Port",
            "_id": "5326ca6558f41c510a2659b6",
            "values": [
              0,
              0.8414709848078965,
              0.9092974268256817,
              0.1411200080598672,
              -0.7568024953079282,
              -0.9589242746631385,
              -0.27941549819892586,
              0.6569865987187891,
              0.9893582466233818,
              0.4121184852417566,
              ...,
              -0.5440211108893698,
              -0.9999902065507035,
              0.5878193939808536,
              0.9983436270438855,
              0.4909953335002932
            ]
          }
        ],
        "input_samples": [],
        "control_ports": [
          {
            "name": "Control 1",
            "_id": "5326ca6558f41c510a2659b7",
                      "values": [
              0,
              0.8414709848078965,
              0.9092974268256817,
              0.1411200080598672,
              -0.7568024953079282,
              -0.9589242746631385,
              -0.27941549819892586,
              0.6569865987187891,
              0.9893582466233818,
              0.4121184852417566,
              ...,
              -0.5440211108893698,
              -0.9999902065507035,
              0.5878193939808536,
              0.9983436270438855,
              0.4909953335002932
            ]
          }
        ]
      }
    ]

and what I want (return a subset of those arrays):

  {
    "_id": "5326ca6558f41c510a2659ad",
    "components": [
      {
        "uid": "sine#0",
        "type": "SW",
        "_id": "5326ca6558f41c510a2659b5",
        "execution_times": [
          500,
          450,
          700
        ],
        "output_samples": [
          {
            "name": "Output_Port",
            "_id": "5326ca6558f41c510a2659b6",
            "values": [
              0,
              0.8414709848078965,
              0.9092974268256817,
              0.1411200080598672,
              -0.7568024953079282,
              -0.9589242746631385
            ]
          }
        ],
        "input_samples": [],
        "control_ports": [
          {
            "name": "Control 1",
            "_id": "5326ca6558f41c510a2659b7",
                      "values": [
              0,
              0.8414709848078965,
              0.9092974268256817,
              0.1411200080598672,
              -0.7568024953079282
            ]
          }
        ]
      }
    ]

Edit 2:

So this does bring to question, "do you actually mean to have arrays"? Only saying this because as the information is presented, then it seems the only parts that are actually arrays are the "values" fields.

I guess the data is a little bit more complex than you might think. The component array can hold an arbitrary amount of components with unique "uids" which can be different for every "execution". Therefore I guess I definitely have to use an array for components.

The problem with $slice in this context is that you actually don't know "which" array element out of the nesting to operate on. So the style you have presented will not work because the elements could be and any possible index. The correct form, if it were supported, and which it is not would be something like:

The position of a specific component inside of an array is the only index I do not know, since the data creating application initializes an execution before the values are updated. Thus I only need one single 'positional $ operator'. It should look something like { "components.$.output_samples.0.values": { "$slice": 5 } }.

Nested lists are going to be a problem for various things and are notoriously bad for updating. Where possible you should consider alternatives. In this case if you want to limit your output, then the only practical way is to retrieve the whole document, then process the arrays in code to limit the returned results.

I guess it was a small mistake to simplify my real intention. I would like to use the slice operator to skip the first n elements and retrieve the following m elements. These two variables should be specified by the user. However, the output_samples.0.values field could contain multiple million values. Or even more. This is why I would like to gather just as many values as the user needs..

I really appreciate your detailed answer.

Upvotes: 3

Views: 1114

Answers (1)

Neil Lunn
Neil Lunn

Reputation: 151112

The problem you are going to have with $slice here or any kind of projection with arrays is the fact that you are nesting arrays in the schema you have defined. I also notice the use of the positional $ operator sneaking in there for a peek, so it's probably best to explain in the context of the documentation from there.

So on that page there is this:

  • The $ projection operator limits the content of the field to the first element that matches the query document.
  • The field must appear in the query document

So for starters, you are not asking in the query to match a particular field in any element of any array. By that alone there will be no projection that can be done for a matched element. In any case only the first matching array element could possibly be matched even if you did so.

In your structure, the only thing you could match is the top position in the array "components". And right at the top.

The problem with $slice in this context is that you actually don't know "which" array element out of the nesting to operate on. So the style you have presented will not work because the elements could be and any possible index. The correct form, if it were supported, and which it is not would be something like:

{ "components.0.output_samples.0.values": { "$slice": 5 } }

As you would need to specify the index path down to the elements you are actually talking about.

So this does bring to question, "do you actually mean to have arrays"? Only saying this because as the information is presented, then it seems the only parts that are actually arrays are the "values" fields.

Nested lists are going to be a problem for various things and are notoriously bad for updating. Where possible you should consider alternatives. In this case if you want to limit your output, then the only practical way is to retrieve the whole document, then process the arrays in code to limit the returned results.

Even the following "very involved" incantation will only work ("somewhat") in it's present form considering that all the "other" array nesting only has one one element. So it makes this possible but impractical:

db.components.aggregate([
   { "$unwind": "$components" },
   { "$unwind": "$components.output_samples"},
   { "$unwind": "$components.control_ports"},
   { "$unwind": "$components.output_samples.values"},
   { "$limit": 5 },
   { "$group": { 
       "_id": { 
           "_id": "$_id", 
           "components": {
               "uid": "$components.uid",
               "type": "$components.type",
               "_id": "$components._id",
               "execution_times": "$components.execution_times",
               "output_samples": {
                   "name": "$components.output_samples.name",
                   "_id": "$components.output_samples._id"
               },
               "input_samples": "$components.input_samples",
               "control_ports": "$components.control_ports"
           }
       }, 
       "output_samples_values": {"$push": "$components.output_samples.values" }
   }},
   { "$project": { 
       "_id": { 
           "_id": "$_id._id", 
           "components": {
               "uid": "$_id.components.uid",
               "type": "$_id.components.type",
               "_id": "$_id.components._id",
               "execution_times": "$_id.components.execution_times",
               "output_samples": {
                   "name": "$_id.components.output_samples.name",
                   "_id": "$_id.components.output_samples._id",
                   "values": "$output_samples_values"
               },
               "input_samples": "$_id.components.input_samples",
               "control_ports": {
                   "name": "$_id.components.control_ports.name",
                   "_id": "$_id.components.control_ports._id"
               }
           }
       }, 
       "control_ports_values": "$_id.components.control_ports.values"
   }},
   { "$unwind": "$control_ports_values" },
   { "$limit": 5 },
   { "$group": { 
       "_id": { 
           "_id": "$_id._id", 
           "components": {
               "uid": "$_id.components.uid",
               "type": "$_id.components.type",
               "_id": "$_id.components._id",
               "execution_times": "$_id.components.execution_times",
               "output_samples": {
                   "name": "$_id.components.output_samples.name",
                   "_id": "$_id.components.output_samples._id",
                   "values": "$_id.components.output_samples.values"
               },
               "input_samples": "$_id.components.input_samples",
               "control_ports": {
                   "name": "$_id.components.control_ports.name",
                   "_id": "$_id.components.control_ports._id"
               }
           }
       }, 
       "control_ports_values": {"$push": "$control_ports_values" }
   }}
])

And all of that just to slice two arrays by the first 5 values.

So if you need the nested arrays, then do the "slicing" in code when you retrieve the result. Otherwise change the schema to something that is going be more practically suit your purpose.

Upvotes: 4

Related Questions