A T
A T

Reputation: 13826

Hierachically flatten MongoDB collection of documents with arrays into documents

Block model (which goes on block 0 -> block 1 -> block 2 -> block 3 -> […]):

https://yuml.me/cb86229d.png

Example input document [700+ of these in the modulestore.structures collection]:

{
  _id: ObjectId('5932d50ff8f46c0a8098ab79'),
  blocks: [
    {
      definition: ObjectId('5923556ef8f46c0a787e9c0f'),
      block_type: 'chapter',
      block_id: '5b053a7f10ba41df85a3221c3ef3956e',
      fields: {
        format: 'Foo exam',
        children: [ 
          [ 
            'sequential', 
            '9f1e58553ad448818ec8e7915d3d94d3'
          ], 
          [ 
            'sequential', 
            'f052c7aa44274769a4631e95405834e0'
          ]
        ]
      }
    },
    {
      definition: ObjectId('59235569f8f46c0a7be1debc'),
      block_type: 'sequential',
      block_id: '9f1e58553ad448818ec8e7915d3d94d3',
      fields: {
        display_name: 'FooBar'
      }
    },
    {
      definition: ObjectId('59317406f8f46c0a8098aaf5'),
      block_type: 'sequential',
      block_id: 'f052c7aa44274769a4631e95405834e0',
      fields: {
        display_name: 'CanHaz'
      }
    }
  ]
}

My goal here is to:

  1. flatten out the blocks so all blocks are at the collection level;
  2. cursor the children array for traversal;
  3. walk and amend the 'tree' such that each child/grandchild/great-grandchild/*-child gets a new property top_ancestor_fields containing the fields property from their topmost ancestor.

Example output:

[
  {
    _id: ObjectId('5a00f611f995363c2b63c9a6'),
    block_type: 'chapter',
    block_id: '5b053a7f10ba41df85a3221c3ef3956e',
    fields: {
      format: 'Foo exam'
      children: [ 
        [ 
          'sequential',
          '9f1e58553ad448818ec8e7915d3d94d3'
        ], 
        [
          'sequential',
          'f052c7aa44274769a4631e95405834e0'
        ]
      ]
    },
    top_ancestor_fields: {
      format: 'Foo exam'
    }
  },
  {
     _id: ObjectId('5a00f611f995363c2b63c9a7'),
     block_id: '9f1e58553ad448818ec8e7915d3d94d3',
     block_type: 'sequential',
     fields: {
       display_name: 'FooBar'
     },
     top_ancestor_fields: {
       format: 'Foo exam'
     }
  },
  {
     _id: ObjectId('5a00f611f995363c2b63c9a8'),
     block_id: 'f052c7aa44274769a4631e95405834e0',
     block_type: 'sequential',
     fields: {
       display_name: 'CanHaz'
     },
     top_ancestor_fields: {
       format: 'Foo exam'
     }
  },
]

Almost have it working based off @neil-lunn's suggestion:

db.modulestore.structures.aggregate([
  { $unwind: '$blocks' },
  { $project: { _id: 0,
                block_id: '$blocks.block_id',
                children: '$blocks.fields.children',
                display_name: '$blocks.fields.display_name',
                block_type: '$blocks.block_type',
                exam: '$blocks.fields.format',
                fields: '$blocks.fields'
               }},
  { $out: 'modulestore.mapped0' }
])

db.modulestore.mapped0.aggregate([
    { $graphLookup: {
        from: 'modulestore.mapped0',
        startWith: '$block_id',
        connectToField: 'children',
        connectFromField: 'block_id',
        as: 'block_ids',
        maxDepth: 0
    } },
    { $unwind: '$block_ids' },
    { $project: {
        name: 1,
        _id: 0,
        ancestor: '$block_ids.block_id'
    } },
    { $out: 'modulestore.mapped1' }
]);

But this just hangs. I've tried configuring maxDepth $graphLookup option. FYI: db.modulestore.mapped0.count() is 80772 for me.

Each document potentially contains a children array with up to 180 elements.

Not sure how to approach this larger pipeline to map children hierarchies…

Upvotes: 2

Views: 1048

Answers (2)

dnickless
dnickless

Reputation: 10918

The following should get you started:

db.modulestore.structures.aggregate([{
    $unwind: '$blocks' // flatten "blocks" array
}, {
    $replaceRoot: { // move "blocks" field to top level
        newRoot: "$blocks"
    }
}, {
    $unwind: { // flatten "fields.children" array
        path: "$fields.children",
        preserveNullAndEmptyArrays: true
    }
}, {
    // this step is technically not needed but it might speed up things - try running with and without that
    $addFields: { // we only keep the second (last, really) entry of all your arrays since this is the only valid join key for the graphLookup
        "fields.children": {
            $slice: [ "$fields.children", -1 ]
        }
    }
}, {
    $unwind: { // flatten "fields.children" array one more time because it was nested before
        path: "$fields.children",
        preserveNullAndEmptyArrays: true
    }
}, {
    $group: { // reduce the number of lookups required later by eliminating duplicate parent-child paths
        "_id": "$block_id",
        "block_type": { $first: "$block_type" },
        "definition": { $first: "$definition" },
        "fieldsFormat": { $first: "$fields.format" },
        "fieldsChildren": { $addToSet: "$fields.children" }
    }
}, {
    $project: { // restore original structure
        "block_id": "$_id",
        "block_type": "$block_type",
        "definition": "$definition",
        "fields": {
            "format": "$fieldsFormat",
            "children": "$fieldsChildren"
        }
    }
}, { // spit out the result into "modulestore.mapped0" collection, overwriting all existing content
    $out: 'modulestore.mapped0'
}])

and then

db.modulestore.mapped0.aggregate([{
    $graphLookup: {
        from: 'modulestore.mapped0',
        startWith: '$block_id',
        connectToField: 'fields.children',
        connectFromField: 'block_id',
        as: 'block_ids',
        maxDepth: 0
    }
}, { 
    $lookup: { 
        from: 'modulestore.mapped0', 
        localField: 'block_ids.fields.children', 
        foreignField: '_id', 
        as: 'block_ids.fields.children' 
    } 
}])

Upvotes: 1

A T
A T

Reputation: 13826

Partial solution [gist]:

def update_descendants(modulestore, blocks, ancestor_fields):
    """    
    :keyword modulestore: modulestore containing the blocks
    :type modulestore: ``Collection``   

    :keyword blocks: iterator over the blocks (collections within modulestore)
    :type blocks: ``Cursor`` | `tuple`

    :keyword ancestor_fields: fields of the top most ancestor
    :type ancestor_fields: ``dict``
    """
for block in blocks:
    modulestore.replace_one({'block_id': block['block_id'],
                             'block_type': block['block_type']},
                            update_d(block, add={'ancestor_fields': ancestor_fields},
                                     rm=('_id',)))
    update_descendants.counter += 1
    print 'Updated:', update_descendants.counter

    if 'children' in block and block['children']:
        for block_type, block_id in block['children']:
            update_descendants(modulestore,
                               modulestore.find({'block_id': block_id,
                                                 'block_type': block_type,
                                                 'ancestor_fields': {
                                                     '$exists': False
                                                 }}),
                               ancestor_fields)

Would prefer a solution that's wholly in the database though, and without all these inefficient queries.

Upvotes: 0

Related Questions