user3783854
user3783854

Reputation:

Mongo db aggregate and convert multiple documents into single document with an array

I have 970,000,000 documents of the same structure, in collection pdb_atoms example:

{
    "_id" : ObjectId("53b070793a39c4ee55fb3a90"),
    "pdb_id" : "pdb263d",
    "serial" : 20,
    "name" : "O5*",
    "res" : "DG",
    "res_seq" : 2,
    "chain_id" : "A",
    "x" : 22.819,
    "y" : 30.64,
    "z" : 85.707
}

I would like to create a new collection pdb which is populated with pdb documents which consist of a pdb_id property and an Array (atoms) which holds many records of the above type for the same pdb_id.

I have written the script below but it doesn't seem to work: -

// Aggregate all pdb_atom documents for a given pdb_id into one single document
// containing an array of atoms for that given pdb_id
db.pdb_atoms.distinct("pdb_id").forEach( function(pdb_id)
{       
  printjson(pdb_id);
  var arr_pdb_atoms = db.pdb_atoms.find( pdb_id ).toArray();
  // Aggregate the pdb_atoms into one pdb document with an array of pdb_atoms
  var pdb_json = {
                "pdb_id" : pdb_id,
                "atoms" : arr_pdb_atoms
                }; 
  // Save new pdb document to pdb collection
  db.pdb.save(pdb_json);
}
);

Error message: "pdb263d" Wed Jul 2 16:03:02.252 JavaScript execution failed: error: { "$err" : "JavaScript execution failed: ReferenceError: pdb263d is not defined", "code" : 16722 } at src/mongo/shell/query.js:L128

It fails at the first item "pdb263d" from the query result.

Maybe using a cursor to iterate over the entire pdb_atom collection and inserting only when a different pdb_id is encountered would be more efficient.

Any suggestions would be helpful.

Upvotes: 2

Views: 2802

Answers (1)

Anand Jayabalan
Anand Jayabalan

Reputation: 12904

This can be done using a single aggregate query like below. If you are using MongoDB 2.6, you can leverage the $out operator to direct the output to a collection.

db.pdb_atoms.aggregate([
    {
        $group : {
            _id : "$pdb_id", 
            atoms : {
                $push : {
                    _id:"$_id", 
                    pdb_id:"$pdb_id", 
                    serial:"$serial", 
                    name:"$name", 
                    res:"$res", 
                    res_seq:"$res_seq", 
                    chain_id:"$chain_id", 
                    x:"$x", 
                    y:"$y", 
                    z:"$z"
                }
            }
        }
    },
    {
        $out : "pdb"
    }
])

Upvotes: 2

Related Questions