Reputation:
I have 970,000,000 documents of the same structure, in collection pdb_atoms example:
{
"_id" : ObjectId("53b070793a39c4ee55fb3a90"),
"pdb_id" : "pdb263d",
"serial" : 20,
"name" : "O5*",
"res" : "DG",
"res_seq" : 2,
"chain_id" : "A",
"x" : 22.819,
"y" : 30.64,
"z" : 85.707
}
I would like to create a new collection pdb which is populated with pdb documents which consist of a pdb_id property and an Array (atoms) which holds many records of the above type for the same pdb_id.
I have written the script below but it doesn't seem to work: -
// Aggregate all pdb_atom documents for a given pdb_id into one single document
// containing an array of atoms for that given pdb_id
db.pdb_atoms.distinct("pdb_id").forEach( function(pdb_id)
{
printjson(pdb_id);
var arr_pdb_atoms = db.pdb_atoms.find( pdb_id ).toArray();
// Aggregate the pdb_atoms into one pdb document with an array of pdb_atoms
var pdb_json = {
"pdb_id" : pdb_id,
"atoms" : arr_pdb_atoms
};
// Save new pdb document to pdb collection
db.pdb.save(pdb_json);
}
);
Error message: "pdb263d" Wed Jul 2 16:03:02.252 JavaScript execution failed: error: { "$err" : "JavaScript execution failed: ReferenceError: pdb263d is not defined", "code" : 16722 } at src/mongo/shell/query.js:L128
It fails at the first item "pdb263d" from the query result.
Maybe using a cursor to iterate over the entire pdb_atom collection and inserting only when a different pdb_id is encountered would be more efficient.
Any suggestions would be helpful.
Upvotes: 2
Views: 2802
Reputation: 12904
This can be done using a single aggregate query like below. If you are using MongoDB 2.6, you can leverage the $out operator to direct the output to a collection.
db.pdb_atoms.aggregate([
{
$group : {
_id : "$pdb_id",
atoms : {
$push : {
_id:"$_id",
pdb_id:"$pdb_id",
serial:"$serial",
name:"$name",
res:"$res",
res_seq:"$res_seq",
chain_id:"$chain_id",
x:"$x",
y:"$y",
z:"$z"
}
}
}
},
{
$out : "pdb"
}
])
Upvotes: 2