Reputation: 433
I have sample MongoDB data for MongoDB 3.2 below and want to optimize the pipeline:
{"_id": {"$oid":"5808578b33fa6f161c9747f8"},"_class":"exceltest.TestBean","bookName":"Test6","revenue":10.0,"unitsSold":1,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747f9"},"_class":"exceltest.TestBean","bookName":"Test1","revenue":11.0,"unitsSold":2,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747fa"},"_class":"exceltest.TestBean","bookName":"Test2","revenue":12.0,"unitsSold":3,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747fb"},"_class":"exceltest.TestBean","bookName":"Test3","revenue":13.0,"unitsSold":4,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747fc"},"_class":"exceltest.TestBean","bookName":"Test4","revenue":14.0,"unitsSold":5,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747fd"},"_class":"exceltest.TestBean","bookName":"Test5","revenue":15.0,"unitsSold":6,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747fe"},"_class":"exceltest.TestBean","bookName":"Test10","revenue":16.0,"unitsSold":7,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"5808578b33fa6f161c9747ff"},"_class":"exceltest.TestBean","bookName":"Test11","revenue":100.0,"unitsSold":100,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"},{"categoryCode":"Cooking/Beverages/Bartending"},{"categoryCode":"Food Receipe/Taste"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e462"},"_class":"exceltest.TestBean","bookName":"Test1","revenue":20.0,"unitsSold":10,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e463"},"_class":"exceltest.TestBean","bookName":"Test2","revenue":19.0,"unitsSold":9,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e464"},"_class":"exceltest.TestBean","bookName":"Test3","revenue":18.0,"unitsSold":8,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e465"},"_class":"exceltest.TestBean","bookName":"Test4","revenue":17.0,"unitsSold":7,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e466"},"_class":"exceltest.TestBean","bookName":"Test5","revenue":16.0,"unitsSold":6,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e467"},"_class":"exceltest.TestBean","bookName":"Test1","revenue":15.0,"unitsSold":5,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e468"},"_class":"exceltest.TestBean","bookName":"Test2","revenue":14.0,"unitsSold":4,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e469"},"_class":"exceltest.TestBean","bookName":"Test3","revenue":13.0,"unitsSold":3,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e46a"},"_class":"exceltest.TestBean","bookName":"Test4","revenue":12.0,"unitsSold":2,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
{"_id":{"$oid":"580857b833fa6f0c3499e46b"},"_class":"exceltest.TestBean","bookName":"Test5","revenue":11.0,"unitsSold":1,"bookCategory":[{"categoryCode":"Cooking/"},{"categoryCode":"Cooking/Beverages"},{"categoryCode":"Food Receipe/"},{"categoryCode":"Food Receipe/Bartending"}]}
I have the following query :
db.books.aggregate([
{$match:{'bookCategory.categoryCode' : 'Cooking/'}},
{$unwind:'$bookCategory'},
/* This unwind creating performance problems, with 1M records, with 100 (Book Categories / book) elements in subdocuments causes 100M documents to be retrieved, may cause memory problems for our memory size : 4GB */
/*Pipeline 1 */
{ "$group": {
"_id": {
"categoryCode": "$bookCategory.categoryCode",
"book": "$bookName"
},
"revenue": { $sum:"$revenue" },
"unitsSold": { $sum:"$unitsSold" }
}
}
,
{$match:{'_id.categoryCode' : {$regex : 'Cooking/'}}},
/*Pipeline 2 */
{ "$group": {
"_id": "$_id.categoryCode",
"books": {
"$push": {
// "category": "$_id.categoryCode",
"book":"$_id.book",
"revenue": { $sum:"$revenue" },
"unitsSold": { $sum:"$unitsSold" }
},
},
"topRevenue": { $sum: "$revenue" },
"topUnitsSold": { $sum:"$unitsSold" }
}},
{ "$sort": { "topRevenue": -1 } },
{ "$limit": 3},
{ "$project": {
"books": { "$slice": [ "$books", 3 ] },
"topRevenue": 1,
"topUnitsSold": 1
}}
])
When Pipeline 1 is executed, it generates 48 documents, with the following output:
/* 1 */
{
"_id" : {
"categoryCode" : "Food Receipe/Taste",
"book" : "Test1"
},
"revenue" : 11,
"unitsSold" : 2
}
/* 2 */
{
"_id" : {
"categoryCode" : "Cooking",
"book" : "Test6"
},
"revenue" : 10,
"unitsSold" : 1
}
/* 3 */
{
"_id" : {
"categoryCode" : "Food Receipe/Taste",
"book" : "Test2"
},
"revenue" : 12,
"unitsSold" : 3
}
/* 4 */
{
"_id" : {
"categoryCode" : "Food Receipe/Taste",
"book" : "Test6"
},
"revenue" : 10,
"unitsSold" : 1
} ...........................and so on
I want to solve this in the Pipeline 1 stage only. Please let me know whether this is possible with MongoDB 3.2, and whether it is possible with spring-data-mongodb.
Please help
Regards
Kris
Upvotes: 1
Views: 154
Reputation: 1200
You are doing the match in the first stage — keep it there. Then I would suggest you put as little data as possible through the pipeline, so after the match add a project.
For the second, you can use the array $filter operator, which looks something like:
$project: {
bookCategory: {
$filter: {
input: "$bookCategory",
as: "bookCat",
cond: { $eq: [ "$$bookCat.categoryCode", "Cooking/"] }
}
}
}
This should reduce the number of documents and make the aggregation more performant.
You could even eliminate the first match, because the projection eliminates the array items not matching the $eq statement.
Just try some combinations and see what works (and is the most performant) in your scenario.
UPDATE: I created a little aggregation with an array $filter to limit the number of items in your array (and so the cost of the unwind). This groups the books by category. I put the whole book document in the books array (for reference), but you can limit this to only the title (less data makes the aggregation faster).
db.collection.aggregate(
// Pipeline
[
// Stage 1
{
$project: {
bookName : 1,
revenue : 1,
unitsSold : 1,
bookCategory: {
$filter: {
input: "$bookCategory",
as: "bookCat",
cond: { $eq:[ 'Cooking', {$substr:["$$bookCat.categoryCode",0,7]}] }
}
}
}
},
// Stage 2
{
$unwind: "$bookCategory"
},
// Stage 3
{
$group: {
_id: {
categoryCode: "$bookCategory.categoryCode",
},
books : { $push: "$$ROOT" },
revenue: { $sum:"$revenue" },
unitsSold: { $sum:"$unitsSold" }
}
}
]
);
I assume you create this aggregation in code, so you can build the array filter depending on the value you want (Food / Cooking).
Upvotes: 1