Reputation: 897
We do have a collection with around 20million documents of this structure:
{
_id: ...
learnerId: <string>
type: <string>
organizationId: <string>
timestamp: ISODate
payload: {
learningPackageId: <string>
provenLearningProgress: <number>,
stabilityOfMemory: <number>,
numberOfActiveQuestions: <number>,
numberOfQuestionsInLearningPackage: <number>,
numberOfQuestionsInLearningPackage: <number>
}
}
I want to have a query that returns me the latest payload grouped by laernerId, organizationId and then inside by learningPackageId.
The idea of the query is to:
So I end up with a structure like this:
{
_id: {
learnerId,
oganizationId
},
learningPackages: [
{ learningPackageId, payload},
{ learningPackageId, payload},
]
}
Writing this query is quite easy. But I'm strugling in writing it in a way that it performs.
My current approach is this:
I've created a compound index covering this fields:
I've changed the $first of the complete payload to multiple $first for each value on payload.
So my final query is this:
[
{
$match: {
organizationId: "OrgId",
"learnerId": {
$in: [
"LearnerId1",
"LearnerId2"
]
},
type: "LearningProgressUpdate",
learningPackageId: "learningPackageId",
timestamp: {
$lt: new ISODate(
"2025-01-24T22:59:59.999Z"
)
}
}
},
{
$sort: {
timestamp: -1
}
},
{
$group: {
_id: {
learnerId: "$learnerId",
organizationId: "$organizationId",
learningPackageId: "$payload.learningPackageId"
},
lpId: {
$first: "$payload.learningPackageId"
},
lpPLP: {
$first: "$payload.provenLearningProgress"
},
lpSOM: {
$first: "$payload.stabilityOfMemory"
},
lpNOAQ: {
$first: "$payload.numberOfActiveQuestions"
},
lpNOQILP: {
$first: "$payload.numberOfQuestionsInLearningPackage"
},
lpTP: {
$first:
"$payload.timeOfProof"
},
timestamp: {
$first: "$timestamp"
}
}
},
{
$group: {
_id: {
learnerId: "$_id.learnerId",
organizationId: "$_id.organizationId"
},
learningPackages: {
$push: {
timestamp: "$timestamp",
lpId: "$lpId",
lpPLP: "$lpPLP",
lpNOAQ: "$lpNOAQ",
lpNOQILP: "$lpNOQILP",
lpTP: "$lpTP",
lpSOM: "$lpSOM"
}
}
}
},
{
$project: {
_id: 0,
learnerId: "$_id.learnerId",
organizationId: "$_id.organizationId",
learningPackages: 1
}
}
]
With this approach, I do have a full covered query, so no document fetching. Still it takes around 10-15 seconds depending on what is matched in $match.
My questions on this:
Here some additional information:
WinningPlan on the query using multiple $first
{
"isCached": false,
"queryPlan": {
"stage": "GROUP",
"planNodeId": 4,
"inputStage": {
"stage": "GROUP",
"planNodeId": 3,
"inputStage": {
"stage": "IXSCAN",
"planNodeId": 1,
"keyPattern": {
"type": 1,
"organizationId": 1,
"timestamp": 1,
"payload.learningPackageId": 1,
"learnerId": 1,
"payload.provenLearningProgress": 1,
"payload.timeOfProof": 1,
"payload.numberOfActiveQuestions": 1,
"payload.numberOfQuestionsInLearningPackage": 1,
"payload.stabilityOfMemory": 1
},
"indexName": "speed_test1",
"isMultiKey": false,
"multiKeyPaths": {
"type": [],
"organizationId": [],
"timestamp": [],
"payload.learningPackageId": [],
"learnerId": [],
"payload.provenLearningProgress": [],
"payload.timeOfProof": [],
"payload.numberOfActiveQuestions": [],
"payload.numberOfQuestionsInLearningPackage": [],
"payload.stabilityOfMemory": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "backward",
"indexBounds": {
"type": [
"HIDDEN"
],
"organizationId": [
"HIDDEN"
],
"timestamp": [
"HIDDEN"
],
"payload.learningPackageId": [
"HIDDEN"
],
"learnerId": [ "HIDDEN"],
"payload.provenLearningProgress": [
"[MaxKey, MinKey]"
],
"payload.timeOfProof": [
"[MaxKey, MinKey]"
],
"payload.numberOfActiveQuestions": [
"[MaxKey, MinKey]"
],
"payload.numberOfQuestionsInLearningPackage": [
"[MaxKey, MinKey]"
],
"payload.stabilityOfMemory": [
"[MaxKey, MinKey]"
]
}
}
}
},
"slotBasedPlan": {
"slots": "$$RESULT=s70 env: { s32 = IndexBounds(\"field #0['type']: [CollationKey(0x4c6561726e696e6750726f6772657373557064617465), CollationKey(0x4c6561726e696e6750726f6772657373557064617465)], field #1['organi\"...), s36 = Nothing, s40 = true }",
"stages": "[4] project [s70 = newObj(\"_id\", s68, \"learningPackages\", s69)] \n[4] project [s68 = newObj(\"learnerId\", s64, \"organizationId\", s65), s69 = getElement(s67, 0)] \n[4] group [s64, s65] [s67 = addToArrayCapped(newObj(\"timestamp\", s62, \"lpId\", s56, \"lpPLP\", s57, \"lpNOAQ\", s59, \"lpNOQILP\", s60, \"lpTP\", s61, \"lpSOM\", s58), 104857600)] spillSlots[s66] mergingExprs[aggConcatArraysCapped(s66, 104857600)] \n[4] project [s64 = traverseP(s63, lambda(l15.0) { getField(move(l15.0), \"learnerId\") }, 1), s65 = traverseP(s63, lambda(l16.0) { getField(move(l16.0), \"organizationId\") }, 1)] \n[3] project [s63 = newObj(\"learnerId\", s5, \"organizationId\", s2, \"learningPackageId\", s45)] \n[3] group [s5, s2, s45] [s56 = first((s45 ?: null)), s57 = first((s48 ?: null)), s58 = first((s43 ?: null)), s59 = first((s47 ?: null)), s60 = first((s46 ?: null)), s61 = first((s44 ?: null)), s62 = first((s3 ?: null))] spillSlots[s49, s50, s51, s52, s53, s54, s55] mergingExprs[first((s49 ?: null)), first((s50 ?: null)), first((s51 ?: null)), first((s52 ?: null)), first((s53 ?: null)), first((s54 ?: null)), first((s55 ?: null))] \n[3] project [s43 = traverseP(s42, lambda(l7.0) { getField(move(l7.0), \"stabilityOfMemory\") }, 1), s44 = traverseP(s42, lambda(l8.0) { getField(move(l8.0), \"timeOfProof\") }, 1), s45 = traverseP(s42, lambda(l9.0) { getField(move(l9.0), \"learningPackageId\") }, 1), s46 = traverseP(s42, lambda(l10.0) { getField(move(l10.0), \"numberOfQuestionsInLearningPackage\") }, 1), s47 = traverseP(s42, lambda(l11.0) { getField(move(l11.0), \"numberOfActiveQuestions\") }, 1), s48 = traverseP(s42, lambda(l12.0) { getField(move(l12.0), \"provenLearningProgress\") }, 1)] \n[1] project [s42 = getField(s41, \"payload\")] \n[1] project [s41 = newObj(\"type\", s1, \"organizationId\", s2, \"timestamp\", s3, \"payload\", newObj(\"learningPackageId\", s4, \"provenLearningProgress\", s6, \"timeOfProof\", s7, \"numberOfActiveQuestions\", s8, \"numberOfQuestionsInLearningPackage\", s9, \"stabilityOfMemory\", s10), \"learnerId\", s5)] \n[1] branch {s40} [s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s39] \n[s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s31] [1] ixscan_generic s32 none s31 none none lowPriority [s11 = 0, s12 = 1, s13 = 2, s14 = 3, s15 = 4, s16 = 5, s17 = 6, s18 = 7, s19 = 8, s20 = 9] @\"7299d749-3274-4950-8600-ccdb65d12b9a\" @\"speed_test1\" false \n[s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s33] [1] nlj inner [] [s34, s35] \n left \n [1] project [s34 = getField(s37, \"l\"), s35 = getField(s37, \"h\")] \n [1] unwind s37 s38 s36 false \n [1] limit 1ll \n [1] coscan \n right \n [1] ixseek s34 s35 none s33 none none [s21 = 0, s22 = 1, s23 = 2, s24 = 3, s25 = 4, s26 = 5, s27 = 6, s28 = 7, s29 = 8, s30 = 9] @\"7299d749-3274-4950-8600-ccdb65d12b9a\" @\"speed_test1\" false \n"
}
}
Execution stats of the same query:
"executionStats": {
"executionSuccess": true,
"nReturned": 1529,
"executionTimeMillis": 12418,
"totalKeysExamined": 2071287,
"totalDocsExamined": 0,
"executionStages": {
"stage": "project",
"planNodeId": 4,
"nReturned": 1529,
"executionTimeMillisEstimate": 12361,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"70": "newObj(\"_id\", s68, \"learningPackages\", s69) "
},
"inputStage": {
"stage": "project",
"planNodeId": 4,
"nReturned": 1529,
"executionTimeMillisEstimate": 12361,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"68": "newObj(\"learnerId\", s64, \"organizationId\", s65) ",
"69": "getElement(s67, 0) "
},
"inputStage": {
"stage": "group",
"planNodeId": 4,
"nReturned": 1529,
"executionTimeMillisEstimate": 12361,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"groupBySlots": [64, 65],
"expressions": {
"67": "addToArrayCapped(newObj(\"timestamp\", s62, \"lpId\", s56, \"lpPLP\", s57, \"lpNOAQ\", s59, \"lpNOQILP\", s60, \"lpTP\", s61, \"lpSOM\", s58), 104857600) ",
"initExprs": { "67": null }
},
"mergingExprs": {
"66": "aggConcatArraysCapped(s66, 104857600) "
},
"usedDisk": false,
"spills": 0,
"spilledBytes": 0,
"spilledRecords": 0,
"spilledDataStorageSize": 0,
"inputStage": {
"stage": "project",
"planNodeId": 4,
"nReturned": 1529,
"executionTimeMillisEstimate": 12361,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"64": "traverseP(s63, lambda(l15.0) { getField(move(l15.0), \"learnerId\") }, 1) ",
"65": "traverseP(s63, lambda(l16.0) { getField(move(l16.0), \"organizationId\") }, 1) "
},
"inputStage": {
"stage": "project",
"planNodeId": 3,
"nReturned": 1529,
"executionTimeMillisEstimate": 12361,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"63": "newObj(\"learnerId\", s5, \"organizationId\", s2, \"learningPackageId\", s45) "
},
"inputStage": {
"stage": "group",
"planNodeId": 3,
"nReturned": 1529,
"executionTimeMillisEstimate": 12356,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"groupBySlots": [5, 2, 45],
"expressions": {
"56": "first((s45 ?: null)) ",
"57": "first((s48 ?: null)) ",
"58": "first((s43 ?: null)) ",
"59": "first((s47 ?: null)) ",
"60": "first((s46 ?: null)) ",
"61": "first((s44 ?: null)) ",
"62": "first((s3 ?: null)) ",
"initExprs": {
"56": null,
"57": null,
"58": null,
"59": null,
"60": null,
"61": null,
"62": null
}
},
"mergingExprs": {
"49": "first((s49 ?: null)) ",
"50": "first((s50 ?: null)) ",
"51": "first((s51 ?: null)) ",
"52": "first((s52 ?: null)) ",
"53": "first((s53 ?: null)) ",
"54": "first((s54 ?: null)) ",
"55": "first((s55 ?: null)) "
},
"usedDisk": false,
"spills": 0,
"spilledBytes": 0,
"spilledRecords": 0,
"spilledDataStorageSize": 0,
"inputStage": {
"stage": "project",
"planNodeId": 3,
"nReturned": 1686758,
"executionTimeMillisEstimate": 10905,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"43": "traverseP(s42, lambda(l7.0) { getField(move(l7.0), \"stabilityOfMemory\") }, 1) ",
"44": "traverseP(s42, lambda(l8.0) { getField(move(l8.0), \"timeOfProof\") }, 1) ",
"45": "traverseP(s42, lambda(l9.0) { getField(move(l9.0), \"learningPackageId\") }, 1) ",
"46": "traverseP(s42, lambda(l10.0) { getField(move(l10.0), \"numberOfQuestionsInLearningPackage\") }, 1) ",
"47": "traverseP(s42, lambda(l11.0) { getField(move(l11.0), \"numberOfActiveQuestions\") }, 1) ",
"48": "traverseP(s42, lambda(l12.0) { getField(move(l12.0), \"provenLearningProgress\") }, 1) "
},
"inputStage": {
"stage": "project",
"planNodeId": 1,
"nReturned": 1686758,
"executionTimeMillisEstimate": 9878,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"42": "getField(s41, \"payload\") "
},
"inputStage": {
"stage": "project",
"planNodeId": 1,
"nReturned": 1686758,
"executionTimeMillisEstimate": 9732,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"projections": {
"41": "newObj(\"type\", s1, \"organizationId\", s2, \"timestamp\", s3, \"payload\", newObj(\"learningPackageId\", s4, \"provenLearningProgress\", s6, \"timeOfProof\", s7, \"numberOfActiveQuestions\", s8, \"numberOfQuestionsInLearningPackage\", s9, \"stabilityOfMemory\", s10), \"learnerId\", s5) "
},
"inputStage": {
"stage": "branch",
"planNodeId": 1,
"nReturned": 1686758,
"executionTimeMillisEstimate": 6374,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"numTested": 1,
"thenBranchOpens": 1,
"thenBranchCloses": 1,
"elseBranchOpens": 0,
"elseBranchCloses": 0,
"filter": "s40 ",
"thenSlots": [
11, 12, 13, 14,
15, 16, 17, 18,
19, 20, 31
],
"elseSlots": [
21, 22, 23, 24,
25, 26, 27, 28,
29, 30, 33
],
"outputSlots": [
1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 39
],
"thenStage": {
"stage": "ixscan_generic",
"planNodeId": 1,
"nReturned": 1686758,
"executionTimeMillisEstimate": 6360,
"opens": 1,
"closes": 1,
"saveState": 640,
"restoreState": 640,
"isEOF": 1,
"indexName": "speed_test1",
"keysExamined": 2071287,
"seeks": 384529,
"numReads": 2071287,
"recordIdSlot": 31,
"outputSlots": [
11, 12, 13, 14,
15, 16, 17, 18,
19, 20
],
"indexKeysToInclude": "00000000000000000000001111111111"
},
"elseStage": {
"stage": "nlj",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 640,
"restoreState": 640,
"isEOF": 0,
"totalDocsExamined": 0,
"totalKeysExamined": 0,
"collectionScans": 0,
"collectionSeeks": 0,
"indexScans": 0,
"indexSeeks": 0,
"indexesUsed": [
"speed_test1"
],
"innerOpens": 0,
"innerCloses": 0,
"outerProjects": [],
"outerCorrelated": [
34, 35
],
"outerStage": {
"stage": "project",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 640,
"restoreState": 640,
"isEOF": 0,
"projections": {
"34": "getField(s37, \"l\") ",
"35": "getField(s37, \"h\") "
},
"inputStage": {
"stage": "unwind",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 640,
"restoreState": 640,
"isEOF": 0,
"inputSlot": 36,
"outSlot": 37,
"outIndexSlot": 38,
"preserveNullAndEmptyArrays": 0,
"inputStage": {
"stage": "limit",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 640,
"restoreState": 640,
"isEOF": 0,
"inputStage": {
"stage": "coscan",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 640,
"restoreState": 640,
"isEOF": 0
}
}
}
},
"innerStage": {
"stage": "ixseek",
"planNodeId": 1,
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"opens": 0,
"closes": 0,
"saveState": 640,
"restoreState": 640,
"isEOF": 0,
"indexName": "speed_test1",
"keysExamined": 0,
"seeks": 0,
"numReads": 0,
"recordIdSlot": 33,
"outputSlots": [
21, 22, 23,
24, 25, 26,
27, 28, 29, 30
],
"indexKeysToInclude": "00000000000000000000001111111111",
"seekKeyLow": "s34 ",
"seekKeyHigh": "s35 "
}
}
}
}
}
}
}
}
}
}
}
}
}
I added the same info for the query with only one $first in the first reply, as I otherwise hit the limit of characters.
Upvotes: 2
Views: 106
Reputation: 59563
Try this one:
[
{
$match: {
organizationId: "OrgId",
"learnerId": {
$in: [
"LearnerId1",
"LearnerId2"
]
},
type: "LearningProgressUpdate",
learningPackageId: "learningPackageId",
timestamp: {
$lt: new ISODate(
"2025-01-24T22:59:59.999Z"
)
}
}
},
{
$sort: {
learnerId: 1, organizationId: 1
}
},
{
$group: {
_id: {
learnerId: "$learnerId",
organizationId: "$organizationId",
learningPackageId: "$payload.learningPackageId"
},
payload: { $top: { sortBy: { timestamp: -1}, output: "$payload" },
timestamp: { $first: "$timestamp" }
}
},
$top
could be faster than sorting the entire collection. The first $sort
stage should support the $group
, see $group Performance Optimizations
Upvotes: 3