Reputation: 1070
I have the following query that I am running in Couchbase Enterprise (6.0.2 build 2413) against around 1 billion documents. What would be the most performant index to create for this query? (I need the report to finish within a specific time window, so getting the most speed out of the index is the main goal.)
select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
from `LogBucket` a
where LoggingType in [3001, 4004, 6002]
group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID
I tried creating the following index:
CREATE INDEX `data_job_productivity_index1`
ON `LogBucket`(`LogFileID`,`RowKey`,`LoggingType`,`LogJobID`,`CreateDate`,`SequenceID`)
PARTITION BY hash((meta().`id`)) WHERE (`LoggingType` in [3001, 4004, 6002])
but the query uses a different index (one dedicated to a different reporting query) when I check with EXPLAIN.
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"as": "a",
"index": "analyst_log_LogJob_activity",
"index_id": "f85999b9b7cc0d3f",
"index_projection": {
"primary_key": true
},
"keyspace": "LogBucket",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "3001",
"inclusion": 3,
"low": "3001"
}
]
},
{
"exact": true,
"range": [
{
"high": "4004",
"inclusion": 3,
"low": "4004"
}
]
},
{
"exact": true,
"range": [
{
"high": "6002",
"inclusion": 3,
"low": "6002"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"as": "a",
"keyspace": "LogBucket",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
},
{
"#operator": "InitialGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
},
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "a"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`a`.`LogJobID`)"
},
{
"as": "LoggingTypeID",
"expr": "(`a`.`LoggingType`)"
},
{
"as": "AffectedLineCount",
"expr": "count(*)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n from `LogBucket` a\n where LoggingType in [3001, 4004, 6002]\n group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}
The index it chooses is defined as follows:
CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket`(`LoggingType`,`LogJobID`) PARTITION BY hash((meta().`id`))
The problem with this second index is that it has all 1 billion documents under it, whereas the index I am trying to create for this new report would hold significantly fewer entries because of the LoggingType WHERE clause.
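As a sanity check, the planner can be forced onto the new index with a USE INDEX hint while debugging (a sketch for testing only, not something to keep in the production query):
select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
from `LogBucket` a use index (`data_job_productivity_index1` using gsi)
where LoggingType in [3001, 4004, 6002]
group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID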
Upvotes: 1
Views: 373
Reputation: 7414
You can create a covering index as follows. LoggingType comes first so the IN predicate maps onto the leading index key, LogFileID and RowKey come next so the GROUP BY can be pushed down to the indexer, and the remaining keys make the query fully covered. Use the index WHERE clause only when all of your queries use the same LoggingType values; otherwise the partial index cannot serve them.
CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
(`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(META().`id`) WHERE LoggingType IN [3001, 4004, 6002];
SELECT LogJobID, LoggingTypeID, COUNT(1) AS AffectedLineCount
FROM (
SELECT MAX([CreateDate, SequenceID, {LoggingTypeID:LoggingType,LogJobID} ])[2].*
FROM `LogBucket` AS a
WHERE LoggingType IN [3001, 4004, 6002]
GROUP BY LogFileID, RowKey) AS a
GROUP BY LoggingTypeID, LogJobID;
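Note the {LoggingTypeID:LoggingType, LogJobID} object inside MAX: it replaces the whole document a from the original query, so the index no longer has to supply every field of the document, and [2].* flattens the two fields back out for the outer GROUP BY. This is what makes a covering index possible here.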
Make sure the inner subquery is covered and uses index aggregation: https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/
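For example, EXPLAIN on the rewritten query should show an IndexScan3 operator that carries a covers array and an index_group_aggs section, with no Fetch operator anywhere in the plan. A trimmed sketch of the relevant fragment (exact fields vary by version):
{
    "#operator": "IndexScan3",
    "index": "data_job_productivity_index1",
    "covers": [
        "cover ((`a`.`LoggingType`))",
        "..."
    ],
    "index_group_aggs": {
        "group": [ "..." ],
        "aggregates": [ "..." ]
    },
    "using": "gsi"
}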
Explore index replication for high availability and performance: https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html
If LoggingType, LogFileID, and RowKey are immutable, explore using them as partition keys instead of META().id: https://blog.couchbase.com/couchbase-gsi-index-partitioning/
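For example, hashing on the group keys instead of META().id keeps each (LogFileID, RowKey) group inside a single partition, so the grouping can be completed per partition (a sketch, assuming those fields never change after the document is written):
CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
(`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(`LogFileID`, `RowKey`)
WHERE LoggingType IN [3001, 4004, 6002];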
Also see https://blog.couchbase.com/create-right-index-get-right-performance/
Explore the Index Advisor: https://index-advisor.couchbase.com
Upvotes: 3