Reputation: 1070
I have the following query that I am running in Couchbase Enterprise (6.0.2 build 2413) against around 1 billion documents. What would be the most performant index to create for this query? (I need the report to finish within a specific time window, so getting the most speed out of the index is the main goal.)
select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
from `LogBucket` a
where LoggingType in [3001, 4004, 6002]
group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID
I tried creating the following index:
CREATE INDEX `data_job_productivity_index1`
ON `LogBucket`(`LogFileID`,`RowKey`,`LoggingType`,`LogJobID`,`CreateDate`,`SequenceID`)
PARTITION BY hash((meta().`id`)) WHERE (`LoggingType` in [3001, 4004, 6002])
but the query uses a different index (one dedicated to a different reporting query) when I check with EXPLAIN.
{
"plan": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"as": "a",
"index": "analyst_log_LogJob_activity",
"index_id": "f85999b9b7cc0d3f",
"index_projection": {
"primary_key": true
},
"keyspace": "LogBucket",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "3001",
"inclusion": 3,
"low": "3001"
}
]
},
{
"exact": true,
"range": [
{
"high": "4004",
"inclusion": 3,
"low": "4004"
}
]
},
{
"exact": true,
"range": [
{
"high": "6002",
"inclusion": 3,
"low": "6002"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"as": "a",
"keyspace": "LogBucket",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
},
{
"#operator": "InitialGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
},
{
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "a"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
}
]
}
},
{
"#operator": "IntermediateGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "FinalGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`a`.`LogJobID`)"
},
{
"as": "LoggingTypeID",
"expr": "(`a`.`LoggingType`)"
},
{
"as": "AffectedLineCount",
"expr": "count(*)"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
"text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n from `LogBucket` a\n where LoggingType in [3001, 4004, 6002]\n group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}
The index it chooses is defined as follows:
CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket`(`LoggingType`,`LogJobID`) PARTITION BY hash((meta().`id`))
The problem with this second index is that it has all 1 billion documents under it, whereas the index I am trying to create for this new report would hold significantly fewer entries because of the LoggingType WHERE clause.
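As a sanity check, the planner can be forced onto the new index with a USE INDEX hint while debugging (a sketch for testing only, not something to keep in the production query):
select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
from `LogBucket` a use index (`data_job_productivity_index1` using gsi)
where LoggingType in [3001, 4004, 6002]
group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID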
Upvotes: 1
Views: 373
Reputation: 7414
You can create a covering index as follows. LoggingType comes first so the IN predicate maps onto the leading index key, LogFileID and RowKey come next so the GROUP BY can be pushed down to the indexer, and the remaining keys make the query fully covered. Use the index WHERE clause only when all of your queries use the same LoggingType values; otherwise the partial index cannot serve them.
CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
(`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(META().`id`) WHERE LoggingType IN [3001, 4004, 6002];
SELECT LogJobID, LoggingTypeID, COUNT(1) AS AffectedLineCount
FROM (
SELECT MAX([CreateDate, SequenceID, {LoggingTypeID:LoggingType,LogJobID} ])[2].*
FROM `LogBucket` AS a
WHERE LoggingType IN [3001, 4004, 6002]
GROUP BY LogFileID, RowKey) AS a
GROUP BY LoggingTypeID, LogJobID;
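Note the {LoggingTypeID:LoggingType, LogJobID} object inside MAX: it replaces the whole document a from the original query, so the index no longer has to supply every field of the document, and [2].* flattens the two fields back out for the outer GROUP BY. This is what makes a covering index possible here.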
Make sure the inner subquery is covered and uses index aggregation: https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/
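For example, EXPLAIN on the rewritten query should show an IndexScan3 operator that carries a covers array and an index_group_aggs section, with no Fetch operator anywhere in the plan. A trimmed sketch of the relevant fragment (exact fields vary by version):
{
    "#operator": "IndexScan3",
    "index": "data_job_productivity_index1",
    "covers": [
        "cover ((`a`.`LoggingType`))",
        "..."
    ],
    "index_group_aggs": {
        "group": [ "..." ],
        "aggregates": [ "..." ]
    },
    "using": "gsi"
}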
Explore index replication for high availability and performance: https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html
If LoggingType, LogFileID, and RowKey are immutable, explore using them as partition keys instead of META().id: https://blog.couchbase.com/couchbase-gsi-index-partitioning/
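For example, hashing on the group keys instead of META().id keeps each (LogFileID, RowKey) group inside a single partition, so the grouping can be completed per partition (a sketch, assuming those fields never change after the document is written):
CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
(`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(`LogFileID`, `RowKey`)
WHERE LoggingType IN [3001, 4004, 6002];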
Also see https://blog.couchbase.com/create-right-index-get-right-performance/
Explore the Index Advisor: https://index-advisor.couchbase.com
Upvotes: 3