scorpion5211
scorpion5211

Reputation: 1070

Couchbase N1QL Index for 1 billion documents

I have the following query that I am running in Couchbase enterprise (6.0.2 build 2413) against around 1Billion documents. What would be the most performant index to create based on this query? (wanting to finish the report under specific time period, so getting the most speed out of index is the main goal)

select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
    select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
    from `LogBucket` a
    where LoggingType in [3001, 4004, 6002]
    group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID

I tried created the following index:

CREATE INDEX `data_job_productivity_index1` 
ON `LogBucket`(`LogFileID`,`RowKey`,`LoggingType`,`LogJobID`,`CreateDate`,`SequenceID`) 
PARTITION BY hash((meta().`id`)) WHERE (`LoggingType` in [3001, 4004, 6002])

but it is using a different index (one dedicated for different reporting query) when I check with explain.

{
  "plan": {
    "#operator": "Sequence",
    "~children": [
      {
        "#operator": "Sequence",
        "~children": [
          {
            "#operator": "IndexScan3",
            "as": "a",
            "index": "analyst_log_LogJob_activity",
            "index_id": "f85999b9b7cc0d3f",
            "index_projection": {
              "primary_key": true
            },
            "keyspace": "LogBucket",
            "namespace": "default",
            "spans": [
              {
                "exact": true,
                "range": [
                  {
                    "high": "3001",
                    "inclusion": 3,
                    "low": "3001"
                  }
                ]
              },
              {
                "exact": true,
                "range": [
                  {
                    "high": "4004",
                    "inclusion": 3,
                    "low": "4004"
                  }
                ]
              },
              {
                "exact": true,
                "range": [
                  {
                    "high": "6002",
                    "inclusion": 3,
                    "low": "6002"
                  }
                ]
              }
            ],
            "using": "gsi"
          },
          {
            "#operator": "Fetch",
            "as": "a",
            "keyspace": "LogBucket",
            "namespace": "default"
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "Filter",
                  "condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
                },
                {
                  "#operator": "InitialGroup",
                  "aggregates": [
                    "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
                  ],
                  "group_keys": [
                    "(`a`.`LogFileID`)",
                    "(`a`.`RowKey`)"
                  ]
                }
              ]
            }
          },
          {
            "#operator": "IntermediateGroup",
            "aggregates": [
              "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
            ],
            "group_keys": [
              "(`a`.`LogFileID`)",
              "(`a`.`RowKey`)"
            ]
          },
          {
            "#operator": "FinalGroup",
            "aggregates": [
              "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
            ],
            "group_keys": [
              "(`a`.`LogFileID`)",
              "(`a`.`RowKey`)"
            ]
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "InitialProject",
                  "result_terms": [
                    {
                      "expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
                    },
                    {
                      "expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
                    }
                  ]
                },
                {
                  "#operator": "FinalProject"
                }
              ]
            }
          }
        ]
      },
      {
        "#operator": "Alias",
        "as": "a"
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "InitialGroup",
              "aggregates": [
                "count(*)"
              ],
              "group_keys": [
                "(`a`.`LoggingType`)",
                "(`a`.`LogJobID`)"
              ]
            }
          ]
        }
      },
      {
        "#operator": "IntermediateGroup",
        "aggregates": [
          "count(*)"
        ],
        "group_keys": [
          "(`a`.`LoggingType`)",
          "(`a`.`LogJobID`)"
        ]
      },
      {
        "#operator": "FinalGroup",
        "aggregates": [
          "count(*)"
        ],
        "group_keys": [
          "(`a`.`LoggingType`)",
          "(`a`.`LogJobID`)"
        ]
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "InitialProject",
              "result_terms": [
                {
                  "expr": "(`a`.`LogJobID`)"
                },
                {
                  "as": "LoggingTypeID",
                  "expr": "(`a`.`LoggingType`)"
                },
                {
                  "as": "AffectedLineCount",
                  "expr": "count(*)"
                }
              ]
            },
            {
              "#operator": "FinalProject"
            }
          ]
        }
      }
    ]
  },
  "text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n    select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n    from `LogBucket` a\n    where LoggingType in [3001, 4004, 6002]\n    group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}

The index that it chooses to use is created like so:

CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket`(`LoggingType`,`LogJobID`) PARTITION BY hash((meta().`id`)) 

The problem with this second index is that it has all 1 billion documents under the index vs. the one I am trying to create/dedicate for this new report will have significantly less because of the LoggingType where clause.

Upvotes: 1

Views: 373

Answers (1)

vsr
vsr

Reputation: 7414

You can create covered index as follows. Use index WHERE clause only when all queries use same LoggingType values.

CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
           (`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(META().`id`) WHERE LoggingType IN [3001, 4004, 6002];

SELECT LogJobID, LoggingTypeID, COUNT(1) AS AffectedLineCount
FROM (
    SELECT MAX([CreateDate, SequenceID, {LoggingTypeID:LoggingType,LogJobID} ])[2].*
    FROM `LogBucket` AS a
    WHERE LoggingType IN [3001, 4004, 6002]
    GROUP BY LogFileID, RowKey) AS a
GROUP BY LoggingTypeID, LogJobID;

Upvotes: 3

Related Questions