Reputation: 45
I am experimenting with CB for a uni project. I've loaded a bucket called tweets with 5000000 docs and a bucket called users with 2000000 docs.
Each Tweet-doc has the same schema, with some attributes like "is_retweet" ("true" or "false"), "text" (text of the tweet), "timestamp" and "user_id" which is the id of the user who tweeted it.
Each User has and id and may have a list of followers (id of other users).
I would like to create a query that retrieve the 20 most seen tweets. This means that I look for those tweets which have the field "is_retweet" = "true" and a user with a lot of followers.
I have created indexes on the ARRAY_LENGTH of followers, on the attribute "is_retweet" and on "user_id" field:
CREATE INDEX `idx_followers_length` ON `users`(array_length(`followers`)) WHERE (100 < array_length(`followers`))
CREATE INDEX `idx_retweet` ON `tweets`(`is_retweet`) WHERE (`is_retweet` = "true")
CREATE INDEX `idx_users_on_tweets` ON `tweets`(`user_id`)
Thanks to those indexes, the execution of partial queries is quite fast. "Partial Queries" are:
Get the 20 most followed users, based on the length of "followers" array
SELECT id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100
ORDER BY ARRAY_LENGTH(followers) DESC
LIMIT 20
Select the text and the timestamp (month/day) of tweets that have been retweeted
SELECT DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(timestamp)), 'day') AS day,
text
FROM tweets
WHERE is_retweet = "true"
The problem is that when I try the join, the query just runs forever (more than 30 mins). Here is the query (maybe it is wrong) and the explain (the query uses all the indexes mentioned above):
SELECT u.id,
u.followers_num,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
t.text
FROM tweets AS t
JOIN (SELECT id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100)
AS u ON t.user_id = META(u).id
WHERE t.is_retweet = "true"
ORDER BY u.followers_num DESC
LIMIT 20
{
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IntersectScan",
"scans": [
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_retweet",
"index_id": "437d590a2e220ed4",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"high": "\"true\"",
"inclusion": 3,
"low": "\"true\""
}
]
}
],
"using": "gsi"
},
{
"#operator": "IndexScan3",
"as": "t",
"index": "idx_users_on_tweets",
"index_id": "c93f6f0be887553",
"index_projection": {
"primary_key": true
},
"keyspace": "tweets",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "null"
}
]
}
],
"using": "gsi"
}
]
},
{
"#operator": "Fetch",
"as": "t",
"keyspace": "tweets",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "NestedLoopJoin",
"alias": "u",
"on_clause": "((`t`.`user_id`) = (meta(`u`).`id`))",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Sequence",
"~children": [
{
"#operator": "IndexScan3",
"index": "idx_followers_length_2",
"index_id": "b5cc45b51847b40f",
"index_projection": {
"primary_key": true
},
"keyspace": "users",
"namespace": "default",
"spans": [
{
"exact": true,
"range": [
{
"inclusion": 0,
"low": "100"
}
]
}
],
"using": "gsi"
},
{
"#operator": "Fetch",
"keyspace": "users",
"namespace": "default"
},
{
"#operator": "Parallel",
"~child": {
"#operator": "Sequence",
"~children": [
{
"#operator": "Filter",
"condition": "(100 < array_length((`users`.`followers`)))"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`users`.`id`)"
},
{
"as": "followers_num",
"expr": "array_length((`users`.`followers`))"
}
]
},
{
"#operator": "FinalProject"
}
]
}
}
]
},
{
"#operator": "Alias",
"as": "u"
}
]
}
},
{
"#operator": "Filter",
"condition": "((`t`.`is_retweet`) = \"true\")"
},
{
"#operator": "InitialProject",
"result_terms": [
{
"expr": "(`u`.`id`)"
},
{
"expr": "(`u`.`followers_num`)"
},
{
"as": "month",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"month\")"
},
{
"as": "day",
"expr": "date_part_str(millis_to_str(to_number((`t`.`timestamp`))), \"day\")"
},
{
"expr": "(`t`.`text`)"
}
]
}
]
}
}
]
},
{
"#operator": "Order",
"limit": "20",
"sort_terms": [
{
"desc": true,
"expr": "(`u`.`followers_num`)"
}
]
},
{
"#operator": "Limit",
"expr": "20"
},
{
"#operator": "FinalProject"
}
]
}
Upvotes: 2
Views: 325
Reputation: 7414
N1QL Joins are LEFT to RIGHT. And you have Pagination query with ORDER BY, which must produce all possible values even though you need 20 items.
CREATE INDEX `ix1` ON `users`(array_length(`followers`) DESC) WHERE (100 < array_length(`followers`));
CREATE INDEX `ix2` ON `tweets`(user_id, timestamp, text) WHERE (`is_retweet` = "true");
SELECT u.id,
u.followers_num,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'month') AS month,
DATE_PART_STR(MILLIS_TO_STR(TONUMBER(t.timestamp)), 'day') AS day,
t.text
FROM (SELECT META().id, ARRAY_LENGTH(followers) AS followers_num
FROM users
WHERE ARRAY_LENGTH(followers) > 100
ORDER BY ARRAY_LENGTH(followers) DESC) AS u
JOIN tweets AS t ON u.id = t.user_id
WHERE t.is_retweet = "true"
LIMIT 20;
Subquery uses covering index with index order (avoids sort) and produces the the highest followers in reverse order. Then it JOINS the tweet produces 20 tweets.
NOTE: This produces 20 retweets. JOIN can produce 1 to many. It can't be top 20 followers. If you need that, based on your needs you might need GROUP BY or use NEST (Example 17 https://blog.couchbase.com/ansi-join-support-n1ql/). In CB 7.0 you can use generic correlated subquery.
Upvotes: 1
Reputation: 26169
I tried the index advisor in my local instance of Couchbase Server 7, with the indexes you already created. It recommended another index:
CREATE INDEX adv_is_retweet_user_id ON `tweets`(`is_retweet`,`user_id`)
So, you might want to give that a try. (I don't have your dataset, so I don't know if it will help, or how much it will help).
But, if you plan to create more complex queries on this same dataset, especially adhoc queries, you may want to consider using the Analytics service. It supports N1QL and indexing as well, but it will usually give you pretty good performance without having to create indexes up front.
Upvotes: 0