Reputation: 744
We are ingesting data into Snowflake via the kafka connector. To increase the data read performance / scan less partitions we decided to add a clustering key to a a key / combination of keys stored in the RECORD_CONTENT variant field.
The data in the RECORD_CONTENT field looks like this:
{
"jsonSrc": {
"Integerfield": 1,
"SourceDateTime": "2020-06-30 05:33:08:345",
*REST_OF_THE_KEY_VALUE_PAIRS*
}
Now, the issue is that clustering on a datetime col like SourceDateTime does NOT work:
CLUSTER BY (to_date(RECORD_CONTENT:jsonSrc:loadDts::datetime))
...while clustering on a field like Integerfield DOES work:
CLUSTER BY (RECORD_CONTENT:jsonSrc:Integerfield::int )
Not working means: when using a filter on RECORD_CONTENT:jsonSrc:loadDts::datetime, it has no effect on the partitions scanned, while filtering on RECORD_CONTENT:jsonSrc:Integerfield::int does perform partition pruning.
What is wrong here? Is this a bug?
Note that:
Upvotes: 3
Views: 1282
Reputation: 11
Long story short, yes, not like number/varchar sub field in variant column that can be successfully clustered/pruned, at the moment date/timestamp sub field in variant column can 'not' be clustered pruned in snowflake (this limit is only for sub-field of variant column but not for native column data type of date/timestamp etc.). Therefore we will have to tweak it a bit(convert to different format) so we can still successfully cluster prune it.
Please find below for a 3 mins LAB to prove this point,
--create testing table to evalute prune/cluster feature for all sub-field data types in variant column
CREATE OR REPLACE TABLE GET_PROPOSALS as
select
row_number() over (order by seq8()) as pk,
mod(pk,12876381) + 1 as city_id, --NDV 12876381
chr(65 + uniform(0, 15, random())) || chr(65 + uniform(0, 15, random())) as message, --NDV 256
cast (dateadd(day, mod(pk, 1095), '2020-01-01' ) as date) as transactionDate, --NDV 1095 DAYS, i.e. 3 years
cast (dateadd(second, mod(pk, 3600), '2020-01-01 00:00:00' ) as timestamp) as message_timestamp,
cast (object_construct('city_id', city_id,
'message', message,
'transactionDate', transactionDate,
'transactionDate2', to_char(transactionDate, 'yyyy-mm-dd'),
'transactionDate3', to_char(transactionDate, 'yymmdd'),
'message_timestamp', message_timestamp,
'message_timestamp2', to_char(message_timestamp, 'yyyy-mm-dd-hh24:mi:ss'),
'message_timestamp3', DATE_PART(EPOCH_SECOND, message_timestamp)
)
as variant) as data,
RANDSTR(120 ,RANDOM()) as any_other_columns --fill in necessary space to simulate real life customer environment
from table(generator(rowcount => 12960000))
order by random(); --data is randomly distributed, i.e. 100% not clustered on any columns whatsoever
desc table GET_PROPOSALS;
/*
--kindly note: your testing data might be different since it's randmoly generated/selected but gist is the same here
{
"city_id": 586422,
"message": "NM",
"message_timestamp": "2020-01-01 00:53:41.000",
"message_timestamp2": "2020-01-01-00:53:41",
"message_timestamp3": 1577840021,
"transactionDate": "2021-08-19",
"transactionDate2": "2021-08-19",
"transactionDate3": "210819"
}
*/
select *
from GET_PROPOSALS
limit 1;
--simulate number sub-field data pruning
create or replace table GET_PROPOSALS_CITY_ID as
select *
from GET_PROPOSALS
order by data:city_id;
select system$clustering_information('GET_PROPOSALS_CITY_ID', '(data:city_id::number)');
alter session set use_cached_result=false;
--only need to hit 1 partition since number sub-field can be properly clustered
select *
from GET_PROPOSALS_CITY_ID
where data:city_id=586422;
--simulate char sub-field data pruning
create or replace table GET_PROPOSALS_MESSAGE as
select *
from GET_PROPOSALS
order by data:message;
select system$clustering_information('GET_PROPOSALS_MESSAGE', '(data:message::varchar)');
--only need to hit 2 partition since char sub-field can be properly clustered
select *
from GET_PROPOSALS_MESSAGE
where data:message='NM';
--simulate date sub-field data pruning
create or replace table GET_PROPOSALS_TRANSACTIONDATE as
select *
from GET_PROPOSALS
order by data:transactionDate;
--needs to hit all 256 partitions. i.e. full table scan and no pruning if we cluster on date type sub-field directly
select *
from GET_PROPOSALS_TRANSACTIONDATE
where data:transactionDate='2021-08-19';
--simulate date sub-field data pruning, try to use workaround as format it to yyyy-mm-dd char type
create or replace table GET_PROPOSALS_TRANSACTIONDATE2 as
select *
from GET_PROPOSALS
order by data:transactionDate2;
--only need to hit 1 partition since char sub-field can be properly clustered
select *
from GET_PROPOSALS_TRANSACTIONDATE2
where data:transactionDate2='2021-08-19';
--simulate date sub-field data pruning, try to use workaround as format it to yymmdd char type
create or replace table GET_PROPOSALS_TRANSACTIONDATE3 as
select *
from GET_PROPOSALS
order by data:transactionDate3;
--only need to hit 1 partition since char sub-field can be properly clustered
select *
from GET_PROPOSALS_TRANSACTIONDATE3
where data:transactionDate3='210819';
--simulate timestamp sub-field data pruning
create or replace table GET_PROPOSALS_MESSAGETIMESTAMP as
select *
from GET_PROPOSALS
order by data:message_timestamp;
--needs to hit all 256 partitions. i.e. full table scan and no pruning if we cluster on timestamp type sub-field directly
select *
from GET_PROPOSALS_MESSAGETIMESTAMP
where data:message_timestamp='2020-01-01 00:53:41.000';
--simulate timestamp sub-field data pruning, try to use workaround as format it to yyyy-mm-dd char type
create or replace table GET_PROPOSALS_MESSAGETIMESTAMP2 as
select *
from GET_PROPOSALS
order by data:message_timestamp2;
--only need to hit 1 partition since char sub-field can be properly clustered
select *
from GET_PROPOSALS_MESSAGETIMESTAMP2
where data:message_timestamp2='2020-01-01-00:53:41';
--simulate timestamp sub-field data pruning, try to use workaround as format it to epoch type
create or replace table GET_PROPOSALS_MESSAGETIMESTAMP3 as
select *
from GET_PROPOSALS
order by data:message_timestamp3::date;
--only need to hit 1 partition since epoch sub-field can be properly clustered
select *
from GET_PROPOSALS_MESSAGETIMESTAMP3
where data:message_timestamp3='2020-01-01 00:53:41'::timestamp;
kindly note: order by means 100% cluster on a certain key so I just use this trick to get 100% cluster table immediately instead of waiting for auto-cluster just for testing purpose.
Upvotes: 1
Reputation: 737
For better pruning and less storage consumption, we recommend flattening your object and key data into separate relational columns if your semi-structured data includes: Dates and timestamps, especially non-ISO 8601dates and timestamps, as string values
Numbers within strings
Arrays
Non-native values such as dates and timestamps are stored as strings when loaded into a VARIANT column, so operations on these values could be slower and also consume more space than when stored in a relational column with the corresponding data type.
Upvotes: 2