Reputation: 34356
I have a JSON structure in a field that looks like this. I'm trying to extract every task in every category, there could be any number of tasks or categories.
I've got part of the way there by extracting a single category, but can't seem to do it for every task in every category.
"tasks": {
"category-business": [
{
"dateCompleted": {
"_seconds": 1653672655,
"_nanoseconds": 791000000
},
"slug": "task-alpha",
"status": "completed"
},
{
"dateCompleted": {
"_seconds": 1654516259,
"_nanoseconds": 796000000
},
"slug": "task-bravo",
"status": "completed"
}
],"category-community": [
{
"dateCompleted": {
"_seconds": 1654709063,
"_nanoseconds": 474000000
},
"slug": "task-papa",
"status": "completed"
},
{
"dateCompleted": {
"_seconds": 1654709841,
"_nanoseconds": 764000000
},
"slug": "task-zebra",
"status": "completed"
}
]}
Here's the query so far
SELECT
*
FROM
(
SELECT
ARRAY(
SELECT
STRUCT(
TIMESTAMP_SECONDS(
CAST(
JSON_EXTRACT_SCALAR(business_tasks, '$.dateCompleted._seconds') AS INT64
)
) AS dateCompleted,
json_extract_scalar(business_tasks, '$.slug') AS task_slug,
json_extract_scalar(business_tasks, '$.status') AS status
)
FROM
UNNEST(
json_extract_array(DATA, '$.tasks.category-business')
) business_tasks
) AS items
FROM
`table`
)
This extracts just the information in the category business.
What I'm trying to do is expand category-community
and any other children underneath the tasks
key. The real data has at least 10 categories and 50 tasks.
I think I need to do another round of UNNEST and json_extract_array but I can't quite work out the correct order?
Upvotes: 0
Views: 246
Reputation: 391
new version:
select
timestamp_seconds(cast(regexp_extract_all(to_json_string(json_extract(DATA,'$.tasks')), r'"_seconds":(\d*)')[offset(a)] as int64)) dateCompleted
, regexp_extract_all(to_json_string(json_extract(DATA,'$.tasks')), r'"slug":"([a-z\-]*)"')[offset(a)] task_slug
, regexp_extract_all(to_json_string(json_extract(DATA,'$.tasks')), r'"status":"([a-z\-]*)"')[offset(a)] status
from table
join unnest(generate_array(0,-1+array_length(regexp_extract_all(to_json_string(json_extract(DATA,'$.tasks')), r'"slug":"([a-z\-]*)"')))) a
https://console.cloud.google.com/bigquery?sq=1013309549723:9f43bd653ba14589b31a1f5673adcda7
Upvotes: 1
Reputation: 391
DML only:
with category_level as (
select
coalesce(
json_query_array(DATA.tasks[a], '$.category-business')
, json_query_array(DATA.tasks[a], '$.category-community')
, json_query_array(DATA.tasks[a], '$.category-3')
, json_query_array(DATA.tasks[a], '$.category-4')
, json_query_array(DATA.tasks[a], '$.category-5')
, json_query_array(DATA.tasks[a], '$.category-6')
, json_query_array(DATA.tasks[a], '$.category-7')
, json_query_array(DATA.tasks[a], '$.category-8')
, json_query_array(DATA.tasks[a], '$.category-9')
, json_query_array(DATA.tasks[a], '$.category-10')
) category_array
from table
left join unnest(generate_array(0, 100)) a
where DATA.tasks[a] is not null
)
select
timestamp_seconds(cast(json_extract_scalar(b.dateCompleted._seconds) as int64)) dateCompleted
, json_extract_scalar(b.slug) slug
, json_extract_scalar(b.status) status
from category_level
left join unnest(category_array) b
https://console.cloud.google.com/bigquery?sq=1013309549723:fe8b75122e5b4b549e8081df99584c81
Upvotes: 1
Reputation: 172944
Consider below approach
create temp function get_keys(input string) returns array<string> language js as """
return Object.keys(JSON.parse(input));
""";
create temp function get_values(input string) returns array<string> language js as """
return Object.values(JSON.parse(input));
""";
create temp function get_leaves(input string) returns string language js as '''
function flattenObj(obj, parent = '', res = {}){
for(let key in obj){
let propName = parent ? parent + '.' + key : key;
if(typeof obj[key] == 'object'){
flattenObj(obj[key], propName, res);
} else {
res[propName] = obj[key];
}
}
return JSON.stringify(res);
}
return flattenObj(JSON.parse(input));
''';
create temp table temp_table as (
select
split(key, '.')[offset(0)] as category,
split(key, '.')[offset(1)] as offset,
split(key, '.')[offset(2)] || ifnull(split(key, '.')[safe_offset(3)], '') as key,
val, format('%t', t) row_id
from your_table t, unnest([struct(get_leaves(json_extract(data, '$.tasks')) as leaves)]),
unnest(get_keys(leaves)) key with offset
join unnest(get_values(leaves)) val with offset using(offset)
);
execute immediate (
select '''
select * except(row_id) from temp_table
pivot (any_value(val) for key in ("''' || keys || '"))'
from (
select string_agg(key, '","') keys
from (select distinct key from temp_table)
)
);
if applied to sample data in your question - output is
Upvotes: 2