Reputation: 71
I have a table with one of the columns containing a raw JSON string as follows:
Sample JSON stored in order_lines:
{
"STR_BLK_002":{
"amount":167,
"type":"part spare",
"total_discount":0,
"color":"Black",
"is_out_of_stock":false,
"variable_fields":{
"Size":"XL",
"trueColor":"Black"
},
"category_id":"44356721",
"status_list":[
{
"id":1,
"time":"2021-04-01T15:01:54.746Z",
"status":"ORDER PLACED"
},
{
"id":2,
"time":"2021-04-02T10:31:00.397Z",
"status":"PACKED"
},
{
"id":3,
"time":"2021-04-04T10:31:01.719Z",
"status":"SHIPPED"
},
{
"id":3,
"time":"2021-04-04T18:12:06.896Z",
"status":"SHIPPED"
}
],
"product_id":270,
"price_per_quantity":167,
"quantity":1,
"maximum_quantity":10,
"variant_name":"Helmet strap",
"current_status":30,
"estimated_delivery":"09 Apr 2021",
"total_before_discount":167,
"delivery_statuses":[
{
"time":"2021-04-01T15:10:13.594Z",
"status":"FULFILLABLE"
},
{
"time":"2021-04-02T10:31:00.397Z",
"status":"PACKED"
},
{
"time":"2021-04-03T10:31:01.197Z",
"status":"READY_TO_SHIP"
},
{
"time":"2021-04-04T10:31:01.719Z",
"status":"DISPATCHED"
},
{
"time":"2021-04-04T18:12:06.896Z",
"status":"SHIPPED"
}
],
"sku_code":"STR_BLK_002"
}
}
I want to unnest this string so that the key value pairs can be accessed individually. Also the sku_code, ('STR_BLK_002' in the sample shared above) is not available in any other column and the string can contain more a single sku, so if there are 2 sku(s) corresponding to an order then the JSON string would be:
{
"STR_BLK_002":{
"amount":167,
"type":"part spare",
"total_discount":0,
"color":"Black",
"is_out_of_stock":false,
"variable_fields":{
"Size":"XL",
"trueColor":"Black"
},
"category_id":"44356721",
"status_list":[
{
"id":1,
"time":"2021-04-01T15:01:54.746Z",
"status":"ORDER PLACED"
},
{
"id":2,
"time":"2021-04-02T10:31:00.397Z",
"status":"PACKED"
},
{
"id":3,
"time":"2021-04-04T10:31:01.719Z",
"status":"SHIPPED"
},
{
"id":3,
"time":"2021-04-04T18:12:06.896Z",
"status":"SHIPPED"
}
],
"product_id":270,
"price_per_quantity":167,
"quantity":1,
"maximum_quantity":10,
"variant_name":"Helmet strap",
"current_status":3,
"estimated_delivery":"09 Apr 2021",
"total_before_discount":167,
"delivery_statuses":[
{
"time":"2021-04-01T15:10:13.594Z",
"status":"FULFILLABLE"
},
{
"time":"2021-04-02T10:31:00.397Z",
"status":"PACKED"
},
{
"time":"2021-04-03T10:31:01.197Z",
"status":"READY_TO_SHIP"
},
{
"time":"2021-04-04T10:31:01.719Z",
"status":"DISPATCHED"
},
{
"time":"2021-04-04T18:12:06.896Z",
"status":"SHIPPED"
}
],
"sku_code":"STR_BLK_002"
},
"STR_BLK_008":{
"amount":590,
"type":"accessory",
"total_discount":0,
"color":"blue",
"is_out_of_stock":false,
"variable_fields":{
"Size":"XL",
"trueColor":"prussian blue"
},
"category_id":"65577970",
"status_list":[
{
"id":1,
"time":"2021-04-06T15:01:54.746Z",
"status":"ORDER PLACED"
},
{
"id":2,
"time":"2021-04-07T10:31:00.397Z",
"status":"PACKED"
},
{
"id":3,
"time":"2021-04-07T10:31:01.719Z",
"status":"SHIPPED"
},
{
"id":3,
"time":"2021-04-08T18:12:06.896Z",
"status":"SHIPPED"
}
],
"product_id":276,
"price_per_quantity":590,
"quantity":1,
"maximum_quantity":5,
"variant_name":"Car Perfume",
"current_status":3,
"estimated_delivery":"09 Apr 2021",
"total_before_discount":590,
"delivery_statuses":[
{
"time":"2021-04-06T15:10:13.594Z",
"status":"FULFILLABLE"
},
{
"time":"2021-04-07T10:31:00.397Z",
"status":"PACKED"
},
{
"time":"2021-04-07T10:31:01.197Z",
"status":"READY_TO_SHIP"
},
{
"time":"2021-04-08T10:31:01.719Z",
"status":"DISPATCHED"
},
{
"time":"2021-04-10T18:12:06.896Z",
"status":"SHIPPED"
}
],
"sku_code":"STR_BLK_008"
}
}
I want to break this information into separate columns, so that I can fetch the corresponding values for each SKU.
Upvotes: 0
Views: 500
Reputation: 188
So basically what I think you want to do is first transform your column into a array of structs so that instead of having this:
{
"STR_BLK_002": {...},
"STR_BLK_003": {...}
}
You have something like this:
[
{
"amount":167,
"type":"part spare",
"total_discount":0,
...
},
{
"amount":590,
"type":"accessory",
"total_discount":0,
...
}
]
With the data in that format you can leverage UNNEST to make each entry into its own row, and then use JSON functions to pull out fields into their own columns, for example JSON_EXTRACT_SCALAR
In order to do this, I built a Javascript UDF that find the keys in the object and then iterates through each key to create an array of structs.
CREATE TEMP FUNCTION format_json(str STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS r"""
var obj = JSON.parse(str);
var keys = Object.keys(obj);
var arr = [];
for (i = 0; i < keys.length; i++) {
arr.push(JSON.stringify(obj[keys[i]]));
}
return arr;
""";
SELECT
JSON_EXTRACT_SCALAR(formatted_json,'$.amount') as amount
,JSON_EXTRACT_SCALAR(formatted_json,'$.type') as type
,JSON_EXTRACT_SCALAR(formatted_json,'$.total_discount') as total_discount
,JSON_EXTRACT_SCALAR(formatted_json,'$.color') as color
,JSON_EXTRACT_SCALAR(formatted_json,'$.is_out_of_stock') as is_out_of_stock
,JSON_EXTRACT_SCALAR(formatted_json,'$.sku_code') as sku_code
from
testing.json_test
left join unnest(format_json(order_lines)) as formatted_json
Which results in this:
Upvotes: 2
Reputation: 172944
Below should give you good start
select
json_extract_scalar(line, '$.sku_code') as sku_code,
json_extract_scalar(line, '$.amount') as amount,
json_extract_scalar(line, '$.type') as type,
json_extract_scalar(line, '$.total_discount') as total_discount,
json_extract_scalar(line, '$.color') as color,
json_extract_scalar(line, '$.variable_fields.Size') as Size,
json_extract_scalar(line, '$.variable_fields.trueColor') as trueColor,
from `project.dataset.table`,
unnest(split(regexp_replace(regexp_replace(order_lines, r'\s', ''), r'"STR_BLK_\d+":{', '"STR_BLK":{'),'"STR_BLK":')) order_line with offset,
unnest([struct('{' || trim(order_line, ',{}}') || '}' as line)])
where offset > 0
if applied to first example in your question - output is
if applied to second example in your question - output is
Hope, you can extend this example to whatever final goal you have in mind
Upvotes: 2