Reputation: 407
I need help to unnest a field that has multiple arrays and same field repeated in nested objects (quantity is repeated outside and inside nested object).
There are 2 fields in the dataset: order_id and cart, where cart is a dictionary object with multiple lists including lists within a list "items" in it, but the datatype of a cart is string. I would like the output to be individual row for each product and category.
Sample data with the partially working query.
#standardSQL
WITH t AS (
SELECT "order1234" as order_id, '{ "_id" : "cart1234" , "taxRate" : 0.0 , "items" : [{ "quantity" : 1 , "product" : { "_id" : "prod1" , "categoryIds" : [ "cat1", "cat2", "cat3"] , "name" : "Product 1" , "pricing" : { "listPrice" : { "value" : 899} , "salePrice" : { "value" : 725}}, "imagedata" : { "imageLink" : { "_id" : "img1" , "createdOn" : { "$date" : "2019-01-19T19:55:19.782Z"} , "revision" : 1} , "title" : "" , "description" : "" , "altText" : ""} , "variants" : [ ] , "productVariants" : [ { "_id" : "var1" , "sku" : { "value" : "sku1" , "modifier" : 0} , "variants" : [ ] , "quantity" : 0 , "imageLinkIds" : [ ] , "skuImageLinkIds" : [ ] , "fulfillmentData" : { "sourceName" : null , "sourceId" : null , "sourceSku" : null , "sourceMethod" : null , "sourceRedirectUrl" : null , "sourceRedirectAppKey" : null }}] , "Shipping" : true}} , { "quantity" : 2 , "product" : { "_id" : "prod2" , "categoryIds" : [ "cat2", "cat4"] , "name" : "Product 2" , "pricing" : { "listPrice" : { "value" : 199} , "salePrice" : { "value" : 150}}, "imagedata" : { "imageLink" : { "_id" : "img2" , "createdOn" : { "$date" : "2019-01-19T19:58:11.484Z"} , "revision" : 1} , "title" : "" , "description" : "" , "altText" : ""} , "variants" : [ ] , "productVariants" : [ { "_id" : "var2" , "sku" : { "value" : "sku2" , "modifier" : 0} , "variants" : [ ] , "quantity" : 0 , "imageLinkIds" : [ ] , "skuImageLinkIds" : [ ] , "fulfillmentData" : { "sourceName" : null , "sourceId" : null , "sourceSku" : null , "sourceMethod" : null , "sourceRedirectUrl" : null , "sourceRedirectAppKey" : null }}] , "Shipping" : true}} , { "quantity" : 3 , "product" : { "_id" : "prod3" , "categoryIds" : [ "cat2","cat4","cat5"] , "name" : "Product 3" , "pricing" : { "listPrice" : { "value" : 499} , "salePrice" : { "value" : 325}}, "imagedata" : { "imageLink" : { "_id" : "img3" , "createdOn" : { "$date" : "2019-01-15T05:34:17.556Z"} , "revision" : 3} , "title" : "" , "description" : "" , "altText" : ""} , "variants" : [ ] , "productVariants" : [ { "_id" : "var3" , "sku" : { "value" : "sku3" , "modifier" : 0} , "variants" : [ ] , "quantity" : 0 , "imageLinkIds" : [ ] , "skuImageLinkIds" : [ ] , "fulfillmentData" : { "sourceName" : null , "sourceId" : null , "sourceSku" : null , "sourceMethod" : null , "sourceRedirectUrl" : null , "sourceRedirectAppKey" : null }}], "Shipping" : true }}]}' as cart
)
select order_id, quantity, product, JSON_EXTRACT_SCALAR(product,'$._id') as product_id, REPLACE(category_id, '"', '') category_id,
JSON_EXTRACT_SCALAR(product,'$.pricing.listPrice.value') as product_list_price,
JSON_EXTRACT_SCALAR(product,'$.pricing.salePrice.value') as product_sale_price
from t,
UNNEST(REGEXP_EXTRACT_ALL(cart, r'"categoryIds" : \[(.+?)]')) categoryIds WITH OFFSET pos1,
UNNEST(SPLIT(categoryIds)) category_id,
UNNEST(REGEXP_EXTRACT_ALL(cart, r'"product" : (.*?)\}')) product WITH OFFSET pos2,
UNNEST(REGEXP_EXTRACT_ALL(cart, r'"quantity" : (.+?)')) quantity WITH OFFSET pos3
where pos1= pos2 and pos1 = pos3
In the above query, quantity field is incorrect and product_list_price ad product_sale_price are not showing up. Keep in mind quantity in repeated in nested elements. I am thinking my regex is wrong and somehow I need to pick the first "quantity" within each "items", and for price my regex for product doesnt give me full product dictionary thats why they are returned as null. What is the correct Regex for getting the complete value for the product key knowing there could be several { } inside the product key?
Expected result
order_id quantity product_id category_id product_list_price product_sale_price
order1234 1 prod1 cat1 899 799
order1234 1 prod1 cat2 899 799
order1234 1 prod1 cat3 899 799
order1234 2 prod2 cat2 199 150
order1234 2 prod2 cat4 199 150
order1234 3 prod3 cat2 499 399
order1234 3 prod3 cat4 499 399
order1234 3 prod3 cat5 499 399
Upvotes: 0
Views: 1026
Reputation: 172994
What is the correct Regex for getting the complete value for the product key knowing there could be several { } inside the product key?
Ideally, there should be JSON_EXTRACT used (and not REGEXP_EXTRACT - which overcomplicates things). But unfortunately BigQuery's JSON_EXTRACT has some limitation that does not allow processing JSON arrays
To overcome BigQuery "limitation" for JsonPath, you can use custom function as below example shows:
It uses jsonpath-0.8.0.js that can be downloaded from https://code.google.com/archive/p/jsonpath/downloads and uploaded to Google Cloud Storage - gs://your_bucket/jsonpath-0.8.0.js
#standardSQL
CREATE TEMPORARY FUNCTION CUSTOM_JSON_EXTRACT(json STRING, json_path STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
return jsonPath(JSON.parse(json), json_path);
"""
OPTIONS (
library="gs://your_bucket/jsonpath-0.8.0.js"
);
SELECT order_id, quantity, product_id, category_id
FROM `project.dataset.table`,
UNNEST(CUSTOM_JSON_EXTRACT(cart, '$.items[*].quantity')) quantity WITH OFFSET pos1,
UNNEST(CUSTOM_JSON_EXTRACT(cart, '$.items[*].product._id')) product_id WITH OFFSET pos2,
UNNEST(CUSTOM_JSON_EXTRACT(cart, '$.items[*].product.categoryIds')) category_ids WITH OFFSET pos3,
UNNEST(SPLIT(category_ids)) category_id
WHERE pos1 = pos2 AND pos1 = pos3
You can test, play with above using sample data you provided:
#standardSQL
CREATE TEMPORARY FUNCTION CUSTOM_JSON_EXTRACT(json STRING, json_path STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
return jsonPath(JSON.parse(json), json_path);
"""
OPTIONS (
library="gs://your_bucket/jsonpath-0.8.0.js"
);
WITH t AS (
SELECT "order1234" AS order_id, '''{ "_id" : "cart1234" , "taxRate" : 0.0 , "items" : [
{ "quantity" : 1 , "product" : { "_id" : "prod1" , "categoryIds" : [ "cat1", "cat2", "cat3"] , "name" : "Product 1" , "imagedata" : { "imageLink" : { "_id" : "img1" , "createdOn" : { "$date" : "2019-01-19T19:55:19.782Z"} , "revision" : 1} , "title" : "" , "description" : "" , "altText" : ""} , "variants" : [ ] , "productVariants" : [ { "_id" : "var1" , "sku" : { "value" : "sku1" , "modifier" : 0} , "variants" : [ ] , "quantity" : 0 , "imageLinkIds" : [ ] , "skuImageLinkIds" : [ ] , "fulfillmentData" : { "sourceName" : null , "sourceId" : null , "sourceSku" : null , "sourceMethod" : null , "sourceRedirectUrl" : null , "sourceRedirectAppKey" : null }}] , "Shipping" : true}} ,
{ "quantity" : 2 , "product" : { "_id" : "prod2" , "categoryIds" : [ "cat2", "cat4"] , "name" : "Product 2" , "imagedata" : { "imageLink" : { "_id" : "img2" , "createdOn" : { "$date" : "2019-01-19T19:58:11.484Z"} , "revision" : 1} , "title" : "" , "description" : "" , "altText" : ""} , "variants" : [ ] , "productVariants" : [ { "_id" : "var2" , "sku" : { "value" : "sku2" , "modifier" : 0} , "variants" : [ ] , "quantity" : 0 , "imageLinkIds" : [ ] , "skuImageLinkIds" : [ ] , "fulfillmentData" : { "sourceName" : null , "sourceId" : null , "sourceSku" : null , "sourceMethod" : null , "sourceRedirectUrl" : null , "sourceRedirectAppKey" : null }}] , "Shipping" : true}} ,
{ "quantity" : 3 , "product" : { "_id" : "prod3" , "categoryIds" : [ "cat2","cat4","cat5"] , "name" : "Product 3" , "imagedata" : { "imageLink" : { "_id" : "img3" , "createdOn" : { "$date" : "2019-01-15T05:34:17.556Z"} , "revision" : 3} , "title" : "" , "description" : "" , "altText" : ""} , "variants" : [ ] , "productVariants" : [ { "_id" : "var3" , "sku" : { "value" : "sku3" , "modifier" : 0} , "variants" : [ ] , "quantity" : 0 , "imageLinkIds" : [ ] , "skuImageLinkIds" : [ ] , "fulfillmentData" : { "sourceName" : null , "sourceId" : null , "sourceSku" : null , "sourceMethod" : null , "sourceRedirectUrl" : null , "sourceRedirectAppKey" : null }}], "Shipping" : true }}
]}''' AS cart
)
SELECT order_id, quantity, product_id, category_id
FROM t,
UNNEST(CUSTOM_JSON_EXTRACT(cart, '$.items[*].quantity')) quantity WITH OFFSET pos1,
UNNEST(CUSTOM_JSON_EXTRACT(cart, '$.items[*].product._id')) product_id WITH OFFSET pos2,
UNNEST(CUSTOM_JSON_EXTRACT(cart, '$.items[*].product.categoryIds')) category_ids WITH OFFSET pos3,
UNNEST(SPLIT(category_ids)) category_id
WHERE pos1 = pos2 AND pos1 = pos3
with result
Row order_id quantity product_id category_id
1 order1234 1 prod1 cat1
2 order1234 1 prod1 cat2
3 order1234 1 prod1 cat3
4 order1234 2 prod2 cat2
5 order1234 2 prod2 cat4
6 order1234 3 prod3 cat2
7 order1234 3 prod3 cat4
8 order1234 3 prod3 cat5
Note: product_list_price
and product_sale_price
are not present in your sample daat, thus it is not in the above result. But now query is extremely clean and simple, so hope you will be able to easily add those
Upvotes: 1