Reputation: 149
Need to parse nested XML in Snowflake. Example:
WITH xml_table
AS (
SELECT 1 AS ID
, PARSE_XML(
'<root>
<docs>
<doc>
<Id>1</Id>
<Name>
<Kvp>
<Key>k1</Key>
<Value>v1</Value>
</Kvp>
<Kvp>
<Key>k2</Key>
<Value>v2</Value>
</Kvp>
<Kvp>
<Key>k3</Key>
<Value>v3</Value>
</Kvp>
</Name>
</doc>
<doc>
<Id>2</Id>
<Name>
<Kvp>
<Key>k4</Key>
<Value>v4</Value>
</Kvp>
<Kvp>
<Key>k5</Key>
<Value>v5</Value>
</Kvp>
</Name>
</doc>
<doc>
<Id>3</Id>
<Name>
<Kvp>
<Key>k6</Key>
<Value>v6</Value>
</Kvp>
</Name>
</doc>
</docs>
</root>'
) AS XML_COL
)
SELECT docs.ID
, docs.docId
--, docs.docName
, GET(XMLGET(kvps.VALUE, 'Key'), '$')::STRING AS docNameKey
, GET(XMLGET(kvps.VALUE, 'Value'), '$')::STRING AS docNameValue
--, kvps.*
FROM --docs
(
SELECT xml_table.ID
, xml_table.XML_COL
, GET(xml_table.XML_COL, '@')::STRING AS ROOT_NODE_NAME
, XMLGET(docs.VALUE, 'Id') : "$"::STRING AS docId
, XMLGET(docs.VALUE, 'Name') AS docName
FROM xml_table
, LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
WHERE 1 = 1
) docs
, LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
WHERE 1 = 1
So i need to parse it to table format:
doc.Id AS docId
doc.Name.Kvp.Key AS docNameKey
doc.Name.Kvp.Value AS docNameValue
The given example seesm to work as expected when there is more than one key-value-pair under KVP(like the docId IN (1,2) in the example), but then when there is one key-value-pair then the result is not as expected. Why?
The query returns:
ID DOCID DOCNAMEKEY DOCNAMEVALUE
1 1 k1 v1
1 1 k2 v2
1 1 k3 v3
1 2 k4 v4
1 2 k5 v5
1 3 null null
1 3 null null
1 3 null null
1 3 null null
As you see the docId=3 has been "blown" to 4 records and no key-value-pair is found. But it has one key value pair k6-v6.
So the expected output (what i would like to get) is:
ID DOCID DOCNAMEKEY DOCNAMEVALUE
1 1 k1 v1
1 1 k2 v2
1 1 k3 v3
1 2 k4 v4
1 2 k5 v5
1 3 k6 v6
Why does the xml parsing depend on the number of sub-elements?
Upvotes: 0
Views: 47
Reputation: 149
The way I solved it currently:
WITH xml_table
AS (
SELECT 1 AS ID
, PARSE_XML(
'<root>
<docs>
<doc>
<Id>1</Id>
<Name>
<Kvp>
<Key>k1</Key>
<Value>v1</Value>
</Kvp>
<Kvp>
<Key>k2</Key>
<Value>v2</Value>
</Kvp>
<Kvp>
<Key>k3</Key>
<Value>v3</Value>
</Kvp>
</Name>
</doc>
<doc>
<Id>2</Id>
<Name>
<Kvp>
<Key>k4</Key>
<Value>v4</Value>
</Kvp>
<Kvp>
<Key>k5</Key>
<Value>v5</Value>
</Kvp>
</Name>
</doc>
<doc>
<Id>3</Id>
<Name>
<Kvp>
<Key>k6</Key>
<Value>v6</Value>
</Kvp>
</Name>
</doc>
</docs>
</root>'
) AS XML_COL
)
SELECT docs.ID
, docs.docId
, CASE
WHEN kvps.KEY = '$' --For some reason Snowflake works differently if there is 1 sub element than when multiple
THEN CAST(GET(XMLGET(kvps.THIS, 'Key'), '$') AS STRING)
ELSE CAST(GET(XMLGET(kvps.VALUE, 'Key'), '$') AS STRING)
END AS docNameKey
, CASE
WHEN kvps.KEY = '$' --For some reason Snowflake works differently if there is 1 sub element than when multiple
THEN CAST(GET(XMLGET(kvps.THIS, 'Value'), '$') AS STRING)
ELSE CAST(GET(XMLGET(kvps.VALUE, 'Value'), '$') AS STRING)
END AS docNameValue
FROM --docs
(
SELECT xml_table.ID
, xml_table.XML_COL
, CAST(GET(xml_table.XML_COL, '@') AS STRING) AS ROOT_NODE_NAME
, CAST(GET(XMLGET(docs.VALUE, 'Id'), '$') AS STRING) AS docId
, XMLGET(docs.VALUE, 'Name') AS docName
FROM xml_table
, LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
WHERE 1 = 1
) docs
, LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
WHERE 1 = 1
AND (
kvps.KEY IS NULL
OR kvps.KEY = '$'
) --For some reason Snowflake works differently if there is 1 sub element than when multiple
So added the WHERE clause condition to exclude the "extra" lines Snowflake adds when there is only one sub set of value pairs. And then in SELECT clause extra CASE statements to pull the values either from THIS or VALUE attributes depending on is there one or many value pairs.
Upvotes: 0