user12761950
user12761950

Reputation: 149

Parse nested XML with key value pairs in Snowflake

Need to parse nested XML in Snowflake. Example:


    WITH xml_table
    AS (
        SELECT 1 AS ID
            , PARSE_XML(
                '<root>
                    <docs>
                        <doc>
                            <Id>1</Id>
                            <Name>
                                <Kvp>
                                    <Key>k1</Key>
                                    <Value>v1</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k2</Key>
                                    <Value>v2</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k3</Key>
                                    <Value>v3</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>2</Id>
                            <Name>
                                <Kvp>
                                    <Key>k4</Key>
                                    <Value>v4</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k5</Key>
                                    <Value>v5</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>3</Id>
                            <Name>
                                <Kvp>
                                    <Key>k6</Key>
                                    <Value>v6</Value>
                                </Kvp>
                            </Name>
                        </doc>
                    </docs>
                </root>'
            ) AS XML_COL
        )
    SELECT docs.ID
        , docs.docId
        --, docs.docName
        , GET(XMLGET(kvps.VALUE, 'Key'), '$')::STRING AS docNameKey
        , GET(XMLGET(kvps.VALUE, 'Value'), '$')::STRING AS docNameValue
        --, kvps.*
    FROM --docs
        (
        SELECT xml_table.ID
            , xml_table.XML_COL
            , GET(xml_table.XML_COL, '@')::STRING AS ROOT_NODE_NAME
            , XMLGET(docs.VALUE, 'Id') : "$"::STRING AS docId
            , XMLGET(docs.VALUE, 'Name') AS docName
        FROM xml_table
            , LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
        WHERE 1 = 1
        ) docs
        , LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
    WHERE 1 = 1

So i need to parse it to table format:

    doc.Id AS docId
    doc.Name.Kvp.Key AS docNameKey
    doc.Name.Kvp.Value AS docNameValue

The given example seesm to work as expected when there is more than one key-value-pair under KVP(like the docId IN (1,2) in the example), but then when there is one key-value-pair then the result is not as expected. Why?

The query returns:

ID  DOCID   DOCNAMEKEY  DOCNAMEVALUE
1   1   k1  v1
1   1   k2  v2
1   1   k3  v3
1   2   k4  v4
1   2   k5  v5
1   3   null    null
1   3   null    null
1   3   null    null
1   3   null    null

As you see the docId=3 has been "blown" to 4 records and no key-value-pair is found. But it has one key value pair k6-v6.

So the expected output (what i would like to get) is:

ID  DOCID   DOCNAMEKEY  DOCNAMEVALUE
1   1   k1  v1
1   1   k2  v2
1   1   k3  v3
1   2   k4  v4
1   2   k5  v5
1   3   k6  v6

Why does the xml parsing depend on the number of sub-elements?

Upvotes: 0

Views: 47

Answers (1)

user12761950
user12761950

Reputation: 149

The way I solved it currently:

WITH xml_table
AS (
    SELECT 1 AS ID
        , PARSE_XML(
            '<root>
                    <docs>
                        <doc>
                            <Id>1</Id>
                            <Name>
                                <Kvp>
                                    <Key>k1</Key>
                                    <Value>v1</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k2</Key>
                                    <Value>v2</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k3</Key>
                                    <Value>v3</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>2</Id>
                            <Name>
                                <Kvp>
                                    <Key>k4</Key>
                                    <Value>v4</Value>
                                </Kvp>
                                <Kvp>
                                    <Key>k5</Key>
                                    <Value>v5</Value>
                                </Kvp>
                            </Name>
                        </doc>
                        <doc>
                            <Id>3</Id>
                            <Name>
                                <Kvp>
                                    <Key>k6</Key>
                                    <Value>v6</Value>
                                </Kvp>
                            </Name>
                        </doc>
                    </docs>
                </root>'
        ) AS XML_COL
    )
SELECT docs.ID
    , docs.docId
    , CASE 
        WHEN kvps.KEY = '$' --For some reason Snowflake works differently if there is 1 sub element than when multiple
            THEN CAST(GET(XMLGET(kvps.THIS, 'Key'), '$') AS STRING)
        ELSE CAST(GET(XMLGET(kvps.VALUE, 'Key'), '$') AS STRING)
        END AS docNameKey
    , CASE 
        WHEN kvps.KEY = '$' --For some reason Snowflake works differently if there is 1 sub element than when multiple
            THEN CAST(GET(XMLGET(kvps.THIS, 'Value'), '$') AS STRING)
        ELSE CAST(GET(XMLGET(kvps.VALUE, 'Value'), '$') AS STRING)
        END AS docNameValue
FROM --docs
    (
    SELECT xml_table.ID
        , xml_table.XML_COL
        , CAST(GET(xml_table.XML_COL, '@') AS STRING) AS ROOT_NODE_NAME
        , CAST(GET(XMLGET(docs.VALUE, 'Id'), '$') AS STRING) AS docId
        , XMLGET(docs.VALUE, 'Name') AS docName
    FROM xml_table
        , LATERAL FLATTEN(GET(XMLGET(xml_table.XML_COL, 'docs'), '$')) AS docs
    WHERE 1 = 1
    ) docs
    , LATERAL FLATTEN(GET(docs.docName, '$')) AS kvps
WHERE 1 = 1
    AND (
        kvps.KEY IS NULL
        OR kvps.KEY = '$'
        ) --For some reason Snowflake works differently if there is 1 sub element than when multiple

So added the WHERE clause condition to exclude the "extra" lines Snowflake adds when there is only one sub set of value pairs. And then in SELECT clause extra CASE statements to pull the values either from THIS or VALUE attributes depending on is there one or many value pairs.

Upvotes: 0

Related Questions