Reputation: 43
We have a staging table called "portfolio" in Snowflake which has a single Variant column called "cdc_xml" that stores an XML document loaded by Snowpipe via S3.
The XML looks like:
<xyz>
<jmsTimestamp>1570068080385</jmsTimestamp>
<portfolio>
<id>1234</id>
<portfolioNumber>909</portfolioNumber>
<portfolioName>Hello World</portfolioName>
<master>
<attribute fieldName="active" value="1" oldValue="0"/>
<attribute fieldName="name" value="Hello Co" oldValue="Hello Company"/>
<attribute fieldName="startDate" value="04/02/1988" oldValue="04/01/1988"/>
</master>
<characteristics>
<characteristic fieldName="currency" value="JPY" oldValue="USD"/>
<characteristic fieldName="duplicate" value="YES" oldValue="NO"/>
<characteristic fieldName="clone" value="TRUE" oldValue="FALSE"/>
</characteristics>
</portfolio>
</xyz>
And the following is the Snowflake lateral flatten code to supposedly parse the XML to retrieve all the "@fieldName" and "@value" at <master><attribute>
level and all "@fieldName" and "@value" at <characteristics><characteristic>
level. All these data will be retrieved as name-value pairs.
-- flatten the characteristics nested structure to get all characteristic nvps
select 'XYZ' as source_name,
xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'id'):"$"::string as source_portfolio_id,
xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'portfolioNumber'):"$"::string as portfolio_number,
xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'portfolioName'):"$"::string as name,
get(flt1.value, '@fieldName')::string as field_name,
nvl(decode(get(flt1.value, '@value')::string, '', null, get(flt1.value, '@value')::string), '\b') as field_value -- deletion CDC if new value is null or empty
from staging.portfolio src1,
lateral flatten(xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'characteristics'):"$") flt1
union
-- flatten the master nested structure to get all attribute nvps
select 'XYZ' as source_name,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'id'):"$"::string as source_portfolio_id,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'portfolioNumber'):"$"::string as portfolio_number,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'portfolioName'):"$"::string as name,
get(flt2.value, '@fieldName')::string as field_name,
nvl(decode(get(flt2.value, '@value')::string, '', null, get(flt2.value, '@value')::string), '\b') as field_value -- deletion CDC if new value is null or empty
from staging.portfolio src2,
lateral flatten(xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'master'):"$") flt2
It works fine for the provided example above. But if the XML looks like below (with only 1 instance of the nested <master><attribute>
structure), that 1 instance of the <master><attribute>
is unable to be parsed and its "@fieldName" and "@value" are all NULL (instead of "startDate" and "11/02/1988").
Similarly, if the XML looks like the one at the bottom (with only 1 instance of the nested <characteristics><characteristic>
structure), that 1 instance of the <characteristics><characteristic>
is unable to be parsed and its "@fieldName" and "@value" are all NULL (instead of "clone" and "TRUE").
Any help is appreciated. Thanks in advance!
<xyz>
<jmsTimestamp>1570068080300</jmsTimestamp>
<portfolio>
<id>9876</id>
<portfolioNumber>808</portfolioNumber>
<portfolioName>Another Example</portfolioName>
<master>
<attribute fieldName="startDate" value="11/02/1988" oldValue="11/01/1988"/>
</master>
<characteristics>
<characteristic fieldName="currency" value="JPY" oldValue="USD"/>
<characteristic fieldName="duplicate" value="YES" oldValue="NO"/>
<characteristic fieldName="clone" value="TRUE" oldValue="FALSE"/>
</characteristics>
</portfolio>
</xyz>
<xyz>
<jmsTimestamp>1570068080300</jmsTimestamp>
<portfolio>
<id>9876</id>
<portfolioNumber>808</portfolioNumber>
<portfolioName>Another Example</portfolioName>
<master>
<attribute fieldName="active" value="0" oldValue="1"/>
<attribute fieldName="name" value="Example Inc" oldValue="Example LLC"/>
<attribute fieldName="startDate" value="11/02/1988" oldValue="11/01/1988"/>
</master>
<characteristics>
<characteristic fieldName="clone" value="TRUE" oldValue="FALSE"/>
</characteristics>
</portfolio>
</xyz>
Upvotes: 4
Views: 2479
Reputation: 1222
Very similar to the solution just provided by Simeon Pilgrim, you can unconditionally convert each list of elements to an array, to avoid having the FLATTEN try to "explode" the element into its component attributes (which is what you are experiencing). So, this would work as well:
select 'XYZ' as source_name,
xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'id'):"$"::string as source_portfolio_id,
xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'portfolioNumber'):"$"::string as portfolio_number,
xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'portfolioName'):"$"::string as name,
get(flt1.value, '@fieldName')::string as field_name,
nvl(decode(get(flt1.value, '@value')::string, '', null, get(flt1.value, '@value')::string), '\b') as field_value -- deletion CDC if new value is null or empty
from staging.portfolio src1,
lateral flatten(to_array(xmlget(xmlget(src1.cdc_xml, 'portfolio'), 'characteristics'):"$")) flt1
union
-- flatten the master nested structure to get all attribute nvps
select 'XYZ' as source_name,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'id'):"$"::string as source_portfolio_id,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'portfolioNumber'):"$"::string as portfolio_number,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'portfolioName'):"$"::string as name,
get(flt2.value, '@fieldName')::string as field_name,
nvl(decode(get(flt2.value, '@value')::string, '', null, get(flt2.value, '@value')::string), '\b') as field_value -- deletion CDC if new value is null or empty
from staging.portfolio src2,
lateral flatten(to_array(xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'master'):"$")) flt2```
Upvotes: 6
Reputation: 25968
We have the same problem is JavaScript with the library the parses XML to JSON, we had to pull the node Master
and then check if it's an array and if not cast it to array.
Lucky it seems snowflake has IS_ARRAY in it's semistructured functions
so if IS_ARRAY
and TO_ARRAY
work as hoped, then this should work:
select source_name,
source_portfolio_id,
portfolio_number,
name,
get(flt2.value, '@fieldName')::string as field_name,
nvl(decode(get(flt2.value, '@value')::string, '', null, get(flt2.value, '@value')::string), '\b') as field_value -- deletion CDC if new value is null or empty
from (
select 'XYZ' as source_name,
xmlget(portfolio, 'id'):"$"::string as source_portfolio_id,
xmlget(portfolio, 'portfolioNumber'):"$"::string as portfolio_number,
xmlget(portfolio, 'portfolioName'):"$"::string as name,
xmlget(xmlget(src2.cdc_xml, 'portfolio'), 'master'):"$" AS master_raw
IFF(IS_ARRAY(master_raw), master_raw, TO_ARRAY(master_raw)) as master
from (
select xmlget(src2.cdc_xml, 'portfolio') as portfolio
from staging.portfolio src2
)
),
lateral flatten(master) flt2
Upvotes: 2