carousallie
carousallie

Reputation: 865

How to Conditionally Modify XML?

I have an XML document that runs through an entity tagging model, which produces the following XML. I am attempting to take that output and reformat it into a more manageable, efficient structure using XSLT.

Essentially I would like to modify the field names based on the entity type and have all of the related info combined into one section or ENTINFO tag. That way, it's clear when I look at the data which SSN belongs to George Washington and which belongs to Thomas Jefferson. Is this even possible in XSLT, or would I be better off writing a script?

Original XML

<?xml version="1.0" encoding="UTF-8"?>
<NORMDOC>
   <DOC>
      <DOCID>112233</DOCID>
      <FI fitype="B" xref="54321">
         <FIName>FOUNDING FATHERS CREDIT UNION</FIName>
         <FITIN>1212</FITIN>
      </FI>
      <OIs>
         <OI xref="654321">
            <OIName>FOUNDING FATHERS CREDIT UNION</OIName>
         </OI>
      </OIs>
      <Subjects>
         <Subject stype="PER" xref="98765">
            <SubjectFullName type="L">Washington/George</SubjectFullName>
            <SubjectLastName type="L">Washington</SubjectLastName>
            <SubjectFirstName type="L">George</SubjectFirstName>
            <SubjectID type="SSN/ITIN">111111111</SubjectID>
            <SubjectPhone type="Residence">1112223333</SubjectPhone>
         </Subject>
         <Subject stype="PER" xref="98876">
            <SubjectFullName type="L">Jefferson/Thomas</SubjectFullName>
            <SubjectLastName type="L">Jefferson</SubjectLastName>
            <SubjectFirstName type="L">Thomas</SubjectFirstName>
            <SubjectID type="SSN/ITIN">222222222</SubjectID>
         </Subject>
      </Subjects>
      <TXT>
        <S sid="112233-SENT-001"><ENAMEX type="PERSON" id="PER-112233-001">George Washington</ENAMEX> and <ENAMEX type="PERSON" id="PER-112233-002">Thomas Jefferson</ENAMEX> were both founding fathers.</S>
        <S sid="112233-SENT-002"><ENAMEX type="PERSON" id="PER-112233-002">Thomas Jefferson</ENAMEX> has a social security number of <IDEX type="SSN" id="SSN-112233-075">222-22-2222</IDEX>.</S>
        <S sid="112233-SENT-003"><ENAMEX type="PERSON" id="PER-112233-001">George Washington</ENAMEX> has social security number <IDEX type="SSN" id="SSN-112233-074">111-11-1111</IDEX>.</S>
        <S sid="112233-SENT-004"><ENAMEX type="PERSON" id="PER-112233-001">George Washington</ENAMEX> can be reached at <IDEX type="PHONE" id="PHO-112233-100">111-222-3333</IDEX>.</S>
      </TXT>
   </DOC>
   <ENTINFO ID="PHO-112233-100"
            TYPE="PHONE"
            NORM="(111) 222-3333"
            REFID="PHO-112233-100"
            MENTION="111-222-3333"/>
   <ENTINFO ID="SSN-112233-075"
            TYPE="SSN"
            NORM="222222222"
            REFID="SSN-112233-075"
            MENTION="social security number of 222-22-2222"
            SSNTYPE="SSN"/>
   <ENTINFO ID="SSN-112233-074"
            TYPE="SSN"
            NORM="111111111"
            REFID="SSN-112233-074"
            MENTION="social security number of 111-11-1111"
            SSNTYPE="SSN"/>
   <ENTINFO ID="PER-112233-001"
            TYPE="PERSON"
            NORM="Washington, George"
            REFID="PER-112233-001"
            MENTION="George Washington"
            GIVEN="George"
            MIDDLE=""
            SURNAME="Washington"/>
   <ENTINFO ID="PER-112233-002"
            TYPE="PERSON"
            NORM="Jefferson, Thomas"
            REFID="PER-112233-002"
            MENTION="Thomas Jefferson"
            GIVEN="Thomas"
            MIDDLE=""
            SURNAME="Jefferson"/>
   <RELINFO ID="REL-112233-355"
            RELTYPE="PER-IDENT"
            PERID="PER-112233-002"
            IDENTID="SSN-112233-075"
            SENTID="112233-SENT-002"/>
   <RELINFO ID="REL-112233-356"
            RELTYPE="PER-IDENT"
            PERID="PER-112233-001"
            IDENTID="SSN-112233-074"
            SENTID="112233-SENT-003"/>
   <RELINFO ID="REL-112233-357"
            RELTYPE="PER-IDENT"
            PERID="PER-112233-001"
            IDENTID="SSN-112233-100"
            SENTID="112233-SENT-004"/>
</NORMDOC>

XSLT

<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                version="2.0">
    <!-- Reshapes the output of the NLP tool into the layout used for storage. -->
    <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!-- Document node: wrap the entire result in a single NORMDOC root element. -->
    <xsl:template match="/">
        <NORMDOC>
            <xsl:apply-templates select="node()"/>
        </NORMDOC>
    </xsl:template>

    <!-- Fallback for every node without a more specific template: copy the node
         and recurse into its child nodes. Attributes are never selected, so they
         do not appear on the copied elements. -->
    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Source root element: emit the DOC subtree first, then gather every
         ENTINFO record under one ENTITIES wrapper. Other children of NORMDOC
         (such as RELINFO) are not selected and therefore produce no output. -->
    <xsl:template match="/NORMDOC">
        <xsl:apply-templates select="DOC"/>
        <ENTITIES>
            <xsl:apply-templates select="ENTINFO"/>
        </ENTITIES>
    </xsl:template>

    <!-- Each Subject collapses to a single Subject_xref element that carries
         only the value of its xref attribute; the child elements are discarded. -->
    <xsl:template match="Subject">
        <Subject_xref>
            <xsl:value-of select="@xref"/>
        </Subject_xref>
    </xsl:template>

    <!-- Narrative text is written twice: RAW_TXT holds the plain string value
         with all markup removed, followed by a TXT copy whose children are
         processed by the templates below. -->
    <xsl:template match="TXT">
        <RAW_TXT>
            <xsl:value-of select="string(.)"/>
        </RAW_TXT>
        <xsl:copy>
            <xsl:apply-templates select="node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Sentence: keep the S element but process only its element children
         (the tagged entities) in "extra" mode; untagged text is not selected. -->
    <xsl:template match="S">
        <xsl:copy>
            <xsl:apply-templates select="*" mode="extra"/>
        </xsl:copy>
    </xsl:template>

    <!-- Tagged entity inside a sentence: rename the element to
         <original name>_<type attribute>, e.g. ENAMEX_PERSON, and process its
         children in the default mode. -->
    <xsl:template match="*" mode="extra">
        <xsl:element name="{concat(name(), '_', @type)}">
            <xsl:apply-templates select="node()"/>
        </xsl:element>
    </xsl:template>

    <!-- Entity record: turn every attribute into a child element named
         ENTINFO_<attribute name>, replacing any hyphen in the attribute name
         with an underscore. -->
    <xsl:template match="ENTINFO">
        <xsl:copy>
            <xsl:for-each select="@*">
                <xsl:element name="ENTINFO_{translate(name(), '-', '_')}">
                    <xsl:value-of select="." />
                </xsl:element>
            </xsl:for-each>
        </xsl:copy>
    </xsl:template>

</xsl:stylesheet>

Actual Output

<NORMDOC>
   <DOC>
      <DOCID>112233</DOCID>
      <FI>
         <FIName>FOUNDING FATHERS CREDIT UNION</FIName>
         <FITIN>1212</FITIN>
      </FI>
      <OIs>
         <OI>
            <OIName>FOUNDING FATHERS CREDIT UNION</OIName>
         </OI>
      </OIs>
      <Subjects>
         <Subject_xref>98765</Subject_xref>
         <Subject_xref>98876</Subject_xref>
      </Subjects>
      <RAW_TXT>George Washington and Thomas Jefferson were both founding fathers.Thomas Jefferson has a social security number of 222-22-2222.George Washington has social security number 111-11-1111.George Washington can be reached at 111-222-3333.</RAW_TXT>
      <TXT>
         <S>
            <ENAMEX_PERSON>George Washington</ENAMEX_PERSON>
            <ENAMEX_PERSON>Thomas Jefferson</ENAMEX_PERSON>
         </S>
         <S>
            <ENAMEX_PERSON>Thomas Jefferson</ENAMEX_PERSON>
            <IDEX_SSN>222-22-2222</IDEX_SSN>
         </S>
         <S>
            <ENAMEX_PERSON>George Washington</ENAMEX_PERSON>
            <IDEX_SSN>111-11-1111</IDEX_SSN>
         </S>
         <S>
            <ENAMEX_PERSON>George Washington</ENAMEX_PERSON>
            <IDEX_PHONE>111-222-3333</IDEX_PHONE>
         </S>
      </TXT>
   </DOC>
   <ENTITIES>
      <ENTINFO>
         <ENTINFO_ID>PHO-112233-100</ENTINFO_ID>
         <ENTINFO_TYPE>PHONE</ENTINFO_TYPE>
         <ENTINFO_NORM>(111) 222-3333</ENTINFO_NORM>
         <ENTINFO_REFID>PHO-112233-100</ENTINFO_REFID>
         <ENTINFO_MENTION>111-222-3333</ENTINFO_MENTION>
      </ENTINFO>
      <ENTINFO>
         <ENTINFO_ID>SSN-112233-075</ENTINFO_ID>
         <ENTINFO_TYPE>SSN</ENTINFO_TYPE>
         <ENTINFO_NORM>222222222</ENTINFO_NORM>
         <ENTINFO_REFID>SSN-112233-075</ENTINFO_REFID>
         <ENTINFO_MENTION>social security number of 222-22-2222</ENTINFO_MENTION>
         <ENTINFO_SSNTYPE>SSN</ENTINFO_SSNTYPE>
      </ENTINFO>
      <ENTINFO>
         <ENTINFO_ID>SSN-112233-074</ENTINFO_ID>
         <ENTINFO_TYPE>SSN</ENTINFO_TYPE>
         <ENTINFO_NORM>111111111</ENTINFO_NORM>
         <ENTINFO_REFID>SSN-112233-074</ENTINFO_REFID>
         <ENTINFO_MENTION>social security number of 111-11-1111</ENTINFO_MENTION>
         <ENTINFO_SSNTYPE>SSN</ENTINFO_SSNTYPE>
      </ENTINFO>
      <ENTINFO>
         <ENTINFO_ID>PER-112233-001</ENTINFO_ID>
         <ENTINFO_TYPE>PERSON</ENTINFO_TYPE>
         <ENTINFO_NORM>Washington, George</ENTINFO_NORM>
         <ENTINFO_REFID>PER-112233-001</ENTINFO_REFID>
         <ENTINFO_MENTION>George Washington</ENTINFO_MENTION>
         <ENTINFO_GIVEN>George</ENTINFO_GIVEN>
         <ENTINFO_MIDDLE/>
         <ENTINFO_SURNAME>Washington</ENTINFO_SURNAME>
      </ENTINFO>
      <ENTINFO>
         <ENTINFO_ID>PER-112233-002</ENTINFO_ID>
         <ENTINFO_TYPE>PERSON</ENTINFO_TYPE>
         <ENTINFO_NORM>Jefferson, Thomas</ENTINFO_NORM>
         <ENTINFO_REFID>PER-112233-002</ENTINFO_REFID>
         <ENTINFO_MENTION>Thomas Jefferson</ENTINFO_MENTION>
         <ENTINFO_GIVEN>Thomas</ENTINFO_GIVEN>
         <ENTINFO_MIDDLE/>
         <ENTINFO_SURNAME>Jefferson</ENTINFO_SURNAME>
      </ENTINFO>
   </ENTITIES>
</NORMDOC>

Intended Output

<NORMDOC>
   <DOC>
      <DOCID>112233</DOCID>
      <FI>
         <FIName>FOUNDING FATHERS CREDIT UNION</FIName>
         <FITIN>1212</FITIN>
      </FI>
      <OIs>
         <OI>
            <OIName>FOUNDING FATHERS CREDIT UNION</OIName>
         </OI>
      </OIs>
      <Subjects>
         <Subject_xref>98765</Subject_xref>
         <Subject_xref>98876</Subject_xref>
      </Subjects>
      <RAW_TXT>George Washington and Thomas Jefferson were both founding fathers.Thomas Jefferson has a social security number of 222-22-2222.George Washington has social security number 111-11-1111.George Washington can be reached at 111-222-3333.</RAW_TXT>
      <TXT>
         <S>
            <ENAMEX_PERSON>George Washington</ENAMEX_PERSON>
            <ENAMEX_PERSON>Thomas Jefferson</ENAMEX_PERSON>
         </S>
         <S>
            <ENAMEX_PERSON>Thomas Jefferson</ENAMEX_PERSON>
            <IDEX_SSN>222-22-2222</IDEX_SSN>
         </S>
         <S>
            <ENAMEX_PERSON>George Washington</ENAMEX_PERSON>
            <IDEX_SSN>111-11-1111</IDEX_SSN>
         </S>
         <S>
            <ENAMEX_PERSON>George Washington</ENAMEX_PERSON>
            <IDEX_PHONE>111-222-3333</IDEX_PHONE>
         </S>
      </TXT>
   </DOC>
   <ENTITIES>
     <ENTINFO>
       <ENTINFO_PERSON_NORM>Washington, George</ENTINFO_PERSON_NORM>
       <ENTINFO_PERSON_MENTION>George Washington</ENTINFO_PERSON_MENTION>
       <ENTINFO_PERSON_GIVEN>George</ENTINFO_PERSON_GIVEN>
       <ENTINFO_PERSON_MIDDLE/>
       <ENTINFO_PERSON_SURNAME>Washington</ENTINFO_PERSON_SURNAME>
       <ENTINFO_SSN_NORM>111111111</ENTINFO_SSN_NORM>
       <ENTINFO_SSN_MENTION>social security number of 111-11-1111</ENTINFO_SSN_MENTION>
       <ENTINFO_PHONE_NORM>(111) 222-3333</ENTINFO_PHONE_NORM>
       <ENTINFO_PHONE_MENTION>111-222-3333</ENTINFO_PHONE_MENTION>
     </ENTINFO>
     <ENTINFO>
       <ENTINFO_PERSON_NORM>Jefferson, Thomas</ENTINFO_PERSON_NORM>
       <ENTINFO_PERSON_MENTION>Thomas Jefferson</ENTINFO_PERSON_MENTION>
       <ENTINFO_PERSON_GIVEN>Thomas</ENTINFO_PERSON_GIVEN>
       <ENTINFO_PERSON_MIDDLE/>
       <ENTINFO_PERSON_SURNAME>Jefferson</ENTINFO_PERSON_SURNAME>
       <ENTINFO_SSN_NORM>222222222</ENTINFO_SSN_NORM>
       <ENTINFO_SSN_MENTION>social security number of 222-22-2222</ENTINFO_SSN_MENTION>
     </ENTINFO>
   </ENTITIES>
</NORMDOC>

Upvotes: 1

Views: 69

Answers (1)

Parfait
Parfait

Reputation: 107652

Consider XSLT 2.0's xsl:for-each-group to group the ENTINFO and RELINFO nodes by person. However, the grouping is not straightforward. My proposal is to run the grouping on RELINFO and map in all the information from its preceding siblings, the ENTINFO nodes, with conditionals and the help of variables.

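Therefore, instead of applying templates to ENTINFO, run the grouping operation on RELINFO. By the way, the phone number cannot be mapped into the final XML because the IDs as posted do not match: the phone relation's IDENTID is SSN-112233-100, while the phone entity's ID is PHO-112233-100.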

Specifically, remove this template (or keep it, though it becomes redundant) and retain all other templates.

<!-- The question's ENTINFO template: it copies the ENTINFO element and turns
     each attribute into a child element named ENTINFO_<attribute name>, with
     hyphens replaced by underscores. The replacement NORMDOC template below
     never applies templates to ENTINFO, so this template is no longer invoked. -->
<xsl:template match="ENTINFO">
    <xsl:copy>
        <xsl:for-each select="@*">
            <xsl:element name="ENTINFO_{translate(name(), '-', '_')}">
                <xsl:value-of select="." />
            </xsl:element>
        </xsl:for-each>
    </xsl:copy>
</xsl:template>

Then, only adjust the NORMDOC template as shown below (admittedly not elegant but appears to work with sample input).

<!--
  Replacement for the NORMDOC template.

  Emits the DOC subtree unchanged (via the other templates), then builds one
  ENTINFO element per person. Persons are discovered through RELINFO/@PERID,
  so a person with no RELINFO record produces no ENTINFO element.

  For each person the PERSON fields are written first, followed by a
  NORM/MENTION pair for every SSN or PHONE entity that one of the person's
  relations points to through @IDENTID. Identifier elements are written only
  when a matching ENTINFO exists, so no empty SSN or PHONE elements appear.
-->
<xsl:template match="/NORMDOC">
    <!-- All entity records, captured once so they can be looked up from inside
         the groups without depending on ENTINFO preceding RELINFO. -->
    <xsl:variable name="entities" select="ENTINFO"/>

    <xsl:apply-templates select="DOC"/>
    <ENTITIES>
        <xsl:for-each-group select="RELINFO" group-by="@PERID">
            <!-- The PERSON record that every relation in this group refers to. -->
            <xsl:variable name="person"
                          select="$entities[@ID = current-grouping-key()]"/>
            <ENTINFO>
                <ENTINFO_PERSON_NORM>
                    <xsl:value-of select="$person/@NORM"/>
                </ENTINFO_PERSON_NORM>
                <ENTINFO_PERSON_MENTION>
                    <xsl:value-of select="$person/@MENTION"/>
                </ENTINFO_PERSON_MENTION>
                <ENTINFO_PERSON_GIVEN>
                    <xsl:value-of select="$person/@GIVEN"/>
                </ENTINFO_PERSON_GIVEN>
                <ENTINFO_PERSON_MIDDLE>
                    <xsl:value-of select="$person/@MIDDLE"/>
                </ENTINFO_PERSON_MIDDLE>
                <ENTINFO_PERSON_SURNAME>
                    <xsl:value-of select="$person/@SURNAME"/>
                </ENTINFO_PERSON_SURNAME>

                <!-- Visit every relation of this person, not just the first one,
                     so that a person with several identifiers gets all of them. -->
                <xsl:for-each select="current-group()">
                    <xsl:variable name="identid" select="@IDENTID"/>
                    <xsl:for-each select="$entities[@REFID = $identid
                                                    and @TYPE = ('SSN', 'PHONE')]">
                        <!-- The element name carries the entity type,
                             e.g. ENTINFO_SSN_NORM or ENTINFO_PHONE_NORM. -->
                        <xsl:element name="ENTINFO_{@TYPE}_NORM">
                            <xsl:value-of select="@NORM"/>
                        </xsl:element>
                        <xsl:element name="ENTINFO_{@TYPE}_MENTION">
                            <xsl:value-of select="@MENTION"/>
                        </xsl:element>
                    </xsl:for-each>
                </xsl:for-each>
            </ENTINFO>
        </xsl:for-each-group>
    </ENTITIES>
</xsl:template>

XSLT Fiddle Demo

Upvotes: 2

Related Questions