euler
euler

Reputation: 1411

Grouping values with same attributes using xslt-1.0

Given this input XML:

<?xml version="1.0" encoding="ISO-8859-1" ?>
<agrisResources xmlns:ags="http://purl.org/agmes/1.1/" xmlns:dc="http://purl.org/dc/elements/1.1/">
    <agrisResource bibliographicLevel="AM" ags:ARN="^aSF17^b00003">
        <dc:subject xml:lang="en">Penaeidae</dc:subject>
        <dc:subject xml:lang="en">Vibrio harveyi</dc:subject>
        <dc:subject xml:lang="en">Vibrio parahaemolyticus</dc:subject>
        <dc:subject>
            <ags:subjectClassification scheme="ags:ASC">ASFA-1</ags:subjectClassification>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases</ags:subjectThesaurus>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Fish diseases</ags:subjectThesaurus>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Genes</ags:subjectThesaurus>
        </dc:subject>
    </agrisResource>
</agrisResources>

I would like to group items with the same attributes, so the output would be like this:

<dc:subject xml:lang="en">Penaeidae||Vibrio harveyi||Vibrio parahaemolyticus</dc:subject>
<dc:subject>
    <ags:subjectClassification scheme="ags:ASC">ASFA-1</ags:subjectClassification>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases||Fish diseases||Genes</ags:subjectThesaurus>
</dc:subject>

Basically, my rule for the grouping is to combine the values of a node if that node occurs multiple times, e.g. dc:subject and ags:subjectThesaurus. I specified "with same attributes" in my title because I'm not sure whether it is possible to group them by their tags alone, without using their attributes to differentiate them.

In other words, differentiate

<dc:subject>Penaeidae</dc:subject>

from

<dc:subject>
    <ags:subjectThesaurus>Bacterial diseases</ags:subjectThesaurus>
</dc:subject>

UPDATE

INPUT XML

<?xml version="1.0" encoding="ISO-8859-1" ?>
<agrisResources xmlns:ags="http://purl.org/agmes/1.1/" xmlns:dc="http://purl.org/dc/elements/1.1/">
    <agrisResource bibliographicLevel="AM" ags:ARN="^aSF17^b00003">
        <dc:creator>
            <ags:creatorPersonal>Doe, John</ags:creatorPersonal>
            <ags:creatorPersonal>Smith, Jason T.</ags:creatorPersonal>
            <ags:creatorPersonal>Doe, Jane E.</ags:creatorPersonal>
        </dc:creator>
        <dc:subject xml:lang="en">Penaeidae</dc:subject>
        <dc:subject xml:lang="en">Vibrio harveyi</dc:subject>
        <dc:subject xml:lang="en">Vibrio parahaemolyticus</dc:subject>
        <dc:subject>
            <ags:subjectClassification scheme="ags:ASC">ASFA-1</ags:subjectClassification>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases</ags:subjectThesaurus>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Fish diseases</ags:subjectThesaurus>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Genes</ags:subjectThesaurus>
        </dc:subject>
    </agrisResource>
</agrisResources>

Desired Output

Rules on grouping: Combine the values of repeating elements, using a double pipe || as the separator, e.g. <ags:creatorPersonal>, <dc:subject xml:lang="en"> and <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">. Leave elements that do not meet that rule as is.

<?xml version="1.0" encoding="ISO-8859-1" ?>
<agrisResources xmlns:ags="http://purl.org/agmes/1.1/" xmlns:dc="http://purl.org/dc/elements/1.1/">
    <agrisResource bibliographicLevel="AM" ags:ARN="^aSF17^b00003">
        <dc:creator>
            <ags:creatorPersonal>Doe, John||Smith, Jason T.||Doe, Jane E.</ags:creatorPersonal>
        </dc:creator>
        <dc:subject xml:lang="en">Penaeidae||Vibrio harveyi||Vibrio parahaemolyticus</dc:subject>
        <dc:subject>
            <ags:subjectClassification scheme="ags:ASC">ASFA-1</ags:subjectClassification>
            <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases||Fish diseases||Genes</ags:subjectThesaurus>
        </dc:subject>
    </agrisResource>
</agrisResources>

Below is my code, based on this answer:

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:dc="http://purl.org/dc/terms/"
            xmlns:ags="http://purl.org/agmes/1.1/"
            xmlns:agls="http://www.naa.gov.au/recordkeeping/gov_online/agls/1.2"
            xmlns:dcterms="http://purl.org/dc/terms/">
    <!--
        Sibling-chaining approach: the first element of a run of adjacent,
        same-attribute siblings is copied and the text of the rest of the run
        is appended to it with '||' as separator.

        NOTE(review): the dc prefix is bound above to http://purl.org/dc/terms/
        (the same URI as dcterms), whereas the input document binds dc to
        http://purl.org/dc/elements/1.1/. Patterns are compared by namespace
        URI, not by prefix, so every dc:subject pattern below matches nothing
        in that input and those elements fall through to the identity template.
    -->
    <xsl:output method="xml" indent="yes" omit-xml-declaration="yes"/>
    <xsl:strip-space elements="*"/>

    <!-- Identity transform: copies every node and attribute unchanged. -->
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@* | node()"/>
        </xsl:copy>
    </xsl:template>

    <!--
        Head of a run: copy the element with its attributes and own text, then
        let NextSibling append the following members of the run.
        NOTE(review): only "@* | text()" is processed here, so child elements
        of a matched element are not copied.
    -->
    <xsl:template match="ags:subjectThesaurus|dc:subject">
        <xsl:copy>
            <xsl:apply-templates select="@* | text()"/>
                <xsl:call-template name="NextSibling"/>
        </xsl:copy>
    </xsl:template>
    <!--
        Non-head members of a run: an element whose immediately preceding
        sibling is the same element with the same @scheme (thesaurus) or the
        same @xml:lang (subject) produces no output of its own, because its
        text was already appended to the head. The predicates give these
        patterns a higher default priority (0.5) than the plain name
        patterns above (0).
    -->
    <xsl:template match="ags:subjectThesaurus[@scheme = preceding-sibling::*[1][self::ags:subjectThesaurus]/@scheme]|dc:subject[@xml:lang = preceding-sibling::*[1][self::dc:subject]/@xml:lang]"/>

    <!-- Appends '||' plus this element's string value, then continues the run. -->
    <xsl:template match="ags:subjectThesaurus|dc:subject" mode="includeSib">
        <xsl:value-of select="concat('||', .)"/>
            <xsl:call-template name="NextSibling"/>
        </xsl:template>

    <!--
        Applies mode "includeSib" to the immediately following sibling element,
        but only if it is the same kind of element as the current node and
        carries the same @scheme / @xml:lang value; otherwise the run ends.
    -->
    <xsl:template name="NextSibling">
        <xsl:apply-templates select="following-sibling::*[1][self::ags:subjectThesaurus and @scheme = current()/@scheme]|following-sibling::*[1][self::dc:subject and @xml:lang = current()/@xml:lang]" mode="includeSib"/>
    </xsl:template>
</xsl:stylesheet>

My only problem is that it is only transforming the ags:subjectThesaurus but not the dc:subject node. My output looks like this:

<dc:subject xml:lang="en">Penaeidae</dc:subject>
<dc:subject xml:lang="en">Vibrio harveyi</dc:subject>
<dc:subject xml:lang="en">Vibrio parahaemolyticus</dc:subject>
<dc:subject>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases||Fish diseases||Genes</ags:subjectThesaurus>
</dc:subject>

How can I modify my code such that it will also group the dc:subject node with the same xml:lang attribute?

EDIT

Based on the suggestion of michael.hor257k and from this answer to use the Muenchian method, below is what I tried:

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:dc="http://purl.org/dc/terms/"
            xmlns:ags="http://purl.org/agmes/1.1/"
            xmlns:agls="http://www.naa.gov.au/recordkeeping/gov_online/agls/1.2"
            xmlns:dcterms="http://purl.org/dc/terms/">
    <!--
        Muenchian-grouping attempt: dc:subject elements are keyed by @xml:lang
        and ags:subjectThesaurus elements by @scheme; the first member of each
        group is copied with the joined text of the whole group.

        NOTE(review): the dc prefix is bound above to http://purl.org/dc/terms/,
        whereas the input document binds dc to http://purl.org/dc/elements/1.1/.
        Every key and pattern below that mentions dc:subject therefore matches
        nothing in that input.
    -->
    <xsl:output method="xml" indent="yes" omit-xml-declaration="yes"/>
    <xsl:strip-space elements="*"/>
    <!-- Index of dc:subject elements that carry xml:lang, by that attribute's value. -->
    <xsl:key name="kNodeSubject" match="dc:subject[@xml:lang]" use="@xml:lang"/>
    <!-- Index of ags:subjectThesaurus children of dc:subject, by @scheme. -->
    <xsl:key name="subjectThesaurus" match="dc:subject/ags:subjectThesaurus" use="@scheme"/>
    <!-- Identity transform: copies every node and attribute unchanged. -->
    <xsl:template match="node() | @*">
        <xsl:copy>
            <xsl:apply-templates select="node() | @*"/>
        </xsl:copy>
    </xsl:template>
    <!-- First dc:subject of each xml:lang group: copy it and emit the joined group text. -->
    <xsl:template match="dc:subject[generate-id() = generate-id(key('kNodeSubject', @xml:lang)[1])]">
        <xsl:copy>
            <xsl:apply-templates select="@*"/>
            <xsl:apply-templates select="key('kNodeSubject', @xml:lang)" mode="concat"/>
        </xsl:copy>
    </xsl:template>

    <!-- First ags:subjectThesaurus of each scheme group: copy it and emit the joined group text. -->
    <xsl:template match="dc:subject/ags:subjectThesaurus[generate-id() = generate-id(key('subjectThesaurus', @scheme)[1])]">
        <xsl:copy>
            <xsl:apply-templates select="@*"/>
            <xsl:apply-templates select="key('subjectThesaurus', @scheme)" mode="concat"/>
        </xsl:copy>
    </xsl:template>

    <!--
        Emits one group member's text, followed by '||' unless it is the last
        member of the node list being processed.
        NOTE(review): "subjectThesaurus" is written without the ags: prefix, so
        it names an element in no namespace and does not match
        ags:subjectThesaurus.
    -->
    <xsl:template match="dc:subject|subjectThesaurus" mode="concat">
        <xsl:value-of select="."/>
            <xsl:if test="position() != last()">
                <xsl:text>||</xsl:text>
            </xsl:if>
    </xsl:template>

    <!--
        Suppress every dc:subject / ags:subjectThesaurus not handled by the
        higher-priority "first of group" templates above.
        NOTE(review): these also apply to elements that are in no group at all.
        A dc:subject without xml:lang would be removed together with its
        children, and when the "first of group" thesaurus pattern matches
        nothing, every ags:subjectThesaurus is removed.
    -->
    <xsl:template match="dc:subject"/>
    <xsl:template match="ags:subjectThesaurus"/>
</xsl:stylesheet>

When I applied the code above, the ags:subjectThesaurus nodes were gone and the values of <dc:subject xml:lang="en"> were not grouped either. I don't know if I have the match right; I used match="dc:subject[@xml:lang]" for <xsl:key name="kNodeSubject"> because the node ags:subjectThesaurus is a child of <dc:subject>.

Thanks in advance.

Upvotes: 0

Views: 608

Answers (1)

michael.hor257k
michael.hor257k

Reputation: 117140

Consider the following example:

XML

<root xmlns:dc="http://purl.org/dc/terms/" xmlns:ags="http://purl.org/agmes/1.1/">
  <dc:subject xml:lang="en">Penaeidae</dc:subject>
  <dc:subject xml:lang="en">Vibrio harveyi</dc:subject>
  <dc:subject xml:lang="fr">Franca premier</dc:subject>
  <dc:subject xml:lang="fr">Franca deux</dc:subject>
  <dc:subject xml:lang="en">Vibrio parahaemolyticus</dc:subject>
  <dc:subject>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases</ags:subjectThesaurus>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Fish diseases</ags:subjectThesaurus>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Genes</ags:subjectThesaurus>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:B">Bees</ags:subjectThesaurus>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:B">Birds</ags:subjectThesaurus>
  </dc:subject>
</root>

XSLT 1.0

<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:dc="http://purl.org/dc/terms/"
xmlns:ags="http://purl.org/agmes/1.1/">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:strip-space elements="*"/>

<!--
    Muenchian grouping keys. Each key value starts with the parent's
    generate-id(), so groups are formed per parent element and never merge
    entries that belong to different parents (e.g. different records).
-->
<xsl:key name="subj-by-lang" match="dc:subject[@xml:lang]" use="concat(generate-id(..), '|', @xml:lang)"/>
<xsl:key name="thes-by-scheme" match="ags:subjectThesaurus" use="concat(generate-id(..), '|', @scheme)"/>

<!-- identity transform -->
<xsl:template match="@*|node()">
    <xsl:copy>
        <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
</xsl:template>

<!-- emits one group member's text, preceded by '||' for all but the first -->
<xsl:template match="*" mode="join">
    <xsl:if test="position() != 1">
        <xsl:text>||</xsl:text>
    </xsl:if>
    <xsl:value-of select="."/>
</xsl:template>

<!--
    Container of the dc:subject elements: writes one dc:subject per distinct
    xml:lang value (joined text of the group), then processes all remaining
    children through the other templates.
-->
<xsl:template match="root">
    <xsl:copy>
        <xsl:apply-templates select="@*"/>
        <!-- group subjects by lang; the predicate keeps only the first member of each group -->
        <xsl:for-each select="dc:subject[@xml:lang][count(. | key('subj-by-lang', concat(generate-id(..), '|', @xml:lang))[1]) = 1]">
            <xsl:copy>
                <xsl:copy-of select="@xml:lang"/>
                <xsl:apply-templates select="key('subj-by-lang', concat(generate-id(..), '|', @xml:lang))" mode="join"/>
            </xsl:copy>
        </xsl:for-each>
        <!-- process other nodes -->
        <xsl:apply-templates select="node()[not(self::dc:subject[@xml:lang])]"/>
    </xsl:copy>
</xsl:template>

<!--
    A dc:subject reached through normal template processing (from the root
    template these are the ones without xml:lang): keeps its attributes and
    any children that are not thesaurus entries, then writes one
    ags:subjectThesaurus per distinct scheme.
-->
<xsl:template match="dc:subject">
    <xsl:copy>
        <xsl:apply-templates select="@*"/>
        <!-- keep everything that is not a thesaurus entry, e.g. ags:subjectClassification -->
        <xsl:apply-templates select="node()[not(self::ags:subjectThesaurus)]"/>
        <!-- group thesauri by scheme -->
        <xsl:for-each select="ags:subjectThesaurus[count(. | key('thes-by-scheme', concat(generate-id(..), '|', @scheme))[1]) = 1]">
            <!-- xsl:copy keeps the element in its original (ags) namespace -->
            <xsl:copy>
                <xsl:copy-of select="@xml:lang | @scheme"/>
                <xsl:apply-templates select="key('thes-by-scheme', concat(generate-id(..), '|', @scheme))" mode="join"/>
            </xsl:copy>
        </xsl:for-each>
    </xsl:copy>
</xsl:template>

</xsl:stylesheet>

Result

<?xml version="1.0" encoding="UTF-8"?>
<root xmlns:dc="http://purl.org/dc/terms/" xmlns:ags="http://purl.org/agmes/1.1/">
  <dc:subject xml:lang="en">Penaeidae||Vibrio harveyi||Vibrio parahaemolyticus</dc:subject>
  <dc:subject xml:lang="fr">Franca premier||Franca deux</dc:subject>
  <dc:subject>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:ASFAT">Bacterial diseases||Fish diseases||Genes</ags:subjectThesaurus>
    <ags:subjectThesaurus xml:lang="en" scheme="ags:B">Bees||Birds</ags:subjectThesaurus>
  </dc:subject>
</root>

Added:

Based on your clarifications, I suspect you want to do something much simpler: just join together some leaf nodes (i.e. nodes with no child elements) and leave the others as is.

Here's an example joining the dc:subject leaf nodes within agrisResource:

<!--
    Joins the text of all leaf dc:subject children (those with no child
    elements) of an agrisResource into a single dc:subject, using '||' as
    separator, and passes every other child on to the remaining templates.
    The joined element is written where the first leaf subject stood, so
    siblings that come before it (e.g. dc:creator) keep their position.

    The dc prefix must be bound in the enclosing stylesheet to the same
    namespace URI that the input uses for dc:subject; the question's XML
    uses http://purl.org/dc/elements/1.1/.
-->
<xsl:template match="agrisResource">
    <xsl:variable name="leaf-subjects" select="dc:subject[not(*)]"/>
    <xsl:copy>
        <!-- xsl:copy does not copy attributes, so process them explicitly -->
        <xsl:apply-templates select="@*"/>
        <xsl:choose>
            <xsl:when test="$leaf-subjects">
                <!-- nodes that precede the first leaf subject -->
                <xsl:apply-templates select="$leaf-subjects[1]/preceding-sibling::node()"/>
                <!-- join subjects with no children -->
                <dc:subject>
                    <!-- copy the attributes of the first subject with no children -->
                    <xsl:copy-of select="$leaf-subjects[1]/@*"/>
                    <!-- concat the values of all subjects with no children -->
                    <xsl:for-each select="$leaf-subjects">
                        <xsl:value-of select="."/>
                        <xsl:if test="position() != last()">
                            <xsl:text>||</xsl:text>
                        </xsl:if>
                    </xsl:for-each>
                </dc:subject>
                <!-- remaining nodes, skipping the leaf subjects that were just joined -->
                <xsl:apply-templates select="$leaf-subjects[1]/following-sibling::node()[not(self::dc:subject[not(*)])]"/>
            </xsl:when>
            <xsl:otherwise>
                <!-- nothing to join: do not emit an empty dc:subject -->
                <xsl:apply-templates select="node()"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:copy>
</xsl:template>

This could be generalized by using a key based on an element's name.

Upvotes: 1

Related Questions