David
David

Reputation: 23

XSLT conversion of open XML into clean HTML lists

I have created an XSLT file that converts everything in a Word XML file into clean HTML; however, I am unable to convert nested lists properly.

I saved a Word v16.12 file as XML. The Word file contains two lists.

Here is the exported Open XML (relating to just the bullets).

<w:body>
<w:p w:rsidR="00875AF6" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="0"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 1 level 1</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="0"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 2 level 1</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="1"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 3 level 2</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="2"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 4 level 3</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="2"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 5 level 3</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="1"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 6 level 2</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="2"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 7 level 3</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="007A38EC">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="0"/>
            <w:numId w:val="1"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 1 Bullet 8 level 1</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241"/>
<w:p w:rsidR="00575241" w:rsidRDefault="00575241" w:rsidP="00575241">
    <w:r>
        <w:t>This is a break</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="00575241" w:rsidRDefault="00575241" w:rsidP="00575241"/>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="0"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 1 level 1</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="1"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 2 level 2</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="2"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 3 level 3</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="0"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 4 level 1</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="1"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 5 level 2</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="1"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 6 level 2</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="2"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 7 level 3</w:t>
    </w:r>
    <w:bookmarkStart w:id="0" w:name="_GoBack"/>
    <w:bookmarkEnd w:id="0"/>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241">
    <w:pPr>
        <w:pStyle w:val="ListParagraph"/>
        <w:numPr>
            <w:ilvl w:val="1"/>
            <w:numId w:val="2"/>
        </w:numPr>
    </w:pPr>
    <w:r>
        <w:t>List 2 Bullet 8 level 2</w:t>
    </w:r>
</w:p>
<w:p w:rsidR="007A38EC" w:rsidRDefault="007A38EC" w:rsidP="00575241"/>
<w:sectPr w:rsidR="007A38EC" w:rsidSect="00D678D3">
    <w:pgSz w:w="11900" w:h="16840"/>
    <w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"
        w:header="708" w:footer="708" w:gutter="0"/>
        <w:cols w:space="708"/>
        <w:docGrid w:linePitch="360"/>
    </w:sectPr>
</w:body>

Using XSLT I need to convert the XML into this HTML

<ul>
  <li>List 1 Bullet 1 level 1</li>
  <li>List 1 Bullet 2 level 1
    <ul>
      <li>List 1 Bullet 3 level 2
        <ul>
          <li>List 1 Bullet 4 level 3</li>
          <li>List 1 Bullet 5 level 3</li>
        </ul>
      </li>
      <li>List 1 Bullet 6 level 2
        <ul>
          <li>List 1 Bullet 7 level 3</li>
        </ul>
      </li>
    </ul>
  </li>
  <li>List 1 Bullet 8 level 1</li>
</ul>
<p>This is a break</p>
<ul>
  <li>List 2 Bullet 1 level 1
    <ul>
      <li>List 2 Bullet 2 level 2
        <ul>
          <li>List 2 Bullet 3 level 3</li>
        </ul>
      </li>
    </ul>
  </li>
  <li>List 2 Bullet 4 level 1
    <ul>
      <li>List 2 Bullet 5 level 2</li>
      <li>List 2 Bullet 6 level 2
        <ul>
          <li>List 2 Bullet 7 level 3</li>
        </ul>
      </li>
      <li>List 2 Bullet 8 level 2</li>
    </ul>
  </li>
</ul>

I have researched this, and the closest solution I found uses a function and for-each-group, like the one below.

<xsl:stylesheet version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:mf="http://example.com/mf"
    exclude-result-prefixes="xs mf">

    <xsl:strip-space elements="*"/>
    <xsl:output indent="yes"/>

    <!-- Identity rule: copies any attribute or node that no more specific
         template matches, then processes its attributes and children. -->
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@* | node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- An item carrying a level attribute is rewritten as a plain <item>:
         its attributes are dropped and only its children are processed. -->
    <xsl:template match="item[@level]">
        <item>
            <xsl:apply-templates/>
        </item>
    </xsl:template>

    <!-- Copies the <test> element and splits its child elements into
         adjacent runs: runs of <item> elements are handed to mf:group
         starting at level 0, every other run is processed normally. -->
    <xsl:template match="test">
        <xsl:copy>
            <xsl:for-each-group select="*" group-adjacent="exists(self::item)">
                <xsl:choose>
                    <xsl:when test="not(current-grouping-key())">
                        <xsl:apply-templates select="current-group()"/>
                    </xsl:when>
                    <xsl:otherwise>
                        <xsl:sequence select="mf:group(current-group(), 0)"/>
                    </xsl:otherwise>
                </xsl:choose>
            </xsl:for-each-group>
        </xsl:copy>
    </xsl:template>

    <!--
        mf:group($nodes, $level)
          $nodes  the run of adjacent nodes to turn into one list
          $level  the @level value that belongs directly to this list
        Returns nothing for an empty $nodes. Otherwise returns one
        <list type="ul"> element: consecutive nodes whose @level equals
        $level are processed with apply-templates, and every other run is
        grouped again by a recursive call with $level + 1.
    -->
    <xsl:function name="mf:group" as="node()*">
        <xsl:param name="nodes" as="node()*"/>
        <xsl:param name="level" as="xs:integer"/>
        <xsl:if test="exists($nodes)">
            <list type="ul">
                <xsl:for-each-group select="$nodes"
                    group-adjacent="exists(self::*[@level = $level])">
                    <xsl:choose>
                        <xsl:when test="not(current-grouping-key())">
                            <!-- run of nodes at some other level: nest one step further -->
                            <xsl:sequence select="mf:group(current-group(), $level + 1)"/>
                        </xsl:when>
                        <xsl:otherwise>
                            <xsl:apply-templates select="current-group()"/>
                        </xsl:otherwise>
                    </xsl:choose>
                </xsl:for-each-group>
            </list>
        </xsl:if>
    </xsl:function>

</xsl:stylesheet>

Unfortunately using functions and the for-each-group is beyond my ability. My question is how would I amend the above XSLT to work with the XML that I am getting from Word?

Upvotes: 2

Views: 457

Answers (1)

Joel M. Lamsen
Joel M. Lamsen

Reputation: 7173

First off, we'll start with an identity template:

<!-- Identity rule: copies the matched attribute or node and then processes
     its attributes and children, so anything without a more specific
     template passes through unchanged. -->
<xsl:template match="@* | node()">
    <xsl:copy>
        <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
</xsl:template>

Second, we match the w:body element and group its child elements using xsl:for-each-group. We store the result in a variable (firstPass) so that the generated nodes can be processed again in a second pass:

<!-- Handles the document body in two passes.
     First pass: the children of w:body are split into adjacent runs; a run
     of w:p elements that contain a w:ilvl descendant is turned into a list
     by mf:group, any other run is processed by the ordinary templates. The
     result is collected in $firstPass.
     Second pass: the nodes held in $firstPass are run through the
     templates again.
     To process only one particular node, narrow the match pattern below
     so that it selects just that node. -->
<xsl:template match="w:body">
    <xsl:variable name="firstPass">
        <xsl:for-each-group select="*"
            group-adjacent="exists(self::w:p[descendant::w:ilvl])">
            <xsl:choose>
                <xsl:when test="not(current-grouping-key())">
                    <xsl:apply-templates select="current-group()"/>
                </xsl:when>
                <xsl:otherwise>
                    <!-- grouping starts at 0, the w:val that w:ilvl
                         carries on the outermost list items -->
                    <xsl:sequence select="mf:group(current-group(), 0)"/>
                </xsl:otherwise>
            </xsl:choose>
        </xsl:for-each-group>
    </xsl:variable>
    <xsl:apply-templates select="$firstPass/node()"/>
</xsl:template>

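Next, we can adapt the function that you mentioned. We change the group-adjacent expression so that it compares the w:val attribute of the w:ilvl element with the current level, and we emit a <ul> instead of <list type="ul">: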

<!--
    mf:group($nodes, $level)
      $nodes  a run of adjacent list paragraphs
      $level  the w:ilvl/@w:val value that belongs directly to this list
    Returns nothing for an empty $nodes. Otherwise returns one <ul>:
    consecutive nodes having a w:ilvl descendant whose w:val equals $level
    are processed with apply-templates, and every other run is grouped
    again by a recursive call with $level + 1, which yields a nested <ul>.
-->
<xsl:function name="mf:group" as="node()*">
    <xsl:param name="nodes" as="node()*"/>
    <xsl:param name="level" as="xs:integer"/>
    <xsl:if test="exists($nodes)">
        <ul>
            <xsl:for-each-group select="$nodes"
                group-adjacent="exists(self::*[descendant::w:ilvl/@w:val = $level])">
                <xsl:choose>
                    <xsl:when test="not(current-grouping-key())">
                        <!-- run at some other level: nest one step further -->
                        <xsl:sequence select="mf:group(current-group(), $level + 1)"/>
                    </xsl:when>
                    <xsl:otherwise>
                        <xsl:apply-templates select="current-group()"/>
                    </xsl:otherwise>
                </xsl:choose>
            </xsl:for-each-group>
        </ul>
    </xsl:if>
</xsl:function>

The following templates are needed for the cleanup:

<!-- Empty paragraphs and the section properties produce no output.
     The explicit priority makes this rule win over both w:p rules below,
     so an empty list paragraph does not yield an empty <li>. -->
<xsl:template match="w:p[.=''] | w:sectPr" priority="2"/>

<!-- A list paragraph (one carrying w:pPr/w:numPr/w:ilvl) becomes exactly
     one <li>, however many text runs (w:t) it contains. The test uses the
     numbering properties rather than the paragraph style so that it agrees
     with the w:ilvl test used when the paragraphs are grouped into <ul>. -->
<xsl:template match="w:p[w:pPr/w:numPr/w:ilvl]" priority="1">
    <li>
        <xsl:apply-templates select="descendant::w:t"/>
    </li>
</xsl:template>

<!-- Any other non-empty paragraph becomes exactly one <p>. -->
<xsl:template match="w:p">
    <p>
        <xsl:apply-templates select="descendant::w:t"/>
    </p>
</xsl:template>

<!-- A w:t element contributes only its text; the wrapping <li> or <p> is
     produced once per paragraph by the w:p templates above. -->
<xsl:template match="w:t">
    <xsl:value-of select="."/>
</xsl:template>

After that, we still need to insert the <ul> sublevels into parent <li>. To do that, we have to do a second pass of transformation.

We now match the nodes held in the firstPass variable:

<!-- Second pass: a <ul> whose nearest preceding sibling element is an <li>
     is dropped here; the <li> template below re-emits it inside that <li>. -->
<xsl:template match="ul[preceding-sibling::*[1][name()='li']]"/>

<!-- Second pass: an <li> whose next sibling element is a <ul> is copied
     with its own content first and that <ul> appended as its last child.
     The <ul> is processed in mode "transfer" so that the removal template
     above does not apply to it. -->
<xsl:template match="li[following-sibling::*[1][name()='ul']]">
    <xsl:variable name="nestedList" select="following-sibling::*[1][name()='ul']"/>
    <xsl:copy>
        <xsl:apply-templates/>
        <xsl:apply-templates select="$nestedList" mode="transfer"/>
    </xsl:copy>
</xsl:template>

and an identity template for the other mode

<!-- Identity rule for mode "transfer": copies the matched node itself.
     Its attributes and children are then processed without a mode
     attribute, i.e. in the default mode. -->
<xsl:template match="@* | node()" mode="transfer">
    <xsl:copy>
        <xsl:apply-templates select="@* | node()"/>
    </xsl:copy>
</xsl:template>

The whole stylesheet is as follows:

<!--
    Converts the list paragraphs of a WordprocessingML w:body into nested
    HTML <ul>/<li> markup and every other non-empty paragraph into <p>.

    The work is done in two passes inside the w:body template:
      1. adjacent list paragraphs are grouped into flat <ul> elements, with
         a deeper level emitted as a <ul> sibling of the <li> elements;
      2. each such <ul> is moved into the <li> that precedes it.

    The w prefix is bound to the WordprocessingML main namespace; it must be
    the same URI that the input document binds to its w prefix, otherwise
    none of the w:* patterns match.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:mf="http://example.com/mf"
    xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    version="2.0"
    exclude-result-prefixes="xs mf w">

    <xsl:strip-space elements="*"/>
    <xsl:output indent="yes" omit-xml-declaration="yes"/>

    <!--
        mf:group($nodes, $level)
          $nodes  a run of adjacent list paragraphs
          $level  the w:ilvl/@w:val value that belongs directly to this list
        Returns nothing for an empty $nodes. Otherwise returns one <ul>:
        consecutive paragraphs whose own w:pPr/w:numPr/w:ilvl has w:val equal
        to $level are processed with apply-templates, and every other run is
        grouped again by a recursive call with $level + 1 (a nested <ul>).
    -->
    <xsl:function name="mf:group" as="node()*">
        <xsl:param name="nodes" as="node()*"/>
        <xsl:param name="level" as="xs:integer"/>
        <xsl:if test="$nodes">
            <ul>
                <xsl:for-each-group select="$nodes"
                    group-adjacent="boolean(self::*[w:pPr/w:numPr/w:ilvl/@w:val = $level])">
                    <xsl:choose>
                        <xsl:when test="current-grouping-key()">
                            <xsl:apply-templates select="current-group()"/>
                        </xsl:when>
                        <xsl:otherwise>
                            <xsl:sequence select="mf:group(current-group(), $level + 1)"/>
                        </xsl:otherwise>
                    </xsl:choose>
                </xsl:for-each-group>
            </ul>
        </xsl:if>
    </xsl:function>

    <!-- Identity rule: anything without a more specific template is copied. -->
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@*, node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Identity rule for mode "transfer": copies the node itself, then
         processes its attributes and children in the default mode so that
         the nesting templates below apply again one level down. -->
    <xsl:template match="@* | node()" mode="transfer">
        <xsl:copy>
            <xsl:apply-templates select="@*, node()"/>
        </xsl:copy>
    </xsl:template>

    <!-- Empty paragraphs and the section properties produce no output.
         The explicit priority makes this rule win over both w:p rules
         below, so an empty list paragraph does not yield an empty <li>. -->
    <xsl:template match="w:p[.=''] | w:sectPr" priority="2"/>

    <!-- A list paragraph becomes exactly one <li>, however many text runs
         (w:t) it contains. The test is the same w:pPr/w:numPr/w:ilvl test
         that the grouping code uses. -->
    <xsl:template match="w:p[w:pPr/w:numPr/w:ilvl]" priority="1">
        <li>
            <xsl:apply-templates select="descendant::w:t"/>
        </li>
    </xsl:template>

    <!-- Any other non-empty paragraph becomes exactly one <p>. -->
    <xsl:template match="w:p">
        <p>
            <xsl:apply-templates select="descendant::w:t"/>
        </p>
    </xsl:template>

    <!-- A w:t element contributes only its text. -->
    <xsl:template match="w:t">
        <xsl:value-of select="."/>
    </xsl:template>

    <!-- First pass: split the children of w:body into adjacent runs; runs of
         list paragraphs go to mf:group starting at level 0 (the w:val that
         w:ilvl carries on the outermost items), all other runs go through
         the ordinary templates. Second pass: re-process the result so that
         each <ul> is nested into the preceding <li>. -->
    <xsl:template match="w:body">
        <xsl:variable name="firstPass">
            <xsl:for-each-group select="*"
                group-adjacent="boolean(self::w:p[w:pPr/w:numPr/w:ilvl])">
                <xsl:choose>
                    <xsl:when test="current-grouping-key()">
                        <xsl:sequence select="mf:group(current-group(), 0)"/>
                    </xsl:when>
                    <xsl:otherwise>
                        <xsl:apply-templates select="current-group()"/>
                    </xsl:otherwise>
                </xsl:choose>
            </xsl:for-each-group>
        </xsl:variable>
        <xsl:apply-templates select="$firstPass/node()"/>
    </xsl:template>

    <!-- Second pass: an <li> directly followed by a <ul> is copied with its
         own content first and that <ul> appended as its last child. The
         <ul> is processed in mode "transfer" so that the removal template
         below does not apply to it. -->
    <xsl:template match="li[following-sibling::*[1][name()='ul']]">
        <xsl:copy>
            <xsl:apply-templates/>
            <xsl:apply-templates select="following-sibling::*[1][name()='ul']" mode="transfer"/>
        </xsl:copy>
    </xsl:template>

    <!-- Second pass: a <ul> directly preceded by an <li> is dropped at its
         original position; the <li> template above has already emitted it. -->
    <xsl:template match="ul[preceding-sibling::*[1][name()='li']]"/>

</xsl:stylesheet>

See it in action here.

Upvotes: 2

Related Questions