J R
J R

Reputation: 49

How to Efficiently compare 2 large volume XML files

-- EDIT -- , clarifying documents & desired output. (also why variance between 1st reponse)

I'm trying to compare 2 large XML data sets using XSLT 2.0 (I can also use 3.0) and I'm having some performance issues.

I have ~300k records in file 1 that I need to compare against another ~300k records in file 2 to see if entries from file 1 exists in file 2. If so, I need to insert a node to the result. I also need to exclude certain record types from file 1.

File 1

<?xml version="1.0" encoding="UTF-8"?>
<root>
    <row>
        <col1>100035</col1>
        <col2>3000009091</col2>
        <col3>SSL</col3>
        <col4>8.000000</col4>
        <col5>06-Jul-2020</col5>
        <col6>A</col6>
    </row>
    <row>
        <col1>100002</col1>
        <col2>3000009091</col2>
        <col3>UUT</col3>
        <col4>8.000000</col4>
        <col5>07-Jul-2020</col5>
        <col6>P</col6>
    </row>
    <row>
        <col1>100028</col1>
        <col2>3000009091</col2>
        <col3>UUT</col3>
        <col4>8.000000</col4>
        <col5>08-Jul-2020</col5>
        <col6>P</col6>
    </row>
    <row>
        <col1>100200</col1>
        <col2>3000009091</col2>
        <col3>UUT</col3>
        <col4>8.000000</col4>
        <col5>09-Jul-2020</col5>
        <col6>A</col6>
    </row>
    <row>
        <col1>100689</col1>
        <col2>3000009091</col2>
        <col3>UUT</col3>
        <col4>8.000000</col4>
        <col5>10-Jul-2020</col5>
        <col6>A</col6>
    </row>
    <row>
        <col1>100035</col1>
        <col2>3000013528</col2>
        <col3>UFH</col3>
        <col4>8.000000</col4>
        <col5>16-Jul-2020</col5>
        <col6>A</col6>
    </row>
</root>

File 2

<?xml version="1.0" encoding="UTF-8"?>
<nm:Data xmlns:nm="namespace">
    <nm:Entry>
        <nm:Record>
            <nm:ID>10084722-Jun-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>48548310-Jul-2020SSL</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10000201-Jul-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>57307407-Jul-2020SSL</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10003516-Jul-2020UFH</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10020009-Jul-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>00155501-Jun-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10533728-May-2020UUT</nm:ID>
        </nm:Record>
    </nm:Entry>
    <nm:Entry>
        <nm:Record>
            <nm:ID>99954801-Jul-2020UUT</nm:ID>
        </nm:Record>
    </nm:Entry>
    <nm:Entry>
        <nm:Record>
            <nm:ID>30254801-Jun-2020UFH</nm:ID>
        </nm:Record>
    </nm:Entry>
</nm:Data>

The Desired Output (copy 'A' records and add "type" node). "Adj" if there is matching ID from File 2 otherwise, "New" type:

<?xml version="1.0" encoding="UTF-8"?>
<root>
    <row>
        <type>New</type>
        <col1>100035</col1>
        <col2>3000009091</col2>
        <col3>SSL</col3>
        <col4>8.000000</col4>
        <col5>06-Jul-2020</col5>
        <col6>A</col6>
    </row> 
    <row>
        <type>Adj</type>
        <col1>100200</col1>
        <col2>3000009091</col2>
        <col3>UUT</col3>
        <col4>8.000000</col4>
        <col5>09-Jul-2020</col5>
        <col6>A</col6>
    </row>
    <row>
        <type>New</type>
        <col1>100689</col1>
        <col2>3000009091</col2>
        <col3>UUT</col3>
        <col4>8.000000</col4>
        <col5>10-Jul-2020</col5>
        <col6>A</col6>
    </row>
    <row>
        <type>Adj</type>
        <col1>100035</col1>
        <col2>3000013528</col2>
        <col3>UFH</col3>
        <col4>8.000000</col4>
        <col5>16-Jul-2020</col5>
        <col6>A</col6>
    </row>
</root>

Originally, I couldn't get the exact output so I compromised with the following xslt; however, performance is poor and I need a much more efficient solution.

XSLT Attempt 1 (want to replace exists() & copy-of() functions):

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nm="namespace"
    exclude-result-prefixes="xs" version="3.0">
    
    <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
    <xsl:strip-space elements="*"/>
    
    <xsl:variable name="report" select="document('File2.xml')"/>
    
    <xsl:template match="root">
        <root>
            <xsl:for-each select="row[col6 = 'A']">
                <record>
                    <!-- Create value to match against -->
                    <xsl:variable name="inputID" select="concat(col1,col5,col3)"/>
                    
                    <!-- Add Node based on existing match or not -->
                    <xsl:choose>
                        <xsl:when test="exists($report/nm:Data/nm:Entry/nm:Record/nm:ID[. = $inputID])">
                            <type>Adj</type>
                        </xsl:when>
                        <xsl:otherwise>
                            <type>New</type>
                        </xsl:otherwise>
                    </xsl:choose>
                    <!-- Copy all other nodes -->
                    <xsl:copy-of select="."/>
                </record>
            </xsl:for-each>
        </root>
    </xsl:template>
</xsl:stylesheet>

Actual Output 1 (not perfect output, but acceptable):

<?xml version="1.0" encoding="UTF-8"?>
<root xmlns:nm="namespace">
   <record>
      <type>New</type>
      <row>
         <col1>100035</col1>
         <col2>3000009091</col2>
         <col3>SSL</col3>
         <col4>8.000000</col4>
         <col5>06-Jul-2020</col5>
         <col6>A</col6>
      </row>
   </record>
   <record>
      <type>Adj</type>
      <row>
         <col1>100200</col1>
         <col2>3000009091</col2>
         <col3>UUT</col3>
         <col4>8.000000</col4>
         <col5>09-Jul-2020</col5>
         <col6>A</col6>
      </row>
   </record>
   <record>
      <type>New</type>
      <row>
         <col1>100689</col1>
         <col2>3000009091</col2>
         <col3>UUT</col3>
         <col4>8.000000</col4>
         <col5>10-Jul-2020</col5>
         <col6>A</col6>
      </row>
   </record>
   <record>
      <type>Adj</type>
      <row>
         <col1>100035</col1>
         <col2>3000013528</col2>
         <col3>UFH</col3>
         <col4>8.000000</col4>
         <col5>16-Jul-2020</col5>
         <col6>A</col6>
      </row>
   </record>
</root>

I then took the suggestions below and tried applying both streaming & the key() function in XSLT 3.0 but I've been unable to get anything functioning. The closest was this xslt here, but the output is incorrect.

XSLT 3.0 attempt:

<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:nm="namespace"
    exclude-result-prefixes="#all" version="3.0">

    <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <xsl:variable name="report" select="document('File2.xml')"/>

    <xsl:key name="ref" match="nm:Data/nm:Entry/nm:Record/nm:ID" use="."/>
    
    <xsl:key name="type-ref" match="row" use="col6"/>
    
    <xsl:mode on-no-match="shallow-copy"/>
    
    <xsl:template match="key('type-ref', 'A')[key('ref', col1 || col3 || col5, $report)]">
        <xsl:copy>
            <type>Adj</type>
            <xsl:apply-templates/>
        </xsl:copy>
    </xsl:template>
    
    <xsl:template match="key('type-ref', 'A')[not(key('ref', col1 || col3 || col5, $report))]">
        <xsl:copy>
            <type>New</type>
            <xsl:apply-templates/>
        </xsl:copy>
    </xsl:template>
    
    <xsl:template match="key('type-ref', 'P')"/>

</xsl:stylesheet>

3.0 Output (note that the "Adj" type is not being applied correctly but P records are being dropped):

<?xml version="1.0" encoding="UTF-8"?>
<root>
   <row>
      <type>New</type>
      <col1>100035</col1>
      <col2>3000009091</col2>
      <col3>SSL</col3>
      <col4>8.000000</col4>
      <col5>06-Jul-2020</col5>
      <col6>A</col6>
   </row>
   <row>
      <type>New</type>
      <col1>100200</col1>
      <col2>3000009091</col2>
      <col3>UUT</col3>
      <col4>8.000000</col4>
      <col5>09-Jul-2020</col5>
      <col6>A</col6>
   </row>
   <row>
      <type>New</type>
      <col1>100689</col1>
      <col2>3000009091</col2>
      <col3>UUT</col3>
      <col4>8.000000</col4>
      <col5>10-Jul-2020</col5>
      <col6>A</col6>
   </row>
   <row>
      <type>New</type>
      <col1>100035</col1>
      <col2>3000013528</col2>
      <col3>UFH</col3>
      <col4>8.000000</col4>
      <col5>16-Jul-2020</col5>
      <col6>A</col6>
   </row>
</root>

I don't quite have a deep enough understanding of the key() function to adjust to tweak it further or how to correctly apply the copy() statements when trying to use the stream mode.

Thank you again for the input & I'll keep trying.

Upvotes: 0

Views: 354

Answers (1)

Martin Honnen
Martin Honnen

Reputation: 167696

I would use a key (https://www.w3.org/TR/xslt-30/#key) to index the second document and (perhaps additionally) a key to select only certain rows for the whole processing:

  <xsl:key name="ref" match="data/id" use="."/>
  
  <xsl:key name="type-ref" match="row" use="type"/>

  <xsl:mode on-no-match="shallow-copy"/>
  
  <xsl:template match="root">
      <xsl:copy>
          <xsl:apply-templates select="key('type-ref', 'A')"/>
      </xsl:copy>
  </xsl:template>

  <xsl:template match="row[key('ref', id || code || date, $report)]">
      <xsl:copy>
         <type>Adj</type>
         <xsl:apply-templates/>
      </xsl:copy>
  </xsl:template>
  
  <xsl:template match="row[not(key('ref', id || code || date, $report))]">
      <xsl:copy>
         <type>New</type>
         <xsl:apply-templates/>
      </xsl:copy>
  </xsl:template>

https://xsltfiddle.liberty-development.net/a9HjZH/2

The arguments to the key function are explained in https://www.w3.org/TR/xslt-30/#func-key:

fn:key( $key-name    as xs:string,
        $key-value   as xs:anyAtomicType*,
        $top     as node()) as node()*

The third argument is used to identify the selected subtree. If the argument is present, the selected subtree is the set of nodes that have $top as an ancestor-or-self node. If the argument is omitted, the selected subtree is the document containing the context node. This means that the third argument effectively defaults to /.

Applied to your altered input samples (only difficulty was to concat the colX elements in the order their values appear in the second document) that would give

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:nm="namespace"
    exclude-result-prefixes="#all"
    version="3.0">

  <xsl:param name="report">
<nm:Data xmlns:nm="namespace">
    <nm:Entry>
        <nm:Record>
            <nm:ID>10084722-Jun-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>48548310-Jul-2020SSL</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10000201-Jul-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>57307407-Jul-2020SSL</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10003516-Jul-2020UFH</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10020009-Jul-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>00155501-Jun-2020UUT</nm:ID>
        </nm:Record>
        <nm:Record>
            <nm:ID>10533728-May-2020UUT</nm:ID>
        </nm:Record>
    </nm:Entry>
    <nm:Entry>
        <nm:Record>
            <nm:ID>99954801-Jul-2020UUT</nm:ID>
        </nm:Record>
    </nm:Entry>
    <nm:Entry>
        <nm:Record>
            <nm:ID>30254801-Jun-2020UFH</nm:ID>
        </nm:Record>
    </nm:Entry>
</nm:Data>
  </xsl:param>
  
  <xsl:output indent="yes"/>
  <xsl:strip-space elements="*"/>
    
  <xsl:key name="ref" match="nm:Data/nm:Entry/nm:Record/nm:ID" use="."/>
  
  <xsl:key name="type-ref" match="row" use="col6"/>

  <xsl:mode on-no-match="shallow-copy"/>
  
  <xsl:template match="root">
      <xsl:copy>
          <xsl:apply-templates select="key('type-ref', 'A')"/>
      </xsl:copy>
  </xsl:template>

  <xsl:template match="row[key('ref', col1 || col5 || col3, $report)]">
      <xsl:copy>
         <type>Adj</type>
         <xsl:apply-templates/>
      </xsl:copy>
  </xsl:template>
  
  <xsl:template match="row[not(key('ref', col1 || col5 || col3, $report))]">
      <xsl:copy>
         <type>New</type>
         <xsl:apply-templates/>
      </xsl:copy>
  </xsl:template>
  
</xsl:stylesheet>

https://xsltfiddle.liberty-development.net/a9HjZH/3

Finally, with XSLT 3 and streaming (e.g. with Saxon 9 or 10 EE) you could use a different approach that reads the second document with streaming into a map and then streams through the first input document and performs the template matching on each row materialized in memory:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:map="http://www.w3.org/2005/xpath-functions/map"
    exclude-result-prefixes="#all"
    version="3.0">
    
    <xsl:param name="doc2-uri" as="xs:string">input-sample2.xml</xsl:param>
    
    <xsl:strip-space elements="*"/>
    <xsl:output indent="yes"/>
        
    <xsl:param name="key-map" as="map(xs:string, xs:boolean)">
        <xsl:map>
            <xsl:source-document href="{$doc2-uri}" streamable="yes">
                <xsl:iterate select="data/id">
                    <xsl:map-entry key="string()" select="true()"/>
                </xsl:iterate>
            </xsl:source-document>
        </xsl:map>
    </xsl:param>
    
    <xsl:mode on-no-match="shallow-copy" streamable="yes"/>
    
    <xsl:template match="root">
        <xsl:copy>
            <xsl:apply-templates select="row!copy-of()" mode="grounded"/>
        </xsl:copy>
    </xsl:template>
    
    <xsl:mode name="grounded" on-no-match="shallow-copy"/>
    
    <xsl:template match="row[map:contains($key-map, id || code || date)]" mode="grounded">
        <xsl:copy>
            <type>Adj</type>
            <xsl:apply-templates mode="#current"/>
        </xsl:copy>
    </xsl:template>
    
    <xsl:template match="row[not(map:contains($key-map, id || code || date))]" mode="grounded">
        <xsl:copy>
            <type>New</type>
            <xsl:apply-templates mode="#current"/>
        </xsl:copy>
    </xsl:template>
    
</xsl:stylesheet>

or, for the adapted input samples and the clarified requirement that only certain types of rows are to be processed:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:map="http://www.w3.org/2005/xpath-functions/map"
    xmlns:nm="namespace"
    exclude-result-prefixes="#all"
    version="3.0">
    
    <xsl:param name="doc2-uri" as="xs:string">input2-sample2.xml</xsl:param>
    
    <xsl:strip-space elements="*"/>
    <xsl:output indent="yes"/>
    
    <xsl:param name="key-map" as="map(xs:string, xs:boolean)">
        <xsl:map>
            <xsl:source-document href="{$doc2-uri}" streamable="yes">
                <xsl:iterate select="nm:Data/nm:Entry/nm:Record/nm:ID">
                    <xsl:map-entry key="string()" select="true()"/>
                </xsl:iterate>
            </xsl:source-document>
        </xsl:map>
    </xsl:param>
    
    <xsl:mode on-no-match="shallow-copy" streamable="yes"/>
    
    <xsl:template match="root">
        <xsl:copy>
            <xsl:apply-templates select="row!copy-of()[col6 = 'A']" mode="grounded"/>
        </xsl:copy>
    </xsl:template>
    
    <xsl:mode name="grounded" on-no-match="shallow-copy"/>
    
    <xsl:template match="row[map:contains($key-map, col1 || col5 || col3)]" mode="grounded">
        <xsl:copy>
            <type>Adj</type>
            <xsl:apply-templates mode="#current"/>
        </xsl:copy>
    </xsl:template>
    
    <xsl:template match="row[not(map:contains($key-map, col1 || col5 || col3))]" mode="grounded">
        <xsl:copy>
            <type>New</type>
            <xsl:apply-templates mode="#current"/>
        </xsl:copy>
    </xsl:template>
    
</xsl:stylesheet>

That should keep the memory consumption for the the first document low, even if you have millions of rows. For the second document it streams through and build a light-weight map to store the keys instead of holding the complete XML tree and its key function in memory.

Upvotes: 1

Related Questions