Venturer
Venturer

Reputation: 84

Optimised EmEditor Macro to Element Sort, Split, and Dedupe a single Column and Extract Count

I currently have a separated file in this format (3 columns tab "\t" separated), and “;” separates all the elements within the columns).

    COL1\tCOL2\tCOL3
    abc\t123;1q\tapple\t
    dfg\t234;2w\tapple;apple\t
    hij\t345;3e\tbanana;apple;cherry;\t
    klm\t456;4r\tapple;banana;cherry;banana;cherry;\t
    nop\t567;5t\t;;apple;banana;cherry;banana;;cherry;;\t

I would like any thoughts on an optimized macro (ideally javascript) to manipulate the file to OUTPUT this: Column 3 is now sorted (any extra/unrequired delimiters are removed as well)and duplicates removed. New column 4 is the deduplicated element count.

    abc\t123;1q\tapple\t1
    dfg\t234;2w\tapple\t1
    hij\t345;3e\tapple;banana;cherry\t3
    klm\t456;4r\tapple;banana;cherry\t3
    nop\t567;5t\tapple;banana;cherry\t3

I have been trying similar to the below, but this method could be quicker I think.

    for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
      str = document.GetCell(iRow, 2, eeCellIncludeQuotes);
      var count = (str.match(/;/g) || []).length;
      var numOfElements = count + 1;
      document.SetCell( iRow, 3, numOfElements, eeAutoQuote );
    }

So the user should select the column they want to run this on (Column 3 in this example) and the macro would run on that column only, and output the count to a new column to the right.

The real source files will be millions of lines, so if this could be EmEditor optimized in any way, that would be great.

Upvotes: 0

Views: 186

Answers (1)

Yutaka
Yutaka

Reputation: 1806

I optimized your macro by creating a function to count semicolons in a string rather than using a regular expression (Second version), and also used GetColumn and SetColumn methods to increase the speed (Third version). The third version will insert a column rather than overwrite the existing column.

  1. Original macro (modified for correctness and timing)

    var start = new Date().getTime();
    
    var totalLines = document.GetLines();
    for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
        str = document.GetCell(iRow, 3, eeCellIncludeQuotes);
        var count = (str.match(/;/g) || []).length;
        var numOfElements = count + 1;
        document.SetCell( iRow, 4, numOfElements, eeAutoQuote );
    }
    
    var end = new Date().getTime();
    var time = end - start;
    alert( "Execution time: " + time + " ms" );
    
  2. Second version

    function CountSemiColon( str )
    {
        var count = 0;
        for( var index = -1; ; ) {
            index = str.indexOf( ';', index + 1 );
            if( index == -1 ) {
                break;
            }
            ++count;
        }
        return count;
    }
    
    var start = new Date().getTime();
    
    var totalLines = document.GetLines();
    for( iRow = 2; iRow <= totalLines; iRow++ ) { //traverse eash row, start at 2nd row
        var str = document.GetCell(iRow, 3, eeCellIncludeQuotes);
        document.SetCell( iRow, 4, CountSemiColon( str ) + 1, eeAutoQuote );
    }
    
    var end = new Date().getTime();
    var time = end - start;
    alert( "Execution time: " + time + " ms" );
    
  3. Third version

    function CountSemiColon( str )
    {
        var count = 0;
        for( var index = -1; ; ) {
            index = str.indexOf( ';', index + 1 );
            if( index == -1 ) {
                break;
            }
            ++count;
        }
        return count;
    }
    
    var start = new Date().getTime();
    
    var totalLines = document.GetLines();
    s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
    sLines = s1.split( "\n" );
    s2 = "";
    nTotal = sLines.length;
    for( y = 0; y < nTotal; y++ ) {
        s2 += CountSemiColon( sLines[y] ) + 1 + "\n";
    }
    x = s2.length;
    if( x > 0 ) s2 = s2.substr( 0, x - 1 );
    document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );
    
    var end = new Date().getTime();
    var time = end - start;
    alert( "Execution time: " + time + " ms" );
    
  4. Fourth version (returns 0 for empty cell)

    function CountElements( str )
    {
        if( str.length == 0 || str == '\t' ) {   // if empty string or delimiter only, return 0
            return 0;
        }
        var count = 0;
        for( var index = -1; ; ) {
            index = str.indexOf( ';', index + 1 );
            if( index == -1 ) {
                break;
            }
            ++count;
        }
        return count + 1;   // add 1 to the Count
    }
    
    var start = new Date().getTime();
    
    var totalLines = document.GetLines();
    s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
    sLines = s1.split( "\n" );
    s2 = "";
    nTotal = sLines.length;
    for( y = 0; y < nTotal; y++ ) {
        s2 += CountElements( sLines[y] ) + "\n";
    }
    x = s2.length;
    if( x > 0 ) s2 = s2.substr( 0, x - 1 );
    document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );
    
    var end = new Date().getTime();
    var time = end - start;
    alert( "Execution time: " + time + " ms" );
    

Test results:

  1. 10429 ms
  2. 8496 ms
  3. 1803 ms
  4. 1890 ms

1 million line, 52 MB CSV file.

If this is not fast enough, or "Out of Memory" error occurs, I will think about other methods or optimize even further, so please let me know.

Upvotes: 1

Related Questions