Ian Stewart
Ian Stewart

Reputation: 465

How to optimise this programme that calculates 2^p

I have limited p to 50,000 but would like to use this code for p > 1,000,000. To simplify the code the arrays are pre-defined, not created with malloc - which has already made it faster. I understand the problem is the calculations within the two loops, division and modulus really slow the code down. Removing zeros from the beginning of the final number and printing the final array do not seem to cause any problems.

#include <stdio.h>

unsigned int p, l, zero;

int main() {

int power2[50005] = { 0 };
int product[50005] = { 0 };
int carry[50005] = { 0 };
            
                printf("Enter power: ");
                scanf("%u", &p);
                l = (p * 0.4) + 1;
                
                power2[l-1] = 2;

            for (int j=p-2; j>-1; --j) {
            for (int i=l-1; i>-1; --i) {
                
                power2[i] *= 2;
                carry[i] = power2[i] / 10;
                
                product[i] = power2[i] + carry[i+1];
                
                carry[i] = product[i] / 10;
                product[i] %= 10;
                
                power2[i] = product[i];
                    
            }
            
        }
                /* remove 0s from beginning of final array - product[] */       
                zero = 0;
                while (product[zero] == 0) {
                    ++zero;
                } 
                for (int i=zero; i<l; ++i) {
                    printf("%d", product[i]);
                }

printf("\n");
return 0;
}

Upvotes: 0

Views: 176

Answers (1)

chqrlie
chqrlie

Reputation: 144780

Your analysis is correct:

  • the code has quadratic time complexity due to the nested loops. Not much can be done about that without changing the bignum representation. Using base 2 obviously simplifies the computation, but converting the resulting number to base 10 for mere humans to read has the same complexity.
  • regarding the division and modulo: these operations are not so expensive on modern hardware with current compilers: they can be converted to multiplications and shifts because the dividend is a compile time constant.
  • removing the leading zeroes is indeed cheap compared to the computation itself, but this phase could be optimized as we shall see.

There are some issues in your code:

  • there is no reason to use global variables for p, l and zero

  • the arrays with an arbitrary length of 50005 elements might be too small for large powers of 2 and you do not check for overflow after computing l. You should allocate the arrays as you did in the previous versions.

  • you should test the return value of scanf() to avoid undefined behavior on invalid or missing input.

  • i should be defined with the same type as l to avoid comparing signed and unsigned values, whose C semantics can produce surprising results.

  • the initial value should be 1 to allow for 20

  • there is no need for a downward loop for the outer loop.

  • the downward loops should be written differently to accept unsigned index types:

      for (unsigned int i = l; i-- > 0;)
    
  • the code is not indented properly, which makes it difficult to read.

Here is a modified version (with the timing feature):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void print_digits(int *p, unsigned int n) {
    while (n --> 0) printf("%d", *p++);
}

int main(int argc, char *argv[]) {
    int *power2, *product, *carry;
    unsigned int p, l, zero, nd;

    if (argc > 1) {
        p = strtoul(argv[1], NULL, 0);
    } else {
        printf("Enter power: ");
        if (scanf("%u", &p) != 1) {
            printf("invalid or missing input\n");
            return 1;
        }
    }
    clock_t start = clock();
    l = (unsigned int)(p * 0.4) + 1;

    if ((power2 = calloc(l, sizeof(*power2))) == NULL
    ||  (product = calloc(l, sizeof(*product))) == NULL
    ||  (carry = calloc(l, sizeof(*carry))) == NULL) {
        printf("allocation failure for %u digits\n", l);
        return 1;
    }
    power2[l - 1] = 1;

    for (unsigned int j = 0; j < p; j++) {
        for (unsigned int i = l; i-- > 0;) {
            power2[i] *= 2;
            carry[i] = power2[i] / 10;
            product[i] = power2[i] + carry[i+1];
            carry[i] = product[i] / 10;
            product[i] %= 10;
            power2[i] = product[i];
        }
    }
    /* remove 0s from beginning of final array - product[] */
    for (zero = 0; power2[zero] == 0; zero++)
        continue;
    nd = l - zero;
    printf("pow(2, %u) -> [%u digits] ", p, nd);
    if (nd < 50) {
        print_digits(power2 + zero, nd);
    } else {
        print_digits(power2 + zero, 25);
        printf("...");
        print_digits(power2 + zero + nd - 25, 25);
    }
    double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
    printf("\ntime: %f sec\n", elapsed);
    free(power2);
    free(product);
    free(carry);
    return 0;
}

Here is the output for 50k and 100k:

pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
time: 3.781174 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 15.145408 sec

Given the quadratic time complexity, computing 21000000 should take at least 100 times longer than 2100000, ignoring cache effects, about half an hour.

Note these further remarks:

  • there is no need for the power2 array to have type int, using type unsigned char suffices to store single digits.
  • there is no need for carry to be an array, you can keep the running value in a simple int variable
  • there is no need to use a separate product array, just store the updated digit value into the power2 array as you go.

Here is a modified version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void print_digits(unsigned char *p, unsigned int n) {
    while (n --> 0) putchar('0' + *p++);
}

int main(int argc, char *argv[]) {
    unsigned char *power2;
    unsigned int p, l, zero, nd;

    if (argc > 1) {
        p = strtoul(argv[1], NULL, 0);
    } else {
        printf("Enter power: ");
        if (scanf("%u", &p) != 1) {
            printf("invalid or missing input\n");
            return 1;
        }
    }
    clock_t start = clock();
    l = (unsigned int)(p * 0.4) + 1;

    if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
        printf("allocation failure for %u digits\n", l);
        return 1;
    }
    power2[l - 1] = 1;

    for (unsigned int j = 0; j < p; j++) {
        int carry = 0;
        for (unsigned int i = l; i-- > 0;) {
            carry += power2[i] * 2;
            power2[i] = carry % 10;
            carry = carry / 10;
        }
    }
    /* remove 0s from beginning of final array - product[] */
    for (zero = 0; power2[zero] == 0; zero++)
        continue;
    nd = l - zero;
    printf("pow(2, %u) -> [%u digits] ", p, nd);
    if (nd < 50) {
        print_digits(power2 + zero, nd);
    } else {
        print_digits(power2 + zero, 25);
        printf("...");
        print_digits(power2 + zero + nd - 25, 25);
    }
    double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
    printf("\ntime: %f sec\n", elapsed);
    free(power2);
    return 0;
}

This simple change makes the code 30% faster:

pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
time: 2.389122 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 9.541588 sec

Note also that you can keep track of the first non zero digit as you go instead of multiplying the whole array at each iteration. This actually simplifies the code and shaves another 60% in the timings:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void print_digits(unsigned char *p, unsigned int n) {
    while (n --> 0) putchar('0' + *p++);
}

int main(int argc, char *argv[]) {
    unsigned char *power2;
    unsigned int p, l, zero, nd;

    if (argc > 1) {
        p = strtoul(argv[1], NULL, 0);
    } else {
        printf("Enter power: ");
        if (scanf("%u", &p) != 1) {
            printf("invalid or missing input\n");
            return 1;
        }
    }
    clock_t start = clock();
    l = (unsigned int)(p * 0.4) + 1;

    if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
        printf("allocation failure for %u digits\n", l);
        return 1;
    }
    power2[l - 1] = 1;
    zero = l - 1;

    for (unsigned int j = 0; j < p; j++) {
        int carry = 0;
        for (unsigned int i = l; i-- > zero;) {
            carry += power2[i] * 2;
            power2[i] = carry % 10;
            carry = carry / 10;
        }
        if (carry) {
            power2[--zero] = carry % 10;
        }
    }
    nd = l - zero;
    printf("pow(2, %u) -> [%u digits] ", p, nd);
    if (nd < 50) {
        print_digits(power2 + zero, nd);
    } else {
        print_digits(power2 + zero, 25);
        printf("...");
        print_digits(power2 + zero + nd - 25, 25);
    }
    double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
    printf("\ntime: %f sec\n", elapsed);
    free(power2);
    return 0;
}

Output:

pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
time: 0.893097 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 3.591081 sec

To further improve the code efficiency, you can multiply by 228 instead of just 2 except for the last iteration:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void print_digits(unsigned char *p, unsigned int n) {
    while (n --> 0) putchar('0' + *p++);
}

int main(int argc, char *argv[]) {
    unsigned char *power2;
    unsigned int p, l, zero, nd;

    if (argc > 1) {
        p = strtoul(argv[1], NULL, 0);
    } else {
        printf("Enter power: ");
        if (scanf("%u", &p) != 1) {
            printf("invalid or missing input\n");
            return 1;
        }
    }
    clock_t start = clock();
    l = (unsigned int)(p * 0.4) + 1;

    if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
        printf("allocation failure for %u digits\n", l);
        return 1;
    }
    power2[l - 1] = 1;
    zero = l - 1;

    for (unsigned int j = 0; j < p;) {
        unsigned int carry = 0;
        unsigned int mul = 1;
        /* select the maximum multiplier */
        while (j < p && mul < UINT_MAX / 20) {
            mul *= 2;
            j++;
        }
        for (unsigned int i = l; i-- > zero;) {
            carry += power2[i] * mul;
            power2[i] = carry % 10;
            carry = carry / 10;
        }
        while (carry) {
            power2[--zero] = carry % 10;
            carry = carry / 10;
        }
    }
    nd = l - zero;
    printf("pow(2, %u) -> [%u digits] ", p, nd);
    if (nd < 50) {
        print_digits(power2 + zero, nd);
    } else {
        print_digits(power2 + zero, 25);
        printf("...");
        print_digits(power2 + zero + nd - 25, 25);
    }
    double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
    printf("\ntime: %f sec\n", elapsed);
    free(power2);
    return 0;
}

Output:

pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 0.093706 sec
pow(2, 1000000) -> [301030 digits] 9900656229295898250697923...1236104888403162747109376
time: 9.163389 sec

The timings have been divided by 160 so far, and computing 21000000 takes only 9,16 seconds.

You can further improve this by using larger digits: instead of storing a single digit in base 10 in the array elements, you can use base 1000000000 and store 9 decimal digits per int32_t. The final conversion is more tricky as the first digit is handled differently from the next ones, which must be converted to 9 digits with leading zeroes:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void print_digits(unsigned char *p, unsigned int n) {
    while (n --> 0) putchar('0' + *p++);
}

int main(int argc, char *argv[]) {
    uint32_t *power2;
    unsigned int p, l, zero, nd;

    if (argc > 1) {
        p = strtoul(argv[1], NULL, 0);
    } else {
        printf("Enter power: ");
        if (scanf("%u", &p) != 1) {
            printf("invalid or missing input\n");
            return 1;
        }
    }
    clock_t start = clock();

#define UNIT 1000000000
#define UNIT_LEN 9
    l = (unsigned int)(p * 0.4 / UNIT_LEN) + 1;

    if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
        printf("allocation failure for %u digits\n", l);
        return 1;
    }
    power2[l - 1] = 1;
    zero = l - 1;

    for (unsigned int j = 0; j < p;) {
        uint64_t carry = 0;
        unsigned int shift = (p - j) < 34 ? (p - j) : 34;
        j += shift;
        for (unsigned int i = l; i-- > zero;) {
            carry += (uint64_t)power2[i] << shift;
            power2[i] = carry % UNIT;
            carry = carry / UNIT;
        }
        while (carry) {
            power2[--zero] = carry % UNIT;
            carry = carry / UNIT;
        }
    }
    nd = snprintf(NULL, 0, "%"PRId32"", power2[zero]) + (l - 1 - zero) * UNIT_LEN;
    printf("pow(2, %u) -> [%u digits] ", p, nd);
    if (nd < UNIT_LEN * 6) {
        printf("%"PRId32"", power2[zero]);
        for (unsigned int i = zero + 1; i < l; i++)
            printf("%09"PRId32"", power2[i]);
    } else {
        printf("%"PRId32"", power2[zero]);
        for (unsigned int i = zero + 1; i < zero + 3; i++)
            printf("%09"PRId32"", power2[i]);
        printf("...");
        for (unsigned int i = l - 3; i < l; i++)
            printf("%09"PRId32"", power2[i]);
    }
    double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
    printf("\ntime: %f sec\n", elapsed);
    free(power2);
    return 0;
}

The output shows another 9x speed up:

pow(2, 50000) -> [15052 digits] 3160699436856317896135...102289456131085235835109376
time: 0.002685 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...597025155304734389883109376
time: 0.010489 sec
pow(2, 1000000) -> [301030 digits] 9900656229295898250697923...871236104888403162747109376
time: 1.034588 sec

Muti-precision packages use base 2 for internal computations and only convert to base 10 for final output, potentially using more efficient techniques. A power of 2 is trivial to compute in this representation, all the time is spent converting to base 10 for output.

Python 3.8 running the script below produces the same output for 21000000 in 1.24 seconds, slower than the above code.

def pow2(p):
   s = "%d" % (1 << p)
   print("pow(2, %d) -> [%d digits] %s...%s" % (p, len(s), s[:25], s[-25:]))

pow2(1000000)

The QuickJS Javascript interpreter is much faster for the same task: 0.091 seconds.

function pow2(p) {
    var s = String(1n << BigInt(p));
    var len = s.length;
    std.printf("pow(2, %d) -> [%d digits] %s...%s\n",
               p, s.length, s.substr(0, 25), s.substr(len - 25, 25));
}
pow2(1000000);

Output:

chqrlie> time qjs --std 220923-pow2.js
pow(2, 1000000) -> [301030 digits] 9900656229295898250697923...1236104888403162747109376

real    0m0.099s
user    0m0.091s
sys     0m0.004s

Upvotes: 3

Related Questions