Reputation: 465
I have limited p to 50,000 but would like to use this code for p > 1,000,000. To simplify the code the arrays are pre-defined, not created with malloc - which has already made it faster. I understand the problem is the calculations within the two loops, division and modulus really slow the code down. Removing zeros from the beginning of the final number and printing the final array do not seem to cause any problems.
#include <stdio.h>
unsigned int p, l, zero;
int main() {
int power2[50005] = { 0 };
int product[50005] = { 0 };
int carry[50005] = { 0 };
printf("Enter power: ");
scanf("%u", &p);
l = (p * 0.4) + 1;
power2[l-1] = 2;
for (int j=p-2; j>-1; --j) {
for (int i=l-1; i>-1; --i) {
power2[i] *= 2;
carry[i] = power2[i] / 10;
product[i] = power2[i] + carry[i+1];
carry[i] = product[i] / 10;
product[i] %= 10;
power2[i] = product[i];
}
}
/* remove 0s from beginning of final array - product[] */
zero = 0;
while (product[zero] == 0) {
++zero;
}
for (int i=zero; i<l; ++i) {
printf("%d", product[i]);
}
printf("\n");
return 0;
}
Upvotes: 0
Views: 176
Reputation: 144780
Your analysis is correct:
There are some issues in your code:
there is no reason to use global variables for p
, l
and zero
the arrays with an arbitrary length of 50005
elements might be too small for large powers of 2 and you do not check for overflow after computing l
. You should allocate the arrays as you did in the previous versions.
you should test the return value of scanf()
to avoid undefined behavior on invalid or missing input.
i
should be defined with the same type as l
to avoid comparing signed and unsigned values, whose C semantics can produce surprising results.
the initial value should be 1
to allow for 20
there is no need for a downward loop for the outer loop.
the downward loops should be written differently to accept unsigned index types:
for (unsigned int i = l; i-- > 0;)
the code is not indented properly, which makes it difficult to read.
Here is a modified version (with the timing feature):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void print_digits(int *p, unsigned int n) {
while (n --> 0) printf("%d", *p++);
}
int main(int argc, char *argv[]) {
int *power2, *product, *carry;
unsigned int p, l, zero, nd;
if (argc > 1) {
p = strtoul(argv[1], NULL, 0);
} else {
printf("Enter power: ");
if (scanf("%u", &p) != 1) {
printf("invalid or missing input\n");
return 1;
}
}
clock_t start = clock();
l = (unsigned int)(p * 0.4) + 1;
if ((power2 = calloc(l, sizeof(*power2))) == NULL
|| (product = calloc(l, sizeof(*product))) == NULL
|| (carry = calloc(l, sizeof(*carry))) == NULL) {
printf("allocation failure for %u digits\n", l);
return 1;
}
power2[l - 1] = 1;
for (unsigned int j = 0; j < p; j++) {
for (unsigned int i = l; i-- > 0;) {
power2[i] *= 2;
carry[i] = power2[i] / 10;
product[i] = power2[i] + carry[i+1];
carry[i] = product[i] / 10;
product[i] %= 10;
power2[i] = product[i];
}
}
/* remove 0s from beginning of final array - product[] */
for (zero = 0; power2[zero] == 0; zero++)
continue;
nd = l - zero;
printf("pow(2, %u) -> [%u digits] ", p, nd);
if (nd < 50) {
print_digits(power2 + zero, nd);
} else {
print_digits(power2 + zero, 25);
printf("...");
print_digits(power2 + zero + nd - 25, 25);
}
double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
printf("\ntime: %f sec\n", elapsed);
free(power2);
free(product);
free(carry);
return 0;
}
Here is the output for 50k and 100k:
pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
time: 3.781174 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 15.145408 sec
Given the quadratic time complexity, computing 21000000 should take at least 100 times longer than 2100000, ignoring cache effects, about half an hour.
Note these further remarks:
power2
array to have type int
, using type unsigned char
suffices to store single digits.carry
to be an array, you can keep the running value in a simple int
variableproduct
array, just store the updated digit value into the power2
array as you go.Here is a modified version:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void print_digits(unsigned char *p, unsigned int n) {
while (n --> 0) putchar('0' + *p++);
}
int main(int argc, char *argv[]) {
unsigned char *power2;
unsigned int p, l, zero, nd;
if (argc > 1) {
p = strtoul(argv[1], NULL, 0);
} else {
printf("Enter power: ");
if (scanf("%u", &p) != 1) {
printf("invalid or missing input\n");
return 1;
}
}
clock_t start = clock();
l = (unsigned int)(p * 0.4) + 1;
if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
printf("allocation failure for %u digits\n", l);
return 1;
}
power2[l - 1] = 1;
for (unsigned int j = 0; j < p; j++) {
int carry = 0;
for (unsigned int i = l; i-- > 0;) {
carry += power2[i] * 2;
power2[i] = carry % 10;
carry = carry / 10;
}
}
/* remove 0s from beginning of final array - product[] */
for (zero = 0; power2[zero] == 0; zero++)
continue;
nd = l - zero;
printf("pow(2, %u) -> [%u digits] ", p, nd);
if (nd < 50) {
print_digits(power2 + zero, nd);
} else {
print_digits(power2 + zero, 25);
printf("...");
print_digits(power2 + zero + nd - 25, 25);
}
double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
printf("\ntime: %f sec\n", elapsed);
free(power2);
return 0;
}
This simple change makes the code 30% faster:
pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
time: 2.389122 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 9.541588 sec
Note also that you can keep track of the first non zero digit as you go instead of multiplying the whole array at each iteration. This actually simplifies the code and shaves another 60% in the timings:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void print_digits(unsigned char *p, unsigned int n) {
while (n --> 0) putchar('0' + *p++);
}
int main(int argc, char *argv[]) {
unsigned char *power2;
unsigned int p, l, zero, nd;
if (argc > 1) {
p = strtoul(argv[1], NULL, 0);
} else {
printf("Enter power: ");
if (scanf("%u", &p) != 1) {
printf("invalid or missing input\n");
return 1;
}
}
clock_t start = clock();
l = (unsigned int)(p * 0.4) + 1;
if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
printf("allocation failure for %u digits\n", l);
return 1;
}
power2[l - 1] = 1;
zero = l - 1;
for (unsigned int j = 0; j < p; j++) {
int carry = 0;
for (unsigned int i = l; i-- > zero;) {
carry += power2[i] * 2;
power2[i] = carry % 10;
carry = carry / 10;
}
if (carry) {
power2[--zero] = carry % 10;
}
}
nd = l - zero;
printf("pow(2, %u) -> [%u digits] ", p, nd);
if (nd < 50) {
print_digits(power2 + zero, nd);
} else {
print_digits(power2 + zero, 25);
printf("...");
print_digits(power2 + zero + nd - 25, 25);
}
double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
printf("\ntime: %f sec\n", elapsed);
free(power2);
return 0;
}
Output:
pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
time: 0.893097 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 3.591081 sec
To further improve the code efficiency, you can multiply by 228 instead of just 2 except for the last iteration:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void print_digits(unsigned char *p, unsigned int n) {
while (n --> 0) putchar('0' + *p++);
}
int main(int argc, char *argv[]) {
unsigned char *power2;
unsigned int p, l, zero, nd;
if (argc > 1) {
p = strtoul(argv[1], NULL, 0);
} else {
printf("Enter power: ");
if (scanf("%u", &p) != 1) {
printf("invalid or missing input\n");
return 1;
}
}
clock_t start = clock();
l = (unsigned int)(p * 0.4) + 1;
if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
printf("allocation failure for %u digits\n", l);
return 1;
}
power2[l - 1] = 1;
zero = l - 1;
for (unsigned int j = 0; j < p;) {
unsigned int carry = 0;
unsigned int mul = 1;
/* select the maximum multiplier */
while (j < p && mul < UINT_MAX / 20) {
mul *= 2;
j++;
}
for (unsigned int i = l; i-- > zero;) {
carry += power2[i] * mul;
power2[i] = carry % 10;
carry = carry / 10;
}
while (carry) {
power2[--zero] = carry % 10;
carry = carry / 10;
}
}
nd = l - zero;
printf("pow(2, %u) -> [%u digits] ", p, nd);
if (nd < 50) {
print_digits(power2 + zero, nd);
} else {
print_digits(power2 + zero, 25);
printf("...");
print_digits(power2 + zero + nd - 25, 25);
}
double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
printf("\ntime: %f sec\n", elapsed);
free(power2);
return 0;
}
Output:
pow(2, 50000) -> [15052 digits] 3160699436856317896135924...2289456131085235835109376
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...7025155304734389883109376
time: 0.093706 sec
pow(2, 1000000) -> [301030 digits] 9900656229295898250697923...1236104888403162747109376
time: 9.163389 sec
The timings have been divided by 160 so far, and computing 21000000 takes only 9,16 seconds.
You can further improve this by using larger digits: instead of storing a single digit in base 10 in the array elements, you can use base 1000000000 and store 9 decimal digits per int32_t
. The final conversion is more tricky as the first digit is handled differently from the next ones, which must be converted to 9 digits with leading zeroes:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void print_digits(unsigned char *p, unsigned int n) {
while (n --> 0) putchar('0' + *p++);
}
int main(int argc, char *argv[]) {
uint32_t *power2;
unsigned int p, l, zero, nd;
if (argc > 1) {
p = strtoul(argv[1], NULL, 0);
} else {
printf("Enter power: ");
if (scanf("%u", &p) != 1) {
printf("invalid or missing input\n");
return 1;
}
}
clock_t start = clock();
#define UNIT 1000000000
#define UNIT_LEN 9
l = (unsigned int)(p * 0.4 / UNIT_LEN) + 1;
if ((power2 = calloc(l, sizeof(*power2))) == NULL) {
printf("allocation failure for %u digits\n", l);
return 1;
}
power2[l - 1] = 1;
zero = l - 1;
for (unsigned int j = 0; j < p;) {
uint64_t carry = 0;
unsigned int shift = (p - j) < 34 ? (p - j) : 34;
j += shift;
for (unsigned int i = l; i-- > zero;) {
carry += (uint64_t)power2[i] << shift;
power2[i] = carry % UNIT;
carry = carry / UNIT;
}
while (carry) {
power2[--zero] = carry % UNIT;
carry = carry / UNIT;
}
}
nd = snprintf(NULL, 0, "%"PRId32"", power2[zero]) + (l - 1 - zero) * UNIT_LEN;
printf("pow(2, %u) -> [%u digits] ", p, nd);
if (nd < UNIT_LEN * 6) {
printf("%"PRId32"", power2[zero]);
for (unsigned int i = zero + 1; i < l; i++)
printf("%09"PRId32"", power2[i]);
} else {
printf("%"PRId32"", power2[zero]);
for (unsigned int i = zero + 1; i < zero + 3; i++)
printf("%09"PRId32"", power2[i]);
printf("...");
for (unsigned int i = l - 3; i < l; i++)
printf("%09"PRId32"", power2[i]);
}
double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
printf("\ntime: %f sec\n", elapsed);
free(power2);
return 0;
}
The output shows another 9x speed up:
pow(2, 50000) -> [15052 digits] 3160699436856317896135...102289456131085235835109376
time: 0.002685 sec
pow(2, 100000) -> [30103 digits] 9990020930143845079440327...597025155304734389883109376
time: 0.010489 sec
pow(2, 1000000) -> [301030 digits] 9900656229295898250697923...871236104888403162747109376
time: 1.034588 sec
Muti-precision packages use base 2 for internal computations and only convert to base 10 for final output, potentially using more efficient techniques. A power of 2 is trivial to compute in this representation, all the time is spent converting to base 10 for output.
Python 3.8 running the script below produces the same output for 21000000 in 1.24 seconds, slower than the above code.
def pow2(p):
s = "%d" % (1 << p)
print("pow(2, %d) -> [%d digits] %s...%s" % (p, len(s), s[:25], s[-25:]))
pow2(1000000)
The QuickJS Javascript interpreter is much faster for the same task: 0.091 seconds.
function pow2(p) {
var s = String(1n << BigInt(p));
var len = s.length;
std.printf("pow(2, %d) -> [%d digits] %s...%s\n",
p, s.length, s.substr(0, 25), s.substr(len - 25, 25));
}
pow2(1000000);
Output:
chqrlie> time qjs --std 220923-pow2.js
pow(2, 1000000) -> [301030 digits] 9900656229295898250697923...1236104888403162747109376
real 0m0.099s
user 0m0.091s
sys 0m0.004s
Upvotes: 3