Reputation: 6133

Fastest way to find all primes under 4 billion

I'm trying to print every prime number under 2**32. Right now I'm using a bool vector to build a sieve and then print out the primes after making the sieve. It takes 4 minutes just to print out the primes upto 1 billion. Is there a faster way to do this?? Here is my code

#include <iostream>
#include <cstdlib>
#include <vector>
#include <math.h>

using namespace std;

int main(int argc, char **argv){
  long long limit = atoll(argv[1]);
  //cin >> limit;
  long long sqrtlimit = sqrt(limit);

  vector<bool> sieve(limit+1, false);

  for(long long n = 4; n <= limit; n += 2)
    sieve[n] = true;

  for(long long n=3; n <= sqrtlimit; n = n+2){
    if(!sieve[n]){
      for(long long m = n*n; m<=limit; m=m+(2*n))
        sieve[m] = true;
    }
  }

  long long last;
  for(long long i=limit; i >= 0; i--){
    if(sieve[i] == false){
      last = i;
      break;
    }
  }
  cout << last << endl;

  for(long long i=2;i<=limit;i++)
  {
    if(!sieve[i])
      if(i != last)
        cout<<i<<",";
      else
        cout<<i;
  }
  cout<<endl;

Upvotes: 0

Answers (8)

Jack

Reputation: 1

I wrote cuda code that runs on my 3090 to generate all primes under 1 billion in around 0.4 seconds `

#include <iostream>
#include <cmath>
#include <cuda.h>
#include <chrono>
#include <omp.h>

#define LIMIT 1000000000
#define BITS_PER_WORD 32
#define BIT_ARRAY_SIZE ((LIMIT / BITS_PER_WORD) + 1)

// Device function to toggle sieve bit
__device__ void toggle_bit(unsigned int* sieve, unsigned long long n)
{
    atomicXor(&sieve[n / BITS_PER_WORD], 1U << (n % BITS_PER_WORD));
}

// CUDA kernel for the seive
__global__ void atkin_kernel(unsigned int* sieve, unsigned int limit, unsigned int sqrt_limit)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x + 1; // x starts from 1
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y + 1; // y starts from 1

    if (x > sqrt_limit || y > sqrt_limit)
        return;

    unsigned long long xx = static_cast<unsigned long long>(x) * x; // Precompute x^2
    unsigned long long yy = static_cast<unsigned long long>(y) * y; // Precompute y^2

    unsigned long long n;

    // First quadratic form: n = 4x^2 + y^2
    n = 4 * xx + yy;
    if (n <= limit)
    {
        unsigned int r = n % 60;
        if (r == 1 || r == 13 || r == 17 || r == 29 || r == 37 || r == 41 || r == 49 || r == 53)
        {
            toggle_bit(sieve, n);
        }
    }

    // Second quadratic form: n = 3x^2 + y^2
    n = 3 * xx + yy;
    if (n <= limit)
    {
        unsigned int r = n % 60;
        if (r == 7 || r == 19 || r == 31 || r == 43)
        {
            toggle_bit(sieve, n);
        }
    }

    // Third quadratic form: n = 3x^2 - y^2 (x > y)
    if (x > y)
    {
        n = 3 * xx - yy;
        if (n <= limit)
        {
            unsigned int r = n % 60;
            if (r == 11 || r == 23 || r == 47 || r == 59)
            {
                toggle_bit(sieve, n);
            }
        }
    }
}

int main()
{
    unsigned int sqrt_limit = static_cast<unsigned int>(ceil(sqrt(static_cast<double>(LIMIT))));

    // Start total timing
    auto start_total = std::chrono::high_resolution_clock::now();

    // Allocate bit seive and set to 0
    unsigned int* d_sieve;
    cudaError_t err;

    err = cudaMalloc(reinterpret_cast<void**>(&d_sieve), BIT_ARRAY_SIZE * sizeof(unsigned int));
    if (err != cudaSuccess)
    {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    err = cudaMemset(d_sieve, 0, BIT_ARRAY_SIZE * sizeof(unsigned int));
    if (err != cudaSuccess)
    {
        std::cerr << "cudaMemset failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_sieve);
        return 1;
    }

    // Set up block and grid sizes
    dim3 blockSize(16, 16);
    dim3 gridSize((sqrt_limit + blockSize.x - 1) / blockSize.x,
        (sqrt_limit + blockSize.y - 1) / blockSize.y);

    // Start kernel timing
    cudaEvent_t start_kernel, stop_kernel;
    cudaEventCreate(&start_kernel);
    cudaEventCreate(&stop_kernel);

    cudaEventRecord(start_kernel);

    // Launch the seive kernel
    atkin_kernel << <gridSize, blockSize >> > (d_sieve, LIMIT, sqrt_limit);

    // Check for scary kernel launch errors
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_sieve);
        return 1;
    }

    cudaEventRecord(stop_kernel);
    cudaEventSynchronize(stop_kernel);

    // Get kernel time
    float kernel_time = 0;
    cudaEventElapsedTime(&kernel_time, start_kernel, stop_kernel);

    // Copy bit sieve back to host
    unsigned int* h_sieve = new unsigned int[BIT_ARRAY_SIZE];
    err = cudaMemcpy(h_sieve, d_sieve, BIT_ARRAY_SIZE * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        std::cerr << "cudaMemcpy failed: " << cudaGetErrorString(err) << std::endl;
        delete[] h_sieve;
        cudaFree(d_sieve);
        return 1;
    }

    // Include small primes
    if (LIMIT >= 2) h_sieve[2 / BITS_PER_WORD] |= (1U << (2 % BITS_PER_WORD));
    if (LIMIT >= 3) h_sieve[3 / BITS_PER_WORD] |= (1U << (3 % BITS_PER_WORD));
    if (LIMIT >= 5) h_sieve[5 / BITS_PER_WORD] |= (1U << (5 % BITS_PER_WORD));

    // Start cpu timing
    auto start_cpu = std::chrono::high_resolution_clock::now();

    // Get the number of threads available
    int num_threads = omp_get_max_threads();
    std::cout << "Using " << num_threads << " threads for CPU post processing." << std::endl;

    // Calculate segment size
    unsigned long long segment_size = (static_cast<unsigned long long>(LIMIT) / num_threads) + 1;

    // Take out multiples of squares of primes in parallel
#pragma omp parallel
    {
        int thread_id = omp_get_thread_num();
        unsigned long long segment_start = thread_id * segment_size;
        unsigned long long segment_end = std::min(segment_start + segment_size, static_cast<unsigned long long>(LIMIT) + 1);

        // Each thread processes its own segment
        for (unsigned int n = 5; n <= sqrt_limit; n++)
        {
            if (h_sieve[n / BITS_PER_WORD] & (1U << (n % BITS_PER_WORD)))
            {
                unsigned long long n_squared = static_cast<unsigned long long>(n) * n;

                // Find the first multiple of n_squared within the segment
                unsigned long long k = ((segment_start + n_squared - 1) / n_squared) * n_squared;

                for (; k < segment_end; k += n_squared)
                {
                    h_sieve[k / BITS_PER_WORD] &= ~(1U << (k % BITS_PER_WORD));
                }
            }
        }
    }

    // Stop cpu timing
    auto end_cpu = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> cpu_time = end_cpu - start_cpu;

    // Stop total timing
    auto end_total = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> total_time = end_total - start_total;

    // Print primes to test
    for (unsigned int n = 0; n <= 1000; n++)
    {
        if (h_sieve[n / BITS_PER_WORD] & (1U << (n % BITS_PER_WORD)))
            std::cout << n << ", ";
    }
    std::cout << std::endl;

    // Output timing information
    std::cout << "Kernel execution time (GPU): " << kernel_time / 1000.0 << " s" << std::endl;
    std::cout << "Post-processing time (CPU): " << cpu_time.count() << " s" << std::endl;
    std::cout << "Total execution time: " << total_time.count() << " s" << std::endl;

    // Free allocated memory
    delete[] h_sieve;
    cudaFree(d_sieve);

    // KILL CUDA events
    cudaEventDestroy(start_kernel);
    cudaEventDestroy(stop_kernel);

    return 0;
}

Upvotes: 0

Viktor Jonsson

Reputation: 11

I wrote code in Python that outputs all primes less than a billion in 8.7 seconds. But I was unsure if you ment the first 4 billion primes or all primes less than that.
Anyways, here is my solution:

import numpy as np
from math import isqrt

def all_primes_less_than(n):
    is_prime = np.full(n,True)
    is_prime[:2] = False
    is_prime[4::2] = False
    for p in range(3,isqrt(n),2):
        if is_prime[p]: is_prime[p*p::p] = False
    return is_prime.nonzero()[0]

Upvotes: 1

Edison von Myosotis

Reputation: 619

I've implemented the sieve of erastosthenes in C++20 using with the punching of the non-prime multiples done multithreaded:

#include <iostream>
#include <cstdlib>
#include <charconv>
#include <cstring>
#include <vector>
#include <algorithm>
#include <cmath>
#include <bit>
#include <fstream>
#include <string_view>
#include <thread>
#include <utility>
#include <new>
#include <span>
#include <array>
#include <cassert>
#include <sstream>
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
    #include <intrin.h>
#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
    #include <cpuid.h>
#endif

#if defined(_MSC_VER)
    #pragma warning(disable: 26495) // uninitialized member
#endif

using namespace std;

#if defined(__cpp_lib_hardware_interference_size)
constexpr ptrdiff_t CL_SIZE = hardware_destructive_interference_size;
#else
constexpr ptrdiff_t CL_SIZE = 64;
#endif

size_t getL2Size();
bool smt();

int main( int argc, char **argv )
{
    try
    {
        if( argc < 2 )
            return EXIT_FAILURE;
        auto parse = []( char const *str, auto &value )
        {
            bool hex = str[0] == '0' && tolower( str[1] ) == 'x';
            str += 2 * hex;
            auto [ptr, ec] = from_chars( str, str + strlen( str ), value, !hex ? 10 : 16 );
            return ec == errc() && !*ptr;
        };
        size_t end;
        if( !parse( argv[1], end ) )
            return EXIT_FAILURE;
        if( end < 2 )
            return EXIT_FAILURE;
        if( (ptrdiff_t)end++ < 0 )
            throw bad_alloc();
        unsigned
            hc = jthread::hardware_concurrency(),
            nThreads;
        if( argc < 4 || !parse( argv[3], nThreads ) )
            nThreads = hc;
        if( !nThreads )
            return EXIT_FAILURE;
        using word_t = size_t;
        constexpr size_t
            BITS_PER_CL = CL_SIZE * 8,
            BITS = sizeof(word_t) * 8;
        size_t nBits = end / 2;
        union alignas(CL_SIZE) ndi_words_cl { word_t words[CL_SIZE / sizeof(word_t)]; ndi_words_cl() {} };
        vector<ndi_words_cl> sieveCachelines( (nBits + BITS_PER_CL - 1) / BITS_PER_CL );
        span<word_t> sieve( &sieveCachelines[0].words[0], (nBits + BITS - 1) / BITS );
        assert(to_address( sieve.end() ) <= &to_address( sieveCachelines.end() )->words[0]);
        fill( sieve.begin(), sieve.end(), -1 );
        auto punch = [&]( size_t start, size_t end, size_t prime, auto )
        {
            size_t
                bit = start / 2,
                bitEnd = end / 2;
            if( bit >= bitEnd )
                return;
            if( prime >= BITS ) [[likely]]
                do [[likely]]
                    sieve[bit / BITS] &= rotl( (word_t)-2, bit % BITS );
                while( (bit += prime) < bitEnd );
            else
            {
                auto pSieve = sieve.begin() + bit / BITS;
                do [[likely]]
                {
                    word_t
                        word = *pSieve,
                        mask = rotl( (word_t)-2, bit % BITS ),
                        oldMask;
                    do
                        word &= mask,
                        oldMask = mask,
                        mask = rotl( mask, prime % BITS ),
                        bit += prime;
                    while( mask < oldMask );
                    *pSieve++ = word;
                } while( bit < bitEnd );
            }
        };
        size_t sqrt = (ptrdiff_t)ceil( ::sqrt( (ptrdiff_t)end ) );
        for( size_t bSqrt = sqrt / 2, bit = 1; bit < bSqrt; ++bit ) [[likely]]
        {
            auto pSieve = sieve.begin () + bit / BITS;
            do [[likely]]
                {
                    if( word_t w = *pSieve >> bit % BITS; w ) [[likely]]
                    {
                        bit += countr_zero ( w );
                        break;
                    }
                    ++pSieve;
                    bit = bit + BITS & -(ptrdiff_t)BITS;
                } while( bit < bSqrt );
            if( bit >= bSqrt ) [[unlikely]]
                break;
            size_t prime = 2 * bit + 1;
            punch ( prime *prime, sqrt, prime, false_type () );
        }
        auto scan = [&]( size_t start, size_t end, auto fn )
        {
            size_t
                bit = start / 2,
                bitEnd = end / 2;
            if( bit >= bitEnd )
                return;
            auto pSieve = sieve.begin() + bit / BITS;
            word_t word = *pSieve++ >> bit % BITS;
            for( ; ; )
            {
                size_t nextWordBit = bit + BITS & -(ptrdiff_t)BITS;
                for( unsigned gap; word; word >>= gap, word >>= 1, ++bit )
                {
                    gap = countr_zero( word );
                    bit += gap;
                    if( bit >= bitEnd ) [[unlikely]]
                        return;
                    fn( bit * 2 + 1, true_type() );
                }
                bit = nextWordBit;
                if( bit >= bitEnd ) [[unlikely]]
                    return;
                word = *pSieve++;
            }
        };
        vector<jthread> threads;
        threads.reserve( nThreads );
        static auto dbl = []( ptrdiff_t x ) { return (double)x; };
        auto initiate = [&]( size_t start, auto fn )
        {
            double threadRange = dbl( end - start ) / nThreads;
            ptrdiff_t t = nThreads;
            size_t trEnd = end;
            size_t prevStart, prevEnd;
            bool prevValid = false;
            do [[likely]]
            {
                size_t trStart = start + (ptrdiff_t)(--t * threadRange + 0.5);
                trStart &= -(2 * CL_SIZE * 8);
                trStart = trStart >= start ? trStart : start;
                if( trStart < trEnd )
                {
                    if( prevValid )
                        threads.emplace_back( fn, prevStart, prevEnd );
                    prevStart = trStart;
                    prevEnd = trEnd;
                    prevValid = true;
                }
                trEnd = trStart;
            } while( t );
            if( prevValid )
                fn( prevStart, prevEnd );
            threads.resize( 0 );
        };
        double maxCacheRange = dbl( getL2Size() * 8 * 2 ) / 3 * (smt() && nThreads > hc / 2 ? 1 : 2);
        initiate( sqrt,
            [&]( size_t rStart, size_t rEnd )
            {
                double thisThreadRange = dbl( rEnd - rStart );
                ptrdiff_t nCachePartitions = (ptrdiff_t)ceil( thisThreadRange / maxCacheRange );
                double cacheRange = thisThreadRange / dbl( nCachePartitions );
                for( size_t p = nCachePartitions, cacheEnd = rEnd, cacheStart; p--; cacheEnd = cacheStart ) [[likely]]
                {
                    cacheStart = rStart + (ptrdiff_t)((double)(ptrdiff_t)p * cacheRange + 0.5);
                    cacheStart &= -(2 * CL_SIZE * 8);
                    cacheStart = cacheStart >= sqrt ? cacheStart : sqrt;
                    scan( 3, sqrt,
                        [&]( size_t prime, auto )
                        {
                            size_t start = (cacheStart + prime - 1) / prime * prime;
                            start += start % 2 ? 0 : prime;
                            punch( start, cacheEnd, prime, true_type() );
                        } );
                }
            } );
        if( bool count = false; argc >= 3 && (!*argv[2] || (count = strcmp( argv[2], "*" ) == 0)) )
        {
            if( !count )
                return EXIT_SUCCESS;
            atomic<size_t> aNPrimes( 1 );
            initiate( 3,
                [&]( size_t rStart, size_t rEnd )
                {
                    size_t nPrimes = 0;
                    scan( rStart, rEnd, [&]( ... ) { ++nPrimes; } );
                    aNPrimes.fetch_add( nPrimes, memory_order::relaxed );
                } );
            cout << aNPrimes.load( memory_order::relaxed ) << " primes" << endl;
            return EXIT_SUCCESS;
        }
        ofstream ofs;
        ofs.exceptions( ofstream::failbit | ofstream::badbit );
        ofs.open( argc >= 3 ? argv[2] : "primes.txt", ofstream::binary | ofstream::trunc );
        constexpr size_t
            BUF_SIZE = 0x100000,
            AHEAD = 32;
        union ndi_char { char c; ndi_char() {} };
        vector<ndi_char> rawPrintBuf( BUF_SIZE + AHEAD - 1 );
        span printBuf( &to_address( rawPrintBuf.begin() )->c, &to_address( rawPrintBuf.end() )->c );
        auto
            bufBegin = printBuf.begin(),
            bufEnd = bufBegin,
            bufFlush = bufBegin + BUF_SIZE;
        auto print = [&]() { ofs << string_view( bufBegin, bufEnd ); };
        auto printPrime = [&]( size_t prime, auto )
        {
            auto [ptr, ec] = to_chars( &*bufEnd, &bufEnd[AHEAD - 1], prime );
            if( (bool)ec ) [[unlikely]]
                throw system_error( (int)ec, generic_category(), "converson failed" );
            bufEnd = ptr - &*bufBegin + bufBegin; // pointer to iterator - NOP
            *bufEnd++ = '\n';
            if( bufEnd >= bufFlush ) [[unlikely]]
                print(), bufEnd = bufBegin;
        };
        printPrime( 2, false_type() );
        scan( 3, end, printPrime );
        print();
    }
    catch( exception const &exc )
    {
        cout << exc.what() << endl;
    }
}

#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__)) || defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
array<unsigned, 4> cpuId( unsigned code )
{
    int regs[4];
#if defined(_MSC_VER)
    __cpuid( regs, code );
#elif defined(__GNUC__) || defined(__clang__)
    __cpuid(code, regs[0], regs[1], regs[2], regs[3]);
#endif
    using u = unsigned;
    return array<u, 4> { (u)regs[0], (u)regs[1], (u)regs[2], (u)regs[3] };
}

bool smt()
{
    if( cpuId( 0 )[0] < 1 )
        return false;
    return cpuId( 1 )[3] >> 28 & 1;
}

size_t getL2Size()
{
    if( cpuId( 0x80000000u )[0] < 0x80000006u )
        return 512ull * 1024;
    return (cpuId( 0x80000006u )[2] >> 16) * 1024;
}
#else
size_t getL2Size()
{
    return 512ull * 1024;
}

bool smt()
{
    return false;
}
#endif

Almost all of the CPU time is spent in the multi-threaded code punching the non-prime multiples. Within the punching threads the bit range is partitioned to fit into the L2-cache. If the number of threads is beyond the half number of logical cores on a SMT-system the thread sub-partitions are half the size. Ive just added storing only the odd primes, i.e no multiples of 2, thereby having about +87% more performance. On my Ryzen 9 7950X Zen4 16-core CPU producing all primes up to 2 ^ 32, that's almost 210E6 primes, takes about 0.15 seconds on all cores with suppressed file output; on one core the algorithm takes about 1.73 seconds. The program's third parameter is the number of threads. With 16 threads on my 16 core Zen4-machine the code I get a scaling of 70% over one thread. Occupying further SMT siblings brings almost no further speedup as the code depends on the throughput of the L2-cache. The file output is done with to_chars and my own buffering to speed up the I/O. Producing a 2.2GB file from the mentioned range takes about an additional two seconds on my computer (64GB, PCIe 4.0 SSD).

Upvotes: 2

Michael P.

Reputation: 1

I wrote a fast way in C to output primes up to 4 billion in under three minutes using a single thread on my Ryzen 9 3900x. If you output it to a file, it ends up being 2.298GB and I think it uses about 20GB of memory to complete.

#include <stdlib.h>
#include <stdio.h>

#define ARRSIZE 4000000000
#define MAXCALC ARRSIZE/2

int main() {
    long int z;
    long int *arr = (long int*) malloc((ARRSIZE) * sizeof(long int));
    for (long int x=3;x <= MAXCALC; x++) {
        if (x % 10 == 3 || x % 10 == 7 || x % 10 == 9) {
            for (long int y=3; y < MAXCALC; y++){
                    z = x * y;
                    if (z < ARRSIZE)
                        arr[z] = 1;
                    else
                        break;
            }
        }
    }
    printf("2 3 5 ");
    for (long int x=7; x < ARRSIZE; x++) {
        if (x % 2 != 0 && x % 10 != 5)
            if (arr[x] != 1)
                printf("%ld ", x);
    }
    printf("\n");

    free(arr);
    return EXIT_SUCCESS;
}

Upvotes: 0

user448810

Reputation: 17866

I discuss the problem of generating large numbers of primes at my blog, where I find the sum of the first billion primes is 11138479445180240497. I describe four different methods:

Brute force, testing each number starting from 2 using trial division.
Generate candidates using a 2,3,5,7-wheel, then test primality with strong pseudoprime tests to bases 2, 7, and 61; this method works only up to 2^32, which was insufficient for me to sum the first billion primes but will be sufficient for you.
An algorithm due to Melissa O'Neill that uses sieve embedded in a priority queue, which is quite slow.
A segmented sieve of Eratosthenes, which is very fast but requires space to store both the sieving primes and the sieve itself.

Upvotes: 5

Mark Ransom

Reputation: 308392

Have you benchmarked what is taking the most time? Is it the sieve itself, or the writing of the output?

One quick way to speed up the sieve is to stop worrying about all the even numbers. There's only one even number that's a prime and you can hard-code it. That cuts the size of your array in half which will help immensely if you're bumping up against the limits of physical memory.

vector<bool> sieve((limit+1)/2, false);
...
  for(long long m = n*n/2; m<=limit/2; m=m+n)
    sieve[m] = true;

As for the output itself, cout is notoriously inefficient. It might be more efficient to call itoa or some equivalent yourself, then use cout.write to output it. You might even go old school and use fwrite with stdout.

Upvotes: 0

Yakk - Adam Nevraumont

Reputation: 275730

The fastest way would probably be to take a pregenerated list.

http://www.bigprimes.net/ has the first 1.4 billion primes available for download, which should include every prime under 30 billion or so.

I suppose loading the binary might take too long when it is a few gigabytes in size.

Upvotes: 0

Zac Howland

Reputation: 15870

This will probably speed it up a bit:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
    std::vector<unsigned long long> numbers;
    unsigned long long maximum = 4294967296;
    for (unsigned long long i = 2; i <= maximum; ++i)
    {
        if (numbers.empty())
        {
            numbers.push_back(i);
            continue;
        }

        if (std::none_of(numbers.begin(), numbers.end(), [&](unsigned long long p)
        {
            return i % p == 0;
        }))
        {
            numbers.push_back(i);
        }

    }

    std::cout << "Primes:  " << std::endl;
    std::copy(numbers.begin(), numbers.end(), std::ostream_iterator<int>(std::cout, " "));

    return 0;
}

It is kind of the inverse of the Sieve of Eratosthenes (instead of starting with every number under the limit and eliminating multiples, it starts at 2 and ignores multiples up to the limit).

Upvotes: 0

Fastest way to find all primes under 4 billion

Answers (8)

Related Questions