zetatr

Reputation: 179

CUDA call won't allocate more than 8 threads per block, regardless of specification

I am creating a parallel version of the Sieve of Eratosthenes in C++. The problem is that my kernel call (reduce0) seems to only ever assign 8 threads per block instead of the 256 I specify. Since even the first CUDA version allows 512 threads per block, there must be some error in my code. Any help would be appreciated.

#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
//#include <sieve_kernel.cu>
using namespace std;

////////////////////////////////////////////////////
int psum(int arg[], double n);
int call_kernel(int primes[], int n);
int findsmallest(int arg[], int f, double n);
int sieve(int n);
__global__ void reduce0(int *g_idata, int *g_odata);

////////////////////////////////////////////////////
int main(){
    int n = pow((double) 2, 8);
    int total = sieve(n);
    cout << "# primes" << endl << total << endl;
    return 0;
}
///////////////////////////////////////////////////

__global__ void reduce0(int *g_idata, int *g_odata) {
    extern __shared__ int sdata[];

    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2) { // step = s x 2
        if (tid % (s*2) == 0) { // only threadIDs divisible by the step participate
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

/////////////////////////////////////////////////////

int call_kernel(int *primes, int n){
    // Allocate and copy device arrays
    int *g_idevice;
    int *g_odevice;
    int size = n * sizeof(int);
    cudaMalloc(&g_idevice, size);
    cudaMemcpy(g_idevice, primes, size, cudaMemcpyHostToDevice);
    cudaMalloc(&g_odevice, size);

    // Specify grid/block dimensions and invoke the kernel
    dim3 dimGrid(1,1);
    dim3 dimBlock(256,1);
    reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);

    // Copy device data back to primes
    cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);

    //for (int i = 0; i < n; i++) {
    //  cout << i << "  " << primes[i] << endl;
    //}
    int total = primes[0];
    cudaFree(g_idevice);
    cudaFree(g_odevice);
    return total;
}
/////////////////////////////////////////////////////////////////////
int findsmallest(int arg[], int f, double n){
    int i = f;
    while(arg[i]!= 1 && i < n) {
        i++;
    }
    return i;
}
//////////////////////////////////////////////////////////////////////
int psum(int arg[], double n){
    int total = 0;
    int i = 2;
    while(i < n){
        if(arg[i] == 1){
            total = total + 1;
        }
        i++;
    }
    return total;
}
/////////////////////////////////////////////////////////////////////////
int sieve(int n){
    int* primes = NULL;
    int mult = 0;
    int k = 2;
    int i; int total;
    //primes = new int[n];
    primes = new int[256];
    for(i = 0; i < n; i++){
        primes[i] = 1;
    }
    primes[0] = primes[1] = 0;

    while (k * k < n){
        mult = k * k;
        while (mult < n) {
            primes[mult] = 0;
            mult =  mult + k;
        }
        k = findsmallest(primes,k+1, n);
    }
    total = call_kernel(primes, n);
    //delete [] primes;
    //primes = NULL;
    return total;
}

Upvotes: 0

Views: 449

Answers (1)

talonmies

Reputation: 72350

Your kernel uses dynamically allocated shared memory, but the kernel launch does not include any allocation for it, so the kernel aborts because of illegal memory operations on that shared memory buffer. You should find it works if you modify this part of call_kernel as follows:

// Specify grid/block dimensions and invoke the kernel
dim3 dimGrid(1,1);
dim3 dimBlock(256,1);
size_t shmsize = size_t(dimBlock.x * dimBlock.y * dimBlock.z) * sizeof(int);
reduce0<<<dimGrid, dimBlock, shmsize>>>(g_idevice, g_odevice);
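As an aside, because the block size here is fixed at 256 at compile time, a statically sized shared memory array would also work and avoids the third launch argument entirely. A minimal sketch (the kernel name reduce0_static is just illustrative, not from the original code):

// Alternative sketch: statically sized shared memory.
// Assumes the block size is known at compile time (256 threads here).
#define BLOCK_SIZE 256

__global__ void reduce0_static(int *g_idata, int *g_odata) {
    __shared__ int sdata[BLOCK_SIZE];  // sized at compile time, one array per block

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();

    for (int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (s * 2) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// Launched without a dynamic shared memory size:
// reduce0_static<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);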

If you had included some basic error checking around the function call, perhaps like this:

reduce0<<<dimGrid, dimBlock>>>(g_idevice, g_odevice);
if (cudaPeekAtLastError() != cudaSuccess) {
    cout << "kernel launch error: " << cudaGetErrorString(cudaGetLastError()) << endl;
}

// Copy device data back to primes
cudaError_t err = cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
    cout << "CUDA error: " << cudaGetErrorString(err) << endl;
}

it would have been immediately obvious that the kernel launch or execution was failing with an error.
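A common convenience is to wrap these checks in a macro so every CUDA runtime call can be verified in one line. This is a minimal sketch rather than anything from the code above; the macro name CUDA_CHECK is just illustrative:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Abort with file/line information if a CUDA runtime call fails
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMemcpy(primes, g_odevice, size, cudaMemcpyDeviceToHost));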

Upvotes: 2
