Successive blocks reading memory from initial blocks

Question

So this is a part of my program, which I make a reduction sum for two classes. I indexed the classes by the half of a shared array __shared__ int nrules[max_threads * MAX_CLASSES];, so first class starts at nrules[0] and second at nrules[blockDim.x or max_threads]. A reduction is made for both halves. The sums are kept in global array passed as parameter, and this array will keep the sum for each block, hence being indexed by blockIdx.x.

I have the size of a test case which is represented by MAX_SIZE, and all tests are first processed from 1 to MAX_SIZE, and the sums are accumulated at the global array for each block.

I wanted to call a kernel with number of blocks equal to my number of tests ( 10000 ), but there were some problems with sums so I changed to do by steps.

I can't find a solution to this, but whenever I call a kernel with more than max_threads number of blocks, it starts to sum things from the initial blocks. If you execute the code you will see that it will print the value for each block which is 64 in this case with 64 threads per block. If I execute at least 1 more block, it's sum will be 128 instead.This for the first class sum. It is as if the offset variable is doing nothing and the writing occurs at the first blocks again. And with MAX_SIZE = 3, the first block has its second class sum altered to 192. The Cuda capability here is 2.0, a GT 520 card. Compiled with CUDA 6.5.

#include 
#include 
#include 

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d
", cudaGetErrorString(code), file, line);

    }
}

#define MAX_CLASSES 2
#define max_threads 64
//#define MAX_FEATURES 65

__device__ __constant__ int d_MAX_SIZE;
__device__  __constant__ int offset;

__device__ void rules_points_reduction(float points[max_threads * MAX_CLASSES], int nrules[max_threads * MAX_CLASSES]){

    float psum[MAX_CLASSES];
    int nsum[MAX_CLASSES];

    for (int i = 0; i < MAX_CLASSES; i++){
        psum[i] = points[threadIdx.x + i * blockDim.x];
        nsum[i] = nrules[threadIdx.x + i * blockDim.x];
    }

    __syncthreads();

    if (blockDim.x >= 1024) {
        if (threadIdx.x < 512) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 512 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 512 + i * blockDim.x];
            }

        } __syncthreads();
    }
    if (blockDim.x >= 512) {
        if (threadIdx.x < 256) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 256 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 256 + i * blockDim.x];
            }
        } __syncthreads();
    }
    if (blockDim.x >= 256) {
        if (threadIdx.x < 128) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 128 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 128 + i * blockDim.x];
            }
        } __syncthreads();
    }
    if (blockDim.x >= 128) {
        if (threadIdx.x <  64) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 64 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 64 + i * blockDim.x];
            }
        } __syncthreads();
    }

    if (threadIdx.x < 32)
    {
        // now that we are using warp-synchronous programming (below)
        // we need to declare our shared memory volatile so that the compiler
        // doesn't reorder stores to it and induce incorrect behavior.
        //volatile int* smem = nrules;
        //volatile float* smemf = points;
        if (blockDim.x >= 64) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 32 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 32 + i * blockDim.x];
            }
        }
        if (blockDim.x >= 32) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 16 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 16 + i * blockDim.x];
            }
        }
        if (blockDim.x >= 16) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 8 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 8 + i * blockDim.x];
            }
        }
        if (blockDim.x >= 8) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 4 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 4 + i * blockDim.x];
            }
        }
        if (blockDim.x >= 4) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 2 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 2 + i * blockDim.x];
            }
        }
        if (blockDim.x >= 2) {
            for (int i = 0; i < MAX_CLASSES; i++){
                points[threadIdx.x + i * blockDim.x] = psum[i] = psum[i] + points[threadIdx.x + 1 + i * blockDim.x];
                nrules[threadIdx.x + i * blockDim.x] = nsum[i] = nsum[i] + nrules[threadIdx.x + 1 + i * blockDim.x];
            }
        }
    }

}

__device__ void d_get_THE_prediction(short k, float* finalpoints, int* gn_rules)
{   
    int max;
    short true_label, n_items;

    __shared__ float points[max_threads * MAX_CLASSES]; 
    __shared__ int nrules[max_threads * MAX_CLASSES];
    //__shared__ short  items[MAX_FEATURES], ele[MAX_FEATURES];
    __shared__ int max2;

    for (int i = 0; i < MAX_CLASSES; i++)
    {
        points[threadIdx.x + i * blockDim.x] = 1;
        nrules[threadIdx.x + i * blockDim.x] = 1;
    }

    if (threadIdx.x == 0) {
        if (k == 1){
            nrules[0] = 1;
            nrules[blockDim.x] = 1;
        }
        //max2 = GetBinCoeff_l_d(n_items, k);
    }
    __syncthreads();

    //max = max2;

    //d_induce_rules(items, ele, n_items, k, max, nrules, points);

    __syncthreads();

    rules_points_reduction(points, nrules);

    if (threadIdx.x == 0){

        for (int i = 0; i < MAX_CLASSES; i++){
            gn_rules[(blockIdx.x + offset) + i * blockDim.x] += nrules[i * blockDim.x];
            finalpoints[(blockIdx.x + offset) + i * blockDim.x] += points[i * blockDim.x];

        }       
        printf("block %d k%d %f %f %d %d
", (blockIdx.x + offset), k, finalpoints[(blockIdx.x + offset)],
            finalpoints[(blockIdx.x + offset) + blockDim.x], gn_rules[(blockIdx.x + offset)], gn_rules[(blockIdx.x + offset) + blockDim.x]);

    }
}

__global__ void lazy_supervised_classification_kernel(int k, float* finalpoints, int* n_rules){

    d_get_THE_prediction( k, finalpoints, n_rules);

}


int main() {
    //freopen("output.txt", "w", stdout);

    int N_TESTS = 10000;
    int MAX_SIZE = 3;

    float *finalpoints = (float*)calloc(MAX_CLASSES * N_TESTS, sizeof(float));
    float *d_finalpoints = 0;

    int *d_nruls = 0;
    int *nruls = (int*)calloc(MAX_CLASSES * N_TESTS, sizeof(int));  

    gpuErrchk(cudaMalloc(&d_finalpoints, MAX_CLASSES * N_TESTS * sizeof(float)));
    gpuErrchk(cudaMemset(d_finalpoints, 0, MAX_CLASSES * N_TESTS * sizeof(float)));

    gpuErrchk(cudaMalloc(&d_nruls, MAX_CLASSES * N_TESTS * sizeof(int)));
    gpuErrchk(cudaMemset(d_nruls, 0, MAX_CLASSES * N_TESTS * sizeof(int)));

    gpuErrchk(cudaMemcpyToSymbol(d_MAX_SIZE, &MAX_SIZE, sizeof(int), 0, cudaMemcpyHostToDevice));

    int step = max_threads, ofset = 0;

    for (int k = 1; k < MAX_SIZE; k++){ 

                               //N_TESTS-step
        for (ofset = 0; ofset < max_threads; ofset += step){

            gpuErrchk(cudaMemcpyToSymbol(offset, &ofset, sizeof(int), 0, cudaMemcpyHostToDevice));
            lazy_supervised_classification_kernel <<>>(k, d_finalpoints, d_nruls);
            gpuErrchk(cudaDeviceSynchronize());
        }

        gpuErrchk(cudaMemcpyToSymbol(offset, &ofset, sizeof(int), 0, cudaMemcpyHostToDevice));//comment these lines
                                          //N_TESTS - step      
        lazy_supervised_classification_kernel <<<3, max_threads >> >(k, d_finalpoints, d_nruls);//
        gpuErrchk(cudaDeviceSynchronize());//

    }
    gpuErrchk(cudaFree(d_finalpoints));
    gpuErrchk(cudaFree(d_nruls));
    free(finalpoints);
    free(nruls);    

    gpuErrchk(cudaDeviceReset());   
    return(0);
}

Robert Crovella · Accepted Answer

I don't believe this indexing is what you want:

 gn_rules[(blockIdx.x + offset) + i * blockDim.x] += ...;
 finalpoints[(blockIdx.x + offset) + i * blockDim.x] += ...;

For MAX_CLASSES = 2, each block needs storage for 2 finalpoints values and 2 gn_rules values. Therefore, when offset is non-zero, it needs to be scaled by MAX_CLASSES values in order to index to the start of the correct storage for that block.

So if you change the above lines of code to:

 gn_rules[(blockIdx.x + (offset*MAX_CLASSES)) + i * blockDim.x] += nrules[i * blockDim.x];
 finalpoints[(blockIdx.x + (offset*MAX_CLASSES)) + i * blockDim.x] += points[i * blockDim.x];

I believe you'll get the output you are expecting.

Successive blocks reading memory from initial blocks

Answers (1)

Related Questions