CUDA matrix multiplication - yet again

Question

I feel a bit bad making a forum thread that has already 10 of the same name, but after checking them all, along with most of the guides around, I still can't figure the problem.

I have a char array [40090][11], and I want to make a custom operation on each possible combination of two of its elements (I consider the whole 11-byte bunch as an element). I understand that is a kind of mmatrix multiplication, the matrices being one-column and one-row.

Following the SDK manual I am thinking of having 1 thread per output element. Since 40090=19*2110, I am using:

dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<>>(dev_b2);

Question 1: Is this fine?

Alright, then, I THINK I am following the SDK's maunal example faaithfully (not the one using shared memory). Whenever I dare make a portion of my wanted operations on the data, though, I get a massively unhelpful error 30 returned: Unknown error. So, Question 2: What am I doing wrong? Note: Disregard the kernel's not saving anything anywhere.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include 
#include 
#include 
#include 
#include 
#include 
using namespace std;

cudaError_t cudafunct(void);
__global__ void xkernel(char * dev_b2);
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2);


#define b2c 40090
unsigned char block2[b2c][11];//
//unsigned int i,b1,b2,counter=0;//Block(2),Piece,Rotation,Type(of block2),InterconnectinTriangle
//unsigned char *block4,type=0;
ofstream ofile;




int main()
{
     ifstream block2file("2.blk",ios::binary);
     block2file.read((char*)(&block2),b2c*11);
     block2file.close();
     //block4=new unsigned char[200000000];//200MB will do, better than doing constant reallocs

    cudaError_t cudaStatus = cudafunct();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudafunct failed!");
        system("PAUSE");
        return 1;
    }
    /*

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }*/
     cout<<"Sequence end. Saving to file...
";     
     //ofile.open("blk4.et2",ios::binary);
     //ofile.write((char*)block4,17*counter);   
     //ofile.close(); 
     int t=clock();
     //cout<<"
Found a total of "<>>(dev_b2);

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching xkernel!
", cudaStatus);
        goto Error;
    }
    /*
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }*/

Error:
    cudaFree(dev_b2);
    return cudaStatus;
}


__global__ void xkernel(char *dev_b2)
{
        int i = blockIdx.x * blockDim.x + threadIdx.x; 
        int j = blockIdx.y * blockDim.y + threadIdx.y;
        /*for(int k=0;k<11;k++)
        {
            lb2[0][k]=dev_b2[i*b2c+k];
            lb2[1][k]=dev_b2[j*b2c+k];
        }*/
        int b00;
        b00=dev_b2[i*b2c];

        //int type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]);
        //if(!j && !(i % 100))cout<up
        *(block4+counter*17+10)=b2[j][5];
        *(block4+counter*17+11)=b2[j][6];//Up frame colours, right->left
        *(block4+counter*17+12)=b2[j][7];
        *(block4+counter*17+13)=b2[j][8];//Left frame colours, up->down
        *(block4+counter*17+14)=b2[i][8];
        *(block4+counter*17+15)=b2[i][9];//Down frame colours, left->right
        *(block4+counter++*17+16)=b2[i][10];*/
}  



__device__ unsigned char typecheck(unsigned char type1,unsigned char type2)
{//Warning! Previous error! First partenthesis is t*2* = upper piece!
       if( (type1==4) && (type2==0) ) return  0;  
       if( (type1==6) && (type2==1) ) return  1;  
       if( (type1==2) && (type2==6) ) return  2;  
       if( (type1==3) && (type2==4) ) return  3;  
       if( (type1==4) && (type2==4) ) return  4;  
       if( (type1==8) && (type2==5) ) return  5;  
       if( (type1==6) && (type2==6) ) return  6;  
       if( (type1==7) && (type2==8) ) return  7;  
       if( (type1==8) && (type2==8) ) return  8;  
       if( (type1==9) && (type2==8) ) return  9;  
       if( (type1==10) && (type2==8) ) return  10;  
       if( (type1==8) && (type2==11) ) return  11;  
       if( (type1==8) && (type2==12) ) return  12;  
       if( (type1==8) && (type2==13) ) return  13;  
       return 255;
}

CygnusX1 · Accepted Answer

I have a feeling you read out-of-bounds from your dev_b2 array. blockIdx.x is in range of [0..2110], so the variable i is in range of [0..23210]. But then you multiply it with b2c. As a result the highest address you read from will be b2c*23210 = 930488900.

But dev_b2 has only the size of b2c*11 = 440990.

CUDA matrix multiplication - yet again

Answers (1)

Related Questions