Reputation: 249
I feel a bit bad making a forum thread that has already 10 of the same name, but after checking them all, along with most of the guides around, I still can't figure the problem.
I have a char array [40090][11], and I want to make a custom operation on each possible combination of two of its elements (I consider the whole 11-byte bunch as an element). I understand that is a kind of mmatrix multiplication, the matrices being one-column and one-row.
Following the SDK manual I am thinking of having 1 thread per output element. Since 40090=19*2110, I am using:
dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);
Question 1: Is this fine?
Alright, then, I THINK I am following the SDK's maunal example faaithfully (not the one using shared memory). Whenever I dare make a portion of my wanted operations on the data, though, I get a massively unhelpful error 30 returned: Unknown error. So, Question 2: What am I doing wrong? Note: Disregard the kernel's not saving anything anywhere.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <ctime>
#include <stdio.h>
using namespace std;
cudaError_t cudafunct(void);
__global__ void xkernel(char * dev_b2);
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2);
#define b2c 40090
unsigned char block2[b2c][11];//
//unsigned int i,b1,b2,counter=0;//Block(2),Piece,Rotation,Type(of block2),InterconnectinTriangle
//unsigned char *block4,type=0;
ofstream ofile;
int main()
{
ifstream block2file("2.blk",ios::binary);
block2file.read((char*)(&block2),b2c*11);
block2file.close();
//block4=new unsigned char[200000000];//200MB will do, better than doing constant reallocs
cudaError_t cudaStatus = cudafunct();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudafunct failed!");
system("PAUSE");
return 1;
}
/*
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}*/
cout<<"Sequence end. Saving to file...\n";
//ofile.open("blk4.et2",ios::binary);
//ofile.write((char*)block4,17*counter);
//ofile.close();
int t=clock();
//cout<<"\nFound a total of "<<counter<<" block4s.\nTime elapsed: "<<t<<" clocks / "<<(double)t/(double)CLOCKS_PER_SEC<<" seconds\n";
system("PAUSE");
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t cudafunct(void)
{
char *dev_b2 = 0;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void**)&dev_b2, sizeof(block2));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b2, block2, sizeof(block2), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
dim3 threadsperblock(19,19);
dim3 blocksingrid(2110,2110);
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching xkernel!\n", cudaStatus);
goto Error;
}
/*
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}*/
Error:
cudaFree(dev_b2);
return cudaStatus;
}
__global__ void xkernel(char *dev_b2)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
/*for(int k=0;k<11;k++)
{
lb2[0][k]=dev_b2[i*b2c+k];
lb2[1][k]=dev_b2[j*b2c+k];
}*/
int b00;
b00=dev_b2[i*b2c];
//int type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]);
//if(!j && !(i % 100))cout<<setw(6)<<i<<" / "<<jc<<" ("<<setw(10)<<(float)100*i/jc<<" % )"<<endl;
/*if(
(dev_b2[i*b2c+7]!=dev_b2[j*b2c+9])||//SW~NW
(dev_b2[i*b2c+6]!=dev_b2[j*b2c+10])//SE~NE
) return;
if( (type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]) ) ==255) return;*/
/*if(
(dev_b2[i*b2c+0]==dev_b2[j*b2c+0])||//1st=3rd
(dev_b2[i*b2c+0]==dev_b2[j*b2c+2])||//1st=4th
(dev_b2[i*b2c+2]==dev_b2[j*b2c+0])||//2nd=3rd
(dev_b2[i*b2c+2]==dev_b2[j*b2c+2])//2nd=4th
) return;*/
/*
*(block4+counter*17+0)=b2[i][0];//1st piece
*(block4+counter*17+1)=b2[i][1];//1st rotation
*(block4+counter*17+2)=b2[i][2];//2nd piece
*(block4+counter*17+3)=b2[i][3];//2nd rotation
*(block4+counter*17+4)=b2[j][0];//3rd piece
*(block4+counter*17+5)=b2[j][1];//3rd rotation
*(block4+counter*17+6)=b2[j][2];//4th piece
*(block4+counter*17+7)=b2[j][3];//4th rotation
*(block4+counter*17+8)=type;
*(block4+counter*17+9)=b2[i][5];//Right frame colours, down->up
*(block4+counter*17+10)=b2[j][5];
*(block4+counter*17+11)=b2[j][6];//Up frame colours, right->left
*(block4+counter*17+12)=b2[j][7];
*(block4+counter*17+13)=b2[j][8];//Left frame colours, up->down
*(block4+counter*17+14)=b2[i][8];
*(block4+counter*17+15)=b2[i][9];//Down frame colours, left->right
*(block4+counter++*17+16)=b2[i][10];*/
}
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2)
{//Warning! Previous error! First partenthesis is t*2* = upper piece!
if( (type1==4) && (type2==0) ) return 0;
if( (type1==6) && (type2==1) ) return 1;
if( (type1==2) && (type2==6) ) return 2;
if( (type1==3) && (type2==4) ) return 3;
if( (type1==4) && (type2==4) ) return 4;
if( (type1==8) && (type2==5) ) return 5;
if( (type1==6) && (type2==6) ) return 6;
if( (type1==7) && (type2==8) ) return 7;
if( (type1==8) && (type2==8) ) return 8;
if( (type1==9) && (type2==8) ) return 9;
if( (type1==10) && (type2==8) ) return 10;
if( (type1==8) && (type2==11) ) return 11;
if( (type1==8) && (type2==12) ) return 12;
if( (type1==8) && (type2==13) ) return 13;
return 255;
}
Upvotes: 1
Views: 457
Reputation: 21818
I have a feeling you read out-of-bounds from your dev_b2
array.
blockIdx.x
is in range of [0..2110]
, so the variable i
is in range of [0..23210]
. But then you multiply it with b2c
.
As a result the highest address you read from will be b2c*23210 = 930488900
.
But dev_b2
has only the size of b2c*11 = 440990
.
Upvotes: 1