Reputation: 8889
I've been writing CUDA code for quite a while, but I'm just now getting up to speed on how to use the texture cache.
Using the simpleTexture
example from the Nvidia SDK for inspiration, I coded a simple example that uses the texture cache. The host copies the Lena image to the GPU and binds it as a texture. The kernel just copies the contents of the texture cache into an output array.
Oddly, the result (see the all-gray image below the code) is doesn't match the input. Any thoughts about what might be going wrong?
Code (look at texCache_dummyKernel
):
texture<float, 2, cudaReadModeElementType> tex; //declare texture reference for 2D float texture
//note: tex is global, so no input ptr is needed
__global__ void texCache_dummyKernel(float* out, const int width, const int height){ //copy tex to output
int x = blockIdx.x*blockDim.x + threadIdx.x; //my index into "big image"
int y = blockIdx.y*blockDim.y + threadIdx.y;
int idx = y*width+x;
if(x < width && y < height)
out[idx] = tex2D(tex, y, x);
}
int main(int argc, char **argv){
cv::Mat img = getRawImage("./Lena.pgm");
img.convertTo(img, CV_32FC1);
float* hostImg = (float*)&img.data[0];
int width = img.cols; int height = img.rows;
dim3 grid; dim3 block;
block.x = 16; block.y = 16;
grid.x = width/block.x + 1;
grid.y = height/block.y + 1;
cudaArray *dImg; //cudaArray*, not float*
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
CHECK_CUDART(cudaMallocArray(&dImg, &channelDesc, width, height));
CHECK_CUDART(cudaMemcpyToArray(dImg, 0, 0, hostImg, width*height*sizeof(float), cudaMemcpyHostToDevice));
setTexCacheParams(); //defined below
CHECK_CUDART(cudaBindTextureToArray(tex, dImg, channelDesc)); //Bind the array to the texture
float* dResult; //device memory for output
CHECK_CUDART(cudaMalloc((void**)&dResult, sizeof(float)*width*height));
texCache_dummyKernel<<<grid, block>>>(dResult, width, height); //dImg isn't an input param, since 'tex' is a global variable
CHECK_CUDART(cudaGetLastError()); //make sure kernel didn't crash
float* hostResult = (float*)malloc(sizeof(float)*width*height);
CHECK_CUDART(cudaMemcpy(hostResult, dResult, sizeof(float)*width*height, cudaMemcpyDeviceToHost));
outputProcessedImage(hostResult, width, height, "result.png"); //defined below
}
I should probably provide a couple of helper functions that I used above:
void setTexCacheParams(){ //configuration directly pulled from simpleTexture in nvidia sdk
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true; // access with normalized texture coordinates
}
void outputProcessedImage(float* processedImg, int width, int height, string out_filename){
cv::Mat img = cv::Mat::zeros(height, width, CV_32FC1);
for(int i=0; i<height; i++)
for(int j=0; j<width; j++)
img.at<float>(i,j) = processedImg[i*width + j]; //just grab the 1st of the 4 pixel spaces in a uchar4
img.convertTo(img, CV_8UC1); //float to uchar
vector<int> compression_params;
compression_params.push_back(CV_IMWRITE_PNG_COMPRESSION);
compression_params.push_back(9);
cv::imwrite(out_filename, img, compression_params);
}
Input:
Output:
float*
array, and I keep pretty much everything else the same, I get the correct result.Upvotes: 3
Views: 1611
Reputation: 72345
In your original code, you had initialised the texture to use normalised coordinates. This means that the texture is addressed on [0,1] in each spatial dimension. So your kernel should look this this:
__global__
void texCache_dummyKernel(float* out, const int width, const int height)
{
int x = blockIdx.x*blockDim.x + threadIdx.x; //my index into "big image"
int y = blockIdx.y*blockDim.y + threadIdx.y;
int idx = y*width+x;
if(x < width && y < height) {
float u = float(x)/float(width), v = float(y)/float(height);
out[idx] = tex2D(tex, u, v);
}
}
[Standard disclaimer: written in browser, not compiled or tested, use at own risk]
ie. you should pass coordinates to tex2D
which are normalised by dividing through by the image width and height.
Alternatively, as you have discovered, you can change the texture definition to normalized=false
and use addressing in absolute rather than relative textures coordinates. Even then the texture read in your code should look like this:
out[idx] = tex2D(tex, float(x)+0.5f, float(y)+0.5f);
because texture addressing is always done using floating point coordinates and the texture data is voxel centred, thus 0.5 is added to each coordinate to ensure the read comes from the centroid of each interpolation area or volume within the texture.
You can find a description of texture filtering and addressing modes and their effect on interpolation in one of the appendices of the CUDA C programming guide.
Upvotes: 3