Reputation: 33
I have the following code in a .cu
file that use CUDA:
#include "gpu_stgauss2.h"
#include "gpu_st.h"
#include "gpu_sampler.h"
static texture<float, 2, cudaReadModeElementType> s_texSRC1;
static texture<float4, 2, cudaReadModeElementType> s_texSRC4;
inline __host__ __device__ texture<float,2>& texSRC1() { return s_texSRC1; }
inline __host__ __device__ texture<float4,2>& texSRC4() { return s_texSRC4; }
static texture<float4, 2, cudaReadModeElementType> s_texST;
inline __host__ __device__ texture<float4,2>& texST() { return s_texST; }
They are later used in the same file as follows:
gpu_image<float> gpu_stgauss2_filter( const gpu_image<float>& src, const gpu_image<float4>& st,
float sigma, float max_angle, bool adaptive,
bool src_linear, bool st_linear, int order, float step_size,
float precision )
{
if (sigma <= 0) return src;
gpu_image<float> dst(src.size());
gpu_sampler<float, texSRC1> src_sampler(src, src_linear? cudaFilterModeLinear : cudaFilterModePoint);
float cos_max = cosf(radians(max_angle));
if (src.size() == st.size()) {
gpu_sampler<float4, texST> st_sampler(st, st_linear? cudaFilterModeLinear : cudaFilterModePoint);
if (order == 1) imp_stgauss2_filter<1,float><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 2) imp_stgauss2_filter<2,float><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 4) imp_stgauss2_filter<4,float><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
} else {
float2 s = make_float2((float)st.w() / src.w(), (float)st.h() / src.h());
gpu_resampler<float4, texST> st_sampler(st, s, st_linear? cudaFilterModeLinear : cudaFilterModePoint);
if (order == 1) imp_stgauss2_filter<1,float><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 2) imp_stgauss2_filter<2,float><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 4) imp_stgauss2_filter<4,float><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
}
GPU_CHECK_ERROR();
return dst;
}
gpu_image<float4> gpu_stgauss2_filter( const gpu_image<float4>& src, const gpu_image<float4>& st,
float sigma, float max_angle, bool adaptive,
bool src_linear, bool st_linear, int order, float step_size,
float precision )
{
if (sigma <= 0) return src;
gpu_image<float4> dst(src.size());
gpu_sampler<float4, texSRC4> src_sampler(src, src_linear? cudaFilterModeLinear : cudaFilterModePoint);
float cos_max = cosf(radians(max_angle));
if (src.size() == st.size()) {
gpu_sampler<float4, texST> st_sampler(st, st_linear? cudaFilterModeLinear : cudaFilterModePoint);
if (order == 1) imp_stgauss2_filter<1,float4><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 2) imp_stgauss2_filter<2,float4><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 4) imp_stgauss2_filter<4,float4><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
} else {
float2 s = make_float2((float)st.w() / src.w(), (float)st.h() / src.h());
gpu_resampler<float4, texST> st_sampler(st, s, st_linear? cudaFilterModeLinear : cudaFilterModePoint);
if (order == 1) imp_stgauss2_filter<1,float4><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 2) imp_stgauss2_filter<2,float4><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
else if (order == 4) imp_stgauss2_filter<4,float4><<<dst.blocks(), dst.threads()>>>(dst, src_sampler, st_sampler, sigma, cos_max, adaptive, step_size, precision);
}
GPU_CHECK_ERROR();
return dst;
}
However, it will cause the following error:
error : taking reference of texture/surface variable not allowed in __device__/__global__ functions
I have few experience in CUDA. Can anyone help on how to fix it? Thanks.
Upvotes: 3
Views: 564
Reputation: 1
I encountered the same problem when trying to compile the exactly same code. It turns out returning reference here is not necessary after all, using the trick in 'gpu_stbf2.cu' as suggested by ennetws.
These 3 functions are actually only called within this file, so move the struct definiton in gpu_sampler.h back to here, and instead of get the texture by calling these fuction, you can just use it directly. I've put the code on github here.
Upvotes: 0
Reputation: 153
For anyone having the same problem, which in this case is from a GPU library found here, I managed to solve it by adapting the same strategy used elsewhere, e.g. "gpu_stbf2.cu". I managed to successfully compile with Cuda 6.0 and Visual Studio 2012 x64.
Upvotes: 0
Reputation: 50667
Try to downgrade your CUDA to 4.0. It will be OK in CUDA 4.0 for such code syntax. I once encountered a similar problem and CUDA 4.0 works for me.
Upvotes: 0
Reputation: 1246
I'd highly suggest using bindless texture in cc 3.0 ++, because the unbind texture command would not have to synchronize the host thread
2nd, you shall consider using the new cash memory that was proposed in CC 3.0 + , In order to do so please simple specify the memory as
const float* pArray;
3rd, In case you insist of using the old fashion texture , which is powerful for interpolation operations. In the global scope:
texture <float, cudaTextureType1D> textureFloat32_1D;
In your code bind the texture
cudaBindTexture ( NULL , textureFloat32_1D , ... ) ;
Inside the kernel use the texture as you wish...
float a = tex1Dfatch(textureFloat32_1D , location) ;
Outside of the kernel
cudaUnbindTexture(textureFloat32_1D );
Please note that multithreading applications using CUDA code shall have problems using the same texture variable as mention in case three (it's not protected ! )
Upvotes: 1
Reputation: 5854
The compiler error says it all: you aren't allowed to do what you tried. I would propose either using the variables directly (instead of accessing them via texSRC1()
etc, or returning pointers instead of references.
Upvotes: 2