Reputation: 41
My kernel allocated a shared memory for data storage, but bug reports if I change the size of the shared memory, see codes attached.
#include <stdio.h>
#include <assert.h>
#define cucheck_dev(call) \
{ \
cudaError_t cucheck_err = (call); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
assert(0); \
} \
}
__global__ void kernel(int datanum)
{
extern __shared__ int sh[];
// assign data for data 1
float2* data_ptr1((float2*)sh);
for (int thid = threadIdx.x; thid < datanum; thid += blockDim.x)
{
data_ptr1[thid] = make_float2(0., 0.);
}
__syncthreads();
// assign data for data 2
size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
if(threadIdx.x == 0) printf("Offset: %d\n", (int)(shOffset));
__syncthreads();
float4 *data_ptr2((float4*)&sh[shOffset]);
for (int thid = threadIdx.x; thid < datanum; thid += blockDim.x)
{
data_ptr2[thid] = make_float4(0., 0., 0., 0.);
}
__syncthreads();
}
int main()
{
int datanum = 21; // bug reports for datanum = 21, but everthing works fine for datanum = 20
int blocknum = 1;
int threadperblock = 128;
int preallocated = 768;
size_t shmem = datanum*sizeof(float2) + preallocated*sizeof(int);
printf("Allocated Shared memory byte: %d Nums: %d\n", (int)shmem, (int)(shmem/sizeof(int)));
kernel<<<blocknum, threadperblock, shmem>>>(datanum);
cudaDeviceSynchronize();
cucheck_dev(cudaGetLastError());
}
As shown, the shared memory included two regions, one for fixed data, type as float2
.
The other region may save different types as int
or float4
, offset from the shared memory entry.
When I set datanum
to 20, codes work fine.
But when datanum
is changed to 21, the code reports a misaligned address.
I greatly appreciate any reply or suggestions.
Thank you!
Some information provided by cuda-memcheck
is posted here for a reference:
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (20,0,0) in block (0,0,0)
========= Address 0x000001e8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (19,0,0) in block (0,0,0)
========= Address 0x000001d8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (18,0,0) in block (0,0,0)
========= Address 0x000001c8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
=========
========= Invalid __shared__ write of size 16
========= at 0x00000280 in kernel(int)
========= by thread (17,0,0) in block (0,0,0)
========= Address 0x000001b8 is misaligned
========= Device Frame:kernel(int) (kernel(int) : 0x280)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x34e) [0x2efabe]
========= Host Frame:test [0x13de9]
========= Host Frame:test [0x13e77]
========= Host Frame:test [0x4a1c5]
========= Host Frame:test [0x6f32]
========= Host Frame:test [0x6df5]
========= Host Frame:test [0x6e2e]
========= Host Frame:test [0x6c14]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xe7) [0x21b97]
========= Host Frame:test [0x69fa]
Upvotes: 0
Views: 1442
Reputation: 13419
Your problem is that the alignment for float4
is higher than that for float2
. Therefore the lines
size_t shOffset = (sizeof(float2)/sizeof(int)*(datanum));
float4 *data_ptr2((float4*)&sh[shOffset]);
do not guarantee appropriate alignment for data_ptr2 unless datanum is an even number.
I wrote some code for this issue here: CUDA : Shared memory alignement in documentation
The easiest fix is to just swap data_ptr1
and data_ptr2
. Use the front of the memory for the type with the larger alignment
Upvotes: 2