Reputation: 902
I have a very simple algorithm that computes the squared Euclidean distances between the corresponding rows of two matrices. I have the following code, but unfortunately it does not return correct results for all matrix sizes. More specifically, it works fine for matrices of size 2000x4, 500x4, 2500x2, 600x8, 1000x8, and 100x8, but it is not working for matrices of size 2500x3, 2500x5, 400x3, 100x3, 100x10, 1000x10, 1000x12, 500x12, and 500x14.
Can anybody help me? I want to do it manually, without using any optimized library, because I want to understand thread management.
__global__ void cudaEuclid( float* A, float* B, float* C, int rows, int cols )
{
    int i, squareeucldist = 0;
    int r = blockDim.x * blockIdx.x + threadIdx.x; // rows
    int c = blockDim.y * blockIdx.y + threadIdx.y; // cols
    extern __shared__ float sdata[];
    //int r = blockIdx.y; int c = threadIdx.x;
    if( r < rows && c < cols ){
        //C[r + rows*c] = ( A[r + rows*c] - B[r + rows*c] ) * ( A[r + rows*c] - B[r + rows*c] );
        sdata[threadIdx.x] = ( A[r + rows*c] - B[r + rows*c] ) * ( A[r + rows*c] - B[r + rows*c] );
        __syncthreads();

        // contiguous range pattern
        for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
        {
            if(threadIdx.x < offset)
            {
                // add a partial sum upstream to our own
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            // wait until all threads in the block have
            // updated their partial sums
            __syncthreads();
        }

        // thread 0 writes the final result
        if(threadIdx.x == 0)
        {
            C[r] = sdata[0];
        }
    }
}
The kernel call is:
dim3 dimBlock( cols, 1 );
dim3 dimGrid( 1, rows );
cudaEuclid<<<dimGrid, cols, cols*sizeof(float)>>>( d_A, d_B, d_C, rows, cols );
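For reference, each entry of C is expected to hold the squared Euclidean distance between row r of A and row r of B; a plain CPU sketch of that computation (the helper name cpuEuclid is just for illustration), using the same column-major indexing A[r + rows*c] as the kernel, would be:

void cpuEuclid( const float* A, const float* B, float* C, int rows, int cols )
{
    // C[r] = sum over c of ( A[r + rows*c] - B[r + rows*c] )^2
    for (int r = 0; r < rows; r++) {
        float acc = 0.0f;
        for (int c = 0; c < cols; c++) {
            float d = A[r + rows*c] - B[r + rows*c];
            acc += d * d;
        }
        C[r] = acc;
    }
}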
PS: I want to mention that I had posted a similar question, but it was unclear from the beginning and the discussion became disorganized. Even though Tom made a very useful suggestion that will be very practical for future optimized implementations, I need something more hand-made. Finally, I made this post because I do not want to make the related post more complicated. Thanks.
Upvotes: 3
Views: 2403
Reputation: 21515
Although the OP does not want to use optimized libraries, the post has a useful title and other users may find it useful to see how the problem can be solved without handwritten kernels.
I was curious and played a bit with the problem, having in mind using CUDA Thrust. I ended up with the code below, which calculates the distances between homologous rows of the two matrices by using thrust::reduce_by_key.
#include <cstdio>
#include <iostream>

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/gather.h>
#include <thrust/sequence.h>
#include <thrust/random.h>
#include <thrust/extrema.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
using namespace thrust::placeholders;

/****************************************************/
/* POWER DIFFERENCE FUNCTOR FOR EUCLIDEAN DISTANCES */
/****************************************************/
struct PowerDifference {
    __host__ __device__ float operator()(const float& a, const float& b) const { return pow(a - b, 2); }
};

/*******************/
/* EXPAND OPERATOR */
/*******************/
template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
                      InputIterator1 last1,
                      InputIterator2 first2,
                      OutputIterator output)
{
    typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;

    difference_type input_size  = thrust::distance(first1, last1);
    difference_type output_size = thrust::reduce(first1, last1);

    // scan the counts to obtain output offsets for each input element
    thrust::device_vector<difference_type> output_offsets(input_size, 0);
    thrust::exclusive_scan(first1, last1, output_offsets.begin());

    // scatter the nonzero counts into their corresponding output positions
    thrust::device_vector<difference_type> output_indices(output_size, 0);
    thrust::scatter_if(thrust::counting_iterator<difference_type>(0), thrust::counting_iterator<difference_type>(input_size),
                       output_offsets.begin(), first1, output_indices.begin());

    // compute max-scan over the output indices, filling in the holes
    thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum<difference_type>());

    // gather input values according to index array (output = first2[output_indices])
    OutputIterator output_end = output; thrust::advance(output_end, output_size);
    thrust::gather(output_indices.begin(), output_indices.end(), first2, output);

    // return output + output_size
    thrust::advance(output, output_size);
    return output;
}
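// --- Illustration (added comment, not part of the original answer): with counts = [N, N, ..., N]
//     (Nvec times) and values = [0, 1, ..., Nvec-1], expand produces
//     [0, ..., 0, 1, ..., 1, ..., Nvec-1, ..., Nvec-1], i.e. each row index repeated N times.
//     These repeated row indices are the keys for the reduce_by_key call in main() below.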
/********/
/* MAIN */
/********/
int main()
{
    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int N    = 10;        // --- Number of vector elements
    const int Nvec = 20;        // --- Number of vectors for each matrix

    // --- Random uniform integer distribution between 0 and 20
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 20);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_matrix1(Nvec * N);
    thrust::device_vector<float> d_matrix2(Nvec * N);
    for (size_t i = 0; i < d_matrix1.size(); i++) d_matrix1[i] = (float)dist(rng);
    for (size_t i = 0; i < d_matrix2.size(); i++) d_matrix2[i] = (float)dist(rng);

    printf("\n\nFirst matrix\n");
    for (int i = 0; i < Nvec; i++) {
        std::cout << " [ ";
        for (int j = 0; j < N; j++)
            std::cout << d_matrix1[i * N + j] << " ";
        std::cout << "]\n";
    }

    printf("\n\nSecond matrix\n");
    for (int i = 0; i < Nvec; i++) {
        std::cout << " [ ";
        for (int j = 0; j < N; j++)
            std::cout << d_matrix2[i * N + j] << " ";
        std::cout << "]\n";
    }

    /****************************************************************************/
    /* CALCULATING THE EUCLIDEAN DISTANCES BETWEEN THE ROWS OF THE TWO MATRICES */
    /****************************************************************************/
    // --- Creating the indices for the reduction by key
    thrust::device_vector<int> d_sequence(Nvec);
    thrust::device_vector<int> d_indices(Nvec * N);
    thrust::device_vector<int> d_counts(Nvec, N);
    thrust::sequence(d_sequence.begin(), d_sequence.begin() + Nvec);
    expand(d_counts.begin(), d_counts.end(), d_sequence.begin(), d_indices.begin());

    printf("\n\nRow indices (keys for the reduction)\n");
    for (int i = 0; i < Nvec; i++) {
        std::cout << " [ ";
        for (int j = 0; j < N; j++)
            std::cout << d_indices[i * N + j] << " ";
        std::cout << "]\n";
    }

    thrust::device_vector<float> d_squared_differences(Nvec * N);
    thrust::transform(d_matrix1.begin(), d_matrix1.end(), d_matrix2.begin(), d_squared_differences.begin(), PowerDifference());

    thrust::device_vector<float> d_norms(Nvec);
    thrust::reduce_by_key(d_indices.begin(), d_indices.end(), d_squared_differences.begin(), d_indices.begin(), d_norms.begin());

    printf("\n\nd_norms\n");
    for (int i = 0; i < Nvec; i++) {
        std::cout << d_norms[i] << " ";
    }

    return 0;
}
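Note that d_norms contains the summed squared differences, i.e. the squared Euclidean distances the OP asked for; no square root is taken. If plain Euclidean distances were needed, a final element-wise square root could be appended, for example with a small extra functor (a sketch; SquareRoot is not part of the code above):

struct SquareRoot {
    __host__ __device__ float operator()(const float& x) const { return sqrtf(x); }
};

// ... after the reduce_by_key call:
thrust::transform(d_norms.begin(), d_norms.end(), d_norms.begin(), SquareRoot());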
Upvotes: 1
Reputation: 9779
In fact, your code works only for matrices of size m x 2^n, with n small enough. You probably want to read page 14 of the following slides more carefully,
http://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
and think about the following questions:
1. what happens when blockDim.x is equal to 3 or 5;
2. how the reduction for loop behaves when blockDim.x or cols is not a power of 2;
3. why part of the data in sdata[] is not added to the final sum;
4. what changes if you round blockDim.x and the size of smem up to 2^3 when cols is 5;
5. whether the padding elements smem[5..7] then need to be initialized.
Simulating the for loop step by step with pen and paper will help.
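For illustration only, here is a minimal sketch of the padding idea hinted at above (not the OP's exact kernel; the names cudaEuclidPadded and colsPow2 are made up here): round the block size and the shared-memory size up to the next power of two and zero-initialize the padding, so that the halving loop reaches every element.

__global__ void cudaEuclidPadded( float* A, float* B, float* C, int rows, int cols, int colsPow2 )
{
    extern __shared__ float sdata[];
    int r = blockIdx.y;    // one block per matrix row (grid is 1 x rows)
    int c = threadIdx.x;   // one thread per padded column, blockDim.x == colsPow2

    // zero-initialize every slot, including the padding beyond 'cols'
    sdata[c] = 0.0f;
    if (r < rows && c < cols) {
        float d = A[r + rows*c] - B[r + rows*c];
        sdata[c] = d * d;
    }
    __syncthreads();

    // colsPow2 is a power of two, so the halving loop covers every element
    for (int offset = colsPow2 / 2; offset > 0; offset >>= 1) {
        if (c < offset) sdata[c] += sdata[c + offset];
        __syncthreads();
    }

    if (c == 0 && r < rows) C[r] = sdata[0];
}

It would be launched, for example, as cudaEuclidPadded<<<dim3(1, rows), colsPow2, colsPow2*sizeof(float)>>>( d_A, d_B, d_C, rows, cols, colsPow2 ), where colsPow2 is the smallest power of two not smaller than cols.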
Upvotes: 1