Reputation: 8118
I've got a Jacobi implementation in CUDA, but the problem is this:
I assign threads this way:
#define imin(a,b) (a < b ? a : b)
int dimBlocks, dimThreads;
dimThreads = 256;
dimBlocks = imin(32, (dimThreads + dim - 1)/dimThreads);
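With dim = 9900 that formula caps out at 32 blocks whether I use 32 or 256 threads per block, so the launches are 32 * 32 = 1024 threads versus 32 * 256 = 8192 threads in total.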
But if I use 32 threads it's faster than using 256 threads or more...
I've got these results:
Sequential times:
9900 5.882000
9900 6.071000
Parallel times:
9900 1.341000 //using 32
9900 1.626000 //using 256
Where 9900 is the matrix width. From these times we can see the following:
5.882 / 1.34 = 4.39
6.07 / 1.62 = 3.74
So are 32 threads more efficient than 256?
Sorry, I don't know if I should post the code (it's a bit long); if you request it, I will.
EDIT:
//Based on doubletony algorithm
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Jacobi.cuh"
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\extrema.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#define imin(a,b) (a < b ? a : b)
// NAME OF FUNCTION: __copy_vector
// PURPOSE:
// The function will copy a vector.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// source double* value vector to be copied
// dest double* reference vector copied
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __copy_vector(double *source, double *dest, const int dim)
{
    int tIdx = blockDim.x * blockIdx.x + threadIdx.x;
    while(tIdx < dim){
        dest[tIdx] = source[tIdx];
        tIdx += gridDim.x * blockDim.x;
    }
}
// NAME OF FUNCTION: __Jacobi_sum
// PURPOSE:
// The function will execute matrix vector multiplication
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* value B
// resul double* reference A*B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __Jacobi_sum(const double *A,
                             const double *B,
                             double *resul,
                             const int dim)
{
    int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        resul[tIdx] = 0;
        for(int i = 0; i < dim; i++)
            if(tIdx != i)
                resul[tIdx] += A[tIdx * dim + i] * B[i];
        tIdx += gridDim.x * blockDim.x;
    }
    __syncthreads();
}
// NAME OF FUNCTION: __substract
// PURPOSE:
// The function will execute A-B=resul
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* value B
// C double* reference A-B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __substract(const double *A,
                            const double *B,
                            double *C,
                            const int dim)
{
    int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        C[tIdx] = A[tIdx] - B[tIdx];
        tIdx += gridDim.x * blockDim.x;
    }
}
// NAME OF FUNCTION: __divide
// PURPOSE:
// The function will execute the jacobi division, that is,
// (B-sum)/A[i,i]
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value A
// B double* reference (B-sum)/A[i,i]
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __divide(const double *A, double *B, const int dim)
{
    int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        //if(A[tIdx * dim + tIdx] != 0)
        B[tIdx] /= A[tIdx * dim + tIdx];
        tIdx += blockDim.x * gridDim.x;
    }
}
// NAME OF FUNCTION: __absolute
// PURPOSE:
// The function will calculate the absolute value for each
// number in an array
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* reference |A[i]|
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
__global__ void __absolute(double *A, const int dim)
{
    int tIdx = blockIdx.x * blockDim.x + threadIdx.x;
    while(tIdx < dim){
        if(A[tIdx] < 0)
            A[tIdx] = -A[tIdx];
        tIdx += blockDim.x * gridDim.x;
    }
}
// NAME OF FUNCTION: Jacobi_Cuda
// PURPOSE:
// The function will calculate a X solution for a linear system
// using Jacobi's Method.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// Matrix_A double* value Matrix A(coefficients)
// Vector_B double* value Vector B
// Vector_X double* reference Solution
// dim int value Matrix Dimension
// e double value Error allowed
// maxIter int value Maximum iterations allowed
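// t double* reference elapsed time in seconds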
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
void Jacobi_Cuda(const double *Matrix_A,
                 const double *Vector_B,
                 double *Vector_X,
                 const int dim,
                 const double e,
                 const int maxIter,
                 double *t)
{
    /** Host variables **/
    int iter = 0;    // iter counter
    double err = 1;  // error between X^k and X^k-1
    double *tmp;     // temporary for thrust norm
    double *norm;    // Vector norm
    tmp = (double *) malloc(sizeof(double) * dim);
    norm = (double *) malloc(sizeof(double));
    int dimBlocks, dimThreads;
    dimThreads = 64;
    dimBlocks = imin(32, (dim + dimThreads - 1)/dimThreads);
    /** ************** **/
    /** Device variables **/
    double *d_Matrix_A, *d_Vector_B, *d_Vector_X, *d_Vector_Y, *d_Vector_Resul;
    cudaMalloc((void**)&d_Matrix_A, sizeof(double) * dim * dim);
    cudaMalloc((void**)&d_Vector_B, sizeof(double) * dim);
    cudaMalloc((void**)&d_Vector_X, sizeof(double) * dim);
    cudaMalloc((void**)&d_Vector_Y, sizeof(double) * dim);
    cudaMalloc((void**)&d_Vector_Resul, sizeof(double) * dim);
    /** **************** **/
    /** Initialize **/
    cudaMemcpy(d_Matrix_A, Matrix_A, sizeof(double) * dim * dim,
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_Vector_B, Vector_B, sizeof(double) * dim, cudaMemcpyHostToDevice);
    cudaMemcpy(d_Vector_X, Vector_X, sizeof(double) * dim, cudaMemcpyHostToDevice);
    /** ********** **/
    clock_t start, finish;
    double totaltime;
    start = clock();
    /** Jacobi **/
    while(err > e && iter < maxIter){
        // Y = X (keep the previous iterate)
        __copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);
        // Resul[i] = sum over j != i of A[i][j] * Y[j]
        __Jacobi_sum<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_Y,
                                                d_Vector_Resul, dim);
        // X = B - Resul
        __substract<<<dimBlocks, dimThreads>>>(d_Vector_B, d_Vector_Resul,
                                               d_Vector_X, dim);
        // X[i] = X[i] / A[i][i]
        __divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);
        // Resul = Y - X (difference between successive iterates)
        __substract<<<dimBlocks, dimThreads>>>(d_Vector_Y, d_Vector_X,
                                               d_Vector_Resul, dim);
        // Resul[i] = |Resul[i]|
        __absolute<<<dimBlocks, dimThreads>>>(d_Vector_Resul, dim);
        cudaMemcpy(tmp, d_Vector_Resul, sizeof(double) * dim, cudaMemcpyDeviceToHost);
        double *maxPtr = thrust::max_element(tmp, tmp + dim); // vector norm
        err = *maxPtr;
        iter++;
    }
    finish = clock();
    totaltime = (double)(finish - start)/CLOCKS_PER_SEC;
    *t = totaltime;
    cudaMemcpy(Vector_X, d_Vector_X, sizeof(double) * dim,
               cudaMemcpyDeviceToHost);
    if(iter == maxIter)
        puts("Jacobi has reached maxIter!");
    /** ****** **/
    /** Free memory **/
    cudaFree(d_Matrix_A);
    cudaFree(d_Vector_B);
    cudaFree(d_Vector_X);
    cudaFree(d_Vector_Y);
    cudaFree(d_Vector_Resul);
    free(tmp);
    free(norm);
    /** *********** **/
}
Upvotes: 1
Views: 381
Reputation: 33406
It depends on your algorithm. Some algorithms are by definition non-parallelizable (calculating the Fibonacci series, for example). But here's a parallelizable Jacobi algorithm courtesy of Brown. Note that systems of equations CAN be solved either serially or in parallel; it's just a matter of writing the code.
In short, it's impossible to know whether or not more threads = more speed unless you show us (or at least explain) the algorithm. As far as thread synchronization goes, CUDA is very (very) good at normalizing synchronization costs, so (if your algorithm is proper) more threads should almost always yield more speed.
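For a rough idea of why it parallelizes, here is a minimal sketch of one Jacobi sweep (the kernel name and the x_old/x_new buffers are placeholders of mine, not your code): each thread owns one or more rows, and no row depends on another within a sweep, so more threads simply means more rows updated at once.
__global__ void jacobi_sweep(const double *A, const double *b,
                             const double *x_old, double *x_new,
                             const int dim)
{
    // grid-stride loop: any <<<blocks, threads>>> configuration covers all rows
    for (int row = blockIdx.x * blockDim.x + threadIdx.x;
         row < dim;
         row += gridDim.x * blockDim.x)
    {
        double sigma = 0.0;
        for (int j = 0; j < dim; j++)
            if (j != row)
                sigma += A[row * dim + j] * x_old[j];
        x_new[row] = (b[row] - sigma) / A[row * dim + row];
    }
}
With something like that, raising the thread count should keep helping until memory bandwidth saturates; if it doesn't, the launch configuration or the kernel itself is the thing to inspect.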
Upvotes: 4
Reputation: 155708
Fewer threads might be more efficient if the workload is small enough that the overhead of managing many threads causes the performance degradation.
...but without seeing your code it's hard to say. Personally, I'm more inclined to believe it's just a bug in your code.
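If you want to separate a genuine overhead effect from a bug, one quick experiment (a sketch only; dummy_kernel is a stand-in for one of your own kernels) is to time the same kernel at both block sizes with CUDA events rather than clock():
#include <cstdio>
#include "cuda_runtime.h"

// Stand-in kernel with a grid-stride loop, like the kernels in the question.
__global__ void dummy_kernel(double *v, const int dim)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < dim;
         i += gridDim.x * blockDim.x)
        v[i] += 1.0;
}

int main()
{
    const int dim = 9900;
    double *d_v;
    cudaMalloc((void**)&d_v, sizeof(double) * dim);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int configs[2] = {32, 256};
    for (int i = 0; i < 2; i++) {
        int threads = configs[i];
        int blocks = (dim + threads - 1) / threads;
        if (blocks > 32) blocks = 32;       // same cap as in the question

        cudaEventRecord(start);
        dummy_kernel<<<blocks, threads>>>(d_v, dim);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);         // wait for the kernel to finish

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("%3d threads/block: %f ms\n", threads, ms);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_v);
    return 0;
}
If the two configurations time about the same here but differ a lot in your program, the gap is coming from your code rather than from thread-management overhead.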
Upvotes: 0