Reputation: 1
I'm trying to invert a matrix of complex numbers, starting from the real-valued matrix inversion code posted by 'user' in the following link: cuda matrix inverse gaussian jordan
The code compiles and runs without errors, but the output is wrong, and I can't see where I went wrong. Can anyone please help? Thank you in advance!
Here is the complete code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;
    if (x < n && y < n)
        if (x > i) {
            P = A[x*n+i] / A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
            if (y >= i) {
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
            }
        }
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0, 0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < h && y < h)
        if (cuCimag(d_A[x*h+x]) != cuCimag(temp)) {
            if (cuCreal(d_A[x*h+x]) != cuCreal(temp)) {
                dI[x*h+y] = dI[x*h+y] / d_A[x*h+x];
                d_A[x*h+y] = d_A[x*h+y] / d_A[x*h+x];
            }
        }
    __syncthreads();
}
int main()
{
int const n = 3;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCimag(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *d_L, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
dim3 numBlocks(16,16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from CPU to GPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost );
cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost );
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCimag(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
Thank you @RobertCrovella for your fast and very insightful suggestion! Regarding your answer to my question: I changed to threadsPerBlock(4,4) and numBlocks(1,1), so I'm using one block with 16 threads for my 4x4 matrix. My input matrix is the following:
1 0 0 0
0 2 0 0
0 0 3 0
0 0 0 4
All the numbers here are real, so the expected inverse should look like:
1 0 0 0
0 1/2 0 0
0 0 1/3 0
0 0 0 1/4
but I'm not getting this at all. I ran the cuda-memcheck tool to check whether my kernels were failing to launch, but it didn't show any error messages. I started learning CUDA very recently and don't have much experience. Can anyone give a more detailed response? Thank you!
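For what it's worth, here is a small host-side check I plan to use to verify whatever the GPU returns (a minimal sketch of my own, not part of the original code; it assumes the same row-major cuDoubleComplex layout and the cuComplex.h / math.h headers used in the code below). It multiplies the original matrix by the computed inverse and tests the product against the identity:
// Host-side sanity check (my own sketch): multiply the original matrix L by the
// computed inverse iL and verify the product is (close to) the identity matrix.
// Assumes row-major storage and that cuComplex.h and math.h are included, as below.
bool checkInverse(const cuDoubleComplex *L, const cuDoubleComplex *iL, int n, double tol = 1e-9)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            cuDoubleComplex acc = make_cuDoubleComplex(0, 0);
            for (int k = 0; k < n; k++)
                acc = cuCadd(acc, cuCmul(L[i*n + k], iL[k*n + j]));   // (L * iL)[i][j]
            double expected = (i == j) ? 1.0 : 0.0;                   // identity entry
            if (fabs(cuCreal(acc) - expected) > tol || fabs(cuCimag(acc)) > tol)
                return false;
        }
    }
    return true;
}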
Here is my modified code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;
    if (x < n && y < n)
        if (x > i) {
            P = A[x*n+i] / A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
            if (y >= i) {
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
            }
        }
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0, 0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < h && y < h)
        if (cuCimag(d_A[x*h+x]) != 0) {
            if (cuCreal(d_A[x*h+x]) != 0) {
                dI[x*h+y] = dI[x*h+y] / d_A[x*h+x];
                d_A[x*h+y] = d_A[x*h+y] / d_A[x*h+x];
            }
        }
    __syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from CPU to GPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
system("Pause");
return 0;
}
Upvotes: 0
Views: 1679
Reputation: 151799
DISCLAIMER: I am not an expert on matrix inversion. I have not worked through the details of the differences between real matrix inversion and complex matrix inversion (there shouldn't be many differences, I think). As suggested already, there are probably better/faster ways to invert matrices.
The immediate problem seems to be in your dev kernel, particularly here:
if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){
This requires that both the real and the imaginary part of the d_A matrix element in question be non-zero in order for the dev kernel to do any work. However, I don't think this condition should be necessary. For division, we probably only require that either the real or the imaginary part be non-zero. In the complex domain we are actually dividing by zero only if both the real and imaginary parts are zero. If you inspect the cuCdiv function provided in cuComplex.h, you can ascertain for yourself under what conditions it will "blow up" and therefore what conditions need to be tested for and avoided. I'm confident your test is not correct.
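To spell that out (this is my own sketch, not code taken from cuComplex.h): complex division (a+bi)/(c+di) effectively multiplies through by the conjugate, so the denominator becomes c^2+d^2, which is zero only when c and d are both zero. A pivot test therefore only needs to reject the all-zero case, for example with a hypothetical helper like this:
// Hypothetical helper, not part of cuComplex.h: a cuDoubleComplex is a usable
// divisor as long as either its real or its imaginary part is non-zero.
__device__ __host__ inline bool isNonZero(cuDoubleComplex z)
{
    return (cuCreal(z) != 0.0) || (cuCimag(z) != 0.0);
}

// The pivot check in the dev kernel would then read:
//   if (x < h && y < h)
//       if (isNonZero(d_A[x*h+x])) { /* divide row x by the pivot */ }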
The following modified code works correctly for me, for your simple test case:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <math.h>
#include "cuComplex.h"
#include <complex>
__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }
using namespace std;
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void gaussjordan(cuDoubleComplex *A, cuDoubleComplex *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;
    if (x < n && y < n)
        if (x > i) {
            P = A[x*n+i] / A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
            if (y >= i) {
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
            }
        }
}
__global__ void dev(cuDoubleComplex *d_A, cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0, 0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < h && y < h)
        if ((cuCimag(d_A[x*h+x]) != 0) || (cuCreal(d_A[x*h+x]) != 0)) {
            dI[x*h+y] = dI[x*h+y] / d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y] / d_A[x*h+x];
        }
    __syncthreads();
}
int main()
{
int const n= 4;
// creating input
cuDoubleComplex iL[n*n],L[n*n], I[n*n];
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
else L[i*n+j] = make_cuDoubleComplex(0,0);
printf("%.2f ", cuCreal(L[i*n+j]));
}
printf("\n");
}
printf("\n");
cuDoubleComplex *d_A, *dI;
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int ddsize = n*n*sizeof(cuDoubleComplex);
dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
dim3 numBlocks(1,1); //!!!!!!!!!!!!!!!!!!
// memory allocation
cudaMalloc( (void**) &d_A, ddsize);
cudaMalloc( (void**) &dI, ddsize);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
else I[i*n+j]=make_cuDoubleComplex(0,0);
}
}
//copy data from CPU to GPU
cudaMemcpy( d_A, L, ddsize, cudaMemcpyHostToDevice);
cudaMemcpy( dI, I, ddsize, cudaMemcpyHostToDevice);
//timer start
cudaEventRecord( start, 0);
// L^(-1)
for(int i=0;i<n;i++){
gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
gpuErrchk( cudaPeekAtLastError() );
}
dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &time, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
printf("%.2f ", cuCreal(iL[i*n+j]));
}
printf("\n");
}
printf("\n");
std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
cudaFree(d_A);
cudaFree(dI);
return 0;
}
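As mentioned at the top, there are probably better/faster ways to invert matrices. For reference, one alternative is cuBLAS, which provides batched LU factorization and inversion for double-complex matrices (cublasZgetrfBatched / cublasZgetriBatched). The following is only a minimal sketch under my assumptions (a single non-singular matrix treated as a batch of one, minimal error checking), not a drop-in replacement for your code:
// Minimal sketch: invert one n x n cuDoubleComplex matrix with the cuBLAS batched
// LU routines. Assumptions: the matrix is non-singular, batch size is 1, and most
// error checking is omitted for brevity. Link against cuBLAS (-lcublas).
// Note: cuBLAS stores matrices column-major; for this diagonal test case the
// layout makes no difference.
#include <cstdio>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuComplex.h>

int main()
{
    const int n = 4;
    cuDoubleComplex hA[n*n], hC[n*n];
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            hA[i*n+j] = (i == j) ? make_cuDoubleComplex(i+1, 0)
                                 : make_cuDoubleComplex(0, 0);

    cuDoubleComplex *dA, *dC;
    cudaMalloc((void**)&dA, n*n*sizeof(cuDoubleComplex));
    cudaMalloc((void**)&dC, n*n*sizeof(cuDoubleComplex));
    cudaMemcpy(dA, hA, n*n*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    // The batched routines take a device array of device pointers (batch of 1 here).
    cuDoubleComplex **dAarray, **dCarray;
    cudaMalloc((void**)&dAarray, sizeof(cuDoubleComplex*));
    cudaMalloc((void**)&dCarray, sizeof(cuDoubleComplex*));
    cudaMemcpy(dAarray, &dA, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice);
    cudaMemcpy(dCarray, &dC, sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice);

    int *dPivot, *dInfo;
    cudaMalloc((void**)&dPivot, n*sizeof(int));
    cudaMalloc((void**)&dInfo, sizeof(int));

    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasZgetrfBatched(handle, n, dAarray, n, dPivot, dInfo, 1);             // LU factorize in place
    cublasZgetriBatched(handle, n, dAarray, n, dPivot, dCarray, n, dInfo, 1); // inverse into dC

    cudaMemcpy(hC, dC, n*n*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) printf("%.2f ", cuCreal(hC[i*n+j]));
        printf("\n");
    }
    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dC); cudaFree(dAarray); cudaFree(dCarray);
    cudaFree(dPivot); cudaFree(dInfo);
    return 0;
}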
FINAL DISCLAIMER: I'm not saying this is a fully-validated approach to inversion of matrices of arbitrary dimensions. I'm simply pointing out a critical bug that seems to make it fail for your simple test case. I also expressed some reservations in the previous question you linked.
Upvotes: 1