p_kajaria

Reputation: 97

LU Decomposition in CUDA

I have been trying to implement LU decomposition. The code works for matrices up to size 100x100 but fails for larger ones. I have no clue what is happening. I changed the max-finding function, but got similar results. Please help.

__device__ float max_value = 0;
__device__ int max_index;
__device__ int lock = 0;
__device__ int blockLock = 0;

__global__ void maxIndex(float *LU, int n, int col)
{
max_value = 0;
max_index = INT_MAX;

__shared__ float inter[64];
__shared__ int indexBlock;
__shared__ int threadLast;

indexBlock = INT_MAX;
threadLast = INT_MAX;

int index = n*(blockDim.x*blockIdx.x + threadIdx.x + col) + col;
float myValue = fabsf(LU[index]);
int noElem = n - col - blockIdx.x*blockDim.x > blockDim.x ? blockDim.x : n - col - blockIdx.x*blockDim.x;

if(index < n*n)
    inter[threadIdx.x] = fabsf(LU[index]);
else
    inter[threadIdx.x] = FLT_MIN;

__syncthreads();

int h = ceil(log2((float)noElem));

for(int d=0; d<h; d++)
{
    if(threadIdx.x < ceil(noElem/exp2((float)(d+1))))
    {
        int parent = threadIdx.x;
        int left = 2*parent;
        int right = 2*parent + 1;

        if(right < ceil(noElem/exp2((float)d)))
            inter[parent] = (inter[left] >= inter[right]) ? inter[left] : inter[right];
        else
            inter[parent] = inter[left];
    }
    __syncthreads();
}

__syncthreads();

int loop = 1;
if(myValue == inter[0])
{
    while(loop)
    {
        if( 0 == atomicCAS( &blockLock, 0, 1 ) )
        {
            if(threadIdx.x < threadLast)
            {
                threadLast = threadIdx.x;
                indexBlock = blockDim.x*blockIdx.x + threadIdx.x + col;
            }
            __threadfence_block();

            atomicExch( &blockLock, 0);
            loop = 0;
        }
     }
}   

__syncthreads();

if( threadIdx.x == 0 && index < n*n)
{
    while(0 != atomicCAS(&lock, 0, 1));

    if((max_value < inter[0]) || (max_value == inter[0] && indexBlock < max_index))
    {
        max_value = inter[0];
        max_index = indexBlock;
    }

    __threadfence();

    atomicExch(&lock, 0);   
}
}

__global__ void swap(float *LU, int n, int row)
{
if(max_index == row)
    return;

float temp;
int index =  blockDim.x*blockIdx.x + threadIdx.x;

if(index < n)
{
    temp = LU[index + n*row];
    LU[index + n*row] = LU[index + n*max_index];
    LU[index + n*max_index] = temp;
}

}

__global__ void elimination(float *LU, int n, int row)
{
float factor;

int indexX = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;
int indexY = row + 1 + blockIdx.y;


if((indexX < n) && (indexY < n))
{
    factor = LU[n*indexY + row] / LU[n*row + row];

    LU[n*indexY + indexX] -= LU[n*row + indexX]*factor; 
}

__syncthreads();

if(blockIdx.x == 0 && threadIdx.x == 0)
{
    LU[n*indexY + row] = factor;
    __threadfence();
}

}

int luDecomposeP(float *LU, int n)
{
int i, noOfThreadsPerBlock = 64, noOfBlocks, sharedSize;
float pivotValue;
float *dLU;

cudaMalloc((void **)&dLU, n*n*sizeof(float));

cudaMemcpy(dLU, LU, n*n*sizeof(float), cudaMemcpyHostToDevice);

dim3 gridDim(1,1,1);
dim3 blockDim(noOfThreadsPerBlock,1,1);

for(i=0; i<n-1; i++)
{
    noOfBlocks = ceil((float)(n-i) / (float)noOfThreadsPerBlock);
    sharedSize = ((noOfThreadsPerBlock < (n-i) ? noOfThreadsPerBlock : n-i) + 3) * sizeof(float);
    maxIndex <<< noOfBlocks, noOfThreadsPerBlock, sharedSize >>> (dLU, n, i);
    //maxIndex <<< 1, noOfThreadsPerBlock >>> (dLU, n, i);

    cudaMemcpyFromSymbol(&pivotValue, max_value, sizeof(pivotValue), 0, cudaMemcpyDeviceToHost);
    if(pivotValue <= 1e-20F)
        return -1;

    noOfBlocks = ceil((float)n / (float)noOfThreadsPerBlock);
    swap <<< noOfBlocks, noOfThreadsPerBlock >>> ( dLU, n, i);

    gridDim.x = ceil((float)(n-i) / (float)noOfThreadsPerBlock);
    gridDim.y = n-i-1;
    elimination <<< gridDim, blockDim >>> ( dLU, n, i);

}

cudaMemcpy(LU, dLU, n*n*sizeof(float), cudaMemcpyDeviceToHost);

cudaFree(dLU);

return 0;
}

Upvotes: 1

Views: 3746

Answers (2)

Vitality

Reputation: 21475

Although this problem has perhaps been solved by now, I think it is useful to provide other users with an example of how to use cuBLAS cublas<t>getrfBatched() to compute the LU decomposition of a matrix.

#include <stdio.h>

#include "cuda_runtime.h" 
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"

int main() {

    const unsigned int N = 3; 

    const unsigned int Nmatrices = 1;

    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    // --- Matrices to be inverted (only one in this example)
    float *h_A = new float[N*N*Nmatrices];

    h_A[0] = 4.f;  
    h_A[1] = 3.f;
    h_A[2] = 8.f;
    h_A[3] = 9.f;
    h_A[4] = 5.f; 
    h_A[5] = 1.f; 
    h_A[6] = 2.f; 
    h_A[7] = 7.f;
    h_A[8] = 6.f;

    // --- Allocate device matrices 
    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N*N*Nmatrices*sizeof(float)));

    // --- Move the matrix to be inverted from host to device
    gpuErrchk(cudaMemcpy(d_A,h_A,N*N*Nmatrices*sizeof(float),cudaMemcpyHostToDevice));

    // --- Creating the array of pointers needed as input to the batched getrf
    float **h_inout_pointers = (float **)malloc(Nmatrices*sizeof(float *));
    for (int i=0; i<Nmatrices; i++) h_inout_pointers[i]=(float *)((char*)d_A+i*((size_t)N*N)*sizeof(float));

    float **d_inout_pointers;
    gpuErrchk(cudaMalloc((void**)&d_inout_pointers, Nmatrices*sizeof(float *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers,h_inout_pointers,Nmatrices*sizeof(float *),cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_PivotArray; gpuErrchk(cudaMalloc((void**)&d_PivotArray, N*Nmatrices*sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc((void**)&d_InfoArray,  Nmatrices*sizeof(int)));

    int *h_PivotArray = (int *)malloc(N*Nmatrices*sizeof(int));
    int *h_InfoArray  = (int *)malloc(  Nmatrices*sizeof(int));

    cublasSafeCall(cublasSgetrfBatched(handle, N, d_inout_pointers, N, d_PivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasSgetrfBatched(handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray,d_InfoArray,Nmatrices*sizeof(int),cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i]  != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    gpuErrchk(cudaMemcpy(h_A,d_A,N*N*sizeof(float),cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_PivotArray,d_PivotArray,N*Nmatrices*sizeof(int),cudaMemcpyDeviceToHost));

    for (int i=0; i<N*N; i++) printf("A[%i]=%f\n", i, h_A[i]);

    printf("\n\n");
    for (int i=0; i<N; i++) printf("P[%i]=%i\n", i, h_PivotArray[i]);

    return 0;
}

The Utilities.cuh and Utilities.cu files needed to compile and run this example are maintained at the CUDA-Utilities repository.
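For readers who do not want to pull in that repository, a minimal stand-in for the two error-checking helpers used above could look like this (my own sketch, not the actual Utilities.cuh implementation):

#include <cstdio>
#include <cstdlib>

// Sketch of gpuErrchk: abort with a message if a CUDA runtime call fails.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(EXIT_FAILURE);
    }
}

// Sketch of cublasSafeCall: abort with a message if a cuBLAS call fails.
#define cublasSafeCall(ans) { cublasAssert((ans), __FILE__, __LINE__); }
inline void cublasAssert(cublasStatus_t code, const char *file, int line)
{
    if (code != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS error %d at %s %d\n", (int)code, file, line);
        exit(EXIT_FAILURE);
    }
}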

Note that

  1. cublas<t>getrfBatched() can also be invoked without pivoting; see the commented-out cublas<t>getrfBatched() call line;
  2. cublas<t>getrfBatched() is able to perform the LU decomposition of many matrices at once. The structure of the example provided above is general, although it refers to the case of one matrix only.

The example presented above is the same as the one provided in the Scientific Computing Software Library (SCSL) User's Guide, see Example 3-1, page 21. Note that cublas<t>getrfBatched() overwrites the input matrices with the corresponding LU factorizations. The full L and U matrices can be extracted by the following Matlab code (the Matlab code detailed in the cuBLAS User's Guide does not seem to be correct):

L = eye(3);
for k = 1:3
    L(k+1:3,k) = M(k+1:3,k);
end
U = zeros(3);
for k = 1:3
    U(k,k:3) = M(k,k:3);
end
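The same splitting can also be done on the host in C right after h_A is copied back from the device; a sketch, assuming the column-major storage that cuBLAS uses (element (r, c) sits at h_A[c*N + r]):

// Sketch: peel the unit lower triangular L and the upper triangular U
// off the factorized matrix returned in h_A (column-major).
float *h_L = new float[N*N];
float *h_U = new float[N*N];
for (unsigned int c = 0; c < N; c++)
    for (unsigned int r = 0; r < N; r++) {
        float m = h_A[c*N + r];
        h_L[c*N + r] = (r > c) ? m : ((r == c) ? 1.f : 0.f);
        h_U[c*N + r] = (r <= c) ? m : 0.f;
    }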

The permutation matrix P can be extracted from the d_PivotArray vector by the following Matlab code:

P1 = eye(3);
temp = P1(:,1);
P1(:,1) = P1(:,3);
P1(:,3) = temp;

P2 = eye(3);
temp = P2(:,2);
P2(:,2) = P2(:,3);
P2(:,3) = temp;

Concerning the permutation matrix, both the cuBLAS User's Guide and the Scientific Computing Software Library (SCSL) User’s Guide seem to be mistaken.

In this way,

P * A = L * U

where

P = P2 * P1
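For completeness, P can also be reconstructed in C directly from h_PivotArray; a sketch, relying on the fact that cublas<t>getrfBatched() returns 1-based pivot indices that are meant to be applied in order:

// Sketch: compose the row swaps recorded in h_PivotArray. After the loop,
// perm[k] holds the original row index that ends up in row k of P*A.
int perm[N];
for (unsigned int k = 0; k < N; k++) perm[k] = k;
for (unsigned int k = 0; k < N; k++) {
    int j = h_PivotArray[k] - 1;              // row swapped with row k at step k
    int tmp = perm[k]; perm[k] = perm[j]; perm[j] = tmp;
}

// Build P (column-major) so that P * A = L * U: element (k, perm[k]) = 1.
float *h_P = new float[N*N]();                // zero-initialized
for (unsigned int k = 0; k < N; k++)
    h_P[perm[k]*N + k] = 1.f;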

Upvotes: 2

gudasergey

Reputation: 59

__syncthreads() in the elimination kernel synchronizes only the threads of the same block. For large matrices, the assignment

LU[n*indexY + row] = factor;

is evaluated in some blocks earlier than the statement

factor = LU[n*indexY + row] / LU[n*row + row];

is in others, so those blocks read an already overwritten multiplier. Therefore you get incorrect values.
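One way to remove this race (a sketch of mine, not code from the question) is to split the kernel in two: store all the multipliers in a first launch, then update the trailing submatrix in a second one. Kernel launches on the same stream execute in order, so no block can overwrite a value another block still has to read:

// Sketch: first store the multipliers for column 'row' ...
__global__ void computeFactors(float *LU, int n, int row)
{
    int indexY = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;

    if(indexY < n)
        LU[n*indexY + row] /= LU[n*row + row];
}

// ... then update the trailing submatrix in a separate launch, reading the
// multipliers already stored in column 'row' by the previous kernel.
__global__ void updateSubmatrix(float *LU, int n, int row)
{
    int indexX = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;
    int indexY = row + 1 + blockIdx.y;

    if((indexX < n) && (indexY < n))
        LU[n*indexY + indexX] -= LU[n*indexY + row] * LU[n*row + indexX];
}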

Upvotes: 1
