Siarczansodu99
Siarczansodu99

Reputation: 13

How to make my Cuda kernel run on bigger matrixes?

So my code is suppsed to work like this:

-take in_martix of NxN elements and R factor

-it should give back a matrix of size [N-2R]*[N-2R] with each element being a sum of in_matrix elements in R radius it should work like this for N=4 R=1 Even though my code works for smaller matrixes, for bigger ones like 1024 or 2048 or even bigger R factors it gives back a matrix of 0's. Is it a problem inside my code or just my GPU can't compute more calculations ? Code: (for testing purposes initial matrix is filled with 1's so every element of out_matrix should == (2R+1)^2

#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
#include <cuda_profiler_api.h>
#define N 1024
#define R 128
#define K 1
#define THREAD_BLOCK_SIZE 8
using namespace std;

__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
    int tx = threadIdx.x + blockIdx.x * blockDim.x;
    int ty = threadIdx.y + blockIdx.y * blockDim.y;
    int out_local = 0;

    for (int col = tx; col <= tx + 2 * d_R ; col++)
        for (int row = ty; row <= ty + 2 * d_R ; row++)
            out_local += *(d_tab_begin + col * d_N + row);

    *(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;

}
void random_ints(int tab[N][N]) {
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            tab[i][j] = 1;
}

int main() {

    static int tab[N][N];
    random_ints(tab);

    int tab_size = sizeof(int) * N * N;
    int out_size = sizeof(int) * (N - 2 * R) * (N - 2 * R);

    dim3 BLOCK_SIZE(THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE);
    dim3 GRID_SIZE(ceil((float)N / (float)(THREAD_BLOCK_SIZE  )), ceil((float)N / (float)(THREAD_BLOCK_SIZE  )));

    void** d_tab;
    void** d_out;

    cudaMalloc((void**)&d_tab, tab_size);
    cudaMalloc((void**)&d_out, out_size);

    cudaMemcpyAsync(d_tab, tab, tab_size, cudaMemcpyHostToDevice);

    int* d_tab_begin = (int*)(d_tab);
    int* d_out_begin = (int*)(d_out);

    MatrixStencil << < GRID_SIZE, BLOCK_SIZE>> > (d_tab_begin, d_out_begin, N, R, K);

    int* out = (int*)malloc(out_size);
    
    cudaMemcpyAsync(out, d_out, out_size, cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();

    for (int col = 0; col < N - 2 * R; col++)
    {
        for (int row = 0; row < N - 2 * R; row++)
        {
            cout << *(out + ((col * (N - 2 * R)) + row)) << " ";
        }
        cout << endl;
    }
}

Upvotes: 0

Views: 66

Answers (1)

Siarczansodu99
Siarczansodu99

Reputation: 13

Finally thanks to Robert I found how to make the code work - by adding if statment

if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
        for (int col = tx; col <= tx + 2 * d_R; col++)
            for (int row = ty; row <= ty + 2 * d_R; row++)
                out_local += *(d_tab_begin + col * d_N + row);

        *(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
    }

Upvotes: 1

Related Questions