Reputation: 13
So my code is suppsed to work like this:
-take in_martix of NxN elements and R factor
-it should give back a matrix of size [N-2R]*[N-2R] with each element being a sum of in_matrix elements in R radius it should work like this for N=4 R=1 Even though my code works for smaller matrixes, for bigger ones like 1024 or 2048 or even bigger R factors it gives back a matrix of 0's. Is it a problem inside my code or just my GPU can't compute more calculations ? Code: (for testing purposes initial matrix is filled with 1's so every element of out_matrix should == (2R+1)^2
#include "cuda_runtime.h"
#include <stdio.h>
#include <iostream>
#include <cuda_profiler_api.h>
#define N 1024
#define R 128
#define K 1
#define THREAD_BLOCK_SIZE 8
using namespace std;
__global__ void MatrixStencil(int* d_tab_begin, int* d_out_begin, int d_N, int d_R, int d_K) {
int tx = threadIdx.x + blockIdx.x * blockDim.x;
int ty = threadIdx.y + blockIdx.y * blockDim.y;
int out_local = 0;
for (int col = tx; col <= tx + 2 * d_R ; col++)
for (int row = ty; row <= ty + 2 * d_R ; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
void random_ints(int tab[N][N]) {
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
tab[i][j] = 1;
}
int main() {
static int tab[N][N];
random_ints(tab);
int tab_size = sizeof(int) * N * N;
int out_size = sizeof(int) * (N - 2 * R) * (N - 2 * R);
dim3 BLOCK_SIZE(THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE);
dim3 GRID_SIZE(ceil((float)N / (float)(THREAD_BLOCK_SIZE )), ceil((float)N / (float)(THREAD_BLOCK_SIZE )));
void** d_tab;
void** d_out;
cudaMalloc((void**)&d_tab, tab_size);
cudaMalloc((void**)&d_out, out_size);
cudaMemcpyAsync(d_tab, tab, tab_size, cudaMemcpyHostToDevice);
int* d_tab_begin = (int*)(d_tab);
int* d_out_begin = (int*)(d_out);
MatrixStencil << < GRID_SIZE, BLOCK_SIZE>> > (d_tab_begin, d_out_begin, N, R, K);
int* out = (int*)malloc(out_size);
cudaMemcpyAsync(out, d_out, out_size, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int col = 0; col < N - 2 * R; col++)
{
for (int row = 0; row < N - 2 * R; row++)
{
cout << *(out + ((col * (N - 2 * R)) + row)) << " ";
}
cout << endl;
}
}
Upvotes: 0
Views: 66
Reputation: 13
Finally thanks to Robert I found how to make the code work - by adding if statment
if ((tx < d_N - 2 * d_R) && (ty < d_N - 2 * d_R)) {
for (int col = tx; col <= tx + 2 * d_R; col++)
for (int row = ty; row <= ty + 2 * d_R; row++)
out_local += *(d_tab_begin + col * d_N + row);
*(d_out_begin + (tx) * (d_N - 2 * R) + ty) = out_local;
}
Upvotes: 1