Reputation: 17
I've just started learning cuda and I was wondering the performance of CUDA code versus cpu code on simple string search.
Search Method: If first x characters of data string is exactly the same with the keyword, then it returns true.(x => size of keyword)
There are 100 keywords and 10000 data strings. What I was trying to accomplish here is doing the comparisons in a concurrent way and comparing the elapsed time. I've written 4 different kernels and a cpu code. Yet, the results I've got were rather confusing.
searchKeywordKernel: Creates 4*32 threads. Each thread takes one of the keywords and compares it with 10000 data strings, then writes the results into a bool array. It took 2650ms.
searchKeywordKernel2: Creates 10*1024 threads. Each thread takes one of the data strings and compares it with 100 keywords, then writes the results into a bool array. It took 1397ms.
searchKeywordKernel3: Creates 1*1 thread. It behaves like cpu code and it took 279ms to produce the result.
searchKeywordKernel4: Creates 977*1024 threads. Each thread takes one of the string comparisons and it took 1334ms.
CPU: makes 1000000 string comparisons. It took 265ms.
I would like to ask a couple of questions:
Why did searchKeywordKernel3 generate the results in similar time as cpu code? I double checked the code but couldn't find any problem.
Why did the cpu code work better compared to kernels excluding searchKeywordKernel3?
Would it be because of read operations or size of the data?
Hardware info: Graphic Card:NVidia GT730, Processor: Intel i5-4460.
Code used to generate results is:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250
using namespace std;
__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
int keywordStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
int dataStringIndex = 0;
int keywordCharIndex = 0;
int dataCharIndex = 0;
int resultIndex = 0;
if (keywordStringIndex < SEARCHTERMSIZE)
{
for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
{
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
}
}
}
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
int keywordStringIndex = 0;
int dataStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
int keywordCharIndex = 0;
int dataCharIndex = 0;
int resultIndex = 0;
if (dataStringIndex < SEARCHITEMSIZE)
{
for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
{
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
}
}
}
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
int keywordStringIndex = 0;
int dataStringIndex = 0;
int keywordCharIndex = 0;
int dataCharIndex = 0;
int resultIndex = 0;
if (threadIdx.x + blockIdx.x * blockDim.x < 1)
{
for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
{
for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
{
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
resultIndex++;
}
}
}
}
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id < SEARCHTERMSIZE*SEARCHITEMSIZE)
{
int keywordStringIndex = id / SEARCHITEMSIZE;
int dataStringIndex = id%SEARCHITEMSIZE;
int keywordCharIndex;
int dataCharIndex;
int resultIndex;
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* data = new char[SEARCHITEMSIZE*STRINGSIZE];
int temp = 0;
int dataIndex = 0;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
dataIndex = i*STRINGSIZE;
temp = rand() % (STRINGSIZE-21) + 20;
for (int k = 0; k < temp; k++)
{
data[dataIndex] = 'a';
dataIndex++;
}
data[dataIndex] = '\0';
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" <<endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE];
int keywordIndex = 0;
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
keywordIndex = i*STRINGSIZE;
temp = rand() % (STRINGSIZE - 21) + 20;
for (int k = 0; k < temp; k++)
{
keyword[keywordIndex] = 'a';
keywordIndex++;
}
keyword[keywordIndex] = '\0';
keywordIndex++;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
char* d_data;
char* d_keyword;
bool* d_result;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE);
cudaMalloc(&d_keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
cudaMalloc(&d_result, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
cudaMemcpy(d_data, data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keyword, keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
cudaMemcpy(result, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
////////////////////////Kernel2//////////////////////////////////////////
cout << "Before Kernel2" << endl;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
searchKeywordKernel2 << < (SEARCHITEMSIZE/1024) +1 , 1024 >> >(d_result, d_data, d_keyword);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "After Kernel2: " << elapsedTime << "ms" << endl;
////////////////////////Kernel2//////////////////////////////////////////
cudaMemcpy(result2, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
////////////////////////Kernel3//////////////////////////////////////////
cout << "Before Kernel3" << endl;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
searchKeywordKernel3 << <1, 1 >> >(d_result, d_data, d_keyword);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "After Kernel3: " << elapsedTime << "ms" << endl;
////////////////////////Kernel3//////////////////////////////////////////
cudaMemcpy(result3, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
////////////////////////Kernel4//////////////////////////////////////////
cout << "Before Kernel4" << endl;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
searchKeywordKernel4 << <((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >> >(d_result, d_data, d_keyword);
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cout << "After Kernel4: " << elapsedTime << "ms" << endl;
////////////////////////Kernel4//////////////////////////////////////////
cudaMemcpy(result4, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
/*
for (int i = 0; i < 10; i++)
{
for (int j = 0; j < 10; j++)
cout << boolalpha << i << " vs " << j << ": " << result4[i*SEARCHITEMSIZE + j] << endl;
cout << "*****************************************" << endl;
}
*/
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
int dataCharIndex = 0;
int keywordCharIndex = 0;
int nonParallelResultIndex = 0;
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE;i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
keywordCharIndex = i*STRINGSIZE;
dataCharIndex = j*STRINGSIZE;
cpuResult[nonParallelResultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
cpuResult[nonParallelResultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
nonParallelResultIndex++;
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
/*
for (int i = 0; i < 10; i++)
{
for (int j = 0; j < 10; j++)
cout << boolalpha << i << " vs " << j << ": " << nonParallelResult[i*SEARCHITEMSIZE+j] << endl;
cout << "*****************************************" << endl;
}
*/
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernel1Res, kernel2Res, kernel3Res, kernel4Res;
kernel1Res = true;
kernel2Res = true;
kernel3Res = true;
kernel4Res = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i] && kernel1Res)
kernel1Res = false;
if (cpuResult[i] != result2[i] && kernel2Res)
kernel2Res = false;
if (cpuResult[i] != result3[i] && kernel3Res)
kernel3Res = false;
if (cpuResult[i] != result4[i] && kernel4Res)
kernel4Res = false;
if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res)
break;
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;
cout << "Before Deleting arrays" << endl;
delete[] data;
delete[] keyword;
delete[] result;
delete[] result2;
delete[] result3;
delete[] result4;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_data);
cudaFree(d_keyword);
cudaFree(d_result);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
system("pause");
return 0;
}
Thank you in advance.
Upvotes: 1
Views: 215
Reputation: 152269
Your code seems to be functionally correct -- that's the first job of a programmer. So how to make it run faster?
The first two concepts a CUDA programmer should be aware of for performance are:
You need lots of threads. Typically we want 10,000 or more threads, and generally there is not any major penalty for having many more threads than that. The need for lots of threads arises out of the machine architecture - it is a latency-hiding machine, and it hides latency by having lots of work that it can switch to on the fly. "Work" in this context can be loosely translated as "threads".
You want to make efficient use of the memory system. This can involve a lot of different ideas, but the first one we want to focus on is coalesced access to global memory. (You're not using any shared memory in any of your kernels, but if you were, we would want un-bank-conflicted access to shared memory as well). We also want efficiency in terms of data usage, and finally, like any computer optimization, we want to take advantage of the memory hierarchy to look for data re-use opportunities, and move those data items to "higher" levels in the memory hierarchy.
So what does this mean for your code? If you want to write a "fast" kernel, you'll want a lot of threads and also aim for 100% coalesced loads of global memory. So immediately, the strategies in kernels 1 and 3 don't look very good - they simply don't launch enough threads. 2 is better, but the strategy in kernel 4 might be even better - it allows us to launch 100*10000 threads. That fits our definition of "lots". So let's continue with a thread strategy that says each thread will be responsible to generate one element of the result
array (since there are 100*10000 results).
Now, regarding coalesced access, this gets down to data organization. How do adjacent threads access data? Is it contiguous? In your case of kernel4, it is not. Adjacent threads are reading from data
with gaps that are quite large, as you iterate through the while
loop that is doing the work.
To fix this, we can transpose our data. I've chosen to use a data-reuse optimization:
data
result
associated with the data
item in step 1.data
we can move that element (string) into shared memory, so that we only read it once per threadblock, and each thread then retrieves the values needed out of shared memory. This means that every string in data
only gets read (from global memory) once, which is optimal.data
to achieve optimal coalesced loads. But we still need to transpose the strings in keyword
, as each thread will be reading that via global loads. We're benefitted here by the fact that the overall keyword
array is smaller - about 25Kbytes, which can fit in the GPU L1 caches (if available) or certainly in L2.With the above strategies and choices, I was able to craft a kernel that runs about 5+ times faster than the CPU code, according to my testing. Since this kernel is likely to be largely bandwidth bound, we're probably in the right ballpark for performance. Here's a fully worked example, taking your code and adding a 5th kernel to it, which is derived from your 4th kernel, but using a transposed form of the keyword
array:
$ cat t703.cu
#include <stdio.h>
#include <iostream>
#include <chrono>
#define SEARCHTERMSIZE 100
#define SEARCHITEMSIZE 10000
#define STRINGSIZE 250
using namespace std;
__global__ void searchKeywordKernel(bool* result, char* data, char* keyword)
{
int keywordStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
int dataStringIndex = 0;
int keywordCharIndex = 0;
int dataCharIndex = 0;
int resultIndex = 0;
if (keywordStringIndex < SEARCHTERMSIZE)
{
for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
{
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
}
}
}
__global__ void searchKeywordKernel2(bool* result, char* data, char* keyword)
{
int keywordStringIndex = 0;
int dataStringIndex = threadIdx.x + blockIdx.x * blockDim.x;
int keywordCharIndex = 0;
int dataCharIndex = 0;
int resultIndex = 0;
if (dataStringIndex < SEARCHITEMSIZE)
{
for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
{
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
}
}
}
__global__ void searchKeywordKernel3(bool* result, char* data, char* keyword)
{
int keywordStringIndex = 0;
int dataStringIndex = 0;
int keywordCharIndex = 0;
int dataCharIndex = 0;
int resultIndex = 0;
if (threadIdx.x + blockIdx.x * blockDim.x < 1)
{
for (; keywordStringIndex < SEARCHTERMSIZE; keywordStringIndex++)
{
for (; dataStringIndex < SEARCHITEMSIZE; dataStringIndex++)
{
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
resultIndex++;
}
}
}
}
__global__ void searchKeywordKernel4(bool* result, char* data, char* keyword)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id < SEARCHTERMSIZE*SEARCHITEMSIZE)
{
int keywordStringIndex = id / SEARCHITEMSIZE;
int dataStringIndex = id%SEARCHITEMSIZE;
int keywordCharIndex;
int dataCharIndex;
int resultIndex;
dataCharIndex = dataStringIndex*STRINGSIZE;
keywordCharIndex = keywordStringIndex*STRINGSIZE;
resultIndex = keywordStringIndex*SEARCHITEMSIZE + dataStringIndex;
result[resultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
result[resultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
}
}
// this kernel is a modification of kernel 4, and assumes that the keyword array is transposed
// and that the kernel will be launched with one block per data string, and one thread per keyword
__global__ void searchKeywordKernel5(bool* result, const char * __restrict__ data, const char * keyword)
{
int bid = blockIdx.x;
int tid = threadIdx.x;
__shared__ char sdata[STRINGSIZE];
if (bid < SEARCHITEMSIZE)
{
int my_tid = tid;
while (my_tid < STRINGSIZE){ //load data string to be used by this block into shared mem
sdata[my_tid] = data[bid*STRINGSIZE + my_tid]; //coalesced global load
my_tid += blockDim.x;}
__syncthreads();
if (tid < SEARCHTERMSIZE){
int resultIndex = tid*SEARCHITEMSIZE + bid;
result[resultIndex] = true; //uncoalesced store - could be improved by reorganizing result
char test = keyword[tid]; // coalesced global load
int i = 0;
while (test != '\0')
{
char temp = sdata[i]; // shared memory broadcast
if ((test != temp) || (temp == '\0'))
{
result[resultIndex] = false; //uncoalesced store
break;
}
i++;
test = keyword[i*SEARCHTERMSIZE+tid]; //coalesced global load
}
}
}
}
int main()
{
chrono::steady_clock::time_point startTime;
chrono::steady_clock::time_point endTime;
typedef chrono::duration<int, milli> millisecs_t;
//////////Search Data Init/////////////////
cout << "Before Search Data Init" << endl;
startTime = chrono::steady_clock::now();
char* data = new char[SEARCHITEMSIZE*STRINGSIZE];
int temp = 0;
int dataIndex = 0;
for (int i = 0; i < SEARCHITEMSIZE; i++)
{
dataIndex = i*STRINGSIZE;
temp = rand() % (STRINGSIZE-21) + 20;
for (int k = 0; k < temp; k++)
{
data[dataIndex] = 'a';
dataIndex++;
}
data[dataIndex] = '\0';
}
endTime = chrono::steady_clock::now();
millisecs_t duration(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Data Init: " << duration.count() << "ms" <<endl;
//////////Search Data Init/////////////////
//////////Search Keyword Init/////////////////
cout << "Before Search Keyword Init" << endl;
startTime = chrono::steady_clock::now();
char* keyword = new char[SEARCHTERMSIZE*STRINGSIZE];
int keywordIndex = 0;
for (int i = 0; i < SEARCHTERMSIZE; i++)
{
keywordIndex = i*STRINGSIZE;
temp = rand() % (STRINGSIZE - 21) + 20;
for (int k = 0; k < temp; k++)
{
keyword[keywordIndex] = 'a';
keywordIndex++;
}
keyword[keywordIndex] = '\0';
keywordIndex++;
}
endTime = chrono::steady_clock::now();
millisecs_t duration1(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Search Keyword Init: " << duration1.count() << "ms" << endl;
//////////Search Keyword Init/////////////////
bool* result = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result2 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result3 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result4 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
bool* result5 = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
char* d_data;
char* d_keyword;
char* d_keyword_T;
bool* d_result;
/////////////////////////CudaMalloc/////////////////////////////////
cout << "Before Malloc" << endl;
startTime = chrono::steady_clock::now();
cudaMalloc(&d_data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE);
cudaMalloc(&d_keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
cudaMalloc(&d_keyword_T, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE);
cudaMalloc(&d_result, sizeof(bool)*SEARCHITEMSIZE * SEARCHTERMSIZE);
endTime = chrono::steady_clock::now();
millisecs_t duration2(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "After Malloc: " << duration2.count() << "ms" << endl;
/////////////////////////CudaMalloc/////////////////////////////////
cudaEvent_t start, stop;
float elapsedTime;
/////////////////////////CudaMemCpy///////////////////////////////////
cout << "Before Memcpy" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaMemcpy(d_data, data, sizeof(char) * SEARCHITEMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_keyword, keyword, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
//transpose keywords
char* keyword_T = new char[SEARCHTERMSIZE*STRINGSIZE];
for (int i = 0; i < SEARCHTERMSIZE; i++)
for (int j = 0; j < STRINGSIZE; j++)
keyword_T[j*SEARCHTERMSIZE+i] = keyword[i*STRINGSIZE+j];
cudaMemcpy(d_keyword_T, keyword_T, sizeof(char) * SEARCHTERMSIZE * STRINGSIZE, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Memcpy: " << elapsedTime << "ms" << endl;
/////////////////////////CudaMemCpy///////////////////////////////////
////////////////////////Kernel//////////////////////////////////////////
cout << "Before Kernel" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel <<<(SEARCHTERMSIZE/32)+1, 32 >>>(d_result, d_data, d_keyword);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel: " << elapsedTime << "ms" << endl;
////////////////////////Kernel//////////////////////////////////////////
cudaMemcpy(result, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
////////////////////////Kernel2//////////////////////////////////////////
cout << "Before Kernel2" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel2 << < (SEARCHITEMSIZE/1024) +1 , 1024 >> >(d_result, d_data, d_keyword);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel2: " << elapsedTime << "ms" << endl;
////////////////////////Kernel2//////////////////////////////////////////
cudaMemcpy(result2, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
////////////////////////Kernel3//////////////////////////////////////////
cout << "Before Kernel3" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel3 << <1, 1 >> >(d_result, d_data, d_keyword);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel3: " << elapsedTime << "ms" << endl;
////////////////////////Kernel3//////////////////////////////////////////
cudaMemcpy(result3, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
////////////////////////Kernel4//////////////////////////////////////////
cout << "Before Kernel4" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel4 << <((SEARCHITEMSIZE*SEARCHTERMSIZE)/1024)+1, 1024 >> >(d_result, d_data, d_keyword);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel4: " << elapsedTime << "ms" << endl;
////////////////////////Kernel4//////////////////////////////////////////
cudaMemcpy(result4, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
cudaFuncSetCacheConfig(searchKeywordKernel5, cudaFuncCachePreferL1);
////////////////////////Kernel5//////////////////////////////////////////
cout << "Before Kernel5" << endl;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
searchKeywordKernel5 << <SEARCHITEMSIZE, SEARCHTERMSIZE >> >(d_result, d_data, d_keyword_T);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cout << "After Kernel5: " << elapsedTime << "ms" << endl;
////////////////////////Kernel5//////////////////////////////////////////
cudaMemcpy(result5, d_result, sizeof(bool) * SEARCHITEMSIZE * SEARCHTERMSIZE, cudaMemcpyDeviceToHost);
/*
for (int i = 0; i < 10; i++)
{
for (int j = 0; j < 10; j++)
cout << boolalpha << i << " vs " << j << ": " << result4[i*SEARCHITEMSIZE + j] << endl;
cout << "*****************************************" << endl;
}
*/
/////////////////////////////////// CPU code //////////////////////////////////////////
bool* cpuResult = new bool[SEARCHTERMSIZE*SEARCHITEMSIZE];
int dataCharIndex = 0;
int keywordCharIndex = 0;
int nonParallelResultIndex = 0;
cout << "CPU code starts" << endl;
startTime = chrono::steady_clock::now();
for (int i = 0; i < SEARCHTERMSIZE;i++)
{
for (int j = 0; j < SEARCHITEMSIZE; j++)
{
keywordCharIndex = i*STRINGSIZE;
dataCharIndex = j*STRINGSIZE;
cpuResult[nonParallelResultIndex] = true;
while (keyword[keywordCharIndex] != '\0')
{
if ((keyword[keywordCharIndex] != data[dataCharIndex]) || (data[dataCharIndex] == '\0'))
{
cpuResult[nonParallelResultIndex] = false;
break;
}
keywordCharIndex++;
dataCharIndex++;
}
nonParallelResultIndex++;
}
}
endTime = chrono::steady_clock::now();
millisecs_t duration3(chrono::duration_cast<millisecs_t>(endTime - startTime));
cout << "CPU code ends: " << duration3.count() << "ms" << endl;
/////////////////////////////////// CPU code //////////////////////////////////////////
/*
for (int i = 0; i < 10; i++)
{
for (int j = 0; j < 10; j++)
cout << boolalpha << i << " vs " << j << ": " << nonParallelResult[i*SEARCHITEMSIZE+j] << endl;
cout << "*****************************************" << endl;
}
*/
////////////////////////////////////Result Comparison////////////////////////////////////////
bool kernel1Res, kernel2Res, kernel3Res, kernel4Res, kernel5Res;
kernel1Res = true;
kernel2Res = true;
kernel3Res = true;
kernel4Res = true;
kernel5Res = true;
for (int i = 0; i < SEARCHITEMSIZE*SEARCHTERMSIZE; i++)
{
if (cpuResult[i] != result[i] && kernel1Res)
kernel1Res = false;
if (cpuResult[i] != result2[i] && kernel2Res)
kernel2Res = false;
if (cpuResult[i] != result3[i] && kernel3Res)
kernel3Res = false;
if (cpuResult[i] != result4[i] && kernel4Res)
kernel4Res = false;
if (cpuResult[i] != result5[i] && kernel5Res)
kernel5Res = false;
if (!kernel1Res && !kernel2Res && !kernel3Res && !kernel4Res && !kernel5Res)
break;
}
////////////////////////////////////Result Comparison////////////////////////////////////////
cout << boolalpha << "Kernel1 computation: " << kernel1Res << endl;
cout << boolalpha << "Kernel2 computation: " << kernel2Res << endl;
cout << boolalpha << "Kernel3 computation: " << kernel3Res << endl;
cout << boolalpha << "Kernel4 computation: " << kernel4Res << endl;
cout << boolalpha << "Kernel5 computation: " << kernel5Res << endl;
cout << "Before Deleting arrays" << endl;
delete[] data;
delete[] keyword;
delete[] result;
delete[] result2;
delete[] result3;
delete[] result4;
delete[] cpuResult;
cout << "After Deleting arrays" << endl;
cout << "Before Freeing device memory" << endl;
cudaFree(d_data);
cudaFree(d_keyword);
cudaFree(d_result);
cout << "After Freeing device memory" << endl;
cudaDeviceReset();
return 0;
}
$ nvcc -O3 -std=c++11 -o t703 t703.cu
$ ./t703
Before Search Data Init
After Search Data Init: 0ms
Before Search Keyword Init
After Search Keyword Init: 0ms
Before Malloc
After Malloc: 38ms
Before Memcpy
After Memcpy: 1.09805ms
Before Kernel
After Kernel: 1455.98ms
Before Kernel2
After Kernel2: 110.16ms
Before Kernel3
After Kernel3: 363.236ms
Before Kernel4
After Kernel4: 96.9751ms
Before Kernel5
After Kernel5: 10.9064ms
CPU code starts
CPU code ends: 76ms
Kernel1 computation: true
Kernel2 computation: true
Kernel3 computation: true
Kernel4 computation: true
Kernel5 computation: true
Before Deleting arrays
After Deleting arrays
Before Freeing device memory
After Freeing device memory
$
Some notes:
-G
(debug) switch (which is what Visual Studio does on a debug CUDA project) may have a substantial impact on code performance. Whenever you are benchmarking or analyzing CUDA code for performance, you should never use the -G
switch.Upvotes: 2