R3dy
R3dy

Reputation: 9

CUDA Streams are NOT Asynchronous

I am trying to decrypt an XOR-encrypted file on my NVIDIA GeForce 3060 Laptop GPU (which has 5 async engines) using CUDA in C.

My program works fine, but when I profiled it I saw that my different streams do not run asynchronously: their copies and kernels do not overlap (see pictures 1 and 2; picture 2 is with pinned memory).

Second try, with pinned memory: [image: second try with pinned memory]. First try, with non-pinned memory: [image: first try with non-pinned memory].

This is my kernel and the host function that calls it:

// XOR-decrypts `size` bytes of `buffer` in place with the single-byte `key`.
//
// Launch layout: 1D grid of 1D blocks with at least `size` threads in total;
// each thread handles at most one byte, and threads whose index falls at or
// beyond `size` do nothing. No shared memory is used.
//
// buffer: device pointer to the bytes to decrypt (modified in place).
// key:    XOR key applied to every byte.
// size:   number of valid bytes in `buffer`.
__global__ void decryptKernel(unsigned char* buffer, unsigned char key, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        // Use the caller-supplied key; the previous version ignored it and
        // always XORed with the literal 0x43.
        buffer[idx] ^= key;
    }
}

#include <stdio.h>  // fprintf: used below to report CUDA failures

// Queues, for every CHUNK_SIZE-sized piece of the TOTAL_SIZE-byte pinned
// buffer, a host-to-device copy, a decryptKernel launch and a device-to-host
// copy. Chunk number n is issued on stream (n % NUM_STREAMS) and uses that
// stream's device buffer, so a buffer is only reused by later work in the
// same stream.
//
// Nothing here waits for the GPU; the caller must synchronize the streams
// before reading `pinned`.
//
// Returns the first CUDA error reported while queuing, or cudaSuccess.
static cudaError_t enqueue_all_chunks(unsigned char* pinned,
                                      unsigned char* const* d_buffers,
                                      const cudaStream_t* streams,
                                      unsigned char key) {
    const size_t total = (size_t)TOTAL_SIZE;
    const size_t chunk = (size_t)CHUNK_SIZE;
    const size_t threadsPerBlock = 256;

    // A zero chunk size would never advance the offset; a chunk larger than
    // INT_MAX cannot be expressed in the kernel's `int size` parameter.
    if (chunk == 0 || chunk > (size_t)0x7FFFFFFF) {
        return cudaErrorInvalidValue;
    }

    // size_t offsets avoid int overflow when the buffer is large.
    size_t offset = 0;
    for (size_t n = 0; offset < total; ++n, offset += chunk) {
        const size_t s = n % (size_t)NUM_STREAMS;
        const size_t remaining = total - offset;
        const size_t bytes = remaining < chunk ? remaining : chunk;  // last chunk may be short
        const unsigned int blocks =
            (unsigned int)((bytes + threadsPerBlock - 1) / threadsPerBlock);

        cudaError_t err = cudaMemcpyAsync(d_buffers[s], pinned + offset, bytes,
                                          cudaMemcpyHostToDevice, streams[s]);
        if (err != cudaSuccess) return err;

        decryptKernel<<<blocks, (unsigned int)threadsPerBlock, 0, streams[s]>>>(
            d_buffers[s], key, (int)bytes);
        // A launch does not return a status; invalid configurations surface here.
        err = cudaGetLastError();
        if (err != cudaSuccess) return err;

        err = cudaMemcpyAsync(pinned + offset, d_buffers[s], bytes,
                              cudaMemcpyDeviceToHost, streams[s]);
        if (err != cudaSuccess) return err;
    }
    return cudaSuccess;
}

// XOR-decrypts the TOTAL_SIZE bytes at `shellcode` in place on the GPU.
//
// The data is staged through a page-locked host buffer and processed in
// CHUNK_SIZE pieces spread over NUM_STREAMS CUDA streams, each with its own
// device buffer.
//
// shellcode: host buffer of TOTAL_SIZE bytes; overwritten with the decrypted
//            data on success. On any CUDA failure it is left unmodified and
//            the error is printed to stderr.
// key:       XOR key forwarded to decryptKernel.
void decrypt_large_shellcode(unsigned char* shellcode, unsigned char key) {
    unsigned char* d_buffers[NUM_STREAMS];
    cudaStream_t streams[NUM_STREAMS];
    unsigned char* pinned_shellcode = NULL;
    int streams_created = 0;
    cudaError_t err = cudaSuccess;

    if (shellcode == NULL) {
        return;
    }

    // Null the device pointers so the cleanup loop can free unconditionally.
    for (int i = 0; i < NUM_STREAMS; i++) {
        d_buffers[i] = NULL;
    }

    err = cudaHostAlloc((void**)&pinned_shellcode, (size_t)TOTAL_SIZE, cudaHostAllocDefault);
    if (err == cudaSuccess) {
        memcpy(pinned_shellcode, shellcode, (size_t)TOTAL_SIZE);
    }

    for (int i = 0; i < NUM_STREAMS && err == cudaSuccess; i++) {
        err = cudaMalloc((void**)&d_buffers[i], (size_t)CHUNK_SIZE);
        if (err == cudaSuccess) {
            err = cudaStreamCreate(&streams[i]);
            if (err == cudaSuccess) {
                streams_created++;
            }
        }
    }

    if (err == cudaSuccess) {
        err = enqueue_all_chunks(pinned_shellcode, d_buffers, streams, key);
    }

    // Wait for all queued work explicitly before touching the pinned buffer or
    // releasing device memory, instead of depending on side effects of the
    // cleanup calls. In-kernel faults are also reported here.
    for (int i = 0; i < streams_created; i++) {
        cudaError_t sync_err = cudaStreamSynchronize(streams[i]);
        if (err == cudaSuccess) {
            err = sync_err;
        }
    }

    for (int i = 0; i < streams_created; i++) {
        cudaStreamDestroy(streams[i]);
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        cudaFree(d_buffers[i]);
    }

    if (err == cudaSuccess) {
        memcpy(shellcode, pinned_shellcode, (size_t)TOTAL_SIZE);
    } else {
        fprintf(stderr, "decrypt_large_shellcode: CUDA error: %s\n", cudaGetErrorString(err));
    }

    if (pinned_shellcode != NULL) {
        cudaFreeHost(pinned_shellcode);
    }
}

Edit: I enabled Windows Hardware-Accelerated GPU Scheduling, and now I get this: https://ibb.co/2YyB4PTt

But this is still not the result I want :/

Upvotes: 0

Views: 70

Answers (0)

Related Questions