kummatti

Reputation: 1

C++ Smart Pointers and OpenACC

My objective is to see whether I can use std::shared_ptr in C++ to pass resources to compute regions on the GPU using OpenACC.

The code snippet is as follows:

#include <iostream>
#include <memory>
#include <vector>
#include <cstdio>   // for printf used inside the parallel loop
#include <openacc.h>
int main(){
    // Create a shared pointer to an STL vector of ints
    std::shared_ptr<std::vector<int>> vec_ptr = std::make_shared<std::vector<int>>();

    // Fill the vector with some data
    for (int i = 0; i < 10; ++i) {
        vec_ptr->push_back(i);
    }
    #pragma acc enter data copyin(vec_ptr[0:1], vec_ptr[0:9])

    // Parallelize a loop over the vector using OpenACC
    #pragma acc data copyout(vec_ptr[0:9])
    {
      #pragma acc parallel loop
      for (int i = 0; i < vec_ptr->size(); ++i) {
        printf(" The host, device flags are %d, %d \n", acc_on_device(acc_device_host), acc_on_device(acc_device_nvidia));
        // Access and modify vector elements safely in parallel
        (*vec_ptr)[i] *= 2;
      }
    }

    #pragma acc update self (vec_ptr[0:9])
    //Print the modified vector from host
    for (int i = 0; i < vec_ptr->size(); ++i)  {
        std::cout << (*vec_ptr)[i] << " ";
    }
    std::cout << std::endl;

    #pragma acc exit data delete (vec_ptr[0:9], vec_ptr[0:1])
    return 0;
}
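
As I understand the data clauses above, the sections on vec_ptr only describe the shared_ptr object itself, not the vector's heap storage (I am relying on -gpu=managed to make the element data visible on the device). For comparison, here is a minimal sketch of what I believe would move the underlying storage explicitly, by taking the raw pointer from vec_ptr->data(); this is my own assumption of the usual pattern, not verified code:

#include <cstdio>
#include <memory>
#include <vector>

int main() {
    // Shared pointer to a vector holding 0..9
    auto vec_ptr = std::make_shared<std::vector<int>>();
    for (int i = 0; i < 10; ++i) {
        vec_ptr->push_back(i);
    }

    // Expose the vector's contiguous storage to OpenACC, not the shared_ptr object.
    int *data = vec_ptr->data();
    int n = static_cast<int>(vec_ptr->size());

    // Copy the element data in and out around the parallel loop.
    #pragma acc parallel loop copy(data[0:n])
    for (int i = 0; i < n; ++i) {
        data[i] *= 2;
    }

    for (int i = 0; i < n; ++i) {
        printf("%d ", data[i]);
    }
    printf("\n");
    return 0;
}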

I am using NVHPC 23.2 with CUDA Toolkit 12.0 and CMake 3.23.0. My device details are the following:

[simon@axbn build]$ nvidia-smi
Fri May  3 16:24:17 2024
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla V100-PCIE...  On   | 00000000:2F:00.0 Off |                    0 |
| N/A   47C    P0   135W / 250W |   2900MiB / 16384MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    0   N/A  N/A     86839      C   ...s/phydll_train/bin/python      688MiB |
|    0   N/A  N/A     90677      C   python                           2204MiB |
+-----------------------------------------------------------------------------+

My CMake file looks like this:

cmake_minimum_required(VERSION 3.23)

project(
    testSmartPtrs
    VERSION 0.1
    LANGUAGES CXX
    )

#Set output directory
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/app")

find_package(OpenACC REQUIRED)
find_package(CUDAToolkit REQUIRED)

set(CMAKE_CXX_COMPILER "nvc++")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -gpu=cc70,managed -Minfo=all")

message(OPENACC_CXX_FLAGS= ${OpenACC_CXX_FLAGS})
message(OpenACC_CXX_VERSION_MAJOR= ${OpenACC_CXX_VERSION_MAJOR})
message(CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}")

add_executable(testSmartPtrs)
# target_include_directories(testSmartPtrs PUBLIC "${PROJECT_SOURCE_DIR}/include")
target_compile_options(testSmartPtrs PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(testSmartPtrs PUBLIC OpenACC::OpenACC_CXX)
target_sources(testSmartPtrs PRIVATE
    src/main.cpp
)

My build output looks as follows:

[simon@axbn build]$ make
Consolidate compiler generated dependencies of target testSmartPtrs
[ 50%] Building CXX object CMakeFiles/testSmartPtrs.dir/src/main.cpp.o
nvc++-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
main:
     18, Generating enter data copyin(vec_ptr)
         Generating copyout(vec_ptr) [if not already present]
         Generating NVIDIA GPU code
         20, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     26, Generating update self(vec_ptr)
     33, Generating exit data delete(vec_ptr)
std::__shared_ptr<std::vector<int, std::allocator<int>>, (__gnu_cxx::_Lock_policy)2>::operator*() const:
      2, include "memory"
          10, include "memory"
               82, include "shared_ptr.h"
                    52, include "shared_ptr_base.h"
                        906, Generating implicit acc routine seq
                             Generating acc routine seq
                             Generating NVIDIA GPU code
std::__shared_ptr<std::vector<int, std::allocator<int>>, (__gnu_cxx::_Lock_policy)2>::operator->() const:
      2, include "memory"
          10, include "memory"
               82, include "shared_ptr.h"
                    52, include "shared_ptr_base.h"
                        913, Generating implicit acc routine seq
                             Generating acc routine seq
                             Generating NVIDIA GPU code
std::vector<int, std::allocator<int>>::size() const:
      3, include "vector"
          64, include "stl_vector.h"
              646, Generating implicit acc routine seq
                   Generating acc routine seq
                   Generating NVIDIA GPU code
std::vector<int, std::allocator<int>>::operator[](unsigned long):
      3, include "vector"
          64, include "stl_vector.h"
              771, Generating implicit acc routine seq
                   Generating acc routine seq
                   Generating NVIDIA GPU code
[100%] Linking CXX executable ../app/testSmartPtrs
nvc++-Warning-CUDA_HOME has been deprecated. Please, use NVHPC_CUDA_HOME instead.
[100%] Built target testSmartPtrs
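
My reading of the "Generating implicit acc routine seq" lines is that nvc++ compiles the shared_ptr and vector member functions as sequential device routines so they can be called inside the parallel region. A hypothetical explicit equivalent for a plain user function (my own illustration, not taken from this project) would look like this:

#include <cstdio>

// Hypothetical helper (my illustration): "acc routine seq" makes it callable
// from device code, which is what nvc++ reports doing implicitly above for
// the shared_ptr/vector member functions.
#pragma acc routine seq
inline int twice(int x) { return 2 * x; }

int main() {
    int a[10];
    for (int i = 0; i < 10; ++i) a[i] = i;

    #pragma acc parallel loop copy(a[0:10])
    for (int i = 0; i < 10; ++i) {
        a[i] = twice(a[i]);
    }

    for (int i = 0; i < 10; ++i) printf("%d ", a[i]);
    printf("\n");
    return 0;
}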

Expected result vs. obtained result: I would expect printf() to print 0,1 ten times, followed by the modified array {0 2 4 6 8 10 12 14 16 18}. Instead, printf() prints 1,0 ten times, followed by the same modified array {0 2 4 6 8 10 12 14 16 18}. This means the computation happened only on the host and not on the device.
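
In case it is useful for the diagnosis, a small standalone check I could run to confirm which device the OpenACC runtime selects (my understanding of the runtime API from openacc.h, not verified against this build) is:

#include <openacc.h>
#include <cstdio>

int main() {
    // Query the device type the OpenACC runtime will use for compute regions.
    acc_device_t dev = acc_get_device_type();
    printf("device type = %d (host = %d, nvidia = %d), num nvidia devices = %d\n",
           (int)dev, (int)acc_device_host, (int)acc_device_nvidia,
           acc_get_num_devices(acc_device_nvidia));
    return 0;
}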

Any help or direction on why the code does not execute on the GPU is highly appreciated.

Thanks a ton, Sangeeth

Attachment: testSmartPtr.zip (2.8 KB)

Upvotes: 0

Views: 62

Answers (0)
