Matt
Matt

Reputation: 20786

Unable to include thrust/host_vector.h and others with CUDA 12.5

This test program compiles fine with CUDA 12.4 and lower, but fails to compile with CUDA 12.5.1:

#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <iostream>

// Minimal reproduction: fills a five-element thrust::host_vector with the
// values 1..5, runs an in-place inclusive scan over it, and prints the
// resulting elements separated by spaces on a single line.
int main() {
  constexpr int kCount = 5;
  thrust::host_vector<int> h_vec(kCount);
  for (int idx = 0; idx < kCount; ++idx) {
    h_vec[idx] = idx + 1;
  }

  // The output iterator aliases the input range, so h_vec is overwritten
  // with the scan result.
  thrust::inclusive_scan(h_vec.begin(), h_vec.end(), h_vec.begin());

  for (auto it = h_vec.begin(); it != h_vec.end(); ++it) {
    std::cout << *it << " ";
  }
  std::cout << std::endl;

  return 0;
}

Build and output:

$ g++-13 -std=c++17 -I/opt/cuda/targets/x86_64-linux/include inclusive_scan.cpp -lthrust
In file included from /opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh:52,                                    
                 from /opt/cuda/targets/x86_64-linux/include/thrust/system/cuda/detail/util.h:48,                  
                 from /opt/cuda/targets/x86_64-linux/include/thrust/system/cuda/detail/internal/copy_cross_system.h:49,
                 from /opt/cuda/targets/x86_64-linux/include/thrust/system/cuda/detail/copy.h:111,                      
                 from /opt/cuda/targets/x86_64-linux/include/thrust/system/detail/adl/copy.h:50,                        
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/copy.inl:31,                                 
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/copy.h:98,                            
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/allocator/copy_construct_range.inl:31,
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/allocator/copy_construct_range.h:53,
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/contiguous_storage.inl:31,
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/contiguous_storage.h:243,
                 from /opt/cuda/targets/x86_64-linux/include/thrust/detail/vector_base.h:39,
                 from /opt/cuda/targets/x86_64-linux/include/thrust/host_vector.h:35,
                 from inclusive_scan.cpp:1:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘void cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC()’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:271:5: error: ‘__syncthreads’ was not declared in this scope
  271 |     __syncthreads();
      |     ^~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC_AND(int)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:280:12: error: ‘__syncthreads_and’ was not declared in this scope
  280 |     return __syncthreads_and(p);
      |            ^~~~~~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::CTA_SYNC_OR(int)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:289:12: error: ‘__syncthreads_or’ was not declared in this scope
  289 |     return __syncthreads_or(p);
      |            ^~~~~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘void cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_SYNC(unsigned int)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:298:5: error: ‘__syncwarp’ was not declared in this scope
  298 |     __syncwarp(member_mask);
      |     ^~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_ANY(int, unsigned int)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:307:12: error: ‘__any_sync’ was not declared in this scope
  307 |     return __any_sync(member_mask, predicate);
      |            ^~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_ALL(int, unsigned int)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:316:12: error: ‘__all_sync’ was not declared in this scope
  316 |     return __all_sync(member_mask, predicate);
      |            ^~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::WARP_BALLOT(int, unsigned int)’:                                                                                               
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:325:12: error: ‘__ballot_sync’ was not declared in this scope
  325 |     return __ballot_sync(member_mask, predicate);                                                               
      |            ^~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘unsigned int cub::CUB_200400___CUDA_ARCH_LIST___NS::SHFL_IDX_SYNC(unsigned int, int, unsigned int)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:368:12: error: ‘__shfl_sync’ was not declared in this scope     
  368 |     return __shfl_sync(member_mask, word, src_lane);                                                            
      |            ^~~~~~~~~~~                                                                                          
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh: In function ‘int cub::CUB_200400___CUDA_ARCH_LIST___NS::RowMajorTid(int, int, int)’:                                                                                                   
/opt/cuda/targets/x86_64-linux/include/cub/util_ptx.cuh:415:39: error: ‘threadIdx’ was not declared in this scope
  415 |     return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +                               
      |                                       ^~~~~~~~~
In file included from /opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/instructions/barrier_cluster.h:30,
                 from /opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx.h:74,
                 from /opt/cuda/targets/x86_64-linux/include/cuda/ptx:19,
                 from /opt/cuda/targets/x86_64-linux/include/cuda/discard_memory:25,
                 from /opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh:57:
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h: In function ‘uint32_t cuda::ptx::__4::__as_ptr_smem(const void*)’:
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h:40:44: error: ‘__cvta_generic_to_shared’ was not declared in this scope
   40 |   return static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__ptr));
      |                                            ^~~~~~~~~~~~~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h: In function ‘uint64_t cuda::ptx::__4::__as_ptr_gmem(const void*)’:
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h:60:44: error: ‘__cvta_generic_to_global’ was not declared in this scope
   60 |   return static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__ptr));
      |                                            ^~~~~~~~~~~~~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h: In function ‘_Tp* cuda::ptx::__4::__from_ptr_smem(size_t)’:
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h:73:33: error: there are no arguments to ‘__cvta_shared_to_generic’ that depend on a template parameter, so a declaration of ‘__cvta_shared_to_generic’ must be available [-fpermissive]
   73 |   return reinterpret_cast<_Tp*>(__cvta_shared_to_generic(__ptr));
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h:73:33: note: (if you use ‘-fpermissive’, G++ will accept your code, but allowing the use of an undeclared name is deprecated)
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h: In function ‘_Tp* cuda::ptx::__4::__from_ptr_gmem(size_t)’:
/opt/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_helper_functions.h:94:33: error: there are no arguments to ‘__cvta_global_to_generic’ that depend on a template parameter, so a declaration of ‘__cvta_global_to_generic’ must be available [-fpermissive]
   94 |   return reinterpret_cast<_Tp*>(__cvta_global_to_generic(__ptr));
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh: In static member function ‘static typename AgentT::TempStorage& cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_helper_impl<AgentT>::get_temp_storage(cub::CUB_200400___CUDA_ARCH_LIST___NS::NullType&, cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_t&)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh:160:63: error: ‘blockIdx’ was not declared in this scope
  160 |       static_cast<char*>(vsmem.gmem_ptr) + (vsmem_per_block * blockIdx.x));
      |                                                               ^~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh: In static member function ‘static bool cub::CUB_200400___CUDA_ARCH_LIST___NS::detail::vsmem_helper_impl<AgentT>::discard_temp_storage(typename AgentT::TempStorage&)’:
/opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh:201:38: error: ‘threadIdx’ was not declared in this scope
  201 |     const std::size_t linear_tid   = threadIdx.x;
      |                                      ^~~~~~~~~
/opt/cuda/targets/x86_64-linux/include/cub/util_device.cuh:202:50: error: ‘blockDim’ was not declared in this scope
  202 |     const std::size_t block_stride = line_size * blockDim.x;
      |                                                  ^~~~~~~~

The above errors are specifically for gcc-13.3 and cuda-12.5.1.

However, it compiles and runs fine with cuda-12.4, using either gcc 12 or 14.

OS: Arch Linux, kernel 6.9.10.

We have a hybrid project with both .cpp and .cu files and have been using the above code in a .cpp file. I would have expected thrust::host_vector<> and related functions to be compilable with gcc alone, without requiring nvcc. Did this change in CUDA 12.5?

(Note that this test is an MRE for a larger project, so a simple answer like "just use nvcc" is not what I'm looking for.)

Upvotes: 1

Views: 124

Answers (0)

Related Questions