//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
#define KOKKOS_CUDA_REDUCESCAN_HPP

#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_CUDA)

#include <utility>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

//----------------------------------------------------------------------------
/*
 *  Algorithmic constraints:
 *   (a) threads with the same threadIdx.y have the same value
 *   (b) blockDim.x == power of two
 *   (c) blockDim.z == 1
 */

template <class ValueType, class ReducerType>
__device__ inline void cuda_intra_warp_reduction(
    ValueType& result, const ReducerType& reducer,
    const uint32_t max_active_thread = blockDim.y) {
  unsigned int shift = 1;

  // Reduce over values from threads with different threadIdx.y
  while (blockDim.x * shift < 32) {
    const ValueType tmp = shfl_down(result, blockDim.x * shift, 32u);
    // Only join if the upper thread is active (this allows a
    // non-power-of-two blockDim.y)
    if (threadIdx.y + shift < max_active_thread) reducer.join(&result, &tmp);
    shift *= 2;
  }

  // Broadcast lane 0's result to the whole warp
  result = shfl(result, 0, 32);
}
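// Illustrative note (not part of the Kokkos API): with blockDim.x == 1 the
// loop above performs the classic shuffle fan-in
//
//   shift = 1:  lane y joins the value of lane y+1
//   shift = 2:  lane y joins the value of lane y+2
//   ...
//   shift = 16: lane y joins the value of lane y+16
//
// after which lane 0 holds the warp total and the final shfl broadcasts it
// to all 32 lanes. A reducer satisfying the ReducerType concept assumed
// here needs only a join() member; e.g. this hypothetical sum reducer,
// used by the sketches further below:
//
//   struct ExampleSum {  // hypothetical, for illustration only
//     __device__ void join(double* dst, const double* src) const {
//       *dst += *src;
//     }
//   };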
template <class ValueType, class ReducerType>
__device__ inline void cuda_inter_warp_reduction(
    ValueType& value, const ReducerType& reducer,
    const int max_active_thread = blockDim.y) {
  constexpr int step_width = 4;
  // Depending on the ValueType, __shared__ memory must be aligned up to
  // 8-byte boundaries. The reason not to use ValueType directly is that for
  // types with constructors it could lead to race conditions.
  alignas(alignof(ValueType) > alignof(double) ? alignof(ValueType)
                                               : alignof(double))
      __shared__ double sh_result[(sizeof(ValueType) + 7) / 8 * step_width];
  ValueType* result = (ValueType*)&sh_result;
  const int step    = 32 / blockDim.x;
  int shift         = step_width;
  // The sentinel 65000 parks threads whose threadIdx.y is not a multiple of
  // 'step' so that they never pass the id checks below.
  const int id = threadIdx.y % step == 0 ? threadIdx.y / step : 65000;
  if (id < step_width) {
    result[id] = value;
  }
  __syncthreads();
  while (shift <= max_active_thread / step) {
    if (shift <= id && shift + step_width > id && threadIdx.x == 0) {
      reducer.join(&result[id % step_width], &value);
    }
    __syncthreads();
    shift += step_width;
  }

  value = result[0];
  for (int i = 1; (i * step < max_active_thread) && i < step_width; i++)
    reducer.join(&value, &result[i]);
  __syncthreads();
}

template <class ValueType, class ReducerType>
__device__ inline void cuda_intra_block_reduction(
    ValueType& value, const ReducerType& reducer,
    const int max_active_thread = blockDim.y) {
  cuda_intra_warp_reduction(value, reducer, max_active_thread);
  cuda_inter_warp_reduction(value, reducer, max_active_thread);
}
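// Illustrative sketch of how the intra-block helpers compose. This
// hypothetical kernel (assuming the ExampleSum reducer sketched above and a
// launch with blockDim = (1, N, 1)) is not part of Kokkos:
//
//   __global__ void block_sum(const double* in, double* out) {
//     double my = in[blockIdx.x * blockDim.y + threadIdx.y];
//     Kokkos::Impl::cuda_intra_block_reduction(my, ExampleSum{}, blockDim.y);
//     // After the call every thread holds the block total in 'my'.
//     if (threadIdx.x == 0 && threadIdx.y == 0) out[blockIdx.x] = my;
//   }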
template <class FunctorType>
__device__ bool cuda_inter_block_reduction(
    typename FunctorType::reference_type value,
    typename FunctorType::reference_type neutral, const FunctorType& reducer,
    typename FunctorType::pointer_type const m_scratch_space,
    typename FunctorType::pointer_type const /*result*/,
    Cuda::size_type* const m_scratch_flags,
    const int max_active_thread = blockDim.y) {
  using pointer_type = typename FunctorType::pointer_type;
  using value_type   = typename FunctorType::value_type;

  // Do the intra-block reduction with shfl operations and static shared
  // memory
  cuda_intra_block_reduction(value, reducer, max_active_thread);

  const int id = threadIdx.y * blockDim.x + threadIdx.x;

  // One thread in the block writes the block result to global scratch_memory
  if (id == 0) {
    pointer_type global = m_scratch_space + blockIdx.x;
    *global             = value;
  }

  // One warp of the last block performs the inter-block reduction by loading
  // the block values from global scratch_memory
  bool last_block = false;
  __threadfence();
  __syncthreads();
  if (id < 32) {
    Cuda::size_type count;

    // Figure out whether this is the last block
    if (id == 0) count = Kokkos::atomic_fetch_add(m_scratch_flags, 1);
    count = Kokkos::shfl(count, 0, 32);

    // Last block does the inter-block reduction
    if (count == gridDim.x - 1) {
      // Set flag back to zero
      if (id == 0) *m_scratch_flags = 0;
      last_block = true;
      value      = neutral;

      pointer_type const volatile global = m_scratch_space;

      // Reduce all global values, splitting the work over the threads of
      // one warp
      const int step_size =
          blockDim.x * blockDim.y < 32 ? blockDim.x * blockDim.y : 32;
      for (int i = id; i < (int)gridDim.x; i += step_size) {
        value_type tmp = global[i];
        reducer.join(&value, &tmp);
      }

      // Perform shfl reductions within the warp; only join if the
      // contribution is valid (allows gridDim.x that is not a power of two
      // and < 32)
      if (int(blockDim.x * blockDim.y) > 1) {
        value_type tmp = Kokkos::shfl_down(value, 1, 32);
        if (id + 1 < int(gridDim.x)) reducer.join(&value, &tmp);
      }
      unsigned int mask = __activemask();
      __syncwarp(mask);
      if (int(blockDim.x * blockDim.y) > 2) {
        value_type tmp = Kokkos::shfl_down(value, 2, 32);
        if (id + 2 < int(gridDim.x)) reducer.join(&value, &tmp);
      }
      __syncwarp(mask);
      if (int(blockDim.x * blockDim.y) > 4) {
        value_type tmp = Kokkos::shfl_down(value, 4, 32);
        if (id + 4 < int(gridDim.x)) reducer.join(&value, &tmp);
      }
      __syncwarp(mask);
      if (int(blockDim.x * blockDim.y) > 8) {
        value_type tmp = Kokkos::shfl_down(value, 8, 32);
        if (id + 8 < int(gridDim.x)) reducer.join(&value, &tmp);
      }
      __syncwarp(mask);
      if (int(blockDim.x * blockDim.y) > 16) {
        value_type tmp = Kokkos::shfl_down(value, 16, 32);
        if (id + 16 < int(gridDim.x)) reducer.join(&value, &tmp);
      }
      __syncwarp(mask);
    }
  }

  // The last block holds the global reduction result in "value" on its
  // thread 0
  return last_block;
}
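// Illustrative note: the grid-wide termination detection above is the usual
// "last block closes the door" pattern. With, e.g., gridDim.x == 5 the five
// atomic_fetch_add calls return 0, 1, 2, 3, 4 in some order; only the block
// that observes 4 (== gridDim.x - 1) can be sure every other block has
// already published its partial result (the preceding __threadfence() and
// __syncthreads() order the scratch writes before the atomic), so only that
// block finishes the reduction.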
// Reduction helpers dispatched on DoScan (unused: only false is
// specialized here) and on UseShfl, which selects between a shuffle-based
// and a shared-memory-based implementation.
template <class FunctorType, bool DoScan, bool UseShfl>
struct CudaReductionsFunctor;

template <class FunctorType>
struct CudaReductionsFunctor<FunctorType, false, true> {
  using pointer_type = typename FunctorType::pointer_type;
  using Scalar       = typename FunctorType::value_type;

  __device__ static inline void scalar_intra_warp_reduction(
      const FunctorType& functor,
      Scalar value,            // Contribution
      const bool skip_vector,  // Skip threads if Kokkos vector lanes are not
                               // part of the reduction
      const int width,         // How much of the warp participates
      Scalar& result) {
    unsigned mask =
        width == 32
            ? 0xffffffff
            : ((1 << width) - 1)
                  << ((threadIdx.y * blockDim.x + threadIdx.x) / width) *
                         width;
    for (int delta = skip_vector ? blockDim.x : 1; delta < width;
         delta *= 2) {
      Scalar tmp = Kokkos::shfl_down(value, delta, width, mask);
      functor.join(&value, &tmp);
    }

    Impl::in_place_shfl(result, value, 0, width, mask);
  }

  __device__ static inline void scalar_intra_block_reduction(
      const FunctorType& functor, Scalar value, const bool skip,
      Scalar* my_global_team_buffer_element, const int shared_elements,
      Scalar* shared_team_buffer_element) {
    const int warp_id = (threadIdx.y * blockDim.x) / 32;
    Scalar* const my_shared_team_buffer_element =
        shared_team_buffer_element + warp_id % shared_elements;

    // Warp-level reduction, ignoring Kokkos vector entries
    scalar_intra_warp_reduction(functor, value, skip, 32, value);

    if (warp_id < shared_elements) {
      *my_shared_team_buffer_element = value;
    }
    // Wait for every warp to be done before using one warp to do the final
    // cross-warp reduction
    __syncthreads();

    const int num_warps = blockDim.x * blockDim.y / 32;
    for (int w = shared_elements; w < num_warps; w += shared_elements) {
      if (warp_id >= w && warp_id < w + shared_elements) {
        if ((threadIdx.y * blockDim.x + threadIdx.x) % 32 == 0)
          functor.join(my_shared_team_buffer_element, &value);
      }
      __syncthreads();
    }

    if (warp_id == 0) {
      functor.init(&value);
      for (unsigned int i = threadIdx.y * blockDim.x + threadIdx.x;
           i < blockDim.y * blockDim.x / 32; i += 32)
        functor.join(&value, &shared_team_buffer_element[i]);
      scalar_intra_warp_reduction(functor, value, false, 32,
                                  *my_global_team_buffer_element);
    }
  }

  __device__ static inline bool scalar_inter_block_reduction(
      const FunctorType& functor, const Cuda::size_type /*block_id*/,
      const Cuda::size_type block_count, Cuda::size_type* const shared_data,
      Cuda::size_type* const global_data,
      Cuda::size_type* const global_flags) {
    Scalar* const global_team_buffer_element = ((Scalar*)global_data);
    Scalar* const my_global_team_buffer_element =
        global_team_buffer_element + blockIdx.x;
    Scalar* shared_team_buffer_elements = ((Scalar*)shared_data);
    Scalar value        = shared_team_buffer_elements[threadIdx.y];
    int shared_elements = blockDim.x * blockDim.y / 32;
    int global_elements = block_count;
    __syncthreads();

    scalar_intra_block_reduction(functor, value, true,
                                 my_global_team_buffer_element,
                                 shared_elements,
                                 shared_team_buffer_elements);
    __threadfence();
    __syncthreads();
    unsigned int num_teams_done = 0;
    // The cast in the atomic call is necessary to find a matching overload
    // with MSVC/NVCC
    if (threadIdx.x + threadIdx.y == 0) {
      num_teams_done =
          Kokkos::atomic_fetch_add(global_flags,
                                   static_cast<unsigned int>(1)) +
          1;
    }
    bool is_last_block = false;
    if (__syncthreads_or(num_teams_done == gridDim.x)) {
      is_last_block = true;
      *global_flags = 0;
      functor.init(&value);
      for (int i = threadIdx.y * blockDim.x + threadIdx.x;
           i < global_elements; i += blockDim.x * blockDim.y) {
        functor.join(&value, &global_team_buffer_element[i]);
      }
      scalar_intra_block_reduction(
          functor, value, false,
          shared_team_buffer_elements + (blockDim.y - 1), shared_elements,
          shared_team_buffer_elements);
    }
    return is_last_block;
  }
};
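// Illustrative note on the mask arithmetic above: for width == 8, the thread
// with flat id 13 belongs to sub-warp 13 / 8 == 1, so the mask becomes
// ((1 << 8) - 1) << 8 == 0x0000ff00, i.e. exactly the eight lanes of that
// sub-warp. The width == 32 special case avoids the undefined behavior of
// shifting a 32-bit literal by 32.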
template <class FunctorType>
struct CudaReductionsFunctor<FunctorType, false, false> {
  using pointer_type = typename FunctorType::pointer_type;
  using Scalar       = typename FunctorType::value_type;

  __device__ static inline void scalar_intra_warp_reduction(
      const FunctorType& functor,
      Scalar* value,           // Contribution
      const bool skip_vector,  // Skip threads if Kokkos vector lanes are not
                               // part of the reduction
      const int width)         // How much of the warp participates
  {
    unsigned mask =
        width == 32
            ? 0xffffffff
            : ((1 << width) - 1)
                  << ((threadIdx.y * blockDim.x + threadIdx.x) / width) *
                         width;
    const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;

    __syncwarp(mask);

    for (int delta = skip_vector ? blockDim.x : 1; delta < width;
         delta *= 2) {
      if ((lane_id + delta < 32) && (lane_id % (delta * 2) == 0)) {
        functor.join(value, value + delta);
      }
      __syncwarp(mask);
    }

    // Broadcast the reduced value from lane 0 across the warp
    if (lane_id != 0) {
      *value = *(value - lane_id);
    }
  }

  __device__ static inline void scalar_intra_block_reduction(
      const FunctorType& functor, Scalar value, const bool skip,
      Scalar* result, const int /*shared_elements*/,
      Scalar* shared_team_buffer_element) {
    const int warp_id = (threadIdx.y * blockDim.x) / 32;
    Scalar* const my_shared_team_buffer_element =
        shared_team_buffer_element + threadIdx.y * blockDim.x + threadIdx.x;
    *my_shared_team_buffer_element = value;
    // Warp-level reduction, ignoring Kokkos vector entries
    scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, skip,
                                32);
    // Wait for every warp to be done before using one warp to do the final
    // cross-warp reduction
    __syncthreads();
    if (warp_id == 0) {
      const unsigned int delta =
          (threadIdx.y * blockDim.x + threadIdx.x) * 32;
      if (delta < blockDim.x * blockDim.y)
        *my_shared_team_buffer_element = shared_team_buffer_element[delta];
      __syncwarp(0xffffffff);
      scalar_intra_warp_reduction(functor, my_shared_team_buffer_element,
                                  false, blockDim.x * blockDim.y / 32);
      if (threadIdx.x + threadIdx.y == 0)
        *result = *shared_team_buffer_element;
    }
  }

  template <class SizeType>
  __device__ static inline bool scalar_inter_block_reduction(
      const FunctorType& functor, const Cuda::size_type /*block_id*/,
      const Cuda::size_type block_count, SizeType* const shared_data,
      SizeType* const global_data, Cuda::size_type* const global_flags) {
    Scalar* const global_team_buffer_element = ((Scalar*)global_data);
    Scalar* const my_global_team_buffer_element =
        global_team_buffer_element + blockIdx.x;
    Scalar* shared_team_buffer_elements = ((Scalar*)shared_data);
    Scalar value        = shared_team_buffer_elements[threadIdx.y];
    int shared_elements = blockDim.x * blockDim.y / 32;
    int global_elements = block_count;
    __syncthreads();

    scalar_intra_block_reduction(functor, value, true,
                                 my_global_team_buffer_element,
                                 shared_elements,
                                 shared_team_buffer_elements);
    __threadfence();
    __syncthreads();

    unsigned int num_teams_done = 0;
    // The cast in the atomic call is necessary to find a matching overload
    // with MSVC/NVCC
    if (threadIdx.x + threadIdx.y == 0) {
      num_teams_done =
          Kokkos::atomic_fetch_add(global_flags,
                                   static_cast<unsigned int>(1)) +
          1;
    }
    bool is_last_block = false;
    if (__syncthreads_or(num_teams_done == gridDim.x)) {
      is_last_block = true;
      *global_flags = 0;
      functor.init(&value);
      for (int i = threadIdx.y * blockDim.x + threadIdx.x;
           i < global_elements; i += blockDim.x * blockDim.y) {
        functor.join(&value, &global_team_buffer_element[i]);
      }
      scalar_intra_block_reduction(
          functor, value, false,
          shared_team_buffer_elements + (blockDim.y - 1), shared_elements,
          shared_team_buffer_elements);
    }
    return is_last_block;
  }
};
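// Note: both specializations implement the same block/grid reduction
// protocol. The <FunctorType, false, true> variant above moves partials with
// warp shuffles and stages only one value per warp in shared memory; the
// <FunctorType, false, false> variant stages one value per thread in shared
// memory and reduces in place. cuda_single_inter_block_reduce_scan further
// below picks between them based on sizeof(value_type).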
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
/*
 *  Algorithmic constraints:
 *   (a) blockDim.y <= 1024
 *   (b) blockDim.x == blockDim.z == 1
 */

template <bool DoScan, class FunctorType>
__device__ void cuda_intra_block_reduce_scan(
    const FunctorType& functor,
    const typename FunctorType::pointer_type base_data) {
  using pointer_type = typename FunctorType::pointer_type;

  const unsigned value_count = functor.length();
  const unsigned not_less_power_of_two =
      (1 << (Impl::int_log2(blockDim.y - 1) + 1));
  const unsigned BlockSizeMask = not_less_power_of_two - 1;
  // There is at most one warp that is neither completely full nor empty.
  // For that warp, we shift all indices logically to the end and ignore join
  // operations with unassigned indices in the warp when performing the
  // intra-warp reduction/scan.
  const bool is_full_warp =
      (((threadIdx.y >> CudaTraits::WarpIndexShift) + 1)
       << CudaTraits::WarpIndexShift) <= blockDim.y;

  const unsigned mapped_idx =
      threadIdx.y + (is_full_warp ? 0
                                  : (not_less_power_of_two - blockDim.y) &
                                        (CudaTraits::WarpSize - 1));
  const pointer_type tdata_intra = base_data + value_count * threadIdx.y;
  const pointer_type warp_start =
      base_data +
      value_count * ((threadIdx.y >> CudaTraits::WarpIndexShift)
                     << CudaTraits::WarpIndexShift);

  auto block_reduce_step = [&functor, value_count](
                               int const R, pointer_type const TD,
                               int const S, pointer_type memory_start,
                               int index_shift) {
    const auto join_ptr = TD - (value_count << S) + value_count * index_shift;
    if (((R + 1) & ((1 << (S + 1)) - 1)) == 0 && join_ptr >= memory_start) {
      functor.join(TD, join_ptr);
    }
  };

  auto block_scan_step = [&functor, value_count](
                             int const R, pointer_type const TD, int const S,
                             pointer_type memory_start, int index_shift) {
    const auto N        = (1 << (S + 1));
    const auto join_ptr = TD - (value_count << S) + value_count * index_shift;
    if (R >= N && ((R + 1) & (N - 1)) == (N >> 1) &&
        join_ptr >= memory_start) {
      functor.join(TD, join_ptr);
    }
  };

  {  // Intra-warp reduction:
    __syncwarp(0xffffffff);
    block_reduce_step(mapped_idx, tdata_intra, 0, warp_start, 0);
    __syncwarp(0xffffffff);
    block_reduce_step(mapped_idx, tdata_intra, 1, warp_start, 0);
    __syncwarp(0xffffffff);
    block_reduce_step(mapped_idx, tdata_intra, 2, warp_start, 0);
    __syncwarp(0xffffffff);
    block_reduce_step(mapped_idx, tdata_intra, 3, warp_start, 0);
    __syncwarp(0xffffffff);
    block_reduce_step(mapped_idx, tdata_intra, 4, warp_start, 0);
    __syncwarp(0xffffffff);
  }

  __syncthreads();  // Wait for all warps to reduce

  // Inter-warp reduce-scan by a single warp to avoid extra synchronizations.
  {
    // There is at most one warp where the memory address to be used is not
    // (CudaTraits::WarpSize - 1) away from the warp start address. For the
    // following reduction, we shift all indices logically to the end of the
    // next power of two above the number of warps.
    const unsigned n_active_warps =
        ((blockDim.y - 1) >> CudaTraits::WarpIndexShift) + 1;
    const unsigned inner_mask =
        __ballot_sync(0xffffffff, (threadIdx.y < n_active_warps));
    if (threadIdx.y < n_active_warps) {
      const bool is_full_warp_inter =
          threadIdx.y < (blockDim.y >> CudaTraits::WarpIndexShift);
      const pointer_type tdata_inter =
          base_data +
          value_count *
              (is_full_warp_inter
                   ? (threadIdx.y << CudaTraits::WarpIndexShift) +
                         (CudaTraits::WarpSize - 1)
                   : blockDim.y - 1);
      const unsigned index_shift =
          is_full_warp_inter
              ? 0
              : blockDim.y - (threadIdx.y << CudaTraits::WarpIndexShift);
      const int rtid_inter = (threadIdx.y << CudaTraits::WarpIndexShift) +
                             (CudaTraits::WarpSize - 1) - index_shift;

      if ((1 << 5) < BlockSizeMask) {
        __syncwarp(inner_mask);
        block_reduce_step(rtid_inter, tdata_inter, 5, base_data, index_shift);
      }
      if ((1 << 6) < BlockSizeMask) {
        __syncwarp(inner_mask);
        block_reduce_step(rtid_inter, tdata_inter, 6, base_data, index_shift);
      }
      if ((1 << 7) < BlockSizeMask) {
        __syncwarp(inner_mask);
        block_reduce_step(rtid_inter, tdata_inter, 7, base_data, index_shift);
      }
      if ((1 << 8) < BlockSizeMask) {
        __syncwarp(inner_mask);
        block_reduce_step(rtid_inter, tdata_inter, 8, base_data, index_shift);
      }
      if ((1 << 9) < BlockSizeMask) {
        __syncwarp(inner_mask);
        block_reduce_step(rtid_inter, tdata_inter, 9, base_data, index_shift);
      }

      if (DoScan) {
        __syncwarp(inner_mask);
        block_scan_step(rtid_inter, tdata_inter, 8, base_data, index_shift);
        __syncwarp(inner_mask);
        block_scan_step(rtid_inter, tdata_inter, 7, base_data, index_shift);
        __syncwarp(inner_mask);
        block_scan_step(rtid_inter, tdata_inter, 6, base_data, index_shift);
        __syncwarp(inner_mask);
        block_scan_step(rtid_inter, tdata_inter, 5, base_data, index_shift);
      }
    }
  }

  __syncthreads();  // Wait for inter-warp reduce-scan to complete

  if (DoScan) {
    block_scan_step(mapped_idx, tdata_intra, 4, warp_start, 0);
    __threadfence_block();
    __syncwarp(0xffffffff);
    block_scan_step(mapped_idx, tdata_intra, 3, warp_start, 0);
    __threadfence_block();
    __syncwarp(0xffffffff);
    block_scan_step(mapped_idx, tdata_intra, 2, warp_start, 0);
    __threadfence_block();
    __syncwarp(0xffffffff);
    block_scan_step(mapped_idx, tdata_intra, 1, warp_start, 0);
    __threadfence_block();
    __syncwarp(0xffffffff);
    block_scan_step(mapped_idx, tdata_intra, 0, warp_start, 0);
    __threadfence_block();
    __syncwarp(0xffffffff);
    // Update with total from previous warps
    if (mapped_idx >= CudaTraits::WarpSize &&
        (mapped_idx & (CudaTraits::WarpSize - 1)) !=
            (CudaTraits::WarpSize - 1))
      functor.join(tdata_intra, warp_start - value_count);
    __syncwarp(0xffffffff);
  }
}
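// Illustrative note: for a power-of-two blockDim.y the S = 0..4 reduce steps
// above realize the usual binary fan-in inside each warp. With 8 entries
// a b c d e f g h, after S = 0 the odd slots hold pair sums (slot 1 = a+b,
// slot 3 = c+d, ...), after S = 1 slots 3 and 7 hold sums of four, and after
// S = 2 slot 7 holds the whole segment total. When DoScan is set, the scan
// steps replay this tree in reverse order to turn the partial sums into an
// inclusive scan.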
//----------------------------------------------------------------------------
/**\brief  Input value-per-thread starting at 'shared_data'.
 *         Reduction value at last thread's location.
 *
 *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
 *
 *  Global reduce result is in the last threads' 'shared_data' location.
 */
template <bool DoScan, class FunctorType, class SizeType>
__device__ bool cuda_single_inter_block_reduce_scan2(
    const FunctorType& functor, const Cuda::size_type block_id,
    const Cuda::size_type block_count, SizeType* const shared_data,
    SizeType* const global_data, Cuda::size_type* const global_flags) {
  using size_type    = SizeType;
  using value_type   = typename FunctorType::value_type;
  using pointer_type = typename FunctorType::pointer_type;

  // '__ffs' = position of the least significant bit set to 1.
  // 'blockDim.y' is guaranteed to be a power of two so this
  // is the integral shift value that can replace an integral divide.
  const unsigned BlockSizeShift = __ffs(blockDim.y) - 1;
  const unsigned BlockSizeMask  = blockDim.y - 1;

  // Must have power-of-two thread count
  if (BlockSizeMask & blockDim.y) {
    Kokkos::abort(
        "Cuda::cuda_single_inter_block_reduce_scan requires power-of-two "
        "blockDim");
  }

  const integral_nonzero_constant<
      size_type, std::is_pointer<typename FunctorType::reference_type>::value
                     ? 0
                     : sizeof(value_type) / sizeof(size_type)>
      word_count((sizeof(value_type) * functor.length()) /
                 sizeof(size_type));

  // Reduce the accumulation for the entire block.
  cuda_intra_block_reduce_scan<false>(functor, pointer_type(shared_data));

  {
    // Write accumulation total to global scratch space.
    // Accumulation total is the last thread's data.
    size_type* const shared = shared_data + word_count.value * BlockSizeMask;
    size_type* const global = global_data + word_count.value * block_id;

    for (int i = int(threadIdx.y); i < int(word_count.value);
         i += int(blockDim.y)) {
      global[i] = shared[i];
    }
  }
  __threadfence();

  // Contributing blocks note that their contribution has been completed via
  // an atomic-increment flag. If this block is not the last block to
  // contribute to this group then the block is done.
  const bool is_last_block = !__syncthreads_or(
      threadIdx.y
          ? 0
          : (1 + atomicInc(global_flags, block_count - 1) < block_count));

  if (is_last_block) {
    const size_type b =
        (long(block_count) * long(threadIdx.y)) >> BlockSizeShift;
    const size_type e =
        (long(block_count) * long(threadIdx.y + 1)) >> BlockSizeShift;

    {
      void* const shared_ptr = shared_data + word_count.value * threadIdx.y;
      /* reference_type shared_value = */ functor.init(
          static_cast<pointer_type>(shared_ptr));

      for (size_type i = b; i < e; ++i) {
        functor.join(static_cast<pointer_type>(shared_ptr),
                     reinterpret_cast<pointer_type>(global_data +
                                                    word_count.value * i));
      }
    }

    cuda_intra_block_reduce_scan<DoScan>(functor, pointer_type(shared_data));

    if (DoScan) {
      pointer_type const shared_value = reinterpret_cast<pointer_type>(
          shared_data +
          word_count.value * (threadIdx.y ? threadIdx.y - 1 : blockDim.y));

      if (!threadIdx.y) {
        functor.init(shared_value);
      }

      // Join previous inclusive scan value to each member
      for (size_type i = b; i < e; ++i) {
        size_type* const global_value = global_data + word_count.value * i;
        functor.join(shared_value,
                     reinterpret_cast<pointer_type>(global_value));
        functor.copy(reinterpret_cast<pointer_type>(global_value),
                     reinterpret_cast<pointer_type>(shared_value));
      }
    }
  }

  return is_last_block;
}
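// Illustrative note on the [b, e) partitioning above: with block_count == 100
// and blockDim.y == 64, thread y processes global entries
// [100 * y >> 6, 100 * (y + 1) >> 6), e.g. thread 0 gets [0, 1) and
// thread 63 gets [98, 100), so all 100 block contributions are covered
// exactly once without any remainder handling.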
template <bool DoScan, class FunctorType, class SizeType>
__device__ bool cuda_single_inter_block_reduce_scan(
    const FunctorType& functor, const Cuda::size_type block_id,
    const Cuda::size_type block_count, SizeType* const shared_data,
    SizeType* const global_data, Cuda::size_type* const global_flags) {
  if (!DoScan &&
      !std::is_pointer<typename FunctorType::reference_type>::value)
    return Kokkos::Impl::CudaReductionsFunctor<
        FunctorType, false,
        (sizeof(typename FunctorType::value_type) > 16)>::
        scalar_inter_block_reduction(functor, block_id, block_count,
                                     shared_data, global_data, global_flags);
  else
    return cuda_single_inter_block_reduce_scan2<DoScan>(
        functor, block_id, block_count, shared_data, global_data,
        global_flags);
}

// Size in bytes required for inter-block reduce or scan
template <bool DoScan, class FunctorType, class ValueType>
inline std::enable_if_t<DoScan, unsigned>
cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
                                          const unsigned BlockSize) {
  using Analysis =
      Impl::FunctorAnalysis<FunctorPatternInterface::SCAN, RangePolicy<Cuda>,
                            FunctorType, ValueType>;

  return (BlockSize + 2) * Analysis::value_size(functor);
}

template <bool DoScan, class FunctorType, class ValueType>
inline std::enable_if_t<!DoScan, unsigned>
cuda_single_inter_block_reduce_scan_shmem(const FunctorType& functor,
                                          const unsigned BlockSize) {
  using Analysis =
      Impl::FunctorAnalysis<FunctorPatternInterface::REDUCE,
                            RangePolicy<Cuda>, FunctorType, ValueType>;

  return (BlockSize + 2) * Analysis::value_size(functor);
}

template <class Policy, class FunctorType, class ValueType>
inline void check_reduced_view_shmem_size(const Policy& policy,
                                          const FunctorType& functor) {
  size_t minBlockSize = CudaTraits::WarpSize * 1;
  unsigned reqShmemSize =
      cuda_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                ValueType>(functor,
                                                           minBlockSize);
  size_t maxShmemPerBlock =
      policy.space().cuda_device_prop().sharedMemPerBlock;

  if (reqShmemSize > maxShmemPerBlock) {
    Kokkos::Impl::throw_runtime_exception(
        "Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch "
        "memory");
  }
}
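// Illustrative note: the sizing formula reserves (BlockSize + 2) value slots.
// At the minimum block size of one warp used by check_reduced_view_shmem_size
// above, a hypothetical 24-byte value_type would request
// (32 + 2) * 24 == 816 bytes of shared memory.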
}  // namespace Impl
}  // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined(KOKKOS_ENABLE_CUDA) */
#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */