//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOS_OPENACC_PARALLEL_SCAN_RANGE_HPP
#define KOKKOS_OPENACC_PARALLEL_SCAN_RANGE_HPP

#include <OpenACC/Kokkos_OpenACC.hpp>
#include <OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp>
#include <OpenACC/Kokkos_OpenACC_Macros.hpp>
#include <Kokkos_Parallel.hpp>

// Clacc uses an alternative implementation to work around not-yet-implemented
// OpenACC features: Clacc does not fully support private clauses for
// gang-private variables, and the alternative implementation allocates
// the gang-private arrays in GPU global memory using array expansion,
// instead of using the private clause.
/* clang-format off */
#ifdef KOKKOS_COMPILER_CLANG
#define KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(THREADID) \
  element_values[team_id * 2 * chunk_size + THREADID]
#define KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE \
  create(element_values [0:num_elements])
#else
#define KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(THREADID) element_values[THREADID]
#define KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE \
  private(element_values [0:num_elements])
#endif
/* clang-format on */

namespace Kokkos::Impl {

template <class Functor, class... Traits>
class ParallelScanOpenACCBase {
 protected:
  using Policy = Kokkos::RangePolicy<Traits...>;
  using Analysis =
      Kokkos::Impl::FunctorAnalysis<Kokkos::Impl::FunctorPatternInterface::SCAN,
                                    Policy, Functor, void>;
  using PointerType = typename Analysis::pointer_type;
  using ValueType   = typename Analysis::value_type;
  using MemberType  = typename Policy::member_type;
  using IndexType   = typename Policy::index_type;

  Functor m_functor;
  Policy m_policy;
  ValueType* m_result_ptr;
  bool m_result_ptr_device_accessible;

  static constexpr MemberType default_scan_chunk_size = 128;

 public:
  ParallelScanOpenACCBase(Functor const& arg_functor, Policy const& arg_policy,
                          ValueType* arg_result_ptr,
                          bool arg_result_ptr_device_accessible)
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_result_ptr(arg_result_ptr),
        m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {}

  // This function implements the parallel scan algorithm based on the
  // parallel prefix sum algorithm proposed by Hillis and Steele
  // (doi:10.1145/7902.7903), which offers a shorter span and more parallelism
  // but may not be work-efficient.
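  // Worked example of the Hillis-Steele doubling step used below (for
  // illustration only; the values are hypothetical): given per-thread
  // initial values [3, 1, 7, 0, 4, 1, 6, 3] and chunk_size = 8, an
  // inclusive sum scan proceeds as
  //   step_size = 1: [3, 4,  8,  7,  4,  5,  7,  9]  (join neighbor 1 away)
  //   step_size = 2: [3, 4, 11, 11, 12, 12, 11, 14]  (join neighbor 2 away)
  //   step_size = 4: [3, 4, 11, 11, 15, 16, 22, 25]  (join neighbor 4 away)
  // i.e. all prefix values are available after log2(chunk_size) = 3 steps,
  // at the cost of O(n log n) join operations instead of the O(n) of a
  // work-efficient scan.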
  void OpenACCParallelScanRangePolicy(const IndexType begin,
                                      const IndexType end,
                                      IndexType chunk_size,
                                      const int async_arg) const {
    if (chunk_size > 1) {
      if (!Impl::is_integral_power_of_two(chunk_size))
        Kokkos::abort(
            "RangePolicy blocking granularity must be power of two to be used "
            "with OpenACC parallel_scan()");
    } else {
      chunk_size = default_scan_chunk_size;
    }

    const Kokkos::Experimental::Impl::FunctorAdapter<
        Functor, Policy, Kokkos::Experimental::Impl::RoutineClause::seq>
        functor(m_functor);

    const IndexType N        = end - begin;
    const IndexType n_chunks = (N + chunk_size - 1) / chunk_size;
#ifdef KOKKOS_COMPILER_CLANG
    int const num_elements = n_chunks * 2 * chunk_size;
#else
    int const num_elements = 2 * chunk_size;
#endif
    Kokkos::View<ValueType*, Kokkos::Experimental::OpenACCSpace> chunk_values(
        "Kokkos::OpenACCParallelScan::chunk_values", n_chunks);
    Kokkos::View<ValueType*, Kokkos::Experimental::OpenACCSpace> offset_values(
        "Kokkos::OpenACCParallelScan::offset_values", n_chunks);
    Kokkos::View<ValueType, Kokkos::Experimental::OpenACCSpace> m_result_total(
        "Kokkos::OpenACCParallelScan::m_result_total");
    std::unique_ptr<ValueType[]> element_values_owner(
        new ValueType[num_elements]);
    ValueType* element_values = element_values_owner.get();
    typename Analysis::Reducer final_reducer(m_functor);

#pragma acc enter data copyin(functor, final_reducer) \
    copyin(chunk_values, offset_values) async(async_arg)

    /* clang-format off */
    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE present(functor, chunk_values, final_reducer) async(async_arg))
    /* clang-format on */
    for (IndexType team_id = 0; team_id < n_chunks; ++team_id) {
      IndexType current_step = 0;
      IndexType next_step    = 1;
      IndexType temp;
#pragma acc loop vector
      for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
        const IndexType local_offset = team_id * chunk_size;
        const IndexType idx          = local_offset + thread_id;
        ValueType update;
        final_reducer.init(&update);
        if ((idx > 0) && (idx < N)) functor(idx - 1, update, false);
        KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(thread_id) = update;
      }
      for (IndexType step_size = 1; step_size < chunk_size; step_size *= 2) {
#pragma acc loop vector
        for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
          if (thread_id < step_size) {
            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
                                            thread_id) =
                KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(current_step * chunk_size +
                                                thread_id);
          } else {
            ValueType localValue = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
                current_step * chunk_size + thread_id);
            final_reducer.join(&localValue,
                               &KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
                                   current_step * chunk_size + thread_id -
                                   step_size));
            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
                                            thread_id) = localValue;
          }
        }
        temp         = current_step;
        current_step = next_step;
        next_step    = temp;
      }
      chunk_values(team_id) = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
          current_step * chunk_size + chunk_size - 1);
    }

    ValueType tempValue;
#pragma acc parallel loop seq num_gangs(1) num_workers(1) vector_length(1) \
    present(chunk_values, offset_values, final_reducer) async(async_arg)
    for (IndexType team_id = 0; team_id < n_chunks; ++team_id) {
      if (team_id == 0) {
        final_reducer.init(&offset_values(0));
        final_reducer.init(&tempValue);
      } else {
        final_reducer.join(&tempValue, &chunk_values(team_id - 1));
        offset_values(team_id) = tempValue;
      }
    }
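    // At this point chunk_values(t) holds the last element of chunk t's
    // Hillis-Steele scan (over inputs shifted by one, since the first pass
    // invokes functor(idx - 1, ...)), and offset_values holds the exclusive
    // scan of chunk_values. Hypothetical illustration with N = 8,
    // chunk_size = 4, and all input values equal to 1:
    //   chunk_values  = [3, 4]   (thread 0 of chunk 0 contributes identity)
    //   offset_values = [0, 3]
    // The final pass below repeats the per-chunk scan, seeds thread 0 of
    // each chunk with offset_values(team_id), and calls the functor with
    // final_pass == true so it can consume the scanned value.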
    /* clang-format off */
    KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE present(functor, offset_values, final_reducer) copyin(m_result_total) async(async_arg))
    /* clang-format on */
    for (IndexType team_id = 0; team_id < n_chunks; ++team_id) {
      IndexType current_step = 0;
      IndexType next_step    = 1;
      IndexType temp;
#pragma acc loop vector
      for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
        const IndexType local_offset = team_id * chunk_size;
        const IndexType idx          = local_offset + thread_id;
        ValueType update;
        final_reducer.init(&update);
        if (thread_id == 0) {
          final_reducer.join(&update, &offset_values(team_id));
        }
        if ((idx > 0) && (idx < N)) functor(idx - 1, update, false);
        KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(thread_id) = update;
      }
      for (IndexType step_size = 1; step_size < chunk_size; step_size *= 2) {
#pragma acc loop vector
        for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
          if (thread_id < step_size) {
            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
                                            thread_id) =
                KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(current_step * chunk_size +
                                                thread_id);
          } else {
            ValueType localValue = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
                current_step * chunk_size + thread_id);
            final_reducer.join(&localValue,
                               &KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
                                   current_step * chunk_size + thread_id -
                                   step_size));
            KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(next_step * chunk_size +
                                            thread_id) = localValue;
          }
        }
        temp         = current_step;
        current_step = next_step;
        next_step    = temp;
      }
#pragma acc loop vector
      for (IndexType thread_id = 0; thread_id < chunk_size; ++thread_id) {
        const IndexType local_offset = team_id * chunk_size;
        const IndexType idx          = local_offset + thread_id;
        ValueType update = KOKKOS_IMPL_ACC_ACCESS_ELEMENTS(
            current_step * chunk_size + thread_id);
        if (idx < N) functor(idx, update, true);
        if (idx == N - 1) {
          if (m_result_ptr_device_accessible) {
            *m_result_ptr = update;
          } else {
            m_result_total() = update;
          }
        }
      }
    }

    if (!m_result_ptr_device_accessible && m_result_ptr != nullptr) {
      DeepCopy<HostSpace, Kokkos::Experimental::OpenACCSpace,
               Kokkos::Experimental::OpenACC>(m_policy.space(), m_result_ptr,
                                              m_result_total.data(),
                                              sizeof(ValueType));
    }

#pragma acc exit data delete (functor, chunk_values, offset_values, \
                              final_reducer) async(async_arg)
    acc_wait(async_arg);
  }

  void execute() const {
    const IndexType begin = m_policy.begin();
    const IndexType end   = m_policy.end();
    IndexType chunk_size  = m_policy.chunk_size();
    if (end <= begin) {
      if (!m_result_ptr_device_accessible && m_result_ptr != nullptr) {
        *m_result_ptr = 0;
      }
      return;
    }
    int const async_arg = m_policy.space().acc_async_queue();
    OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg);
  }
};

}  // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

template <class Functor, class... Traits>
class Kokkos::Impl::ParallelScan<Functor, Kokkos::RangePolicy<Traits...>,
                                 Kokkos::Experimental::OpenACC>
    : public ParallelScanOpenACCBase<Functor, Traits...> {
  using base_t    = ParallelScanOpenACCBase<Functor, Traits...>;
  using IndexType = typename base_t::IndexType;

 public:
  void execute() const {
    const IndexType begin = base_t::m_policy.begin();
    const IndexType end   = base_t::m_policy.end();
    IndexType chunk_size  = base_t::m_policy.chunk_size();
    int const async_arg   = base_t::m_policy.space().acc_async_queue();
    base_t::OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg);
  }

  ParallelScan(const Functor& arg_functor,
               const typename base_t::Policy& arg_policy)
      : base_t(arg_functor, arg_policy, nullptr, false) {}
};
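// Usage sketch (illustrative only; the view names are hypothetical): a call
// such as
//   Kokkos::View<long*, Kokkos::Experimental::OpenACCSpace> values("values", n);
//   Kokkos::View<long*, Kokkos::Experimental::OpenACCSpace> prefix("prefix", n);
//   long total = 0;
//   Kokkos::parallel_scan(
//       Kokkos::RangePolicy<Kokkos::Experimental::OpenACC>(0, n),
//       KOKKOS_LAMBDA(const int i, long& update, const bool final_pass) {
//         update += values(i);
//         if (final_pass) prefix(i) = update;
//       },
//       total);
// dispatches to ParallelScanWithTotal below; without the trailing result
// argument it dispatches to ParallelScan above.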
template <class FunctorType, class ReturnType, class... Traits>
class Kokkos::Impl::ParallelScanWithTotal<
    FunctorType, Kokkos::RangePolicy<Traits...>, ReturnType,
    Kokkos::Experimental::OpenACC>
    : public ParallelScanOpenACCBase<FunctorType, Traits...> {
  using base_t    = ParallelScanOpenACCBase<FunctorType, Traits...>;
  using IndexType = typename base_t::IndexType;

 public:
  void execute() const {
    const IndexType begin = base_t::m_policy.begin();
    const IndexType end   = base_t::m_policy.end();
    IndexType chunk_size  = base_t::m_policy.chunk_size();
    if (end <= begin) {
      if (!base_t::m_result_ptr_device_accessible &&
          base_t::m_result_ptr != nullptr) {
        *base_t::m_result_ptr = 0;
      }
      return;
    }
    int const async_arg = base_t::m_policy.space().acc_async_queue();
    base_t::OpenACCParallelScanRangePolicy(begin, end, chunk_size, async_arg);
  }

  template <class ViewType>
  ParallelScanWithTotal(const FunctorType& arg_functor,
                        const typename base_t::Policy& arg_policy,
                        const ViewType& arg_result_view)
      : base_t(arg_functor, arg_policy, arg_result_view.data(),
               MemorySpaceAccess<Kokkos::Experimental::OpenACCSpace,
                                 typename ViewType::memory_space>::accessible) {
  }
};

#undef KOKKOS_IMPL_ACC_ACCESS_ELEMENTS
#undef KOKKOS_IMPL_ACC_ELEMENT_VALUES_CLAUSE

#endif