//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP
#define KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP

#include <Kokkos_Parallel_Reduce.hpp>
#include <Kokkos_BitManipulation.hpp>
#include <SYCL/Kokkos_SYCL_Instance.hpp>
#include <SYCL/Kokkos_SYCL_WorkgroupReduction.hpp>
#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
#include <vector>
#endif

template <class CombinedFunctorReducerType, class... Traits>
class Kokkos::Impl::ParallelReduce<CombinedFunctorReducerType,
                                   Kokkos::MDRangePolicy<Traits...>,
                                   Kokkos::Experimental::SYCL> {
 public:
  using Policy      = Kokkos::MDRangePolicy<Traits...>;
  using FunctorType = typename CombinedFunctorReducerType::functor_type;
  using ReducerType = typename CombinedFunctorReducerType::reducer_type;

 private:
  using value_type     = typename ReducerType::value_type;
  using pointer_type   = typename ReducerType::pointer_type;
  using reference_type = typename ReducerType::reference_type;

  using WorkTag = typename Policy::work_tag;

  // MDRangePolicy is not trivially copyable. Hence, replicate the data we
  // really need in DeviceIterateTile in a trivially copyable struct.
  struct BarePolicy {
    using index_type = typename Policy::index_type;

    BarePolicy(const Policy& policy)
        : m_lower(policy.m_lower),
          m_upper(policy.m_upper),
          m_tile(policy.m_tile),
          m_tile_end(policy.m_tile_end),
          m_num_tiles(policy.m_num_tiles),
          m_prod_tile_dims(policy.m_prod_tile_dims) {}

    const typename Policy::point_type m_lower;
    const typename Policy::point_type m_upper;
    const typename Policy::tile_type m_tile;
    const typename Policy::point_type m_tile_end;
    const typename Policy::index_type m_num_tiles;
    const typename Policy::index_type m_prod_tile_dims;
    static constexpr Iterate inner_direction = Policy::inner_direction;
    static constexpr int rank                = Policy::rank;
  };

 public:
  // V - View
  template <typename View>
  ParallelReduce(const CombinedFunctorReducerType& f, const Policy& p,
                 const View& v)
      : m_functor_reducer(f),
        m_policy(p),
        m_space(p.space()),
        m_result_ptr(v.data()),
        m_result_ptr_device_accessible(
            MemorySpaceAccess<Kokkos::Experimental::SYCLDeviceUSMSpace,
                              typename View::memory_space>::accessible) {}
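
  // A minimal usage sketch (illustrative, assuming a SYCL-enabled build):
  // this specialization is the one dispatched to when Kokkos::parallel_reduce
  // is called with an MDRangePolicy on the SYCL execution space. N and M are
  // placeholder extents, not defined in this header.
  //
  //   double sum = 0.;
  //   Kokkos::parallel_reduce(
  //       Kokkos::MDRangePolicy<Kokkos::Experimental::SYCL, Kokkos::Rank<2>>(
  //           {0, 0}, {N, M}),
  //       KOKKOS_LAMBDA(int i, int j, double& lsum) { lsum += 1.0; }, sum);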

 private:
  template <typename CombinedFunctorReducerWrapper>
  sycl::event sycl_direct_launch(
      const CombinedFunctorReducerWrapper& functor_reducer_wrapper,
      const sycl::event& memcpy_event) const {
    // Convenience references
    Kokkos::Experimental::Impl::SYCLInternal& instance =
        *m_space.impl_internal_space_instance();
    sycl::queue& q = m_space.sycl_queue();

    const typename Policy::index_type n_tiles = m_policy.m_num_tiles;
    const unsigned int value_count =
        m_functor_reducer.get_reducer().value_count();
    sycl_device_ptr<value_type> results_ptr;
    auto host_result_ptr =
        (m_result_ptr && !m_result_ptr_device_accessible)
            ? static_cast<sycl_host_ptr<value_type>>(
                  instance.scratch_host(sizeof(value_type) * value_count))
            : nullptr;

    sycl::event last_reduction_event;

    desul::ensure_sycl_lock_arrays_on_device(q);

    // If n_tiles==0 we only call init() and final() working with the global
    // scratch memory but don't copy back to m_result_ptr yet.
    if (n_tiles == 0) {
      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
        cgh.depends_on(memcpy_event);
#else
        (void)memcpy_event;
#endif
        results_ptr = static_cast<sycl_device_ptr<value_type>>(
            instance.scratch_space(sizeof(value_type) * value_count));
        auto device_accessible_result_ptr =
            m_result_ptr_device_accessible
                ? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
                : static_cast<sycl::global_ptr<value_type>>(host_result_ptr);
        cgh.single_task([=]() {
          const CombinedFunctorReducerType& functor_reducer =
              functor_reducer_wrapper.get_functor();
          const ReducerType& reducer = functor_reducer.get_reducer();
          reducer.init(results_ptr);
          reducer.final(results_ptr);
          if (device_accessible_result_ptr)
            reducer.copy(device_accessible_result_ptr.get(),
                         results_ptr.get());
        });
      });
#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
      q.ext_oneapi_submit_barrier(
          std::vector<sycl::event>{parallel_reduce_event});
#endif
      last_reduction_event = parallel_reduce_event;
    } else {
      // Otherwise (when n_tiles is not zero), we perform a reduction on the
      // values in all workgroups separately, write the workgroup results back
      // to global memory and recurse until only one workgroup does the
      // reduction and thus gets the final value.
      const int wgroup_size = Kokkos::bit_ceil(
          static_cast<unsigned int>(m_policy.m_prod_tile_dims));

      // FIXME_SYCL Find a better way to determine a good limit for the
      // maximum number of work groups, also see
      // https://github.com/intel/llvm/blob/756ba2616111235bba073e481b7f1c8004b34ee6/sycl/source/detail/reduction.cpp#L51-L62
      size_t max_work_groups =
          2 *
          q.get_device().get_info<sycl::info::device::max_compute_units>();
      int values_per_thread = 1;
      size_t n_wgroups      = n_tiles;
      while (n_wgroups > max_work_groups) {
        values_per_thread *= 2;
        n_wgroups = (n_tiles + values_per_thread - 1) / values_per_thread;
      }
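
      // Worked example (illustrative numbers, not a measurement): for
      // n_tiles = 10000 on a device reporting 1024 compute units,
      // max_work_groups = 2048, so the loop above doubles values_per_thread
      // up to 8 and ends with n_wgroups = ceil(10000 / 8) = 1250 <= 2048,
      // i.e. each work-group accumulates up to 8 tiles via the grid-stride
      // loop below before the tree reduction across work-groups.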

      results_ptr = static_cast<sycl_device_ptr<value_type>>(
          instance.scratch_space(sizeof(value_type) * value_count *
                                 n_wgroups));
      auto device_accessible_result_ptr =
          m_result_ptr_device_accessible
              ? static_cast<sycl::global_ptr<value_type>>(m_result_ptr)
              : static_cast<sycl::global_ptr<value_type>>(host_result_ptr);
      auto scratch_flags = static_cast<sycl_device_ptr<unsigned int>>(
          instance.scratch_flags(sizeof(unsigned int)));

      auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) {
        sycl::local_accessor<value_type> local_mem(
            sycl::range<1>(wgroup_size) * value_count, cgh);
        sycl::local_accessor<unsigned int> num_teams_done(1, cgh);

        const BarePolicy bare_policy = m_policy;

#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
        cgh.depends_on(memcpy_event);
#else
        (void)memcpy_event;
#endif

        // REMEMBER swap local x<->y to be conforming with Cuda/HIP
        // implementation
        cgh.parallel_for(
            sycl::nd_range<1>{n_wgroups * wgroup_size, wgroup_size},
            [=](sycl::nd_item<1> item) {
              const int local_id = item.get_local_linear_id();
              const CombinedFunctorReducerType& functor_reducer =
                  functor_reducer_wrapper.get_functor();
              const FunctorType& functor = functor_reducer.get_functor();
              const ReducerType& reducer = functor_reducer.get_reducer();

              // In the first iteration, we call functor to initialize the
              // local memory. Otherwise, the local memory is initialized
              // with the results from the previous iteration that are
              // stored in global memory.
              using index_type = typename Policy::index_type;

              // SWAPPED here to be conforming with CUDA implementation
              const index_type local_x    = 0;
              const index_type local_y    = item.get_local_id(0);
              const index_type local_z    = 0;
              const index_type global_y   = 0;
              const index_type global_z   = 0;
              const index_type n_global_x = n_tiles;
              const index_type n_global_y = 1;
              const index_type n_global_z = 1;

              if constexpr (!SYCLReduction::use_shuffle_based_algorithm<
                                ReducerType>) {
                reference_type update =
                    reducer.init(&local_mem[local_id * value_count]);

                for (index_type global_x = item.get_group(0);
                     global_x < n_tiles; global_x += item.get_group_range(0))
                  Kokkos::Impl::Reduce::DeviceIterateTile<
                      Policy::rank, BarePolicy, FunctorType,
                      typename Policy::work_tag, reference_type>(
                      bare_policy, functor, update,
                      {n_global_x, n_global_y, n_global_z},
                      {global_x, global_y, global_z},
                      {local_x, local_y, local_z})
                      .exec_range();
                item.barrier(sycl::access::fence_space::local_space);

                SYCLReduction::workgroup_reduction<>(
                    item, local_mem, results_ptr,
                    device_accessible_result_ptr, value_count, reducer, false,
                    wgroup_size);

                if (local_id == 0) {
                  sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
                                   sycl::memory_scope::device,
                                   sycl::access::address_space::global_space>
                      scratch_flags_ref(*scratch_flags);
                  num_teams_done[0] = ++scratch_flags_ref;
                }
                item.barrier(sycl::access::fence_space::local_space);
                if (num_teams_done[0] == n_wgroups) {
                  if (local_id == 0) *scratch_flags = 0;
                  if (local_id >= static_cast<int>(n_wgroups))
                    reducer.init(&local_mem[local_id * value_count]);
                  else {
                    reducer.copy(&local_mem[local_id * value_count],
                                 &results_ptr[local_id * value_count]);
                    for (unsigned int id = local_id + wgroup_size;
                         id < n_wgroups; id += wgroup_size) {
                      reducer.join(&local_mem[local_id * value_count],
                                   &results_ptr[id * value_count]);
                    }
                  }

                  SYCLReduction::workgroup_reduction<>(
                      item, local_mem, results_ptr,
                      device_accessible_result_ptr, value_count, reducer,
                      true, std::min<int>(n_wgroups, wgroup_size));
                }
              } else {
                value_type local_value;
                reference_type update = reducer.init(&local_value);

                for (index_type global_x = item.get_group(0);
                     global_x < n_tiles; global_x += item.get_group_range(0))
                  Kokkos::Impl::Reduce::DeviceIterateTile<
                      Policy::rank, BarePolicy, FunctorType,
                      typename Policy::work_tag, reference_type>(
                      bare_policy, functor, update,
                      {n_global_x, n_global_y, n_global_z},
                      {global_x, global_y, global_z},
                      {local_x, local_y, local_z})
                      .exec_range();

                SYCLReduction::workgroup_reduction<>(
                    item, local_mem, local_value, results_ptr,
                    device_accessible_result_ptr, reducer, false,
                    wgroup_size);

                if (local_id == 0) {
                  sycl::atomic_ref<unsigned, sycl::memory_order::acq_rel,
                                   sycl::memory_scope::device,
                                   sycl::access::address_space::global_space>
                      scratch_flags_ref(*scratch_flags);
                  num_teams_done[0] = ++scratch_flags_ref;
                }
                item.barrier(sycl::access::fence_space::local_space);
                if (num_teams_done[0] == n_wgroups) {
                  if (local_id == 0) *scratch_flags = 0;
                  if (local_id >= static_cast<int>(n_wgroups))
                    reducer.init(&local_value);
                  else {
                    local_value = results_ptr[local_id];
                    for (unsigned int id = local_id + wgroup_size;
                         id < n_wgroups; id += wgroup_size) {
                      reducer.join(&local_value, &results_ptr[id]);
                    }
                  }

                  SYCLReduction::workgroup_reduction<>(
                      item, local_mem, local_value, results_ptr,
                      device_accessible_result_ptr, reducer, true,
                      std::min<int>(n_wgroups, wgroup_size));
                }
              }
            });
      });
#ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES
      q.ext_oneapi_submit_barrier(
          std::vector<sycl::event>{parallel_reduce_event});
#endif
      last_reduction_event = parallel_reduce_event;
    }

    // At this point, the reduced value is written to the entry in
    // results_ptr and all that is left is to copy it back to the given
    // result pointer if necessary.
    // Using DeepCopy instead of fence+memcpy turned out to be up to 2x
    // slower.
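    // The fence below is needed before reading host_result_ptr: it points to
    // USM host scratch memory that the kernel above writes through
    // device_accessible_result_ptr, so the memcpy must not start before the
    // reduction has finished.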
    if (host_result_ptr) {
      m_space.fence(
          "Kokkos::Impl::ParallelReduce::execute: result "
          "not device-accessible");
      std::memcpy(m_result_ptr, host_result_ptr,
                  sizeof(value_type) * value_count);
    }

    return last_reduction_event;
  }

 public:
  template <typename Functor>
  static int max_tile_size_product(const Policy& policy, const Functor&) {
    return policy.space().impl_internal_space_instance()->m_maxWorkgroupSize;
  }

  void execute() const {
    Kokkos::Experimental::Impl::SYCLInternal& instance =
        *m_space.impl_internal_space_instance();

    // Only let one instance at a time resize the instance's scratch memory
    // allocations.
    std::scoped_lock scratch_buffers_lock(instance.m_mutexScratchSpace);

    using IndirectKernelMem =
        Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem;
    IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem();

    auto functor_reducer_wrapper =
        Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer,
                                                       indirectKernelMem);

    sycl::event event = sycl_direct_launch(
        functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event());
    functor_reducer_wrapper.register_event(event);
  }

 private:
  const CombinedFunctorReducerType m_functor_reducer;
  const BarePolicy m_policy;
  const Kokkos::Experimental::SYCL& m_space;
  const pointer_type m_result_ptr;
  const bool m_result_ptr_device_accessible;
};

#endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */