//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif #ifndef KOKKOS_VIEW_ALLOC_HPP #define KOKKOS_VIEW_ALLOC_HPP #include #include #include #include #include #include #include #include namespace Kokkos::Impl { template bool is_zero_byte(const T& x) { constexpr std::byte all_zeroes[sizeof(T)] = {}; return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } //---------------------------------------------------------------------------- /* * The construction, assignment to default, and destruction * are merged into a single functor. * Primarily to work around an unresolved CUDA back-end bug * that would lose the destruction cuda device function when * called from the shared memory tracking destruction. * Secondarily to have two fewer partial specializations. */ template ::value> struct ViewValueFunctor; template struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; struct DestroyTag {}; struct ConstructTag {}; ExecSpace space; ValueType* ptr; size_t n; std::string name; bool default_exec_space; template KOKKOS_INLINE_FUNCTION std::enable_if_t::value> operator()(ConstructTag const&, const size_t i) const { new (ptr + i) ValueType(); } KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, const size_t i) const { (ptr + i)->~ValueType(); } ViewValueFunctor() = default; ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, size_t const arg_n, std::string arg_name) : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)), default_exec_space(false) { functor_instantiate_workaround(); } ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, std::string arg_name) : space(ExecSpace{}), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)), default_exec_space(true) { functor_instantiate_workaround(); } template std::enable_if_t::value && std::is_trivially_copy_assignable::value> construct_dispatch() { ValueType value{}; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) { uint64_t kpID = 0; if (Kokkos::Profiling::profileLibraryLoaded()) { // We are not really using parallel_for here but using beginParallelFor // instead of begin_parallel_for (and adding "via memset") is the best // we can do to indicate that this is not supposed to be tunable (and // doesn't really execute a parallel_for). Kokkos::Profiling::beginParallelFor( "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } (void)ZeroMemset( space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } if (default_exec_space) space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); } else { #endif parallel_for_implementation(); #ifndef KOKKOS_ARCH_A64FX } #endif } template std::enable_if_t::value && std::is_trivially_copy_assignable::value)> construct_dispatch() { parallel_for_implementation(); } template void parallel_for_implementation() { using PolicyType = Kokkos::RangePolicy, Tag>; PolicyType policy(space, 0, n); uint64_t kpID = 0; if (Kokkos::Profiling::profileLibraryLoaded()) { const std::string functor_name = (std::is_same_v ? "Kokkos::View::destruction [" + name + "]" : "Kokkos::View::initialization [" + name + "]"); Kokkos::Profiling::beginParallelFor( functor_name, Kokkos::Profiling::Experimental::device_id(space), &kpID); } #ifdef KOKKOS_ENABLE_CUDA if (std::is_same::value) { Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, true); } #endif const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); if (default_exec_space || std::is_same_v) space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } } void construct_shared_allocation() { construct_dispatch(); } void destroy_shared_allocation() { parallel_for_implementation(); } // This function is to ensure that the functor with DestroyTag is instantiated // This is a workaround to avoid "cudaErrorInvalidDeviceFunction" error later // when the function is queried with cudaFuncGetAttributes void functor_instantiate_workaround() { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) if (false) { parallel_for_implementation(); } #endif } }; template struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; using PolicyType = Kokkos::RangePolicy>; ExecSpace space; ValueType* ptr; size_t n; std::string name; bool default_exec_space; KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const { ptr[i] = ValueType(); } ViewValueFunctor() = default; ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, size_t const arg_n, std::string arg_name) : space(arg_space), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)), default_exec_space(false) {} ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, std::string arg_name) : space(ExecSpace{}), ptr(arg_ptr), n(arg_n), name(std::move(arg_name)), default_exec_space(true) {} template std::enable_if_t::value && std::is_trivially_copy_assignable::value> construct_shared_allocation() { // Shortcut for zero initialization // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX ValueType value{}; if (Impl::is_zero_byte(value)) { uint64_t kpID = 0; if (Kokkos::Profiling::profileLibraryLoaded()) { // We are not really using parallel_for here but using beginParallelFor // instead of begin_parallel_for (and adding "via memset") is the best // we can do to indicate that this is not supposed to be tunable (and // doesn't really execute a parallel_for). Kokkos::Profiling::beginParallelFor( "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } (void)ZeroMemset( space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } if (default_exec_space) space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); } else { #endif parallel_for_implementation(); #ifndef KOKKOS_ARCH_A64FX } #endif } template std::enable_if_t::value && std::is_trivially_copy_assignable::value)> construct_shared_allocation() { parallel_for_implementation(); } void parallel_for_implementation() { PolicyType policy(space, 0, n); uint64_t kpID = 0; if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::beginParallelFor( "Kokkos::View::initialization [" + name + "]", Kokkos::Profiling::Experimental::device_id(space), &kpID); } #ifdef KOKKOS_ENABLE_CUDA if (std::is_same::value) { Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, true); } #endif const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); if (default_exec_space) space.fence( "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " "view"); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } } void destroy_shared_allocation() {} }; } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP