//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #ifndef KOKKOS_HIP_KERNEL_LAUNCH_HPP #define KOKKOS_HIP_KERNEL_LAUNCH_HPP #include #if defined(__HIPCC__) #include #include #include #if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) #define KOKKOS_IMPL_HIP_GRAPH_ENABLED #endif #ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED #include #include #endif // Must use global variable on the device with HIP-Clang #ifdef __HIP__ #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE __device__ __constant__ extern unsigned long kokkos_impl_hip_constant_memory_buffer[]; #else __device__ __constant__ unsigned long kokkos_impl_hip_constant_memory_buffer [Kokkos::Impl::HIPTraits::ConstantMemoryUsage / sizeof(unsigned long)]; #endif #endif namespace Kokkos { template inline __device__ T *kokkos_impl_hip_shared_memory() { extern __shared__ Kokkos::HIPSpace::size_type sh[]; return (T *)sh; } } // namespace Kokkos namespace Kokkos { namespace Impl { // The hip_parallel_launch_*_memory code is identical to the cuda code template __global__ static void hip_parallel_launch_constant_memory() { const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); driver(); } template __global__ __launch_bounds__( maxTperB, minBperSM) static void hip_parallel_launch_constant_memory() { const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); driver(); } template __global__ static void hip_parallel_launch_local_memory( const DriverType driver) { driver(); } template __global__ __launch_bounds__( maxTperB, minBperSM) static void hip_parallel_launch_local_memory(const DriverType driver) { driver(); } template __global__ static void hip_parallel_launch_global_memory( const DriverType *driver) { driver->operator()(); } template __global__ __launch_bounds__( maxTperB, minBperSM) static void hip_parallel_launch_global_memory(const DriverType *driver) { driver->operator()(); } enum class HIPLaunchMechanism : unsigned { Default = 0, ConstantMemory = 1, GlobalMemory = 2, LocalMemory = 4 }; constexpr inline HIPLaunchMechanism operator|(HIPLaunchMechanism p1, HIPLaunchMechanism p2) { return static_cast(static_cast(p1) | static_cast(p2)); } constexpr inline HIPLaunchMechanism operator&(HIPLaunchMechanism p1, HIPLaunchMechanism p2) { return static_cast(static_cast(p1) & static_cast(p2)); } template struct HIPDispatchProperties { HIPLaunchMechanism launch_mechanism = l; }; // Use local memory up to ConstantMemoryUseThreshold // Use global memory above ConstantMemoryUsage // In between use ConstantMemory // The following code is identical to the cuda code template struct DeduceHIPLaunchMechanism { static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight; static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight; static constexpr Kokkos::Experimental::WorkItemProperty:: ImplForceGlobalLaunch_t force_global_launch = Kokkos::Experimental::WorkItemProperty::ImplForceGlobalLaunch; static constexpr typename DriverType::Policy::work_item_property property = typename DriverType::Policy::work_item_property(); static constexpr HIPLaunchMechanism valid_launch_mechanism = // BuildValidMask (sizeof(DriverType) < HIPTraits::KernelArgumentLimit ? HIPLaunchMechanism::LocalMemory : HIPLaunchMechanism::Default) | (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage ? HIPLaunchMechanism::ConstantMemory : HIPLaunchMechanism::Default) | HIPLaunchMechanism::GlobalMemory; static constexpr HIPLaunchMechanism requested_launch_mechanism = (((property & light_weight) == light_weight) ? HIPLaunchMechanism::LocalMemory : HIPLaunchMechanism::ConstantMemory) | HIPLaunchMechanism::GlobalMemory; static constexpr HIPLaunchMechanism default_launch_mechanism = // BuildValidMask (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold) ? HIPLaunchMechanism::LocalMemory : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage) ? HIPLaunchMechanism::ConstantMemory : HIPLaunchMechanism::GlobalMemory); // None LightWeight HeavyWeight // F struct HIPParallelLaunchKernelFuncData { static unsigned int get_scratch_size( hipFuncAttributes const &hip_func_attributes) { return hip_func_attributes.localSizeBytes; } static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) { static hipFuncAttributes attr = [=]() { hipFuncAttributes attr; KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func)); return attr; }(); return attr; } }; //---------------------------------------------------------------// // Helper function // //---------------------------------------------------------------// inline bool is_empty_launch(dim3 const &grid, dim3 const &block) { return (grid.x == 0) || ((block.x * block.y * block.z) == 0); } //---------------------------------------------------------------// // HIPParallelLaunchKernelFunc structure and its specializations // //---------------------------------------------------------------// template struct HIPParallelLaunchKernelFunc; // HIPLaunchMechanism::LocalMemory specializations template struct HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::LocalMemory> { using funcdata_t = HIPParallelLaunchKernelFuncData< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::LocalMemory>; static auto get_kernel_func() { return hip_parallel_launch_local_memory; } static constexpr auto default_launchbounds() { return false; } static auto get_scratch_size() { return funcdata_t::get_scratch_size(get_hip_func_attributes()); } static hipFuncAttributes get_hip_func_attributes() { return funcdata_t::get_hip_func_attributes( reinterpret_cast(get_kernel_func())); } }; template struct HIPParallelLaunchKernelFunc, HIPLaunchMechanism::LocalMemory> { using funcdata_t = HIPParallelLaunchKernelFuncData, HIPLaunchMechanism::LocalMemory>; static auto get_kernel_func() { return HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::LocalMemory>::get_kernel_func(); } static constexpr auto default_launchbounds() { return true; } static auto get_scratch_size() { return funcdata_t::get_scratch_size(get_hip_func_attributes()); } static hipFuncAttributes get_hip_func_attributes() { return funcdata_t::get_hip_func_attributes( reinterpret_cast(get_kernel_func())); } }; // HIPLaunchMechanism::GlobalMemory specializations template struct HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::GlobalMemory> { using funcdata_t = HIPParallelLaunchKernelFuncData< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::GlobalMemory>; static auto get_kernel_func() { return hip_parallel_launch_global_memory; } static constexpr auto default_launchbounds() { return false; } static auto get_scratch_size() { return funcdata_t::get_scratch_size(get_hip_func_attributes()); } static hipFuncAttributes get_hip_func_attributes() { return funcdata_t::get_hip_func_attributes( reinterpret_cast(get_kernel_func())); } }; template struct HIPParallelLaunchKernelFunc, HIPLaunchMechanism::GlobalMemory> { using funcdata_t = HIPParallelLaunchKernelFuncData, HIPLaunchMechanism::GlobalMemory>; static auto get_kernel_func() { return hip_parallel_launch_global_memory; } static constexpr auto default_launchbounds() { return true; } static auto get_scratch_size() { return funcdata_t::get_scratch_size(get_hip_func_attributes()); } static hipFuncAttributes get_hip_func_attributes() { return funcdata_t::get_hip_func_attributes( reinterpret_cast(get_kernel_func())); } }; // HIPLaunchMechanism::ConstantMemory specializations template struct HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::ConstantMemory> { using funcdata_t = HIPParallelLaunchKernelFuncData< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::ConstantMemory>; static auto get_kernel_func() { return hip_parallel_launch_constant_memory; } static constexpr auto default_launchbounds() { return false; } static auto get_scratch_size() { return funcdata_t::get_scratch_size(get_hip_func_attributes()); } static hipFuncAttributes get_hip_func_attributes() { return funcdata_t::get_hip_func_attributes( reinterpret_cast(get_kernel_func())); } }; template struct HIPParallelLaunchKernelFunc, HIPLaunchMechanism::ConstantMemory> { using funcdata_t = HIPParallelLaunchKernelFuncData, HIPLaunchMechanism::ConstantMemory>; static auto get_kernel_func() { return hip_parallel_launch_constant_memory; } static constexpr auto default_launchbounds() { return true; } static auto get_scratch_size() { return funcdata_t::get_scratch_size(get_hip_func_attributes()); } static hipFuncAttributes get_hip_func_attributes() { return funcdata_t::get_hip_func_attributes( reinterpret_cast(get_kernel_func())); } }; //------------------------------------------------------------------// // HIPParallelLaunchKernelInvoker structure and its specializations // //------------------------------------------------------------------// template struct HIPParallelLaunchKernelInvoker; // HIPLaunchMechanism::LocalMemory specialization template struct HIPParallelLaunchKernelInvoker : HIPParallelLaunchKernelFunc { using base_t = HIPParallelLaunchKernelFunc; static void invoke_kernel(DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { (base_t::get_kernel_func())<<m_stream>>>( driver); } #ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const * /*hip_instance*/) { auto const &graph = get_hip_graph_from_kernel(driver); KOKKOS_EXPECTS(graph); auto &graph_node = get_hip_graph_node_from_kernel(driver); // Expect node not yet initialized KOKKOS_EXPECTS(!graph_node); if (!is_empty_launch(grid, block)) { void const *args[] = {&driver}; hipKernelNodeParams params = {}; params.blockDim = block; params.gridDim = grid; params.sharedMemBytes = shmem; params.func = (void *)base_t::get_kernel_func(); params.kernelParams = (void **)args; params.extra = nullptr; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddKernelNode( &graph_node, graph, /* dependencies = */ nullptr, /* numDependencies = */ 0, ¶ms)); } else { // We still need an empty node for the dependency structure KOKKOS_IMPL_HIP_SAFE_CALL( hipGraphAddEmptyNode(&graph_node, graph, /* dependencies = */ nullptr, /* numDependencies = */ 0)); } KOKKOS_ENSURES(graph_node); } #endif }; // HIPLaunchMechanism::GlobalMemory specialization template struct HIPParallelLaunchKernelInvoker : HIPParallelLaunchKernelFunc { using base_t = HIPParallelLaunchKernelFunc; static void invoke_kernel(DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { // Wait until the previous kernel that uses m_scratchFuntor is done std::lock_guard lock(HIPInternal::scratchFunctorMutex); DriverType *driver_ptr = reinterpret_cast( hip_instance->stage_functor_for_execution( reinterpret_cast(&driver), sizeof(DriverType))); (base_t::get_kernel_func())<<m_stream>>>( driver_ptr); } #ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED static void create_parallel_launch_graph_node( DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { auto const &graph = get_hip_graph_from_kernel(driver); KOKKOS_EXPECTS(graph); auto &graph_node = get_hip_graph_node_from_kernel(driver); // Expect node not yet initialized KOKKOS_EXPECTS(!graph_node); if (!Impl::is_empty_launch(grid, block)) { auto *driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, hip_instance->m_stream); void const *args[] = {&driver_ptr}; hipKernelNodeParams params = {}; params.blockDim = block; params.gridDim = grid; params.sharedMemBytes = shmem; params.func = (void *)base_t::get_kernel_func(); params.kernelParams = (void **)args; params.extra = nullptr; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphAddKernelNode( &graph_node, graph, /* dependencies = */ nullptr, /* numDependencies = */ 0, ¶ms)); } else { // We still need an empty node for the dependency structure KOKKOS_IMPL_HIP_SAFE_CALL( hipGraphAddEmptyNode(&graph_node, graph, /* dependencies = */ nullptr, /* numDependencies = */ 0)); } KOKKOS_ENSURES(bool(graph_node)) } #endif }; // HIPLaunchMechanism::ConstantMemory specializations template struct HIPParallelLaunchKernelInvoker : HIPParallelLaunchKernelFunc { using base_t = HIPParallelLaunchKernelFunc; static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage, "Kokkos Error: Requested HIPLaunchConstantMemory with a " "Functor larger than 32kB."); static void invoke_kernel(DriverType const &driver, dim3 const &grid, dim3 const &block, int shmem, HIPInternal const *hip_instance) { // Wait until the previous kernel that uses the constant buffer is done std::lock_guard lock(HIPInternal::constantMemMutex); KOKKOS_IMPL_HIP_SAFE_CALL( hipEventSynchronize(HIPInternal::constantMemReusable)); // Copy functor (synchronously) to staging buffer in pinned host memory unsigned long *staging = hip_instance->constantMemHostStaging; std::memcpy(static_cast(staging), static_cast(&driver), sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync( HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging, sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream)); // Invoke the driver function on the device (base_t:: get_kernel_func())<<m_stream>>>(); // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(HIPInternal::constantMemReusable, hip_instance->m_stream)); } }; //-----------------------------// // HIPParallelLaunch structure // //-----------------------------// template , HIPLaunchMechanism LaunchMechanism = DeduceHIPLaunchMechanism::launch_mechanism> struct HIPParallelLaunch; template struct HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, LaunchMechanism> : HIPParallelLaunchKernelInvoker< DriverType, Kokkos::LaunchBounds, LaunchMechanism> { using base_t = HIPParallelLaunchKernelInvoker< DriverType, Kokkos::LaunchBounds, LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { if (hip_instance->m_maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } desul::ensure_hip_lock_arrays_on_device(); // Invoke the driver function on the device base_t::invoke_kernel(driver, grid, block, shmem, hip_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError()); hip_instance->fence( "Kokkos::Impl::HIParallelLaunch: Debug Only Check for " "Execution Error"); #endif } } }; // convenience method to launch the correct kernel given the launch bounds et // al. template , HIPLaunchMechanism LaunchMechanism = DeduceHIPLaunchMechanism::launch_mechanism, bool DoGraph = DriverType::Policy::is_graph_kernel::value> void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { #ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker; base_t::create_parallel_launch_graph_node(driver, grid, block, shmem, hip_instance); } else #endif { // Regular kernel launch #ifndef KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS HIPParallelLaunch( driver, grid, block, shmem, hip_instance, prefer_shmem); #else if constexpr (!HIPParallelLaunch::default_launchbounds()) { // for user defined, we *always* honor the request HIPParallelLaunch( driver, grid, block, shmem, hip_instance, prefer_shmem); } else { // we can do what we like const unsigned flat_block_size = block.x * block.y * block.z; if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) { // we have to use the large blocksize HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, LaunchMechanism>(driver, grid, block, shmem, hip_instance, prefer_shmem); } else { HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, LaunchMechanism>(driver, grid, block, shmem, hip_instance, prefer_shmem); } } #endif } } } // namespace Impl } // namespace Kokkos #undef KOKKOS_IMPL_HIP_GRAPH_ENABLED #endif #endif