//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #ifndef KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP #define KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP #include #if defined(KOKKOS_ENABLE_CUDA) #include #include // GraphAccess needs to be complete // GraphNodeImpl needs to be complete because GraphImpl here is a full // specialization and not just a partial one #include #include #include #include #include namespace Kokkos { namespace Impl { template <> struct GraphImpl { public: using execution_space = Kokkos::Cuda; private: execution_space m_execution_space; cudaGraph_t m_graph = nullptr; cudaGraphExec_t m_graph_exec = nullptr; using cuda_graph_flags_t = unsigned int; using node_details_t = GraphNodeBackendSpecificDetails; void _instantiate_graph() { constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); // TODO @graphs print out errors } public: using root_node_impl_t = GraphNodeImpl; using aggregate_kernel_impl_t = CudaGraphNodeAggregateKernel; using aggregate_node_impl_t = GraphNodeImpl; // Not moveable or copyable; it spends its whole life as a shared_ptr in the // Graph object GraphImpl() = delete; GraphImpl(GraphImpl const&) = delete; GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(bool(m_graph)) if (bool(m_graph_exec)) { KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_exec_destroy_wrapper(m_graph_exec))); } KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_destroy_wrapper(m_graph))); }; explicit GraphImpl(Kokkos::Cuda arg_instance) : m_execution_space(std::move(arg_instance)) { KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_create_wrapper(&m_graph, cuda_graph_flags_t{0}))); } void add_node(std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_add_empty_node_wrapper( &(arg_node_ptr->node_details_t::node), m_graph, /* dependencies = */ nullptr, /* numDependencies = */ 0))); } template // requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in it's policy void add_node(std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(bool(arg_node_ptr)); // The Kernel launch from the execute() method has been shimmed to insert // the node into the graph auto& kernel = arg_node_ptr->get_kernel(); // note: using arg_node_ptr->node_details_t::node caused an ICE in NVCC 10.1 auto& cuda_node = static_cast(arg_node_ptr.get())->node; KOKKOS_EXPECTS(!bool(cuda_node)); kernel.set_cuda_graph_ptr(&m_graph); kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); } template // requires PredecessorRef is a specialization of GraphNodeRef that has // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(bool(arg_node_ptr)) auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); KOKKOS_EXPECTS(bool(pred_ptr)) // clang-format off // NOTE const-qualifiers below are commented out because of an API break // from CUDA 10.0 to CUDA 10.1 // cudaGraphAddDependencies(cudaGraph_t, cudaGraphNode_t*, cudaGraphNode_t*, size_t) // cudaGraphAddDependencies(cudaGraph_t, const cudaGraphNode_t*, const cudaGraphNode_t*, size_t) // clang-format on auto /*const*/& pred_cuda_node = pred_ptr->node_details_t::node; KOKKOS_EXPECTS(bool(pred_cuda_node)) auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node; KOKKOS_EXPECTS(bool(cuda_node)) KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_add_dependencies_wrapper(m_graph, &pred_cuda_node, &cuda_node, 1))); } void submit() { if (!bool(m_graph_exec)) { _instantiate_graph(); } KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_launch_wrapper(m_graph_exec))); } execution_space const& get_execution_space() const noexcept { return m_execution_space; } auto create_root_node_ptr() { KOKKOS_EXPECTS(bool(m_graph)) KOKKOS_EXPECTS(!bool(m_graph_exec)) auto rv = std::make_shared( get_execution_space(), _graph_node_is_root_ctor_tag{}); KOKKOS_IMPL_CUDA_SAFE_CALL( (m_execution_space.impl_internal_space_instance() ->cuda_graph_add_empty_node_wrapper(&(rv->node_details_t::node), m_graph, /* dependencies = */ nullptr, /* numDependencies = */ 0))); KOKKOS_ENSURES(bool(rv->node_details_t::node)) return rv; } template // See requirements/expectations in GraphBuilder auto create_aggregate_ptr(PredecessorRefs&&...) { // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for // each predecessor ref, so all we need to do here is create the (trivial) // aggregate node. return std::make_shared( m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } }; } // end namespace Impl } // end namespace Kokkos #endif // defined(KOKKOS_ENABLE_CUDA) #endif // KOKKOS_KOKKOS_CUDA_GRAPH_IMPL_HPP