//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #ifndef KOKKOS_IMPL_KOKKOS_TOOLS_GENERIC_HPP #define KOKKOS_IMPL_KOKKOS_TOOLS_GENERIC_HPP #include #include #include #include #include #include namespace Kokkos { namespace Tools { namespace Experimental { namespace Impl { static std::map team_tuners; template using MDRangeTuningMap = std::map>; template static MDRangeTuningMap mdrange_tuners; // For any policies without a tuning implementation, with a reducer template void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&, TagType) {} // For any policies without a tuning implementation, without a reducer template void tune_policy(const size_t, const std::string&, ExecPolicy&, const Functor&, const TagType&) {} /** * Tuning for parallel_fors and parallel_scans is a fairly simple process. * * Tuning for a parallel_reduce turns out to be a little more complicated. * * If you're tuning a reducer, it might be a complex or a simple reducer * (an example of simple would be one where the join is just "+". * * Unfortunately these two paths are very different in terms of which classes * get instantiated. Thankfully, all of this complexity is encoded in the * ReducerType. If it's a "simple" reducer, this will be Kokkos::InvalidType, * otherwise it'll be something else. * * If the type is complex, for the code to be generally right you _must_ * pass an instance of that ReducerType to functions that determine * eligible team sizes. If the type is simple, you can't construct one, * you use the simpler 2-arg formulation of team_size_recommended/max. */ namespace Impl { struct SimpleTeamSizeCalculator { template int get_max_team_size(const Policy& policy, const Functor& functor, const Tag tag) { auto max = policy.team_size_max(functor, tag); return max; } template int get_recommended_team_size(const Policy& policy, const Functor& functor, const Tag tag) { auto max = policy.team_size_recommended(functor, tag); return max; } template int get_mdrange_max_tile_size_product(const Policy& policy, const Functor& functor, const Kokkos::ParallelForTag&) { using exec_space = typename Policy::execution_space; using driver = Kokkos::Impl::ParallelFor; return driver::max_tile_size_product(policy, functor); } template int get_mdrange_max_tile_size_product(const Policy& policy, const Functor& functor, const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; using analysis = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, Functor, void>; using driver = typename Kokkos::Impl::ParallelReduce< Kokkos::Impl::CombinedFunctorReducer, Policy, exec_space>; return driver::max_tile_size_product(policy, functor); } }; // when we have a complex reducer, we need to pass an // instance to team_size_recommended/max. Reducers // aren't default constructible, but they are // constructible from a reference to an // instance of their value_type so we construct // a value_type and temporary reducer here template struct ComplexReducerSizeCalculator { template int get_max_team_size(const Policy& policy, const Functor& functor, const Tag tag) { using value_type = typename ReducerType::value_type; value_type value; ReducerType reducer_example = ReducerType(value); using Analysis = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, value_type>; typename Analysis::Reducer final_reducer(reducer_example); return policy.team_size_max(functor, final_reducer, tag); } template int get_recommended_team_size(const Policy& policy, const Functor& functor, const Tag tag) { using value_type = typename ReducerType::value_type; value_type value; ReducerType reducer_example = ReducerType(value); using Analysis = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, value_type>; typename Analysis::Reducer final_reducer(reducer_example); return policy.team_size_recommended(functor, final_reducer, tag); } template int get_mdrange_max_tile_size_product(const Policy& policy, const Functor& functor, const Kokkos::ParallelReduceTag&) { using exec_space = typename Policy::execution_space; using Analysis = Kokkos::Impl::FunctorAnalysis< Kokkos::Impl::FunctorPatternInterface::REDUCE, Policy, ReducerType, void>; using driver = typename Kokkos::Impl::ParallelReduce< Kokkos::Impl::CombinedFunctorReducer, Policy, exec_space>; return driver::max_tile_size_product(policy, functor); } }; } // namespace Impl template void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy, const Functor& functor, const TagType& tag, const TuningPermissionFunctor& should_tune) { if (should_tune(policy)) { std::string label = label_in; if (label_in.empty()) { using policy_type = std::remove_reference_t; using work_tag = typename policy_type::work_tag; Kokkos::Impl::ParallelConstructName name(label); label = name.get(); } auto tuner_iter = [&]() { auto my_tuner = map.find(label); if (my_tuner == map.end()) { return (map.emplace(label, Tuner(label, policy, functor, tag, Impl::SimpleTeamSizeCalculator{})) .first); } return my_tuner; }(); tuner_iter->second.tune(policy); } } template void generic_tune_policy(const std::string& label_in, Map& map, Policy& policy, const Functor& functor, const TagType& tag, const TuningPermissionFunctor& should_tune) { if (should_tune(policy)) { std::string label = label_in; if (label_in.empty()) { using policy_type = std::remove_reference_t; using work_tag = typename policy_type::work_tag; Kokkos::Impl::ParallelConstructName name(label); label = name.get(); } auto tuner_iter = [&]() { auto my_tuner = map.find(label); if (my_tuner == map.end()) { return (map.emplace( label, Tuner(label, policy, functor, tag, Impl::ComplexReducerSizeCalculator{})) .first); } return my_tuner; }(); tuner_iter->second.tune(policy); } } // tune a TeamPolicy, without reducer template void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, Kokkos::TeamPolicy& policy, const Functor& functor, const TagType& tag) { generic_tune_policy( label_in, team_tuners, policy, functor, tag, [](const Kokkos::TeamPolicy& candidate_policy) { return (candidate_policy.impl_auto_team_size() || candidate_policy.impl_auto_vector_length()); }); } // tune a TeamPolicy, with reducer template void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, Kokkos::TeamPolicy& policy, const Functor& functor, const TagType& tag) { generic_tune_policy( label_in, team_tuners, policy, functor, tag, [](const Kokkos::TeamPolicy& candidate_policy) { return (candidate_policy.impl_auto_team_size() || candidate_policy.impl_auto_vector_length()); }); } // tune a MDRangePolicy, without reducer template void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, Kokkos::MDRangePolicy& policy, const Functor& functor, const TagType& tag) { using Policy = Kokkos::MDRangePolicy; static constexpr int rank = Policy::rank; generic_tune_policy>( label_in, mdrange_tuners, policy, functor, tag, [](const Policy& candidate_policy) { return candidate_policy.impl_tune_tile_size(); }); } // tune a MDRangePolicy, with reducer template void tune_policy(const size_t /**tuning_context*/, const std::string& label_in, Kokkos::MDRangePolicy& policy, const Functor& functor, const TagType& tag) { using Policy = Kokkos::MDRangePolicy; static constexpr int rank = Policy::rank; generic_tune_policy, ReducerType>( label_in, mdrange_tuners, policy, functor, tag, [](const Policy& candidate_policy) { return candidate_policy.impl_tune_tile_size(); }); } template struct ReductionSwitcher { template static void tune(const size_t tuning_context, const std::string& label, ExecPolicy& policy, const Functor& functor, const TagType& tag) { if (Kokkos::tune_internals()) { tune_policy(tuning_context, label, policy, functor, tag); } } }; template <> struct ReductionSwitcher { template static void tune(const size_t tuning_context, const std::string& label, ExecPolicy& policy, const Functor& functor, const TagType& tag) { if (Kokkos::tune_internals()) { tune_policy(tuning_context, label, policy, functor, tag); } } }; template void generic_report_results(const std::string& label_in, Map& map, Policy& policy, const Functor&, const TagType&, const TuningPermissionFunctor& should_tune) { if (should_tune(policy)) { std::string label = label_in; if (label_in.empty()) { using policy_type = std::remove_reference_t; using work_tag = typename policy_type::work_tag; Kokkos::Impl::ParallelConstructName name(label); label = name.get(); } auto tuner_iter = map[label]; tuner_iter.end(); } } // report results for a policy type we don't tune (do nothing) template void report_policy_results(const size_t, const std::string&, ExecPolicy&, const Functor&, const TagType&) {} // report results for a TeamPolicy template void report_policy_results(const size_t /**tuning_context*/, const std::string& label_in, Kokkos::TeamPolicy& policy, const Functor& functor, const TagType& tag) { generic_report_results( label_in, team_tuners, policy, functor, tag, [](const Kokkos::TeamPolicy& candidate_policy) { return (candidate_policy.impl_auto_team_size() || candidate_policy.impl_auto_vector_length()); }); } // report results for an MDRangePolicy template void report_policy_results(const size_t /**tuning_context*/, const std::string& label_in, Kokkos::MDRangePolicy& policy, const Functor& functor, const TagType& tag) { using Policy = Kokkos::MDRangePolicy; static constexpr int rank = Policy::rank; generic_report_results>( label_in, mdrange_tuners, policy, functor, tag, [](const Policy& candidate_policy) { return candidate_policy.impl_tune_tile_size(); }); } } // namespace Impl } // namespace Experimental namespace Impl { template void begin_parallel_for(ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Impl::ParallelConstructName name(label); Kokkos::Tools::beginParallelFor( name.get(), Kokkos::Profiling::Experimental::device_id(policy.space()), &kpID); } #ifdef KOKKOS_ENABLE_TUNING size_t context_id = Kokkos::Tools::Experimental::get_new_context_id(); if (Kokkos::tune_internals()) { Experimental::Impl::tune_policy(context_id, label, policy, functor, Kokkos::ParallelForTag{}); } #else (void)functor; #endif } template void end_parallel_for(ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Tools::endParallelFor(kpID); } #ifdef KOKKOS_ENABLE_TUNING size_t context_id = Kokkos::Tools::Experimental::get_current_context_id(); if (Kokkos::tune_internals()) { Experimental::Impl::report_policy_results( context_id, label, policy, functor, Kokkos::ParallelForTag{}); } #else (void)policy; (void)functor; (void)label; #endif } template void begin_parallel_scan(ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Impl::ParallelConstructName name(label); Kokkos::Tools::beginParallelScan( name.get(), Kokkos::Profiling::Experimental::device_id(policy.space()), &kpID); } #ifdef KOKKOS_ENABLE_TUNING size_t context_id = Kokkos::Tools::Experimental::get_new_context_id(); if (Kokkos::tune_internals()) { Experimental::Impl::tune_policy(context_id, label, policy, functor, Kokkos::ParallelScanTag{}); } #else (void)functor; #endif } template void end_parallel_scan(ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Tools::endParallelScan(kpID); } #ifdef KOKKOS_ENABLE_TUNING size_t context_id = Kokkos::Tools::Experimental::get_current_context_id(); if (Kokkos::tune_internals()) { Experimental::Impl::report_policy_results( context_id, label, policy, functor, Kokkos::ParallelScanTag{}); } #else (void)policy; (void)functor; (void)label; #endif } template void begin_parallel_reduce(ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Impl::ParallelConstructName name(label); Kokkos::Tools::beginParallelReduce( name.get(), Kokkos::Profiling::Experimental::device_id(policy.space()), &kpID); } #ifdef KOKKOS_ENABLE_TUNING size_t context_id = Kokkos::Tools::Experimental::get_new_context_id(); Experimental::Impl::ReductionSwitcher::tune( context_id, label, policy, functor, Kokkos::ParallelReduceTag{}); #else (void)functor; #endif } template void end_parallel_reduce(ExecPolicy& policy, FunctorType& functor, const std::string& label, uint64_t& kpID) { if (Kokkos::Tools::profileLibraryLoaded()) { Kokkos::Tools::endParallelReduce(kpID); } #ifdef KOKKOS_ENABLE_TUNING size_t context_id = Kokkos::Tools::Experimental::get_current_context_id(); if (Kokkos::tune_internals()) { Experimental::Impl::report_policy_results( context_id, label, policy, functor, Kokkos::ParallelReduceTag{}); } #else (void)policy; (void)functor; (void)label; #endif } } // end namespace Impl } // namespace Tools } // namespace Kokkos #endif // header guard