//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

// NOTE(review): in this copy of the file every angle-bracket list appears to
// have been stripped: the #include directives below have no targets,
// `template` has no parameter list, and static_cast / Kokkos::View /
// Kokkos::TeamPolicy / Kokkos::Sum have no arguments. Restore them from the
// upstream file before building; the code tokens below are left exactly as
// found.
#include
#include
#include
#include
#include
#include

namespace TestTeamVectorRange {

// Scalar-like test type: a (re, im) pair plus an integer `dummy` member that
// takes part in every arithmetic and comparison operator defined here.
struct my_complex {
  double re, im;
  int dummy;

  // Default state is (0.0, 0.0, 0).
  KOKKOS_INLINE_FUNCTION
  my_complex() {
    re = 0.0;
    im = 0.0;
    dummy = 0;
  }

  // Member-wise copy.
  KOKKOS_INLINE_FUNCTION
  my_complex(const my_complex& src) {
    re = src.re;
    im = src.im;
    dummy = src.dummy;
  }

  // Member-wise copy assignment.
  KOKKOS_INLINE_FUNCTION
  my_complex& operator=(const my_complex& src) {
    re = src.re;
    im = src.im;
    dummy = src.dummy;
    return *this;
  }

  // Converting constructor: val becomes the real part, im and dummy are zero.
  // Not marked explicit, so a double converts implicitly.
  KOKKOS_INLINE_FUNCTION
  my_complex(const double& val) {
    re = val;
    im = 0.0;
    dummy = 0;
  }

  // Member-wise addition, including dummy.
  KOKKOS_INLINE_FUNCTION
  my_complex& operator+=(const my_complex& src) {
    re += src.re;
    im += src.im;
    dummy += src.dummy;
    return *this;
  }

  // Member-wise sum returned by value.
  // NOTE(review): not const-qualified, so it cannot be invoked on a const
  // left-hand operand.
  KOKKOS_INLINE_FUNCTION
  my_complex operator+(const my_complex& src) {
    my_complex tmp = *this;
    tmp.re += src.re;
    tmp.im += src.im;
    tmp.dummy += src.dummy;
    return tmp;
  }

  // Complex product on (re, im); dummy is multiplied independently.
  // Temporaries are needed because both results read the old re and im.
  KOKKOS_INLINE_FUNCTION
  my_complex& operator*=(const my_complex& src) {
    double re_tmp = re * src.re - im * src.im;
    double im_tmp = re * src.im + im * src.re;
    re = re_tmp;
    im = im_tmp;
    dummy *= src.dummy;
    return *this;
  }

  // Exact member-wise equality (floating-point members compared with ==).
  KOKKOS_INLINE_FUNCTION
  bool operator==(const my_complex& src) const {
    return (re == src.re) && (im == src.im) && (dummy == src.dummy);
  }

  // True if any member differs.
  KOKKOS_INLINE_FUNCTION
  bool operator!=(const my_complex& src) const {
    return (re != src.re) || (im != src.im) || (dummy != src.dummy);
  }

  // True unless this object equals (val, 0, 0).
  KOKKOS_INLINE_FUNCTION
  bool operator!=(const double& val) const {
    return (re != val) || (im != 0) || (dummy != 0);
  }

  // Assignment from int: sets the real part, zeroes im and dummy.
  KOKKOS_INLINE_FUNCTION
  my_complex& operator=(const int& val) {
    re = val;
    im = 0.0;
    dummy = 0;
    return *this;
  }

  // Assignment from double: sets the real part, zeroes im and dummy.
  KOKKOS_INLINE_FUNCTION
  my_complex& operator=(const double& val) {
    re = val;
    im = 0.0;
    dummy = 0;
    return *this;
  }

  // Conversion to double yields the real part only; im and dummy are dropped.
  // NOTE(review): not const-qualified.
  KOKKOS_INLINE_FUNCTION
  operator double() { return re; }
};
}  // namespace TestTeamVectorRange

namespace Kokkos {
// Explicit specialization of reduction_identity whose sum() and prod() return
// a my_complex built from t_red_ident::sum() / t_red_ident::prod().
// NOTE(review): the specialization argument and the argument of t_red_ident
// are missing in this copy (see the note at the top of the file).
template <>
struct reduction_identity {
  using t_red_ident = reduction_identity;
  KOKKOS_FORCEINLINE_FUNCTION static TestTeamVectorRange::my_complex sum() {
    return TestTeamVectorRange::my_complex(t_red_ident::sum());
  }
  KOKKOS_FORCEINLINE_FUNCTION static TestTeamVectorRange::my_complex prod() {
    return TestTeamVectorRange::my_complex(t_red_ident::prod());
  }
};
}  // namespace Kokkos

namespace TestTeamVectorRange {

// Team functor exercising parallel_for over TeamVectorRange.
//
// Each team zeroes a 131-entry view placed in team scratch memory, adds a
// per-index term to every entry, and then a single thread of the team sums
// the entries and compares the result with a serially recomputed value.
// On mismatch flag() is set to 1.
template
struct functor_teamvector_for {
  using policy_type = Kokkos::TeamPolicy;
  using execution_space = ExecutionSpace;
  // Result flag written by the kernel; stays untouched when the check passes.
  Kokkos::View flag;
  functor_teamvector_for(
      Kokkos::View flag_)
      : flag(flag_) {}

  using shmem_space = typename ExecutionSpace::scratch_memory_space;
  using shared_int = Kokkos::View;
  // Scratch request: room for the 131 entries used in operator(), independent
  // of the team size.
  unsigned team_shmem_size(int /*team_size*/) const {
    return shared_int::shmem_size(131);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(typename policy_type::member_type team) const {
    using size_type = typename shmem_space::size_type;
    const size_type shmemSize = 131;
    shared_int values = shared_int(team.team_shmem(), shmemSize);

    if (values.data() == nullptr || values.extent(0) < shmemSize) {
      // NOTE(review): this branch only prints; flag() is not set, so the
      // host-side check in test_scalar (h_flag() == 0) still reports success.
      Kokkos::printf("FAILED to allocate shared memory of size %u\n",
                     static_cast(shmemSize));
    } else {
      // Initialize shared memory.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 131),
                           [&](int i) { values(i) = 0; });
      // Wait for all memory to be written.
      team.team_barrier();

      // Accumulate value into per thread shared memory.
      // This is non blocking.
      Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 131), [&](int i) {
        values(i) +=
            i - team.league_rank() + team.league_size() + team.team_size();
      });

      // Wait for all memory to be written.
      team.team_barrier();

      // One thread per team executes the comparison.
      Kokkos::single(Kokkos::PerTeam(team), [&]() {
        Scalar test = 0;
        Scalar value = 0;

        // Reference value: the same per-index term, accumulated serially.
        for (int i = 0; i < 131; ++i) {
          test +=
              i - team.league_rank() + team.league_size() + team.team_size();
        }

        // Observed value: sum of what the parallel_for wrote.
        for (int i = 0; i < 131; ++i) {
          value += values(i);
        }

        if (test != value) {
          Kokkos::printf("FAILED teamvector_parallel_for %i %i %lf %lf\n",
                         team.league_rank(), team.team_rank(),
                         static_cast(test), static_cast(value));
          flag() = 1;
        }
      });
    }
  }
};

// Team functor exercising parallel_reduce over TeamVectorRange with plain
// result arguments.
//
// The same 131-term reduction is run twice: once into shared_value(0), a
// one-element view over team_scratch(0), and once into a local variable.
// A single thread of the team then compares both results with a serially
// computed sum and sets flag() to 1 on any mismatch.
template
struct functor_teamvector_reduce {
  using policy_type = Kokkos::TeamPolicy;
  using execution_space = ExecutionSpace;
  // Result flag written by the kernel; stays untouched when the check passes.
  Kokkos::View flag;
  functor_teamvector_reduce(
      Kokkos::View flag_)
      : flag(flag_) {}

  using shmem_space = typename ExecutionSpace::scratch_memory_space;
  using shared_scalar_t = Kokkos::View;
  // Scratch request sized for team_size * 13 entries.
  // NOTE(review): operator() constructs a view of only one element.
  unsigned team_shmem_size(int team_size) const {
    return shared_scalar_t::shmem_size(team_size * 13);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(typename policy_type::member_type team) const {
    Scalar value = Scalar();
    shared_scalar_t shared_value(team.team_scratch(0), 1);

    // First reduction: result goes to the scratch-memory element.
    Kokkos::parallel_reduce(
        Kokkos::TeamVectorRange(team, 131),
        [&](int i, Scalar& val) {
          val += i - team.league_rank() + team.league_size() + team.team_size();
        },
        shared_value(0));

    team.team_barrier();
    // Second reduction: same terms, result goes to the local variable.
    Kokkos::parallel_reduce(
        Kokkos::TeamVectorRange(team, 131),
        [&](int i, Scalar& val) {
          val += i - team.league_rank() + team.league_size() + team.team_size();
        },
        value);

    //    Kokkos::parallel_reduce( Kokkos::TeamVectorRange( team, 131 ), [&] (
    //    int i, Scalar & val )
    //    {
    //      val += i - team.league_rank() + team.league_size() +
    //      team.team_size();
    //    }, shared_value(0) );

    team.team_barrier();

    Kokkos::single(Kokkos::PerTeam(team), [&]() {
      Scalar test = 0;

      // Reference value: the same per-index term, accumulated serially.
      for (int i = 0; i < 131; ++i) {
        test += i - team.league_rank() + team.league_size() + team.team_size();
      }

      if (test != value) {
        // Only league rank 0 prints; every failing team still sets the flag.
        if (team.league_rank() == 0) {
          Kokkos::printf(
              "FAILED teamvector_parallel_reduce %i %i %lf %lf %lu\n",
              (int)team.league_rank(), (int)team.team_rank(),
              static_cast(test), static_cast(value),
              static_cast(sizeof(Scalar)));
        }

        flag() = 1;
      }
      if (test != shared_value(0)) {
        // Only league rank 0 prints; every failing team still sets the flag.
        if (team.league_rank() == 0) {
          Kokkos::printf(
              "FAILED teamvector_parallel_reduce with shared result %i %i %lf "
              "%lf %lu\n",
              static_cast(team.league_rank()),
              static_cast(team.team_rank()), static_cast(test),
              static_cast(shared_value(0)),
              static_cast(sizeof(Scalar)));
        }

        flag() = 1;
      }
    });
  }
};

// Team functor exercising parallel_reduce over TeamVectorRange with
// Kokkos::Sum reducer arguments.
//
// The 131-term reduction is run once into a local variable and once into
// shared_value(0), a one-element view over team_scratch(0); a single thread
// of the team then compares both with a serially computed sum and sets
// flag() to 1 on any mismatch.
template
struct functor_teamvector_reduce_reducer {
  using policy_type = Kokkos::TeamPolicy;
  using execution_space = ExecutionSpace;
  // Result flag written by the kernel; stays untouched when the check passes.
  Kokkos::View flag;
  functor_teamvector_reduce_reducer(
      Kokkos::View flag_)
      : flag(flag_) {}

  using shmem_space = typename ExecutionSpace::scratch_memory_space;
  using shared_scalar_t = Kokkos::View;
  // Scratch request sized for team_size * 13 entries.
  // NOTE(review): operator() constructs a view of only one element.
  unsigned team_shmem_size(int team_size) const {
    return shared_scalar_t::shmem_size(team_size * 13);
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(typename policy_type::member_type team) const {
    Scalar value = 0;
    shared_scalar_t shared_value(team.team_scratch(0), 1);

    // First reduction: reducer wraps the local variable.
    Kokkos::parallel_reduce(
        Kokkos::TeamVectorRange(team, 131),
        [&](int i, Scalar& val) {
          val += i - team.league_rank() + team.league_size() + team.team_size();
        },
        Kokkos::Sum(value));

    // Second reduction: reducer wraps the scratch-memory element.
    // NOTE(review): unlike functor_teamvector_reduce, there is no
    // team_barrier between the two reductions here.
    Kokkos::parallel_reduce(
        Kokkos::TeamVectorRange(team, 131),
        [&](int i, Scalar& val) {
          val += i - team.league_rank() + team.league_size() + team.team_size();
        },
        Kokkos::Sum(shared_value(0)));

    team.team_barrier();

    Kokkos::single(Kokkos::PerTeam(team), [&]() {
      Scalar test = 0;

      // Reference value: the same per-index term, accumulated serially.
      for (int i = 0; i < 131; ++i) {
        test += i - team.league_rank() + team.league_size() + team.team_size();
      }

      if (test != value) {
        Kokkos::printf(
            "FAILED teamvector_parallel_reduce_reducer %i %i %lf %lf\n",
            team.league_rank(), team.team_rank(), static_cast(test),
            static_cast(value));

        flag() = 1;
      }
      if (test != shared_value(0)) {
        Kokkos::printf(
            "FAILED teamvector_parallel_reduce_reducer shared value %i %i %lf "
            "%lf\n",
            team.league_rank(), team.team_rank(), static_cast(test),
            static_cast(shared_value(0)));

        flag() = 1;
      }
    });
  }
};

// Launches one of the three functors above and reports whether it passed.
//
// nteams    - league size passed to the team policy
// team_size - team size passed to the team policy (vector length is 8)
// test      - 0: functor_teamvector_for, 1: functor_teamvector_reduce,
//             2: functor_teamvector_reduce_reducer; any other value launches
//             nothing
//
// Returns true when the flag copied back from the device is still 0.
template
bool test_scalar(int nteams, int team_size, int test) {
  Kokkos::View d_flag("flag");
  typename Kokkos::View::HostMirror h_flag("h_flag");
  // Clear the flag on the host and push it to the device before launching.
  h_flag() = 0;
  Kokkos::deep_copy(d_flag, h_flag);

  Kokkos::TeamPolicy policy(nteams, team_size, 8);

  // FIXME_OPENMPTARGET - Need to allocate scratch space via set_scratch_space
  // for the OPENMPTARGET backend.
#ifdef KOKKOS_ENABLE_OPENMPTARGET
  using scratch_t = Kokkos::View >;

  int scratch_size = 0;
  if (test == 0) {
    scratch_size = scratch_t::shmem_size(131);
  } else {
    // FIXME_OPENMPTARGET - Currently allocating more than one team for nested
    // reduction leads to runtime errors of illegal memory access, caused mostly
    // due to the OpenMP memory allocation constraints.
    policy = Kokkos::TeamPolicy(1, team_size, 8);
    scratch_size = scratch_t::shmem_size(1);
  }

  policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
#endif

  if (test == 0) {
    Kokkos::parallel_for(
        "Test::TeamVectorFor", policy,
        functor_teamvector_for(d_flag));
  } else if (test == 1) {
    Kokkos::parallel_for(
        "Test::TeamVectorReduce", policy,
        functor_teamvector_reduce(d_flag));
  } else if (test == 2) {
    // NOTE(review): this branch constructs a fresh TeamPolicy instead of
    // reusing `policy`, so any adjustment made to `policy` above does not
    // apply here.
    Kokkos::parallel_for(
        "Test::TeamVectorReduceReducer",
        Kokkos::TeamPolicy(nteams, team_size, 8),
        functor_teamvector_reduce_reducer(d_flag));
  }

  Kokkos::deep_copy(h_flag, d_flag);

  return (h_flag() == 0);
}

// Runs test_scalar for the selected `test` variant with 317 teams and a team
// size of 33 (31 when KOKKOS_ENABLE_SYCL is defined), clamped to the
// execution space's concurrency.
//
// Returns true only if every test_scalar call returned true. Because of the
// `passed && ...` form, calls after the first failure are not executed.
// NOTE(review): the test_scalar calls below are textually identical in this
// copy; presumably each originally named a different scalar type - confirm
// against upstream.
template
bool Test(int test) {
  bool passed = true;

// With SYCL 33*8 exceeds the maximum work group size
#ifdef KOKKOS_ENABLE_SYCL
  int team_size = 31;
#else
  int team_size = 33;
#endif
  int const concurrency = ExecutionSpace().concurrency();
  if (team_size > concurrency) team_size = concurrency;
  passed = passed && test_scalar(317, team_size, test);
  passed = passed && test_scalar(317, team_size, test);
  passed = passed && test_scalar(317, team_size, test);
  passed = passed && test_scalar(317, team_size, test);
  // FIXME_OPENMPTARGET - Use of custom reducers currently results in runtime
  // memory errors.
#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
  passed = passed && test_scalar(317, team_size, test);
#endif

  return passed;
}

}  // namespace TestTeamVectorRange

namespace Test {

// Test case: runs variant 0 first; when both KOKKOS_ENABLE_CUDA and
// KOKKOS_COMPILER_NVHPC are defined the remainder may be skipped; otherwise
// variant 1 follows, and variant 2 unless KOKKOS_ENABLE_OPENMPTARGET is
// defined.
TEST(TEST_CATEGORY, team_teamvector_range) {
  ASSERT_TRUE((TestTeamVectorRange::Test(0)));
#if defined(KOKKOS_ENABLE_CUDA) && \
    defined(KOKKOS_COMPILER_NVHPC)  // FIXME_NVHPC 23.7
  if constexpr (std::is_same_v) {
    GTEST_SKIP() << "Disabling 2/3rd of the test for now";
  }
#endif
  ASSERT_TRUE((TestTeamVectorRange::Test(1)));
  // FIXME_OPENMPTARGET - Use of kokkos reducers currently results in runtime
  // memory errors.
#if !defined(KOKKOS_ENABLE_OPENMPTARGET)
  ASSERT_TRUE((TestTeamVectorRange::Test(2)));
#endif
}

}  // namespace Test