//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

// NOTE(review): in this copy every `#include` directive has no header name
// and every template angle-bracket list (`template <...>`, `Kokkos::View<...>`,
// `std::numeric_limits<...>`, `TestTeamScan<...>`, ...) is absent. It looks
// like the `<...>` text was stripped during extraction -- restore it from the
// upstream file before building; the code below is otherwise left untouched.
#include
#include
#include
#include
#include
#include

namespace Test {

// Functor that exercises a team-level parallel_scan without a return value.
//
// The same object is used in two ways:
//   * operator()(const member_type&) is the device-side team kernel;
//   * operator()(int32_t, int32_t) is the host-side driver that allocates the
//     views, launches the kernel and checks the results.
//
// The body refers to `ExecutionSpace` and `DataType`; presumably these are the
// (stripped) template parameters -- TODO confirm against upstream.
template struct TestTeamScan {
  using execution_space = ExecutionSpace;
  using value_type      = DataType;
  using policy_type     = Kokkos::TeamPolicy;
  using member_type     = typename policy_type::member_type;
  using view_type       = Kokkos::View;

  view_type a_d;  // scan input, indexed as (league rank, i); filled by the kernel
  view_type a_r;  // scan output, indexed as (league rank, i)
  int32_t M = 0;  // first view extent; also used as the league size
  int32_t N = 0;  // second view extent; length of the range each team scans

  // Team kernel: fill row `leagueRank` of a_d, then scan that row into a_r.
  KOKKOS_FUNCTION
  void operator()(const member_type& team) const {
    auto leagueRank = team.league_rank();

    auto beg = 0;
    auto end = N;

    // Input values are leagueRank * N + i for i in [beg, end).
    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(team, beg, end),
        [&](const int i) { a_d(leagueRank, i) = leagueRank * N + i; });

    // `val` is updated before it is stored, so a_r(leagueRank, i) receives the
    // running sum that includes element i. The store only happens when the
    // lambda is invoked with final == true.
    Kokkos::parallel_scan(Kokkos::TeamThreadRange(team, beg, end),
                          [&](int i, DataType& val, const bool final) {
                            val += a_d(leagueRank, i);
                            if (final) a_r(leagueRank, i) = val;
                          });
  }

  // Host driver: run the kernel on an _M x _N problem and compare every
  // element of the scan output with a serial running sum of the input.
  auto operator()(int32_t _M, int32_t _N) {
    // Build an identifier (demangled functor type plus M and N) that is
    // appended to assertion failure messages.
    std::stringstream ss;
    ss << Kokkos::Impl::demangle(typeid(*this).name());
    ss << "(/*M=*/" << _M << ", /*N=*/" << _N << ")";
    std::string const test_id = ss.str();

    M   = _M;
    N   = _N;
    a_d = view_type("a_d", M, N);
    a_r = view_type("a_r", M, N);

    // Set team size explicitly to check whether non-power-of-two team sizes can
    // be used.
    // The team size (127, 3 or 1) is chosen from the concurrency reported by
    // the execution space.
    if (ExecutionSpace().concurrency() > 10000)
      Kokkos::parallel_for(policy_type(M, 127), *this);
    else if (ExecutionSpace().concurrency() > 2)
      Kokkos::parallel_for(policy_type(M, 3), *this);
    else
      Kokkos::parallel_for(policy_type(M, 1), *this);

    // Copy both the kernel-generated input and the scan output into mirror
    // views so they can be read in the verification loop below.
    auto a_i = Kokkos::create_mirror_view(a_d);
    auto a_o = Kokkos::create_mirror_view(a_r);
    Kokkos::deep_copy(a_i, a_d);
    Kokkos::deep_copy(a_o, a_r);

    for (int32_t i = 0; i < M; ++i) {
      value_type scan_ref = 0;  // serial reference running sum for row i
      value_type scan_calc;     // value produced by parallel_scan
      value_type abs_err = 0;   // tolerance used by the non-integral branch

      // Each fp addition is subject to small losses in precision and these
      // compound over the loop, so the tolerance starts at zero and grows by
      // one machine epsilon per element (the first comparison therefore
      // allows exactly one epsilon). For example, with CUDA
      // backend + 32-bit float + large N values (e.g. 1,000) + high
      // thread-counts (e.g. 1024), this test will fail w/o epsilon
      // accommodation
      constexpr value_type epsilon = std::numeric_limits::epsilon();
      for (int32_t j = 0; j < N; ++j) {
        scan_ref += a_i(i, j);
        scan_calc = a_o(i, j);
        if (std::is_integral::value) {
          // Integral types must match the reference exactly.
          ASSERT_EQ(scan_ref, scan_calc)
              << test_id
              << " calculated scan output value differs from reference at "
                 "indices i="
              << i << " and j=" << j;
        } else {
          abs_err += epsilon;
          ASSERT_NEAR(scan_ref, scan_calc, abs_err)
              << test_id
              << " calculated scan output value differs from reference at "
                 "indices i="
              << i << " and j=" << j;
        }
      }
    }
  }
};

// Runs TestTeamScan over a set of (M, N) shapes, including cases where one or
// both extents are zero.
// NOTE(review): the template arguments of each instantiation are missing in
// this copy -- restore them from upstream.
TEST(TEST_CATEGORY, team_scan) {
  TestTeamScan{}(0, 0);
  TestTeamScan{}(0, 1);
  TestTeamScan{}(1, 0);
  TestTeamScan{}(99, 32);
  TestTeamScan{}(139, 64);
  TestTeamScan{}(163, 128);
  TestTeamScan{}(433, 256);
  TestTeamScan{}(976, 512);
  TestTeamScan{}(1234, 1024);
  TestTeamScan{}(2596, 34);
  TestTeamScan{}(2596, 59);
  TestTeamScan{}(2596, 65);
  TestTeamScan{}(2596, 371);
  TestTeamScan{}(2596, 987);
  TestTeamScan{}(2596, 1311);
}

// Temporary: This condition will progressively be reduced when parallel_scan
// with return value will be implemented for more backends.
#if !defined(KOKKOS_ENABLE_OPENACC)
// Functor that exercises the team-level parallel_scan overload taking a
// return-value argument.
//
// operator()(const member_type&) is the device-side team kernel and
// operator()(int32_t, int32_t) is the host-side driver that allocates the
// views, launches the kernel and checks both the per-element scan output and
// the value returned by the scan.
//
// NOTE(review): the template angle-bracket lists (`template <...>`,
// `Kokkos::TeamPolicy<...>`, `Kokkos::View<...>`, `std::numeric_limits<...>`,
// `std::is_integral<...>`, `TestTeamScanRetVal<...>`) are absent in this copy;
// it looks like they were stripped during extraction. The body refers to
// `ExecutionSpace` and `DataType`, presumably the template parameters --
// restore from upstream before building.
template struct TestTeamScanRetVal {
  using execution_space = ExecutionSpace;
  using value_type      = DataType;
  using policy_type     = Kokkos::TeamPolicy;
  using member_type     = typename policy_type::member_type;
  using view_1d_type    = Kokkos::View;
  using view_2d_type    = Kokkos::View;

  view_2d_type a_d;  // scan input, indexed as (league rank, i)
  view_2d_type a_r;  // scan output, indexed as (league rank, i)
  view_1d_type a_s;  // value returned by parallel_scan, one entry per league rank
  int32_t M = 0;     // first view extent; also used as the league size
  int32_t N = 0;     // second view extent; length of the range each team scans

  // Team kernel: fill row `leagueRank` of a_d, scan it into a_r, and store the
  // scan's return value in a_s(leagueRank).
  KOKKOS_FUNCTION
  void operator()(const member_type& team) const {
    auto leagueRank = team.league_rank();

    auto beg = 0;
    auto end = N;

    // Input values are leagueRank * N + i for i in [beg, end).
    Kokkos::parallel_for(
        Kokkos::TeamThreadRange(team, beg, end),
        [&](const int i) { a_d(leagueRank, i) = leagueRank * N + i; });

    // Not initialized here; it is passed to parallel_scan as the return-value
    // argument and read afterwards.
    DataType accum;
    // `val` is updated before it is stored, so a_r(leagueRank, i) receives the
    // running sum that includes element i.
    Kokkos::parallel_scan(
        Kokkos::TeamThreadRange(team, beg, end),
        [&](int i, DataType& val, const bool final) {
          val += a_d(leagueRank, i);
          if (final) a_r(leagueRank, i) = val;
        },
        accum);

    // Save return value from parallel_scan
    Kokkos::single(Kokkos::PerTeam(team), [&]() { a_s(leagueRank) = accum; });
  }

  // Host driver: run the kernel on an _M x _N problem, compare every element
  // of the scan output with a serial running sum of the input, and compare the
  // saved return value with the final running sum of each row.
  auto operator()(int32_t _M, int32_t _N) {
    // Build an identifier (demangled functor type plus M and N) that is
    // appended to the per-element assertion failure messages.
    std::stringstream ss;
    ss << Kokkos::Impl::demangle(typeid(*this).name());
    ss << "(/*M=*/" << _M << ", /*N=*/" << _N << ")";
    std::string const test_id = ss.str();

    M   = _M;
    N   = _N;
    a_d = view_2d_type("a_d", M, N);
    a_r = view_2d_type("a_r", M, N);
    a_s = view_1d_type("a_s", M);

    // Set team size explicitly to check whether non-power-of-two team sizes can
    // be used.
    // The team size (127, 3 or 1) is chosen from the concurrency reported by
    // the execution space.
    if (ExecutionSpace().concurrency() > 10000)
      Kokkos::parallel_for(policy_type(M, 127), *this);
    else if (ExecutionSpace().concurrency() > 2)
      Kokkos::parallel_for(policy_type(M, 3), *this);
    else
      Kokkos::parallel_for(policy_type(M, 1), *this);
    Kokkos::fence();

    // Copy the kernel-generated input, the scan output and the saved return
    // values into mirror views so they can be read in the loop below.
    auto a_i  = Kokkos::create_mirror_view(a_d);
    auto a_o  = Kokkos::create_mirror_view(a_r);
    auto a_os = Kokkos::create_mirror_view(a_s);
    Kokkos::deep_copy(a_i, a_d);
    Kokkos::deep_copy(a_o, a_r);
    Kokkos::deep_copy(a_os, a_s);

    for (int32_t i = 0; i < M; ++i) {
      value_type scan_ref = 0;  // serial reference running sum for row i
      value_type scan_calc;     // value produced by parallel_scan
      value_type abs_err = 0;   // tolerance used by the non-integral branches

      // Each fp addition is subject to small losses in precision and these
      // compound over the loop, so the tolerance starts at zero and grows by
      // one machine epsilon per element (the first comparison therefore
      // allows exactly one epsilon). For example, with CUDA
      // backend + 32-bit float + large N values (e.g. 1,000) + high
      // thread-counts (e.g. 1024), this test will fail w/o epsilon
      // accommodation
      constexpr value_type epsilon = std::numeric_limits::epsilon();
      for (int32_t j = 0; j < N; ++j) {
        scan_ref += a_i(i, j);
        scan_calc = a_o(i, j);
        if (std::is_integral::value) {
          // Integral types must match the reference exactly.
          ASSERT_EQ(scan_ref, scan_calc)
              << test_id
              << " calculated scan output value differs from reference at "
                 "indices i="
              << i << " and j=" << j;
        } else {
          abs_err += epsilon;
          ASSERT_NEAR(scan_ref, scan_calc, abs_err)
              << test_id
              << " calculated scan output value differs from reference at "
                 "indices i="
              << i << " and j=" << j;
        }
      }
      // Validate return value from parallel_scan
      // At this point scan_ref holds the sum of the whole row and abs_err the
      // tolerance accumulated over it; when N == 0 both are still zero.
      if (std::is_integral::value) {
        ASSERT_EQ(scan_ref, a_os(i));
      } else {
        ASSERT_NEAR(scan_ref, a_os(i), abs_err);
      }
    }
  }
};

// Runs TestTeamScanRetVal over a set of (M, N) shapes, including cases where
// one or both extents are zero.
// NOTE(review): the template arguments of each instantiation are missing in
// this copy -- restore them from upstream.
TEST(TEST_CATEGORY, team_scan_ret_val) {
  TestTeamScanRetVal{}(0, 0);
  TestTeamScanRetVal{}(0, 1);
  TestTeamScanRetVal{}(1, 0);
  TestTeamScanRetVal{}(99, 32);
  TestTeamScanRetVal{}(139, 64);
  TestTeamScanRetVal{}(163, 128);
  TestTeamScanRetVal{}(433, 256);
  TestTeamScanRetVal{}(976, 512);
  TestTeamScanRetVal{}(1234, 1024);
  TestTeamScanRetVal{}(2596, 34);
  TestTeamScanRetVal{}(2596, 59);
  TestTeamScanRetVal{}(2596, 65);
  TestTeamScanRetVal{}(2596, 371);
  TestTeamScanRetVal{}(2596, 987);
  TestTeamScanRetVal{}(2596, 1311);
}
#endif

}  // namespace Test