//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <gtest/gtest.h>

#include <cmath>
#include <cstdint>
#include <limits>
#include <string>
#include <type_traits>

#include <Kokkos_Core.hpp>

namespace Test {

// Tag type used to select the tagged join/final/operator() overloads below.
struct ReducerTag {};

template <typename ScalarType, class DeviceType>
class ReduceFunctor {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  struct value_type {
    ScalarType value[3];
  };

  const size_type nwork;

  KOKKOS_INLINE_FUNCTION
  ReduceFunctor(const size_type& arg_nwork) : nwork(arg_nwork) {}

  KOKKOS_INLINE_FUNCTION
  ReduceFunctor(const ReduceFunctor& rhs) : nwork(rhs.nwork) {}

  /*
    KOKKOS_INLINE_FUNCTION
    void init( value_type & dst ) const
    {
      dst.value[0] = 0;
      dst.value[1] = 0;
      dst.value[2] = 0;
    }
  */

  KOKKOS_INLINE_FUNCTION
  void join(value_type& dst, const value_type& src) const {
    dst.value[0] += src.value[0];
    dst.value[1] += src.value[1];
    dst.value[2] += src.value[2];
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(size_type iwork, value_type& dst) const {
    dst.value[0] += 1;
    dst.value[1] += iwork + 1;
    dst.value[2] += nwork - iwork;
  }
};

template <class DeviceType>
class ReduceFunctorFinal : public ReduceFunctor<int64_t, DeviceType> {
 public:
  using value_type = typename ReduceFunctor<int64_t, DeviceType>::value_type;

  KOKKOS_INLINE_FUNCTION
  ReduceFunctorFinal(const size_t n) : ReduceFunctor<int64_t, DeviceType>(n) {}

  KOKKOS_INLINE_FUNCTION
  void final(value_type& dst) const {
    dst.value[0] = -dst.value[0];
    dst.value[1] = -dst.value[1];
    dst.value[2] = -dst.value[2];
  }
};

template <class DeviceType>
class ReduceFunctorFinalTag {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;
  using ScalarType      = int64_t;

  struct value_type {
    ScalarType value[3];
  };

  const size_type nwork;

  KOKKOS_INLINE_FUNCTION
  ReduceFunctorFinalTag(const size_type arg_nwork) : nwork(arg_nwork) {}

  KOKKOS_INLINE_FUNCTION
  void join(const ReducerTag, value_type& dst, const value_type& src) const {
    dst.value[0] += src.value[0];
    dst.value[1] += src.value[1];
    dst.value[2] += src.value[2];
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(const ReducerTag, size_type iwork, value_type& dst) const {
    dst.value[0] -= 1;
    dst.value[1] -= iwork + 1;
    dst.value[2] -= nwork - iwork;
  }

  KOKKOS_INLINE_FUNCTION
  void final(const ReducerTag, value_type& dst) const {
    ++dst.value[0];
    ++dst.value[1];
    ++dst.value[2];
  }
};

template <typename ScalarType, class DeviceType>
class RuntimeReduceFunctor {
 public:
  // Required for functor:
  using execution_space = DeviceType;
  using value_type      = ScalarType[];
  const unsigned value_count;

  // Unit test details:
  using size_type = typename execution_space::size_type;

  const size_type nwork;

  RuntimeReduceFunctor(const size_type arg_nwork, const size_type arg_count)
      : value_count(arg_count), nwork(arg_nwork) {}

  KOKKOS_INLINE_FUNCTION
  void init(ScalarType dst[]) const {
    for (unsigned i = 0; i < value_count; ++i) dst[i] = 0;
  }

  KOKKOS_INLINE_FUNCTION
  void join(ScalarType dst[], const ScalarType src[]) const {
    for (unsigned i = 0; i < value_count; ++i) dst[i] += src[i];
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(size_type iwork, ScalarType dst[]) const {
    const size_type tmp[3] = {1, iwork + 1, nwork - iwork};

    for (size_type i = 0; i < static_cast<size_type>(value_count); ++i) {
      dst[i] += tmp[i % 3];
    }
  }
};

template <typename ScalarType, class DeviceType>
class RuntimeReduceMinMax {
 public:
  // Required for functor:
  using execution_space = DeviceType;
  using value_type      = ScalarType[];
  const unsigned value_count;

  // Unit test details:
  using size_type = typename execution_space::size_type;

  const size_type nwork;
  const ScalarType amin;
  const ScalarType amax;

  RuntimeReduceMinMax(const size_type arg_nwork, const size_type arg_count)
      : value_count(arg_count),
        nwork(arg_nwork),
        amin(std::numeric_limits<ScalarType>::min()),
        amax(std::numeric_limits<ScalarType>::max()) {}

  KOKKOS_INLINE_FUNCTION
  void init(ScalarType dst[]) const {
    for (unsigned i = 0; i < value_count; ++i) {
      dst[i] = i % 2 ? amax : amin;
    }
  }

  KOKKOS_INLINE_FUNCTION
  void join(ScalarType dst[], const ScalarType src[]) const {
    for (unsigned i = 0; i < value_count; ++i) {
      dst[i] = i % 2 ? (dst[i] < src[i] ? dst[i] : src[i])   // min
                     : (dst[i] > src[i] ? dst[i] : src[i]);  // max
    }
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(size_type iwork, ScalarType dst[]) const {
    const ScalarType tmp[2] = {ScalarType(iwork + 1),
                               ScalarType(nwork - iwork)};

    for (size_type i = 0; i < static_cast<size_type>(value_count); ++i) {
      dst[i] = i % 2 ? (dst[i] < tmp[i % 2] ? dst[i] : tmp[i % 2])
                     : (dst[i] > tmp[i % 2] ? dst[i] : tmp[i % 2]);
    }
  }
};

template <class DeviceType>
class RuntimeReduceFunctorFinal
    : public RuntimeReduceFunctor<int64_t, DeviceType> {
 public:
  using base_type   = RuntimeReduceFunctor<int64_t, DeviceType>;
  using value_type  = typename base_type::value_type;
  using scalar_type = int64_t;

  RuntimeReduceFunctorFinal(const size_t theNwork, const size_t count)
      : base_type(theNwork, count) {}

  KOKKOS_INLINE_FUNCTION
  void final(value_type dst) const {
    for (unsigned i = 0; i < base_type::value_count; ++i) {
      dst[i] = -dst[i];
    }
  }
};

template <class ValueType, class DeviceType>
class CombinedReduceFunctorSameType {
 public:
  using execution_space = typename DeviceType::execution_space;
  using size_type       = typename execution_space::size_type;

  const size_type nwork;

  KOKKOS_INLINE_FUNCTION
  constexpr explicit CombinedReduceFunctorSameType(const size_type& arg_nwork)
      : nwork(arg_nwork) {}

  KOKKOS_DEFAULTED_FUNCTION
  constexpr CombinedReduceFunctorSameType(
      const CombinedReduceFunctorSameType& rhs) = default;

  KOKKOS_INLINE_FUNCTION
  void operator()(size_type iwork, ValueType& dst1, ValueType& dst2,
                  ValueType& dst3) const {
    dst1 += 1;
    dst2 += iwork + 1;
    dst3 += nwork - iwork;
  }

  KOKKOS_INLINE_FUNCTION
  void operator()(size_type iwork, size_type always_zero_1,
                  size_type always_zero_2, ValueType& dst1, ValueType& dst2,
                  ValueType& dst3) const {
    dst1 += 1 + always_zero_1;
    dst2 += iwork + 1 + always_zero_2;
    dst3 += nwork - iwork;
  }
};

namespace {

template <typename ScalarType, class DeviceType>
class TestReduce {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  TestReduce(const size_type& nwork) {
    run_test(nwork);
    run_test_final(nwork);
    run_test_final_tag(nwork);
  }

  void run_test(const size_type& nwork) {
    using functor_type = Test::ReduceFunctor<ScalarType, execution_space>;
    using value_type   = typename functor_type::value_type;

    enum { Count = 3 };
    enum { Repeat = 100 };

    value_type result[Repeat];

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    for (unsigned i = 0; i < Repeat; ++i) {
      Kokkos::parallel_reduce(nwork, functor_type(nwork), result[i]);
    }

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ((ScalarType)correct, result[i].value[j]);
      }
    }
  }

  void run_test_final(const size_type& nwork) {
    using functor_type = Test::ReduceFunctorFinal<execution_space>;
    using value_type   = typename functor_type::value_type;

    enum { Count = 3 };
    enum { Repeat = 100 };

    value_type result[Repeat];

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    for (unsigned i = 0; i < Repeat; ++i) {
      if (i % 2 == 0) {
        Kokkos::parallel_reduce(nwork, functor_type(nwork), result[i]);
      } else {
        Kokkos::parallel_reduce("Reduce", nwork, functor_type(nwork),
                                result[i]);
      }
    }

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ((ScalarType)correct, -result[i].value[j]);
      }
    }
  }

  void run_test_final_tag(const size_type& nwork) {
    using functor_type = Test::ReduceFunctorFinalTag<execution_space>;
    using value_type   = typename functor_type::value_type;

    enum { Count = 3 };
    enum { Repeat = 100 };

    value_type result[Repeat];

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    for (unsigned i = 0; i < Repeat; ++i) {
      if (i % 2 == 0) {
        Kokkos::parallel_reduce(
            Kokkos::RangePolicy<execution_space, ReducerTag>(0, nwork),
            functor_type(nwork), result[i]);
      } else {
        Kokkos::parallel_reduce(
            "Reduce",
            Kokkos::RangePolicy<execution_space, ReducerTag>(0, nwork),
            functor_type(nwork), result[i]);
      }
    }

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ((ScalarType)correct, 1 - result[i].value[j]);
      }
    }
  }
};

template <typename ScalarType, class DeviceType>
class TestReduceDynamic {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  TestReduceDynamic(const size_type nwork) {
    run_test_dynamic(nwork);
#ifndef KOKKOS_ENABLE_OPENACC
    // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions.
    run_test_dynamic_minmax(nwork);
#endif
    run_test_dynamic_final(nwork);
  }

  void run_test_dynamic(const size_type nwork) {
    using functor_type =
        Test::RuntimeReduceFunctor<ScalarType, execution_space>;

    enum { Count = 3 };
    enum { Repeat = 100 };

    ScalarType result[Repeat][Count];

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    for (unsigned i = 0; i < Repeat; ++i) {
      if (i % 2 == 0) {
        Kokkos::parallel_reduce(nwork, functor_type(nwork, Count), result[i]);
      } else {
        Kokkos::parallel_reduce("Reduce", nwork, functor_type(nwork, Count),
                                result[i]);
      }
    }

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ((ScalarType)correct, result[i][j]);
      }
    }
  }

  void run_test_dynamic_minmax(const size_type nwork) {
    using functor_type =
        Test::RuntimeReduceMinMax<ScalarType, execution_space>;

    enum { Count = 2 };
    enum { Repeat = 100 };

    ScalarType result[Repeat][Count];

    for (unsigned i = 0; i < Repeat; ++i) {
      if (i % 2 == 0) {
        Kokkos::parallel_reduce(nwork, functor_type(nwork, Count), result[i]);
      } else {
        Kokkos::parallel_reduce("Reduce", nwork, functor_type(nwork, Count),
                                result[i]);
      }
    }

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        if (nwork == 0) {
          ScalarType amin(std::numeric_limits<ScalarType>::min());
          ScalarType amax(std::numeric_limits<ScalarType>::max());
          const ScalarType correct = (j % 2) ? amax : amin;
          ASSERT_EQ((ScalarType)correct, result[i][j]);
        } else {
          const uint64_t correct = j % 2 ? 1 : nwork;
          ASSERT_EQ((ScalarType)correct, result[i][j]);
        }
      }
    }
  }

  void run_test_dynamic_final(const size_type nwork) {
    using functor_type = Test::RuntimeReduceFunctorFinal<execution_space>;

    enum { Count = 3 };
    enum { Repeat = 100 };

    typename functor_type::scalar_type result[Repeat][Count];

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    for (unsigned i = 0; i < Repeat; ++i) {
      if (i % 2 == 0) {
        Kokkos::parallel_reduce(nwork, functor_type(nwork, Count), result[i]);
      } else {
        Kokkos::parallel_reduce("TestKernelReduce", nwork,
                                functor_type(nwork, Count), result[i]);
      }
    }

    for (unsigned i = 0; i < Repeat; ++i) {
      for (unsigned j = 0; j < Count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ((ScalarType)correct, -result[i][j]);
      }
    }
  }
};

template <typename ScalarType, class DeviceType>
class TestReduceDynamicView {
 public:
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  TestReduceDynamicView(const size_type nwork) {
    run_test_dynamic_view(nwork);
  }

  void run_test_dynamic_view(const size_type nwork) {
    using functor_type =
        Test::RuntimeReduceFunctor<ScalarType, execution_space>;

    using result_type      = Kokkos::View<ScalarType*, DeviceType>;
    using result_host_type = typename result_type::HostMirror;

    const unsigned CountLimit = 23;

    const uint64_t nw   = nwork;
    const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);

    for (unsigned count = 0; count < CountLimit; ++count) {
      result_type result("result", count);
      result_host_type host_result = Kokkos::create_mirror(result);

      // Test result to host pointer:

      std::string str("TestKernelReduce");
      if (count % 2 == 0) {
        Kokkos::parallel_reduce(nw, functor_type(nw, count), host_result);
      } else {
        Kokkos::parallel_reduce(str, nw, functor_type(nw, count), host_result);
      }
      Kokkos::fence("Fence before accessing result on the host");

      for (unsigned j = 0; j < count; ++j) {
        const uint64_t correct = 0 == j % 3 ? nw : nsum;
        ASSERT_EQ(host_result(j), (ScalarType)correct);
        host_result(j) = 0;
      }
    }
  }
};

}  // namespace

// FIXME_SYCL
// FIXME_OPENMPTARGET : The feature works with LLVM/13 on NVIDIA
// architectures. The jenkins currently tests with LLVM/12.
#if !defined(KOKKOS_ENABLE_SYCL) &&         \
    (!defined(KOKKOS_ENABLE_OPENMPTARGET) || \
     defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1300))
TEST(TEST_CATEGORY, int64_t_reduce) {
  TestReduce<int64_t, TEST_EXECSPACE>(0);
  TestReduce<int64_t, TEST_EXECSPACE>(1000000);
}

TEST(TEST_CATEGORY, double_reduce) {
  TestReduce<double, TEST_EXECSPACE>(0);
  TestReduce<double, TEST_EXECSPACE>(1000000);
}

TEST(TEST_CATEGORY, int64_t_reduce_dynamic) {
  TestReduceDynamic<int64_t, TEST_EXECSPACE>(0);
  TestReduceDynamic<int64_t, TEST_EXECSPACE>(1000000);
}

TEST(TEST_CATEGORY, double_reduce_dynamic) {
  TestReduceDynamic<double, TEST_EXECSPACE>(0);
  TestReduceDynamic<double, TEST_EXECSPACE>(1000000);
}

TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) {
  TestReduceDynamicView<int64_t, TEST_EXECSPACE>(0);
  TestReduceDynamicView<int64_t, TEST_EXECSPACE>(1000000);
}
#endif

// FIXME_OPENMPTARGET: Not yet implemented.
#ifndef KOKKOS_ENABLE_OPENMPTARGET
// FIXME_OPENACC: Not yet implemented.
#ifndef KOKKOS_ENABLE_OPENACC
TEST(TEST_CATEGORY, int_combined_reduce) {
  using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
  constexpr uint64_t nw = 1000;
  uint64_t nsum         = (nw / 2) * (nw + 1);

  int64_t result1 = 0;
  int64_t result2 = 0;
  int64_t result3 = 0;

  Kokkos::parallel_reduce("int_combined_reduce",
                          Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw),
                          functor_type(nw), result1, result2, result3);

  ASSERT_EQ(nw, uint64_t(result1));
  ASSERT_EQ(nsum, uint64_t(result2));
  ASSERT_EQ(nsum, uint64_t(result3));
}

TEST(TEST_CATEGORY, mdrange_combined_reduce) {
  using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;
  constexpr uint64_t nw = 1000;
  uint64_t nsum         = (nw / 2) * (nw + 1);

  int64_t result1 = 0;
  int64_t result2 = 0;
  int64_t result3 = 0;

  Kokkos::parallel_reduce(
      "int_combined_reduce_mdrange",
      Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<3>>({{0, 0, 0}},
                                                             {{nw, 1, 1}}),
      functor_type(nw), result1, result2, result3);

  ASSERT_EQ(nw, uint64_t(result1));
  ASSERT_EQ(nsum, uint64_t(result2));
  ASSERT_EQ(nsum, uint64_t(result3));
}

TEST(TEST_CATEGORY, int_combined_reduce_mixed) {
  using functor_type = CombinedReduceFunctorSameType<int64_t, TEST_EXECSPACE>;

  constexpr uint64_t nw = 1000;
  uint64_t nsum         = (nw / 2) * (nw + 1);

  {
    auto result1_v  = Kokkos::View<int64_t, Kokkos::HostSpace>{"result1_v"};
    int64_t result2 = 0;
    auto result3_v  = Kokkos::View<int64_t, Kokkos::HostSpace>{"result3_v"};
    Kokkos::parallel_reduce("int_combined-reduce_mixed",
                            Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw),
                            functor_type(nw), result1_v, result2,
                            Kokkos::Sum<int64_t, Kokkos::HostSpace>{result3_v});
    ASSERT_EQ(int64_t(nw), result1_v());
    ASSERT_EQ(int64_t(nsum), result2);
    ASSERT_EQ(int64_t(nsum), result3_v());
  }
  {
    using MemorySpace = typename TEST_EXECSPACE::memory_space;
    auto result1_v    = Kokkos::View<int64_t, MemorySpace>{"result1_v"};
    int64_t result2   = 0;
    auto result3_v    = Kokkos::View<int64_t, MemorySpace>{"result3_v"};
    Kokkos::parallel_reduce("int_combined-reduce_mixed",
                            Kokkos::RangePolicy<TEST_EXECSPACE>(0, nw),
                            functor_type(nw), result1_v, result2,
                            Kokkos::Sum<int64_t, MemorySpace>{result3_v});
    int64_t result1;
    Kokkos::deep_copy(result1, result1_v);
    ASSERT_EQ(int64_t(nw), result1);
    ASSERT_EQ(int64_t(nsum), result2);
    int64_t result3;
    Kokkos::deep_copy(result3, result3_v);
    ASSERT_EQ(int64_t(nsum), result3);
  }
}
#endif
#endif

#if defined(NDEBUG)
// the following test was made for:
// https://github.com/kokkos/kokkos/issues/6517
struct FunctorReductionWithLargeIterationCount {
  KOKKOS_FUNCTION void operator()(const int64_t /*i*/, double& update) const {
    update += 1.0;
  }
};

TEST(TEST_CATEGORY, reduction_with_large_iteration_count) {
  if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space,
                               Kokkos::HostSpace>) {
    GTEST_SKIP() << "Disabling for host backends";
  }

  const int64_t N = pow(2LL, 39LL) - pow(2LL, 8LL) + 1;
  Kokkos::RangePolicy<TEST_EXECSPACE, Kokkos::IndexType<int64_t>> p(0, N);
  double nu = 0;
  EXPECT_NO_THROW(Kokkos::parallel_reduce(
      "sample reduction", p, FunctorReductionWithLargeIterationCount(), nu));
  ASSERT_DOUBLE_EQ(nu, double(N));
}
#endif

}  // namespace Test