//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #include #include #include #include #include "Benchmark_Context.hpp" #include #include using exec_space = Kokkos::DefaultExecutionSpace; template struct ZeroFunctor { using execution_space = DEVICE_TYPE; using type = typename Kokkos::View; using h_type = typename Kokkos::View::HostMirror; type data; KOKKOS_INLINE_FUNCTION void operator()(int) const { data() = 0; } }; //--------------------------------------------------- //--------------atomic_fetch_add--------------------- //--------------------------------------------------- template struct AddFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View; type data; KOKKOS_INLINE_FUNCTION void operator()(int) const { Kokkos::atomic_fetch_add(&data(), (T)1); } }; template T AddLoop(int loop) { struct ZeroFunctor f_zero; typename ZeroFunctor::type data("Data"); typename ZeroFunctor::h_type h_data("HData"); f_zero.data = data; Kokkos::parallel_for(1, f_zero); exec_space().fence(); struct AddFunctor f_add; f_add.data = data; Kokkos::parallel_for(loop, f_add); exec_space().fence(); Kokkos::deep_copy(h_data, data); T val = h_data(); return val; } template struct AddNonAtomicFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View; type data; KOKKOS_INLINE_FUNCTION void operator()(int) const { data() += (T)1; } }; template T AddLoopNonAtomic(int loop) { struct ZeroFunctor f_zero; typename ZeroFunctor::type data("Data"); typename ZeroFunctor::h_type h_data("HData"); f_zero.data = data; Kokkos::parallel_for(1, f_zero); exec_space().fence(); struct AddNonAtomicFunctor f_add; f_add.data = data; Kokkos::parallel_for(loop, f_add); exec_space().fence(); Kokkos::deep_copy(h_data, data); T val = h_data(); return val; } template T AddLoopSerial(int loop) { T* data = new T[1]; data[0] = 0; for (int i = 0; i < loop; i++) *data += (T)1; T val = *data; delete[] data; return val; } template struct CASFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View; type data; KOKKOS_INLINE_FUNCTION void operator()(int) const { T old = data(); T newval, assumed; do { assumed = old; newval = assumed + (T)1; old = Kokkos::atomic_compare_exchange(&data(), assumed, newval); } while (old != assumed); } }; template T CASLoop(int loop) { struct ZeroFunctor f_zero; typename ZeroFunctor::type data("Data"); typename ZeroFunctor::h_type h_data("HData"); f_zero.data = data; Kokkos::parallel_for(1, f_zero); exec_space().fence(); struct CASFunctor f_cas; f_cas.data = data; Kokkos::parallel_for(loop, f_cas); exec_space().fence(); Kokkos::deep_copy(h_data, data); T val = h_data(); return val; } template struct CASNonAtomicFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View; type data; KOKKOS_INLINE_FUNCTION void operator()(int) const { volatile T assumed; volatile T newval; bool fail = 1; do { assumed = data(); newval = assumed + (T)1; if (data() == assumed) { data() = newval; fail = 0; } } while (fail); } }; template T CASLoopNonAtomic(int loop) { struct ZeroFunctor f_zero; typename ZeroFunctor::type data("Data"); typename ZeroFunctor::h_type h_data("HData"); f_zero.data = data; Kokkos::parallel_for(1, f_zero); exec_space().fence(); struct CASNonAtomicFunctor f_cas; f_cas.data = data; Kokkos::parallel_for(loop, f_cas); exec_space().fence(); Kokkos::deep_copy(h_data, data); T val = h_data(); return val; } template T CASLoopSerial(int loop) { T* data = new T[1]; data[0] = 0; for (int i = 0; i < loop; i++) { T assumed; T newval; T old; do { assumed = *data; newval = assumed + (T)1; old = *data; *data = newval; } while (!(assumed == old)); } T val = *data; delete[] data; return val; } template struct ExchFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View; type data, data2; KOKKOS_INLINE_FUNCTION void operator()(int i) const { T old = Kokkos::atomic_exchange(&data(), (T)i); Kokkos::atomic_fetch_add(&data2(), old); } }; template T ExchLoop(int loop) { struct ZeroFunctor f_zero; typename ZeroFunctor::type data("Data"); typename ZeroFunctor::h_type h_data("HData"); f_zero.data = data; Kokkos::parallel_for(1, f_zero); exec_space().fence(); typename ZeroFunctor::type data2("Data"); typename ZeroFunctor::h_type h_data2("HData"); f_zero.data = data2; Kokkos::parallel_for(1, f_zero); exec_space().fence(); struct ExchFunctor f_exch; f_exch.data = data; f_exch.data2 = data2; Kokkos::parallel_for(loop, f_exch); exec_space().fence(); Kokkos::deep_copy(h_data, data); Kokkos::deep_copy(h_data2, data2); T val = h_data() + h_data2(); return val; } template struct ExchNonAtomicFunctor { using execution_space = DEVICE_TYPE; using type = Kokkos::View; type data, data2; KOKKOS_INLINE_FUNCTION void operator()(int i) const { T old = data(); data() = (T)i; data2() += old; } }; template T ExchLoopNonAtomic(int loop) { struct ZeroFunctor f_zero; typename ZeroFunctor::type data("Data"); typename ZeroFunctor::h_type h_data("HData"); f_zero.data = data; Kokkos::parallel_for(1, f_zero); exec_space().fence(); typename ZeroFunctor::type data2("Data"); typename ZeroFunctor::h_type h_data2("HData"); f_zero.data = data2; Kokkos::parallel_for(1, f_zero); exec_space().fence(); struct ExchNonAtomicFunctor f_exch; f_exch.data = data; f_exch.data2 = data2; Kokkos::parallel_for(loop, f_exch); exec_space().fence(); Kokkos::deep_copy(h_data, data); Kokkos::deep_copy(h_data2, data2); T val = h_data() + h_data2(); return val; } template T ExchLoopSerial(int loop) { T* data = new T[1]; T* data2 = new T[1]; data[0] = 0; data2[0] = 0; for (int i = 0; i < loop; i++) { T old = *data; *data = (T)i; *data2 += old; } T val = *data2 + *data; delete[] data; delete[] data2; return val; } template T LoopVariant(int loop, int test) { switch (test) { case 1: return AddLoop(loop); case 2: return CASLoop(loop); case 3: return ExchLoop(loop); } return 0; } template T LoopVariantSerial(int loop, int test) { switch (test) { case 1: return AddLoopSerial(loop); case 2: return CASLoopSerial(loop); case 3: return ExchLoopSerial(loop); } return 0; } template T LoopVariantNonAtomic(int loop, int test) { switch (test) { case 1: return AddLoopNonAtomic(loop); case 2: return CASLoopNonAtomic(loop); case 3: return ExchLoopNonAtomic(loop); } return 0; } template void Loop(benchmark::State& state, int test) { int loop = state.range(0); LoopVariant(loop, test); Kokkos::Timer timer; T res = LoopVariant(loop, test); double time = timer.seconds(); timer.reset(); T resNonAtomic = LoopVariantNonAtomic(loop, test); double timeNonAtomic = timer.seconds(); timer.reset(); T resSerial = LoopVariantSerial(loop, test); double timeSerial = timer.seconds(); time *= 1e6 / loop; timeNonAtomic *= 1e6 / loop; timeSerial *= 1e6 / loop; bool passed = (resSerial == res); state.counters["Passed"] = benchmark::Counter(passed); state.counters["Value serial"] = benchmark::Counter(resSerial); state.counters["Value atomic"] = benchmark::Counter(res); state.counters["Value non-atomic"] = benchmark::Counter(resNonAtomic); state.counters["Time serial"] = benchmark::Counter(timeSerial); state.counters["Time atomic"] = benchmark::Counter(time); state.counters["Time non-atomic"] = benchmark::Counter(timeNonAtomic); state.counters["Size of type"] = benchmark::Counter(sizeof(T)); } template static void Test_Atomic(benchmark::State& state) { for (auto _ : state) { Loop(state, 1); Loop(state, 2); Loop(state, 3); } } static constexpr int LOOP = 100'000; BENCHMARK(Test_Atomic)->Arg(30'000)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10); BENCHMARK(Test_Atomic)->Arg(LOOP)->Iterations(10);