//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #include #include #if defined(_WIN32) #include unsigned getBytesPerPage() { SYSTEM_INFO si; GetSystemInfo(&si); return si.dwPageSize; } #else // unix/posix system #include unsigned getBytesPerPage() { return sysconf(_SC_PAGESIZE); } #endif #include #include #include namespace { void printTimings(std::ostream& out, std::vector const& tr, uint64_t threshold = (std::numeric_limits::max)()) { out << "TimingResult contains " << tr.size() << " results:\n"; for (auto it = tr.begin(); it != tr.end(); ++it) { out << "Duration of loop " << it - tr.begin() << " is " << *it << " clock cycles. "; if ((*it) > threshold) out << "Migration assumed."; out << "\n"; } } template T computeMean(std::vector const& results) { return std::accumulate(results.begin(), results.end(), T{}) / results.size(); } template class IncrementFunctor { private: using index_type = decltype(std::declval().size()); ViewType view_; public: IncrementFunctor() = delete; explicit IncrementFunctor(ViewType view) : view_(view) {} KOKKOS_INLINE_FUNCTION void operator()(const index_type idx, uint64_t& clockTics) const { uint64_t start = Kokkos::Impl::clock_tic(); ++view_(idx); clockTics += Kokkos::Impl::clock_tic() - start; } }; // TIMING CAPTURED KERNEL // PREMISE: This kernel should always be memory bound, as we are measuring // memory access times. The compute load of an increment is small enough on // current hardware but this could be different for new hardware. As we count // the clocks in the kernel, the core frequency of the device has to be fast // enough to guarante that the kernel stays memory bound. template std::vector incrementInLoop(ViewType& view, unsigned int numRepetitions) { using index_type = decltype(view.size()); std::vector results; Kokkos::fence(); for (unsigned i = 0; i < numRepetitions; ++i) { uint64_t sum_clockTics; IncrementFunctor func(view); Kokkos::parallel_reduce( "increment", Kokkos::RangePolicy>{ 0, view.size()}, func, sum_clockTics); Kokkos::fence(); results.push_back(sum_clockTics / view.size()); } return results; } TEST(defaultdevicetype, shared_space) { ASSERT_TRUE(Kokkos::has_shared_space); if constexpr (std::is_same_v) GTEST_SKIP() << "Skipping as host and device are the same space"; #if defined(KOKKOS_ARCH_AMD_GPU) && defined(KOKKOS_ENABLE_HIP) if (!Kokkos::SharedSpace().impl_hip_driver_check_page_migration()) GTEST_SKIP() << "skipping because specified arch does not support page migration"; #endif #if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU) GTEST_SKIP() << "skipping because clock_tic is only defined for sycl+intel gpu"; #endif const unsigned int numRepetitions = 10; const unsigned int numDeviceHostCycles = 3; double threshold = 1.5; unsigned int numPages = 100; size_t numBytes = numPages * getBytesPerPage(); using DeviceExecutionSpace = Kokkos::DefaultExecutionSpace; using HostExecutionSpace = Kokkos::DefaultHostExecutionSpace; // ALLOCATION Kokkos::View sharedData("sharedData", numBytes / sizeof(int)); Kokkos::View deviceData( "deviceData", numBytes / sizeof(int)); Kokkos::View hostData( "hostData", numBytes / sizeof(int)); Kokkos::fence(); // GET DEFAULT EXECSPACE LOCAL TIMINGS auto deviceLocalTimings = incrementInLoop(deviceData, numRepetitions); // GET DEFAULT HOSTEXECSPACE LOCAL TIMINGS auto hostLocalTimings = incrementInLoop(hostData, numRepetitions); // GET PAGE MIGRATING TIMINGS DATA std::vector deviceSharedTimings{}; std::vector hostSharedTimings{}; for (unsigned i = 0; i < numDeviceHostCycles; ++i) { // GET RESULTS DEVICE deviceSharedTimings.push_back( incrementInLoop(sharedData, numRepetitions)); // GET RESULTS HOST hostSharedTimings.push_back( incrementInLoop(sharedData, numRepetitions)); } // COMPUTE STATISTICS OF HOST AND DEVICE LOCAL KERNELS auto deviceLocalMean = computeMean(deviceLocalTimings); auto hostLocalMean = computeMean(hostLocalTimings); // ASSESS RESULTS bool fastAsLocalOnRepeatedAccess = true; for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) { std::for_each(std::next(deviceSharedTimings[cycle].begin()), deviceSharedTimings[cycle].end(), [&](const uint64_t timing) { (timing < threshold * deviceLocalMean) ? fastAsLocalOnRepeatedAccess &= true : fastAsLocalOnRepeatedAccess &= false; }); std::for_each(std::next(hostSharedTimings[cycle].begin()), hostSharedTimings[cycle].end(), [&](const uint64_t timing) { (timing < threshold * hostLocalMean) ? fastAsLocalOnRepeatedAccess &= true : fastAsLocalOnRepeatedAccess &= false; }); } // CHECK IF PASSED bool passed = (fastAsLocalOnRepeatedAccess); // PRINT IF NOT PASSED if (!passed) { std::cout << "Page size as reported by os: " << getBytesPerPage() << " bytes \n"; std::cout << "Allocating " << numPages << " pages of memory in SharedSpace.\n"; std::cout << "Behavior found: \n"; std::cout << "SharedSpace is as fast as local space on repeated access: " << fastAsLocalOnRepeatedAccess << ", we expect true \n\n"; std::cout << "Please look at the following timings. The first access in a " "different ExecutionSpace is not evaluated for the test. As we " "expect the memory to migrate during the first access it might have " "a higher cycle count than subsequent accesses, depending on your " "hardware. If the cycles are more than " << threshold << " times the cycles for pure local memory access, we assume a page " "migration happened.\n\n"; std::cout << "################SHARED SPACE####################\n"; for (unsigned cycle = 0; cycle < numDeviceHostCycles; ++cycle) { std::cout << "DeviceExecutionSpace timings of run " << cycle << ":\n"; printTimings(std::cout, deviceSharedTimings[cycle], threshold * deviceLocalMean); std::cout << "HostExecutionSpace timings of run " << cycle << ":\n"; printTimings(std::cout, hostSharedTimings[cycle], threshold * hostLocalMean); } std::cout << "################LOCAL SPACE####################\n"; printTimings(std::cout, deviceLocalTimings); printTimings(std::cout, hostLocalTimings); } ASSERT_TRUE(passed); } } // namespace