//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER namespace Test { template struct MultiDimRangePerf3D { using execution_space = DeviceType; using size_type = typename execution_space::size_type; using iterate_type = Kokkos::Iterate; using view_type = Kokkos::View; using host_view_type = typename view_type::HostMirror; view_type A; view_type B; const long irange; const long jrange; const long krange; MultiDimRangePerf3D(const view_type &A_, const view_type &B_, const long &irange_, const long &jrange_, const long &krange_) : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} KOKKOS_INLINE_FUNCTION void operator()(const long i, const long j, const long k) const { A(i, j, k) = 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); } struct InitZeroTag {}; // struct InitViewTag {}; struct Init { Init(const view_type &input_, const long &irange_, const long &jrange_, const long &krange_) : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} KOKKOS_INLINE_FUNCTION void operator()(const long i, const long j, const long k) const { input(i, j, k) = 1.0; } KOKKOS_INLINE_FUNCTION void operator()(const InitZeroTag &, const long i, const long j, const long k) const { input(i, j, k) = 0; } view_type input; const long irange; const long jrange; const long krange; }; static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1) { // This test performs multidim range over all dims view_type Atest("Atest", icount, jcount, kcount); view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); using FunctorType = MultiDimRangePerf3D; double dt_min = 0; // LayoutRight if (std::is_same::value) { Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space> policy_initA({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space> policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}}, {{Ti, Tj, Tk}}); using MDRangeType = typename Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space>; using tile_type = typename MDRangeType::tile_type; using point_type = typename MDRangeType::point_type; Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space> policy(point_type{{0, 0, 0}}, point_type{{icount, jcount, kcount}}, tile_type{{Ti, Tj, Tk}}); Kokkos::parallel_for(policy_initA, Init(Atest, icount, jcount, kcount)); execution_space().fence(); Kokkos::parallel_for(policy_initB, Init(Btest, icount + 2, jcount + 2, kcount + 2)); execution_space().fence(); for (int i = 0; i < iter; ++i) { Kokkos::Timer timer; Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount)); execution_space().fence(); const double dt = timer.seconds(); if (0 == i) dt_min = dt; else dt_min = dt < dt_min ? dt : dt_min; // Correctness check - only the first run if (0 == i) { long numErrors = 0; host_view_type Ahost("Ahost", icount, jcount, kcount); Kokkos::deep_copy(Ahost, Atest); host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); Kokkos::deep_copy(Bhost, Btest); // On KNL, this may vectorize - add print statement to prevent // Also, compare against epsilon, as vectorization can change bitwise // answer for (long l = 0; l < static_cast(icount); ++l) { for (long j = 0; j < static_cast(jcount); ++j) { for (long k = 0; k < static_cast(kcount); ++k) { ScalarType check = 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + Bhost(l, j, k)); if (Ahost(l, j, k) - check != 0) { ++numErrors; std::cout << " Correctness error at index: " << l << "," << j << "," << k << "\n" << " multi Ahost = " << Ahost(l, j, k) << " expected = " << check << " multi Bhost(ijk) = " << Bhost(l, j, k) << " multi Bhost(l+1jk) = " << Bhost(l + 1, j, k) << " multi Bhost(l+2jk) = " << Bhost(l + 2, j, k) << " multi Bhost(ij+1k) = " << Bhost(l, j + 1, k) << " multi Bhost(ij+2k) = " << Bhost(l, j + 2, k) << " multi Bhost(ijk+1) = " << Bhost(l, j, k + 1) << " multi Bhost(ijk+2) = " << Bhost(l, j, k + 2) << std::endl; // exit(-1); } } } } if (numErrors != 0) { std::cout << "LR multi: errors " << numErrors << " range product " << icount * jcount * kcount << " LL " << jcount * kcount << " LR " << icount * jcount << std::endl; } // else { std::cout << " multi: No errors!" << std::endl; } } } // end for } // LayoutLeft else { Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space> policy_initA({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space> policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}}, {{Ti, Tj, Tk}}); // using MDRangeType = // typename Kokkos::MDRangePolicy< // Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, // execution_space >; // using tile_type = typename MDRangeType::tile_type; // using point_type = typename MDRangeType::point_type; // MDRangeType policy(point_type{{0,0,0}}, // point_type{{icount,jcount,kcount}}, // tile_type{{Ti,Tj,Tk}}); Kokkos::MDRangePolicy< Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space> policy({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); Kokkos::parallel_for(policy_initA, Init(Atest, icount, jcount, kcount)); execution_space().fence(); Kokkos::parallel_for(policy_initB, Init(Btest, icount + 2, jcount + 2, kcount + 2)); execution_space().fence(); for (int i = 0; i < iter; ++i) { Kokkos::Timer timer; Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount)); execution_space().fence(); const double dt = timer.seconds(); if (0 == i) dt_min = dt; else dt_min = dt < dt_min ? dt : dt_min; // Correctness check - only the first run if (0 == i) { long numErrors = 0; host_view_type Ahost("Ahost", icount, jcount, kcount); Kokkos::deep_copy(Ahost, Atest); host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); Kokkos::deep_copy(Bhost, Btest); // On KNL, this may vectorize - add print statement to prevent // Also, compare against epsilon, as vectorization can change bitwise // answer for (long l = 0; l < static_cast(icount); ++l) { for (long j = 0; j < static_cast(jcount); ++j) { for (long k = 0; k < static_cast(kcount); ++k) { ScalarType check = 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + Bhost(l, j, k)); if (Ahost(l, j, k) - check != 0) { ++numErrors; std::cout << " Correctness error at index: " << l << "," << j << "," << k << "\n" << " multi Ahost = " << Ahost(l, j, k) << " expected = " << check << " multi Bhost(ijk) = " << Bhost(l, j, k) << " multi Bhost(l+1jk) = " << Bhost(l + 1, j, k) << " multi Bhost(l+2jk) = " << Bhost(l + 2, j, k) << " multi Bhost(ij+1k) = " << Bhost(l, j + 1, k) << " multi Bhost(ij+2k) = " << Bhost(l, j + 2, k) << " multi Bhost(ijk+1) = " << Bhost(l, j, k + 1) << " multi Bhost(ijk+2) = " << Bhost(l, j, k + 2) << std::endl; // exit(-1); } } } } if (numErrors != 0) { std::cout << " LL multi run: errors " << numErrors << " range product " << icount * jcount * kcount << " LL " << jcount * kcount << " LR " << icount * jcount << std::endl; } // else { std::cout << " multi: No errors!" << std::endl; } } } // end for } return dt_min; } }; template struct RangePolicyCollapseTwo { // RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for // multi-dim; unroll 2 dims in one-dim using execution_space = DeviceType; using size_type = typename execution_space::size_type; using layout = TestLayout; using iterate_type = Kokkos::Iterate; using view_type = Kokkos::View; using host_view_type = typename view_type::HostMirror; view_type A; view_type B; const long irange; const long jrange; const long krange; RangePolicyCollapseTwo(view_type &A_, const view_type &B_, const long &irange_, const long &jrange_, const long &krange_) : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} KOKKOS_INLINE_FUNCTION void operator()(const long r) const { if (std::is_same::value) { // id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r // r = j + i*Nj long i = int(r / jrange); long j = int(r - i * jrange); for (int k = 0; k < krange; ++k) { A(i, j, k) = 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); } } else if (std::is_same::value) { // id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r // r = j + k*Nj long k = int(r / jrange); long j = int(r - k * jrange); for (int i = 0; i < irange; ++i) { A(i, j, k) = 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); } } } struct Init { view_type input; const long irange; const long jrange; const long krange; Init(const view_type &input_, const long &irange_, const long &jrange_, const long &krange_) : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} KOKKOS_INLINE_FUNCTION void operator()(const long r) const { if (std::is_same::value) { long i = int(r / jrange); long j = int(r - i * jrange); for (int k = 0; k < krange; ++k) { input(i, j, k) = 1; } } else if (std::is_same::value) { long k = int(r / jrange); long j = int(r - k * jrange); for (int i = 0; i < irange; ++i) { input(i, j, k) = 1; } } } }; static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1) { // This test refers to collapsing two dims while using the RangePolicy view_type Atest("Atest", icount, jcount, kcount); view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); using FunctorType = RangePolicyCollapseTwo; long collapse_index_rangeA = 0; long collapse_index_rangeB = 0; if (std::is_same::value) { collapse_index_rangeA = icount * jcount; collapse_index_rangeB = (icount + 2) * (jcount + 2); // std::cout << " LayoutRight " << std::endl; } else if (std::is_same::value) { collapse_index_rangeA = kcount * jcount; collapse_index_rangeB = (kcount + 2) * (jcount + 2); // std::cout << " LayoutLeft " << std::endl; } else { std::cout << " LayoutRight or LayoutLeft required - will pass 0 as " "range instead " << std::endl; exit(-1); } Kokkos::RangePolicy policy(0, (collapse_index_rangeA)); Kokkos::RangePolicy policy_initB(0, (collapse_index_rangeB)); double dt_min = 0; Kokkos::parallel_for(policy, Init(Atest, icount, jcount, kcount)); execution_space().fence(); Kokkos::parallel_for(policy_initB, Init(Btest, icount + 2, jcount + 2, kcount + 2)); execution_space().fence(); for (int i = 0; i < iter; ++i) { Kokkos::Timer timer; Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount)); execution_space().fence(); const double dt = timer.seconds(); if (0 == i) dt_min = dt; else dt_min = dt < dt_min ? dt : dt_min; // Correctness check - first iteration only if (0 == i) { long numErrors = 0; host_view_type Ahost("Ahost", icount, jcount, kcount); Kokkos::deep_copy(Ahost, Atest); host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); Kokkos::deep_copy(Bhost, Btest); // On KNL, this may vectorize - add print statement to prevent // Also, compare against epsilon, as vectorization can change bitwise // answer for (long l = 0; l < static_cast(icount); ++l) { for (long j = 0; j < static_cast(jcount); ++j) { for (long k = 0; k < static_cast(kcount); ++k) { ScalarType check = 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + Bhost(l, j, k)); if (Ahost(l, j, k) - check != 0) { ++numErrors; std::cout << " Correctness error at index: " << l << "," << j << "," << k << "\n" << " flat Ahost = " << Ahost(l, j, k) << " expected = " << check << std::endl; // exit(-1); } } } } if (numErrors != 0) { std::cout << " RP collapse2: errors " << numErrors << " range product " << icount * jcount * kcount << " LL " << jcount * kcount << " LR " << icount * jcount << std::endl; } // else { std::cout << " RP collapse2: Pass! " << std::endl; } } } return dt_min; } }; template struct RangePolicyCollapseAll { // RangePolicy for 3D range, but will collapse all dims using execution_space = DeviceType; using size_type = typename execution_space::size_type; using layout = TestLayout; using view_type = Kokkos::View; using host_view_type = typename view_type::HostMirror; view_type A; view_type B; const long irange; const long jrange; const long krange; RangePolicyCollapseAll(view_type &A_, const view_type &B_, const long &irange_, const long &jrange_, const long &krange_) : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} KOKKOS_INLINE_FUNCTION void operator()(const long r) const { if (std::is_same::value) { long i = int(r / (jrange * krange)); long j = int((r - i * jrange * krange) / krange); long k = int(r - i * jrange * krange - j * krange); A(i, j, k) = 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); } else if (std::is_same::value) { long k = int(r / (irange * jrange)); long j = int((r - k * irange * jrange) / irange); long i = int(r - k * irange * jrange - j * irange); A(i, j, k) = 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); } } struct Init { view_type input; const long irange; const long jrange; const long krange; Init(const view_type &input_, const long &irange_, const long &jrange_, const long &krange_) : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} KOKKOS_INLINE_FUNCTION void operator()(const long r) const { if (std::is_same::value) { long i = int(r / (jrange * krange)); long j = int((r - i * jrange * krange) / krange); long k = int(r - i * jrange * krange - j * krange); input(i, j, k) = 1; } else if (std::is_same::value) { long k = int(r / (irange * jrange)); long j = int((r - k * irange * jrange) / irange); long i = int(r - k * irange * jrange - j * irange); input(i, j, k) = 1; } } }; static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1) { // This test refers to collapsing all dims using the RangePolicy view_type Atest("Atest", icount, jcount, kcount); view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); using FunctorType = RangePolicyCollapseAll; const long flat_index_range = icount * jcount * kcount; Kokkos::RangePolicy policy(0, flat_index_range); Kokkos::RangePolicy policy_initB( 0, (icount + 2) * (jcount + 2) * (kcount + 2)); double dt_min = 0; Kokkos::parallel_for(policy, Init(Atest, icount, jcount, kcount)); execution_space().fence(); Kokkos::parallel_for(policy_initB, Init(Btest, icount + 2, jcount + 2, kcount + 2)); execution_space().fence(); for (int i = 0; i < iter; ++i) { Kokkos::Timer timer; Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount)); execution_space().fence(); const double dt = timer.seconds(); if (0 == i) dt_min = dt; else dt_min = dt < dt_min ? dt : dt_min; // Correctness check - first iteration only if (0 == i) { long numErrors = 0; host_view_type Ahost("Ahost", icount, jcount, kcount); Kokkos::deep_copy(Ahost, Atest); host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); Kokkos::deep_copy(Bhost, Btest); // On KNL, this may vectorize - add print statement to prevent // Also, compare against epsilon, as vectorization can change bitwise // answer for (long l = 0; l < static_cast(icount); ++l) { for (long j = 0; j < static_cast(jcount); ++j) { for (long k = 0; k < static_cast(kcount); ++k) { ScalarType check = 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + Bhost(l, j, k)); if (Ahost(l, j, k) - check != 0) { ++numErrors; std::cout << " Callapse ALL Correctness error at index: " << l << "," << j << "," << k << "\n" << " flat Ahost = " << Ahost(l, j, k) << " expected = " << check << std::endl; // exit(-1); } } } } if (numErrors != 0) { std::cout << " RP collapse all: errors " << numErrors << " range product " << icount * jcount * kcount << " LL " << jcount * kcount << " LR " << icount * jcount << std::endl; } // else { std::cout << " RP collapse all: Pass! " << std::endl; } } } return dt_min; } }; } // end namespace Test