//@HEADER // ************************************************************************ // // Kokkos v. 4.0 // Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. // See https://kokkos.org/LICENSE for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER #ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP #define KOKKOS_HOST_EXP_ITERATE_TILE_HPP #include #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__) #define KOKKOS_MDRANGE_IVDEP #endif #ifdef KOKKOS_MDRANGE_IVDEP #define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep") #else #define KOKKOS_ENABLE_IVDEP_MDRANGE #endif #include namespace Kokkos { namespace Impl { // Temporary, for testing new loop macros #define KOKKOS_ENABLE_NEW_LOOP_MACROS 1 #define KOKKOS_IMPL_LOOP_1L(type, tile) \ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = 0; i0 < static_cast(tile[0]); ++i0) #define KOKKOS_IMPL_LOOP_2L(type, tile) \ for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) \ KOKKOS_IMPL_LOOP_1L(type, tile) #define KOKKOS_IMPL_LOOP_3L(type, tile) \ for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) \ KOKKOS_IMPL_LOOP_2L(type, tile) #define KOKKOS_IMPL_LOOP_4L(type, tile) \ for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) \ KOKKOS_IMPL_LOOP_3L(type, tile) #define KOKKOS_IMPL_LOOP_5L(type, tile) \ for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) \ KOKKOS_IMPL_LOOP_4L(type, tile) #define KOKKOS_IMPL_LOOP_6L(type, tile) \ for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) \ KOKKOS_IMPL_LOOP_5L(type, tile) #define KOKKOS_IMPL_LOOP_7L(type, tile) \ for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) \ KOKKOS_IMPL_LOOP_6L(type, tile) #define KOKKOS_IMPL_LOOP_8L(type, tile) \ for (type i7 = 0; i7 < 
static_cast(tile[7]); ++i7) \ KOKKOS_IMPL_LOOP_7L(type, tile) #define KOKKOS_IMPL_LOOP_1R(type, tile) \ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = 0; i0 < static_cast(tile[0]); ++i0) #define KOKKOS_IMPL_LOOP_2R(type, tile) \ KOKKOS_IMPL_LOOP_1R(type, tile) \ for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) #define KOKKOS_IMPL_LOOP_3R(type, tile) \ KOKKOS_IMPL_LOOP_2R(type, tile) \ for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) #define KOKKOS_IMPL_LOOP_4R(type, tile) \ KOKKOS_IMPL_LOOP_3R(type, tile) \ for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) #define KOKKOS_IMPL_LOOP_5R(type, tile) \ KOKKOS_IMPL_LOOP_4R(type, tile) \ for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) #define KOKKOS_IMPL_LOOP_6R(type, tile) \ KOKKOS_IMPL_LOOP_5R(type, tile) \ for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) #define KOKKOS_IMPL_LOOP_7R(type, tile) \ KOKKOS_IMPL_LOOP_6R(type, tile) \ for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) #define KOKKOS_IMPL_LOOP_8R(type, tile) \ KOKKOS_IMPL_LOOP_7R(type, tile) \ for (type i7 = 0; i7 < static_cast(tile[7]); ++i7) #define KOKKOS_IMPL_LOOP_ARGS_1 i0 + m_offset[0] #define KOKKOS_IMPL_LOOP_ARGS_2 KOKKOS_IMPL_LOOP_ARGS_1, i1 + m_offset[1] #define KOKKOS_IMPL_LOOP_ARGS_3 KOKKOS_IMPL_LOOP_ARGS_2, i2 + m_offset[2] #define KOKKOS_IMPL_LOOP_ARGS_4 KOKKOS_IMPL_LOOP_ARGS_3, i3 + m_offset[3] #define KOKKOS_IMPL_LOOP_ARGS_5 KOKKOS_IMPL_LOOP_ARGS_4, i4 + m_offset[4] #define KOKKOS_IMPL_LOOP_ARGS_6 KOKKOS_IMPL_LOOP_ARGS_5, i5 + m_offset[5] #define KOKKOS_IMPL_LOOP_ARGS_7 KOKKOS_IMPL_LOOP_ARGS_6, i6 + m_offset[6] #define KOKKOS_IMPL_LOOP_ARGS_8 KOKKOS_IMPL_LOOP_ARGS_7, i7 + m_offset[7] // New Loop Macros... // parallel_for, non-tagged #define KOKKOS_IMPL_APPLY(func, ...) func(__VA_ARGS__); // LayoutRight // d = 0 to start #define KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, d, ...) 
\ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ KOKKOS_IMPL_APPLY(func, __VA_ARGS__, i0 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, d, ...) \ for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i1 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, d, ...) \ for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i2 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, d, ...) \ for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i3 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, d, ...) \ for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i4 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, d, ...) \ for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i5 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, d, ...) \ for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i6 + m_offset[d]) \ } #define KOKKOS_IMPL_LOOP_R_8(func, type, m_offset, extent, d, ...) \ for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, d + 1, __VA_ARGS__, \ i7 + m_offset[d]) \ } // LayoutLeft // d = rank-1 to start #define KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, d, ...) 
\ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = (type)0; i0 < static_cast(extent[d]); ++i0) { \ KOKKOS_IMPL_APPLY(func, i0 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, d, ...) \ for (type i1 = (type)0; i1 < static_cast(extent[d]); ++i1) { \ KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, d - 1, \ i1 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, d, ...) \ for (type i2 = (type)0; i2 < static_cast(extent[d]); ++i2) { \ KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, d - 1, \ i2 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, d, ...) \ for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, d - 1, \ i3 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, d, ...) \ for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, d - 1, \ i4 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, d, ...) \ for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, d - 1, \ i5 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, d, ...) \ for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, d - 1, \ i6 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_LOOP_L_8(func, type, m_offset, extent, d, ...) 
\ for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, d - 1, \ i7 + m_offset[d], __VA_ARGS__) \ } // Left vs Right // TODO: rank not necessary to pass through, can hardcode the values #define KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent, rank) \ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ KOKKOS_IMPL_APPLY(func, i0 + m_offset[0]) \ } #define KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ KOKKOS_IMPL_LOOP_L_1(func, type, m_offset, extent, rank - 2, \ i1 + m_offset[rank - 1]) \ } \ } else { \ for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ KOKKOS_IMPL_LOOP_R_1(func, type, m_offset, extent, 1, i1 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ KOKKOS_IMPL_LOOP_L_2(func, type, m_offset, extent, rank - 2, \ i2 + m_offset[rank - 1]) \ } \ } else { \ for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ KOKKOS_IMPL_LOOP_R_2(func, type, m_offset, extent, 1, i2 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ KOKKOS_IMPL_LOOP_L_3(func, type, m_offset, extent, rank - 2, \ i3 + m_offset[rank - 1]) \ } \ } else { \ for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ KOKKOS_IMPL_LOOP_R_3(func, type, m_offset, extent, 1, i3 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ KOKKOS_IMPL_LOOP_L_4(func, type, m_offset, extent, rank - 2, \ i4 + m_offset[rank - 1]) \ } \ } else { \ for (type i4 
= (type)0; i4 < static_cast(extent[0]); ++i4) { \ KOKKOS_IMPL_LOOP_R_4(func, type, m_offset, extent, 1, i4 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ KOKKOS_IMPL_LOOP_L_5(func, type, m_offset, extent, rank - 2, \ i5 + m_offset[rank - 1]) \ } \ } else { \ for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ KOKKOS_IMPL_LOOP_R_5(func, type, m_offset, extent, 1, i5 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ KOKKOS_IMPL_LOOP_L_6(func, type, m_offset, extent, rank - 2, \ i6 + m_offset[rank - 1]) \ } \ } else { \ for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ KOKKOS_IMPL_LOOP_R_6(func, type, m_offset, extent, 1, i6 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent, rank) \ if (is_left) { \ for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ KOKKOS_IMPL_LOOP_L_7(func, type, m_offset, extent, rank - 2, \ i7 + m_offset[rank - 1]) \ } \ } else { \ for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ KOKKOS_IMPL_LOOP_R_7(func, type, m_offset, extent, 1, i7 + m_offset[0]) \ } \ } // Partial vs Full Tile #define KOKKOS_IMPL_TILE_LOOP_1(func, type, is_left, cond, m_offset, \ extent_full, extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_full, \ rank) \ } else { \ KOKKOS_IMPL_LOOP_LAYOUT_1(func, type, is_left, m_offset, extent_partial, \ rank) \ } #define KOKKOS_IMPL_TILE_LOOP_2(func, type, is_left, cond, m_offset, \ extent_full, extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_full, \ rank) \ } else { \ KOKKOS_IMPL_LOOP_LAYOUT_2(func, type, is_left, m_offset, extent_partial, \ rank) \ } 
// KOKKOS_IMPL_TILE_LOOP_<N>(func, type, is_left, cond, m_offset, extent_full,
//                           extent_partial, rank)
// Runs the rank-N KOKKOS_IMPL_LOOP_LAYOUT_<N> loop nest with extent_full when
// cond is true and with extent_partial otherwise.
#define KOKKOS_IMPL_TILE_LOOP_3(func, type, is_left, cond, m_offset, \
                                extent_full, extent_partial, rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_full, \
                              rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_3(func, type, is_left, m_offset, extent_partial, \
                              rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_4(func, type, is_left, cond, m_offset, \
                                extent_full, extent_partial, rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_full, \
                              rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_4(func, type, is_left, m_offset, extent_partial, \
                              rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_5(func, type, is_left, cond, m_offset, \
                                extent_full, extent_partial, rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_full, \
                              rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_5(func, type, is_left, m_offset, extent_partial, \
                              rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_6(func, type, is_left, cond, m_offset, \
                                extent_full, extent_partial, rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_full, \
                              rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_6(func, type, is_left, m_offset, extent_partial, \
                              rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_7(func, type, is_left, cond, m_offset, \
                                extent_full, extent_partial, rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_full, \
                              rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_7(func, type, is_left, m_offset, extent_partial, \
                              rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_8(func, type, is_left, cond, m_offset, \
                                extent_full, extent_partial, rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_full, \
                              rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_8(func, type, is_left, m_offset, extent_partial, \
                              rank) \
  }

// parallel_reduce, non-tagged
// Reduction version
// KOKKOS_IMPL_APPLY_REDUX(val, func, ...) calls func with the index list
// followed by val as the LAST argument. The trailing semicolon is part of the
// expansion.
#define KOKKOS_IMPL_APPLY_REDUX(val, func, ...) func(__VA_ARGS__, val);

// LayoutRight
// d = 0 to start
// KOKKOS_IMPL_LOOP_R_<N>_REDUX(val, func, type, m_offset, extent, d, ...)
// Emits N nested loops over extent[d], extent[d + 1], ... (outermost first).
// Each level appends `index + m_offset[dim]` to the END of the variadic list,
// so func receives indices in ascending dimension order, then val. The
// variadic list must already contain at least one index when invoked. `d` is
// pasted textually, so pass a simple expression.
#define KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, d, ...) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
    KOKKOS_IMPL_APPLY_REDUX(val, func, __VA_ARGS__, i0 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
    KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i1 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
    KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i2 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
    KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i3 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
    KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i4 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
    KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i5 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
    KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i6 + m_offset[d]) \
  }

#define KOKKOS_IMPL_LOOP_R_8_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
    KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, d + 1, \
                               __VA_ARGS__, i7 + m_offset[d]) \
  }

// LayoutLeft
// d = rank-1 to start
// KOKKOS_IMPL_LOOP_L_<N>_REDUX(val, func, type, m_offset, extent, d, ...)
// Mirror image of the R family: loops run over extent[d], extent[d - 1], ...
// and each level PREPENDS `index + m_offset[dim]` to the variadic list, so
// func still receives indices in ascending dimension order (then val) while
// dimension 0 is iterated by the innermost loop.
#define KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, d, ...) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
    KOKKOS_IMPL_APPLY_REDUX(val, func, i0 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
    KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i1 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
    KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i2 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
    KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i3 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
    KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i4 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
    KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i5 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
    KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i6 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_LOOP_L_8_REDUX(val, func, type, m_offset, extent, d, ...) \
  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
    KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, d - 1, \
                               i7 + m_offset[d], __VA_ARGS__) \
  }

// Left vs Right
// KOKKOS_IMPL_LOOP_LAYOUT_<N>_REDUX(val, func, type, is_left, m_offset,
//                                   extent, rank)
// Complete N-dimensional reduction loop nest. When is_left is true the
// outermost loop runs over dimension rank - 1 and the L family handles the
// rest; otherwise the outermost loop runs over dimension 0 and the R family
// handles the rest. Rank 1 has a single loop and ignores is_left and rank.
#define KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
    KOKKOS_IMPL_APPLY_REDUX(val, func, i0 + m_offset[0]) \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \
      KOKKOS_IMPL_LOOP_L_1_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i1 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
      KOKKOS_IMPL_LOOP_R_1_REDUX(val, func, type, m_offset, extent, 1, \
                                 i1 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \
      KOKKOS_IMPL_LOOP_L_2_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i2 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
      KOKKOS_IMPL_LOOP_R_2_REDUX(val, func, type, m_offset, extent, 1, \
                                 i2 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \
      KOKKOS_IMPL_LOOP_L_3_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i3 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
      KOKKOS_IMPL_LOOP_R_3_REDUX(val, func, type, m_offset, extent, 1, \
                                 i3 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \
      KOKKOS_IMPL_LOOP_L_4_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i4 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
      KOKKOS_IMPL_LOOP_R_4_REDUX(val, func, type, m_offset, extent, 1, \
                                 i4 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \
      KOKKOS_IMPL_LOOP_L_5_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i5 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
      KOKKOS_IMPL_LOOP_R_5_REDUX(val, func, type, m_offset, extent, 1, \
                                 i5 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \
      KOKKOS_IMPL_LOOP_L_6_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i6 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
      KOKKOS_IMPL_LOOP_R_6_REDUX(val, func, type, m_offset, extent, 1, \
                                 i6 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, \
                                        extent, rank) \
  if (is_left) { \
    for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \
      KOKKOS_IMPL_LOOP_L_7_REDUX(val, func, type, m_offset, extent, rank - 2, \
                                 i7 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
      KOKKOS_IMPL_LOOP_R_7_REDUX(val, func, type, m_offset, extent, 1, \
                                 i7 + m_offset[0]) \
    } \
  }

// Partial vs Full Tile
// KOKKOS_IMPL_TILE_LOOP_<N>_REDUX(val, func, type, is_left, cond, m_offset,
//                                 extent_full, extent_partial, rank)
// Runs the rank-N reduction loop nest with extent_full when cond is true and
// with extent_partial otherwise.
#define KOKKOS_IMPL_TILE_LOOP_1_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_2_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_3_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_4_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_5_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_6_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_7_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }

#define KOKKOS_IMPL_TILE_LOOP_8_REDUX(val, func, type, is_left, cond, \
                                      m_offset, extent_full, extent_partial, \
                                      rank) \
  if (cond) { \
    KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, \
                                    extent_full, rank) \
  } else { \
    KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX(val, func, type, is_left, m_offset, \
                                    extent_partial, rank) \
  }
// end New Loop Macros

// tagged macros
// KOKKOS_IMPL_TAGGED_APPLY(tag, func, ...) calls func with tag as the FIRST
// argument followed by the index list. The trailing semicolon is part of the
// expansion.
#define KOKKOS_IMPL_TAGGED_APPLY(tag, func, ...) func(tag, __VA_ARGS__);

// LayoutRight
// d = 0 to start
// KOKKOS_IMPL_TAGGED_LOOP_R_<N>(tag, func, type, m_offset, extent, d, ...)
// Tagged counterpart of the LayoutRight loop family: loops over extent[d],
// extent[d + 1], ..., appending each shifted index to the END of the variadic
// list, and finally calls func(tag, indices...). The variadic list must
// already contain at least one index when invoked.
#define KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d, ...) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
    KOKKOS_IMPL_TAGGED_APPLY(tag, func, __VA_ARGS__, i0 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d, ...) \
  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i1 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d, ...) \
  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i2 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d, ...) \
  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i3 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d, ...) \
  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i4 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d, ...) \
  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i5 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d, ...) \
  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i6 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_8(tag, func, type, m_offset, extent, d, ...) \
  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, d + 1, \
                                __VA_ARGS__, i7 + m_offset[d]) \
  }

// LayoutLeft
// d = rank-1 to start
// KOKKOS_IMPL_TAGGED_LOOP_L_<N>(tag, func, type, m_offset, extent, d, ...)
// Tagged counterpart of the LayoutLeft loop family: loops over extent[d],
// extent[d - 1], ..., PREPENDING each shifted index to the variadic list, and
// finally calls func(tag, indices...).
#define KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d, ...) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
    KOKKOS_IMPL_TAGGED_APPLY(tag, func, i0 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d, ...) \
  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, d - 1, \
                                i1 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d, ...) \
  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, d - 1, \
                                i2 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d, ...) \
  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, d - 1, \
                                i3 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d, ...) \
  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, d - 1, \
                                i4 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d, ...) \
  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, d - 1, \
                                i5 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d, ...) \
  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, d - 1, \
                                i6 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_8(tag, func, type, m_offset, extent, d, ...) \
  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, d - 1, \
                                i7 + m_offset[d], __VA_ARGS__) \
  }

// Left vs Right
// TODO: rank not necessary to pass through, can hardcode the values
// KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_<N>(tag, func, type, is_left, m_offset,
//                                    extent, rank)
// Complete N-dimensional tagged loop nest; is_left selects between the L
// family (outermost loop over dimension rank - 1) and the R family (outermost
// loop over dimension 0). Rank 1 ignores is_left and rank.
#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
    KOKKOS_IMPL_TAGGED_APPLY(tag, func, i0 + m_offset[0]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i1 = (type)0; i1 < static_cast<type>(extent[rank - 1]); ++i1) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_1(tag, func, type, m_offset, extent, rank - 2, \
                                  i1 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_1(tag, func, type, m_offset, extent, 1, \
                                  i1 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i2 = (type)0; i2 < static_cast<type>(extent[rank - 1]); ++i2) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_2(tag, func, type, m_offset, extent, rank - 2, \
                                  i2 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_2(tag, func, type, m_offset, extent, 1, \
                                  i2 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i3 = (type)0; i3 < static_cast<type>(extent[rank - 1]); ++i3) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_3(tag, func, type, m_offset, extent, rank - 2, \
                                  i3 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_3(tag, func, type, m_offset, extent, 1, \
                                  i3 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i4 = (type)0; i4 < static_cast<type>(extent[rank - 1]); ++i4) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_4(tag, func, type, m_offset, extent, rank - 2, \
                                  i4 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_4(tag, func, type, m_offset, extent, 1, \
                                  i4 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i5 = (type)0; i5 < static_cast<type>(extent[rank - 1]); ++i5) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_5(tag, func, type, m_offset, extent, rank - 2, \
                                  i5 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_5(tag, func, type, m_offset, extent, 1, \
                                  i5 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i6 = (type)0; i6 < static_cast<type>(extent[rank - 1]); ++i6) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_6(tag, func, type, m_offset, extent, rank - 2, \
                                  i6 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_6(tag, func, type, m_offset, extent, 1, \
                                  i6 + m_offset[0]) \
    } \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, \
                                         extent, rank) \
  if (is_left) { \
    for (type i7 = (type)0; i7 < static_cast<type>(extent[rank - 1]); ++i7) { \
      KOKKOS_IMPL_TAGGED_LOOP_L_7(tag, func, type, m_offset, extent, rank - 2, \
                                  i7 + m_offset[rank - 1]) \
    } \
  } else { \
    for (type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
      KOKKOS_IMPL_TAGGED_LOOP_R_7(tag, func, type, m_offset, extent, 1, \
                                  i7 + m_offset[0]) \
    } \
  }

// Partial vs Full Tile
// KOKKOS_IMPL_TAGGED_TILE_LOOP_<N>(tag, func, type, is_left, cond, m_offset,
//                                  extent_full, extent_partial, rank)
// Runs the rank-N tagged loop nest with extent_full when cond is true and
// with extent_partial otherwise.
#define KOKKOS_IMPL_TAGGED_TILE_LOOP_1(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_2(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_3(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_4(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_5(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_6(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_7(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

#define KOKKOS_IMPL_TAGGED_TILE_LOOP_8(tag, func, type, is_left, cond, \
                                       m_offset, extent_full, extent_partial, \
                                       rank) \
  if (cond) { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, \
                                     extent_full, rank) \
  } else { \
    KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8(tag, func, type, is_left, m_offset, \
                                     extent_partial, rank) \
  }

// parallel_reduce, tagged
// Reduction version
// KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, ...) calls
// func(tag, indices..., val). The trailing semicolon is part of the expansion.
#define KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, ...) \
  func(tag, __VA_ARGS__, val);

// LayoutRight
// d = 0 to start
// KOKKOS_IMPL_TAGGED_LOOP_R_<N>_REDUX(val, tag, func, type, m_offset, extent,
//                                     d, ...)
// Tagged reduction counterpart of the LayoutRight loop family; indices are
// appended to the END of the variadic list, which must already contain at
// least one index when invoked.
#define KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
    KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, __VA_ARGS__, \
                                   i0 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i1 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i2 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i3 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i4 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i5 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i6 + m_offset[d]) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_R_8_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) { \
    KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, extent, \
                                      d + 1, __VA_ARGS__, i7 + m_offset[d]) \
  }

// LayoutLeft
// d = rank-1 to start
// KOKKOS_IMPL_TAGGED_LOOP_L_<N>_REDUX(val, tag, func, type, m_offset, extent,
//                                     d, ...)
// Tagged reduction counterpart of the LayoutLeft loop family; indices are
// PREPENDED to the variadic list.
#define KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  KOKKOS_ENABLE_IVDEP_MDRANGE \
  for (type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
    KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[d], \
                                   __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, extent, \
                                      d - 1, i1 + m_offset[d], __VA_ARGS__) \
  }

#define KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...) \
  for (type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) { \
    KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, extent, \
                                      d - 1, i2 + m_offset[d], __VA_ARGS__) \
  }

// NOTE(review): the definition below runs past the end of the visible source.
// Only its parameter list is reproduced; its replacement text must follow on
// the next line of the file.
#define KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, \
                                          extent, d, ...)
\ for (type i3 = (type)0; i3 < static_cast(extent[d]); ++i3) { \ KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, extent, \ d - 1, i3 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, \ extent, d, ...) \ for (type i4 = (type)0; i4 < static_cast(extent[d]); ++i4) { \ KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, extent, \ d - 1, i4 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, \ extent, d, ...) \ for (type i5 = (type)0; i5 < static_cast(extent[d]); ++i5) { \ KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, extent, \ d - 1, i5 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, \ extent, d, ...) \ for (type i6 = (type)0; i6 < static_cast(extent[d]); ++i6) { \ KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, extent, \ d - 1, i6 + m_offset[d], __VA_ARGS__) \ } #define KOKKOS_IMPL_TAGGED_LOOP_L_8_REDUX(val, tag, func, type, m_offset, \ extent, d, ...) 
\ for (type i7 = (type)0; i7 < static_cast(extent[d]); ++i7) { \ KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, extent, \ d - 1, i7 + m_offset[d], __VA_ARGS__) \ } // Left vs Right #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ KOKKOS_ENABLE_IVDEP_MDRANGE \ for (type i0 = (type)0; i0 < static_cast(extent[0]); ++i0) { \ KOKKOS_IMPL_TAGGED_APPLY_REDUX(val, tag, func, i0 + m_offset[0]) \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i1 = (type)0; i1 < static_cast(extent[rank - 1]); ++i1) { \ KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i1 + m_offset[rank - 1]) \ } \ } else { \ for (type i1 = (type)0; i1 < static_cast(extent[0]); ++i1) { \ KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX(val, tag, func, type, m_offset, \ extent, 1, i1 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i2 = (type)0; i2 < static_cast(extent[rank - 1]); ++i2) { \ KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i2 + m_offset[rank - 1]) \ } \ } else { \ for (type i2 = (type)0; i2 < static_cast(extent[0]); ++i2) { \ KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX(val, tag, func, type, m_offset, \ extent, 1, i2 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i3 = (type)0; i3 < static_cast(extent[rank - 1]); ++i3) { \ KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i3 + m_offset[rank - 1]) \ } \ } else { \ for (type i3 = (type)0; i3 < static_cast(extent[0]); ++i3) { \ KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX(val, tag, func, type, m_offset, \ extent, 1, i3 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, 
func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i4 = (type)0; i4 < static_cast(extent[rank - 1]); ++i4) { \ KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i4 + m_offset[rank - 1]) \ } \ } else { \ for (type i4 = (type)0; i4 < static_cast(extent[0]); ++i4) { \ KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX(val, tag, func, type, m_offset, \ extent, 1, i4 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i5 = (type)0; i5 < static_cast(extent[rank - 1]); ++i5) { \ KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i5 + m_offset[rank - 1]) \ } \ } else { \ for (type i5 = (type)0; i5 < static_cast(extent[0]); ++i5) { \ KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX(val, tag, func, type, m_offset, \ extent, 1, i5 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i6 = (type)0; i6 < static_cast(extent[rank - 1]); ++i6) { \ KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i6 + m_offset[rank - 1]) \ } \ } else { \ for (type i6 = (type)0; i6 < static_cast(extent[0]); ++i6) { \ KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX(val, tag, func, type, m_offset, \ extent, 1, i6 + m_offset[0]) \ } \ } #define KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \ m_offset, extent, rank) \ if (is_left) { \ for (type i7 = (type)0; i7 < static_cast(extent[rank - 1]); ++i7) { \ KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX(val, tag, func, type, m_offset, \ extent, rank - 2, \ i7 + m_offset[rank - 1]) \ } \ } else { \ for (type i7 = (type)0; i7 < static_cast(extent[0]); ++i7) { \ KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX(val, tag, func, type, m_offset, \ extent, 1, i7 + m_offset[0]) \ } \ } // Partial vs Full Tile #define KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX(val, tag, func, type, 
is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \ 
m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } #define KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX(val, tag, func, type, is_left, \ cond, m_offset, extent_full, \ extent_partial, rank) \ if (cond) { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \ m_offset, extent_full, rank) \ } else { \ KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX(val, tag, func, type, is_left, \ m_offset, extent_partial, rank) \ } // end tagged macros // Structs for calling loops template struct Tile_Loop_Type; template struct Tile_Loop_Type<1, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_1(func, IType, IsLeft, cond, offset, a, b, 1); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_1_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 1); } }; template struct Tile_Loop_Type<2, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_2(func, IType, IsLeft, cond, offset, a, b, 2); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_2_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 2); } }; template struct Tile_Loop_Type<3, IsLeft, IType, void, void> { template static void 
apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_3(func, IType, IsLeft, cond, offset, a, b, 3); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_3_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 3); } }; template struct Tile_Loop_Type<4, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_4(func, IType, IsLeft, cond, offset, a, b, 4); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_4_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 4); } }; template struct Tile_Loop_Type<5, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_5(func, IType, IsLeft, cond, offset, a, b, 5); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_5_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 5); } }; template struct Tile_Loop_Type<6, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_6(func, IType, IsLeft, cond, offset, a, b, 6); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_6_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 6); } }; template struct Tile_Loop_Type<7, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { 
KOKKOS_IMPL_TILE_LOOP_7(func, IType, IsLeft, cond, offset, a, b, 7); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_7_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 7); } }; template struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_8(func, IType, IsLeft, cond, offset, a, b, 8); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TILE_LOOP_8_REDUX(value, func, IType, IsLeft, cond, offset, a, b, 8); } }; // tagged versions template struct Tile_Loop_Type<1, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_1(Tagged(), func, IType, IsLeft, cond, offset, a, b, 1); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 1); } }; template struct Tile_Loop_Type<2, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_2(Tagged(), func, IType, IsLeft, cond, offset, a, b, 2); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 2); } }; template struct Tile_Loop_Type<3, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA 
const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_3(Tagged(), func, IType, IsLeft, cond, offset, a, b, 3); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 3); } }; template struct Tile_Loop_Type<4, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_4(Tagged(), func, IType, IsLeft, cond, offset, a, b, 4); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 4); } }; template struct Tile_Loop_Type<5, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_5(Tagged(), func, IType, IsLeft, cond, offset, a, b, 5); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 5); } }; template struct Tile_Loop_Type<6, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_6(Tagged(), func, IType, IsLeft, cond, offset, a, b, 6); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 6); } }; template struct Tile_Loop_Type<7, IsLeft, IType, Tagged, 
std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_7(Tagged(), func, IType, IsLeft, cond, offset, a, b, 7); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 7); } }; template struct Tile_Loop_Type<8, IsLeft, IType, Tagged, std::enable_if_t::value>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_8(Tagged(), func, IType, IsLeft, cond, offset, a, b, 8); } template static void apply(ValType& value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX(value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 8); } }; // end Structs for calling loops template struct HostIterateTile; // For ParallelFor template struct HostIterateTile::value>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; using value_type = ValueType; inline HostIterateTile(RP const& rp, Functor const& func) : m_rp(rp), m_func(func) {} inline bool check_iteration_bounds(point_type& partial_tile, point_type& offset) const { bool is_full_tile = true; for (int i = 0; i < RP::rank; ++i) { if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { partial_tile[i] = m_rp.m_tile[i]; } else { is_full_tile = false; partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? 
(m_rp.m_upper[i] - offset[i]) : (m_rp.m_upper[i] - m_rp.m_lower[i]); // when single tile encloses range } } return is_full_tile; } // end check bounds template struct RankTag { using type = RankTag; enum { value = (int)Rank }; }; #if KOKKOS_ENABLE_NEW_LOOP_MACROS template inline void operator()(IType tile_idx) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); Tile_Loop_Type::apply(m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims); } #else template inline void operator()(IType tile_idx) const { operator_impl(tile_idx, RankTag()); } // added due to compiler error when using sfinae to choose operator based on // rank w/ cuda+serial template inline void operator_impl(IType tile_idx, const RankTag<2>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } else { // #pragma simd 
KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } else { // #pragma simd KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } } // end Iterate::Right } // end op() rank == 2 template inline void operator_impl(IType tile_idx, const RankTag<3>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } else { // #pragma simd KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } else { // #pragma simd KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } } // end Iterate::Right } // end op() rank == 3 template inline void operator_impl(IType tile_idx, const RankTag<4>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // 
Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } else { // #pragma simd KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } else { // #pragma simd KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } } // end Iterate::Right } // end op() rank == 4 template inline void operator_impl(IType tile_idx, const RankTag<5>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } else { // #pragma simd KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } else { // #pragma simd KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } } // end Iterate::Right } // end op() rank == 5 template inline void operator_impl(IType tile_idx, const RankTag<6>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == 
Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } else { // #pragma simd KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } else { // #pragma simd KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } } // end Iterate::Right } // end op() rank == 6 template inline void operator_impl(IType tile_idx, const RankTag<7>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } else { // #pragma simd KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd 
KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } else { // #pragma simd KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } } // end Iterate::Right } // end op() rank == 7 template inline void operator_impl(IType tile_idx, const RankTag<8>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } else { // #pragma simd KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } else { // #pragma simd KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } } // end Iterate::Right } // end op() rank == 8 #endif template std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), void> apply(Args&&... args) const { m_func(args...); } template std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), void> apply(Args&&... 
args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; std::conditional_t::value, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template struct HostIterateTile::value && !std::is_array::value>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; using value_type = ValueType; inline HostIterateTile(RP const& rp, Functor const& func) : m_rp(rp) // Cuda 7.0 does not like braces... , m_func(func) { // Errors due to braces rather than parenthesis for init (with cuda 7.0) // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: // error: too many braces around initializer for ‘int’ [-fpermissive] // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: // error: aggregate value used where an integer was expected } inline bool check_iteration_bounds(point_type& partial_tile, point_type& offset) const { bool is_full_tile = true; for (int i = 0; i < RP::rank; ++i) { if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { partial_tile[i] = m_rp.m_tile[i]; } else { is_full_tile = false; partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? 
(m_rp.m_upper[i] - offset[i]) : (m_rp.m_upper[i] - m_rp.m_lower[i]); // when single tile encloses range } } return is_full_tile; } // end check bounds template struct RankTag { using type = RankTag; enum { value = (int)Rank }; }; #if KOKKOS_ENABLE_NEW_LOOP_MACROS template inline void operator()(IType tile_idx, value_type& val) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); Tile_Loop_Type::apply(val, m_func.get_functor(), full_tile, m_offset, m_rp.m_tile, m_tiledims); } #else template inline void operator()(IType tile_idx) const { operator_impl(tile_idx, RankTag()); } // added due to compiler error when using sfinae to choose operator based on // rank template inline void operator_impl(IType tile_idx, const RankTag<2>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } else { 
// #pragma simd KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } else { // #pragma simd KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } } // end Iterate::Right } // end op() rank == 2 template inline void operator_impl(IType tile_idx, const RankTag<3>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } else { // #pragma simd KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } else { // #pragma simd KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } } // end Iterate::Right } // end op() rank == 3 template inline void operator_impl(IType tile_idx, const RankTag<4>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= 
m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } else { // #pragma simd KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } else { // #pragma simd KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } } // end Iterate::Right } // end op() rank == 4 template inline void operator_impl(IType tile_idx, const RankTag<5>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } else { // #pragma simd KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } else { // #pragma simd KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } } // end Iterate::Right } // end op() rank == 5 template inline void operator_impl(IType tile_idx, const RankTag<6>) const { point_type m_offset; point_type m_tiledims; if 
(RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } else { // #pragma simd KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } else { // #pragma simd KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } } // end Iterate::Right } // end op() rank == 6 template inline void operator_impl(IType tile_idx, const RankTag<7>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } else { // #pragma simd KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd 
KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } else { // #pragma simd KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } } // end Iterate::Right } // end op() rank == 7 template inline void operator_impl(IType tile_idx, const RankTag<8>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } else { // #pragma simd KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } else { // #pragma simd KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } } // end Iterate::Right } // end op() rank == 8 template std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), void> apply(Args&&... args) const { m_func(args..., m_v); } template std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), void> apply(Args&&... 
args) const { m_func(m_tag, args..., m_v); } #endif RP const m_rp; Functor const m_func; std::conditional_t::value, int, Tag> m_tag; }; // For ParallelReduce // Extra specialization for array reductions // ValueType[]: For array reductions template struct HostIterateTile::value && std::is_array::value>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; using value_type = std::remove_extent_t; // strip away the // 'array-ness' [], only // underlying type remains inline HostIterateTile(RP const& rp, Functor const& func) : m_rp(rp) // Cuda 7.0 does not like braces... , m_func(func) {} inline bool check_iteration_bounds(point_type& partial_tile, point_type& offset) const { bool is_full_tile = true; for (int i = 0; i < RP::rank; ++i) { if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { partial_tile[i] = m_rp.m_tile[i]; } else { is_full_tile = false; partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? 
(m_rp.m_upper[i] - offset[i]) : (m_rp.m_upper[i] - m_rp.m_lower[i]); // when single tile encloses range } } return is_full_tile; } // end check bounds template struct RankTag { using type = RankTag; enum { value = (int)Rank }; }; #if KOKKOS_ENABLE_NEW_LOOP_MACROS template inline void operator()(IType tile_idx, value_type* val) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); Tile_Loop_Type::apply(val, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims); } #else template inline void operator()(IType tile_idx) const { operator_impl(tile_idx, RankTag()); } // added due to compiler error when using sfinae to choose operator based on // rank template inline void operator_impl(IType tile_idx, const RankTag<2>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } else { // #pragma 
simd KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } else { // #pragma simd KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } } } // end Iterate::Right } // end op() rank == 2 template inline void operator_impl(IType tile_idx, const RankTag<3>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } else { // #pragma simd KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } else { // #pragma simd KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } } } // end Iterate::Right } // end op() rank == 3 template inline void operator_impl(IType tile_idx, const RankTag<4>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } 
} // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } else { // #pragma simd KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } else { // #pragma simd KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } } } // end Iterate::Right } // end op() rank == 4 template inline void operator_impl(IType tile_idx, const RankTag<5>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } else { // #pragma simd KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } else { // #pragma simd KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } } } // end Iterate::Right } // end op() rank == 5 template inline void operator_impl(IType tile_idx, const RankTag<6>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == 
Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } else { // #pragma simd KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } else { // #pragma simd KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } } } // end Iterate::Right } // end op() rank == 6 template inline void operator_impl(IType tile_idx, const RankTag<7>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } else { // #pragma simd KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd 
KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } else { // #pragma simd KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } } } // end Iterate::Right } // end op() rank == 7 template inline void operator_impl(IType tile_idx, const RankTag<8>) const { point_type m_offset; point_type m_tiledims; if (RP::outer_direction == Iterate::Left) { for (int i = 0; i < RP::rank; ++i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } else { for (int i = RP::rank - 1; i >= 0; --i) { m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; tile_idx /= m_rp.m_tile_end[i]; } } // Check if offset+tiledim in bounds - if not, replace tile dims with the // partial tile dims const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); if (RP::inner_direction == Iterate::Left) { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } else { // #pragma simd KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } } // end Iterate::Left else { if (full_tile) { // #pragma simd KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } else { // #pragma simd KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } } } // end Iterate::Right } // end op() rank == 8 template std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), void> apply(Args&&... args) const { m_func(args..., m_v); } template std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), void> apply(Args&&... 
args) const { m_func(m_tag, args..., m_v); } #endif RP const m_rp; Functor const m_func; std::conditional_t::value, int, Tag> m_tag; }; // ------------------------------------------------------------------ // #undef KOKKOS_ENABLE_NEW_LOOP_MACROS #undef KOKKOS_IMPL_LOOP_1L #undef KOKKOS_IMPL_LOOP_2L #undef KOKKOS_IMPL_LOOP_3L #undef KOKKOS_IMPL_LOOP_4L #undef KOKKOS_IMPL_LOOP_5L #undef KOKKOS_IMPL_LOOP_6L #undef KOKKOS_IMPL_LOOP_7L #undef KOKKOS_IMPL_LOOP_8L #undef KOKKOS_IMPL_LOOP_1R #undef KOKKOS_IMPL_LOOP_2R #undef KOKKOS_IMPL_LOOP_3R #undef KOKKOS_IMPL_LOOP_4R #undef KOKKOS_IMPL_LOOP_5R #undef KOKKOS_IMPL_LOOP_6R #undef KOKKOS_IMPL_LOOP_7R #undef KOKKOS_IMPL_LOOP_8R #undef KOKKOS_IMPL_LOOP_ARGS_1 #undef KOKKOS_IMPL_LOOP_ARGS_2 #undef KOKKOS_IMPL_LOOP_ARGS_3 #undef KOKKOS_IMPL_LOOP_ARGS_4 #undef KOKKOS_IMPL_LOOP_ARGS_5 #undef KOKKOS_IMPL_LOOP_ARGS_6 #undef KOKKOS_IMPL_LOOP_ARGS_7 #undef KOKKOS_IMPL_LOOP_ARGS_8 #undef KOKKOS_IMPL_APPLY #undef KOKKOS_IMPL_LOOP_R_1 #undef KOKKOS_IMPL_LOOP_R_2 #undef KOKKOS_IMPL_LOOP_R_3 #undef KOKKOS_IMPL_LOOP_R_4 #undef KOKKOS_IMPL_LOOP_R_5 #undef KOKKOS_IMPL_LOOP_R_6 #undef KOKKOS_IMPL_LOOP_R_7 #undef KOKKOS_IMPL_LOOP_R_8 #undef KOKKOS_IMPL_LOOP_L_1 #undef KOKKOS_IMPL_LOOP_L_2 #undef KOKKOS_IMPL_LOOP_L_3 #undef KOKKOS_IMPL_LOOP_L_4 #undef KOKKOS_IMPL_LOOP_L_5 #undef KOKKOS_IMPL_LOOP_L_6 #undef KOKKOS_IMPL_LOOP_L_7 #undef KOKKOS_IMPL_LOOP_L_8 #undef KOKKOS_IMPL_LOOP_LAYOUT_1 #undef KOKKOS_IMPL_LOOP_LAYOUT_2 #undef KOKKOS_IMPL_LOOP_LAYOUT_3 #undef KOKKOS_IMPL_LOOP_LAYOUT_4 #undef KOKKOS_IMPL_LOOP_LAYOUT_5 #undef KOKKOS_IMPL_LOOP_LAYOUT_6 #undef KOKKOS_IMPL_LOOP_LAYOUT_7 #undef KOKKOS_IMPL_LOOP_LAYOUT_8 #undef KOKKOS_IMPL_TILE_LOOP_1 #undef KOKKOS_IMPL_TILE_LOOP_2 #undef KOKKOS_IMPL_TILE_LOOP_3 #undef KOKKOS_IMPL_TILE_LOOP_4 #undef KOKKOS_IMPL_TILE_LOOP_5 #undef KOKKOS_IMPL_TILE_LOOP_6 #undef KOKKOS_IMPL_TILE_LOOP_7 #undef KOKKOS_IMPL_TILE_LOOP_8 #undef KOKKOS_IMPL_APPLY_REDUX #undef KOKKOS_IMPL_LOOP_R_1_REDUX #undef 
KOKKOS_IMPL_LOOP_R_2_REDUX #undef KOKKOS_IMPL_LOOP_R_3_REDUX #undef KOKKOS_IMPL_LOOP_R_4_REDUX #undef KOKKOS_IMPL_LOOP_R_5_REDUX #undef KOKKOS_IMPL_LOOP_R_6_REDUX #undef KOKKOS_IMPL_LOOP_R_7_REDUX #undef KOKKOS_IMPL_LOOP_R_8_REDUX #undef KOKKOS_IMPL_LOOP_L_1_REDUX #undef KOKKOS_IMPL_LOOP_L_2_REDUX #undef KOKKOS_IMPL_LOOP_L_3_REDUX #undef KOKKOS_IMPL_LOOP_L_4_REDUX #undef KOKKOS_IMPL_LOOP_L_5_REDUX #undef KOKKOS_IMPL_LOOP_L_6_REDUX #undef KOKKOS_IMPL_LOOP_L_7_REDUX #undef KOKKOS_IMPL_LOOP_L_8_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_1_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_2_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_3_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_4_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_5_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_6_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_7_REDUX #undef KOKKOS_IMPL_LOOP_LAYOUT_8_REDUX #undef KOKKOS_IMPL_TILE_LOOP_1_REDUX #undef KOKKOS_IMPL_TILE_LOOP_2_REDUX #undef KOKKOS_IMPL_TILE_LOOP_3_REDUX #undef KOKKOS_IMPL_TILE_LOOP_4_REDUX #undef KOKKOS_IMPL_TILE_LOOP_5_REDUX #undef KOKKOS_IMPL_TILE_LOOP_6_REDUX #undef KOKKOS_IMPL_TILE_LOOP_7_REDUX #undef KOKKOS_IMPL_TILE_LOOP_8_REDUX #undef KOKKOS_IMPL_TAGGED_APPLY #undef KOKKOS_IMPL_TAGGED_LOOP_R_1 #undef KOKKOS_IMPL_TAGGED_LOOP_R_2 #undef KOKKOS_IMPL_TAGGED_LOOP_R_3 #undef KOKKOS_IMPL_TAGGED_LOOP_R_4 #undef KOKKOS_IMPL_TAGGED_LOOP_R_5 #undef KOKKOS_IMPL_TAGGED_LOOP_R_6 #undef KOKKOS_IMPL_TAGGED_LOOP_R_7 #undef KOKKOS_IMPL_TAGGED_LOOP_R_8 #undef KOKKOS_IMPL_TAGGED_LOOP_L_1 #undef KOKKOS_IMPL_TAGGED_LOOP_L_2 #undef KOKKOS_IMPL_TAGGED_LOOP_L_3 #undef KOKKOS_IMPL_TAGGED_LOOP_L_4 #undef KOKKOS_IMPL_TAGGED_LOOP_L_5 #undef KOKKOS_IMPL_TAGGED_LOOP_L_6 #undef KOKKOS_IMPL_TAGGED_LOOP_L_7 #undef KOKKOS_IMPL_TAGGED_LOOP_L_8 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6 #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7 #undef 
KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_1 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_2 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_3 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_4 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_5 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_6 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_7 #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_8 #undef KOKKOS_IMPL_TAGGED_APPLY_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_1_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_2_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_3_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_4_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_5_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_6_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_7_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_R_8_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_1_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_2_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_3_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_4_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_5_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_6_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_7_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_L_8_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_1_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_2_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_3_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_4_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_5_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_6_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_7_REDUX #undef KOKKOS_IMPL_TAGGED_LOOP_LAYOUT_8_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_1_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_2_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_3_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_4_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_5_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_6_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_7_REDUX #undef KOKKOS_IMPL_TAGGED_TILE_LOOP_8_REDUX } // namespace Impl } // namespace Kokkos #endif