44 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP 45 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP 47 #include <initializer_list> 49 #include<impl/KokkosExp_Host_IterateTile.hpp> 50 #include <Kokkos_ExecPolicy.hpp> 53 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA ) 54 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp> 68 template <
typename ExecSpace>
// default_outer_direction: compile-time trait exposing `value`, the outer
// iteration direction substituted when an iteration pattern leaves its outer
// direction as Iterate::Default.
// `value` is Iterate::Left under KOKKOS_ENABLE_CUDA; the Iterate::Right
// definition below is presumably the alternative branch.
// NOTE(review): the `#else` / `#endif` lines and the struct braces are not
// present in this listing - confirm against the full header.
// NOTE(review): ExecSpace is not referenced in the visible lines; the choice
// depends only on the KOKKOS_ENABLE_CUDA macro.
69 struct default_outer_direction
72 #if defined( KOKKOS_ENABLE_CUDA) 73 static constexpr Iterate value = Iterate::Left;
75 static constexpr Iterate value = Iterate::Right;
// default_inner_direction: compile-time trait exposing `value`, the inner
// iteration direction substituted when an iteration pattern leaves its inner
// direction as Iterate::Default.
// `value` is Iterate::Left under KOKKOS_ENABLE_CUDA; the Iterate::Right
// definition below is presumably the alternative branch.
// NOTE(review): the `#else` / `#endif` lines and the struct braces are not
// present in this listing - confirm against the full header.
79 template <
typename ExecSpace>
80 struct default_inner_direction
83 #if defined( KOKKOS_ENABLE_CUDA) 84 static constexpr Iterate value = Iterate::Left;
86 static constexpr Iterate value = Iterate::Right;
// Iteration-pattern descriptor (presumably `struct Rank`, judging by the
// `iteration_pattern` alias below; the struct header and the declaration of N
// are not visible in this listing).
// Template parameters OuterDir and InnerDir both default to Iterate::Default.
93 , Iterate OuterDir = Iterate::Default
94 , Iterate InnerDir = Iterate::Default
// Compile-time validation of the rank N: 0 and 1 are rejected, and N must be
// strictly less than 7.
98 static_assert( N != 0u,
"Kokkos Error: rank 0 undefined");
99 static_assert( N != 1u,
"Kokkos Error: rank 1 is not a multi-dimensional range");
100 static_assert( N < 7u,
"Kokkos Error: Unsupported rank...");
// Self-referential alias naming this exact pattern instantiation.
102 using iteration_pattern = Rank<N, OuterDir, InnerDir>;
// The template arguments re-exported as static members; note that `rank` is
// stored as int.
104 static constexpr
int rank = N;
105 static constexpr Iterate outer_direction = OuterDir;
106 static constexpr Iterate inner_direction = InnerDir;
// Multi-dimensional range policy (the `struct MDRangePolicy` line itself is not
// visible in this listing; the name is taken from the constructors that follow).
// It publicly inherits Kokkos::Impl::PolicyTraits<Properties...> and exposes
// the aliases and compile-time constants below.
111 template <
typename... Properties>
113 :
public Kokkos::Impl::PolicyTraits<Properties ...>
115 using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
// RangePolicy built from the same property pack as this policy.
116 using range_policy = RangePolicy<Properties...>;
// RangePolicy built from selected traits members: execution_space,
// schedule_type and index_type.
// NOTE(review): the end of this alias is not visible here; further template
// arguments may follow in the full header.
118 using impl_range_policy = RangePolicy<
typename traits::execution_space
119 ,
typename traits::schedule_type
120 ,
typename traits::index_type
// The property pack must supply an iteration pattern; a `void` pattern is a
// compile-time error.
123 static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
124 ,
"Kokkos Error: MD iteration pattern not defined" );
126 using iteration_pattern =
typename traits::iteration_pattern;
127 using work_tag =
typename traits::work_tag;
129 static constexpr
int rank = iteration_pattern::rank;
// Resolved outer direction, stored as int: the pattern's own outer direction
// unless it is Iterate::Default, in which case default_outer_direction for
// the execution space is used.
131 static constexpr
int outer_direction =
static_cast<int> (
132 (iteration_pattern::outer_direction != Iterate::Default)
133 ? iteration_pattern::outer_direction
134 : default_outer_direction< typename traits::execution_space>::value );
// Resolved inner direction, stored as int, using the same fallback rule with
// default_inner_direction.
136 static constexpr
int inner_direction =
static_cast<int> (
137 iteration_pattern::inner_direction != Iterate::Default
138 ? iteration_pattern::inner_direction
139 : default_inner_direction< typename traits::execution_space>::value ) ;
// Integer forms of Iterate::Right / Iterate::Left, so the int-typed
// inner_direction / outer_direction above can be compared directly.
143 static constexpr
int Right =
static_cast<int>( Iterate::Right );
144 static constexpr
int Left =
static_cast<int>( Iterate::Left );
146 using index_type =
typename traits::index_type;
// Element type the initializer-list constructor casts its inputs to.
147 using array_index_type = long;
// Constructor taking lower bounds, upper bounds and optional tile sizes
// (tile defaults to a value-initialized tile_type).
// For every dimension it computes the extent, fills in a default for
// non-positive tile sizes, and derives the per-dimension tile count and the
// total tile count.
// NOTE(review): the member-initializer list, the declaration of `span`, the
// default tile values and all closing braces are not visible in this listing.
158 MDRangePolicy( point_type
const& lower, point_type
const& upper, tile_type
const& tile = tile_type{} )
// When KOKKOS_ENABLE_CUDA is defined, the condition guarding the first loop
// additionally requires that the execution space is not Kokkos::Cuda.
// NOTE(review): the opening of that condition is not visible here.
166 #
if defined(KOKKOS_ENABLE_CUDA)
167 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
172 for (
int i=0; i<
rank; ++i) {
173 span = upper[i] - lower[i];
// A non-positive tile size means one must be chosen here. The test below is
// true for every dimension except the last when the inner direction is
// Right, and except the first when it is Left.
// NOTE(review): the values assigned in each branch are not visible.
174 if ( m_tile[i] <= 0 ) {
175 if ( (inner_direction == Right && (i < rank-1))
176 || (inner_direction == Left && (i > 0)) )
// Number of tiles along dimension i: span divided by the tile size, rounded
// up (for positive span and tile); the running product is the total.
184 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
185 m_num_tiles *= m_tile_end[i];
// Second copy of the same loop, compiled only when KOKKOS_ENABLE_CUDA is
// defined - presumably the Kokkos::Cuda branch of the condition above (the
// `else` is not visible; confirm).
188 #if defined(KOKKOS_ENABLE_CUDA) 192 for (
int i=0; i<
rank; ++i) {
193 span = upper[i] - lower[i];
194 if ( m_tile[i] <= 0 ) {
197 if ( (inner_direction == Right && (i < rank-1))
198 || (inner_direction == Left && (i > 0)) )
206 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
207 m_num_tiles *= m_tile_end[i];
// Product of all tile dimensions; the run is aborted when it reaches 1024.
// NOTE(review): `>= 1024` also rejects a product of exactly 1024, while the
// message speaks of exceeding the maximum threads per block - confirm
// whether `>` was intended.
209 index_type total_tile_size_check = 1;
210 for (
int i=0; i<
rank; ++i) {
211 total_tile_size_check *= m_tile[i];
213 if ( total_tile_size_check >= 1024 ) {
214 printf(
" Tile dimensions exceed Cuda limits\n");
215 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Constructor taking the bounds and optional tile sizes as initializer lists
// of arbitrary element types LT / UT / TT (TT defaults to array_index_type,
// `tile` defaults to an empty list). Elements are cast to array_index_type.
// NOTE(review): the member-initializer list, the declarations of `span` and
// `tile_tmp`, the default tile values, any preprocessor guards around the
// first section and all closing braces are not visible in this listing.
223 template <
typename LT ,
typename UT ,
typename TT = array_index_type >
224 MDRangePolicy( std::initializer_list<LT>
const& lower, std::initializer_list<UT>
const& upper, std::initializer_list<TT>
const& tile = {} )
// First section: copies the lists into temporaries and names the other
// constructor.
// NOTE(review): as an ordinary statement, `MDRangePolicy( ... );` creates and
// discards a temporary instead of delegating; this section may be disabled
// by a directive not shown here - confirm.
// NOTE(review): `tile.begin()[i]` is read for every i < rank although `tile`
// defaults to an empty list.
228 point_type lower_tmp, upper_tmp;
230 for (
auto i = 0; i <
rank; ++i ) {
231 lower_tmp[i] =
static_cast<array_index_type
>(lower.begin()[i]);
232 upper_tmp[i] =
static_cast<array_index_type
>(upper.begin()[i]);
233 tile_tmp[i] =
static_cast<array_index_type
>(tile.begin()[i]);
236 MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
// Size validation: aborts when m_lower or m_upper does not have `rank`
// elements.
// NOTE(review): this tests the members m_lower / m_upper, not the incoming
// `lower` / `upper` lists that the message refers to - confirm whether
// lower.size() / upper.size() was intended.
239 if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
240 Kokkos::abort(
"MDRangePolicy: Constructor initializer lists have wrong size");
// Copy the bounds into the members; the tile entry is taken from `tile` only
// when exactly `rank` tile sizes were supplied.
// NOTE(review): what m_tile[i] receives otherwise is not visible here.
242 for (
auto i = 0; i <
rank; ++i ) {
243 m_lower[i] =
static_cast<array_index_type
>(lower.begin()[i]);
244 m_upper[i] =
static_cast<array_index_type
>(upper.begin()[i]);
245 if(static_cast<int>(tile.size())==rank)
246 m_tile[i] =
static_cast<array_index_type
>(tile.begin()[i]);
// When KOKKOS_ENABLE_CUDA is defined, the condition guarding the first tiling
// loop additionally requires that the execution space is not Kokkos::Cuda.
// NOTE(review): the opening of that condition is not visible here.
256 #
if defined(KOKKOS_ENABLE_CUDA)
257 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
262 for (
int i=0; i<
rank; ++i) {
263 span = m_upper[i] - m_lower[i];
// A non-positive tile size means one must be chosen here. The test below is
// true for every dimension except the last when the inner direction is
// Right, and except the first when it is Left.
264 if ( m_tile[i] <= 0 ) {
265 if ( (inner_direction == Right && (i < rank-1))
266 || (inner_direction == Left && (i > 0)) )
// Number of tiles along dimension i: span divided by the tile size, rounded
// up (for positive span and tile); the running product is the total.
274 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
275 m_num_tiles *= m_tile_end[i];
// Second copy of the same loop, compiled only when KOKKOS_ENABLE_CUDA is
// defined - presumably the Kokkos::Cuda branch of the condition above (the
// `else` is not visible; confirm).
278 #if defined(KOKKOS_ENABLE_CUDA) 282 for (
int i=0; i<
rank; ++i) {
283 span = m_upper[i] - m_lower[i];
284 if ( m_tile[i] <= 0 ) {
287 if ( (inner_direction == Right && (i < rank-1))
288 || (inner_direction == Left && (i > 0)) )
296 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
297 m_num_tiles *= m_tile_end[i];
// Product of all tile dimensions; the run is aborted when it reaches 1024.
// NOTE(review): `>= 1024` also rejects a product of exactly 1024, while the
// message speaks of exceeding the maximum threads per block - confirm
// whether `>` was intended.
299 index_type total_tile_size_check = 1;
300 for (
int i=0; i<
rank; ++i) {
301 total_tile_size_check *= m_tile[i];
303 if ( total_tile_size_check >= 1024 ) {
304 printf(
" Tile dimensions exceed Cuda limits\n");
305 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Per-dimension tile counts; the constructors set entry i to the extent of
// dimension i divided by its tile size, rounded up.
317 point_type m_tile_end;
// Running product of the m_tile_end entries, accumulated in the constructors.
// NOTE(review): its initial value is not visible in this listing.
318 index_type m_num_tiles;
// md_parallel_for (range-first overload, optional label `str` defaulting to
// ""). When KOKKOS_ENABLE_CUDA is defined, the enable_if condition requires
// that MDRange's execution space is not Kokkos::Cuda.
// The visible body wraps the range and functor `f` in an Impl::MDFunctor with
// a `void` value type and aliases MDRange::impl_range_policy.
// NOTE(review): the functor parameter declaration, the end of the enable_if
// argument and the remainder of the body (presumably the dispatch) are not
// visible in this listing.
325 template <
typename MDRange,
typename Functor,
typename Enable =
void>
326 void md_parallel_for( MDRange
const& range
328 ,
const std::string& str =
"" 329 ,
typename std::enable_if<(
true 330 #
if defined( KOKKOS_ENABLE_CUDA)
331 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
336 Impl::MDFunctor<MDRange, Functor, void> g(range, f);
339 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for (label-first overload). When KOKKOS_ENABLE_CUDA is defined,
// the enable_if condition requires that MDRange's execution space is not
// Kokkos::Cuda.
// The visible body wraps the range and functor `f` in an Impl::MDFunctor with
// a `void` value type and aliases MDRange::impl_range_policy.
// NOTE(review): the functor parameter declaration, the end of the enable_if
// argument and the remainder of the body (presumably the dispatch) are not
// visible in this listing.
344 template <
typename MDRange,
typename Functor>
345 void md_parallel_for(
const std::string& str
346 , MDRange
const& range
348 ,
typename std::enable_if<(
true 349 #
if defined( KOKKOS_ENABLE_CUDA)
350 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
355 Impl::MDFunctor<MDRange, Functor, void> g(range, f);
358 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for (label-first overload), compiled only when both __CUDACC__
// and KOKKOS_ENABLE_CUDA are defined. The enable_if condition requires that
// MDRange's execution space is Kokkos::Cuda.
// The visible body constructs an Impl::DeviceIterateTile closure from the
// range and functor `f`, tagged with MDRange::work_tag.
// NOTE(review): the functor parameter declaration, the end of the enable_if
// argument and how `closure` is subsequently used are not visible in this
// listing.
364 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA ) 365 template <
typename MDRange,
typename Functor>
366 void md_parallel_for(
const std::string& str
367 , MDRange
const& range
369 ,
typename std::enable_if<(
true 370 #
if defined( KOKKOS_ENABLE_CUDA)
371 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
376 Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_for (range-first overload, optional label `str` defaulting to
// ""). The enable_if condition requires that MDRange's execution space is
// Kokkos::Cuda.
// The visible body constructs an Impl::DeviceIterateTile closure from the
// range and functor `f`, tagged with MDRange::work_tag.
// NOTE(review): the functor parameter declaration, the end of the enable_if
// argument and how `closure` is subsequently used are not visible in this
// listing.
380 template <
typename MDRange,
typename Functor>
381 void md_parallel_for( MDRange
const& range
383 ,
const std::string& str =
"" 384 ,
typename std::enable_if<(
true 385 #
if defined( KOKKOS_ENABLE_CUDA)
386 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
391 Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_reduce (range-first overload, optional label `str` defaulting
// to ""). When KOKKOS_ENABLE_CUDA is defined, the enable_if condition requires
// that MDRange's execution space is not Kokkos::Cuda.
// The visible body wraps the range, functor `f` and value `v` in an
// Impl::MDFunctor parameterized on ValueType and aliases
// MDRange::impl_range_policy.
// NOTE(review): the declarations of `f` and `v`, the end of the enable_if
// argument and the remainder of the body (presumably the dispatch) are not
// visible in this listing.
400 template <
typename MDRange,
typename Functor,
typename ValueType>
401 void md_parallel_reduce( MDRange
const& range
404 ,
const std::string& str =
"" 405 ,
typename std::enable_if<(
true 406 #
if defined( KOKKOS_ENABLE_CUDA)
407 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
412 Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
415 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_reduce (label-first overload). When KOKKOS_ENABLE_CUDA is
// defined, the enable_if condition requires that MDRange's execution space is
// not Kokkos::Cuda.
// The visible body wraps the range, functor `f` and value `v` in an
// Impl::MDFunctor parameterized on ValueType and aliases
// MDRange::impl_range_policy.
// NOTE(review): the declarations of `f` and `v`, the end of the enable_if
// argument and the remainder of the body (presumably the dispatch) are not
// visible in this listing.
419 template <
typename MDRange,
typename Functor,
typename ValueType>
420 void md_parallel_reduce(
const std::string& str
421 , MDRange
const& range
424 ,
typename std::enable_if<(
true 425 #
if defined( KOKKOS_ENABLE_CUDA)
426 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
431 Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
434 using range_policy =
typename MDRange::impl_range_policy;
476 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Declaration of parallel operators.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View< D, P... > &V)
Temporary free function rank() until rank() is implemented in the View.