Kokkos Core Kernels Package  Version of the Day
KokkosExp_MDRangePolicy.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 2.0
6 // Copyright (2014) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
45 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
46 
47 #include <initializer_list>
48 
49 #include<impl/KokkosExp_Host_IterateTile.hpp>
50 #include <Kokkos_ExecPolicy.hpp>
51 #include <Kokkos_Parallel.hpp>
52 
53 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
54 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
55 #endif
56 
57 namespace Kokkos { namespace Experimental {
58 
59 // ------------------------------------------------------------------ //
60 
/// Traversal order selector for a multi-dimensional index space.
enum class Iterate
{
  Default = 0,  ///< Use the default for the device
  Left    = 1,  ///< Left indices stride fastest
  Right   = 2   ///< Right indices stride fastest
};
67 
68 template <typename ExecSpace>
69 struct default_outer_direction
70 {
71  using type = Iterate;
72  #if defined( KOKKOS_ENABLE_CUDA)
73  static constexpr Iterate value = Iterate::Left;
74  #else
75  static constexpr Iterate value = Iterate::Right;
76  #endif
77 };
78 
79 template <typename ExecSpace>
80 struct default_inner_direction
81 {
82  using type = Iterate;
83  #if defined( KOKKOS_ENABLE_CUDA)
84  static constexpr Iterate value = Iterate::Left;
85  #else
86  static constexpr Iterate value = Iterate::Right;
87  #endif
88 };
89 
90 
91 // Iteration Pattern
92 template < unsigned N
93  , Iterate OuterDir = Iterate::Default
94  , Iterate InnerDir = Iterate::Default
95  >
96 struct Rank
97 {
98  static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
99  static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
100  static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
101 
102  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
103 
104  static constexpr int rank = N;
105  static constexpr Iterate outer_direction = OuterDir;
106  static constexpr Iterate inner_direction = InnerDir;
107 };
108 
109 
110 // multi-dimensional iteration pattern
111 template <typename... Properties>
112 struct MDRangePolicy
113  : public Kokkos::Impl::PolicyTraits<Properties ...>
114 {
115  using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
116  using range_policy = RangePolicy<Properties...>;
117 
118  using impl_range_policy = RangePolicy< typename traits::execution_space
119  , typename traits::schedule_type
120  , typename traits::index_type
121  > ;
122 
123  static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
124  , "Kokkos Error: MD iteration pattern not defined" );
125 
126  using iteration_pattern = typename traits::iteration_pattern;
127  using work_tag = typename traits::work_tag;
128 
129  static constexpr int rank = iteration_pattern::rank;
130 
131  static constexpr int outer_direction = static_cast<int> (
132  (iteration_pattern::outer_direction != Iterate::Default)
133  ? iteration_pattern::outer_direction
134  : default_outer_direction< typename traits::execution_space>::value );
135 
136  static constexpr int inner_direction = static_cast<int> (
137  iteration_pattern::inner_direction != Iterate::Default
138  ? iteration_pattern::inner_direction
139  : default_inner_direction< typename traits::execution_space>::value ) ;
140 
141 
142  // Ugly ugly workaround intel 14 not handling scoped enum correctly
143  static constexpr int Right = static_cast<int>( Iterate::Right );
144  static constexpr int Left = static_cast<int>( Iterate::Left );
145 
146  using index_type = typename traits::index_type;
147  using array_index_type = long;
148  using point_type = Kokkos::Array<array_index_type,rank>; //was index_type
149  using tile_type = Kokkos::Array<array_index_type,rank>;
150  // If point_type or tile_type is not templated on a signed integral type (if it is unsigned),
151  // then if user passes in intializer_list of runtime-determined values of
152  // signed integral type that are not const will receive a compiler error due
153  // to an invalid case for implicit conversion -
154  // "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type"
155  // This would require the user to either pass a matching index_type parameter
156  // as template parameter to the MDRangePolicy or static_cast the individual values
157 
158  MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
159  : m_lower(lower)
160  , m_upper(upper)
161  , m_tile(tile)
162  , m_num_tiles(1)
163  {
164  // Host
165  if ( true
166  #if defined(KOKKOS_ENABLE_CUDA)
167  && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
168  #endif
169  )
170  {
171  index_type span;
172  for (int i=0; i<rank; ++i) {
173  span = upper[i] - lower[i];
174  if ( m_tile[i] <= 0 ) {
175  if ( (inner_direction == Right && (i < rank-1))
176  || (inner_direction == Left && (i > 0)) )
177  {
178  m_tile[i] = 2;
179  }
180  else {
181  m_tile[i] = span;
182  }
183  }
184  m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
185  m_num_tiles *= m_tile_end[i];
186  }
187  }
188  #if defined(KOKKOS_ENABLE_CUDA)
189  else // Cuda
190  {
191  index_type span;
192  for (int i=0; i<rank; ++i) {
193  span = upper[i] - lower[i];
194  if ( m_tile[i] <= 0 ) {
195  // TODO: determine what is a good default tile size for cuda
196  // may be rank dependent
197  if ( (inner_direction == Right && (i < rank-1))
198  || (inner_direction == Left && (i > 0)) )
199  {
200  m_tile[i] = 2;
201  }
202  else {
203  m_tile[i] = 16;
204  }
205  }
206  m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
207  m_num_tiles *= m_tile_end[i];
208  }
209  index_type total_tile_size_check = 1;
210  for (int i=0; i<rank; ++i) {
211  total_tile_size_check *= m_tile[i];
212  }
213  if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
214  printf(" Tile dimensions exceed Cuda limits\n");
215  Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
216  //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
217  }
218  }
219  #endif
220  }
221 
222 
223  template < typename LT , typename UT , typename TT = array_index_type >
224  MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
225  {
226 #if 0
227  // This should work, less duplicated code but not yet extensively tested
228  point_type lower_tmp, upper_tmp;
229  tile_type tile_tmp;
230  for ( auto i = 0; i < rank; ++i ) {
231  lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
232  upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
233  tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
234  }
235 
236  MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
237 
238 #else
239  if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
240  Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
241 
242  for ( auto i = 0; i < rank; ++i ) {
243  m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
244  m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
245  if(static_cast<int>(tile.size())==rank)
246  m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
247  else
248  m_tile[i] = 0;
249  }
250 
251  m_num_tiles = 1;
252 
253 
254  // Host
255  if ( true
256  #if defined(KOKKOS_ENABLE_CUDA)
257  && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
258  #endif
259  )
260  {
261  index_type span;
262  for (int i=0; i<rank; ++i) {
263  span = m_upper[i] - m_lower[i];
264  if ( m_tile[i] <= 0 ) {
265  if ( (inner_direction == Right && (i < rank-1))
266  || (inner_direction == Left && (i > 0)) )
267  {
268  m_tile[i] = 2;
269  }
270  else {
271  m_tile[i] = span;
272  }
273  }
274  m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
275  m_num_tiles *= m_tile_end[i];
276  }
277  }
278  #if defined(KOKKOS_ENABLE_CUDA)
279  else // Cuda
280  {
281  index_type span;
282  for (int i=0; i<rank; ++i) {
283  span = m_upper[i] - m_lower[i];
284  if ( m_tile[i] <= 0 ) {
285  // TODO: determine what is a good default tile size for cuda
286  // may be rank dependent
287  if ( (inner_direction == Right && (i < rank-1))
288  || (inner_direction == Left && (i > 0)) )
289  {
290  m_tile[i] = 2;
291  }
292  else {
293  m_tile[i] = 16;
294  }
295  }
296  m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
297  m_num_tiles *= m_tile_end[i];
298  }
299  index_type total_tile_size_check = 1;
300  for (int i=0; i<rank; ++i) {
301  total_tile_size_check *= m_tile[i];
302  }
303  if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
304  printf(" Tile dimensions exceed Cuda limits\n");
305  Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
306  //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
307  }
308  }
309  #endif
310 #endif
311  }
312 
313 
314  point_type m_lower;
315  point_type m_upper;
316  tile_type m_tile;
317  point_type m_tile_end;
318  index_type m_num_tiles;
319 };
320 // ------------------------------------------------------------------ //
321 
322 // ------------------------------------------------------------------ //
323 //md_parallel_for
324 // ------------------------------------------------------------------ //
325 template <typename MDRange, typename Functor, typename Enable = void>
326 void md_parallel_for( MDRange const& range
327  , Functor const& f
328  , const std::string& str = ""
329  , typename std::enable_if<( true
330  #if defined( KOKKOS_ENABLE_CUDA)
331  && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
332  #endif
333  ) >::type* = 0
334  )
335 {
336  Impl::MDFunctor<MDRange, Functor, void> g(range, f);
337 
338  //using range_policy = typename MDRange::range_policy;
339  using range_policy = typename MDRange::impl_range_policy;
340 
341  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
342 }
343 
344 template <typename MDRange, typename Functor>
345 void md_parallel_for( const std::string& str
346  , MDRange const& range
347  , Functor const& f
348  , typename std::enable_if<( true
349  #if defined( KOKKOS_ENABLE_CUDA)
350  && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
351  #endif
352  ) >::type* = 0
353  )
354 {
355  Impl::MDFunctor<MDRange, Functor, void> g(range, f);
356 
357  //using range_policy = typename MDRange::range_policy;
358  using range_policy = typename MDRange::impl_range_policy;
359 
360  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
361 }
362 
363 // Cuda specialization
364 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
365 template <typename MDRange, typename Functor>
366 void md_parallel_for( const std::string& str
367  , MDRange const& range
368  , Functor const& f
369  , typename std::enable_if<( true
370  #if defined( KOKKOS_ENABLE_CUDA)
371  && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
372  #endif
373  ) >::type* = 0
374  )
375 {
376  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
377  closure.execute();
378 }
379 
380 template <typename MDRange, typename Functor>
381 void md_parallel_for( MDRange const& range
382  , Functor const& f
383  , const std::string& str = ""
384  , typename std::enable_if<( true
385  #if defined( KOKKOS_ENABLE_CUDA)
386  && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
387  #endif
388  ) >::type* = 0
389  )
390 {
391  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
392  closure.execute();
393 }
394 #endif
395 // ------------------------------------------------------------------ //
396 
397 // ------------------------------------------------------------------ //
398 //md_parallel_reduce
399 // ------------------------------------------------------------------ //
400 template <typename MDRange, typename Functor, typename ValueType>
401 void md_parallel_reduce( MDRange const& range
402  , Functor const& f
403  , ValueType & v
404  , const std::string& str = ""
405  , typename std::enable_if<( true
406  #if defined( KOKKOS_ENABLE_CUDA)
407  && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
408  #endif
409  ) >::type* = 0
410  )
411 {
412  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
413 
414  //using range_policy = typename MDRange::range_policy;
415  using range_policy = typename MDRange::impl_range_policy;
416  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
417 }
418 
419 template <typename MDRange, typename Functor, typename ValueType>
420 void md_parallel_reduce( const std::string& str
421  , MDRange const& range
422  , Functor const& f
423  , ValueType & v
424  , typename std::enable_if<( true
425  #if defined( KOKKOS_ENABLE_CUDA)
426  && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
427  #endif
428  ) >::type* = 0
429  )
430 {
431  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
432 
433  //using range_policy = typename MDRange::range_policy;
434  using range_policy = typename MDRange::impl_range_policy;
435 
436  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
437 }
438 
439 // Cuda - parallel_reduce not implemented yet
440 /*
441 template <typename MDRange, typename Functor, typename ValueType>
442 void md_parallel_reduce( MDRange const& range
443  , Functor const& f
444  , ValueType & v
445  , const std::string& str = ""
446  , typename std::enable_if<( true
447  #if defined( KOKKOS_ENABLE_CUDA)
448  && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
449  #endif
450  ) >::type* = 0
451  )
452 {
453  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
454  closure.execute();
455 }
456 
457 template <typename MDRange, typename Functor, typename ValueType>
458 void md_parallel_reduce( const std::string& str
459  , MDRange const& range
460  , Functor const& f
461  , ValueType & v
462  , typename std::enable_if<( true
463  #if defined( KOKKOS_ENABLE_CUDA)
464  && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
465  #endif
466  ) >::type* = 0
467  )
468 {
469  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
470  closure.execute();
471 }
472 */
473 
474 }} // namespace Kokkos::Experimental
475 
476 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
477 
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Declaration of parallel operators.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View< D, P... > &V)
Temporary free function rank() until rank() is implemented in the View.