Zoltan2
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
51 
54 #include <Zoltan2_Parameters.hpp>
55 #include <Zoltan2_Algorithm.hpp>
58 #include <Zoltan2_Util.hpp>
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_StandardParameterEntryValidators.hpp>
61 #include <Teuchos_ParameterList.hpp>
62 #include <Kokkos_Sort.hpp>
63 
64 #include <algorithm> // std::sort
65 #include <vector>
66 #include <unordered_map>
67 
68 #ifdef ZOLTAN2_USEZOLTANCOMM
69 #ifdef HAVE_ZOLTAN2_MPI
70 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71 #include "zoltan_comm_cpp.h"
72 #include "zoltan_types.h" // for error codes
73 #endif
74 #endif
75 
76 namespace Teuchos{
77 
81 template <typename Ordinal, typename T>
82 class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
83 {
84 private:
85  Ordinal size;
86  T epsilon;
87 
88 public:
91  Zoltan2_BoxBoundaries() : size(0),
92  epsilon(std::numeric_limits<T>::epsilon()) {}
93 
97  Zoltan2_BoxBoundaries(Ordinal s_):
98  size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
99 
105  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
106  for(Ordinal i = 0; i < count; i++) {
107  if(Z2_ABS(inBuffer[i]) > epsilon) {
108  inoutBuffer[i] = inBuffer[i];
109  }
110  }
111  }
112 };
113 
114 } // namespace Teuchos
115 
116 namespace Zoltan2{
117 
/*! \brief Sort item carrying multiple comparison values.
 *
 * Items are ordered lexicographically over the first `count` entries of
 * `val`; entries closer than `epsilon` are considered equal and the next
 * entry decides. Fully equal items fall back to comparing `index`.
 *
 * NOTE(review): the class declaration line, default-constructor signature
 * and destructor signature were lost in extraction of this header; they are
 * reconstructed here with the interface the surrounding code uses.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
  // TODO: Why volatile?
  // no idea, another intel compiler failure.
  volatile IT index;    // original position; used as the final tie-breaker
  volatile CT count;    // number of comparison values in val
  volatile WT *val;     // values compared lexicographically (not owned)
  volatile WT epsilon;  // tolerance under which two values compare equal

  /*! \brief Default constructor: empty item with no values. */
  uMultiSortItem() {
    this->index = 0;
    this->count = 0;
    this->val = NULL;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  /*! \brief Constructor.
   *  \param index_ identifier / tie-breaker for this item.
   *  \param count_ number of entries in vals_.
   *  \param vals_  pointer to the comparison values (not copied, not owned).
   */
  uMultiSortItem(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
    this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
  }

  /*! \brief Destructor: nothing to release, val is not owned. */
  ~uMultiSortItem() {
  }

  /*! \brief Reset the item's identifier and value pointer. */
  void set(IT index_, CT count_, WT *vals_) {
    this->index = index_;
    this->count = count_;
    this->val = vals_;
  }

  /*! \brief Lexicographic less-than over the value array, ties broken by
   *  index. Both items must have the same count.
   */
  bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
    assert(this->count == other.count);
    for(CT i = 0; i < this->count; ++i) {
      // if the values are equal go to next one.
      if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
        continue;
      }
      // if next value is smaller return true;
      if(this->val[i] < other.val[i]) {
        return true;
      }
      // if next value is bigger return false;
      else {
        return false;
      }
    }
    // if they are totally equal.
    return this->index < other.index;
  }
};
179 
/*! \brief Sort item carrying an id along with the value to sort by. */
template <class IT, class WT>
struct uSortItem
{
  IT id;   // identifier carried along with the sort key
  WT val;  // the sort key
};

/*! \brief Quick sort, sorting arr in place by increasing val.
 *
 * Iterative quicksort (Numerical-Recipes style) with median-of-three pivot
 * selection, an explicit partition stack, and an insertion-sort cutoff for
 * small partitions.
 *
 * Fixes relative to the extracted text: the local pivot variable `a` was
 * missing its declaration (dropped line), and `istack` is sized NSTACK+1
 * because jstack may legally reach NSTACK before the overflow check fires,
 * so istack[NSTACK] is a valid write target.
 *
 * \param n   number of elements in arr.
 * \param arr array of n items, sorted in ascending val order on return.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50;  // max depth of the explicit partition stack
  const int M = 7;        // partitions smaller than M use insertion sort
  IT i, ir=n, j, k, l=1;
  IT jstack=0;
  IT istack[NSTACK + 1];  // +1: istack[jstack] is written with jstack <= NSTACK
  WT aval;
  uSortItem<IT, WT> a;    // current pivot / element being inserted

  // Shift to 1-based indexing, as in the original Numerical Recipes code.
  --arr;
  for(;;) {
    if(ir-l < M) {
      // Small partition: plain insertion sort.
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        aval = a.val;
        for(i=j-1;i>=1;i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0)
        break;
      // Pop the next pending partition off the stack.
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // Median-of-three pivot selection among arr[l], arr[middle], arr[ir].
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      // Partition the range around aval.
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // Push the larger sub-partition; keep iterating on the smaller one.
      if(ir-i+1 >= j-l) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
262 
/*! \brief Sort item whose sign is kept separately from its value.
 *
 * signbit == 1 means positive, signbit == 0 means negative. The comparator
 * orders all negative items before all positive ones; among positives a
 * smaller val orders first, among negatives a larger val orders first
 * (i.e. the ordering of the signed quantities the items represent).
 *
 * NOTE(review): the struct declaration line, the operator<= signature and
 * the uqSignsort function signature were lost in extraction of this header;
 * they are reconstructed here with the interface the surrounding code uses.
 */
template <class IT, class WT, class SIGN>
struct uSignedSortItem
{
  IT id;
  WT val;
  SIGN signbit; // 1 means positive, 0 means negative.

  bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    /*if I am negative, the other is positive*/
    if(this->signbit < rhs.signbit) {
      return true;
    }
    /*if both has the same sign*/
    else if(this->signbit == rhs.signbit) {
      if(this->val < rhs.val) {//if my value is smaller,
        return this->signbit;//then if we both are positive return true.
                             //if we both are negative, return false.
      }
      else if(this->val > rhs.val) {//if my value is larger,
        return !this->signbit; //then if we both are positive return false.
                               //if we both are negative, return true.
      }
      else { //if both are equal.
        return false;
      }
    }
    else {
      /*if I am positive, the other is negative*/
      return false;
    }
  }

  /*! \brief Less-than-or-equal: equal means identical val and signbit. */
  bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs) const {
    return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
  }
};

/*! \brief Quick sort, sorting arr in place using uSignedSortItem's
 * comparison operators (negatives before positives, see operator<).
 *
 * Same iterative Numerical-Recipes quicksort structure as uqsort above.
 * Fixes relative to the extracted text: the local pivot variable `a` was
 * missing its declaration (dropped line), and `istack` is sized NSTACK+1
 * because jstack may legally reach NSTACK before the overflow check fires.
 *
 * \param n   number of elements in arr.
 * \param arr array of n items, sorted ascending per operator< on return.
 */
template <class IT, class WT, class SIGN>
void uqSignsort(IT n, uSignedSortItem<IT, WT, SIGN> * arr) {
  const int NSTACK = 50;  // max depth of the explicit partition stack
  const int M = 7;        // partitions smaller than M use insertion sort
  IT i, ir=n, j, k, l=1;
  IT jstack=0;
  IT istack[NSTACK + 1];  // +1: istack[jstack] is written with jstack <= NSTACK
  uSignedSortItem<IT, WT, SIGN> a;  // current pivot / element being inserted

  // Shift to 1-based indexing, as in the original Numerical Recipes code.
  --arr;
  for(;;) {
    if(ir < M + l) {
      // Small partition: plain insertion sort.
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        for(i=j-1;i>=1;i--) {
          if(arr[i] <= a) {
            break;
          }
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0) {
        break;
      }
      // Pop the next pending partition off the stack.
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // Median-of-three pivot selection among arr[l], arr[middle], arr[ir].
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[ir] < arr[l+1]) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[ir] < arr[l] ) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l] < arr[l+1]) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      // Partition the range around the pivot a.
      for(;;) {
        do i++; while (arr[i] < a);
        do j--; while (a < arr[j]);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // Push the larger sub-partition; keep iterating on the smaller one.
      if(ir+l+1 >= j+i) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
370 
371 // This exists only so we can track how many times the MJ algorithm is
372 // called and put each of those into different timer names.
373 // Currently the MultiJaggedTest.cpp will actually call it twice.
374 // First time with data from a Tpetra MultiVector and then a second time using
375 // a BasicVectorAdapter which allows us to turn UVM off for some tests. The
376 // results of the two runs are compared which helps to catch a lot of bugs. For
377 // profiling I'm mostly just interested in the UVM off case and need it to be
378 // in separate timers. Passing a value through would mess up the API. Possibly
379 // we could check the Adapter and use that. The statics have to be outside the
380 // templated class as the two called instances will be different template
381 // parameters. Another complication is that MultiJagged.cpp will call through
382 // the Zoltan2_AlgMJ class and we want to time things in both classes. However
383 // TaskMapper will directly call AlgMJ so I made two counters for the two
384 // classes to make sure it was always correct. This does not impact any
385 // behavior and has the sole purpose of generating unique timer names. If you
386 // run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
387 // 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
/*! \brief Per-class call counters used solely to build unique timer names.
 *
 * NOTE(review): the struct declaration line and the signature of the second
 * counter function were lost in extraction of this header; reconstructed
 * here following the comment above (one counter for AlgMJ, one for
 * Zoltan2_AlgMJ).
 */
struct Zoltan2_AlgMJ_TrackCallsCounter {
  /*! \brief Returns 0, 1, 2, ... on successive calls; one id per
   *  invocation made directly through AlgMJ (e.g. by TaskMapper). */
  static int get_counter_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
  /*! \brief Returns 0, 1, 2, ... on successive calls; one id per
   *  invocation made through the Zoltan2_AlgMJ wrapper class. */
  static int get_counter_Zoltan2_AlgMJ() {
    static int counter = 0;
    return counter++;
  }
};
398 
401 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
402  typename mj_part_t, typename mj_node_t>
403 class AlgMJ
404 {
405 private:
406  typedef typename mj_node_t::device_type device_t; // for views
408  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
409 
410  //if the (last dimension reduce all count) x the mpi world size
411  //estimated to be bigger than this number then migration will be forced
412  //in earlier iterations.
413  static constexpr size_t future_reduceall_cutoff = 1500000;
414 
415  //if parts right before last dimension are estimated to have less than
416  //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
417  static constexpr mj_lno_t min_work_last_dim = 1000;
418 
419  static constexpr mj_scalar_t least_signifiance = 0.0001;
420  static constexpr int significance_mul = 1000;
421 
422  std::string mj_timer_base_string; // for convenience making timer names
423 
424  RCP<const Environment> mj_env; // the environment object
425  RCP<const Comm<int> > mj_problemComm; // initial comm object
426  RCP<Comm<int> > comm; // comm object than can be altered during execution
427  double imbalance_tolerance; // input imbalance tolerance.
428  int recursion_depth; // number of steps that partitioning will be solved in.
429  int coord_dim; // coordinate dim
430  int num_weights_per_coord; // # of weights per coord
431  size_t initial_num_loc_coords; // initial num local coords.
432  global_size_t initial_num_glob_coords; // initial num global coords.
433  mj_lno_t num_local_coords; // number of local coords.
434  mj_gno_t num_global_coords; // number of global coords.
435  mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
436 
437  // can distribute points on same coordinant to different parts.
438  bool distribute_points_on_cut_lines;
439 
440  // how many parts we can calculate concurrently.
441  mj_part_t max_concurrent_part_calculation;
442 
443  bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
444  int mj_user_recursion_depth; // the recursion depth value provided by user.
445  bool mj_keep_part_boxes; // if the boxes need to be kept.
446 
447  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
448  int check_migrate_avoid_migration_option;
449 
450  // when doing the migration, 0 will aim for perfect load-imbalance, 1 - will
451  // aim for minimized number of messages with possibly bad load-imbalance
452  int migration_type;
453 
454  // when MJ decides whether to migrate, the minimum imbalance for migration.
455  double minimum_migration_imbalance;
456 
457  // Nonuniform first level partitioning
458  // (Currently available only for sequential_task_partitioning):
459  // Used for Dragonfly task mapping by partitioning Dragonfly RCA
460  // machine coordinates and application coordinates.
461  // An optimization that completely partitions the most important machine dimension
462  // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
463  // MJ alg follows after the nonuniform first level partitioning.
464  //
465  // Ex. (first level partitioning): If we have 120 elements,
466  // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
467  // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
468  // continues for all subsequent levels.
469 
470  // If used, number of parts requested for a nonuniform
471  // first level partitioning
472  mj_part_t num_first_level_parts;
473 
474  // If used, the requested distribution of parts for the
475  // nonuniform first level partitioning
476  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
477 
478  mj_part_t total_num_cut ; // how many cuts will be totally
479  mj_part_t total_num_part; // how many parts will be totally
480 
481  mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
482  mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
483 
484  // maximum part+cut count along a dimension.
485  size_t max_num_total_part_along_dim;
486 
487  mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
488 
489  // max no of parts that might occur during the partition before the last
490  // partitioning dimension.
491  mj_part_t last_dim_num_part;
492 
493  // input part array specifying num part to divide along each dim.
494  Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
495 
496  // two dimension coordinate array
497  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
498  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
499  mj_coordinates;
500 
501  // two dimension weight array
502  Kokkos::View<mj_scalar_t **, device_t> mj_weights;
503 
504  // if the target parts are uniform
505  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
506 
507  // if the coordinates have uniform weights
508  Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
509 
510  int mj_num_teams; // the number of teams
511 
512  size_t num_global_parts; // the targeted number of parts
513 
514  // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
515  RCP<mj_partBoxVector_t> kept_boxes;
516 
517  RCP<mj_partBox_t> global_box;
518 
519  int myRank; // processor rank
520  int myActualRank; // initial rank
521 
522  bool divide_to_prime_first;
523 
524  // initial global ids of the coordinates.
525  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
526 
527  // current global ids of the coordinates, might change during migration.
528  Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
529 
530  // the actual processor owner of the coordinate, to track after migrations.
531  Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
532 
533  // permutation of coordinates, for partitioning.
534  Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
535 
536  // permutation work array.
537  Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
538 
539  // the part ids assigned to coordinates.
540  Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
541 
542  // beginning and end of each part.
543  Kokkos::View<mj_lno_t *, device_t> part_xadj;
544 
545  // work array for beginning and end of each part.
546  Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
547 
548  Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
549 
550  // how much weight should a MPI put left side of the each cutline
551  Kokkos::View<mj_scalar_t *, device_t>
552  process_cut_line_weight_to_put_left;
553 
554  // weight percentage each thread in MPI puts left side of the each outline
555  Kokkos::View<mj_scalar_t *, device_t>
556  thread_cut_line_weight_to_put_left;
557 
558  // work array to manipulate coordinate of cutlines in different iterations.
559  // necessary because previous cut line information is used for determining
560  // the next cutline information. therefore, cannot update the cut work array
561  // until all cutlines are determined.
562  Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
563 
564  // Used for swapping above cut_coordinates_work_array
565  Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
566 
567  // cumulative part weight array.
568  Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
569 
570  // upper bound coordinate of a cut line
571  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
572 
573  // lower bound coordinate of a cut line
574  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
575 
576  // lower bound weight of a cut line
577  Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
578 
579  // upper bound weight of a cut line
580  Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
581 
582  // combined array to exchange the min and max coordinate, and total
583  // weight of part.
584  Kokkos::View<mj_scalar_t *, device_t>
585  process_local_min_max_coord_total_weight;
586 
587  // global combined array with the results for min, max and total weight.
588  Kokkos::View<mj_scalar_t *, device_t>
589  global_min_max_coord_total_weight;
590 
591  // isDone is used to determine if a cutline is determined already. If a cut
592  // line is already determined, the next iterations will skip this cut line.
593  Kokkos::View<bool *, device_t> is_cut_line_determined;
594 
595  // incomplete_cut_count count holds the number of cutlines that have not
596  // been finalized for each part when concurrentPartCount>1, using this
597  // information, if incomplete_cut_count[x]==0, then no work is done
598  // for this part.
599  Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600  typename decltype(device_incomplete_cut_count)::HostMirror
601  incomplete_cut_count;
602 
603  // Need a quick accessor for this on host
604  typename decltype (part_xadj)::HostMirror host_part_xadj;
605 
606  // local part weights of each thread.
607  Kokkos::View<double *, device_t>
608  thread_part_weights;
609 
610  // the work manupulation array for partweights.
611  Kokkos::View<double *, device_t>
612  thread_part_weight_work;
613 
614  // thread_cut_left_closest_point to hold the closest coordinate
615  // to a cutline from left (for each thread).
616  Kokkos::View<mj_scalar_t *, device_t>
617  thread_cut_left_closest_point;
618 
619  // thread_cut_right_closest_point to hold the closest coordinate
620  // to a cutline from right (for each thread)
621  Kokkos::View<mj_scalar_t *, device_t>
622  thread_cut_right_closest_point;
623 
624  // to store how many points in each part a thread has.
625  Kokkos::View<mj_lno_t *, device_t>
626  thread_point_counts;
627 
628  Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629  Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
630 
631  // for faster communication, concatanation of
632  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
633  // leftClosest distances sized P-1, since P-1 cut lines
634  // rightClosest distances size P-1, since P-1 cut lines.
635  Kokkos::View<mj_scalar_t *, device_t>
636  total_part_weight_left_right_closests;
637  Kokkos::View<mj_scalar_t *, device_t>
638  global_total_part_weight_left_right_closests;
639 
640  Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641  typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642  host_num_partitioning_in_current_dim; // for quick access on host
643 
644  /* \brief helper functio to calculate imbalance.
645  * \param achieved balance we achieved.
646  * \param expected balance expected.
647  */
648  static
649  KOKKOS_INLINE_FUNCTION
650  double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651  return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
652  }
653 
654  /* \brief Either the mj array (part_no_array) or num_global_parts should be
655  * provided in the input. part_no_array takes precedence if both are
656  * provided. Depending on these parameters, total cut/part number, maximum
657  * part/cut number along a dimension, estimated number of reduceAlls,
658  * and the number of parts before the last dimension is calculated.
659  * */
660  void set_part_specifications();
661 
662  /* \brief Tries to determine the part number for current dimension,
663  * by trying to make the partitioning as square as possible.
664  * \param num_total_future how many more partitionings are required.
665  * \param root how many more recursion depth is left.
666  */
667  inline mj_part_t get_part_count(
668  mj_part_t num_total_future,
669  double root);
670 
671  /* \brief for part communication we keep track of the box boundaries.
672  * This is performed when either asked specifically, or when geometric
673  * mapping is performed afterwards. This function initializes a single box
674  * with all global min and max coordinates.
675  * \param initial_partitioning_boxes the input and output vector for boxes.
676  */
677  void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
678 
679  /* \brief Function returns how many parts that will be obtained after this
680  * dimension partitioning. It sets how many parts each current part will be
681  * partitioned into in this dimension to device_num_partitioning_in_current_dim
682  * vector, sets how many total future parts each obtained part will be
683  * partitioned into in next_future_num_parts_in_parts vector, If part boxes
684  * are kept, then sets initializes the output_part_boxes as its ancestor.
685  * \param future_num_part_in_parts: input, how many future parts each
686  * current part will be partitioned into.
687  * \param next_future_num_parts_in_parts: output, how many future parts
688  * each obtained part will be partitioned into.
689  * \param future_num_parts: output, max number of future parts that will be
690  * obtained from a single
691  * \param current_num_parts: input, how many parts are there currently.
692  * \param current_iteration: input, current dimension iteration number.
693  * \param input_part_boxes: input, if boxes are kept, current boxes.
694  * \param output_part_boxes: output, if boxes are kept, the initial box
695  * boundaries for obtained parts.
696  * \param atomic_part_count // DOCWORK: Documentation
697  */
698  mj_part_t update_part_num_arrays(
699  std::vector<mj_part_t> *future_num_part_in_parts,
700  std::vector<mj_part_t> *next_future_num_parts_in_parts,
701  mj_part_t &future_num_parts,
702  mj_part_t current_num_parts,
703  int current_iteration,
704  RCP<mj_partBoxVector_t> input_part_boxes,
705  RCP<mj_partBoxVector_t> output_part_boxes,
706  mj_part_t atomic_part_count);
707 
719  static
720  KOKKOS_INLINE_FUNCTION
721  void mj_calculate_new_cut_position (
722  mj_scalar_t cut_upper_bound,
723  mj_scalar_t cut_lower_bound,
724  mj_scalar_t cut_upper_weight,
725  mj_scalar_t cut_lower_weight,
726  mj_scalar_t expected_weight,
727  mj_scalar_t &new_cut_position,
728  mj_scalar_t sEpsilon);
729 
754  bool mj_perform_migration(
755  mj_part_t in_num_parts, //current number of parts
756  mj_part_t &out_num_parts, //output number of parts.
757  std::vector<mj_part_t> *next_future_num_parts_in_parts,
758  mj_part_t &output_part_begin_index,
759  size_t migration_reduce_all_population,
760  mj_lno_t num_coords_for_last_dim_part,
761  std::string iteration,
762  RCP<mj_partBoxVector_t> &input_part_boxes,
763  RCP<mj_partBoxVector_t> &output_part_boxes);
764 
782  bool mj_check_to_migrate(
783  size_t migration_reduce_all_population,
784  mj_lno_t num_coords_for_last_dim_part,
785  mj_part_t num_procs,
786  mj_part_t num_parts,
787  mj_gno_t *num_points_in_all_processor_parts);
788 
813  void mj_migration_part_proc_assignment(
814  mj_gno_t * num_points_in_all_processor_parts,
815  mj_part_t num_parts,
816  mj_part_t num_procs,
817  mj_lno_t *send_count_to_each_proc,
818  std::vector<mj_part_t> &processor_ranks_for_subcomm,
819  std::vector<mj_part_t> *next_future_num_parts_in_parts,
820  mj_part_t &out_num_part,
821  std::vector<mj_part_t> &out_part_indices,
822  mj_part_t &output_part_numbering_begin_index,
823  int *coordinate_destinations);
824 
850  void mj_assign_proc_to_parts(
851  mj_gno_t * num_points_in_all_processor_parts,
852  mj_part_t num_parts,
853  mj_part_t num_procs,
854  mj_lno_t *send_count_to_each_proc,
855  std::vector<mj_part_t> &processor_ranks_for_subcomm,
856  std::vector<mj_part_t> *next_future_num_parts_in_parts,
857  mj_part_t &out_part_index,
858  mj_part_t &output_part_numbering_begin_index,
859  int *coordinate_destinations);
860 
876  void assign_send_destinations(
877  mj_part_t num_parts,
878  mj_part_t *part_assignment_proc_begin_indices,
879  mj_part_t *processor_chains_in_parts,
880  mj_lno_t *send_count_to_each_proc,
881  int *coordinate_destinations);
882 
897  void assign_send_destinations2(
898  mj_part_t num_parts,
899  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900  int *coordinate_destinations,
901  mj_part_t &output_part_numbering_begin_index,
902  std::vector<mj_part_t> *next_future_num_parts_in_parts);
903 
926  void mj_assign_parts_to_procs(
927  mj_gno_t * num_points_in_all_processor_parts,
928  mj_part_t num_parts,
929  mj_part_t num_procs,
930  mj_lno_t *send_count_to_each_proc,
931  std::vector<mj_part_t> *next_future_num_parts_in_parts,
932  mj_part_t &out_num_part,
933  std::vector<mj_part_t> &out_part_indices,
934  mj_part_t &output_part_numbering_begin_index,
935  int *coordinate_destinations);
936 
950  void mj_migrate_coords(
951  mj_part_t num_procs,
952  mj_lno_t &num_new_local_points,
953  std::string iteration,
954  int *coordinate_destinations,
955  mj_part_t num_parts);
956 
962  void create_sub_communicator(
963  std::vector<mj_part_t> &processor_ranks_for_subcomm);
964 
969  mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970  mj_part_t largest_factor = 1;
971  mj_part_t n = num_parts;
972  mj_part_t divisor = 2;
973  while (n > 1) {
974  while (n % divisor == 0) {
975  n = n / divisor;
976  largest_factor = divisor;
977  }
978  ++divisor;
979  if(divisor * divisor > n) {
980  if(n > 1) {
981  largest_factor = n;
982  }
983  break;
984  }
985  }
986  return largest_factor;
987  }
988 
989 public:
990  AlgMJ();
991 
992  // DOCWORK: Make param documentation use : consistently
1018  void multi_jagged_part(
1019  const RCP<const Environment> &env,
1020  RCP<const Comm<int> > &problemComm,
1021  double imbalance_tolerance,
1022  int num_teams,
1023  size_t num_global_parts,
1024  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025  int recursion_depth,
1026  int coord_dim,
1027  mj_lno_t num_local_coords,
1028  mj_gno_t num_global_coords,
1029  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1030  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1031  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032  int num_weights_per_coord,
1033  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034  Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036  Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1038 
1052  bool distribute_points_on_cut_lines_,
1053  int max_concurrent_part_calculation_,
1054  int check_migrate_avoid_migration_option_,
1055  double minimum_migration_imbalance_,
1056  int migration_type_ = 0);
1057 
1060  void set_to_keep_part_boxes();
1061 
1064  RCP<mj_partBox_t> get_global_box() const;
1065 
1068  RCP<mj_partBoxVector_t> get_kept_boxes() const;
1069 
1072  RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1073  RCP<mj_partBoxVector_t> &localPartBoxes) const;
1074 
1114  const RCP<const Environment> &env,
1115  mj_lno_t num_total_coords,
1116  mj_lno_t num_selected_coords,
1117  size_t num_target_part,
1118  int coord_dim,
1119  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1120  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121  Kokkos::View<mj_lno_t *, device_t> &
1122  initial_selected_coords_output_permutation,
1123  mj_lno_t *output_xadj,
1124  int recursion_depth_,
1125  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126  bool partition_along_longest_dim,
1127  int num_ranks_per_node,
1128  bool divide_to_prime_first_,
1129  mj_part_t num_first_level_parts_ = 1,
1130  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131  = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1132 
1133 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
 // NOTE(review): these helpers are public on CUDA/HIP builds only —
 // presumably because methods that launch KOKKOS_LAMBDA device code cannot
 // be private/protected there; confirm.
1134  public:
1135 #else
1136  private:
1137 #endif
1138 
1139  /* \brief Allocates all required memory for the mj partitioning algorithm.
1140  */
1141  void allocate_set_work_memory();
1142 
1143  /* \brief compute global bounding box: min/max coords of global domain */
1144  void compute_global_box();
1145 
1146  // DOCWORK: Inconsistent use of ! for descriptive/brief commenting - decide.
 /* \brief Computes the local min/max coordinate and total weight of each
  * part in the current concurrent batch along the active dimension.
  * \param current_work_part first part index of the concurrent batch.
  * \param current_concurrent_num_parts number of parts handled together.
  * \param mj_current_dim_coords coordinates along the active dimension.
  */
1153  void mj_get_local_min_max_coord_totW(
1154  mj_part_t current_work_part,
1155  mj_part_t current_concurrent_num_parts,
1156  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1157 
 /* \brief Reduces the per-rank min/max/total-weight array
  * (local_min_max_total) into its global counterpart
  * (global_min_max_total) for the concurrent parts.
  */
1170  void mj_get_global_min_max_coord_totW(
1171  mj_part_t current_concurrent_num_parts,
1172  Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173  Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1174 
 /* \brief Computes the initial cut coordinates in [min_coord, max_coord]
  * and the cumulative target weight of each prospective part, taking the
  * future part counts into account. For the nonuniform first level
  * (num_target_first_level_parts > 1) the targets follow
  * target_first_level_dist instead of being uniform.
  */
1205  void mj_get_initial_cut_coords_target_weights(
1206  mj_scalar_t min_coord,
1207  mj_scalar_t max_coord,
1208  mj_part_t num_cuts/*p-1*/ ,
1209  mj_scalar_t global_weight,
1210  Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211  Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212  std::vector <mj_part_t> *future_num_part_in_parts,
1213  std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214  mj_part_t concurrent_current_part,
1215  mj_part_t obtained_part_index,
1216  mj_part_t num_target_first_level_parts = 1,
1217  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218  Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1219 
 /* \brief Writes an initial estimated part id into mj_part_ids for every
  * coordinate in [coordinate_begin_index, coordinate_end_index).
  */
1236  void set_initial_coordinate_parts(
1237  mj_scalar_t &max_coordinate,
1238  mj_scalar_t &min_coordinate,
1239  mj_lno_t coordinate_begin_index,
1240  mj_lno_t coordinate_end_index,
1241  Kokkos::View<mj_lno_t *, device_t> &
1242  mj_current_coordinate_permutations,
1243  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244  Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245  mj_part_t &partition_count);
1246 
 /* \brief Iteratively determines the cut-line coordinates for the
  * one-dimensional partitioning of the concurrent parts, until all
  * total_incomplete_cut_count cuts satisfy imbalanceTolerance.
  */
1263  void mj_1D_part(
1264  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265  double imbalanceTolerance,
1266  mj_part_t current_work_part,
1267  mj_part_t current_concurrent_num_parts,
1268  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269  mj_part_t total_incomplete_cut_count,
1270  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271  Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1272 
 /* \brief Computes part/cut weights for the current cut positions; helper
  * of mj_1D_part.
  * \param loop_count iteration counter of the enclosing cut-refinement loop.
  */
1278  void mj_1D_part_get_part_weights(
1279  mj_part_t current_concurrent_num_parts,
1280  mj_part_t current_work_part,
1281  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1282  int loop_count);
1283 
 /* \brief Combines per-thread left/right closest-point and part-weight
  * work arrays for the concurrent parts; helper of mj_1D_part.
  */
1291  void mj_combine_rightleft_and_weights(
1292  mj_part_t current_work_part,
1293  mj_part_t current_concurrent_num_parts);
1294 
 /* \brief Distributes the coordinates of the given part into its num_parts
  * sub-parts according to the determined cut lines, writing the resulting
  * CSR offsets into out_part_xadj.
  */
1307  void mj_create_new_partitions(
1308  mj_part_t num_parts,
1309  mj_part_t current_concurrent_work_part,
1310  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1314 
 /* \brief One refinement step of the cut search: compares the current
  * global part weights against the targets, narrows each undetermined
  * cut's lower/upper bounds, and produces new_current_cut_coordinates for
  * the next iteration of mj_1D_part.
  */
1350  void mj_get_new_cut_coordinates(
1351  mj_part_t current_concurrent_num_parts,
1352  mj_part_t kk,
1353  const mj_part_t &num_cuts,
1354  const double &used_imbalance_tolerance,
1355  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358  Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367  Kokkos::View<mj_scalar_t *, device_t> &
1368  current_part_cut_line_weight_to_put_left,
1369  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1370 
 /* \brief Computes how many points each processor has in each part.
  * \param num_points_in_all_processor_parts output array — note the
  * pointer-reference parameter: storage is provided by the callee.
  */
1380  void get_processor_num_points_in_parts(
1381  mj_part_t num_procs,
1382  mj_part_t num_parts,
1383  mj_gno_t *&num_points_in_all_processor_parts);
1384 
 // NOTE(review): original doc comment lost in extraction — presumably
 // fills the coordinate permutation mapping between the num_parts input
 // parts and output_num_parts output parts; confirm with full source.
1389  void fill_permutation_array(
1390  mj_part_t output_num_parts,
1391  mj_part_t num_parts);
1392 
 /* \brief Variant of mj_create_new_partitions used by the sequential task
  * partitioner: splits coordinates in [coordinate_begin, coordinate_end)
  * into num_parts chunks along the cut lines while keeping the point order
  * consistent; supports longest-dimension partitioning via
  * p_coord_dimension_range_sorted.
  */
1414  void create_consistent_chunks(
1415  mj_part_t num_parts,
1416  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418  mj_lno_t coordinate_begin,
1419  mj_lno_t coordinate_end,
1420  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1422  int coordInd,
1423  bool longest_dim_part,
1424  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1425 
 /* \brief Writes the final part assignments, numbering output parts from
  * output_part_begin_index; fills output_part_boxes when part boxes are
  * kept, and handles the case where data was migrated.
  */
1434  void set_final_parts(
1435  mj_part_t current_num_parts,
1436  mj_part_t output_part_begin_index,
1437  RCP<mj_partBoxVector_t> &output_part_boxes,
1438  bool is_data_ever_migrated);
1439 };
1440 
1443 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1444  typename mj_part_t, typename mj_node_t>
 // Default constructor: zero/empty-initializes every member; meaningful
 // values are supplied by the partitioning entry points before use.
 // NOTE(review): listing line 1445 — presumably the "AlgMJ<...>::AlgMJ():"
 // signature line — is missing from this extract; what follows is its
 // member initializer list.
1446  mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447  recursion_depth(0), coord_dim(0),
1448  num_weights_per_coord(0), initial_num_loc_coords(0),
1449  initial_num_glob_coords(0),
1450  num_local_coords(0), num_global_coords(0),
 // Comparison slack: 100x machine epsilon of the scalar type.
1451  sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
1452  distribute_points_on_cut_lines(true),
1453  max_concurrent_part_calculation(1),
1454  mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455  mj_keep_part_boxes(false),
1456  check_migrate_avoid_migration_option(0), migration_type(0),
 // Default migration-imbalance threshold of 30%.
1457  minimum_migration_imbalance(0.30),
1458  num_first_level_parts(1),
1459  total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460  max_num_cut_along_dim(0),
1461  max_num_total_part_along_dim(0),
1462  total_dim_num_reduce_all(0),
1463  last_dim_num_part(0),
1464  mj_num_teams(0),
1465  num_global_parts(1),
1466  kept_boxes(), global_box(),
1467  myRank(0), myActualRank(0),
1468  divide_to_prime_first(false)
1469 {
1470 }
1471 
 /* \brief Serial (single-rank) multi-jagged partitioning entry used for
  * task mapping: partitions the given coordinates into num_target_part
  * parts and returns the result in CSR form
  * (output_xadj / initial_adjList_output_adjlist).
  * NOTE(review): listing lines 1517-1518 — presumably the
  * "AlgMJ<...>::sequential_task_partitioning(" signature — are missing
  * from this extract; the parameter list below belongs to that signature.
  */
1515 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1516  typename mj_part_t, typename mj_node_t>
1519  const RCP<const Environment> &env,
1520  mj_lno_t num_total_coords,
1521  mj_lno_t num_selected_coords,
1522  size_t num_target_part,
1523  int coord_dim_,
1524  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1525  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1526  mj_coordinates_,
1527  Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528  mj_lno_t *output_xadj,
1529  int recursion_depth_,
1530  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531  bool partition_along_longest_dim,
1532  int num_ranks_per_node,
1533  bool divide_to_prime_first_,
1534  mj_part_t num_first_level_parts_,
1535  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1536 {
 // This entry runs on a self (serial) communicator; no inter-rank
 // migration or reduction is involved.
1537  this->mj_env = env;
1538  const RCP<Comm<int> > commN;
1539  this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540  this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
 // NOTE(review): myRank is set to 1 (not 0) here — confirm this is
 // intentional for the serial task-mapping path.
1541  this->myActualRank = this->myRank = 1;
1542 
1543  this->divide_to_prime_first = divide_to_prime_first_;
1544  //weights are uniform for task mapping
1545 
1546  //parts are uniform for task mapping
1547  //as input indices.
1548  this->imbalance_tolerance = 0;
1549  this->num_global_parts = num_target_part;
1550  this->part_no_array = part_no_array_;
1551  this->recursion_depth = recursion_depth_;
1552 
1553  // If nonuniform first level partitioning, the requested num of parts and the
1554  // requested distribution of elements for each part
1555  this->num_first_level_parts = num_first_level_parts_;
1556 
1557  this->first_level_distribution = first_level_distribution_;
1558 
1559  this->coord_dim = coord_dim_;
1560  this->num_local_coords = num_total_coords;
1561 
1562  this->num_global_coords = num_total_coords;
1563  this->mj_coordinates = mj_coordinates_;
1564 
1565 
1566  this->initial_mj_gnos =
1567  Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);
1568 
1569  this->num_weights_per_coord = 0;
1570 
1571  this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572  "uniform weights", 1);
1573  this->mj_uniform_weights(0) = true;
1574 
1575  this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1576  ("weights", 1, 1);
1577 
1578  this->mj_uniform_parts =
1579  Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
1580  this->mj_uniform_parts(0) = true;
1581 
1582  this->set_part_specifications();
1583 
1584  this->allocate_set_work_memory();
1585 
1586  // Do single init
1587  auto local_part_xadj = this->part_xadj;
1588  Kokkos::parallel_for(
1589  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590  KOKKOS_LAMBDA (int dummy) {
1591  local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
1592  });
1593 
1594  Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1595 
1596  mj_part_t current_num_parts = 1;
1597 
1598  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599  this->all_cut_coordinates;
1600 
1601  mj_part_t future_num_parts = this->total_num_part;
1602 
 // These two vectors are swapped every recursion level; they track how
 // many final parts each current part must still be divided into.
1603  std::vector<mj_part_t> *future_num_part_in_parts =
1604  new std::vector<mj_part_t>();
1605  std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606  new std::vector<mj_part_t>();
1607  next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608  RCP<mj_partBoxVector_t> t1;
1609  RCP<mj_partBoxVector_t> t2;
1610 
1611  std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612  coord_dimension_range_sorted(this->coord_dim);
1613  uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
1614  &(coord_dimension_range_sorted[0]);
1615  std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616  std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1617 
1618  // Need a device counter - how best to allocate?
1619  // Putting this allocation in the loops is very costly so moved out here.
1620  Kokkos::View<mj_part_t*, device_t>
1621  view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
1622  Kokkos::View<size_t*, device_t>
1623  view_total_reduction_size("view_total_reduction_size", 1);
1624 
 // One recursion level per partitioning dimension.
1625  for(int rd = 0; rd < this->recursion_depth; ++rd) {
1626  // next_future_num_parts_in_parts will be as the size of outnumParts,
1627  // and this will hold how many more parts that each output part
1628  // should be divided. this array will also be used to determine the weight
1629  // ratios of the parts.
1630  // swap the arrays to use iteratively..
1631  std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632  future_num_part_in_parts = next_future_num_parts_in_parts;
1633  next_future_num_parts_in_parts = tmpPartVect;
1634 
1635  // clear next_future_num_parts_in_parts array as
1636  // getPartitionArrays expects it to be empty.
1637  next_future_num_parts_in_parts->clear();
1638 
1639  // returns the total number of output parts for this dimension partitioning.
1640  mj_part_t output_part_count_in_dimension =
1641  this->update_part_num_arrays(
1642  future_num_part_in_parts,
1643  next_future_num_parts_in_parts,
1644  future_num_parts,
1645  current_num_parts,
1646  rd,
1647  t1,
1648  t2, num_ranks_per_node);
1649 
1650  // if the number of obtained parts equal to current number of parts,
1651  // skip this dimension. For example, this happens when 1 is given in
1652  // the input part array is given. P=4,5,1,2
1653  if(output_part_count_in_dimension == current_num_parts) {
 // Swap the future-part arrays back so the skipped level's data is
 // preserved for the next iteration.
1654  tmpPartVect = future_num_part_in_parts;
1655  future_num_part_in_parts = next_future_num_parts_in_parts;
1656  next_future_num_parts_in_parts = tmpPartVect;
1657  continue;
1658  }
1659 
1660  //convert i to string to be used for debugging purposes.
1661  std::string istring = std::to_string(rd);
1662 
1663  // alloc Memory to point the indices
1664  // of the parts in the permutation array.
1665  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666  "new part xadj", output_part_count_in_dimension);
1667 
1668  // the index where in the outtotalCounts will be written.
1669 
1670  mj_part_t output_part_index = 0;
1671 
1672  // whatever is written to outTotalCounts will be added with previousEnd
1673  // so that the points will be shifted.
1674  mj_part_t output_coordinate_end_index = 0;
1675 
1676  mj_part_t current_work_part = 0;
1677  mj_part_t current_concurrent_num_parts = 1;
1678 
1679  mj_part_t obtained_part_index = 0;
1680 
1681  // get the coordinate axis along which the partitioning will be done.
1682  int coordInd = rd % this->coord_dim;
1683 
1684  Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1686 
1687  auto host_process_local_min_max_coord_total_weight =
1688  Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689  auto host_global_min_max_coord_total_weight =
1690  Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1691 
1692  // run for all available parts.
1693  for(; current_work_part < current_num_parts;
1694  current_work_part += current_concurrent_num_parts) {
1695 
1696  mj_part_t actual_work_part_count = 0;
1697 
1698  // initialization for 1D partitioning.
1699  // get the min and max coordinates of each part
1700  // together with the part weights of each part.
1701  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702  mj_part_t current_work_part_in_concurrent_parts =
1703  current_work_part + kk;
1704 
1705  // if this part wont be partitioned any further
1706  // dont do any work for this part.
1707  mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708  current_work_part_in_concurrent_parts);
1709  if(partition_count == 1) {
1710  continue;
1711  }
1712  ++actual_work_part_count;
 // When requested, pick the partition axis per part as the dimension
 // with the largest coordinate range: measure min/max along every
 // dimension, sort the ranges, and take the largest.
1713  if(partition_along_longest_dim) {
1714  auto local_process_local_min_max_coord_total_weight =
1715  this->process_local_min_max_coord_total_weight;
1716  for(int coord_traverse_ind = 0;
1717  coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1718 
1719  Kokkos::View<mj_scalar_t *, device_t> coords =
1720  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1721 
1722  this->mj_get_local_min_max_coord_totW(
1723  current_work_part,
1724  current_concurrent_num_parts,
1725  coords);
1726 
1727  coord_dimension_range_sorted[coord_traverse_ind].id =
1728  coord_traverse_ind;
1729  coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1730 
1731  Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732  process_local_min_max_coord_total_weight);
1733 
 // View layout (as indexed here): entries [0, concurrent) are
 // mins and [concurrent, 2*concurrent) are maxs.
1734  coord_dim_mins[coord_traverse_ind] =
1735  host_process_local_min_max_coord_total_weight(kk);
1736  coord_dim_maxs[coord_traverse_ind] =
1737  host_process_local_min_max_coord_total_weight(
1738  kk + current_concurrent_num_parts);
1739  coord_dimension_range_sorted[coord_traverse_ind].val =
1740  host_process_local_min_max_coord_total_weight(
1741  kk + current_concurrent_num_parts) -
1742  host_process_local_min_max_coord_total_weight(kk);
1743  }
1744 
1745  uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746  coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1747  auto set_min = coord_dim_mins[coordInd];
1748  auto set_max = coord_dim_maxs[coordInd];
 // Restore the chosen dimension's min/max on the device view so
 // the rest of the pipeline sees the values for coordInd.
1749  Kokkos::parallel_for(
1750  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751  (0, 1), KOKKOS_LAMBDA (int dummy) {
1752  local_process_local_min_max_coord_total_weight(kk) = set_min;
1753  local_process_local_min_max_coord_total_weight(
1754  kk + current_concurrent_num_parts) = set_max;
1755  });
1756 
1757  mj_current_dim_coords =
1758  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1759  }
1760  else {
1761  Kokkos::View<mj_scalar_t *, device_t> coords =
1762  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763  this->mj_get_local_min_max_coord_totW(
1764  current_work_part,
1765  current_concurrent_num_parts,
1766  coords);
1767  }
1768  }
1769 
1770  // 1D partitioning
1771  if(actual_work_part_count > 0) {
1772  // obtain global Min max of the part.
1773  this->mj_get_global_min_max_coord_totW(
1774  current_concurrent_num_parts,
1775  this->process_local_min_max_coord_total_weight,
1776  this->global_min_max_coord_total_weight);
1777 
1778  // update host copy
1779  Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780  global_min_max_coord_total_weight);
1781 
1782  // represents the total number of cutlines
1783  // whose coordinate should be determined.
1784  mj_part_t total_incomplete_cut_count = 0;
1785 
1786  //Compute weight ratios for parts & cuts:
1787  //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
1788  // part0 cut0 part1 cut1 part2 cut2 part3
1789  mj_part_t concurrent_part_cut_shift = 0;
1790  mj_part_t concurrent_part_part_shift = 0;
1791  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792  mj_scalar_t min_coordinate =
1793  host_global_min_max_coord_total_weight(kk);
1794  mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795  kk + current_concurrent_num_parts);
1796  mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797  kk + 2*current_concurrent_num_parts);
1798 
1799  mj_part_t concurrent_current_part_index = current_work_part + kk;
1800 
1801  mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802  concurrent_current_part_index);
1803 
1804  Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805  Kokkos::subview(current_cut_coordinates,
1806  std::pair<mj_lno_t, mj_lno_t>(
1807  concurrent_part_cut_shift,
1808  current_cut_coordinates.size()));
1809  Kokkos::View<mj_scalar_t *, device_t>
1810  current_target_part_weights =
1811  Kokkos::subview(target_part_weights,
1812  std::pair<mj_lno_t, mj_lno_t>(
1813  concurrent_part_part_shift,
1814  target_part_weights.size()));
1815 
1816  // shift the usedCutCoordinate array as noCuts.
1817  concurrent_part_cut_shift += partition_count - 1;
1818  // shift the partRatio array as noParts.
1819  concurrent_part_part_shift += partition_count;
1820  // calculate only if part is not empty,
1821  // and part will be further partitioend.
1822  if(partition_count > 1 && min_coordinate <= max_coordinate) {
1823  // increase allDone by the number of cuts of the current
1824  // part's cut line number.
1825  total_incomplete_cut_count += partition_count - 1;
1826 
1827  this->incomplete_cut_count(kk) = partition_count - 1;
1828 
1829  // When num_first_level_parts != 1 we have
1830  // nonuniform partitioning on the first level, providing
1831  // requested number of parts (num_first_level_parts) and
1832  // requested distribution in parts (first_level_distribution)
1833 
1834  // Get the target part weights given a desired distribution
1835  this->mj_get_initial_cut_coords_target_weights(
1836  min_coordinate,
1837  max_coordinate,
1838  partition_count - 1,
1839  global_total_weight,
1840  usedCutCoordinate,
1841  current_target_part_weights,
1842  future_num_part_in_parts,
1843  next_future_num_parts_in_parts,
1844  concurrent_current_part_index,
1845  obtained_part_index,
1846  rd == 0 ? this->num_first_level_parts : 1,
1847  this->first_level_distribution);
1848 
1849  mj_lno_t coordinate_end_index =
1850  host_part_xadj(concurrent_current_part_index);
1851  mj_lno_t coordinate_begin_index =
1852  (concurrent_current_part_index==0) ? 0 :
1853  host_part_xadj[concurrent_current_part_index - 1];
1854 
1855  // get the initial estimated part assignments of the coordinates.
1856  this->set_initial_coordinate_parts(
1857  max_coordinate,
1858  min_coordinate,
1859  coordinate_begin_index, coordinate_end_index,
1860  this->coordinate_permutations,
1861  mj_current_dim_coords,
1862  this->assigned_part_ids,
1863  partition_count);
1864  }
1865  else {
1866  // e.g., if have fewer coordinates than parts, don't need to do
1867  // next dim.
1868  this->incomplete_cut_count(kk) = 0;
1869  }
1870  obtained_part_index += partition_count;
1871  }
1872 
1873  // used imbalance, it is always 0, as it is difficult
1874  // to estimate a range.
1875  double used_imbalance = 0;
1876 
1877  // Determine cut lines for k parts here.
1878  this->mj_env->timerStart(MACRO_TIMERS,
1879  mj_timer_base_string + "mj_1D_part()");
1880 
1881  this->mj_1D_part(
1882  mj_current_dim_coords,
1883  used_imbalance,
1884  current_work_part,
1885  current_concurrent_num_parts,
1886  current_cut_coordinates,
1887  total_incomplete_cut_count,
1888  view_rectilinear_cut_count,
1889  view_total_reduction_size);
1890 
1891  this->mj_env->timerStop(MACRO_TIMERS,
1892  mj_timer_base_string + "mj_1D_part()");
1893  }
1894  else {
1895  obtained_part_index += current_concurrent_num_parts;
1896  }
1897  // create part chunks
1898  {
1899  mj_part_t output_array_shift = 0;
1900  mj_part_t cut_shift = 0;
1901  size_t tlr_shift = 0;
1902  size_t partweight_array_shift = 0;
1903 
1904  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905  mj_part_t current_concurrent_work_part = current_work_part + kk;
1906 
1907  mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908  current_concurrent_work_part);
1909 
1910  // if the part is empty, skip the part.
1911  int coordinateA_bigger_than_coordinateB =
1912  host_global_min_max_coord_total_weight(kk) >
1913  host_global_min_max_coord_total_weight(
1914  kk + current_concurrent_num_parts);
1915 
1916  if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1917  // we still need to write the begin and end point of the empty part.
1918  // simply set it zero, the array indices will be shifted later
1919  auto local_new_part_xadj = this->new_part_xadj;
1920  Kokkos::parallel_for(
1921  Kokkos::RangePolicy<typename mj_node_t::execution_space,
1922  mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923  local_new_part_xadj(
1924  output_part_index + output_array_shift + jj) = 0;
1925  });
1926 
1927  cut_shift += num_parts - 1;
1928  tlr_shift += (4 *(num_parts - 1) + 1);
1929  output_array_shift += num_parts;
1930  partweight_array_shift += (2 * (num_parts - 1) + 1);
1931  continue;
1932  }
1933  mj_lno_t coordinate_end =
1934  host_part_xadj(current_concurrent_work_part);
1935  mj_lno_t coordinate_begin =
1936  current_concurrent_work_part==0 ? 0 :
1937  host_part_xadj(current_concurrent_work_part-1);
1938 
1939  Kokkos::View<mj_scalar_t *, device_t>
1940  current_concurrent_cut_coordinate =
1941  Kokkos::subview(current_cut_coordinates,
1942  std::pair<mj_lno_t, mj_lno_t>(
1943  cut_shift,
1944  current_cut_coordinates.size()));
1945  Kokkos::View<mj_scalar_t *, device_t>
1946  used_local_cut_line_weight_to_left =
1947  Kokkos::subview(process_cut_line_weight_to_put_left,
1948  std::pair<mj_lno_t, mj_lno_t>(
1949  cut_shift,
1950  process_cut_line_weight_to_put_left.size()));
1951 
1952  this->thread_part_weight_work =
1953  Kokkos::subview(
1954  this->thread_part_weights,
1955  std::pair<mj_lno_t, mj_lno_t>(
1956  partweight_array_shift,
1957  this->thread_part_weights.size()));
1958 
1959  if(num_parts > 1) {
1960  // Rewrite the indices based on the computed cuts.
1961  Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962  Kokkos::subview(this->new_part_xadj,
1963  std::pair<mj_lno_t, mj_lno_t>(
1964  output_part_index + output_array_shift,
1965  this->new_part_xadj.size()));
1966 
1967  this->create_consistent_chunks(
1968  num_parts,
1969  mj_current_dim_coords,
1970  current_concurrent_cut_coordinate,
1971  coordinate_begin,
1972  coordinate_end,
1973  used_local_cut_line_weight_to_left,
1974  subview_new_part_xadj,
1975  coordInd,
1976  partition_along_longest_dim,
1977  p_coord_dimension_range_sorted);
1978  }
1979  else {
1980  // if this part is partitioned into 1 then just copy
1981  // the old values.
1982  mj_lno_t part_size = coordinate_end - coordinate_begin;
1983 
1984  auto local_new_part_xadj = this->new_part_xadj;
1985  Kokkos::parallel_for(
1986  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987  (0, 1), KOKKOS_LAMBDA (int dummy) {
1988  local_new_part_xadj(output_part_index + output_array_shift)
1989  = part_size;
1990  });
1991 
1992  auto subview_new_coordinate_permutations =
1993  Kokkos::subview(this->new_coordinate_permutations,
1994  std::pair<mj_lno_t, mj_lno_t>(
1995  coordinate_begin,
1996  coordinate_begin + part_size));
1997  auto subview_coordinate_permutations =
1998  Kokkos::subview(this->coordinate_permutations,
1999  std::pair<mj_lno_t, mj_lno_t>(
2000  coordinate_begin,
2001  coordinate_begin + part_size));
2002  Kokkos::deep_copy(subview_new_coordinate_permutations,
2003  subview_coordinate_permutations);
2004  }
2005 
2006  cut_shift += num_parts - 1;
2007  tlr_shift += (4 *(num_parts - 1) + 1);
2008  output_array_shift += num_parts;
2009  partweight_array_shift += (2 * (num_parts - 1) + 1);
2010  }
2011 
2012  // shift cut coordinates so that all cut coordinates are stored.
2013  // current_cut_coordinates += cutShift;
2014 
2015  // getChunks from coordinates partitioned the parts and
2016  // wrote the indices as if there were a single part.
2017  // now we need to shift the beginning indices.
2018  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019  mj_part_t num_parts =
2020  host_num_partitioning_in_current_dim(current_work_part + kk);
2021  auto local_new_part_xadj = this->new_part_xadj;
2022  auto local_mj_current_dim_coords = mj_current_dim_coords;
2023  auto local_new_coordinate_permutations =
2024  new_coordinate_permutations;
2025  Kokkos::parallel_for(
2026  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027  0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2028  //shift it by previousCount
2029  local_new_part_xadj(output_part_index+ii) +=
2030  output_coordinate_end_index;
2031 
 // NOTE(review): coordinates of every odd-indexed new part are
 // negated ("flipped") — presumably to obtain a back-and-forth
 // traversal order for task mapping; confirm against the full
 // source before relying on this interpretation.
2032  if(ii % 2 == 1) {
2033  mj_lno_t coordinate_end =
2034  local_new_part_xadj(output_part_index+ii);
2035  mj_lno_t coordinate_begin =
2036  local_new_part_xadj(output_part_index);
2037 
2038  for(mj_lno_t task_traverse = coordinate_begin;
2039  task_traverse < coordinate_end; ++task_traverse) {
2040  mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2041  //MARKER: FLIPPED ZORDER BELOW
2042  local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2043  }
2044  }
2045  });
2046 
2047  // increase the previous count by current end.
2048  mj_part_t get_single;
2049  Kokkos::parallel_reduce("Read new_part_xadj",
2050  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051  KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
2052  set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2053  }, get_single);;
2054 
2055  output_coordinate_end_index = get_single;
2056  // increase the current out.
2057  output_part_index += num_parts;
2058  }
2059  }
2060  }
2061 
2062  // end of this partitioning dimension
2063  // set the current num parts for next dim partitioning
2064  current_num_parts = output_part_count_in_dimension;
2065 
2066  //swap the coordinate permutations for the next dimension.
2067  Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068  this->coordinate_permutations = this->new_coordinate_permutations;
2069  this->new_coordinate_permutations = tmp;
2070 
2071  this->part_xadj = this->new_part_xadj;
2072  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073  Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
2074  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
2075  }
2076 
 // Hand the final permutation back through the caller's adjacency view.
2077  Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2078 
2079  // Return output_xadj in CSR format
 // output_xadj must have room for num_global_parts + 1 entries.
2080  output_xadj[0] = 0;
2081  for(size_t i = 0; i < this->num_global_parts ; ++i) {
2082  output_xadj[i+1] = host_part_xadj(i);
2083  }
2084 
2085  delete future_num_part_in_parts;
2086  delete next_future_num_parts_in_parts;
2087 }
2088 
2092 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2093  typename mj_part_t, typename mj_node_t>
 /* \brief Returns the global bounding box of the domain (computed by
  * compute_global_box()).
  * NOTE(review): listing line 2096 — presumably the "AlgMJ<...>::"
  * qualifier line of get_global_box — is missing from this extract.
  */
2094 RCP<typename AlgMJ
2095  <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2097  get_global_box() const
2098 {
2099  return this->global_box;
2100 }
2101 
2104 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2105  typename mj_part_t, typename mj_node_t>
2106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107  mj_node_t>::set_to_keep_part_boxes()
2108 {
2109  this->mj_keep_part_boxes = true;
2110 }
2111 
2112 /* \brief Either the mj array (part_no_array) or num_global_parts should be
2113  * provided in the input. part_no_array takes
2114  * precedence if both are provided.
2115  * Depending on these parameters, total cut/part number,
2116  * maximum part/cut number along a dimension, estimated number of reduceAlls,
2117  * and the number of parts before the last dimension is calculated.
2118  * */
2119 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2120  typename mj_part_t, typename mj_node_t>
 // NOTE(review): listing lines 2121-2122 — presumably the
 // "void AlgMJ<...>::set_part_specifications()" signature — are missing
 // from this extract.
2123 {
2124  this->total_num_cut = 0; //how many cuts will be totally
2125  this->total_num_part = 1; //how many parts will be totally
2126  this->max_num_part_along_dim = 0; // maximum part count along a dimension.
2127  this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
2128  this->last_dim_num_part = 1; //max no of parts that might occur
2129  //during the partition before the
2130  //last partitioning dimension.
2131  this->max_num_cut_along_dim = 0;
2132  this->max_num_total_part_along_dim = 0;
2133 
 // Case 1: an explicit per-dimension part-count array was provided.
2134  if(this->part_no_array.size()) {
2135  auto local_recursion_depth = this->recursion_depth;
2136 
 // NOTE(review): total_num_part is still 1 here (reset just above), so
 // this product equals recursion_depth. If the intent was
 // (total parts) * depth, the assignment should follow the product loop
 // below — confirm against upstream history before changing.
2137  this->total_dim_num_reduce_all =
2138  this->total_num_part * this->recursion_depth;
2139 
2140  this->total_num_part = 1;
2141  for(int i = 0; i < local_recursion_depth; ++i) {
2142  this->total_num_part *= this->part_no_array(i);
2143  }
2144 
 // Largest requested part count along any single dimension.
2145  mj_part_t track_max = 0;
2146  for(int i = 0; i < local_recursion_depth; ++i) {
2147  if(part_no_array(i) > track_max) {
2148  track_max = this->part_no_array(i);
2149  };
2150  }
2151 
2152  this->last_dim_num_part = this->total_num_part /
2153  this->part_no_array(local_recursion_depth-1);
2154 
2155  this->max_num_part_along_dim = track_max;
2156  this->num_global_parts = this->total_num_part;
 // Case 2: derive per-dimension part counts from num_global_parts.
2157  } else {
2158  mj_part_t future_num_parts = this->num_global_parts;
2159 
2160  // If using nonuniform first level partitioning.
2161  // initial value max_num_part_along_dim == num_first_level_parts
2162  if (this->first_level_distribution.size() != 0 &&
2163  this->num_first_level_parts > 1) {
2164  this->max_num_part_along_dim = this->num_first_level_parts;
2165  }
2166 
2167  // we need to calculate the part numbers now, to determine
2168  // the maximum along the dimensions.
2169  for(int rd = 0; rd < this->recursion_depth; ++rd) {
2170  mj_part_t maxNoPartAlongI = 0;
2171  mj_part_t nfutureNumParts = 0;
2172 
2173  // Nonuniform first level partitioning sets part specificiations for
2174  // rd == 0 only, given requested num of parts and distribution in parts
2175  // for the first level.
2176  if (rd == 0 &&
2177  this->first_level_distribution.size() != 0 &&
2178  this->num_first_level_parts > 1) {
2179 
2180  maxNoPartAlongI = this->num_first_level_parts;
2181  this->max_num_part_along_dim = this->num_first_level_parts;
2182 
2183  mj_part_t sum_first_level_dist = 0;
2184  mj_part_t max_part = 0;
2185 
2186  // Cumulative sum of distribution of parts and size of largest part
2187  for (int i = 0; i < this->num_first_level_parts; ++i) {
2188  sum_first_level_dist += this->first_level_distribution(i);
2189  if (this->first_level_distribution(i) > max_part)
2190  max_part = this->first_level_distribution(i);
2191  }
2192 
2193  // Total parts in largest nonuniform superpart from
2194  // first level partitioning
2195  nfutureNumParts =
2196  this->num_global_parts * max_part / sum_first_level_dist;
2197  }
2198  // Standard uniform partitioning this level
2199  else {
 // Aim for an as-square-as-possible decomposition: take the
 // (recursion_depth - rd)-th root of the remaining part count.
2200  maxNoPartAlongI = this->get_part_count(future_num_parts,
2201  1.0f / (this->recursion_depth - rd));
2202  if (maxNoPartAlongI > this->max_num_part_along_dim)
2203  this->max_num_part_along_dim = maxNoPartAlongI;
2204  nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205  if (future_num_parts % maxNoPartAlongI) {
2206  ++nfutureNumParts;
2207  }
2208  }
2209  future_num_parts = nfutureNumParts;
2210  }
2211  this->total_num_part = this->num_global_parts;
2212 
2213  if(this->divide_to_prime_first) {
 // Upper-bound style estimates when dividing by prime factors first.
2214  this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215  this->last_dim_num_part = this->num_global_parts;
2216  }
2217  else {
2218  //this is the lower bound.
2219  //estimate reduceAll Count here.
2220  //we find the upperbound instead.
2221  size_t p = 1;
2222  for(int i = 0; i < this->recursion_depth; ++i) {
2223  this->total_dim_num_reduce_all += p;
2224  p *= this->max_num_part_along_dim;
2225  }
2226 
2227  if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228  this->last_dim_num_part = this->num_global_parts;
2229  }
2230  else {
2231  this->last_dim_num_part = p / this->max_num_part_along_dim;
2232  }
2233  }
2234  }
2235 
2236  this->total_num_cut = this->total_num_part - 1;
2237  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238  this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239  size_t(this->max_num_cut_along_dim);
2240  // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
2241 
2242  // refine the concurrent part count, if it is given bigger than the maximum
2243  // possible part count.
2244  if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245  if(this->mj_problemComm->getRank() == 0) {
2246  std::cerr << "Warning: Concurrent part count (" <<
2247  this->max_concurrent_part_calculation <<
2248  ") has been set bigger than maximum amount that can be used." <<
2249  " Setting to:" << this->last_dim_num_part << "." << std::endl;
2250  }
2251  this->max_concurrent_part_calculation = this->last_dim_num_part;
2252  }
2253 }
2254 
2255 /* \brief Tries to determine the part number for current dimension,
2256  * by trying to make the partitioning as square as possible.
2257  * \param num_total_future how many more partitionings are required.
2258  * \param root how many more recursion depth is left.
2259  */
2260 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2261  typename mj_part_t, typename mj_node_t>
2262 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263  get_part_count(mj_part_t num_total_future, double root)
2264 {
2265  double fp = pow(num_total_future, root);
2266  mj_part_t ip = mj_part_t(fp);
2267  if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2268  return ip;
2269  }
2270  else {
2271  return ip + 1;
2272  }
2273 }
2274 
2275 /* \brief Function returns how many parts that will be obtained after this
2276  * dimension partitioning. It sets how many parts each current part will be
2277  * partitioned into in this dimension to device_num_partitioning_in_current_dim
2278  * view, sets how many total future parts each obtained part will be
2279  * partitioned into in next_future_num_parts_in_parts vector. If part boxes are
2280  * kept, then sets initializes the output_part_boxes as its ancestor.
2281  * \param future_num_part_in_parts: input, how many future parts each current
2282  * part will be partitioned into.
2283  * \param next_future_num_parts_in_parts: output, how many future parts each
2284  * obtained part will be partitioned into.
2285  * \param future_num_parts: output, max number of future parts that will be
2286  * obtained from a single
2287  * \param current_num_parts: input, how many parts are there currently.
2288  * \param current_iteration: input, current dimension iteration number.
2289  * \param input_part_boxes: input, if boxes are kept, current boxes.
2290  * \param output_part_boxes: output, if boxes are kept, the initial box
2291  * boundaries for obtained parts.
2292  * \param atomic_part_count DOCWORK: Documentation
2293  */
2294 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2295  typename mj_part_t, typename mj_node_t>
2296 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297  update_part_num_arrays(
2298  std::vector<mj_part_t> *future_num_part_in_parts,
2299  std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300  mj_part_t &future_num_parts,
2301  mj_part_t current_num_parts,
2302  int current_iteration,
2303  RCP<mj_partBoxVector_t> input_part_boxes,
2304  RCP<mj_partBoxVector_t> output_part_boxes,
2305  mj_part_t atomic_part_count)
2306 {
2307  std::vector<mj_part_t> num_partitioning_in_current_dim;
2308 
2309  // how many parts that will be obtained after this dimension.
2310  mj_part_t output_num_parts = 0;
2311  if(this->part_no_array.size()) {
2312  // when the partNo array is provided as input,
2313  // each current partition will be partition to the same number of parts.
2314  // we dont need to use the future_num_part_in_parts vector in this case.
2315  mj_part_t current_part_no_array =
2316  this->part_no_array(current_iteration);
2317 
2318  if(current_part_no_array < 1) {
2319  std::cout << "Current recursive iteration: " << current_iteration <<
2320  " part_no_array[" << current_iteration << "] is given as:" <<
2321  current_part_no_array << std::endl;
2322  std::terminate();
2323  }
2324  if(current_part_no_array == 1) {
2325  return current_num_parts;
2326  }
2327 
2328  // If using part_no_array, ensure compatibility with num_first_level_parts.
2329  if (this->first_level_distribution.size() != 0 &&
2330  current_iteration == 0 &&
2331  current_part_no_array != this->num_first_level_parts) {
2332  std::cout << "Current recursive iteration: " << current_iteration
2333  << " part_no_array[" << current_iteration << "] is given as: " <<
2334  current_part_no_array << " and contradicts num_first_level_parts: " <<
2335  this->num_first_level_parts << std::endl;
2336  std::terminate();
2337  }
2338 
2339  for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340  num_partitioning_in_current_dim.push_back(current_part_no_array);
2341  }
2342 
2343 /*
2344  std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2345  current_iteration << " current_num_parts: " <<
2346  current_num_parts << "\n\n";
2347 
2348  std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2349  num_partitioning_in_current_dim[0] << "\n\n";
2350 
2351  std::cout << "\n\nfuture_num_parts: " << future_num_parts
2352  << " num_partitioning_in_current_dim[0]: " <<
2353  num_partitioning_in_current_dim[0] << " " <<
2354  future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2355 */
2356 
2357  future_num_parts /= num_partitioning_in_current_dim[0];
2358  output_num_parts = current_num_parts *
2359  num_partitioning_in_current_dim[0];
2360  if(this->mj_keep_part_boxes) {
2361  for(mj_part_t k = 0; k < current_num_parts; ++k) {
2362  //initialized the output boxes as its ancestor.
2363  for(mj_part_t j = 0; j <
2364  num_partitioning_in_current_dim[0]; ++j) {
2365  output_part_boxes->push_back((*input_part_boxes)[k]);
2366  }
2367  }
2368  }
2369 
2370  // set the how many more parts each part will be divided.
2371  // this is obvious when partNo array is provided as input.
2372  // however, fill this so weights will be calculated according to this array.
2373  for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374  next_future_num_parts_in_parts->push_back(future_num_parts);
2375  }
2376  }
2377  else {
2378  // if partNo array is not provided as input, future_num_part_in_parts
2379  // holds how many parts each part should be divided. Initially it holds a
2380  // single number equal to the total number of global parts.
2381 
2382  // calculate the future_num_parts from beginning,
2383  // since each part might be divided into different number of parts.
2384  future_num_parts = 1;
2385 
2386  // cout << "i:" << i << std::endl;
2387  for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2388  // get how many parts a part should be divided.
2389  mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2390 
2391  // get the ideal number of parts that is close to the
2392  // (recursion_depth - i) root of the future_num_parts_of_part_ii.
2393  mj_part_t num_partitions_in_current_dim =
2394  this->get_part_count(future_num_parts_of_part_ii,
2395  1.0 / (this->recursion_depth - current_iteration)
2396  );
2397  if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398  std::cerr << "ERROR: maxPartNo calculation is wrong."
2399  " num_partitions_in_current_dim: "
2400  << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2401  << this->max_num_part_along_dim <<
2402  " this->recursion_depth: " << this->recursion_depth <<
2403  " current_iteration:" << current_iteration <<
2404  " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405  " might need to fix max part no calculation for "
2406  "largest_prime_first partitioning." <<
2407  std::endl;
2408  std::terminate();
2409  }
2410  // add this number to vector_num_partitioning_in_current_dim vector.
2411  // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2412  // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2413 
2414  // Update part num arrays when on current_iteration == 0 and
2415  // using nonuniform first level partitioning
2416  // with requested num parts (num_first_level_parts) and
2417  // a requested distribution in parts (first_level_distribution).
2418  if (current_iteration == 0 &&
2419  this->first_level_distribution.size() != 0 &&
2420  this->num_first_level_parts > 1) {
2421  // Only 1 current part to begin and partitions into
2422  // num_first_level_parts many parts
2423  num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2424 
2425  // The output number of parts from first level partitioning
2426  output_num_parts = this->num_first_level_parts;
2427 
2428  // Remaining parts left to partition for all future levels
2429  future_num_parts /= this->num_first_level_parts;
2430 
2431  mj_part_t max_part = 0;
2432  mj_part_t sum_first_level_dist = 0;
2433 
2434  // Cumulative sum of distribution of first level parts
2435  // and size of largest first level part
2436  for (int i = 0; i < this->num_first_level_parts; ++i) {
2437  sum_first_level_dist += this->first_level_distribution(i);
2438 
2439  if (this->first_level_distribution(i) > max_part)
2440  max_part = this->first_level_distribution(i);
2441  }
2442 
2443  // Maximum # of remaining parts left to partition for all future levels
2444  future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2445 
2446  // Number of parts remaining left to partition for each future_part
2447  // The sum must exactly equal global_num_parts
2448  for (int i = 0; i < this->num_first_level_parts; ++i) {
2449  next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450  this->num_global_parts / sum_first_level_dist);
2451  }
2452  }
2453  else if (this->divide_to_prime_first) {
2454  // Add this number to num_partitioning_in_current_dim vector.
2455  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2456 
2457  mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2458 
2459  //increase the output number of parts.
2460  output_num_parts += num_partitions_in_current_dim;
2461 
2462  if (future_num_parts_of_part_ii == atomic_part_count ||
2463  future_num_parts_of_part_ii % atomic_part_count != 0) {
2464  atomic_part_count = 1;
2465  }
2466 
2467  largest_prime_factor =
2468  this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2469 
2470  // We divide to num_partitions_in_current_dim. But we adjust the weights
2471  // based on largest prime/ if num_partitions_in_current_dim = 2,
2472  // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2473  // if the largest prime is less than part count, we use the part count
2474  // so that we divide uniformly.
2475  if (largest_prime_factor < num_partitions_in_current_dim) {
2476  largest_prime_factor = num_partitions_in_current_dim;
2477  }
2478  //ideal number of future partitions for each part.
2479  mj_part_t ideal_num_future_parts_in_part =
2480  (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2481  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2482  mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2483 
2484 /*
2485  std::cout << "\ncurrent num part: " << ii
2486  << " largest_prime_factor: " << largest_prime_factor
2487  << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2488 */
2489 
2490  for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2491  //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2492  mj_part_t my_ideal_primescale = ideal_prime_scale;
2493  //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
2494  if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495  ++my_ideal_primescale;
2496  }
2497  //scale with 'x';
2498  mj_part_t num_future_parts_for_part_iii =
2499  ideal_num_future_parts_in_part * my_ideal_primescale;
2500 
2501  //if there is a remainder in the part increase the part weight.
2502  if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2503  //if not uniform, add 1 for the extra parts.
2504  ++num_future_parts_for_part_iii;
2505  }
2506 
2507  next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2508 
2509  //if part boxes are stored, initialize the box of the parts as the ancestor.
2510  if (this->mj_keep_part_boxes) {
2511  output_part_boxes->push_back((*input_part_boxes)[ii]);
2512  }
2513 
2514  //set num future_num_parts to maximum in this part.
2515  if (num_future_parts_for_part_iii > future_num_parts)
2516  future_num_parts = num_future_parts_for_part_iii;
2517 
2518  }
2519  }
2520  else {
2521  // Add this number to num_partitioning_in_current_dim vector.
2522  num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2523 
2524  //increase the output number of parts.
2525  output_num_parts += num_partitions_in_current_dim;
2526 
2527  if((future_num_parts_of_part_ii == atomic_part_count) ||
2528  (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529  atomic_part_count = 1;
2530  }
2531  //ideal number of future partitions for each part.
2532  mj_part_t ideal_num_future_parts_in_part =
2533  (future_num_parts_of_part_ii / atomic_part_count) /
2534  num_partitions_in_current_dim;
2535  for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536  mj_part_t num_future_parts_for_part_iii =
2537  ideal_num_future_parts_in_part;
2538 
2539  //if there is a remainder in the part increase the part weight.
2540  if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541  num_partitions_in_current_dim) {
2542  // if not uniform, add 1 for the extra parts.
2543  ++num_future_parts_for_part_iii;
2544  }
2545 
2546  next_future_num_parts_in_parts->push_back(
2547  num_future_parts_for_part_iii * atomic_part_count);
2548 
2549  // if part boxes are stored, initialize the box of the parts as
2550  // the ancestor.
2551  if(this->mj_keep_part_boxes) {
2552  output_part_boxes->push_back((*input_part_boxes)[ii]);
2553  }
2554  //set num future_num_parts to maximum in this part.
2555  if(num_future_parts_for_part_iii > future_num_parts)
2556  future_num_parts = num_future_parts_for_part_iii;
2557  }
2558  }
2559  }
2560  }
2561  // move temp std::vector to host view
2562  device_num_partitioning_in_current_dim = Kokkos::View<
2563  mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
2564  host_num_partitioning_in_current_dim =
2565  Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566  for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567  host_num_partitioning_in_current_dim(n) =
2568  num_partitioning_in_current_dim[n];
2569  }
2570  // setup device equivalent - this data is used on host and device and it's
2571  // more efficient to just setup array on both sides now rather than copy
2572  // values as needed later.
2573  Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574  host_num_partitioning_in_current_dim);
2575  return output_num_parts;
2576 }
2577 
2578 /* \brief Allocates and initializes the work memory that will be used by MJ.
2579  * */
2580 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581  typename mj_part_t, typename mj_node_t>
2582 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583  allocate_set_work_memory()
2584 {
2585  // Throughout the partitioning execution,
2586  // instead of the moving the coordinates, hold a permutation array for parts.
2587  // coordinate_permutations holds the current permutation.
2588  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589  Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590  this->num_local_coords);
2591  auto local_coordinate_permutations = coordinate_permutations;
2592  Kokkos::parallel_for(
2593  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594  0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595  local_coordinate_permutations(i) = i;
2596  });
2597 
2598  // new_coordinate_permutations holds the current permutation.
2599  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600  Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2601  this->num_local_coords);
2602 
2603  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604  Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605  if(this->num_local_coords > 0) {
2606  this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607  Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608  this->num_local_coords);
2609  }
2610 
2611  // single partition starts at index-0, and ends at numLocalCoords
2612  // inTotalCounts array holds the end points in coordinate_permutations array
2613  // for each partition. Initially sized 1, and single element is set to
2614  // numLocalCoords.
2615  this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616  Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618  host_part_xadj(0) = num_local_coords;
2619  Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2620 
2621  // the ends points of the output, this is allocated later.
2622  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624 
2625  // only store this much if cuts are needed to be stored.
2626  this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627  Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629 
2630  // how much weight percentage should a MPI put left side of the each cutline
2631  this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632  device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633 
2634  // how much weight percentage should each thread in MPI put left side of
2635  // each outline
2636  this->thread_cut_line_weight_to_put_left =
2637  Kokkos::View<mj_scalar_t*, device_t>(
2638  Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639 
2640  if(this->distribute_points_on_cut_lines) {
2641  this->process_cut_line_weight_to_put_left =
2642  Kokkos::View<mj_scalar_t *, device_t>(
2643  Kokkos::ViewAllocateWithoutInitializing(
2644  "process_cut_line_weight_to_put_left"),
2645  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646  this->thread_cut_line_weight_to_put_left =
2647  Kokkos::View<mj_scalar_t *, device_t>(
2648  Kokkos::ViewAllocateWithoutInitializing(
2649  "thread_cut_line_weight_to_put_left"),
2650  this->max_num_cut_along_dim);
2651  this->process_rectilinear_cut_weight =
2652  Kokkos::View<mj_scalar_t *, device_t>(
2653  Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654  this->max_num_cut_along_dim);
2655  this->global_rectilinear_cut_weight =
2656  Kokkos::View<mj_scalar_t *, device_t>(
2657  Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658  this->max_num_cut_along_dim);
2659  }
2660 
2661  // work array to manipulate coordinate of cutlines in different iterations.
2662  // necessary because previous cut line information is used for determining
2663  // the next cutline information. therefore, cannot update the cut work array
2664  // until all cutlines are determined.
2665  this->cut_coordinates_work_array =
2666  Kokkos::View<mj_scalar_t *, device_t>(
2667  Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669 
2670  // cumulative part weight array.
2671  this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672  Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673  this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674 
2675  // upper bound coordinate of a cut line
2676  this->cut_upper_bound_coordinates =
2677  Kokkos::View<mj_scalar_t*, device_t>(
2678  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680 
2681  // lower bound coordinate of a cut line
2682  this->cut_lower_bound_coordinates =
2683  Kokkos::View<mj_scalar_t*, device_t>(
2684  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686 
2687  // lower bound weight of a cut line
2688  this->cut_lower_bound_weights =
2689  Kokkos::View<mj_scalar_t*, device_t>(
2690  Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692 
2693  //upper bound weight of a cut line
2694  this->cut_upper_bound_weights =
2695  Kokkos::View<mj_scalar_t*, device_t>(
2696  Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697  this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698 
2699  // combined array to exchange the min and max coordinate,
2700  // and total weight of part.
2701  this->process_local_min_max_coord_total_weight =
2702  Kokkos::View<mj_scalar_t*, device_t>(
2703  Kokkos::ViewAllocateWithoutInitializing(
2704  "process_local_min_max_coord_total_weight"),
2705  3 * this->max_concurrent_part_calculation);
2706 
2707  // global combined array with the results for min, max and total weight.
2708  this->global_min_max_coord_total_weight =
2709  Kokkos::View<mj_scalar_t*, device_t>(
2710  Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711  3 * this->max_concurrent_part_calculation);
2712 
2713  // is_cut_line_determined is used to determine if a cutline is
2714  // determined already. If a cut line is already determined, the next
2715  // iterations will skip this cut line.
2716  this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717  Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719 
2720  // incomplete_cut_count count holds the number of cutlines that have not
2721  // been finalized for each part when concurrentPartCount>1, using this
2722  // information, if incomplete_cut_count[x]==0, then no work is done for
2723  // this part.
2724  this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725  Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726  this->max_concurrent_part_calculation);
2727  this->incomplete_cut_count =
2728  Kokkos::create_mirror_view(device_incomplete_cut_count);
2729 
2730  // local part weights of each thread.
2731  this->thread_part_weights = Kokkos::View<double *, device_t>(
2732  Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733  this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734 
2735  this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736  Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738 
2739  // thread_cut_right_closest_point to hold the closest coordinate to a
2740  // cutline from right (for each thread)
2741  this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742  Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743  this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744 
2745  // to store how many points in each part a thread has.
2746  this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747  Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748  this->max_num_part_along_dim);
2749 
2750  // for faster communication, concatanation of
2751  // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2752  // leftClosest distances sized P-1, since P-1 cut lines
2753  // rightClosest distances size P-1, since P-1 cut lines.
2754  this->total_part_weight_left_right_closests =
2755  Kokkos::View<mj_scalar_t*, device_t>(
2756  Kokkos::ViewAllocateWithoutInitializing(
2757  "total_part_weight_left_right_closests"),
2758  (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759  this->max_concurrent_part_calculation);
2760 
2761  this->global_total_part_weight_left_right_closests =
2762  Kokkos::View<mj_scalar_t*, device_t>(
2763  Kokkos::ViewAllocateWithoutInitializing(
2764  "global_total_part_weight_left_right_closests"),
2765  (this->max_num_total_part_along_dim +
2766  this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767 
2768  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769  Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770 
2771  this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773  num_local_coords);
2774 
2775  // changes owners back to host - so we don't run them on device
2776  // this improves migration code but means we have to serial init here.
2777  // Note we might allow this to be OpenMP when available even for CUDA.
2778  Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779 
2780  auto local_current_mj_gnos = current_mj_gnos;
2781  auto local_initial_mj_gnos = initial_mj_gnos;
2782  Kokkos::parallel_for(
2783  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784  (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785  local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786  });
2787 }
2788 
2789 /* \brief compute the global bounding box
2790  */
2791 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792  typename mj_part_t, typename mj_node_t>
2793 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794  mj_node_t>::compute_global_box()
2795 {
2796  //local min coords
2797  mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798  //global min coords
2799  mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800  //local max coords
2801  mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802  //global max coords
2803  mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804 
2805  auto local_mj_coordinates = this->mj_coordinates;
2806 
2807  // If we are only doing 2 parts then we don't need these values
2808  // for y and z. Init them all to 0 first
2809  for(int i = 0; i < this->coord_dim; ++i) {
2810  mins[i] = 0;
2811  maxs[i] = 0;
2812  }
2813 
2814  for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815  Kokkos::parallel_reduce("MinReduce",
2816  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817  (0, this->num_local_coords),
2818  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819  if(local_mj_coordinates(j,i) < running_min) {
2820  running_min = local_mj_coordinates(j,i);
2821  }
2822  }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823  Kokkos::parallel_reduce("MaxReduce",
2824  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825  (0, this->num_local_coords),
2826  KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827  if(local_mj_coordinates(j,i) > running_max) {
2828  running_max = local_mj_coordinates(j,i);
2829  }
2830  }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831  }
2832 
2833  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834  this->coord_dim, mins, gmins
2835  );
2836 
2837  reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838  this->coord_dim, maxs, gmaxs
2839  );
2840 
2841  //create single box with all areas.
2842  global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843  //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844  delete [] mins;
2845  delete [] gmins;
2846  delete [] maxs;
2847  delete [] gmaxs;
2848 }
2849 
2850 /* \brief for part communication we keep track of the box boundaries.
2851  * This is performed when either asked specifically, or when geometric mapping
2852  * is performed afterwards.
2853  * This function initializes a single box with all global min, max coordinates.
2854  * \param initial_partitioning_boxes the input and output vector for boxes.
2855  */
2856 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857  typename mj_part_t, typename mj_node_t>
2858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859  mj_node_t>::init_part_boxes(
2860  RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861 {
2862  mj_partBox_t tmp_box(*global_box);
2863  initial_partitioning_boxes->push_back(tmp_box);
2864 }
2865 
2870 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2871  typename mj_part_t,
2872  typename mj_node_t>
2873 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874  mj_get_local_min_max_coord_totW(
2875  mj_part_t current_work_part,
2876  mj_part_t current_concurrent_num_parts,
2877  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2878 {
2879  auto local_coordinate_permutations = this->coordinate_permutations;
2880  auto local_process_local_min_max_coord_total_weight =
2881  this->process_local_min_max_coord_total_weight;
2882  auto local_mj_weights = this->mj_weights;
2883 
2884  bool bUniformWeights = mj_uniform_weights(0);
2885 
2886  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2887 
2888  mj_part_t concurrent_current_part = current_work_part + kk;
2889  mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890  host_part_xadj(concurrent_current_part - 1);
2891  mj_lno_t coordinate_end_index =
2892  host_part_xadj(concurrent_current_part);
2893 
2894  mj_scalar_t my_min_coord = 0;
2895  mj_scalar_t my_max_coord = 0;
2896  mj_scalar_t my_total_weight;
2897  //if the part is empty.
2898  //set the min and max coordinates as reverse.
2899  if(coordinate_begin_index >= coordinate_end_index)
2900  {
2901  my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902  my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903  my_total_weight = 0;
2904  }
2905  else {
2906  // get min
2907  Kokkos::parallel_reduce("get min",
2908  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909  (coordinate_begin_index, coordinate_end_index),
2910  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911  int i = local_coordinate_permutations(j);
2912  if(mj_current_dim_coords(i) < running_min)
2913  running_min = mj_current_dim_coords(i);
2914  }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2915  // get max
2916  Kokkos::parallel_reduce("get max",
2917  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918  (coordinate_begin_index, coordinate_end_index),
2919  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920  int i = local_coordinate_permutations(j);
2921  if(mj_current_dim_coords(i) > running_max)
2922  running_max = mj_current_dim_coords(i);
2923  }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924  if(bUniformWeights) {
2925  my_total_weight = coordinate_end_index - coordinate_begin_index;
2926  }
2927  else {
2928  my_total_weight = 0;
2929  Kokkos::parallel_reduce("get weight",
2930  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931  (coordinate_begin_index, coordinate_end_index),
2932  KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933  int i = local_coordinate_permutations(j);
2934  lsum += local_mj_weights(i,0);
2935  }, my_total_weight);
2936  }
2937  }
2938 
2939  // single write
2940  Kokkos::parallel_for(
2941  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942  (0, 1), KOKKOS_LAMBDA (int dummy) {
2943  local_process_local_min_max_coord_total_weight(kk) =
2944  my_min_coord;
2945  local_process_local_min_max_coord_total_weight(
2946  kk + current_concurrent_num_parts) = my_max_coord;
2947  local_process_local_min_max_coord_total_weight(
2948  kk + 2*current_concurrent_num_parts) = my_total_weight;
2949  });
2950  }
2951 }
2952 
2965 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966  typename mj_part_t, typename mj_node_t>
2967 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968  mj_node_t>::mj_get_global_min_max_coord_totW(
2969  mj_part_t current_concurrent_num_parts,
2970  Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971  Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972  // reduce min for first current_concurrent_num_parts elements, reduce
2973  // max for next concurrentPartCount elements, reduce sum for the last
2974  // concurrentPartCount elements.
2975  if(this->comm->getSize() > 1) {
2976  // We're using explicit host here as Spectrum MPI would fail
2977  // with the prior HostMirror UVMSpace to UVMSpace setup.
2978  auto host_local_min_max_total =
2979  Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980  auto host_global_min_max_total =
2981  Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982  Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984  reductionOp(current_concurrent_num_parts,
2985  current_concurrent_num_parts, current_concurrent_num_parts);
2986  try {
2987  reduceAll<int, mj_scalar_t>(
2988  *(this->comm),
2989  reductionOp,
2990  3 * current_concurrent_num_parts,
2991  host_local_min_max_total.data(),
2992  host_global_min_max_total.data());
2993  }
2994  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2995  Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996  }
2997  else {
2998  mj_part_t s = 3 * current_concurrent_num_parts;
2999  Kokkos::parallel_for(
3000  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001  (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002  global_min_max_total(i) = local_min_max_total(i);
3003  });
3004  }
3005 }
3006 
3039 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040  typename mj_part_t, typename mj_node_t>
3041 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042  mj_get_initial_cut_coords_target_weights(
3043  mj_scalar_t min_coord,
3044  mj_scalar_t max_coord,
3045  mj_part_t num_cuts/*p-1*/ ,
3046  mj_scalar_t global_weight,
3047  /*p - 1 sized, coordinate of each cut line*/
3048  Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049  /*cumulative weights, at left side of each cut line. p-1 sized*/
3050  Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051  std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3052  std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053  mj_part_t concurrent_current_part,
3054  mj_part_t obtained_part_index,
3055  mj_part_t num_target_first_level_parts,
3056  const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057 {
3058  mj_scalar_t coord_range = max_coord - min_coord;
3059 
3060  // We decided we could keep some std::vectors around for now. Eventually
3061  // it would be nice to have everything just as views with some being device
3062  // and some host. This particular case needs a bit of work to get setup
3063  // in a cleaner way so not going to mess with it at the moment.
3064 
3065  bool bUniformPartsCheck =
3066  num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067 
3068  if(!bUniformPartsCheck) {
3069  bool bValidNonUniformTargetWeights =
3070  (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071  if(!bValidNonUniformTargetWeights) {
3072  std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073  std::terminate();
3074  }
3075  }
3076 
3077  Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078  "device_cumulative", num_cuts);
3079  auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080 
3081  mj_scalar_t cumulative = 0;
3082 
3083  if(bUniformPartsCheck) {
3084  // How many total future parts the part will be partitioned into.
3085  mj_scalar_t total_future_part_count_in_part =
3086  static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087 
3088  // How much each part should weigh in ideal case.
3089  mj_scalar_t unit_part_weight =
3090  global_weight / total_future_part_count_in_part;
3091 
3092  for(mj_part_t i = 0; i < num_cuts; ++i) {
3093  cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094  host_cumulative(i) = cumulative;
3095  }
3096  }
3097  else {
3098  // Sum of entries in the first level partition distribution vector
3099  mj_scalar_t sum_target_first_level_dist = 0.0;
3100  for (int i = 0; i < num_target_first_level_parts; ++i) {
3101  sum_target_first_level_dist += target_first_level_dist(i);
3102  }
3103 
3104  for(mj_part_t i = 0; i < num_cuts; ++i) {
3105  cumulative += global_weight * target_first_level_dist(i) /
3106  sum_target_first_level_dist;
3107  host_cumulative(i) = cumulative;
3108  }
3109  }
3110 
3111  Kokkos::deep_copy(device_cumulative, host_cumulative);
3112 
3113  Kokkos::parallel_for("Write num in parts",
3114  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115  (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116  // set target part weight.
3117  current_target_part_weights(cut) = device_cumulative(cut);
3118  initial_cut_coords(cut) = min_coord +
3119  (coord_range * device_cumulative(cut)) / global_weight;
3120  // set this multiple times but here for device handling
3121  current_target_part_weights(num_cuts) = global_weight;
3122  });
3123 
3124  // round the target part weights.
3125  // Note need to discuss regarding DragonFly commits and determine if we
3126  // would not simply check mj_uniform_weights here.
3127  if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128  Kokkos::parallel_for(
3129  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130  (0, num_cuts + 1),
3131  KOKKOS_LAMBDA (mj_part_t i) {
3132  current_target_part_weights(i) =
3133  long(current_target_part_weights(i) + 0.5);
3134  });
3135  }
3136 }
3137 
3154 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155  typename mj_part_t, typename mj_node_t>
3156 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157  set_initial_coordinate_parts(
3158  mj_scalar_t &max_coordinate,
3159  mj_scalar_t &min_coordinate,
3160  mj_lno_t coordinate_begin_index,
3161  mj_lno_t coordinate_end_index,
3162  Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164  Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165  mj_part_t &partition_count)
3166 {
3167  mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168 
3169  // if there is single point, or if all points are along a line.
3170  // set initial part to 0 for all.
3171  if(std::abs(coordinate_range) < this->sEpsilon ) {
3172  Kokkos::parallel_for(
3173  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174  (coordinate_begin_index, coordinate_end_index),
3175  KOKKOS_LAMBDA (mj_lno_t ii) {
3176  mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177  });
3178  }
3179  else {
3180  // otherwise estimate an initial part for each coordinate.
3181  // assuming uniform distribution of points.
3182  mj_scalar_t slice = coordinate_range / partition_count;
3183  Kokkos::parallel_for(
3184  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185  (coordinate_begin_index, coordinate_end_index),
3186  KOKKOS_LAMBDA (mj_lno_t ii) {
3187  mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188  mj_part_t pp =
3189  mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190  if(pp >= partition_count) {
3191  pp = partition_count - 1; // don't want last coord in an invalid part
3192  }
3193  mj_part_ids[iii] = 2 * pp;
3194  });
3195  }
3196 }
3197 
3212 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213  typename mj_part_t, typename mj_node_t>
3214 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216  double used_imbalance_tolerance,
3217  mj_part_t current_work_part,
3218  mj_part_t current_concurrent_num_parts,
3219  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220  mj_part_t total_incomplete_cut_count,
3221  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222  Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223 {
3224  this->temp_cut_coords = current_cut_coordinates;
3225 
3227  *reductionOp = NULL;
3228 
3229  bool bSingleProcess = (this->comm->getSize() == 1);
3230 
3231  std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232  if(!bSingleProcess) {
3233  for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234  temp[n] = host_num_partitioning_in_current_dim(n);
3235  }
3236  reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
3237  <mj_part_t, mj_scalar_t>(
3238  &temp,
3239  current_work_part,
3240  current_concurrent_num_parts);
3241  }
3242 
3243  auto local_cut_lower_bound_coordinates =
3244  cut_lower_bound_coordinates;
3245  auto local_cut_upper_bound_coordinates =
3246  cut_upper_bound_coordinates;
3247  auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248  auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249  bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250  auto local_process_cut_line_weight_to_put_left =
3251  process_cut_line_weight_to_put_left;
3252  auto local_temp_cut_coords = temp_cut_coords;
3253  auto local_global_total_part_weight_left_right_closests =
3254  global_total_part_weight_left_right_closests;
3255  auto local_cut_coordinates_work_array =
3256  cut_coordinates_work_array;
3257  auto local_part_xadj = part_xadj;
3258  auto local_global_min_max_coord_total_weight =
3259  global_min_max_coord_total_weight;
3260  auto local_target_part_weights =
3261  target_part_weights;
3262  auto local_global_rectilinear_cut_weight =
3263  global_rectilinear_cut_weight;
3264  auto local_process_rectilinear_cut_weight =
3265  process_rectilinear_cut_weight;
3266 
3267  auto local_is_cut_line_determined = this->is_cut_line_determined;
3268  auto local_device_num_partitioning_in_current_dim =
3269  device_num_partitioning_in_current_dim;
3270 
3271  Kokkos::parallel_for(
3272  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273  KOKKOS_LAMBDA (int dummy) {
3274 
3275  // these need to be initialized
3276  view_rectilinear_cut_count(0) = 0;
3277  view_total_reduction_size(0) = 0;
3278 
3279  // initialize the lower and upper bounds of the cuts.
3280  mj_part_t next = 0;
3281  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282  mj_part_t num_part_in_dim =
3283  local_device_num_partitioning_in_current_dim(current_work_part + i);
3284  mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285  view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286 
3287  for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288  local_is_cut_line_determined(next) = false;
3289  // min coordinate
3290  local_cut_lower_bound_coordinates(next) =
3291  local_global_min_max_coord_total_weight(i);
3292  // max coordinate
3293  local_cut_upper_bound_coordinates(next) =
3294  local_global_min_max_coord_total_weight(
3295  i + current_concurrent_num_parts);
3296  // total weight
3297  local_cut_upper_bound_weights(next) =
3298  local_global_min_max_coord_total_weight(
3299  i + 2 * current_concurrent_num_parts);
3300  local_cut_lower_bound_weights(next) = 0;
3301  if(local_distribute_points_on_cut_lines) {
3302  local_process_cut_line_weight_to_put_left(next) = 0;
3303  }
3304  ++next;
3305  }
3306  }
3307  });
3308 
3309  // loop_count allows the kernel to behave differently on the first loop
3310  // and subsequent loops. First loop we do a binary search and subsequent
3311  // loops we simply step towards our target.
3312  int loop_count = 0;
3313  while (total_incomplete_cut_count != 0) {
3314  this->mj_1D_part_get_part_weights(
3315  current_concurrent_num_parts,
3316  current_work_part,
3317  mj_current_dim_coords,
3318  loop_count);
3319  ++loop_count;
3320 
3321  this->mj_combine_rightleft_and_weights(
3322  current_work_part,
3323  current_concurrent_num_parts);
3324 
3325  // now sum up the results of mpi processors.
3326  if(!bSingleProcess) {
3327  // We're using explicit host here as Spectrum MPI would fail
3328  // with the prior HostMirror UVMSpace to UVMSpace setup.
3329  auto host_total_part_weight_left_right_closests =
3330  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331  total_part_weight_left_right_closests);
3332  auto host_global_total_part_weight_left_right_closests =
3333  Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334  global_total_part_weight_left_right_closests);
3335 
3336  Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337  total_part_weight_left_right_closests);
3338 
3339  size_t host_view_total_reduction_size;
3340  Kokkos::parallel_reduce("Read single",
3341  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342  KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343  set_single = view_total_reduction_size(0);
3344  }, host_view_total_reduction_size);
3345 
3346  reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347  host_view_total_reduction_size,
3348  host_total_part_weight_left_right_closests.data(),
3349  host_global_total_part_weight_left_right_closests.data());
3350  Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351  host_global_total_part_weight_left_right_closests);
3352  }
3353  else {
3354  local_global_total_part_weight_left_right_closests =
3355  this->total_part_weight_left_right_closests;
3356  }
3357 
3358  // how much cut will be shifted for the next part in the concurrent
3359  // part calculation.
3360  mj_part_t cut_shift = 0;
3361 
3362  // how much the concantaneted array will be shifted for the next part
3363  // in concurrent part calculation.
3364  size_t tlr_shift = 0;
3365 
3366  Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367  save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368  current_concurrent_num_parts);
3369 
3370  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371 
3372  mj_part_t num_parts =
3373  host_num_partitioning_in_current_dim(current_work_part + kk);
3374 
3375  mj_part_t num_cuts = num_parts - 1;
3376  size_t num_total_part = num_parts + size_t (num_cuts);
3377 
3378  //if the cuts of this cut has already been completed.
3379  //nothing to do for this part.
3380  //just update the shift amount and proceed.
3381  mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382 
3383  if(kk_incomplete_cut_count == 0) {
3384  cut_shift += num_cuts;
3385  tlr_shift += (num_total_part + 2 * num_cuts);
3386  continue;
3387  }
3388 
3389  Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390  Kokkos::subview(this->total_part_weight_left_right_closests,
3391  std::pair<mj_lno_t, mj_lno_t>(
3392  tlr_shift,
3393  this->total_part_weight_left_right_closests.size()));
3394 
3395  Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396  Kokkos::subview(
3397  local_global_total_part_weight_left_right_closests,
3398  std::pair<mj_lno_t, mj_lno_t>(
3399  tlr_shift,
3400  local_global_total_part_weight_left_right_closests.size()));
3401  Kokkos::View<mj_scalar_t *, device_t>
3402  current_global_left_closest_points =
3403  Kokkos::subview(current_global_tlr,
3404  std::pair<mj_lno_t, mj_lno_t>(
3405  num_total_part,
3406  current_global_tlr.size()));
3407  Kokkos::View<mj_scalar_t *, device_t>
3408  current_global_right_closest_points =
3409  Kokkos::subview(current_global_tlr,
3410  std::pair<mj_lno_t, mj_lno_t>(
3411  num_total_part + num_cuts,
3412  current_global_tlr.size()));
3413  Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414  current_global_tlr;
3415 
3416  Kokkos::View<bool *, device_t> current_cut_line_determined =
3417  Kokkos::subview(this->is_cut_line_determined,
3418  std::pair<mj_lno_t, mj_lno_t>(
3419  cut_shift,
3420  this->is_cut_line_determined.size()));
3421  Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422  Kokkos::subview(local_target_part_weights,
3423  std::pair<mj_lno_t, mj_lno_t>(
3424  cut_shift + kk,
3425  local_target_part_weights.size()));
3426  Kokkos::View<mj_scalar_t *, device_t>
3427  current_part_cut_line_weight_to_put_left =
3428  Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429  std::pair<mj_lno_t, mj_lno_t>(
3430  cut_shift,
3431  local_process_cut_line_weight_to_put_left.size()));
3432 
3433  save_initial_incomplete_cut_count(kk) =
3434  kk_incomplete_cut_count;
3435 
3436  Kokkos::View<mj_scalar_t *, device_t>
3437  current_cut_lower_bound_weights =
3438  Kokkos::subview(local_cut_lower_bound_weights,
3439  std::pair<mj_lno_t, mj_lno_t>(
3440  cut_shift,
3441  local_cut_lower_bound_weights.size()));
3442  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443  Kokkos::subview(local_cut_upper_bound_weights,
3444  std::pair<mj_lno_t, mj_lno_t>(
3445  cut_shift,
3446  local_cut_upper_bound_weights.size()));
3447  Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448  Kokkos::subview(local_cut_upper_bound_coordinates,
3449  std::pair<mj_lno_t, mj_lno_t>(
3450  cut_shift,
3451  local_cut_upper_bound_coordinates.size()));
3452  Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453  Kokkos::subview(local_cut_lower_bound_coordinates,
3454  std::pair<mj_lno_t, mj_lno_t>(
3455  cut_shift,
3456  local_cut_lower_bound_coordinates.size()));
3457 
3458  // Now compute the new cut coordinates.
3459  Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460  Kokkos::subview(this->temp_cut_coords,
3461  std::pair<mj_lno_t, mj_lno_t>(
3462  cut_shift, this->temp_cut_coords.size()));
3463  Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464  Kokkos::subview(this->cut_coordinates_work_array,
3465  std::pair<mj_lno_t, mj_lno_t>(
3466  cut_shift, this->cut_coordinates_work_array.size()));
3467 
3468  this->mj_get_new_cut_coordinates(
3469  current_concurrent_num_parts,
3470  kk,
3471  num_cuts,
3472  used_imbalance_tolerance,
3473  current_global_part_weights,
3474  current_local_part_weights,
3475  current_part_target_weights,
3476  current_cut_line_determined,
3477  sub_temp_cut_coords,
3478  current_cut_upper_bounds,
3479  current_cut_lower_bounds,
3480  current_global_left_closest_points,
3481  current_global_right_closest_points,
3482  current_cut_lower_bound_weights,
3483  current_cut_upper_weights,
3484  sub_cut_coordinates_work_array,
3485  current_part_cut_line_weight_to_put_left,
3486  view_rectilinear_cut_count);
3487 
3488  cut_shift += num_cuts;
3489  tlr_shift += (num_total_part + 2 * num_cuts);
3490  } // end of kk loop
3491 
3492  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493  mj_part_t iteration_complete_cut_count =
3494  save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495  total_incomplete_cut_count -= iteration_complete_cut_count;
3496  }
3497 
3498  Kokkos::parallel_for(
3499  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500  (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501  auto t = local_temp_cut_coords(n);
3502  local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503  local_cut_coordinates_work_array(n) = t;
3504  });
3505  } // end of the while loop
3506 
3507  // Needed only if keep_cuts; otherwise can simply swap array pointers
3508  // cutCoordinates and cutCoordinatesWork.
3509  // (at first iteration, cutCoordinates == cutCoorindates_tmp).
3510  // computed cuts must be in cutCoordinates.
3511  if(current_cut_coordinates != local_temp_cut_coords) {
3512  Kokkos::parallel_for(
3513  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514  (0, 1), KOKKOS_LAMBDA(int dummy) {
3515  mj_part_t next = 0;
3516  for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517  mj_part_t num_parts = -1;
3518  num_parts = local_device_num_partitioning_in_current_dim(
3519  current_work_part + i);
3520  mj_part_t num_cuts = num_parts - 1;
3521  for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522  current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523  }
3524  next += num_cuts;
3525  }
3526  for(int n = 0; n <
3527  static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528  local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529  }
3530  });
3531  }
3532 
3533  delete reductionOp;
3534 }
3535 
3536 template<class scalar_t>
3538  scalar_t * ptr;
3539 
3540  // With new kokkos setup parallel_reduce will call empty constructor and
3541  // we update the ptr in the init method.
3542  KOKKOS_INLINE_FUNCTION
3543  Zoltan2_MJArrayType() : ptr(NULL) {};
3544 
3545  KOKKOS_INLINE_FUNCTION
3546  Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547 
3549  ptr = zmj.ptr;
3550  return *this;
3551  }
3552 };
3553 
3554 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3555 
3556 template<class policy_t, class scalar_t, class part_t>
3558 
3561  scalar_t max_scalar;
3565 
3566  KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3567  scalar_t mj_max_scalar,
3568  value_type &val,
3569  int mj_value_count_rightleft,
3570  int mj_value_count_weights) :
3571  max_scalar(mj_max_scalar),
3572  value(&val),
3573  value_count_rightleft(mj_value_count_rightleft),
3574  value_count_weights(mj_value_count_weights)
3575  {}
3576 
3577  KOKKOS_INLINE_FUNCTION
3579  return *value;
3580  }
3581 
3582  KOKKOS_INLINE_FUNCTION
3583  void join(value_type& dst, const value_type& src) const {
3584  for(int n = 0; n < value_count_weights; ++n) {
3585  dst.ptr[n] += src.ptr[n];
3586  }
3587 
3588  for(int n = value_count_weights + 2;
3589  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3590  if(src.ptr[n] > dst.ptr[n]) {
3591  dst.ptr[n] = src.ptr[n];
3592  }
3593  if(src.ptr[n+1] < dst.ptr[n+1]) {
3594  dst.ptr[n+1] = src.ptr[n+1];
3595  }
3596  }
3597  }
3598 
3599  KOKKOS_INLINE_FUNCTION
3600  void join (volatile value_type& dst, const volatile value_type& src) const {
3601  for(int n = 0; n < value_count_weights; ++n) {
3602  dst.ptr[n] += src.ptr[n];
3603  }
3604 
3605  for(int n = value_count_weights + 2;
3606  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3607  if(src.ptr[n] > dst.ptr[n]) {
3608  dst.ptr[n] = src.ptr[n];
3609  }
3610  if(src.ptr[n+1] < dst.ptr[n+1]) {
3611  dst.ptr[n+1] = src.ptr[n+1];
3612  }
3613  }
3614  }
3615 
3616  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3617  dst.ptr = value->ptr; // must update ptr
3618 
3619  for(int n = 0; n < value_count_weights; ++n) {
3620  dst.ptr[n] = 0;
3621  }
3622 
3623  for(int n = value_count_weights;
3625  dst.ptr[n] = -max_scalar;
3626  dst.ptr[n+1] = max_scalar;
3627  }
3628  }
3629 };
3630 #endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP
3631 
3632 template<class policy_t, class scalar_t, class part_t, class index_t,
3633  class device_t, class array_t>
3635  typedef typename policy_t::member_type member_type;
3636  typedef Kokkos::View<scalar_t*> scalar_view_t;
3637 
3638 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3639  typedef array_t value_type[];
3640 #endif
3641 
3643  array_t max_scalar;
3644 
3652  Kokkos::View<index_t*, device_t> permutations;
3653  Kokkos::View<scalar_t *, device_t> coordinates;
3654  Kokkos::View<scalar_t**, device_t> weights;
3655  Kokkos::View<part_t*, device_t> parts;
3656  Kokkos::View<scalar_t *, device_t> cut_coordinates;
3657  Kokkos::View<index_t *, device_t> part_xadj;
3659  scalar_t sEpsilon;
3660 
3661 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662  Kokkos::View<double *, device_t> current_part_weights;
3663  Kokkos::View<scalar_t *, device_t> current_left_closest;
3664  Kokkos::View<scalar_t *, device_t> current_right_closest;
3665 #endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3666 
3668  int mj_loop_count,
3669  array_t mj_max_scalar,
3670  part_t mj_concurrent_current_part,
3671  part_t mj_num_cuts,
3672  part_t mj_current_work_part,
3673  part_t mj_current_concurrent_num_parts,
3674  part_t mj_left_right_array_size,
3675  part_t mj_weight_array_size,
3676  Kokkos::View<index_t*, device_t> & mj_permutations,
3677  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678  Kokkos::View<scalar_t**, device_t> & mj_weights,
3679  Kokkos::View<part_t*, device_t> & mj_parts,
3680  Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681  Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682  bool mj_uniform_weights0,
3683  scalar_t mj_sEpsilon
3684 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685  ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686  Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687  Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3688 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3689  ) :
3690  loop_count(mj_loop_count),
3691  max_scalar(mj_max_scalar),
3692  concurrent_current_part(mj_concurrent_current_part),
3693  num_cuts(mj_num_cuts),
3694  current_work_part(mj_current_work_part),
3695  current_concurrent_num_parts(mj_current_concurrent_num_parts),
3696  value_count_rightleft(mj_left_right_array_size),
3697  value_count_weights(mj_weight_array_size),
3698  value_count(mj_weight_array_size+mj_left_right_array_size),
3699  permutations(mj_permutations),
3700  coordinates(mj_coordinates),
3701  weights(mj_weights),
3702  parts(mj_parts),
3703  cut_coordinates(mj_cut_coordinates),
3704  part_xadj(mj_part_xadj),
3705  uniform_weights0(mj_uniform_weights0),
3706  sEpsilon(mj_sEpsilon)
3707 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708  ,current_part_weights(mj_current_part_weights),
3709  current_left_closest(mj_current_left_closest),
3710  current_right_closest(mj_current_right_closest)
3711 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3712  {
3713  }
3714 
3715  size_t team_shmem_size (int team_size) const {
3716 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717  int result = sizeof(array_t) *
3719 #else
3720  int result = sizeof(array_t) *
3722 #endif
3723 
3724  // pad this to a multiple of 8 or it will run corrupt
3725  int remainder = result % 8;
3726  if(remainder != 0) {
3727  result += 8 - remainder;
3728  }
3729  return result;
3730  }
3731 
3732  KOKKOS_INLINE_FUNCTION
3733 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734  void operator() (const member_type & teamMember) const {
3735 #else
3736  void operator() (const member_type & teamMember, value_type teamSum) const {
3737 #endif
3738 
3739  index_t all_begin = (concurrent_current_part == 0) ? 0 :
3741  index_t all_end = part_xadj(concurrent_current_part);
3742 
3743  index_t num_working_points = all_end - all_begin;
3744  int num_teams = teamMember.league_size();
3745 
3746  index_t stride = num_working_points / num_teams;
3747  if((num_working_points % num_teams) > 0) {
3748  stride += 1; // make sure we have coverage for the final points
3749  }
3750 
3751  // the last team may have less work than the other teams
3752  // the last team can be empty (begin > end) if num_teams > stride
3753  // which is true for many teams and small numbers of coords (tests)
3754  index_t begin = all_begin + stride * teamMember.league_rank();
3755  index_t end = begin + stride;
3756  if(end > all_end) {
3757  end = all_end;
3758  }
3759 
3760 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3761  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3763 
3764  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3765  sh_mem_size);
3766 
3767  // init the shared array to 0
3768  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3769  for(int n = 0; n < value_count_weights; ++n) {
3770  shared_ptr[n] = 0;
3771  }
3772  for(int n = value_count_weights;
3774  shared_ptr[n] = -max_scalar;
3775  shared_ptr[n+1] = max_scalar;
3776  }
3777  });
3778  teamMember.team_barrier();
3779 
3780  Kokkos::parallel_for(
3781  Kokkos::TeamThreadRange(teamMember, begin, end),
3782  [=] (index_t ii) {
3783 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3784  // create the team shared data - each thread gets one of the arrays
3785  size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3786  value_count_rightleft) * teamMember.team_size();
3787 
3788  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3789  sh_mem_size);
3790 
3791  // select the array for this thread
3792  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3794 
3795  // create reducer which handles the Zoltan2_MJArrayType class
3797  max_scalar, array,
3800 
3801  Kokkos::parallel_reduce(
3802  Kokkos::TeamThreadRange(teamMember, begin, end),
3803  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3804 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3805 
3806  int i = permutations(ii);
3807  scalar_t coord = coordinates(i);
3808  array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3809 
3810  // now check each part and it's right cut
3811  index_t part = parts(i)/2;
3812 
3813  int upper = num_cuts;
3814  int lower = 0;
3815 
3816  // binary search - find matching part
3817  while(true) {
3818  scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3819  scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3820 
3821  if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3822 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823  Kokkos::atomic_add(&shared_ptr[part*2], w);
3824 #else
3825  threadSum.ptr[part*2] += w;
3826 #endif
3827 
3828  parts(i) = part*2;
3829 
3830  // now handle the left/right closest part
3831 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832  array_t new_value = (array_t) coord;
3833  array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3834  while(new_value < prev_value) {
3835  prev_value = Kokkos::atomic_compare_exchange(
3836  &shared_ptr[value_count_weights + part * 2 + 1],
3837  prev_value, new_value);
3838  }
3839  prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3840  while(new_value > prev_value) {
3841  prev_value = Kokkos::atomic_compare_exchange(
3842  &shared_ptr[value_count_weights + part * 2 + 2],
3843  prev_value, new_value);
3844  }
3845 #else
3846  // note cut to left needs to set right closest and cut to right needs
3847  // to set left closest. It's index +1 and +2 instead of -1 and +0
3848  // because right/left segment is padded with an extra pair at
3849  // begining and end to avoid branching with if checks.
3850  if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3851  threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3852  }
3853  if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3854  threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3855  }
3856 #endif
3857 
3858  break;
3859  }
3860  else if(part != num_cuts) {
3861  if(coord < b + sEpsilon && coord > b - sEpsilon) {
3862  // Note if on cut we set right/left closest to the cut itself
3863  // but we add +2 because we buffered the area with an extra slot
3864  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3865 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3867  shared_ptr[value_count_weights + part * 2 + 2] = b;
3868  shared_ptr[value_count_weights + part * 2 + 3] = b;
3869 #else
3870  threadSum.ptr[part*2+1] += w;
3871  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3872  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3873 #endif
3874 
3875  parts(i) = part*2+1;
3876 
3877  // Need to scan up for any other cuts of same coordinate
3878  // This is costly but it's only relevant for the fix4785 test
3879  // which loads a lot of coordinates on the same point, so without
3880  // this our cuts would all just sit at 0.
3881  part_t base_b = part;
3882  scalar_t base_coord = cut_coordinates(base_b);
3883  part += 1;
3884  while(part < num_cuts) {
3885  b = cut_coordinates(part);
3886  scalar_t delta = b - base_coord;
3887  if(delta < 0) delta = -delta;
3888  if(delta < sEpsilon) {
3889  // Note if on cut we set right/left closest to the cut itself
3890  // but we add +2 because we buffered the area with an extra slot
3891  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3892 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3894  shared_ptr[value_count_weights + part * 2 + 2] = b;
3895  shared_ptr[value_count_weights + part * 2 + 3] = b;
3896 #else
3897  threadSum.ptr[part*2+1] += w;
3898  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3899  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3900 #endif
3901  }
3902  else { break; }
3903  ++part;
3904  }
3905  part = base_b - 1;
3906  while(part >= 0) {
3907  b = cut_coordinates(part);
3908  scalar_t delta = b - base_coord;
3909  if(delta < 0) delta = -delta;
3910  if(delta < sEpsilon) {
3911  // Note if on cut we set right/left closest to the cut itself
3912  // but we add +2 because we buffered the area with an extra slot
3913  // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3914 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915  Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3916  shared_ptr[value_count_weights + part * 2 + 2] = b;
3917  shared_ptr[value_count_weights + part * 2 + 3] = b;
3918 #else
3919  threadSum.ptr[part*2+1] += w;
3920  threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3921  threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3922 #endif
3923  }
3924  else { break; }
3925  --part;
3926  }
3927 
3928  break;
3929  }
3930  }
3931 
3932  if(loop_count != 0) {
3933  // subsequent loops can just step towards target
3934  if(coord < b) {
3935  part -= 1;
3936  }
3937  else {
3938  part += 1;
3939  }
3940  }
3941  else {
3942  // initial loop binary search
3943  if(coord < b) {
3944  if(part == lower + 1) {
3945  part = lower;
3946  }
3947  else {
3948  upper = part - 1;
3949  part -= (part - lower)/2;
3950  }
3951  }
3952  else if(part == upper - 1) {
3953  part = upper;
3954  }
3955  else {
3956  lower = part + 1;
3957  part += (upper - part)/2;
3958  }
3959  }
3960  }
3961 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3962  });
3963 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3964  }, arraySumReducer);
3965 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3966 
3967  teamMember.team_barrier();
3968 
3969  // collect all the team's results
3970  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3971  for(int n = 0; n < value_count_weights; ++n) {
3972 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973  Kokkos::atomic_add(&current_part_weights(n),
3974  static_cast<double>(shared_ptr[n]));
3975 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3976  teamSum[n] += array.ptr[n];
3977 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3978  }
3979 
3980 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981  int insert_left = 0;
3982  int insert_right = 0;
3983 #endif
3984 
3985  for(int n = 2 + value_count_weights;
3986  n < value_count_weights + value_count_rightleft - 2; n += 2) {
3987 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988  scalar_t new_value = shared_ptr[n+1];
3989  scalar_t prev_value = current_right_closest(insert_right);
3990  while(new_value < prev_value) {
3991  prev_value = Kokkos::atomic_compare_exchange(
3992  &current_right_closest(insert_right), prev_value, new_value);
3993  }
3994 
3995  new_value = shared_ptr[n];
3996  prev_value = current_left_closest(insert_left);
3997  while(new_value > prev_value) {
3998  prev_value = Kokkos::atomic_compare_exchange(
3999  &current_left_closest(insert_left), prev_value, new_value);
4000  }
4001 
4002  ++insert_left;
4003  ++insert_right;
4004 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4005  if(array.ptr[n] > teamSum[n]) {
4006  teamSum[n] = array.ptr[n];
4007  }
4008  if(array.ptr[n+1] < teamSum[n+1]) {
4009  teamSum[n+1] = array.ptr[n+1];
4010  }
4011 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4012  }
4013  });
4014 
4015  teamMember.team_barrier();
4016  }
4017 
4018 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019  KOKKOS_INLINE_FUNCTION
4020  void join(value_type dst, const value_type src) const {
4021  for(int n = 0; n < value_count_weights; ++n) {
4022  dst[n] += src[n];
4023  }
4024 
4025  for(int n = value_count_weights + 2;
4026  n < value_count_weights + value_count_rightleft - 2; n += 2) {
4027  if(src[n] > dst[n]) {
4028  dst[n] = src[n];
4029  }
4030  if(src[n+1] < dst[n+1]) {
4031  dst[n+1] = src[n+1];
4032  }
4033  }
4034  }
4035 
4036  KOKKOS_INLINE_FUNCTION
4037  void join (volatile value_type dst, const volatile value_type src) const {
4038  for(int n = 0; n < value_count_weights; ++n) {
4039  dst[n] += src[n];
4040  }
4041 
4042  for(int n = value_count_weights + 2;
4043  n < value_count_weights + value_count_rightleft - 2; n += 2) {
4044  if(src[n] > dst[n]) {
4045  dst[n] = src[n];
4046  }
4047  if(src[n+1] < dst[n+1]) {
4048  dst[n+1] = src[n+1];
4049  }
4050  }
4051  }
4052 
4053  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4054  for(int n = 0; n < value_count_weights; ++n) {
4055  dst[n] = 0;
4056  }
4057 
4058  for(int n = value_count_weights;
4060  dst[n] = -max_scalar;
4061  dst[n+1] = max_scalar;
4062  }
4063  }
4064 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4065 };
4066 
/*! \brief For every concurrent part that still has undetermined cuts,
 * computes the weight falling in each part/cut slot and the coordinate
 * closest to each cut from either side, then converts the per-slot
 * weights into a prefix sum. On CUDA/HIP the functor writes results
 * directly into device views with atomics; on host backends a flat
 * reduce array is used and copied back into the device views.
 * \param current_concurrent_num_parts number of parts processed concurrently
 * \param current_work_part first part processed in this pass
 * \param mj_current_dim_coords coordinates along the current dimension
 * \param loop_count iteration count of the enclosing cut-refinement loop
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
  mj_1D_part_get_part_weights(
  mj_part_t current_concurrent_num_parts,
  mj_part_t current_work_part,
  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
  int loop_count)
{
  auto local_is_cut_line_determined = is_cut_line_determined;
  auto local_thread_part_weights = thread_part_weights;
  auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
  auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;

  // Create some locals so we don't use this inside the kernels
  // which causes problems
  auto local_sEpsilon = this->sEpsilon;
  auto local_assigned_part_ids = this->assigned_part_ids;
  auto local_coordinate_permutations = this->coordinate_permutations;
  auto local_mj_weights = this->mj_weights;
  auto local_part_xadj = this->part_xadj;
  auto local_global_min_max_coord_total_weight =
    this->global_min_max_coord_total_weight;

  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;

  auto local_device_num_partitioning_in_current_dim =
    device_num_partitioning_in_current_dim;

  // incomplete_cut_count is maintained on host; mirror it to the device
  // for the prefix-sum kernel at the end of this method.
  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
  auto local_device_incomplete_cut_count = device_incomplete_cut_count;

  mj_part_t total_part_shift = 0;

  mj_part_t concurrent_cut_shifts = 0;
  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
    // cuts for this concurrent part start at concurrent_cut_shifts
    Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
      Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
        concurrent_cut_shifts, temp_cut_coords.size()));

    mj_part_t num_parts =
      host_num_partitioning_in_current_dim(current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    // slots alternate part weight / cut weight
    mj_part_t total_part_count = num_parts + num_cuts;
    mj_part_t weight_array_length = num_cuts + num_parts;

    // for right/left closest + buffer cut on either side
    mj_part_t right_left_array_length = (num_cuts + 2) * 2;

    // all cuts of this part already determined - nothing to compute here
    if(this->incomplete_cut_count(kk) == 0) {
      total_part_shift += total_part_count;
      concurrent_cut_shifts += num_cuts;
      continue;
    }

    // if not set use 60 - was initial testing amount but somewhat arbitrary
    auto policy_ReduceWeightsFunctor = policy_t(
      mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    int total_array_length =
      weight_array_length + right_left_array_length;
#endif

    // Using float here caused some numerical errors for coord on cut calculations.
    // Probably that can be fixed with proper epsilon adjustment but since cuda
    // doesn't reduce right now the shared memory pressure is no longer relevant.
    // Just use scalar_t to match the original algorithm.
    typedef mj_scalar_t array_t;

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // host backends reduce into this flat array; deleted below
    array_t * reduce_array =
      new array_t[static_cast<size_t>(total_array_length)];
#endif // KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_HIP

    // number of cuts belonging to the concurrent parts before this one
    int offset_cuts = 0;
    for(int kk2 = 0; kk2 < kk; ++kk2) {
      offset_cuts +=
        host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
    }
    Kokkos::View<double *, device_t> my_current_part_weights =
      Kokkos::subview(local_thread_part_weights,
        std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
          total_part_shift + total_part_count));
    Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
      Kokkos::subview(local_thread_cut_left_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_left_closest_point.size()));
    Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
      Kokkos::subview(local_thread_cut_right_closest_point,
        std::pair<mj_lno_t, mj_lno_t>(
          offset_cuts,
          local_thread_cut_right_closest_point.size()));

    array_t max_scalar = std::numeric_limits<array_t>::max();

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // initialize values
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
      KOKKOS_LAMBDA (int dummy) {
      for(int n = 0; n < weight_array_length; ++n) {
        my_current_part_weights(n) = 0;
      }
      // sentinel extremes so any real coordinate replaces them
      for(int n = 0; n < num_cuts; ++n) {
        my_current_left_closest(n) = -max_scalar;
        my_current_right_closest(n) = max_scalar;
      }
    });
#endif

    mj_part_t concurrent_current_part =
      current_work_part + kk;

    // NOTE(review): the constructor argument list below appears truncated
    // in this copy of the file (a few argument lines were lost in
    // extraction) - verify it against the upstream source before reuse.
    ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
      typename mj_node_t::device_type, array_t>
      teamFunctor(
        loop_count,
        max_scalar,
        num_cuts,
        right_left_array_length,
        weight_array_length,
        coordinate_permutations,
        mj_current_dim_coords,
        mj_weights,
        assigned_part_ids,
        local_temp_cut_coords,
        part_xadj,
        mj_uniform_weights(0), // host and currently only relevant to slot 0
        sEpsilon
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
        ,my_current_part_weights,
        my_current_left_closest,
        my_current_right_closest
#endif
        );

#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
    // CUDA/HIP path: the functor writes results directly via atomics
    Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
#else
    Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
      teamFunctor, reduce_array);
#endif

#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
    // copy host-reduced results into the device views
    auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);

    for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
      hostArray(i) = reduce_array[i];
    }

    Kokkos::deep_copy(my_current_part_weights, hostArray);

    auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
    auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
    // (cut+1)*2 skips the padded leading pair of the right/left segment
    for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
      hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
      hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
    }
    Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
    Kokkos::deep_copy(my_current_right_closest, hostRightArray);

    delete [] reduce_array;
#endif

    total_part_shift += total_part_count;
    concurrent_cut_shifts += num_cuts;
  }

  auto local_temp_cut_coords = temp_cut_coords;

  // Convert the per-slot weights into a prefix sum; cuts that share a
  // coordinate must not have their common weight accumulated twice.
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
    (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
    mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
      current_work_part + kk);
    mj_part_t num_cuts = num_parts - 1;
    mj_part_t total_part_count = num_parts + num_cuts;

    if(local_device_incomplete_cut_count(kk) > 0) {
      // get the prefix sum
      // This is an inefficiency but not sure if it matters much
      size_t offset = 0;
      size_t offset_cuts = 0;
      for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
        auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
          current_work_part + kk2);
        offset += num_parts_kk2 * 2 - 1;
        offset_cuts += num_parts_kk2 - 1;
      }

      for(mj_part_t i = 1; i < total_part_count; ++i) {
        // check for cuts sharing the same position; all cuts sharing a position
        // have the same weight == total weight for all cuts sharing the
        // position. Don't want to accumulate that total weight more than once.
        if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
          std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
            local_temp_cut_coords(offset_cuts + i /2 - 1))
            < local_sEpsilon) {
          // i % 2 = 0 when part i represents the cut coordinate.
          // if it is a cut, and if next cut also has the same coordinate, then
          // dont addup.
          local_thread_part_weights(offset + i)
            = local_thread_part_weights(offset + i-2);
          continue;
        }

        // otherwise do the prefix sum.
        local_thread_part_weights(offset + i) +=
          local_thread_part_weights(offset + i-1);
      }
    }
  });
}
4292 
/*! \brief Packs, for every concurrent part, the part/cut weights and the
 * left/right closest coordinates of undetermined cuts into the single
 * total_part_weight_left_right_closests array. Layout per concurrent part:
 * [num_parts+num_cuts weights | num_cuts left closests |
 *  num_cuts right closests].
 * \param current_work_part first part processed in this pass
 * \param current_concurrent_num_parts number of concurrently processed parts
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_combine_rightleft_and_weights(
  mj_part_t current_work_part,
  mj_part_t current_concurrent_num_parts)
{
  // locals so the device lambda does not capture `this`
  auto local_thread_part_weights = this->thread_part_weights;
  auto local_is_cut_line_determined = this->is_cut_line_determined;
  auto local_thread_cut_left_closest_point =
    this->thread_cut_left_closest_point;
  auto local_thread_cut_right_closest_point =
    this->thread_cut_right_closest_point;
  auto local_total_part_weight_left_right_closests =
    this->total_part_weight_left_right_closests;
  auto local_device_num_partitioning_in_current_dim =
    device_num_partitioning_in_current_dim;
  // single-iteration kernel: the loop below walks the concurrent parts
  // in order, carrying running shifts from one part to the next
  Kokkos::parallel_for(
    Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
    KOKKOS_LAMBDA (int dummy) {

    size_t tlr_array_shift = 0;
    mj_part_t cut_shift = 0;
    size_t total_part_array_shift = 0;

    // iterate for all concurrent parts to find the left and right closest
    // points in the process.
    for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {

      mj_part_t num_parts_in_part =
        local_device_num_partitioning_in_current_dim(current_work_part + i);
      mj_part_t num_cuts_in_part = num_parts_in_part - 1;
      size_t num_total_part_in_part =
        num_parts_in_part + size_t (num_cuts_in_part);

      // iterate for cuts in a single part.
      for(int ii = 0; ii < num_cuts_in_part; ++ii) {
        mj_part_t next = tlr_array_shift + ii;
        mj_part_t cut_index = cut_shift + ii;

        // determined cuts keep their previously stored closests
        if(!local_is_cut_line_determined(cut_index)) {
          mj_scalar_t left_closest_in_process =
            local_thread_cut_left_closest_point(cut_index);
          mj_scalar_t right_closest_in_process =
            local_thread_cut_right_closest_point(cut_index);

          // store the left and right closes points.
          local_total_part_weight_left_right_closests(
            num_total_part_in_part + next) = left_closest_in_process;

          local_total_part_weight_left_right_closests(
            num_total_part_in_part + num_cuts_in_part + next) =
            right_closest_in_process;
        }
      }

      // copy the (prefix-summed) weights for undetermined cut slots
      for(size_t j = 0; j < num_total_part_in_part; ++j) {
        mj_part_t cut_ind = j / 2 + cut_shift;

        // need to check j != num_total_part_in_part - 1
        // which is same as j/2 != num_cuts_in_part.
        // we cannot check it using cut_ind, because of the concurrent part
        // concantanetion.
        if(j == num_total_part_in_part - 1 ||
          !local_is_cut_line_determined(cut_ind)) {
          double pwj = local_thread_part_weights(total_part_array_shift + j);
          local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
        }
      }

      // set the shift position in the arrays
      cut_shift += num_cuts_in_part;
      tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
      total_part_array_shift += num_total_part_in_part;
    }
  });
}
4377 
4390 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4391  typename mj_part_t, typename mj_node_t>
4392 KOKKOS_INLINE_FUNCTION
4393 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4394  mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4395  mj_scalar_t cut_lower_bound,
4396  mj_scalar_t cut_upper_weight,
4397  mj_scalar_t cut_lower_weight,
4398  mj_scalar_t expected_weight,
4399  mj_scalar_t &new_cut_position,
4400  mj_scalar_t sEpsilon) {
4401 
4402  if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4403  new_cut_position = cut_upper_bound; //or lower bound does not matter.
4404  }
4405 
4406  if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4407  new_cut_position = cut_lower_bound;
4408  }
4409 
4410  mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4411  mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4412  mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4413 
4414  mj_scalar_t required_shift = (my_weight_diff / weight_range);
4415  int scale_constant = 20;
4416  int shiftint= int (required_shift * scale_constant);
4417  if(shiftint == 0) shiftint = 1;
4418  required_shift = mj_scalar_t (shiftint) / scale_constant;
4419  new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4420 }
4421 
4422 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4423 
4424 template<class policy_t, class scalar_t>
4426 
4431 
4432  KOKKOS_INLINE_FUNCTION ArrayReducer(
4433  value_type &val,
4434  int mj_value_count) :
4435  value(&val),
4436  value_count(mj_value_count)
4437  {}
4438 
4439  KOKKOS_INLINE_FUNCTION
4441  return *value;
4442  }
4443 
4444  KOKKOS_INLINE_FUNCTION
4445  void join(value_type& dst, const value_type& src) const {
4446  for(int n = 0; n < value_count; ++n) {
4447  dst.ptr[n] += src.ptr[n];
4448  }
4449  }
4450 
4451  KOKKOS_INLINE_FUNCTION
4452  void join (volatile value_type& dst, const volatile value_type& src) const {
4453  for(int n = 0; n < value_count; ++n) {
4454  dst.ptr[n] += src.ptr[n];
4455  }
4456  }
4457 
4458  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4459  dst.ptr = value->ptr; // must update ptr
4460  for(int n = 0; n < value_count; ++n) {
4461  dst.ptr[n] = 0;
4462  }
4463  }
4464 };
4465 
4466 #endif
4467 
4468 template<class policy_t, class scalar_t, class part_t, class index_t,
4469  class device_t, class array_t>
4471  typedef typename policy_t::member_type member_type;
4472  typedef Kokkos::View<scalar_t*> scalar_view_t;
4473 
4474 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4475  typedef array_t value_type[];
4476 #endif
4477 
4480  Kokkos::View<index_t*, device_t> permutations;
4481  Kokkos::View<scalar_t *, device_t> coordinates;
4482  Kokkos::View<part_t*, device_t> parts;
4483  Kokkos::View<index_t *, device_t> part_xadj;
4484  Kokkos::View<index_t *, device_t> track_on_cuts;
4485 
4486 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4487  Kokkos::View<int *, device_t> local_point_counts;
4488 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4489 
4491  part_t mj_concurrent_current_part,
4492  part_t mj_weight_array_size,
4493  Kokkos::View<index_t*, device_t> & mj_permutations,
4494  Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4495  Kokkos::View<part_t*, device_t> & mj_parts,
4496  Kokkos::View<index_t *, device_t> & mj_part_xadj,
4497  Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4498 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4499  ,Kokkos::View<int *, device_t> & mj_local_point_counts
4500 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4501  ) :
4502  concurrent_current_part(mj_concurrent_current_part),
4503  value_count(mj_weight_array_size),
4504  permutations(mj_permutations),
4505  coordinates(mj_coordinates),
4506  parts(mj_parts),
4507  part_xadj(mj_part_xadj),
4508  track_on_cuts(mj_track_on_cuts)
4509 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510  ,local_point_counts(mj_local_point_counts)
4511 #endif
4512  {
4513  }
4514 
4515  size_t team_shmem_size (int team_size) const {
4516 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4517  int result = sizeof(array_t) * (value_count);
4518 #else
4519  int result = sizeof(array_t) * (value_count) * team_size;
4520 #endif
4521 
4522  // pad this to a multiple of 8 or it will run corrupt
4523  int remainder = result % 8;
4524  if(remainder != 0) {
4525  result += 8 - remainder;
4526  }
4527  return result;
4528  }
4529 
4530  KOKKOS_INLINE_FUNCTION
4531 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532  void operator() (const member_type & teamMember) const {
4533 #else
4534  void operator() (const member_type & teamMember, value_type teamSum) const {
4535 #endif
4536  index_t all_begin = (concurrent_current_part == 0) ? 0 :
4538  index_t all_end = part_xadj(concurrent_current_part);
4539 
4540  index_t num_working_points = all_end - all_begin;
4541  int num_teams = teamMember.league_size();
4542 
4543  index_t stride = num_working_points / num_teams;
4544  if((num_working_points % num_teams) > 0) {
4545  stride += 1; // make sure we have coverage for the final points
4546  }
4547 
4548  index_t begin = all_begin + stride * teamMember.league_rank();
4549  index_t end = begin + stride;
4550  if(end > all_end) {
4551  end = all_end; // the last team may have less work than the other teams
4552  }
4553 
4554  int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4555 
4556  // create the team shared data - each thread gets one of the arrays
4557 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4558  size_t sh_mem_size = sizeof(array_t) * (value_count);
4559 #else
4560  size_t sh_mem_size =
4561  sizeof(array_t) * (value_count) * teamMember.team_size();
4562 #endif
4563 
4564  array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4565  sh_mem_size);
4566 
4567 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4568  // init the shared array to 0
4569  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4570  for(int n = 0; n < value_count; ++n) {
4571  shared_ptr[n] = 0;
4572  }
4573  });
4574  teamMember.team_barrier();
4575 
4576  Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4577  [=] (index_t ii) {
4578 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4579  // select the array for this thread
4580  Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4581  (value_count)]);
4582 
4583  // create reducer which handles the Zoltan2_MJArrayType class
4584  ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4585 
4586  Kokkos::parallel_reduce(
4587  Kokkos::TeamThreadRange(teamMember, begin, end),
4588  [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4589 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4590 
4591  index_t coordinate_index = permutations(ii);
4592  part_t place = parts(coordinate_index);
4593  part_t part = place / 2;
4594  if(place % 2 == 0) {
4595 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596  Kokkos::atomic_add(&shared_ptr[part], 1);
4597 #else
4598  threadSum.ptr[part] += 1;
4599 #endif
4600 
4601  parts(coordinate_index) = part;
4602  }
4603  else {
4604  // fill a tracking array so we can process these slower points
4605  // in next cycle
4606  index_t set_index = Kokkos::atomic_fetch_add(
4607  &track_on_cuts(track_on_cuts_insert_index), 1);
4608  track_on_cuts(set_index) = ii;
4609  }
4610 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4611  });
4612 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4613  }, arrayReducer);
4614 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4615 
4616  teamMember.team_barrier();
4617 
4618  // collect all the team's results
4619  Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4620  for(int n = 0; n < value_count; ++n) {
4621 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4622  Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4623 #else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4624  teamSum[n] += array.ptr[n];
4625 #endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4626  }
4627  });
4628 
4629  teamMember.team_barrier();
4630  }
4631 
4632 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4633 
4634  KOKKOS_INLINE_FUNCTION
4635  void join(value_type dst, const value_type src) const {
4636  for(int n = 0; n < value_count; ++n) {
4637  dst[n] += src[n];
4638  }
4639  }
4640 
4641  KOKKOS_INLINE_FUNCTION
4642  void join (volatile value_type dst, const volatile value_type src) const {
4643  for(int n = 0; n < value_count; ++n) {
4644  dst[n] += src[n];
4645  }
4646  }
4647 
4648  KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4649  for(int n = 0; n < value_count; ++n) {
4650  dst[n] = 0;
4651  }
4652  }
4653 #endif
4654 };
4655 
4671 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4672  typename mj_part_t, typename mj_node_t>
4673 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4674 mj_create_new_partitions(
4675  mj_part_t num_parts,
4676  mj_part_t current_concurrent_work_part,
4677  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4678  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4679  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4680  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4681 {
4682  // Get locals for cuda
4683  auto local_thread_part_weight_work = this->thread_part_weight_work;
4684  auto local_point_counts = this->thread_point_counts;
4685  auto local_distribute_points_on_cut_lines =
4686  this->distribute_points_on_cut_lines;
4687  auto local_thread_cut_line_weight_to_put_left =
4688  this->thread_cut_line_weight_to_put_left;
4689  auto local_sEpsilon = this->sEpsilon;
4690  auto local_coordinate_permutations = this->coordinate_permutations;
4691  auto local_mj_weights = this->mj_weights;
4692  auto local_assigned_part_ids = this->assigned_part_ids;
4693  auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4694 
4695  mj_part_t num_cuts = num_parts - 1;
4696 
4697  Kokkos::parallel_for(
4698  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4699  KOKKOS_LAMBDA(int dummy) {
4700 
4701  if(local_distribute_points_on_cut_lines) {
4702  for(int i = 0; i < num_cuts; ++i) {
4703  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4704  if(left_weight > local_sEpsilon) {
4705  // the weight of thread ii on cut.
4706  mj_scalar_t thread_ii_weight_on_cut =
4707  local_thread_part_weight_work(i * 2 + 1) -
4708  local_thread_part_weight_work(i * 2);
4709 
4710  if(thread_ii_weight_on_cut < left_weight) {
4711  // if left weight is bigger than threads weight on cut.
4712  local_thread_cut_line_weight_to_put_left(i) =
4713  thread_ii_weight_on_cut;
4714  }
4715  else {
4716  // if thread's weight is bigger than space, then put only a portion.
4717  local_thread_cut_line_weight_to_put_left(i) = left_weight;
4718  }
4719  left_weight -= thread_ii_weight_on_cut;
4720  }
4721  else {
4722  local_thread_cut_line_weight_to_put_left(i) = 0;
4723  }
4724  }
4725 
4726  // this is a special case. If cutlines share the same coordinate,
4727  // their weights are equal. We need to adjust the ratio for that.
4728  for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4729  if(std::abs(current_concurrent_cut_coordinate(i) -
4730  current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4731  local_thread_cut_line_weight_to_put_left(i) -=
4732  local_thread_cut_line_weight_to_put_left(i - 1);
4733  }
4734  local_thread_cut_line_weight_to_put_left(i) =
4735  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4736  least_signifiance) * significance_mul) /
4737  static_cast<mj_scalar_t>(significance_mul);
4738  }
4739  }
4740 
4741  for(mj_part_t i = 0; i < num_parts; ++i) {
4742  local_point_counts(i) = 0;
4743  }
4744  });
4745 
4746  mj_lno_t coordinate_begin_index =
4747  current_concurrent_work_part == 0 ? 0 :
4748  host_part_xadj(current_concurrent_work_part - 1);
4749  mj_lno_t coordinate_end_index =
4750  host_part_xadj(current_concurrent_work_part);
4751 
4752  mj_lno_t total_on_cut;
4753  Kokkos::parallel_reduce("Get total_on_cut",
4754  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4755  coordinate_begin_index, coordinate_end_index),
4756  KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4757  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4758  mj_part_t coordinate_assigned_place =
4759  local_assigned_part_ids(coordinate_index);
4760  if(coordinate_assigned_place % 2 == 1) {
4761  val += 1;
4762  }
4763  }, total_on_cut);
4764 
4765  Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4766  if(total_on_cut > 0) {
4767  track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4768  "track_on_cuts", // would do WithoutInitialization but need last init to 0
4769  total_on_cut + 1); // extra index to use for tracking
4770  }
4771 
4772  // here we need to parallel reduce an array to count coords in each part
4773  // atomically adding, especially for low part count would kill us
4774  // in the original setup we kept arrays allocated for each thread but for
4775  // the cuda version we'd like to avoid allocating N arrays for the number
4776  // of teams/threads which would be complicated based on running openmp or
4777  // cuda.
4778  typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4779 
4780  // if not set use 60 - somewhat arbitrary based on initial performance tests
4781  int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4782 
4783  auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4784  typedef int array_t;
4785 
4786  // just need parts - on the cuts will be handled in a separate serial
4787  // call after this.
4788 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4789  array_t * reduce_array = new array_t[static_cast<size_t>(num_parts)];
4790 #endif
4791 
4792  ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4793  typename mj_node_t::device_type, array_t>teamFunctor(
4794  current_concurrent_work_part,
4795  num_parts,
4796  coordinate_permutations,
4797  mj_current_dim_coords,
4798  assigned_part_ids,
4799  part_xadj,
4800  track_on_cuts
4801 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4802  ,local_point_counts
4803 #endif
4804  );
4805 
4806 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4807  Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4808 #else
4809  Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4810 #endif
4811 
4812 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4813  for(mj_part_t part = 0; part < num_parts; ++part) {
4814  local_point_counts(part) = reduce_array[part];
4815  }
4816  delete [] reduce_array;
4817 #endif
4818 
4819  // the last member is utility used for atomically inserting the values.
4820  // Sorting here avoids potential indeterminancy in the partitioning results
4821  if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4822  auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4823  std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4824  Kokkos::sort(track_on_cuts_sort);
4825  }
4826 
4827  bool uniform_weights0 = this->mj_uniform_weights(0);
4828  Kokkos::parallel_for(
4829  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4830  KOKKOS_LAMBDA (int dummy) {
4831 
4832  for(int j = 0; j < total_on_cut; ++j) {
4833  int ii = track_on_cuts(j);
4834  mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4835  mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4836  local_mj_weights(coordinate_index,0);
4837  mj_part_t coordinate_assigned_place =
4838  local_assigned_part_ids(coordinate_index);
4839  mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4840  // if it is on the cut.
4841  if(local_distribute_points_on_cut_lines &&
4842  local_thread_cut_line_weight_to_put_left(
4843  coordinate_assigned_part) > local_sEpsilon) {
4844  // if the rectilinear partitioning is allowed,
4845  // and the thread has still space to put on the left of the cut
4846  // then thread puts the vertex to left.
4847  local_thread_cut_line_weight_to_put_left(
4848  coordinate_assigned_part) -= coordinate_weight;
4849  // if putting the vertex to left increased the weight more
4850  // than expected, and if the next cut is on the same coordinate,
4851  // then we need to adjust how much weight next cut puts to its left as
4852  // well, in order to take care of the imbalance.
4853  if(local_thread_cut_line_weight_to_put_left(
4854  coordinate_assigned_part) < 0 && coordinate_assigned_part <
4855  num_cuts - 1 &&
4856  std::abs(current_concurrent_cut_coordinate(
4857  coordinate_assigned_part+1) -
4858  current_concurrent_cut_coordinate(
4859  coordinate_assigned_part)) < local_sEpsilon)
4860  {
4861  local_thread_cut_line_weight_to_put_left(
4862  coordinate_assigned_part + 1) +=
4863  local_thread_cut_line_weight_to_put_left(
4864  coordinate_assigned_part);
4865  }
4866  ++local_point_counts(coordinate_assigned_part);
4867  local_assigned_part_ids(coordinate_index) =
4868  coordinate_assigned_part;
4869  }
4870  else {
4871  // if there is no more space on the left, put the coordinate to the
4872  // right of the cut.
4873  ++coordinate_assigned_part;
4874  // this while loop is necessary when a line is partitioned into more
4875  // than 2 parts.
4876  while(local_distribute_points_on_cut_lines &&
4877  coordinate_assigned_part < num_cuts)
4878  {
4879  // traverse all the cut lines having the same partitiong
4880  if(std::abs(current_concurrent_cut_coordinate(
4881  coordinate_assigned_part) -
4882  current_concurrent_cut_coordinate(
4883  coordinate_assigned_part - 1)) < local_sEpsilon)
4884  {
4885  // if line has enough space on left, put it there.
4886  if(local_thread_cut_line_weight_to_put_left(
4887  coordinate_assigned_part) > local_sEpsilon &&
4888  local_thread_cut_line_weight_to_put_left(
4889  coordinate_assigned_part) >=
4890  std::abs(local_thread_cut_line_weight_to_put_left(
4891  coordinate_assigned_part) - coordinate_weight))
4892  {
4893  local_thread_cut_line_weight_to_put_left(
4894  coordinate_assigned_part) -= coordinate_weight;
4895  // Again if it put too much on left of the cut,
4896  // update how much the next cut sharing the same coordinate will
4897  // put to its left.
4898  if(local_thread_cut_line_weight_to_put_left(
4899  coordinate_assigned_part) < 0 &&
4900  coordinate_assigned_part < num_cuts - 1 &&
4901  std::abs(current_concurrent_cut_coordinate(
4902  coordinate_assigned_part+1) -
4903  current_concurrent_cut_coordinate(
4904  coordinate_assigned_part)) < local_sEpsilon)
4905  {
4906  local_thread_cut_line_weight_to_put_left(
4907  coordinate_assigned_part + 1) +=
4908  local_thread_cut_line_weight_to_put_left(
4909  coordinate_assigned_part);
4910  }
4911  break;
4912  }
4913  }
4914  else {
4915  break;
4916  }
4917  ++coordinate_assigned_part;
4918  }
4919  local_point_counts(coordinate_assigned_part) += 1;
4920  local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4921  }
4922  }
4923 
4924  for(int j = 0; j < num_parts; ++j) {
4925  out_part_xadj(j) = local_point_counts(j);
4926  local_point_counts(j) = 0;
4927 
4928  if(j != 0) {
4929  out_part_xadj(j) += out_part_xadj(j - 1);
4930  local_point_counts(j) += out_part_xadj(j - 1);
4931  }
4932  }
4933  });
4934 
4935  // here we will determine insert indices for N teams
4936  // then all the teams can fill
4937 
4938 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4939 
4940  // This is the fastest so far - just straight atomic writes for CUDA
4941  // However this is not a deterministic result since it is atomic.
4942  // The final result will be deterministic.
4943  Kokkos::parallel_for(
4944  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4945  coordinate_begin_index, coordinate_end_index),
4946  KOKKOS_LAMBDA (mj_lno_t ii) {
4947  mj_lno_t i = local_coordinate_permutations(ii);
4948  mj_part_t p = local_assigned_part_ids(i);
4949  mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4950  local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4951  });
4952 
4953 #else
4954 
4955 #ifdef KOKKOS_ENABLE_OPENMP
4956  // will return and fix this - revert back to 1 for clear auto testing
4957  const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4958 #else
4959  const int num_threads = 1;
4960 #endif
4961 
4962  const int num_teams = 1; // cuda is handled above using a different format
4963 
4964  // allow init - we want all 0's first
4965  Kokkos::View<mj_lno_t*, device_t>
4966  point_counter("insert indices", num_teams * num_threads * num_parts);
4967 
4968  // count how many coords per thread
4969  // then we will fill each independently
4970  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4971  block_policy(num_teams, num_threads);
4972  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4973  member_type member_type;
4974  mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4975  mj_lno_t block_size = range / num_teams + 1;
4976  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4977  int team = team_member.league_rank();
4978  int team_offset = team * num_threads * num_parts;
4979  mj_lno_t begin = coordinate_begin_index + team * block_size;
4980  mj_lno_t end = begin + block_size;
4981  if(end > coordinate_end_index) {
4982  end = coordinate_end_index;
4983  }
4984 
4985  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4986  [=] (mj_lno_t ii) {
4987  int thread = team_member.team_rank();
4988  mj_lno_t i = local_coordinate_permutations(ii);
4989  mj_part_t p = local_assigned_part_ids(i);
4990  int index = team_offset + thread * num_parts + p;
4991  ++point_counter(index);
4992  });
4993  });
4994 
4995  // now prefix sum
4996  // we currently have the counts in the slots
4997  // we want the first counter for each part to be 0
4998  // then the rest should be the sum of all the priors
4999  Kokkos::parallel_for(
5000  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5001  KOKKOS_LAMBDA (int dummy) {
5002  int num_sets = point_counter.size() / num_parts;
5003  for(int set = num_sets - 1; set >= 1; set -=1) {
5004  int base = set * num_parts;
5005  for(int part = 0; part < num_parts; ++part) {
5006  point_counter(base + part) = point_counter(base + part - num_parts);
5007  }
5008  }
5009 
5010  for(int part = 0; part < num_parts; ++part) {
5011  point_counter(part) = 0;
5012  }
5013 
5014  for(int set = 1; set < num_sets; ++set) {
5015  int base = set * num_parts;
5016  for(int part = 0; part < num_parts; ++part) {
5017  point_counter(base + part) += point_counter(base + part - num_parts);
5018  }
5019  }
5020  });
5021 
5022  // now permute
5023  Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
5024  int team = team_member.league_rank();
5025  int team_offset = team * num_threads * num_parts;
5026  mj_lno_t begin = coordinate_begin_index + team * block_size;
5027  mj_lno_t end = begin + block_size;
5028  if(end > coordinate_end_index) {
5029  end = coordinate_end_index;
5030  }
5031  Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5032  [=] (mj_lno_t ii) {
5033  int thread = team_member.team_rank();
5034  mj_lno_t i = local_coordinate_permutations(ii);
5035  mj_part_t p = local_assigned_part_ids(i);
5036  int index = team_offset + thread * num_parts + p;
5037  int set_counter = (point_counter(index)++) + local_point_counts(p);
5038  local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5039  });
5040  });
5041 #endif
5042 }
5043 
5087 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5088  typename mj_part_t, typename mj_node_t>
5089 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5090  mj_node_t>::mj_get_new_cut_coordinates(
5091  mj_part_t current_concurrent_num_parts,
5092  mj_part_t kk,
5093  const mj_part_t &num_cuts,
5094  const double &used_imbalance_tolerance,
5095  Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5096  Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5097  Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5098  Kokkos::View<bool *, device_t> & current_cut_line_determined,
5099  Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5100  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5101  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5102  Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5103  Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5104  Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5105  Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5106  Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5107  Kokkos::View<mj_scalar_t *, device_t> &
5108  current_part_cut_line_weight_to_put_left,
5109  Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5110 {
5111  Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5112 
5113  auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5114  auto local_sEpsilon = sEpsilon;
5115  auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5116  auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5117  auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5118  auto local_global_min_max_coord_total_weight =
5119  global_min_max_coord_total_weight;
5120 
5121  const auto _sEpsilon = this->sEpsilon;
5122  // Note for a 22 part system I tried removing the outer loop
5123  // and doing each sub loop as a simple parallel_for over num_cuts.
5124  // But that was about twice as slow (10ms) as the current form (5ms)
5125  // so I think the overhead of launching the new global parallel kernels
5126  // is costly. This form is just running one team so effectively using
5127  // a single warp to process the cuts. I expect with a lot of parts this
5128  // might need changing.
5129  Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5130  policy_one_team(1, Kokkos::AUTO());
5131  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5132  member_type member_type;
5133  Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5134 
5135  mj_scalar_t min_coordinate =
5136  local_global_min_max_coord_total_weight(kk);
5137  mj_scalar_t max_coordinate =
5138  local_global_min_max_coord_total_weight(
5140  mj_scalar_t global_total_weight =
5141  local_global_min_max_coord_total_weight(
5142  kk + current_concurrent_num_parts * 2);
5143 
5144  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5145  [=] (mj_part_t i) {
5146  // if left and right closest points are not set yet,
5147  // set it to the cut itself.
5148  if(min_coordinate -
5149  current_global_left_closest_points(i) > local_sEpsilon) {
5150  current_global_left_closest_points(i) =
5151  current_cut_coordinates(i);
5152  }
5153  if(current_global_right_closest_points(i) -
5154  max_coordinate > local_sEpsilon) {
5155  current_global_right_closest_points(i) =
5156  current_cut_coordinates(i);
5157  }
5158  });
5159  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5160 
5161  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5162  [=] (mj_part_t i) {
5163  using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5164  mj_node_t>;
5165  // seen weight in the part
5166  mj_scalar_t seen_weight_in_part = 0;
5167  // expected weight for part.
5168  mj_scalar_t expected_weight_in_part = 0;
5169  // imbalance for the left and right side of the cut.
5170  double imbalance_on_left = 0, imbalance_on_right = 0;
5171  if(local_distribute_points_on_cut_lines) {
5172  // init the weight on the cut.
5173  local_global_rectilinear_cut_weight(i) = 0;
5174  local_process_rectilinear_cut_weight(i) = 0;
5175  }
5176  bool bContinue = false;
5177  // if already determined at previous iterations,
5178  // then just write the coordinate to new array, and proceed.
5179  if(current_cut_line_determined(i)) {
5180  new_current_cut_coordinates(i) =
5181  current_cut_coordinates(i);
5182  bContinue = true;
5183  }
5184  if(!bContinue) {
5185  //current weight of the part at the left of the cut line.
5186  seen_weight_in_part = current_global_part_weights(i * 2);
5187 
5188  //expected ratio
5189  expected_weight_in_part = current_part_target_weights(i);
5190 
5191  //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5192  imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5193  expected_weight_in_part);
5194  // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5195  // globalTotalWeight, 1 - expected);
5196  imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5197  seen_weight_in_part, global_total_weight - expected_weight_in_part);
5198  bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5199  used_imbalance_tolerance < local_sEpsilon ;
5200  bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5201  used_imbalance_tolerance < local_sEpsilon;
5202  //if the cut line reaches to desired imbalance.
5203  if(is_left_imbalance_valid && is_right_imbalance_valid) {
5204  current_cut_line_determined(i) = true;
5205  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5206  new_current_cut_coordinates(i) = current_cut_coordinates(i);
5207  }
5208  else if(imbalance_on_left < 0) {
5209  //if left imbalance < 0 then we need to move the cut to right.
5210  if(local_distribute_points_on_cut_lines) {
5211  // if it is okay to distribute the coordinate on
5212  // the same coordinate to left and right.
5213  // then check if we can reach to the target weight by including the
5214  // coordinates in the part.
5215  if(current_global_part_weights(i * 2 + 1) ==
5216  expected_weight_in_part) {
5217  // if it is we are done.
5218  current_cut_line_determined(i) = true;
5219  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5220 
5221  //then assign everything on the cut to the left of the cut.
5222  new_current_cut_coordinates(i) =
5223  current_cut_coordinates(i);
5224  //for this cut all the weight on cut will be put to left.
5225  current_part_cut_line_weight_to_put_left(i) =
5226  current_local_part_weights(i * 2 + 1) -
5227  current_local_part_weights(i * 2);
5228  bContinue = true;
5229  }
5230  else if(current_global_part_weights(i * 2 + 1) >
5231  expected_weight_in_part) {
5232  // if the weight is larger than the expected weight,
5233  // then we need to distribute some points to left, some to right.
5234  current_cut_line_determined(i) = true;
5235  Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5236 
5237  // increase the num cuts to be determined with rectilinear
5238  // partitioning.
5239  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5240  new_current_cut_coordinates(i) =
5241  current_cut_coordinates(i);
5242  local_process_rectilinear_cut_weight[i] =
5243  current_local_part_weights(i * 2 + 1) -
5244  current_local_part_weights(i * 2);
5245  bContinue = true;
5246  }
5247  }
5248 
5249  if(!bContinue) {
5250 
5251  // we need to move further right,so set lower bound to current line,
5252  // and shift it to the closes point from right.
5253  current_cut_lower_bounds(i) =
5254  current_global_right_closest_points(i);
5255 
5256  //set the lower bound weight to the weight we have seen.
5257  current_cut_lower_bound_weights(i) = seen_weight_in_part;
5258 
5259  // compare the upper bound with what has been found in the
5260  // last iteration.
5261  // we try to make more strict bounds for the cut here.
5262  for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5263  mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5264  mj_scalar_t line_weight =
5265  current_global_part_weights(ii * 2 + 1);
5266  if(p_weight >= expected_weight_in_part) {
5267  // if a cut on the right has the expected weight, then we found
5268  // our cut position. Set up and low coordiantes to this
5269  // new cut coordinate, but we need one more iteration to
5270  // finalize the cut position, as wee need to update the part ids.
5271  if(p_weight == expected_weight_in_part) {
5272  current_cut_upper_bounds(i) =
5273  current_cut_coordinates(ii);
5274  current_cut_upper_weights(i) = p_weight;
5275  current_cut_lower_bounds(i) =
5276  current_cut_coordinates(ii);
5277  current_cut_lower_bound_weights(i) = p_weight;
5278  } else if(p_weight < current_cut_upper_weights(i)) {
5279  // if a part weight is larger then my expected weight,
5280  // but lower than my upper bound weight, update upper bound.
5281  current_cut_upper_bounds(i) =
5282  current_global_left_closest_points(ii);
5283  current_cut_upper_weights(i) = p_weight;
5284  }
5285  break;
5286  }
5287  // if comes here then pw < ew
5288  // then compare the weight against line weight.
5289  if(line_weight >= expected_weight_in_part) {
5290  // if the line is larger than the expected weight, then we need
5291  // to reach to the balance by distributing coordinates on
5292  // this line.
5293  current_cut_upper_bounds(i) =
5294  current_cut_coordinates(ii);
5295  current_cut_upper_weights(i) = line_weight;
5296  current_cut_lower_bounds(i) =
5297  current_cut_coordinates(ii);
5298  current_cut_lower_bound_weights(i) = p_weight;
5299  break;
5300  }
5301  // if a stricter lower bound is found,
5302  // update the lower bound.
5303  if(p_weight <= expected_weight_in_part && p_weight >=
5304  current_cut_lower_bound_weights(i)) {
5305  current_cut_lower_bounds(i) =
5306  current_global_right_closest_points(ii);
5307  current_cut_lower_bound_weights(i) = p_weight;
5308  }
5309  }
5310 
5311  mj_scalar_t new_cut_position = 0;
5312  algMJ_t::mj_calculate_new_cut_position(
5313  current_cut_upper_bounds(i),
5314  current_cut_lower_bounds(i),
5315  current_cut_upper_weights(i),
5316  current_cut_lower_bound_weights(i),
5317  expected_weight_in_part, new_cut_position,
5318  _sEpsilon);
5319 
5320  // if cut line does not move significantly.
5321  // then finalize the search.
5322  if(std::abs(current_cut_coordinates(i) -
5323  new_cut_position) < local_sEpsilon) {
5324  current_cut_line_determined(i) = true;
5325  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5326 
5327  //set the cut coordinate and proceed.
5328  new_current_cut_coordinates(i) =
5329  current_cut_coordinates(i);
5330  } else {
5331  new_current_cut_coordinates(i) = new_cut_position;
5332  }
5333  } // bContinue
5334  } else {
5335  // need to move the cut line to left.
5336  // set upper bound to current line.
5337  current_cut_upper_bounds(i) =
5338  current_global_left_closest_points(i);
5339  current_cut_upper_weights(i) =
5340  seen_weight_in_part;
5341  // compare the current cut line weights with
5342  // previous upper and lower bounds.
5343  for(int ii = i - 1; ii >= 0; --ii) {
5344  mj_scalar_t p_weight =
5345  current_global_part_weights(ii * 2);
5346  mj_scalar_t line_weight =
5347  current_global_part_weights(ii * 2 + 1);
5348  if(p_weight <= expected_weight_in_part) {
5349  if(p_weight == expected_weight_in_part) {
5350  // if the weight of the part is my expected weight
5351  // then we find the solution.
5352  current_cut_upper_bounds(i) =
5353  current_cut_coordinates(ii);
5354  current_cut_upper_weights(i) = p_weight;
5355  current_cut_lower_bounds(i) =
5356  current_cut_coordinates(ii);
5357  current_cut_lower_bound_weights(i) = p_weight;
5358  }
5359  else if(p_weight > current_cut_lower_bound_weights(i)) {
5360  // if found weight is bigger than the lower bound
5361  // then update the lower bound.
5362  current_cut_lower_bounds(i) =
5363  current_global_right_closest_points(ii);
5364  current_cut_lower_bound_weights(i) = p_weight;
5365 
5366  // at the same time, if weight of line is bigger than the
5367  // expected weight, then update the upper bound as well.
5368  // in this case the balance will be obtained by distributing
5369  // weights on this cut position.
5370  if(line_weight > expected_weight_in_part) {
5371  current_cut_upper_bounds(i) =
5372  current_global_right_closest_points(ii);
5373  current_cut_upper_weights(i) = line_weight;
5374  }
5375  }
5376  break;
5377  }
5378  // if the weight of the cut on the left is still bigger than
5379  // my weight, and also if the weight is smaller than the current
5380  // upper weight, or if the weight is equal to current upper
5381  // weight, but on the left of the upper weight, then update
5382  // upper bound.
5383  if(p_weight >= expected_weight_in_part &&
5384  (p_weight < current_cut_upper_weights(i) ||
5385  (p_weight == current_cut_upper_weights(i) &&
5386  current_cut_upper_bounds(i) >
5387  current_global_left_closest_points(ii)))) {
5388  current_cut_upper_bounds(i) =
5389  current_global_left_closest_points(ii);
5390  current_cut_upper_weights(i) = p_weight;
5391  }
5392  }
5393  mj_scalar_t new_cut_position = 0;
5394  algMJ_t::mj_calculate_new_cut_position(
5395  current_cut_upper_bounds(i),
5396  current_cut_lower_bounds(i),
5397  current_cut_upper_weights(i),
5398  current_cut_lower_bound_weights(i),
5399  expected_weight_in_part,
5400  new_cut_position,
5401  _sEpsilon);
5402 
5403  // if cut line does not move significantly.
5404  if(std::abs(current_cut_coordinates(i) -
5405  new_cut_position) < local_sEpsilon) {
5406  current_cut_line_determined(i) = true;
5407  Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5408  //set the cut coordinate and proceed.
5409  new_current_cut_coordinates(i) =
5410  current_cut_coordinates(i);
5411  } else {
5412  new_current_cut_coordinates(i) =
5413  new_cut_position;
5414  }
5415  }
5416  }; // bContinue
5417  });
5418 
5419  team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5420  });
5421 
5422  // view_rectilinear_cut_count
5423  mj_part_t rectilinear_cut_count;
5424  Kokkos::parallel_reduce("Read bDoingWork",
5425  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5426  KOKKOS_LAMBDA(int dummy, int & set_single) {
5427  set_single = view_rectilinear_cut_count(0);
5428  }, rectilinear_cut_count);
5429 
5430  if(rectilinear_cut_count > 0) {
5431  auto host_local_process_rectilinear_cut_weight =
5432  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5433  local_process_rectilinear_cut_weight);
5434  auto host_local_global_rectilinear_cut_weight =
5435  Kokkos::create_mirror_view(Kokkos::HostSpace(),
5436  local_global_rectilinear_cut_weight);
5437  Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5438  local_process_rectilinear_cut_weight);
5439  Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5440  local_global_rectilinear_cut_weight);
5441  Teuchos::scan<int,mj_scalar_t>(
5442  *comm, Teuchos::REDUCE_SUM,
5443  num_cuts,
5444  host_local_process_rectilinear_cut_weight.data(),
5445  host_local_global_rectilinear_cut_weight.data());
5446  Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5447  host_local_process_rectilinear_cut_weight);
5448  Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5449  host_local_global_rectilinear_cut_weight);
5450 
5451  Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5452  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5453  KOKKOS_LAMBDA(int dummy) {
5454  for(mj_part_t i = 0; i < num_cuts; ++i) {
5455  // if cut line weight to be distributed.
5456  if(local_global_rectilinear_cut_weight(i) > 0) {
5457  // expected weight to go to left of the cut.
5458  mj_scalar_t expected_part_weight = current_part_target_weights(i);
5459  // the weight that should be put to left of the cut.
5460  mj_scalar_t necessary_weight_on_line_for_left =
5461  expected_part_weight - current_global_part_weights(i * 2);
5462 
5463  // the weight of the cut in the process
5464  mj_scalar_t my_weight_on_line =
5465  local_process_rectilinear_cut_weight(i);
5466 
5467  // the sum of the cut weights upto this process,
5468  // including the weight of this process.
5469  mj_scalar_t weight_on_line_upto_process_inclusive =
5470  local_global_rectilinear_cut_weight(i);
5471  // the space on the left side of the cut after all processes
5472  // before this process (including this process)
5473  // puts their weights on cut to left.
5474  mj_scalar_t space_to_put_left =
5475  necessary_weight_on_line_for_left -
5476  weight_on_line_upto_process_inclusive;
5477  // add my weight to this space to find out how much space
5478  // is left to me.
5479  mj_scalar_t space_left_to_me =
5480  space_to_put_left + my_weight_on_line;
5481 
5482  /*
5483  cout << "expected_part_weight:" << expected_part_weight
5484  << " necessary_weight_on_line_for_left:"
5485  << necessary_weight_on_line_for_left
5486  << " my_weight_on_line" << my_weight_on_line
5487  << " weight_on_line_upto_process_inclusive:"
5488  << weight_on_line_upto_process_inclusive
5489  << " space_to_put_left:" << space_to_put_left
5490  << " space_left_to_me" << space_left_to_me << endl;
5491  */
5492 
5493  if(space_left_to_me < 0) {
5494  // space_left_to_me is negative and i dont need to put
5495  // anything to left.
5496  current_part_cut_line_weight_to_put_left(i) = 0;
5497  }
5498  else if(space_left_to_me >= my_weight_on_line) {
5499  // space left to me is bigger than the weight of the
5500  // processor on cut.
5501  // so put everything to left.
5502  current_part_cut_line_weight_to_put_left(i) =
5503  my_weight_on_line;
5504  // cout << "setting current_part_cut_line_weight_to_put_left
5505  // to my_weight_on_line:" << my_weight_on_line << endl;
5506  }
5507  else {
5508  // put only the weight as much as the space.
5509  current_part_cut_line_weight_to_put_left(i) =
5510  space_left_to_me;
5511  // cout << "setting current_part_cut_line_weight_to_put_left
5512  // to space_left_to_me:" << space_left_to_me << endl;
5513  }
5514  }
5515  }
5516  view_rectilinear_cut_count(0) = 0;
5517  });
5518  }
5519 
5520  Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5521 }
5522 
5532 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5533  typename mj_part_t, typename mj_node_t>
5534 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5535  get_processor_num_points_in_parts(
5536  mj_part_t num_procs,
5537  mj_part_t num_parts,
5538  mj_gno_t *&num_points_in_all_processor_parts)
5539 {
5540  // initially allocation_size is num_parts
5541  size_t allocation_size = num_parts * (num_procs + 1);
5542 
5543  // this will be output
5544  // holds how many each processor has in each part.
5545  // last portion is the sum of all processor points in each part.
5546 
5547  // allocate memory for the local num coordinates in each part.
5548  mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5549  new mj_gno_t[allocation_size];
5550 
5551  // this is the portion of the memory which will be used
5552  // at the summation to obtain total number of processors' points in each part.
5553  mj_gno_t *my_local_points_to_reduce_sum =
5554  num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5555 
5556  // this is the portion of the memory where each stores its local number.
5557  // this information is needed by other processors.
5558  mj_gno_t *my_local_point_counts_in_each_part =
5559  num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5560 
5561  // initialize the array with 0's.
5562  memset(num_local_points_in_each_part_to_reduce_sum, 0,
5563  sizeof(mj_gno_t)*allocation_size);
5564 
5565  auto local_new_part_xadj = this->new_part_xadj;
5566  Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5567  Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5568  Kokkos::parallel_for("get vals on device",
5569  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5570  (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5571  points_per_part(i) =
5572  local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5573  });
5574  auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5575  Kokkos::deep_copy(host_points_per_part, points_per_part);
5576  for(int i = 0; i < num_parts; ++i) {
5577  my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5578  }
5579 
5580  // copy the local num parts to the last portion of array, so that this portion
5581  // will represent the global num points in each part after the reduction.
5582  memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5583  sizeof(mj_gno_t) * (num_parts) );
5584 
5585  // reduceAll operation.
5586  // the portion that belongs to a processor with index p
5587  // will start from myRank * num_parts.
5588  // the global number of points will be held at the index
5589  try{
5590  reduceAll<int, mj_gno_t>(
5591  *(this->comm),
5592  Teuchos::REDUCE_SUM,
5593  allocation_size,
5594  num_local_points_in_each_part_to_reduce_sum,
5595  num_points_in_all_processor_parts);
5596  }
5597  Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5598 
5599  delete [] num_local_points_in_each_part_to_reduce_sum;
5600 }
5601 
5617 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5618  typename mj_part_t, typename mj_node_t>
5619 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5620  mj_check_to_migrate(
5621  size_t migration_reduce_all_population,
5622  mj_lno_t num_coords_for_last_dim_part,
5623  mj_part_t num_procs,
5624  mj_part_t num_parts,
5625  mj_gno_t *num_points_in_all_processor_parts)
5626 {
5627  // if reduce all count and population in the last dim is too high
5628  if(migration_reduce_all_population > future_reduceall_cutoff) {
5629  return true;
5630  }
5631 
5632  // if the work in a part per processor in the last dim is too low.
5633  if(num_coords_for_last_dim_part < min_work_last_dim) {
5634  return true;
5635  }
5636 
5637  // if migration is to be checked and the imbalance is too high
5638  if(this->check_migrate_avoid_migration_option == 0) {
5639  double global_imbalance = 0;
5640  // global shift to reach the sum of coordiante count in each part.
5641  size_t global_shift = num_procs * num_parts;
5642 
5643  for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5644  for(mj_part_t i = 0; i < num_parts; ++i) {
5645  double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5646  / double(num_procs);
5647 
5648  global_imbalance += std::abs(ideal_num -
5649  num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5650  }
5651  }
5652  global_imbalance /= num_parts;
5653  global_imbalance /= num_procs;
5654 
5655  if(global_imbalance <= this->minimum_migration_imbalance) {
5656  return false;
5657  }
5658  else {
5659  return true;
5660  }
5661  }
5662  else {
5663  // if migration is forced
5664  return true;
5665  }
5666 }
5667 
/*! \brief Fills \c coordinate_destinations with the target processor rank of
 * each local coordinate, for the case where each part is shared by a chain
 * of processors.
 *
 * For every part p, the processors that will receive its points form a
 * singly linked list: the head is part_assignment_proc_begin_indices[p]
 * and the successor of processor q is processor_chains_in_parts[q]
 * (terminated by -1). Points are streamed to the head processor until its
 * quota (send_count_to_each_proc[head]) is reached, then the head is popped
 * and the next processor in the chain takes over.
 *
 * \param num_parts number of parts that exist in the current partitioning.
 * \param part_assignment_proc_begin_indices head of the processor chain for
 *        each part; consumed (heads are advanced) as quotas fill up.
 * \param processor_chains_in_parts next-pointer array of the chains;
 *        entries are set to -1 once the processor is popped.
 * \param send_count_to_each_proc how many points this rank sends to each
 *        processor (the per-processor quota).
 * \param coordinate_destinations output: destination rank for each local
 *        coordinate, indexed by local coordinate index.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  assign_send_destinations(
  mj_part_t num_parts,
  mj_part_t *part_assignment_proc_begin_indices,
  mj_part_t *processor_chains_in_parts,
  mj_lno_t *send_count_to_each_proc,
  int *coordinate_destinations) {

  // Bring the device-side part offsets and permutation down to host;
  // the chain walking below is inherently sequential.
  auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
  deep_copy(host_new_part_xadj, this->new_part_xadj);

  auto host_new_coordinate_permutations =
    Kokkos::create_mirror_view(this->new_coordinate_permutations);
  deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);

  for(mj_part_t p = 0; p < num_parts; ++p) {
    // new_part_xadj is a prefix-sum: part p occupies
    // [host_new_part_xadj(p-1), host_new_part_xadj(p)) in the permutation.
    mj_lno_t part_begin = 0;
    if(p > 0) part_begin = host_new_part_xadj(p - 1);
    mj_lno_t part_end = host_new_part_xadj(p);
    // get the first part that current processor will send its part-p.
    mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
    // initialize how many point I sent to this processor.
    mj_lno_t num_total_send = 0;
    for(mj_lno_t j=part_begin; j < part_end; j++) {
      mj_lno_t local_ind = host_new_coordinate_permutations(j);
      // A `while` (not `if`): a processor in the chain may have a quota
      // of zero, in which case it is skipped immediately.
      while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
        // then get the next processor to send the points in part p.
        num_total_send = 0;
        // assign new processor to part_assign_begin[p]
        part_assignment_proc_begin_indices[p] =
          processor_chains_in_parts[proc_to_sent];
        // remove the previous processor
        processor_chains_in_parts[proc_to_sent] = -1;
        // choose the next processor as the next one to send.
        proc_to_sent = part_assignment_proc_begin_indices[p];
      }
      // write the gno index to corresponding position in sendBuf.
      coordinate_destinations[local_ind] = proc_to_sent;
      ++num_total_send;
    }
  }
}
5725 
/*! \brief Assigns groups of processors to parts when there are more
 * processors than parts, and fills the send-plan for this rank.
 *
 * Each part receives a number of processors proportional to its point
 * count (leftover processors go to the most under-served part). Within a
 * part, the heaviest unassigned processors are chosen to minimize
 * migration. Non-assigned processors then distribute their points over
 * the assigned ones, either balancing space (migration_type 0) or sending
 * everything to a single target (other migration types).
 *
 * \param num_points_in_all_processor_parts per-processor per-part counts;
 *        global per-part totals follow at offset num_procs*num_parts.
 * \param num_parts number of parts in the current partitioning.
 * \param num_procs number of processors attending.
 * \param send_count_to_each_proc output: points this rank sends to each rank.
 * \param processor_ranks_for_subcomm output: ranks of this rank's subgroup.
 * \param next_future_num_parts_in_parts future part counts per part; used
 *        to shift the output part numbering.
 * \param out_part_index output: the single part this rank is assigned to.
 * \param output_part_numbering_begin_index in/out: global numbering offset
 *        of this rank's output parts.
 * \param coordinate_destinations output: destination rank per coordinate.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_assign_proc_to_parts(
  mj_gno_t * num_points_in_all_processor_parts,
  mj_part_t num_parts,
  mj_part_t num_procs,
  mj_lno_t *send_count_to_each_proc,
  std::vector<mj_part_t> &processor_ranks_for_subcomm,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &out_part_index,
  mj_part_t &output_part_numbering_begin_index,
  int * coordinate_destinations) {
  // Global totals per part live after the num_procs*num_parts local counts.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * num_parts;
  mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];

  // boolean variable if the process finds its part to be assigned.
  bool did_i_find_my_group = false;

  mj_part_t num_free_procs = num_procs;
  mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;

  double max_imbalance_difference = 0;
  mj_part_t max_differing_part = 0;

  // find how many processor each part requires.
  for(mj_part_t i = 0; i < num_parts; i++) {

    // scalar portion of the required processors
    double scalar_required_proc = num_procs *
      (double (global_num_points_in_parts[i]) /
      double (this->num_global_coords));

    // round it to closest integer; make sure have at least one proc.
    mj_part_t required_proc =
      static_cast<mj_part_t> (0.5 + scalar_required_proc);
    if(required_proc == 0) required_proc = 1;

    // if assigning the required num procs, creates problems for the rest
    // of the parts, then only assign {num_free_procs -
    // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
    if(num_free_procs -
      required_proc < minimum_num_procs_required_for_rest_of_parts) {
      required_proc = num_free_procs -
        (minimum_num_procs_required_for_rest_of_parts);
    }

    // reduce the free processor count
    num_free_procs -= required_proc;

    // reduce the free minimum processor count required for the rest of the
    // part by 1.
    --minimum_num_procs_required_for_rest_of_parts;

    // part (i) is assigned to (required_proc) processors.
    num_procs_assigned_to_each_part[i] = required_proc;

    // because of the roundings some processors might be left as unassigned.
    // we want to assign those processors to the part with most imbalance.
    // find the part with the maximum imbalance here.
    double imbalance_wrt_ideal =
      (scalar_required_proc - required_proc) / required_proc;
    if(imbalance_wrt_ideal > max_imbalance_difference) {
      max_imbalance_difference = imbalance_wrt_ideal;
      max_differing_part = i;
    }
  }

  // assign extra processors to the part with maximum imbalance
  // than the ideal.
  if(num_free_procs > 0) {
    num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
  }

  // now find what are the best processors with least migration for each part.

  // part_assignment_proc_begin_indices ([i]) is the array that holds the
  // beginning index of a processor that processor sends its data for part - i
  mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];

  // the next processor send is found in processor_chains_in_parts,
  // in linked list manner.
  mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
  mj_part_t *processor_part_assignments = new mj_part_t[num_procs];

  // initialize the assignment of each processor.
  // this has a linked list implementation.
  // the beginning of processors assigned
  // to each part is hold at part_assignment_proc_begin_indices[part].
  // then the next processor assigned to that part is located at
  // proc_part_assignments[part_assign_begins[part]], this is a chain
  // until the value of -1 is reached.
  for(int i = 0; i < num_procs; ++i ) {
    processor_part_assignments[i] = -1;
    processor_chains_in_parts[i] = -1;
  }
  for(int i = 0; i < num_parts; ++i ) {
    part_assignment_proc_begin_indices[i] = -1;
  }

  // std::cout << "Before migration: mig type:" <<
  // this->migration_type << std::endl;
  // Allocate memory for sorting data structure.
  // The extra char is a sign bit: uqSignsort orders items with signbit==0
  // (already-assigned processors) before items with signbit==1.
  uSignedSortItem<mj_part_t, mj_gno_t, char> *
    sort_item_num_part_points_in_procs =
    new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];

  for(mj_part_t i = 0; i < num_parts; ++i) {
    // the algorithm tries to minimize the cost of migration, by assigning the
    // processors with highest number of coordinates on that part.
    // here we might want to implement a maximum weighted bipartite matching
    // algorithm.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      sort_item_num_part_points_in_procs[ii].id = ii;
      // if processor is not assigned yet.
      // add its num points to the sort data structure.
      if(processor_part_assignments[ii] == -1) {
        sort_item_num_part_points_in_procs[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
        // indicate that the processor has positive weight.
        sort_item_num_part_points_in_procs[ii].signbit = 1;
      }
      else {
        // if processor is already assigned, insert -nLocal - 1 so that it
        // won't be selected again.
        // would be same if we simply set it to -1, but more information with
        // no extra cost (which is used later) is provided.
        // sort_item_num_part_points_in_procs[ii].val =
        // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;

        // UPDATE: Since above gets warning when unsigned is used to
        // represent, we added extra bit to as sign bit to the sort item.
        // It is 1 for positives, 0 for negatives.
        sort_item_num_part_points_in_procs[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
        sort_item_num_part_points_in_procs[ii].signbit = 0;
      }
    }

    // sort the processors in the part.
    uqSignsort<mj_part_t, mj_gno_t,char>
      (num_procs, sort_item_num_part_points_in_procs);

    /*
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      std::cout << "ii:" << ii << " " <<
        sort_item_num_part_points_in_procs[ii].id <<
        " " << sort_item_num_part_points_in_procs[ii].val <<
        " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
        std::endl;
    }
    */

    mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
    mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
    mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
      ceil(total_num_points_in_part / double (required_proc_count)));

    // starts sending to least heaviest part.
    // After the sort, the required_proc_count heaviest (assignable)
    // processors occupy indices [num_procs - required_proc_count, num_procs).
    mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
    mj_part_t next_proc_to_send_id =
      sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
    mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
      sort_item_num_part_points_in_procs[next_proc_to_send_index].val;

    // find the processors that will be assigned to this part, which are the
    // heaviest non assigned processors.
    for(mj_part_t ii = num_procs - 1;
      ii >= num_procs - required_proc_count; --ii) {
      mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
      // assign processor to part - i.
      processor_part_assignments[proc_id] = i;
    }

    bool did_change_sign = false;
    // if processor has a minus count, reverse it.
    // Negative (signbit==0) items sort to the front, so we can stop at the
    // first positive one.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
      // TODO: SEE BUG 6194
      if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
        did_change_sign = true;
        sort_item_num_part_points_in_procs[ii].signbit = 1;
      }
      else {
        break;
      }
    }

    if(did_change_sign) {
      // resort the processors in the part for the rest of the processors that
      // is not assigned.
      uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
        sort_item_num_part_points_in_procs);
    }

    /*
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      std::cout << "after resort ii:" << ii << " " <<
        sort_item_num_part_points_in_procs[ii].id <<
        " " << sort_item_num_part_points_in_procs[ii].val <<
        " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
        std::endl;
    }
    */

    // check if this processors is one of the procs assigned to this part.
    // if it is, then get the group.
    if(!did_i_find_my_group) {
      for(mj_part_t ii = num_procs - 1; ii >=
        num_procs - required_proc_count; --ii) {

        mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;

        // add the proc to the group.
        processor_ranks_for_subcomm.push_back(proc_id_to_assign);

        if(proc_id_to_assign == this->myRank) {
          // if the assigned process is me, then I find my group.
          did_i_find_my_group = true;

          // set the beginning of part i to my rank.
          part_assignment_proc_begin_indices[i] = this->myRank;
          processor_chains_in_parts[this->myRank] = -1;

          // set send count to myself to the number of points that I have
          // in part i.
          send_count_to_each_proc[this->myRank] =
            sort_item_num_part_points_in_procs[ii].val;

          // calculate the shift required for the
          // output_part_numbering_begin_index
          for(mj_part_t in = 0; in < i; ++in) {
            output_part_numbering_begin_index +=
              (*next_future_num_parts_in_parts)[in];
          }
          out_part_index = i;
        }
      }

      // if these was not my group,
      // clear the subcomminicator processor array.
      if(!did_i_find_my_group) {
        processor_ranks_for_subcomm.clear();
      }
    }

    // send points of the nonassigned coordinates to the assigned coordinates.
    // starts from the heaviest nonassigned processor.
    // TODO we might want to play with this part, that allows more
    // computational imbalance but having better communication balance.
    for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
      mj_part_t nonassigned_proc_id =
        sort_item_num_part_points_in_procs[ii].id;
      mj_lno_t num_points_to_sent =
        sort_item_num_part_points_in_procs[ii].val;

      // we set number of points to -to_sent - 1 for the assigned processors.
      // we reverse it here. This should not happen, as we have already
      // reversed them above.
#ifdef MJ_DEBUG
      if(num_points_to_sent < 0) {
        cout << "Migration - processor assignments - for part:" << i
          << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
          << num_points_to_sent << std::endl;
        std::terminate();
      }
#endif

      switch (migration_type) {
        case 0:
        {
          // migration_type 0: balance space; a sender may split its points
          // over several receivers until each receiver is filled to
          // ideal_num_points_in_a_proc.
          // now sends the points to the assigned processors.
          while (num_points_to_sent > 0) {
            // if the processor has enough space.
            if(num_points_to_sent <= space_left_in_sent_proc) {
              // reduce the space left in the processor.
              space_left_in_sent_proc -= num_points_to_sent;
              // if my rank is the one that is sending the coordinates.
              if(this->myRank == nonassigned_proc_id) {
                // set my sent count to the sent processor.
                send_count_to_each_proc[next_proc_to_send_id] =
                  num_points_to_sent;
                // save the processor in the list (processor_chains_in_parts
                // and part_assignment_proc_begin_indices)
                // that the processor will send its point in part-i.
                mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
                part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
                processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
              }
              num_points_to_sent = 0;
            }
            else {
              // there might be no space left in the processor.
              if(space_left_in_sent_proc > 0) {
                num_points_to_sent -= space_left_in_sent_proc;

                //send as the space left in the processor.
                if(this->myRank == nonassigned_proc_id) {
                  // send as much as the space in this case.
                  send_count_to_each_proc[next_proc_to_send_id] =
                    space_left_in_sent_proc;
                  mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
                  part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
                  processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
                }
              }
              // change the sent part
              ++next_proc_to_send_index;

#ifdef MJ_DEBUG
              if(next_part_to_send_index < nprocs - required_proc_count ) {
                cout << "Migration - processor assignments - for part:"
                  << i
                  << " next_part_to_send :" << next_part_to_send_index
                  << " nprocs:" << nprocs
                  << " required_proc_count:" << required_proc_count
                  << " Error: next_part_to_send_index <" <<
                  << " nprocs - required_proc_count" << std::endl;
                std::terminate();
              }
#endif
              // send the new id.
              next_proc_to_send_id =
                sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
              // set the new space in the processor.
              space_left_in_sent_proc = ideal_num_points_in_a_proc -
                sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
            }
          }
        }
        break;
        default:
        {
          // to minimize messages, we want each processor to send its
          // coordinates to only a single point.
          // we do not respect imbalances here, we send all points to the
          // next processor.
          if(this->myRank == nonassigned_proc_id) {
            // set my sent count to the sent processor.
            send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
            // save the processor in the list (processor_chains_in_parts and
            // part_assignment_proc_begin_indices)
            // that the processor will send its point in part-i.
            mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
            part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
            processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
          }
          num_points_to_sent = 0;
          ++next_proc_to_send_index;

          // if we made it to the heaviest processor we round robin and
          // go to beginning
          if(next_proc_to_send_index == num_procs) {
            next_proc_to_send_index = num_procs - required_proc_count;
          }
          // send the new id.
          next_proc_to_send_id =
            sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
          // set the new space in the processor.
          space_left_in_sent_proc = ideal_num_points_in_a_proc -
            sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
        }
      }
    }
  }

  /*
  for(int i = 0; i < num_procs;++i) {
    std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
      send_count_to_each_proc[i] << std::endl;
  }
  */

  // Translate the per-part processor chains into a destination rank for
  // every local coordinate.
  this->assign_send_destinations(
    num_parts,
    part_assignment_proc_begin_indices,
    processor_chains_in_parts,
    send_count_to_each_proc,
    coordinate_destinations);
  delete [] part_assignment_proc_begin_indices;
  delete [] processor_chains_in_parts;
  delete [] processor_part_assignments;
  delete [] sort_item_num_part_points_in_procs;
  delete [] num_procs_assigned_to_each_part;
}
6132 
6148 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6149  typename mj_part_t, typename mj_node_t>
6150 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6151  assign_send_destinations2(
6152  mj_part_t num_parts,
6153  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6154  int *coordinate_destinations,
6155  mj_part_t &output_part_numbering_begin_index,
6156  std::vector<mj_part_t> *next_future_num_parts_in_parts)
6157 {
6158  mj_part_t part_shift_amount = output_part_numbering_begin_index;
6159  mj_part_t previous_processor = -1;
6160 
6161  auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6162  Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6163 
6164  auto local_new_coordinate_permutations =
6165  Kokkos::create_mirror_view(this->new_coordinate_permutations);
6166  Kokkos::deep_copy(local_new_coordinate_permutations,
6167  this->new_coordinate_permutations);
6168 
6169  for(mj_part_t i = 0; i < num_parts; ++i) {
6170  mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6171 
6172  // assigned processors are sorted.
6173  mj_lno_t part_begin_index = 0;
6174 
6175  if(p > 0) {
6176  part_begin_index = local_new_part_xadj(p - 1);
6177  }
6178 
6179  mj_lno_t part_end_index = local_new_part_xadj(p);
6180 
6181  mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6182  if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6183  output_part_numbering_begin_index = part_shift_amount;
6184  }
6185  previous_processor = assigned_proc;
6186  part_shift_amount += (*next_future_num_parts_in_parts)[p];
6187 
6188  for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6189  mj_lno_t localInd = local_new_coordinate_permutations(j);
6190  coordinate_destinations[localInd] = assigned_proc;
6191  }
6192  }
6193 }
6194 
/*! \brief Assigns multiple parts to each processor when there are at least
 * as many parts as processors, and fills the send-plan for this rank.
 *
 * Parts are handled from heaviest to lightest; each part goes to the
 * processor with the most remaining capacity, preferring (among those)
 * the processor that already holds the most of the part's points, with
 * ties broken toward lower ranks for determinism. Every processor is
 * forced to receive at least one part.
 *
 * \param num_points_in_all_processor_parts per-processor per-part counts;
 *        global per-part totals follow at offset num_procs*num_parts.
 * \param num_parts number of parts in the current partitioning.
 * \param num_procs number of processors attending.
 * \param send_count_to_each_proc output: points this rank sends to each rank.
 * \param next_future_num_parts_in_parts future part counts per part.
 * \param out_num_part output: how many parts this rank received.
 * \param out_part_indices output: indices of the parts this rank received.
 * \param output_part_numbering_begin_index in/out: numbering offset of this
 *        rank's output parts.
 * \param coordinate_destinations output: destination rank per coordinate.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  mj_assign_parts_to_procs(
  mj_gno_t * num_points_in_all_processor_parts,
  mj_part_t num_parts,
  mj_part_t num_procs,
  mj_lno_t *send_count_to_each_proc,
  std::vector<mj_part_t> *next_future_num_parts_in_parts,
  mj_part_t &out_num_part,
  std::vector<mj_part_t> &out_part_indices,
  mj_part_t &output_part_numbering_begin_index,
  int *coordinate_destinations) {

  out_num_part = 0;
  // Global totals per part live after the num_procs*num_parts local counts.
  mj_gno_t *global_num_points_in_parts =
    num_points_in_all_processor_parts + num_procs * num_parts;
  out_part_indices.clear();

  // to sort the parts that is assigned to the processors.
  // id is the part number, sort value is the assigned processor id.
  uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
    new uSortItem<mj_part_t, mj_part_t>[num_parts];
  uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
    new uSortItem<mj_part_t, mj_gno_t>[num_procs];

  // calculate the optimal number of coordinates that should be assigned
  // to each processor.
  mj_lno_t work_each =
    mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);

  // to hold the left space as the number of coordinates to the optimal
  // number in each proc.
  mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];

  // initialize left space in each.
  for(mj_part_t i = 0; i < num_procs; ++i) {
    space_in_each_processor[i] = work_each;
  }

  // we keep track of how many parts each processor is assigned to.
  // because in some weird inputs, it might be possible that some
  // processors is not assigned to any part. Using these variables,
  // we force each processor to have at least one part.
  mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
  memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
  int empty_proc_count = num_procs;

  // to sort the parts with decreasing order of their coordiantes.
  // id are the part numbers, sort value is the number of points in each.
  uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
    new uSortItem<mj_part_t, mj_gno_t>[num_parts];

  // initially we will sort the parts according to the number of coordinates
  // they have, so that we will start assigning with the part that has the most
  // number of coordinates.
  for(mj_part_t i = 0; i < num_parts; ++i) {
    sort_item_point_counts_in_parts[i].id = i;
    sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
  }

  // sort parts with increasing order of loads.
  uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);

  // assigning parts to the processors
  // traverse the part with decreasing order of load.
  // first assign the heaviest part.
  for(mj_part_t j = 0; j < num_parts; ++j) {
    // sorted with increasing order, traverse inverse.
    mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;

    // load of the part
    mj_gno_t load = global_num_points_in_parts[i];

    // assigned processors
    mj_part_t assigned_proc = -1;

    // sort processors with increasing number of points in this part.
    for(mj_part_t ii = 0; ii < num_procs; ++ii) {
      sort_item_num_points_of_proc_in_part_i[ii].id = ii;

      // if there are still enough parts to fill empty processors, than proceed
      // normally, but if empty processor count is equal to the number of part,
      // then we force to part assignments only to empty processors.
      if(empty_proc_count < num_parts - j ||
        num_parts_proc_assigned[ii] == 0) {
        // how many points processor ii has in part i?
        sort_item_num_points_of_proc_in_part_i[ii].val =
          num_points_in_all_processor_parts[ii * num_parts + i];
      }
      else {
        // Non-empty processor while empty ones must be filled first:
        // -1 sorts it to the bottom so it cannot be chosen.
        sort_item_num_points_of_proc_in_part_i[ii].val = -1;
      }
    }

    uqsort<mj_part_t, mj_gno_t>(num_procs,
      sort_item_num_points_of_proc_in_part_i);

    // traverse all processors with decreasing load.
    for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
      mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
      if(assigned_proc == -1 ||
        (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
        assigned_proc = ii;
      }
      else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
        if(ii < assigned_proc) {
          // ties go to lower proc
          // not necessary for a valid result but allows testing to compare
          // MPI results and have parts numbers assigned to the same boxes.
          // We don't break here because we may have more ties still to check.
          // The indeterminate state before this is due to Cuda using
          // atomics to refill the permutation array. So non-cuda runs don't
          // actualy need this since they will always have the same pattern.
          assigned_proc = ii;
        }
      }
      else {
        break; // now we can break - we have our part and no more ties.
      }
    }

    // First part for this processor: one fewer empty processor remains.
    if(num_parts_proc_assigned[assigned_proc]++ == 0) {
      --empty_proc_count;
    }

    space_in_each_processor[assigned_proc] -= load;
    //to sort later, part-i is assigned to the proccessor - assignment.
    sort_item_part_to_proc_assignment[j].id = i; //part i

    // assigned to processor - assignment.
    sort_item_part_to_proc_assignment[j].val = assigned_proc;

    // if assigned processor is me, increase the number.
    if(assigned_proc == this->myRank) {
      out_num_part++;//assigned_part_count;
      out_part_indices.push_back(i);
    }

    // increase the send to that processor by the number of points in that
    // part, as everyone send their coordiantes in this part to the
    // processor assigned to this part.
    send_count_to_each_proc[assigned_proc] +=
      num_points_in_all_processor_parts[this->myRank * num_parts + i];
  }

  delete [] num_parts_proc_assigned;
  delete [] sort_item_num_points_of_proc_in_part_i;
  delete [] sort_item_point_counts_in_parts;
  delete [] space_in_each_processor;

  // sort assignments with respect to the assigned processors.
  uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);

  // fill sendBuf.
  this->assign_send_destinations2(
    num_parts,
    sort_item_part_to_proc_assignment,
    coordinate_destinations,
    output_part_numbering_begin_index,
    next_future_num_parts_in_parts);

  delete [] sort_item_part_to_proc_assignment;
}
6380 
6381 
6405 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6406  typename mj_part_t, typename mj_node_t>
6407 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6408  mj_migration_part_proc_assignment(
6409  mj_gno_t * num_points_in_all_processor_parts,
6410  mj_part_t num_parts,
6411  mj_part_t num_procs,
6412  mj_lno_t *send_count_to_each_proc,
6413  std::vector<mj_part_t> &processor_ranks_for_subcomm,
6414  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6415  mj_part_t &out_num_part,
6416  std::vector<mj_part_t> &out_part_indices,
6417  mj_part_t &output_part_numbering_begin_index,
6418  int *coordinate_destinations)
6419 {
6420  processor_ranks_for_subcomm.clear();
6421  // if(this->num_local_coords > 0)
6422  if(num_procs > num_parts) {
6423  // if there are more processors than the number of current part
6424  // then processors share the existing parts.
6425  // at the end each processor will have a single part,
6426  // but a part will be shared by a group of processors.
6427  mj_part_t out_part_index = 0;
6428 
6429  this->mj_assign_proc_to_parts(
6430  num_points_in_all_processor_parts,
6431  num_parts,
6432  num_procs,
6433  send_count_to_each_proc,
6434  processor_ranks_for_subcomm,
6435  next_future_num_parts_in_parts,
6436  out_part_index,
6437  output_part_numbering_begin_index,
6438  coordinate_destinations
6439  );
6440 
6441  out_num_part = 1;
6442  out_part_indices.clear();
6443  out_part_indices.push_back(out_part_index);
6444  }
6445  else {
6446 
6447  // there are more parts than the processors.
6448  // therefore a processor will be assigned multiple parts,
6449  // the subcommunicators will only have a single processor.
6450  processor_ranks_for_subcomm.push_back(this->myRank);
6451 
6452  // since there are more parts then procs,
6453  // assign multiple parts to processors.
6454 
6455  this->mj_assign_parts_to_procs(
6456  num_points_in_all_processor_parts,
6457  num_parts,
6458  num_procs,
6459  send_count_to_each_proc,
6460  next_future_num_parts_in_parts,
6461  out_num_part,
6462  out_part_indices,
6463  output_part_numbering_begin_index,
6464  coordinate_destinations);
6465  }
6466 }
6467 
6481 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6482  typename mj_part_t, typename mj_node_t>
6483 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6484  mj_migrate_coords(
6485  mj_part_t num_procs,
6486  mj_lno_t &num_new_local_points,
6487  std::string iteration,
6488  int *coordinate_destinations,
6489  mj_part_t num_parts)
6490 {
6491 
6492 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6493  if(sizeof(mj_lno_t) <= sizeof(int)) {
6494  // Cannot use Zoltan_Comm with local ordinals larger than ints.
6495  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6496  // may overflow.
6497  ZOLTAN_COMM_OBJ *plan = NULL;
6498  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6499  int num_incoming_gnos = 0;
6500  int message_tag = 7859;
6501 
6502  this->mj_env->timerStart(MACRO_TIMERS,
6503  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6504  int ierr = Zoltan_Comm_Create(
6505  &plan,
6506  int(this->num_local_coords),
6507  coordinate_destinations,
6508  mpi_comm,
6509  message_tag,
6510  &num_incoming_gnos);
6511 
6512  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6513  this->mj_env->timerStop(MACRO_TIMERS,
6514  mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6515 
6516  this->mj_env->timerStart(MACRO_TIMERS,
6517  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6518 
6519  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6520 
6521  // migrate gnos.
6522  {
6523  auto host_current_mj_gnos = Kokkos::create_mirror_view(
6524  Kokkos::HostSpace(), this->current_mj_gnos);
6525  Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6526  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6527  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6528  auto host_dst_gnos = Kokkos::create_mirror_view(
6529  Kokkos::HostSpace(), dst_gnos);
6530  message_tag++;
6531  ierr = Zoltan_Comm_Do(
6532  plan,
6533  message_tag,
6534  (char *) host_current_mj_gnos.data(),
6535  sizeof(mj_gno_t),
6536  (char *) host_dst_gnos.data());
6537  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6538  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6539  this->current_mj_gnos = dst_gnos;
6540  }
6541 
6542  //migrate coordinates
6543  {
6544  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6545  auto host_src_coordinates = Kokkos::create_mirror_view(
6546  Kokkos::HostSpace(), this->mj_coordinates);
6547  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6548  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6549  dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6550  num_incoming_gnos, this->coord_dim);
6551  auto host_dst_coordinates = Kokkos::create_mirror_view(
6552  Kokkos::HostSpace(), dst_coordinates);
6553  for(int i = 0; i < this->coord_dim; ++i) {
6554  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6555  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6556  Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6557  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6558  // Note Layout Left means we can do these in contiguous blocks
6559  message_tag++;
6560  ierr = Zoltan_Comm_Do(
6561  plan,
6562  message_tag,
6563  (char *) sub_host_src_coordinates.data(),
6564  sizeof(mj_scalar_t),
6565  (char *) sub_host_dst_coordinates.data());
6566  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6567  }
6568  deep_copy(dst_coordinates, host_dst_coordinates);
6569  this->mj_coordinates = dst_coordinates;
6570  }
6571 
6572  // migrate weights.
6573  {
6574  auto host_src_weights = Kokkos::create_mirror_view(
6575  Kokkos::HostSpace(), this->mj_weights);
6576  Kokkos::deep_copy(host_src_weights, this->mj_weights);
6577  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6578  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6579  num_incoming_gnos, this->num_weights_per_coord);
6580  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6581  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6582  auto sub_host_src_weights
6583  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6584  auto sub_host_dst_weights
6585  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6586  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6587  // Copy because of layout
6588  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6589  sent_weight[n] = sub_host_src_weights(n);
6590  }
6591  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6592  message_tag++;
6593  ierr = Zoltan_Comm_Do(
6594  plan,
6595  message_tag,
6596  (char *) sent_weight.getRawPtr(),
6597  sizeof(mj_scalar_t),
6598  (char *) received_weight.getRawPtr());
6599  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6600  // Again we copy by index due to layout
6601  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6602  sub_host_dst_weights(n) = received_weight[n];
6603  }
6604  }
6605  deep_copy(dst_weights, host_dst_weights);
6606  this->mj_weights = dst_weights;
6607  }
6608 
6609  // migrate owners.
6610  {
6611  // Note that owners we kept on Serial
6612  Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6613  Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6614  num_incoming_gnos);
6615  message_tag++;
6616  ierr = Zoltan_Comm_Do(
6617  plan,
6618  message_tag,
6619  (char *) owner_of_coordinate.data(),
6620  sizeof(int),
6621  (char *) dst_owners_of_coordinate.data());
6622  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6623  this->owner_of_coordinate = dst_owners_of_coordinate;
6624  }
6625 
6626  // if num procs is less than num parts,
6627  // we need the part assigment arrays as well, since
6628  // there will be multiple parts in processor.
6629  {
6630  auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6631  Kokkos::HostSpace(), this->assigned_part_ids);
6632  Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6633  Kokkos::View<int *, device_t> dst_assigned_part_ids(
6634  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6635  num_incoming_gnos);
6636  auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6637  Kokkos::HostSpace(), dst_assigned_part_ids);
6638  mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6639  if(num_procs < num_parts) {
6640  message_tag++;
6641  ierr = Zoltan_Comm_Do(
6642  plan,
6643  message_tag,
6644  (char *) host_src_assigned_part_ids.data(),
6645  sizeof(mj_part_t),
6646  (char *) host_dst_assigned_part_ids.data());
6647  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6648  Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6649  }
6650  // In original code this would just assign to an uninitialized array
6651  // if num_procs < num_parts. We're doing the same here.
6652  this->assigned_part_ids = dst_assigned_part_ids;
6653  }
6654 
6655  ierr = Zoltan_Comm_Destroy(&plan);
6656  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6657  num_new_local_points = num_incoming_gnos;
6658  this->mj_env->timerStop(MACRO_TIMERS,
6659  mj_timer_base_string + "Migration Z1Migration-" + iteration);
6660  }
6661  else
6662 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6663  {
6664  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6665  "Migration DistributorPlanCreating-" + iteration);
6666 
6667  Tpetra::Distributor distributor(this->comm);
6668  ArrayView<const mj_part_t> destinations( coordinate_destinations,
6669  this->num_local_coords);
6670  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6671  this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6672  "Migration DistributorPlanCreating-" + iteration);
6673  this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6674  "Migration DistributorMigration-" + iteration);
6675 
6676  // note MPI buffers should all be on Kokkos::HostSpace and not
6677  // Kokkos::CudaUVMSpace.
6678 
6679  // migrate gnos.
6680  {
6681  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6682  auto src_host_current_mj_gnos =
6683  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
6684  Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
6685  ArrayView<mj_gno_t> sent_gnos(
6686  src_host_current_mj_gnos.data(), this->num_local_coords);
6687  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6688  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6689  Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6690  auto host_current_mj_gnos = Kokkos::create_mirror_view(
6691  this->current_mj_gnos);
6692  memcpy(host_current_mj_gnos.data(),
6693  received_gnos.getRawPtr(), num_incoming_gnos * sizeof(mj_gno_t));
6694  Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
6695  }
6696 
6697  // migrate coordinates
6698  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6699  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6700  dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6701  auto host_dst_coordinates = Kokkos::create_mirror_view(dst_coordinates);
6702  auto host_src_coordinates = Kokkos::create_mirror_view(
6703  Kokkos::HostSpace(), this->mj_coordinates);
6704  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6705  for(int i = 0; i < this->coord_dim; ++i) {
6706  Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6707  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6708  auto sub_host_dst_coordinates
6709  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6710  // Note Layout Left means we can do these in contiguous blocks
6711  // This form was causing problems on cuda 10 pascal nodes, issue #6422
6712  // Doing a manual copy clears the error though it seems this is probably
6713  // just shifting some kind of race condition or UVM issue around. The
6714  // bug can be sensitive to simple changes like adding a printf log.
6715 
6716  // Using this form will segfault on cuda 10 pascal node
6717  //ArrayView<mj_scalar_t> sent_coord(
6718  // sub_host_src_coordinates.data(), this->num_local_coords);
6719 
6720  // Manual copy will clear the error but this is probably just due to
6721  // shifting some kind of race condition.
6722  ArrayRCP<mj_scalar_t> sent_coord(this->num_local_coords);
6723  for(int n = 0; n < this->num_local_coords; ++n) {
6724  sent_coord[n] = sub_host_src_coordinates[n];
6725  }
6726 
6727  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6728  distributor.doPostsAndWaits<mj_scalar_t>(
6729  sent_coord(), 1, received_coord());
6730  memcpy(sub_host_dst_coordinates.data(),
6731  received_coord.getRawPtr(), num_incoming_gnos * sizeof(mj_scalar_t));
6732  }
6733  deep_copy(dst_coordinates, host_dst_coordinates);
6734  this->mj_coordinates = dst_coordinates;
6735 
6736  // migrate weights.
6737  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6738  "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6739  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6740  auto host_src_weights = Kokkos::create_mirror_view(
6741  Kokkos::HostSpace(), this->mj_weights);
6742  Kokkos::deep_copy(host_src_weights, this->mj_weights);
6743  for(int i = 0; i < this->num_weights_per_coord; ++i) {
6744  auto sub_host_src_weights
6745  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6746  auto sub_host_dst_weights
6747  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6748  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6749 
6750  // TODO: Layout Right means these are not contiguous
6751  // However we don't have any systems setup with more than 1 weight so
6752  // really I have not tested any of this code with num weights > 1.
6753  // I think this is the right thing to do.
6754  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6755  sent_weight[n] = sub_host_src_weights(n);
6756  }
6757  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6758  distributor.doPostsAndWaits<mj_scalar_t>(
6759  sent_weight(), 1, received_weight());
6760 
6761  // Again we copy by index due to layout
6762  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6763  sub_host_dst_weights(n) = received_weight[n];
6764  }
6765  }
6766  Kokkos::deep_copy(dst_weights, host_dst_weights);
6767  this->mj_weights = dst_weights;
6768 
6769  // migrate owners
6770  {
6771  // Note owners we kept on Serial
6772  ArrayView<int> sent_owners(
6773  owner_of_coordinate.data(), this->num_local_coords);
6774  ArrayRCP<int> received_owners(num_incoming_gnos);
6775  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
6776  this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>
6777  ("owner_of_coordinate", num_incoming_gnos);
6778  memcpy(this->owner_of_coordinate.data(),
6779  received_owners.getRawPtr(), num_incoming_gnos * sizeof(int));
6780  }
6781 
6782  // if num procs is less than num parts,
6783  // we need the part assigment arrays as well, since
6784  // there will be multiple parts in processor.
6785  if(num_procs < num_parts) {
6786  auto src_host_assigned_part_ids =
6787  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
6788  Kokkos::deep_copy(src_host_assigned_part_ids, assigned_part_ids);
6789  ArrayView<mj_part_t> sent_partids(
6790  src_host_assigned_part_ids.data(), this->num_local_coords);
6791  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
6792  distributor.doPostsAndWaits<mj_part_t>(
6793  sent_partids, 1, received_partids());
6794  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6795  ("assigned_part_ids", num_incoming_gnos);
6796  auto host_assigned_part_ids = Kokkos::create_mirror_view(
6797  this->assigned_part_ids);
6798  memcpy(
6799  host_assigned_part_ids.data(),
6800  received_partids.getRawPtr(),
6801  num_incoming_gnos * sizeof(mj_part_t));
6802  Kokkos::deep_copy(this->assigned_part_ids, host_assigned_part_ids);
6803  }
6804  else {
6805  this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6806  ("assigned_part_ids", num_incoming_gnos);
6807  }
6808  this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6809  "Migration DistributorMigration-" + iteration);
6810 
6811  num_new_local_points = num_incoming_gnos;
6812  }
6813 }
6814 
6820 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6821  typename mj_part_t, typename mj_node_t>
6822 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6823  create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6824 {
6825  mj_part_t group_size = processor_ranks_for_subcomm.size();
6826  mj_part_t *ids = new mj_part_t[group_size];
6827  for(mj_part_t i = 0; i < group_size; ++i) {
6828  ids[i] = processor_ranks_for_subcomm[i];
6829  }
6830  ArrayView<const mj_part_t> idView(ids, group_size);
6831  this->comm = this->comm->createSubcommunicator(idView);
6832  delete [] ids;
6833 }
6834 
/*! \brief Rebuilds new_coordinate_permutations and new_part_xadj after a
 * migration, so that coordinates are grouped by their (compacted) part.
 *
 * \param output_num_parts number of non-empty parts this process ends up
 *        with after migration.
 * \param num_parts number of parts the incoming assigned_part_ids values
 *        may refer to (ids are in [0, num_parts), though only
 *        output_num_parts of them are actually populated).
 *
 * Runs entirely on the device. The multi-part branch is a single-thread
 * kernel whose statements are strictly order-dependent (counting, prefix
 * sum, then a reverse fill that reuses the same counters).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
  typename mj_part_t, typename mj_node_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
  fill_permutation_array(
  mj_part_t output_num_parts,
  mj_part_t num_parts)
{
  // if there is single output part, then simply fill the permutation array.
  if(output_num_parts == 1) {
    // identity permutation: every local coordinate belongs to part 0.
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
        (0, this->num_local_coords),
      KOKKOS_LAMBDA(mj_lno_t i) {
      local_new_coordinate_permutations(i) = i;
    });
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_num_local_coords = this->num_local_coords;
    // single-thread kernel just to write the one xadj entry on device.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {
      local_new_part_xadj(0) = local_num_local_coords;
    });
  }
  else {
    // local copies so the device lambda does not capture `this`.
    auto local_num_local_coords = this->num_local_coords;
    auto local_assigned_part_ids = this->assigned_part_ids;
    auto local_new_part_xadj = this->new_part_xadj;
    auto local_new_coordinate_permutations = this->new_coordinate_permutations;

    // part shift holds the which part number an old part number corresponds to.
    Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);

    // otherwise we need to count how many points are there in each part.
    // we allocate here as num_parts, because the sent partids are up to
    // num_parts, although there are outout_num_parts different part.
    // (labeled allocation => zero-initialized)
    Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
      "num_points_in_parts", num_parts);

    // sequential single-thread kernel: the steps below must run in order
    // and share state, so they cannot be parallelized naively.
    Kokkos::parallel_for(
      Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
      KOKKOS_LAMBDA(int dummy) {

      // step 1: histogram of incoming part ids.
      for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
        mj_part_t ii = local_assigned_part_ids(i);
        ++num_points_in_parts(ii);
      }

      // write the end points of the parts.
      // step 2: prefix sum over the non-empty parts only, compacting
      // old part ids to the new dense numbering via part_shifts.
      mj_part_t p = 0;
      mj_lno_t prev_index = 0;
      for(mj_part_t i = 0; i < num_parts; ++i) {
        if(num_points_in_parts(i) > 0) {
          local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
          prev_index += num_points_in_parts(i);
          part_shifts(i) = p++;
        }
      }

      // for the rest of the parts write the end index as end point.
      // (empty trailing parts repeat the last populated end offset)
      mj_part_t assigned_num_parts = p - 1;
      for(;p < num_parts; ++p) {
        local_new_part_xadj(p) =
          local_new_part_xadj(assigned_num_parts);
      }
      // step 3: reuse num_points_in_parts as the running "write cursor"
      // (end offset) per compacted part for the fill below.
      for(mj_part_t i = 0; i < output_num_parts; ++i) {
        num_points_in_parts(i) = local_new_part_xadj(i);
      }

      // write the permutation array here.
      // get the part of the coordinate i, shift it to obtain the new part number.
      // assign it to the end of the new part numbers pointer.
      // step 4: reverse iteration + pre-decrement keeps the relative
      // order of coordinates within each part stable.
      for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
        mj_part_t part =
          part_shifts[mj_part_t(local_assigned_part_ids(i))];
        local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
      }
    });
  }
}
6920 
6945 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6946  typename mj_part_t, typename mj_node_t>
6947 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6948  mj_perform_migration(
6949  mj_part_t input_num_parts,
6950  mj_part_t &output_num_parts,
6951  std::vector<mj_part_t> *next_future_num_parts_in_parts,
6952  mj_part_t &output_part_begin_index,
6953  size_t migration_reduce_all_population,
6954  mj_lno_t num_coords_for_last_dim_part,
6955  std::string iteration,
6956  RCP<mj_partBoxVector_t> &input_part_boxes,
6957  RCP<mj_partBoxVector_t> &output_part_boxes)
6958 {
6959  mj_part_t num_procs = this->comm->getSize();
6960  this->myRank = this->comm->getRank();
6961 
6962  // this array holds how many points each processor has in each part.
6963  // to access how many points processor i has on part j,
6964  // num_points_in_all_processor_parts[i * num_parts + j]
6965  mj_gno_t *num_points_in_all_processor_parts =
6966  new mj_gno_t[input_num_parts * (num_procs + 1)];
6967 
6968  // get the number of coordinates in each part in each processor.
6969  this->get_processor_num_points_in_parts(
6970  num_procs,
6971  input_num_parts,
6972  num_points_in_all_processor_parts);
6973 
6974  // check if migration will be performed or not.
6975  if(!this->mj_check_to_migrate(
6976  migration_reduce_all_population,
6977  num_coords_for_last_dim_part,
6978  num_procs,
6979  input_num_parts,
6980  num_points_in_all_processor_parts)) {
6981  delete [] num_points_in_all_processor_parts;
6982  return false;
6983  }
6984 
6985  mj_lno_t *send_count_to_each_proc = NULL;
6986  int *coordinate_destinations = new int[this->num_local_coords];
6987  send_count_to_each_proc = new mj_lno_t[num_procs];
6988 
6989  for(int i = 0; i < num_procs; ++i) {
6990  send_count_to_each_proc[i] = 0;
6991  }
6992 
6993  std::vector<mj_part_t> processor_ranks_for_subcomm;
6994  std::vector<mj_part_t> out_part_indices;
6995 
6996  // determine which processors are assigned to which parts
6997  this->mj_migration_part_proc_assignment(
6998  num_points_in_all_processor_parts,
6999  input_num_parts,
7000  num_procs,
7001  send_count_to_each_proc,
7002  processor_ranks_for_subcomm,
7003  next_future_num_parts_in_parts,
7004  output_num_parts,
7005  out_part_indices,
7006  output_part_begin_index,
7007  coordinate_destinations);
7008 
7009  delete [] send_count_to_each_proc;
7010  std::vector <mj_part_t> tmpv;
7011 
7012  std::sort (out_part_indices.begin(), out_part_indices.end());
7013  mj_part_t outP = out_part_indices.size();
7014  mj_gno_t new_global_num_points = 0;
7015  mj_gno_t *global_num_points_in_parts =
7016  num_points_in_all_processor_parts + num_procs * input_num_parts;
7017 
7018  if(this->mj_keep_part_boxes) {
7019  input_part_boxes->clear();
7020  }
7021 
7022  // now we calculate the new values for next_future_num_parts_in_parts.
7023  // same for the part boxes.
7024  for(mj_part_t i = 0; i < outP; ++i) {
7025  mj_part_t ind = out_part_indices[i];
7026  new_global_num_points += global_num_points_in_parts[ind];
7027  tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7028  if(this->mj_keep_part_boxes) {
7029  input_part_boxes->push_back((*output_part_boxes)[ind]);
7030  }
7031  }
7032 
7033  // swap the input and output part boxes.
7034  if(this->mj_keep_part_boxes) {
7035  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7036  input_part_boxes = output_part_boxes;
7037  output_part_boxes = tmpPartBoxes;
7038  }
7039  next_future_num_parts_in_parts->clear();
7040  for(mj_part_t i = 0; i < outP; ++i) {
7041  mj_part_t p = tmpv[i];
7042  next_future_num_parts_in_parts->push_back(p);
7043  }
7044 
7045  delete [] num_points_in_all_processor_parts;
7046 
7047  mj_lno_t num_new_local_points = 0;
7048  //perform the actual migration operation here.
7049  this->mj_migrate_coords(
7050  num_procs,
7051  num_new_local_points,
7052  iteration,
7053  coordinate_destinations,
7054  input_num_parts);
7055 
7056  delete [] coordinate_destinations;
7057  if(this->num_local_coords != num_new_local_points) {
7058  this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7059  (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7060  num_new_local_points);
7061  this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7062  (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7063  num_new_local_points);
7064  }
7065  this->num_local_coords = num_new_local_points;
7066  this->num_global_coords = new_global_num_points;
7067 
7068  // create subcommunicator.
7069  this->create_sub_communicator(processor_ranks_for_subcomm);
7070 
7071  processor_ranks_for_subcomm.clear();
7072 
7073  // fill the new permutation arrays.
7074  this->fill_permutation_array(output_num_parts, input_num_parts);
7075 
7076  return true;
7077 }
7078 
7097 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7098  typename mj_part_t, typename mj_node_t>
7099 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7100  create_consistent_chunks(
7101  mj_part_t num_parts,
7102  Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7103  Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7104  mj_lno_t coordinate_begin,
7105  mj_lno_t coordinate_end,
7106  Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7107  Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7108  int coordInd,
7109  bool longest_dim_part,
7110  uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7111 {
7112  // Note that this method is only used by task mapper
7113  // All code in this file has been verified to run with UVM off by running
7114  // mj tests and task mapper tests with UVM off. However for this particular
7115  // method I did not do much for UVM off. I heavily use device to host copies
7116  // and more or less preserve the original logic. Due to the handling of
7117  // arrays it will be a bit of work to convert this to as better form.
7118  // Since it's only relevant to task mapper and I wasn't sure how much priority
7119  // to give it, I put that on hold until further discussion.
7120  mj_part_t no_cuts = num_parts - 1;
7121 
7122  // now if the rectilinear partitioning is allowed we decide how
7123  // much weight each thread should put to left and right.
7124  if(this->distribute_points_on_cut_lines) {
7125  auto local_thread_cut_line_weight_to_put_left =
7126  this->thread_cut_line_weight_to_put_left;
7127  auto local_thread_part_weight_work =
7128  this->thread_part_weight_work;
7129  auto local_sEpsilon = this->sEpsilon;
7130 
7131  Kokkos::parallel_for(
7132  Kokkos::RangePolicy<typename mj_node_t::execution_space,
7133  mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7134  // the left to be put on the left of the cut.
7135  mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7136  if(left_weight > local_sEpsilon) {
7137  // the weight of thread ii on cut.
7138  mj_scalar_t thread_ii_weight_on_cut =
7139  local_thread_part_weight_work(i * 2 + 1) -
7140  local_thread_part_weight_work(i * 2);
7141  if(thread_ii_weight_on_cut < left_weight) {
7142  local_thread_cut_line_weight_to_put_left(i) =
7143  thread_ii_weight_on_cut;
7144  }
7145  else {
7146  local_thread_cut_line_weight_to_put_left(i) = left_weight;
7147  }
7148  }
7149  else {
7150  local_thread_cut_line_weight_to_put_left(i) = 0;
7151  }
7152  });
7153 
7154  if(no_cuts > 0) {
7155  auto local_least_signifiance = least_signifiance;
7156  auto local_significance_mul = significance_mul;
7157  Kokkos::parallel_for(
7158  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7159  (0, 1), KOKKOS_LAMBDA (int dummy) {
7160  // this is a special case. If cutlines share the same coordinate,
7161  // their weights are equal.
7162  // we need to adjust the ratio for that.
7163  for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7164  mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7165  mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7166  mj_scalar_t delta = cut2 - cut1;
7167  mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7168  if(abs_delta < local_sEpsilon) {
7169  local_thread_cut_line_weight_to_put_left(i) -=
7170  local_thread_cut_line_weight_to_put_left(i - 1);
7171  }
7172  local_thread_cut_line_weight_to_put_left(i) =
7173  static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7174  local_least_signifiance) * local_significance_mul) /
7175  static_cast<mj_scalar_t>(local_significance_mul);
7176  }
7177  });
7178  }
7179  }
7180 
7181  auto local_thread_point_counts = this->thread_point_counts;
7182  Kokkos::parallel_for(
7183  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7184  (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7185  local_thread_point_counts(i) = 0;
7186  });
7187 
7188  // for this specific case we dont want to distribute the points along the
7189  // cut position randomly, as we need a specific ordering of them. Instead,
7190  // we put the coordinates into a sort item, where we sort those
7191  // using the coordinates of points on other dimensions and the index.
7192 
7193  // some of the cuts might share the same position.
7194  // in this case, if cut i and cut j share the same position
7195  // cut_map[i] = cut_map[j] = sort item index.
7196  mj_part_t *cut_map = new mj_part_t[no_cuts];
7197 
7198  typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7199  typedef std::vector< multiSItem > multiSVector;
7200  typedef std::vector<multiSVector> multiS2Vector;
7201 
7202  // to keep track of the memory allocated.
7203  std::vector<mj_scalar_t *>allocated_memory;
7204 
7205  // vector for which the coordinates will be sorted.
7206  multiS2Vector sort_vector_points_on_cut;
7207 
7208  // the number of cuts that have different coordinates.
7209  mj_part_t different_cut_count = 1;
7210  cut_map[0] = 0;
7211 
7212  // now we insert 1 sort vector for all cuts on the different
7213  // positins.if multiple cuts are on the same position,
7214  // they share sort vectors.
7215  multiSVector tmpMultiSVector;
7216  sort_vector_points_on_cut.push_back(tmpMultiSVector);
7217 
7218  auto local_current_concurrent_cut_coordinate =
7219  current_concurrent_cut_coordinate;
7220  auto host_current_concurrent_cut_coordinate =
7221  Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7222  Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7223  local_current_concurrent_cut_coordinate);
7224 
7225  for(mj_part_t i = 1; i < no_cuts ; ++i) {
7226  // if cuts share the same cut coordinates
7227  // set the cutmap accordingly.
7228  if(std::abs(host_current_concurrent_cut_coordinate(i) -
7229  host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7230  cut_map[i] = cut_map[i-1];
7231  }
7232  else {
7233  cut_map[i] = different_cut_count++;
7234  multiSVector tmp2MultiSVector;
7235  sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7236  }
7237  }
7238  Kokkos::deep_copy(current_concurrent_cut_coordinate,
7239  host_current_concurrent_cut_coordinate);
7240 
7241  // now the actual part assigment.
7242  auto host_coordinate_permutations =
7243  Kokkos::create_mirror_view(coordinate_permutations);
7244  Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7245 
7246  auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7247  Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7248 
7249  auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7250  Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7251 
7252  auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7253  Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7254 
7255  auto local_coord_dim = this->coord_dim;
7256 
7257  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7258  mj_lno_t i = host_coordinate_permutations(ii);
7259  mj_part_t pp = host_assigned_part_ids(i);
7260  mj_part_t p = pp / 2;
7261  // if the coordinate is on a cut.
7262  if(pp % 2 == 1 ) {
7263  mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7264  allocated_memory.push_back(vals);
7265 
7266  // we insert the coordinates to the sort item here.
7267  int val_ind = 0;
7268 
7269  if(longest_dim_part) {
7270  // std::cout << std::endl << std::endl;
7271  for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7272  // uSignedSortItem<int, mj_scalar_t, char>
7273  // *p_coord_dimension_range_sorted
7274  int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7275  // std::cout << "next_largest_coord_dim: " <<
7276  // next_largest_coord_dim << " ";
7277  // Note refactor in progress
7278  vals[val_ind++] =
7279  host_mj_coordinates(i,next_largest_coord_dim);
7280  }
7281  }
7282  else {
7283  for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7284  vals[val_ind++] = host_mj_coordinates(i,dim);
7285  }
7286  for(int dim = 0; dim < coordInd; ++dim) {
7287  vals[val_ind++] = host_mj_coordinates(i,dim);
7288  }
7289  }
7290 
7291  multiSItem tempSortItem(i, local_coord_dim -1, vals);
7292  //insert the point to the sort vector pointed by the cut_map[p].
7293  mj_part_t cmap = cut_map[p];
7294  sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7295  }
7296  else {
7297  //if it is not on the cut, simple sorting.
7298  ++host_thread_point_counts(p);
7299  host_assigned_part_ids(i) = p;
7300  }
7301  }
7302 
7303  // sort all the sort vectors.
7304  for(mj_part_t i = 0; i < different_cut_count; ++i) {
7305  std::sort (sort_vector_points_on_cut[i].begin(),
7306  sort_vector_points_on_cut[i].end());
7307  }
7308 
7309  mj_part_t previous_cut_map = cut_map[0];
7310 
7311  auto host_thread_cut_line_weight_to_put_left =
7312  Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7313  Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7314  thread_cut_line_weight_to_put_left);
7315 
7316  auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7317  Kokkos::deep_copy(host_mj_weights, mj_weights);
7318 
7319  // this is how much previous part owns the weight of the current part.
7320  // when target part weight is 1.6, and the part on the left is given 2,
7321  // the left has an extra 0.4, while the right has missing 0.4 from the
7322  // previous cut.
7323  // This parameter is used to balance this issues.
7324  // in the above example weight_stolen_from_previous_part will be 0.4.
7325  // if the left part target is 2.2 but it is given 2,
7326  // then weight_stolen_from_previous_part will be -0.2.
7327  mj_scalar_t weight_stolen_from_previous_part = 0;
7328  for(mj_part_t p = 0; p < no_cuts; ++p) {
7329  mj_part_t mapped_cut = cut_map[p];
7330 
7331  // if previous cut map is done, and it does not have the same index,
7332  // then assign all points left on that cut to its right.
7333  if(previous_cut_map != mapped_cut) {
7334  mj_lno_t sort_vector_end = (mj_lno_t)
7335  sort_vector_points_on_cut[previous_cut_map].size() - 1;
7336  for(; sort_vector_end >= 0; --sort_vector_end) {
7337  multiSItem t =
7338  sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7339  mj_lno_t i = t.index;
7340  ++host_thread_point_counts(p);
7341  host_assigned_part_ids(i) = p;
7342  }
7343  sort_vector_points_on_cut[previous_cut_map].clear();
7344  }
7345 
7346  // TODO: MD: I dont remember why I have it reverse order here.
7347  mj_lno_t sort_vector_end = (mj_lno_t)
7348  sort_vector_points_on_cut[mapped_cut].size() - 1;
7349  // mj_lno_t sort_vector_begin= 0;
7350  // mj_lno_t sort_vector_size =
7351  // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7352 
7353  // TODO commented for reverse order
7354  for(; sort_vector_end >= 0; --sort_vector_end) {
7355  // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7356  // TODO COMMENTED FOR REVERSE ORDER
7357  multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7358  //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7359  mj_lno_t i = t.index;
7360  mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7361  this->mj_weights(i,0);
7362  // part p has enough space for point i, then put it to point i.
7363  if(host_thread_cut_line_weight_to_put_left(p) +
7364  weight_stolen_from_previous_part> this->sEpsilon &&
7365  host_thread_cut_line_weight_to_put_left(p) +
7366  weight_stolen_from_previous_part -
7367  std::abs(host_thread_cut_line_weight_to_put_left(p) +
7368  weight_stolen_from_previous_part - w)> this->sEpsilon)
7369  {
7370  host_thread_cut_line_weight_to_put_left(p) -= w;
7371 
7372  sort_vector_points_on_cut[mapped_cut].pop_back();
7373 
7374  ++host_thread_point_counts(p);
7375  host_assigned_part_ids(i) = p;
7376  // if putting this weight to left overweights the left cut, then
7377  // increase the space for the next cut using
7378  // weight_stolen_from_previous_part.
7379  if(p < no_cuts - 1 &&
7380  host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7381  if(mapped_cut == cut_map[p + 1] ) {
7382  // if the cut before the cut indexed at p was also at the same
7383  // position special case, as we handle the weight differently here.
7384  if(previous_cut_map != mapped_cut) {
7385  weight_stolen_from_previous_part =
7386  host_thread_cut_line_weight_to_put_left(p);
7387  }
7388  else {
7389  // if the cut before the cut indexed at p was also at the same
7390  // position we assign extra weights cumulatively in this case.
7391  weight_stolen_from_previous_part +=
7392  host_thread_cut_line_weight_to_put_left(p);
7393  }
7394  }
7395  else{
7396  weight_stolen_from_previous_part =
7397  -host_thread_cut_line_weight_to_put_left(p);
7398  }
7399  // end assignment for part p
7400  break;
7401  }
7402  } else {
7403  // if part p does not have enough space for this point
7404  // and if there is another cut sharing the same positon,
7405  // again increase the space for the next
7406  if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7407  if(previous_cut_map != mapped_cut) {
7408  weight_stolen_from_previous_part =
7409  host_thread_cut_line_weight_to_put_left(p);
7410  }
7411  else {
7412  weight_stolen_from_previous_part +=
7413  host_thread_cut_line_weight_to_put_left(p);
7414  }
7415  }
7416  else{
7417  weight_stolen_from_previous_part =
7418  -host_thread_cut_line_weight_to_put_left(p);
7419  }
7420  // end assignment for part p
7421  break;
7422  }
7423  }
7424  previous_cut_map = mapped_cut;
7425  }
7426 
7427  // TODO commented for reverse order
7428  // put everything left on the last cut to the last part.
7429  mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7430  previous_cut_map].size() - 1;
7431 
7432  // mj_lno_t sort_vector_begin= 0;
7433  // mj_lno_t sort_vector_size = (mj_lno_t)
7434  // sort_vector_points_on_cut[previous_cut_map].size();
7435  // TODO commented for reverse order
7436  for(; sort_vector_end >= 0; --sort_vector_end) {
7437  // TODO commented for reverse order
7438  multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7439  // multiSItem t =
7440  // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7441  mj_lno_t i = t.index;
7442  ++host_thread_point_counts(no_cuts);
7443  host_assigned_part_ids(i) = no_cuts;
7444  }
7445 
7446  sort_vector_points_on_cut[previous_cut_map].clear();
7447  delete [] cut_map;
7448 
7449  //free the memory allocated for vertex sort items .
7450  mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7451  for(mj_lno_t i = 0; i < vSize; ++i) {
7452  delete [] allocated_memory[i];
7453  }
7454 
7455  auto local_out_part_xadj = out_part_xadj;
7456  auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7457  Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7458 
7459  // creation of part_xadj as in usual case.
7460  for(mj_part_t j = 0; j < num_parts; ++j) {
7461  host_out_part_xadj(j) = host_thread_point_counts(j);
7462  host_thread_point_counts(j) = 0;
7463  }
7464 
7465  // perform prefix sum for num_points in parts.
7466  for(mj_part_t j = 1; j < num_parts; ++j) {
7467  host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7468  }
7469 
7470  // shift the num points in threads thread to obtain the
7471  // beginning index of each thread's private space.
7472  for(mj_part_t j = 1; j < num_parts; ++j) {
7473  host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7474  }
7475 
7476  auto host_new_coordinate_permutations =
7477  Kokkos::create_mirror_view(new_coordinate_permutations);
7478  Kokkos::deep_copy(host_new_coordinate_permutations,
7479  new_coordinate_permutations);
7480 
7481  // now thread gets the coordinate and writes the index of coordinate to
7482  // the permutation array using the part index we calculated.
7483  for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7484  mj_lno_t i = host_coordinate_permutations(ii);
7485  mj_part_t p = host_assigned_part_ids(i);
7486  host_new_coordinate_permutations(coordinate_begin +
7487  host_thread_point_counts(p)++) = i;
7488  }
7489 
7490  Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7491  Kokkos::deep_copy(new_coordinate_permutations,
7492  host_new_coordinate_permutations);
7493  Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7494 }
7495 
// set_final_parts:
// Assigns the final (global) part id to every local coordinate after the
// recursive partitioning has finished, and — if the coordinates were ever
// migrated between ranks — communicates the gnos and part ids back to their
// original owners.
//
// current_num_parts         : number of parts present on this rank.
// output_part_begin_index   : offset added to the local part index to form
//                             the global part id written to assigned_part_ids.
// output_part_boxes         : part bounding boxes; their pIds are stamped
//                             here when mj_keep_part_boxes is set.
// is_data_ever_migrated     : when true, results are shipped back to the
//                             original coordinate owners, either via
//                             Zoltan_Comm (if mj_lno_t fits in int and the
//                             macro is enabled) or Tpetra::Distributor.
7505 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7506  typename mj_part_t, typename mj_node_t>
7507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7508  set_final_parts(
7509  mj_part_t current_num_parts,
7510  mj_part_t output_part_begin_index,
7511  RCP<mj_partBoxVector_t> &output_part_boxes,
7512  bool is_data_ever_migrated)
7513 {
7514  this->mj_env->timerStart(MACRO_TIMERS,
7515  mj_timer_base_string + "Part_Assignment");
7516 
 // Local copies of the member views so the KOKKOS_LAMBDA below does not
 // capture `this` (device lambdas must not dereference a host pointer).
7517  auto local_part_xadj = part_xadj;
7518  auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7519  auto local_coordinate_permutations = coordinate_permutations;
7520  auto local_assigned_part_ids = assigned_part_ids;
7521 
 // Stamp each output part box with its global part id.
7522  if(local_mj_keep_part_boxes) {
7523  for(int i = 0; i < current_num_parts; ++i) {
7524  (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7525  }
7526  }
7527 
 // One team per part: threads of team i write the global part id
 // (i + output_part_begin_index) for every coordinate listed for part i
 // in coordinate_permutations, i.e. the range [part_xadj(i-1), part_xadj(i)).
7528  Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7529  current_num_parts, Kokkos::AUTO());
7530  typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7531  member_type member_type;
7532  Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7533  int i = team_member.league_rank();
7534  Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7535  local_part_xadj(i-1) : 0, local_part_xadj(i)),
7536  [=] (mj_lno_t ii) {
7537  mj_lno_t k = local_coordinate_permutations(ii);
7538  local_assigned_part_ids(k) = i + output_part_begin_index;
7539  });
7540  });
7541 
 // Results currently live on the ranks holding the migrated data; send the
 // gnos and the freshly computed part ids back to the original owners.
7542  if(is_data_ever_migrated) {
7543 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7544  if(sizeof(mj_lno_t) <= sizeof(int)) {
7545 
7546  // Cannot use Zoltan_Comm with local ordinals larger than ints.
7547  // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7548  // may overflow.
7549 
7550  // if data is migrated, then send part numbers to the original owners.
7551  ZOLTAN_COMM_OBJ *plan = NULL;
7552  MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7553 
7554  int incoming = 0;
7555  int message_tag = 7856;
7556 
7557  this->mj_env->timerStart(MACRO_TIMERS,
7558  mj_timer_base_string + "Final Z1PlanCreating");
7559 
7560  // setup incoming count
7561  int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7562  this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7563 
7564  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7565  this->mj_env->timerStop(MACRO_TIMERS,
7566  mj_timer_base_string + "Final Z1PlanCreating" );
7567 
7568  this->mj_env->timerStart(MACRO_TIMERS,
7569  mj_timer_base_string + "Final Z1PlanComm");
7570 
7571  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7572 
7573  // migrate gnos to actual owners.
7574  auto host_current_mj_gnos = Kokkos::create_mirror_view(
7575  Kokkos::HostSpace(), this->current_mj_gnos);
7576  deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7577  Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7578  Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7579  auto host_dst_gnos = Kokkos::create_mirror_view(
7580  Kokkos::HostSpace(), dst_gnos);
7581  message_tag++;
7582  ierr = Zoltan_Comm_Do( plan, message_tag,
7583  (char *) host_current_mj_gnos.data(),
7584  sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7585  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7586  Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7587  this->current_mj_gnos = dst_gnos;
7588 
7589  // migrate part ids to actual owners.
7590  auto host_src_part_ids = Kokkos::create_mirror_view(
7591  Kokkos::HostSpace(), this->assigned_part_ids);
7592  deep_copy(host_src_part_ids, this->assigned_part_ids);
7593  Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7594  Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7595  auto host_dst_part_ids = Kokkos::create_mirror_view(
7596  Kokkos::HostSpace(), dst_part_ids);
7597  message_tag++;
7598  ierr = Zoltan_Comm_Do( plan, message_tag,
7599  (char *) host_src_part_ids.data(),
7600  sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7601  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7602  Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7603  this->assigned_part_ids = dst_part_ids;
7604 
7605  ierr = Zoltan_Comm_Destroy(&plan);
7606  Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7607 
 // Local coordinate count now reflects what was received back.
7608  this->num_local_coords = incoming;
7609 
7610  this->mj_env->timerStop(MACRO_TIMERS,
7611  mj_timer_base_string + "Final Z1PlanComm");
7612  }
7613  else
7614 #endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7615  {
 // Fallback path: Tpetra::Distributor handles the reverse migration when
 // Zoltan_Comm is unavailable or mj_lno_t does not fit in an int.
7616  // setup incoming count
7617  this->mj_env->timerStart(MACRO_TIMERS,
7618  mj_timer_base_string + "Final DistributorPlanCreating");
7619  Tpetra::Distributor distributor(this->mj_problemComm);
7620  ArrayView<const mj_part_t> owners_of_coords(
7621  this->owner_of_coordinate.data(), this->num_local_coords);
7622  mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7623  this->mj_env->timerStop(MACRO_TIMERS,
7624  mj_timer_base_string + "Final DistributorPlanCreating" );
7625 
7626  this->mj_env->timerStart(MACRO_TIMERS,
7627  mj_timer_base_string + "Final DistributorPlanComm");
7628 
7629  // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7630 
7631  // migrate gnos to actual owners.
7632  auto src_host_current_mj_gnos =
7633  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
7634  Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
7635  ArrayRCP<mj_gno_t> received_gnos(incoming);
7636  ArrayView<mj_gno_t> sent_gnos(src_host_current_mj_gnos.data(),
7637  this->num_local_coords);
7638  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
 // Re-allocate the device view at the new (incoming) size and copy the
 // received gnos in through a host mirror.
7639  this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7640  Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7641  auto host_current_mj_gnos = Kokkos::create_mirror_view(
7642  this->current_mj_gnos);
7643  memcpy(host_current_mj_gnos.data(),
7644  received_gnos.getRawPtr(), incoming * sizeof(mj_gno_t));
7645  Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
7646 
7647  // migrate part ids to actual owners.
7648  auto src_host_assigned_part_ids =
7649  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
7650  Kokkos::deep_copy(src_host_assigned_part_ids, this->assigned_part_ids);
7651  ArrayView<mj_part_t> sent_partids(src_host_assigned_part_ids.data(),
7652  this->num_local_coords);
7653  ArrayRCP<mj_part_t> received_partids(incoming);
7654  distributor.doPostsAndWaits<mj_part_t>(
7655  sent_partids, 1, received_partids());
7656  this->assigned_part_ids =
7657  Kokkos::View<mj_part_t*, device_t>(
7658  Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7659  incoming);
7660  auto host_assigned_part_ids = Kokkos::create_mirror_view(
7661  this->assigned_part_ids);
7662  memcpy( host_assigned_part_ids.data(),
7663  received_partids.getRawPtr(), incoming * sizeof(mj_part_t));
7664  deep_copy(this->assigned_part_ids, host_assigned_part_ids);
 // Local coordinate count now reflects what was received back.
7665  this->num_local_coords = incoming;
7666 
7667  this->mj_env->timerStop(MACRO_TIMERS,
7668  mj_timer_base_string + "Final DistributorPlanComm");
7669  }
7670  }
7671 
7672  this->mj_env->timerStop(MACRO_TIMERS,
7673  mj_timer_base_string + "Part_Assignment");
7674 
7675  this->mj_env->timerStart(MACRO_TIMERS,
7676  mj_timer_base_string + "Solution_Part_Assignment");
7677 
7678  // ArrayRCP<mj_part_t> partId;
7679  // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7680 
 // Reduce the per-rank part boxes to global boundaries for later retrieval.
7681  if(this->mj_keep_part_boxes) {
7682  this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7683  }
7684 
7685  this->mj_env->timerStop(MACRO_TIMERS,
7686  mj_timer_base_string + "Solution_Part_Assignment");
7687 }
7688 
// Simple setter: copies each tuning argument verbatim into the corresponding
// member field; no validation or clamping is performed here.
// NOTE(review): the qualified function-name line (original lines 7703-7704)
// was elided by the documentation generator — presumably this is
// AlgMJ<...>::set_partitioning_parameters; verify against the full header.
7701 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7702  typename mj_part_t, typename mj_node_t>
7705  bool distribute_points_on_cut_lines_,
7706  int max_concurrent_part_calculation_,
7707  int check_migrate_avoid_migration_option_,
7708  double minimum_migration_imbalance_,
7709  int migration_type_)
7710 {
7711  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7712  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
 // This assignment is split across two source lines in the original.
7713  this->check_migrate_avoid_migration_option =
7714  check_migrate_avoid_migration_option_;
7715  this->minimum_migration_imbalance = minimum_migration_imbalance_;
7716  this->migration_type = migration_type_;
7717 }
7718 
7746 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7747  typename mj_part_t, typename mj_node_t>
7750  const RCP<const Environment> &env,
7751  RCP<const Comm<int> > &problemComm,
7752  double imbalance_tolerance_,
7753  int num_teams_,
7754  size_t num_global_parts_,
7755  Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7756  int recursion_depth_,
7757  int coord_dim_,
7758  mj_lno_t num_local_coords_,
7759  mj_gno_t num_global_coords_,
7760  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7761  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7762  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7763  int num_weights_per_coord_,
7764  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7765  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7766  Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7767  Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7768  Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7769 {
7770 
7771  // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7773  this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7774 
7775  this->mj_env = env;
7776  this->mj_problemComm = problemComm;
7777  this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7778  this->mj_env->timerStart(MACRO_TIMERS,
7779  mj_timer_base_string + "Total");
7780  this->mj_env->debug(3, "In MultiJagged Jagged");
7781  this->imbalance_tolerance = imbalance_tolerance_;
7782  this->mj_num_teams = num_teams_;
7783  this->num_global_parts = num_global_parts_;
7784  this->part_no_array = part_no_array_;
7785  this->recursion_depth = recursion_depth_;
7786  this->coord_dim = coord_dim_;
7787  this->num_local_coords = num_local_coords_;
7788  this->num_global_coords = num_global_coords_;
7789  this->mj_coordinates = mj_coordinates_;
7790  this->initial_mj_gnos = initial_mj_gnos_;
7791  this->num_weights_per_coord = num_weights_per_coord_;
7792  this->mj_uniform_weights = mj_uniform_weights_;
7793  this->mj_weights = mj_weights_;
7794  this->mj_uniform_parts = mj_uniform_parts_;
7795 
7796  // this->set_input_data();
7797 
7798  this->set_part_specifications();
7799 
7800  this->mj_env->timerStart(MACRO_TIMERS,
7801  mj_timer_base_string + "Allocate Views");
7802  this->allocate_set_work_memory();
7803  this->mj_env->timerStop(MACRO_TIMERS,
7804  mj_timer_base_string + "Allocate Views");
7805 
7806  // We duplicate the comm as we create subcommunicators during migration.
7807  // We keep the problemComm as it is, while comm changes after each migration.
7808  this->comm = this->mj_problemComm->duplicate();
7809 
7810 #ifdef print_debug
7811  if(comm->getRank() == 0) {
7812  std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7813  std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7814  std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7815  }
7816 #endif
7817 
7818  // initially there is a single partition
7819  mj_part_t current_num_parts = 1;
7820  Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7821  this->all_cut_coordinates;
7822  this->mj_env->timerStart(MACRO_TIMERS,
7823  mj_timer_base_string + "Problem_Partitioning");
7824  mj_part_t output_part_begin_index = 0;
7825  mj_part_t future_num_parts = this->total_num_part;
7826  bool is_data_ever_migrated = false;
7827 
7828  std::vector<mj_part_t> *future_num_part_in_parts =
7829  new std::vector<mj_part_t> ();
7830  std::vector<mj_part_t> *next_future_num_parts_in_parts =
7831  new std::vector<mj_part_t> ();
7832 
7833  next_future_num_parts_in_parts->push_back(this->num_global_parts);
7834 
7835  RCP<mj_partBoxVector_t> input_part_boxes;
7836  RCP<mj_partBoxVector_t> output_part_boxes;
7837 
7838  if(this->mj_keep_part_boxes) {
7839  input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7840  output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7841  compute_global_box();
7842  this->init_part_boxes(output_part_boxes);
7843  }
7844 
7845  auto local_part_xadj = this->part_xadj;
7846 
7847  // Need a device counter - how best to allocate?
7848  // Putting this allocation in the loops is very costly so moved out here.
7849  Kokkos::View<mj_part_t*, device_t>
7850  view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7851  Kokkos::View<size_t*, device_t>
7852  view_total_reduction_size("view_total_reduction_size", 1);
7853 
7854  for(int i = 0; i < this->recursion_depth; ++i) {
7855 
7856  // convert i to string to be used for debugging purposes.
7857  std::string istring = std::to_string(i);
7858 
7859  // next_future_num_parts_in_parts will be as the size of outnumParts,
7860  // and this will hold how many more parts that each output part
7861  // should be divided. this array will also be used to determine the weight
7862  // ratios of the parts. swap the arrays to use iteratively.
7863  std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7864  future_num_part_in_parts = next_future_num_parts_in_parts;
7865  next_future_num_parts_in_parts = tmpPartVect;
7866 
7867  // clear next_future_num_parts_in_parts array as
7868  // getPartitionArrays expects it to be empty.
7869  next_future_num_parts_in_parts->clear();
7870  if(this->mj_keep_part_boxes) {
7871  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7872  input_part_boxes = output_part_boxes;
7873  output_part_boxes = tmpPartBoxes;
7874  output_part_boxes->clear();
7875  }
7876 
7877  // returns the total no. of output parts for this dimension partitioning.
7878  mj_part_t output_part_count_in_dimension =
7879  this->update_part_num_arrays(
7880  future_num_part_in_parts,
7881  next_future_num_parts_in_parts,
7882  future_num_parts,
7883  current_num_parts,
7884  i,
7885  input_part_boxes,
7886  output_part_boxes, 1);
7887 
7888  // if the number of obtained parts equal to current number of parts,
7889  // skip this dimension. For example, this happens when 1 is given in the
7890  // input part array is given. P=4,5,1,2
7891  if(output_part_count_in_dimension == current_num_parts) {
7892  //still need to swap the input output arrays.
7893  tmpPartVect= future_num_part_in_parts;
7894  future_num_part_in_parts = next_future_num_parts_in_parts;
7895  next_future_num_parts_in_parts = tmpPartVect;
7896 
7897  if(this->mj_keep_part_boxes) {
7898  RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7899  input_part_boxes = output_part_boxes;
7900  output_part_boxes = tmpPartBoxes;
7901  }
7902  continue;
7903  }
7904 
7905  // get the coordinate axis along which the partitioning will be done.
7906  int coordInd = i % this->coord_dim;
7907 
7908  Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7909  Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7910 
7911  this->mj_env->timerStart(MACRO_TIMERS,
7912  mj_timer_base_string + "Problem_Partitioning_" + istring);
7913 
7914  // alloc Memory to point the indices
7915  // of the parts in the permutation array.
7916  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7917  "new part xadj", output_part_count_in_dimension);
7918 
7919  // the index where in the new_part_xadj will be written.
7920  mj_part_t output_part_index = 0;
7921 
7922  // whatever is written to output_part_index will be added with
7923  // output_coordinate_end_index so that the points will be shifted.
7924  mj_part_t output_coordinate_end_index = 0;
7925 
7926  mj_part_t current_work_part = 0;
7927  mj_part_t current_concurrent_num_parts =
7928  std::min(current_num_parts - current_work_part,
7929  this->max_concurrent_part_calculation);
7930 
7931  mj_part_t obtained_part_index = 0;
7932 
7933  auto host_process_local_min_max_coord_total_weight =
7934  Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7935  auto host_global_min_max_coord_total_weight =
7936  Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7937 
7938  // run for all available parts.
7939  for(; current_work_part < current_num_parts;
7941 
7943  std::min(current_num_parts - current_work_part,
7944  this->max_concurrent_part_calculation);
7945 
7946  int bDoingWork_int; // Can't reduce on bool so use int
7947  auto local_device_num_partitioning_in_current_dim =
7948  device_num_partitioning_in_current_dim;
7949  Kokkos::parallel_reduce("Read bDoingWork",
7950  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7951  KOKKOS_LAMBDA(int dummy, int & set_single) {
7952  set_single = 0;
7953  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7954  if(local_device_num_partitioning_in_current_dim(
7955  current_work_part + kk) != 1) {
7956  set_single = 1;
7957  break;
7958  }
7959  }
7960  }, bDoingWork_int);
7961  bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7962 
7963  this->mj_get_local_min_max_coord_totW(
7966  mj_current_dim_coords);
7967 
7968  // 1D partitioning
7969  if(bDoingWork) {
7970  // obtain global Min max of the part.
7971  this->mj_get_global_min_max_coord_totW(
7973  this->process_local_min_max_coord_total_weight,
7974  this->global_min_max_coord_total_weight);
7975 
7976  // represents the total number of cutlines
7977  // whose coordinate should be determined.
7978  mj_part_t total_incomplete_cut_count = 0;
7979 
7980  // Compute weight ratios for parts & cuts:
7981  // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7982  // part0 cut0 part1 cut1 part2 cut2 part3
7983  mj_part_t concurrent_part_cut_shift = 0;
7984  mj_part_t concurrent_part_part_shift = 0;
7985 
7986  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7987 
7988  Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7989  global_min_max_coord_total_weight);
7990 
7991  mj_scalar_t min_coordinate =
7992  host_global_min_max_coord_total_weight(kk);
7993  mj_scalar_t max_coordinate =
7994  host_global_min_max_coord_total_weight(
7996 
7997  mj_scalar_t global_total_weight =
7998  host_global_min_max_coord_total_weight(
7999  kk + 2 * current_concurrent_num_parts);
8000 
8001  mj_part_t concurrent_current_part_index = current_work_part + kk;
8002 
8003  mj_part_t partition_count = host_num_partitioning_in_current_dim(
8004  concurrent_current_part_index);
8005 
8006  Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
8007  Kokkos::subview(current_cut_coordinates,
8008  std::pair<mj_lno_t, mj_lno_t>(
8009  concurrent_part_cut_shift, current_cut_coordinates.size()));
8010  Kokkos::View<mj_scalar_t *, device_t>
8011  current_target_part_weights =
8012  Kokkos::subview(target_part_weights,
8013  std::pair<mj_lno_t, mj_lno_t>(
8014  concurrent_part_part_shift, target_part_weights.size()));
8015 
8016  // shift the usedCutCoordinate array as noCuts.
8017  concurrent_part_cut_shift += partition_count - 1;
8018  // shift the partRatio array as noParts.
8019  concurrent_part_part_shift += partition_count;
8020 
8021  // calculate only if part is not empty,
8022  // and part will be further partitioned.
8023  if(partition_count > 1 && min_coordinate <= max_coordinate) {
8024 
8025  // increase num_cuts_do_be_determined by the number of cuts of the
8026  // current part's cut line number.
8027  total_incomplete_cut_count += partition_count - 1;
8028 
8029  this->incomplete_cut_count(kk) = partition_count - 1;
8030 
8031  // get the target weights of the parts
8032  this->mj_get_initial_cut_coords_target_weights(
8033  min_coordinate,
8034  max_coordinate,
8035  partition_count - 1,
8036  global_total_weight,
8037  usedCutCoordinate,
8038  current_target_part_weights,
8039  future_num_part_in_parts,
8040  next_future_num_parts_in_parts,
8041  concurrent_current_part_index,
8042  obtained_part_index);
8043 
8044  mj_lno_t coordinate_end_index =
8045  host_part_xadj(concurrent_current_part_index);
8046  mj_lno_t coordinate_begin_index =
8047  concurrent_current_part_index==0 ? 0 :
8048  host_part_xadj(concurrent_current_part_index - 1);
8049 
8050  this->set_initial_coordinate_parts(
8051  max_coordinate,
8052  min_coordinate,
8053  coordinate_begin_index, coordinate_end_index,
8054  this->coordinate_permutations,
8055  mj_current_dim_coords,
8056  this->assigned_part_ids,
8057  partition_count);
8058  }
8059  else {
8060  // e.g., if have fewer coordinates than parts, don't need to do
8061  // next dim.
8062  this->incomplete_cut_count(kk) = 0;
8063  }
8064 
8065  obtained_part_index += partition_count;
8066  }
8067 
8068  // used imbalance, it is always 0, as it is difficult to
8069  // estimate a range.
8070  double used_imbalance = 0;
8071  // Determine cut lines for all concurrent parts parts here.
8072  this->mj_env->timerStart(MACRO_TIMERS,
8073  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8074 
8075  this->mj_1D_part(
8076  mj_current_dim_coords,
8077  used_imbalance,
8080  current_cut_coordinates,
8081  total_incomplete_cut_count,
8082  view_rectilinear_cut_count,
8083  view_total_reduction_size);
8084 
8085  this->mj_env->timerStop(MACRO_TIMERS,
8086  mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8087  }
8088 
8089  // create new part chunks
8090  {
8091  mj_part_t output_array_shift = 0;
8092  mj_part_t cut_shift = 0;
8093  size_t tlr_shift = 0;
8094  size_t partweight_array_shift = 0;
8095  for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8096 
8097  mj_part_t current_concurrent_work_part = current_work_part + kk;
8098 
8099  mj_part_t num_parts = host_num_partitioning_in_current_dim(
8100  current_concurrent_work_part);
8101 
8102  // if the part is empty, skip the part.
8103  int coordinateA_bigger_than_coordinateB =
8104  host_global_min_max_coord_total_weight(kk) >
8105  host_global_min_max_coord_total_weight(
8107 
8108  if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8109  // we still need to write the begin and end point of the empty part.
8110  // simply set it zero, the array indices will be shifted later
8111  auto local_new_part_xadj = this->new_part_xadj;
8112  Kokkos::parallel_for(
8113  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8114  (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8115  local_new_part_xadj(
8116  output_part_index + output_array_shift + jj) = 0;
8117  });
8118 
8119  cut_shift += num_parts - 1;
8120  tlr_shift += (4 *(num_parts - 1) + 1);
8121  output_array_shift += num_parts;
8122  partweight_array_shift += (2 * (num_parts - 1) + 1);
8123  continue;
8124  }
8125 
8126  Kokkos::View<mj_scalar_t *, device_t>
8127  current_concurrent_cut_coordinate =
8128  Kokkos::subview(current_cut_coordinates,
8129  std::pair<mj_lno_t, mj_lno_t>(
8130  cut_shift,
8131  current_cut_coordinates.size()));
8132  Kokkos::View<mj_scalar_t *, device_t>
8133  used_local_cut_line_weight_to_left =
8134  Kokkos::subview(process_cut_line_weight_to_put_left,
8135  std::pair<mj_lno_t, mj_lno_t>(
8136  cut_shift,
8137  process_cut_line_weight_to_put_left.size()));
8138 
8139  this->thread_part_weight_work =
8140  Kokkos::subview(
8141  this->thread_part_weights,
8142  std::pair<mj_lno_t, mj_lno_t>(
8143  partweight_array_shift,
8144  this->thread_part_weights.extent(0)));
8145 
8146  if(num_parts > 1) {
8147  if(this->mj_keep_part_boxes) {
8148  // if part boxes are to be stored update the boundaries.
8149  for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8150  mj_scalar_t temp_get_val;
8151  Kokkos::parallel_reduce("Read single",
8152  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8153  KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8154  set_single = current_concurrent_cut_coordinate(j);
8155  }, temp_get_val);
8156  (*output_part_boxes)
8157  [output_array_shift + output_part_index + j].
8158  updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8159  (*output_part_boxes)
8160  [output_array_shift + output_part_index + j + 1].
8161  updateMinMax(temp_get_val, 0 /*update max*/, coordInd);
8162  }
8163  }
8164 
8165  // Rewrite the indices based on the computed cuts.
8166  Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8167  Kokkos::subview(this->new_part_xadj,
8168  std::pair<mj_lno_t, mj_lno_t>(
8169  output_part_index + output_array_shift,
8170  this->new_part_xadj.size()));
8171 
8172  this->mj_create_new_partitions(
8173  num_parts,
8174  current_concurrent_work_part,
8175  mj_current_dim_coords,
8176  current_concurrent_cut_coordinate,
8177  used_local_cut_line_weight_to_left,
8178  sub_new_part_xadj);
8179  }
8180  else {
8181 
8182  mj_lno_t coordinate_end = host_part_xadj(
8183  current_concurrent_work_part);
8184  mj_lno_t coordinate_begin =
8185  current_concurrent_work_part==0 ? 0 : host_part_xadj(
8186  current_concurrent_work_part - 1);
8187 
8188  // if this part is partitioned into 1 then just copy
8189  // the old values.
8190  mj_lno_t part_size = coordinate_end - coordinate_begin;
8191 
8192  // Awkward here to set one value - need some broader
8193  // refactoring to improve this one.
8194  auto local_new_part_xadj = this->new_part_xadj;
8195  Kokkos::parallel_for(
8196  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8197  (0, 1), KOKKOS_LAMBDA (int dummy) {
8198  local_new_part_xadj(
8199  output_part_index + output_array_shift) = part_size;
8200  });
8201 
8202  auto subview_new_coordinate_permutations =
8203  Kokkos::subview(this->new_coordinate_permutations,
8204  std::pair<mj_lno_t, mj_lno_t>(
8205  coordinate_begin,
8206  coordinate_begin + part_size));
8207  auto subview_coordinate_permutations =
8208  Kokkos::subview(this->coordinate_permutations,
8209  std::pair<mj_lno_t, mj_lno_t>(
8210  coordinate_begin,
8211  coordinate_begin + part_size));
8212  Kokkos::deep_copy(subview_new_coordinate_permutations,
8213  subview_coordinate_permutations);
8214  }
8215  cut_shift += num_parts - 1;
8216  output_array_shift += num_parts;
8217  partweight_array_shift += (2 * (num_parts - 1) + 1);
8218  }
8219 
8220  // shift cut coordinates so that all cut coordinates are stored.
8221  // no shift now because we dont keep the cuts.
8222  // current_cut_coordinates += cut_shift;
8223  // mj_create_new_partitions from coordinates partitioned the parts
8224  // and write the indices as if there were a single part.
8225  // now we need to shift the beginning indices.
8226  for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8227  mj_part_t num_parts =
8228  host_num_partitioning_in_current_dim(current_work_part + kk);
8229 
8230  // These two kernels are a bit awkward but need broader redesign to
8231  // avoid this situation.
8232  auto local_new_part_xadj = this->new_part_xadj;
8233  Kokkos::parallel_for(
8234  Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8235  (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8236  local_new_part_xadj(output_part_index+ii) +=
8237  output_coordinate_end_index;
8238  });
8239 
8240  // increase the previous count by current end.
8241  mj_part_t temp_get;
8242  Kokkos::parallel_reduce("Read single",
8243  Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8244  KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8245  set_single =
8246  local_new_part_xadj(output_part_index + num_parts - 1);
8247  }, temp_get);
8248  output_coordinate_end_index = temp_get;
8249  //increase the current out.
8250  output_part_index += num_parts;
8251  }
8252  }
8253  }
8254 
8255  // end of this partitioning dimension
8256  int current_world_size = this->comm->getSize();
8257  long migration_reduce_all_population =
8258  this->total_dim_num_reduce_all * current_world_size;
8259  bool is_migrated_in_current_dimension = false;
8260 
8261  // we migrate if there are more partitionings to be done after this step
8262  // and if the migration is not forced to be avoided.
8263  // and the operation is not sequential.
8264  if(future_num_parts > 1 &&
8265  this->check_migrate_avoid_migration_option >= 0 &&
8266  current_world_size > 1) {
8267  this->mj_env->timerStart(MACRO_TIMERS,
8268  mj_timer_base_string + "Problem_Migration-" + istring);
8269  mj_part_t num_parts = output_part_count_in_dimension;
8270 
8271  if(this->mj_perform_migration(
8272  num_parts,
8273  current_num_parts, //output
8274  next_future_num_parts_in_parts, //output
8275  output_part_begin_index,
8276  migration_reduce_all_population,
8277  this->num_global_coords / (future_num_parts * current_num_parts),
8278  istring,
8279  input_part_boxes, output_part_boxes) )
8280  {
8281  is_migrated_in_current_dimension = true;
8282  is_data_ever_migrated = true;
8283  this->mj_env->timerStop(MACRO_TIMERS,
8284  mj_timer_base_string + "Problem_Migration-" + istring);
8285  // since data is migrated, we reduce the number of reduceAll
8286  // operations for the last part.
8287  this->total_dim_num_reduce_all /= num_parts;
8288  }
8289  else {
8290  is_migrated_in_current_dimension = false;
8291  this->mj_env->timerStop(MACRO_TIMERS,
8292  mj_timer_base_string + "Problem_Migration-" + istring);
8293  }
8294  }
8295 
8296  // swap the coordinate permutations for the next dimension.
8297  Kokkos::View<mj_lno_t*, device_t> tmp =
8298  this->coordinate_permutations;
8299  this->coordinate_permutations =
8300  this->new_coordinate_permutations;
8301 
8302  this->new_coordinate_permutations = tmp;
8303  if(!is_migrated_in_current_dimension) {
8304  this->total_dim_num_reduce_all -= current_num_parts;
8305  current_num_parts = output_part_count_in_dimension;
8306  }
8307 
8308  {
8309  this->part_xadj = this->new_part_xadj;
8310  local_part_xadj = this->new_part_xadj;
8311  this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8312  Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8313 
8314  this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8315  this->mj_env->timerStop(MACRO_TIMERS,
8316  mj_timer_base_string + "Problem_Partitioning_" + istring);
8317  }
8318  }
8319 
8320  // Partitioning is done
8321  delete future_num_part_in_parts;
8322  delete next_future_num_parts_in_parts;
8323  this->mj_env->timerStop(MACRO_TIMERS,
8324  mj_timer_base_string + "Problem_Partitioning");
8326 
8327  //get the final parts of each initial coordinate
8328  //the results will be written to
8329  //this->assigned_part_ids for gnos given in this->current_mj_gnos
8330  this->set_final_parts(
8331  current_num_parts,
8332  output_part_begin_index,
8333  output_part_boxes,
8334  is_data_ever_migrated);
8335 
8336  result_assigned_part_ids_ = this->assigned_part_ids;
8337  result_mj_gnos_ = this->current_mj_gnos;
8338  this->mj_env->timerStop(MACRO_TIMERS,
8339  mj_timer_base_string + "Total");
8340  this->mj_env->debug(3, "Out of MultiJagged");
8341 }
8342 
8343 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8344  typename mj_part_t, typename mj_node_t>
8345 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8346  mj_partBoxVector_t>
8348  get_kept_boxes() const
8349 {
8350  if(this->mj_keep_part_boxes) {
8351  return this->kept_boxes;
8352  }
8353  else {
8354  throw std::logic_error("Error: part boxes are not stored.");
8355  }
8356 }
8357 
8358 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8359  typename mj_part_t, typename mj_node_t>
8360 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8361  mj_partBoxVector_t>
8363  compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8364 {
8365  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8366  mj_part_t ntasks = this->num_global_parts;
8367  int dim = (*localPartBoxes)[0].getDim();
8368  coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8369 
8370  memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8371 
8372  coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8373  memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8374 
8375  coord_t *localPartMins = localPartBoundaries;
8376  coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8377 
8378  coord_t *globalPartMins = globalPartBoundaries;
8379  coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8380 
8381  mj_part_t boxCount = localPartBoxes->size();
8382  for(mj_part_t i = 0; i < boxCount; ++i) {
8383  mj_part_t pId = (*localPartBoxes)[i].getpId();
8384 
8385  // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8386 
8387  coord_t *lmins = (*localPartBoxes)[i].getlmins();
8388  coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8389 
8390  for(int j = 0; j < dim; ++j) {
8391  localPartMins[dim * pId + j] = lmins[j];
8392  localPartMaxs[dim * pId + j] = lmaxs[j];
8393 
8394  /*
8395  std::cout << "me:" << comm->getRank() <<
8396  " dim * pId + j:"<< dim * pId + j <<
8397  " localMin:" << localPartMins[dim * pId + j] <<
8398  " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8399  */
8400  }
8401  }
8402 
8403  Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8404 
8405  reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8406  ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8407 
8408  RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8409  for(mj_part_t i = 0; i < ntasks; ++i) {
8411  globalPartMins + dim * i,
8412  globalPartMaxs + dim * i);
8413 
8414  /*
8415  for(int j = 0; j < dim; ++j) {
8416  std::cout << "me:" << comm->getRank() <<
8417  " dim * pId + j:"<< dim * i + j <<
8418  " globalMin:" << globalPartMins[dim * i + j] <<
8419  " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8420  }
8421  */
8422 
8423  pB->push_back(tpb);
8424  }
8425  delete []localPartBoundaries;
8426  delete []globalPartBoundaries;
8427  //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8428  return pB;
8429 }
8430 
8433 template <typename Adapter>
8434 class Zoltan2_AlgMJ : public Algorithm<Adapter>
8435 {
8436 
8437 private:
8438 
8439 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8440  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
8441 
8442  // For coordinates and weights, MJ needs floats or doubles
8443  // But Adapter can provide other scalars, e.g., ints.
8444  // So have separate scalar_t for MJ and adapter.
8445  typedef typename Adapter::scalar_t adapter_scalar_t;
8446 
8447  // Provide a default type for mj_scalar_t;
8448  typedef float default_mj_scalar_t;
8449 
8450  // If Adapter provided float or double scalar_t, use it (prevents copies).
8451  // Otherwise, use the default type of mj_scalar_t;
8452  typedef typename
8453  std::conditional<
8454  (std::is_same<adapter_scalar_t, float>::value ||
8455  std::is_same<adapter_scalar_t, double>::value),
8456  adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8457 
8458  typedef typename Adapter::gno_t mj_gno_t;
8459  typedef typename Adapter::lno_t mj_lno_t;
8460  typedef typename Adapter::part_t mj_part_t;
8461  typedef typename Adapter::node_t mj_node_t;
8462  typedef coordinateModelPartBox mj_partBox_t;
8463  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8464  typedef typename mj_node_t::device_type device_t;
8465 #endif
8466 
8468 
8469  RCP<const Environment> mj_env; // the environment object
8470  RCP<const Comm<int> > mj_problemComm; // initial comm object
8471  RCP<const coordinateModel_t> mj_coords; // coordinate adapter
8472 
8473  // PARAMETERS
8474  double imbalance_tolerance; // input imbalance tolerance.
8475 
8476  int num_teams; // how many teams to run main loop with
8477 
8478  size_t num_global_parts; // the targeted number of parts
8479 
8480  // input part array specifying num part to divide along each dim.
8481  Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8482 
8483  // the number of steps that partitioning will be solved in.
8484  int recursion_depth;
8485 
8486  int coord_dim; // coordinate dimension.
8487  mj_lno_t num_local_coords; //number of local coords.
8488  mj_gno_t num_global_coords; //number of global coords.
8489 
8490  // initial global ids of the coordinates.
8491  Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8492 
8493  // two dimension coordinate array.
8494  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8495  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8496  mj_coordinates;
8497 
8498  int num_weights_per_coord; // number of weights per coordinate
8499 
8500  // if the target parts are uniform.
8501  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8502 
8503  // two dimensional weight array.
8504  Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8505 
8506  // if the target parts are uniform
8507  Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8508 
8509  // Nonuniform first level partitioning
8510  // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8511  // machine coordinates and application coordinates.
8512  // An optimization that completely partitions the most important machine
8513  // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8514  // coordinate). The standard MJ alg follows after the nonuniform first level
8515  // partitioning.
8516  // If used, number of parts for the first level partitioning
8517  mj_part_t num_first_level_parts;
8518 
8519  // If used, the distribution of parts for the nonuniform
8520  // first level partitioning
8521  Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8522 
8523  // if partitioning can distribute points on same coordiante to
8524  // different parts.
8525  bool distribute_points_on_cut_lines;
8526 
8527  // how many parts we can calculate concurrently.
8528  mj_part_t max_concurrent_part_calculation;
8529 
8530  // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8531  int check_migrate_avoid_migration_option;
8532 
8533  // when doing the migration, 0 will aim for perfect load-imbalance,
8534  int migration_type;
8535 
8536  // 1 for minimized messages
8537 
8538  // when MJ decides whether to migrate, the minimum imbalance for migration.
8539  double minimum_migration_imbalance;
8540  bool mj_keep_part_boxes; //if the boxes need to be kept.
8541 
8542  // if this is set, then recursion depth is adjusted to its maximum value.
8543  bool mj_run_as_rcb;
8544  int mj_premigration_option;
8545  int min_coord_per_rank_for_premigration;
8546 
8547  // communication graph xadj
8548  ArrayRCP<mj_part_t> comXAdj_;
8549 
8550  // communication graph adj.
8551  ArrayRCP<mj_part_t> comAdj_;
8552 
8553  void copy(
8554  const RCP<PartitioningSolution<Adapter> >&solution);
8555 
8556  void set_input_parameters(const Teuchos::ParameterList &p);
8557 
8558  RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8559 
8560  bool mj_premigrate_to_subset(
8561  int used_num_ranks,
8562  int migration_selection_option,
8563  RCP<const Environment> mj_env_,
8564  RCP<const Comm<int> > mj_problemComm_,
8565  int coord_dim_,
8566  mj_lno_t num_local_coords_,
8567  mj_gno_t num_global_coords_, size_t num_global_parts_,
8568  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8569  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8570  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8571  mj_coordinates_,
8572  int num_weights_per_coord_,
8573  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8574  //results
8575  RCP<const Comm<int> > &result_problemComm_,
8576  mj_lno_t & result_num_local_coords_,
8577  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8578  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8579  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8580  result_mj_coordinates_,
8581  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8582  int * &result_actual_owner_rank_);
8583 
8584 public:
8585 
8586  Zoltan2_AlgMJ(const RCP<const Environment> &env,
8587  RCP<const Comm<int> > &problemComm,
8588  const RCP<const coordinateModel_t> &coords) :
8589  mj_partitioner(),
8590  mj_env(env),
8591  mj_problemComm(problemComm),
8592  mj_coords(coords),
8593  imbalance_tolerance(0),
8594  num_teams(0),
8595  num_global_parts(1),
8596  recursion_depth(0),
8597  coord_dim(0),
8598  num_local_coords(0),
8599  num_global_coords(0),
8600  num_weights_per_coord(0),
8601  num_first_level_parts(1),
8602  distribute_points_on_cut_lines(true),
8603  max_concurrent_part_calculation(1),
8604  check_migrate_avoid_migration_option(0),
8605  migration_type(0),
8606  minimum_migration_imbalance(0.30),
8607  mj_keep_part_boxes(false),
8608  mj_run_as_rcb(false),
8609  mj_premigration_option(0),
8610  min_coord_per_rank_for_premigration(32000),
8611  comXAdj_(),
8612  comAdj_()
8613  {
8614  }
8615 
8617  {
8618  }
8619 
8622  static void getValidParameters(ParameterList & pl)
8623  {
8624  const bool bUnsorted = true; // this clarifies the flag is for unsrorted
8625  RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8626  Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8627  pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8628  "algorithm. As many as the dimension count.", mj_parts_Validator);
8629 
8630  pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8631  "coordinates will be calculated concurently.",
8633 
8634  pl.set("mj_minimum_migration_imbalance", 1.1,
8635  "mj_minimum_migration_imbalance, the minimum imbalance of the "
8636  "processors to avoid migration",
8638 
8639  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8640  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8641  pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8642  "depending on the imbalance, 1 for forcing migration, 2 for "
8643  "avoiding migration", mj_migration_option_validator);
8644 
8645  RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8646  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8647  pl.set("mj_migration_type", 0,
8648  "Migration type, 0 for migration to minimize the imbalance "
8649  "1 for migration to minimize messages exchanged the migration.",
8650  mj_migration_option_validator);
8651 
8652  // bool parameter
8653  pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8654  "geometric partitioning.", Environment::getBoolValidator());
8655 
8656  // bool parameter
8657  pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8659 
8660  pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8661  "greater than 0.", Environment::getAnyIntValidator());
8662 
8663  RCP<Teuchos::EnhancedNumberValidator<int>>
8664  mj_num_teams_validator =
8665  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8666  0, Teuchos::EnhancedNumberTraits<int>::max()) );
8667  pl.set("mj_num_teams", 0,
8668  "How many teams for the main kernel loop"
8669  , mj_num_teams_validator);
8670 
8671  RCP<Teuchos::EnhancedNumberValidator<int>>
8672  mj_premigration_option_validator =
8673  Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8674 
8675  pl.set("mj_premigration_option", 0,
8676  "Whether to do premigration or not. 0 for no migration "
8677  "x > 0 for migration to consecutive processors, "
8678  "the subset will be 0,x,2x,3x,...subset ranks."
8679  , mj_premigration_option_validator);
8680 
8681  pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8682  "assign each rank in multijagged after premigration"
8684  }
8685 
8691  void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8692 
8693  mj_partBoxVector_t &getPartBoxesView() const
8694  {
8695  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8696  return *pBoxes;
8697  }
8698 
8699  mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8700 
8701  void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8702  size_t &nPartsFound, mj_part_t **partsFound) const;
8703 
8706  void getCommunicationGraph(
8707  const PartitioningSolution<Adapter> *solution,
8708  ArrayRCP<mj_part_t> &comXAdj,
8709  ArrayRCP<mj_part_t> &comAdj);
8710 
8711  void set_up_partitioning_data( // public for CUDA
8712  const RCP<PartitioningSolution<Adapter> >&solution);
8713 
8714  private:
8715  std::string timer_base_string; // used for making timers
8716 
8717  // After loading views from coordinate adapter we may need to copy them
8718  // if mj type is different, but otherwise we just want to assign the view.
8719  // So purpose of this code is to make that assign only happen when the types
8720  // match. The empty case would otherwise not compile.
8721  // If they don't match the internal code handles allocating the new view
8722  // and copying the elements. See the test Zoltan2_mj_int_coordinates.
8723  template<class dst_t, class src_t> // version for same types
8724  typename std::enable_if<std::is_same<typename dst_t::value_type,
8725  typename src_t::value_type>::value>::type
8726  assign_if_same(dst_t & dst, const src_t & src) {
8727  dst = src;
8728  }
8729  template<class dst_t, class src_t> // version for different types
8730  typename std::enable_if<!std::is_same<typename dst_t::value_type,
8731  typename src_t::value_type>::value>::type
8732  assign_if_same(dst_t & dst, const src_t & src) {
8733  // do nothing - handled manually
8734  }
8735 };
8736 
8737 template <typename Adapter>
8738 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8739  int used_num_ranks,
8740  int migration_selection_option,
8741  RCP<const Environment> mj_env_,
8742  RCP<const Comm<int> > mj_problemComm_,
8743  int coord_dim_,
8744  mj_lno_t num_local_coords_,
8745  mj_gno_t num_global_coords_, size_t num_global_parts_,
8746  Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8747  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8748  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8749  int num_weights_per_coord_,
8750  Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8751  //results
8752  RCP<const Comm<int> > & result_problemComm_,
8753  mj_lno_t &result_num_local_coords_,
8754  Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8755  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8756  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8757  result_mj_coordinates_,
8758  Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8759  int * &result_actual_owner_rank_)
8760 {
8761  mj_env_->timerStart(MACRO_TIMERS,
8762  timer_base_string + "PreMigration DistributorPlanCreating");
8763 
8764  int myRank = mj_problemComm_->getRank();
8765  int worldSize = mj_problemComm_->getSize();
8766 
8767  mj_part_t groupsize = worldSize / used_num_ranks;
8768 
8769  std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8770 
8771  mj_part_t i_am_sending_to = 0;
8772  bool am_i_a_receiver = false;
8773 
8774  for(int i = 0; i < used_num_ranks; ++i) {
8775  group_begins[i+ 1] = group_begins[i] + groupsize;
8776  if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8777  if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8778  if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8779  i_am_sending_to = group_begins[i];
8780  }
8781  if(myRank == group_begins[i]) {
8782  am_i_a_receiver = true;
8783  }
8784  }
8785 
8786  ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8787  result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8788 
8789  Tpetra::Distributor distributor(mj_problemComm_);
8790 
8791  std::vector<mj_part_t>
8792  coordinate_destinations(num_local_coords_, i_am_sending_to);
8793 
8794  ArrayView<const mj_part_t>
8795  destinations(&(coordinate_destinations[0]), num_local_coords_);
8796  mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8797  result_num_local_coords_ = num_incoming_gnos;
8798  mj_env_->timerStop(MACRO_TIMERS,
8799  timer_base_string + "PreMigration DistributorPlanCreating");
8800 
8801  mj_env_->timerStart(MACRO_TIMERS,
8802  timer_base_string + "PreMigration DistributorMigration");
8803 
8804  // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8805 
8806  // migrate gnos.
8807  {
8808  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
8809  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_initial_mj_gnos(
8810  Kokkos::ViewAllocateWithoutInitializing("host_initial_mj_gnos"),
8811  initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8812  Kokkos::deep_copy(host_initial_mj_gnos, initial_mj_gnos_);
8813  ArrayView<const mj_gno_t> sent_gnos(host_initial_mj_gnos.data(),
8814  num_local_coords_);
8815  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
8816  result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8817  Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8818  num_incoming_gnos);
8819  auto host_result_initial_mj_gnos_ = Kokkos::create_mirror_view(
8820  result_initial_mj_gnos_);
8821  memcpy(host_result_initial_mj_gnos_.data(),
8822  received_gnos.getRawPtr(), num_incoming_gnos * sizeof(mj_gno_t));
8823  Kokkos::deep_copy(result_initial_mj_gnos_, host_result_initial_mj_gnos_);
8824  }
8825 
8826  // migrate coordinates
8827  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8828  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8829  Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8830  num_incoming_gnos, this->coord_dim);
8831  auto host_dst_coordinates = Kokkos::create_mirror_view(
8832  dst_coordinates);
8833  auto host_src_coordinates =
8834  Kokkos::create_mirror_view(Kokkos::HostSpace(), this->mj_coordinates);
8835  Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8836  for(int i = 0; i < this->coord_dim; ++i) {
8837  auto sub_host_src_coordinates
8838  = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8839  auto sub_host_dst_coordinates
8840  = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
8841  // Note Layout Left means we can do these in contiguous blocks
8842  ArrayView<mj_scalar_t> sent_coord(
8843  sub_host_src_coordinates.data(), this->num_local_coords);
8844  ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
8845  distributor.doPostsAndWaits<mj_scalar_t>(
8846  sent_coord, 1, received_coord());
8847  memcpy(sub_host_dst_coordinates.data(),
8848  received_coord.getRawPtr(), num_incoming_gnos * sizeof(mj_scalar_t));
8849  }
8850  deep_copy(dst_coordinates, host_dst_coordinates);
8851  result_mj_coordinates_ = dst_coordinates;
8852 
8853  // migrate weights.
8854  Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8855  Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8856  num_incoming_gnos, this->num_weights_per_coord);
8857  auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8858  auto host_src_weights = Kokkos::create_mirror_view(this->mj_weights);
8859  Kokkos::deep_copy(host_src_weights, this->mj_weights);
8860  for(int i = 0; i < this->num_weights_per_coord; ++i) {
8861  auto sub_host_src_weights
8862  = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8863  auto sub_host_dst_weights
8864  = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8865  ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
8866 
8867  // Layout Right means these are not contiguous
8868  // However we don't have any systems setup with more than 1 weight so
8869  // really I have not tested any of this code with num weights > 1.
8870  // I think this is the right thing to do. Note that there are other
8871  // places in the code which don't handle the possibility of more weights.
8872  // So evaluating all that and adding tests would be another project.
8873  for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8874  sent_weight[n] = sub_host_src_weights(n);
8875  }
8876  ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
8877  distributor.doPostsAndWaits<mj_scalar_t>(
8878  sent_weight(), 1, received_weight());
8879 
8880  // Again we copy by index due to layout
8881  for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8882  sub_host_dst_weights(n) = received_weight[n];
8883  }
8884  }
8885  Kokkos::deep_copy(dst_weights, host_dst_weights);
8886  result_mj_weights_ = dst_weights;
8887 
8888  // migrate the owners of the coordinates
8889  {
8890  std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
8891  ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
8892  ArrayRCP<int> received_owners(num_incoming_gnos);
8893  distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
8894  result_actual_owner_rank_ = new int[num_incoming_gnos];
8895  memcpy(
8896  result_actual_owner_rank_,
8897  received_owners.getRawPtr(),
8898  num_incoming_gnos * sizeof(int));
8899  }
8900 
8901  mj_env_->timerStop(MACRO_TIMERS,
8902  timer_base_string + "PreMigration DistributorMigration");
8903  return am_i_a_receiver;
8904 }
8905 
8913 template <typename Adapter>
8915  const RCP<PartitioningSolution<Adapter> > &solution)
8916 {
8917  // purpose of this code is to validate node and UVM status for the tests
8918  // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8919  // << "Execution Space: " << mj_node_t::execution_space::name()
8920  // << std::endl;
8921 
8922  int execute_counter =
8924  timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8925 
8926  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8927  {
8928  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8929 
8930  this->set_up_partitioning_data(solution);
8931 
8932  this->set_input_parameters(this->mj_env->getParameters());
8933  if(this->mj_keep_part_boxes) {
8934  this->mj_partitioner.set_to_keep_part_boxes();
8935  }
8936 
8937  this->mj_partitioner.set_partitioning_parameters(
8938  this->distribute_points_on_cut_lines,
8939  this->max_concurrent_part_calculation,
8940  this->check_migrate_avoid_migration_option,
8941  this->minimum_migration_imbalance, this->migration_type);
8942 
8943  RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8944  mj_lno_t result_num_local_coords = this->num_local_coords;
8945  Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8946  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8947  Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8948  result_mj_coordinates = this->mj_coordinates;
8949  Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8950  this->mj_weights;
8951  int *result_actual_owner_rank = NULL;
8952 
8953  Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8954  this->initial_mj_gnos;
8955 
8956  // TODO: MD 08/2017: Further discussion is required.
8957  // MueLu calls MJ when it has very few coordinates per processors,
8958  // such as 10. For example, it begins with 1K processor with 1K coordinate
8959  // in each. Then with coarsening this reduces to 10 coordinate per procesor.
8960  // It calls MJ to repartition these to 10 coordinates.
8961  // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8962  // 10 parts. As expected strong scaling is problem here, because
8963  // computation is almost 0, and communication cost of MJ linearly increases.
8964  // Premigration option gathers the coordinates to 10 parts before MJ starts
8965  // therefore MJ will run with a smalller subset of the problem.
8966  // Below, I am migrating the coordinates if mj_premigration_option is set,
8967  // and the result parts are less than the current part count, and the
8968  // average number of local coordinates is less than some threshold.
8969  // For example, premigration may not help if 1000 processors are
8970  // partitioning data to 10, but each of them already have 1M coordinate.
8971  // In that case, we premigration would not help.
8972  int current_world_size = this->mj_problemComm->getSize();
8973  mj_lno_t threshold_num_local_coords =
8974  this->min_coord_per_rank_for_premigration;
8975  bool is_pre_migrated = false;
8976  bool am_i_in_subset = true;
8977 
8978  // Note that we need to add testing for migration and should also cover the
8979  // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8980  // Currently did a minimal test of this code by running mjTest with
8981  // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8982  if(mj_premigration_option > 0 &&
8983  size_t (current_world_size) > this->num_global_parts &&
8984  this->num_global_coords < mj_gno_t (
8985  current_world_size * threshold_num_local_coords))
8986  {
8987  if(this->mj_keep_part_boxes) {
8988  throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8989  "mj_premigration_option are not supported together yet.");
8990  }
8991 
8992  is_pre_migrated =true;
8993  int migration_selection_option = mj_premigration_option;
8994  if(migration_selection_option * this->num_global_parts >
8995  (size_t) (current_world_size)) {
8996  migration_selection_option =
8997  current_world_size / this->num_global_parts;
8998  }
8999 
9000  int used_num_ranks = int (this->num_global_coords /
9001  float (threshold_num_local_coords) + 0.5);
9002 
9003  if(used_num_ranks == 0) {
9004  used_num_ranks = 1;
9005  }
9006 
9007  am_i_in_subset = this->mj_premigrate_to_subset(
9008  used_num_ranks,
9009  migration_selection_option,
9010  this->mj_env,
9011  this->mj_problemComm,
9012  this->coord_dim,
9013  this->num_local_coords,
9014  this->num_global_coords,
9015  this->num_global_parts,
9016  this->initial_mj_gnos,
9017  this->mj_coordinates,
9018  this->num_weights_per_coord,
9019  this->mj_weights,
9020  //results
9021  result_problemComm,
9022  result_num_local_coords,
9023  result_initial_mj_gnos,
9024  result_mj_coordinates,
9025  result_mj_weights,
9026  result_actual_owner_rank);
9027 
9028  result_initial_mj_gnos_ = result_initial_mj_gnos;
9029  }
9030 
9031  Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9032  Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9033 
9034  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9035 
9036  if(am_i_in_subset) {
9037  this->mj_partitioner.multi_jagged_part(
9038  this->mj_env,
9039  result_problemComm, //this->mj_problemComm,
9040  this->imbalance_tolerance,
9041  this->num_teams,
9042  this->num_global_parts,
9043  this->part_no_array,
9044  this->recursion_depth,
9045  this->coord_dim,
9046  result_num_local_coords, //this->num_local_coords,
9047  this->num_global_coords,
9048  result_initial_mj_gnos_,
9049  result_mj_coordinates,
9050  this->num_weights_per_coord,
9051  this->mj_uniform_weights,
9052  result_mj_weights,
9053  this->mj_uniform_parts,
9054  result_assigned_part_ids,
9055  result_mj_gnos
9056  );
9057  }
9058 
9059  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9060 
9061  // Reorder results so that they match the order of the input
9062  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9063  localGidToLid.reserve(result_num_local_coords);
9064  Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9065  Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9066  result_initial_mj_gnos_.size());
9067  Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9068  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9069  localGidToLid[host_result_initial_mj_gnos(i)] = i;
9070  }
9071 
9072  ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9073  0, result_num_local_coords, true);
9074  auto host_result_assigned_part_ids =
9075  Kokkos::create_mirror_view(result_assigned_part_ids);
9076  Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9077  auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9078  Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9079  for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9080  mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9081  partId[origLID] = host_result_assigned_part_ids(i);
9082  }
9083 
9084  //now the results are reordered. but if premigration occured,
9085  //then we need to send these ids to actual owners again.
9086  if(is_pre_migrated) {
9087  this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9088  "PostMigration DistributorPlanCreating");
9089  Tpetra::Distributor distributor(this->mj_problemComm);
9090  ArrayView<const mj_part_t> actual_owner_destinations(
9091  result_actual_owner_rank , result_num_local_coords);
9092  mj_lno_t num_incoming_gnos = distributor.createFromSends(
9093  actual_owner_destinations);
9094  if(num_incoming_gnos != this->num_local_coords) {
9095  throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9096  "num incoming is not equal to num local coords");
9097  }
9098 
9099  mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9100  "PostMigration DistributorPlanCreating");
9101  mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9102  "PostMigration DistributorMigration");
9103  ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
9104  ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
9105  {
9106  ArrayView<const mj_gno_t> sent_gnos(host_result_initial_mj_gnos.data(),
9107  result_num_local_coords);
9108  distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
9109  }
9110 
9111  {
9112  ArrayView<mj_part_t> sent_partnos(partId());
9113  distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1,
9114  received_partids());
9115  }
9116 
9117  partId = arcp(new mj_part_t[this->num_local_coords],
9118  0, this->num_local_coords, true);
9119 
9120  {
9121  std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9122  localGidToLid2.reserve(this->num_local_coords);
9123  auto host_initial_mj_gnos =
9124  Kokkos::create_mirror_view(this->initial_mj_gnos);
9125  Kokkos::deep_copy(host_initial_mj_gnos,
9126  this->initial_mj_gnos);
9127  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9128  localGidToLid2[host_initial_mj_gnos(i)] = i;
9129  }
9130 
9131  for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132  mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9133  partId[origLID] = received_partids[i];
9134  }
9135  }
9136 
9137  {
9138  delete [] result_actual_owner_rank;
9139  }
9140  mj_env->timerStop(MACRO_TIMERS,
9141  timer_base_string + "PostMigration DistributorMigration");
9142  }
9143  solution->setParts(partId);
9144  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9145  }
9146 
9147  this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9148 }
9149 
9150 /* \brief Sets the partitioning data for multijagged algorithm.
9151  * */
9152 template <typename Adapter>
9154  const RCP<PartitioningSolution<Adapter> > &solution
9155 )
9156 {
9157  this->coord_dim = this->mj_coords->getCoordinateDim();
9158  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
9159  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
9160  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
9161  int criteria_dim = (this->num_weights_per_coord ?
9162  this->num_weights_per_coord : 1);
9163  // From the Solution we get part information.
9164  // If the part sizes for a given criteria are not uniform,
9165  // then they are values that sum to 1.0.
9166  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9167  // allocate only two dimensional pointer.
9168  // raw pointer addresess will be obtained from multivector.
9169  this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9170  "uniform parts", criteria_dim);
9171  this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9172  "uniform weights", criteria_dim);
9173 
9174  Kokkos::View<const mj_gno_t *, device_t> gnos;
9175  Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9176  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9177  Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9178  this->mj_coords->getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9179  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9180  Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9181  Kokkos::View<mj_scalar_t **, device_t> wgts;
9182 
9183  // Now we must get the data from the adapter.
9184  // If the types match we point to the view but if not, we must copy.
9185  if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9186  // we can just point the views but we must specialize because this code
9187  // only compiles in this case - for is_same false assign does nothing.
9188  assign_if_same(xyz, xyz_adapter);
9189  assign_if_same(wgts, wgts_adapter);
9190  }
9191  else {
9192  // we only allocate a new view if we are going to copy
9193  // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9194  xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9195  (Kokkos::ViewAllocateWithoutInitializing(
9196  "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9197  wgts = Kokkos::View<mj_scalar_t **, device_t>(
9198  Kokkos::ViewAllocateWithoutInitializing("wgts"),
9199  wgts_adapter.extent(0), wgts_adapter.extent(1));
9200 
9201  typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9202  Kokkos::parallel_for(
9203  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9204  (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9205  for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9206  xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9207  }
9208  });
9209  Kokkos::parallel_for(
9210  Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9211  (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9212  for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9213  wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9214  }
9215  });
9216  }
9217 
9218  // obtain global ids.
9219  this->initial_mj_gnos = gnos;
9220  // extract coordinates from multivector.
9221  this->mj_coordinates = xyz;
9222  // if no weights are provided set uniform weight.
9223 
9224  if(this->num_weights_per_coord == 0) {
9225  this->mj_uniform_weights(0) = true;
9226  Kokkos::resize(this->mj_weights, 0, 0);
9227  }
9228  else{
9229  this->mj_weights = wgts;
9230  for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9231  this->mj_uniform_weights(wdim) = false;
9232  }
9233  }
9234 
9235  for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9236  if(solution->criteriaHasUniformPartSizes(wdim)) {
9237  this->mj_uniform_parts(wdim) = true;
9238  }
9239  else {
9240  printf("Error: MJ does not support non uniform target part weights\n");
9241  std::terminate();
9242  }
9243  }
9244 }
9245 
9246 /* \brief Sets the partitioning parameters for multijagged algorithm.
9247  * \param pl: is the parameter list provided to zoltan2 call
9248  * */
9249 template <typename Adapter>
9251  const Teuchos::ParameterList &pl)
9252 {
9253  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9254  if(pe) {
9255  double tol;
9256  tol = pe->getValue(&tol);
9257  this->imbalance_tolerance = tol - 1.0;
9258  }
9259 
9260  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
9261  if(this->imbalance_tolerance <= 0) {
9262  this->imbalance_tolerance= 10e-4;
9263  }
9264 
9265  // if an input partitioning array is provided.
9266  Kokkos::resize(this->part_no_array, 0);
9267 
9268  // the length of the input partitioning array.
9269  this->recursion_depth = 0;
9270 
9271  if(pl.getPtr<int>("mj_num_teams")) {
9272  this->num_teams = pl.get<int>("mj_num_teams");
9273  }
9274 
9275  if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9276  auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9277  int mj_parts_size = static_cast<int>(mj_parts.size());
9278 
9279  // build the view we'll have data on and copy values from host
9280  this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9281  "part_no_array", mj_parts_size);
9282  for(int i = 0; i < mj_parts_size; ++i) {
9283  this->part_no_array(i) = mj_parts.getRawPtr()[i];
9284  }
9285 
9286  this->recursion_depth = mj_parts_size - 1;
9287  this->mj_env->debug(2, "mj_parts provided by user");
9288  }
9289 
9290  // get mj specific parameters.
9291  this->distribute_points_on_cut_lines = true;
9292  this->max_concurrent_part_calculation = 1;
9293 
9294  this->mj_run_as_rcb = false;
9295  this->mj_premigration_option = 0;
9296  this->min_coord_per_rank_for_premigration = 32000;
9297 
9298  int mj_user_recursion_depth = -1;
9299  this->mj_keep_part_boxes = false;
9300  this->check_migrate_avoid_migration_option = 0;
9301  this->migration_type = 0;
9302  this->minimum_migration_imbalance = 0.35;
9303 
9304  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9305  if(pe) {
9306  double imb;
9307  imb = pe->getValue(&imb);
9308  this->minimum_migration_imbalance = imb - 1.0;
9309  }
9310 
9311  pe = pl.getEntryPtr("mj_migration_option");
9312  if(pe) {
9313  this->check_migrate_avoid_migration_option =
9314  pe->getValue(&this->check_migrate_avoid_migration_option);
9315  } else {
9316  this->check_migrate_avoid_migration_option = 0;
9317  }
9318  if(this->check_migrate_avoid_migration_option > 1) {
9319  this->check_migrate_avoid_migration_option = -1;
9320  }
9321 
9323  pe = pl.getEntryPtr("mj_migration_type");
9324  if(pe) {
9325  this->migration_type = pe->getValue(&this->migration_type);
9326  } else {
9327  this->migration_type = 0;
9328  }
9329 
9330  //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9332 
9333  pe = pl.getEntryPtr("mj_concurrent_part_count");
9334  if(pe) {
9335  this->max_concurrent_part_calculation =
9336  pe->getValue(&this->max_concurrent_part_calculation);
9337  } else {
9338  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9339  }
9340 
9341  pe = pl.getEntryPtr("mj_keep_part_boxes");
9342  if(pe) {
9343  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9344  } else {
9345  this->mj_keep_part_boxes = false; // Set to invalid value
9346  }
9347 
9348  // For now, need keep_part_boxes to do pointAssign and boxAssign.
9349  // pe = pl.getEntryPtr("keep_cuts");
9350  // if(pe) {
9351  // int tmp = pe->getValue(&tmp);
9352  // if(tmp) this->mj_keep_part_boxes = true;
9353  // }
9354 
9355  //need to keep part boxes if mapping type is geometric.
9356  if(this->mj_keep_part_boxes == false) {
9357  pe = pl.getEntryPtr("mapping_type");
9358  if(pe) {
9359  int mapping_type = -1;
9360  mapping_type = pe->getValue(&mapping_type);
9361  if(mapping_type == 0) {
9362  mj_keep_part_boxes = true;
9363  }
9364  }
9365  }
9366 
9367  // need to keep part boxes if mapping type is geometric.
9368  pe = pl.getEntryPtr("mj_enable_rcb");
9369  if(pe) {
9370  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9371  } else {
9372  this->mj_run_as_rcb = false; // Set to invalid value
9373  }
9374 
9375  pe = pl.getEntryPtr("mj_premigration_option");
9376  if(pe) {
9377  mj_premigration_option = pe->getValue(&mj_premigration_option);
9378  } else {
9379  mj_premigration_option = 0;
9380  }
9381 
9382  pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9383  if(pe) {
9384  min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9385  } else {
9386  min_coord_per_rank_for_premigration = 32000;
9387  }
9388 
9389  pe = pl.getEntryPtr("mj_recursion_depth");
9390  if(pe) {
9391  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9392  } else {
9393  mj_user_recursion_depth = -1; // Set to invalid value
9394  }
9395 
9396  bool val = false;
9397  pe = pl.getEntryPtr("rectilinear");
9398  if(pe) {
9399  val = pe->getValue(&val);
9400  }
9401  if(val) {
9402  this->distribute_points_on_cut_lines = false;
9403  } else {
9404  this->distribute_points_on_cut_lines = true;
9405  }
9406 
9407  if(this->mj_run_as_rcb) {
9408  mj_user_recursion_depth =
9409  (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9410  }
9411  if(this->recursion_depth < 1) {
9412  if(mj_user_recursion_depth > 0) {
9413  this->recursion_depth = mj_user_recursion_depth;
9414  }
9415  else {
9416  this->recursion_depth = this->coord_dim;
9417  }
9418  }
9419 }
9420 
9422 template <typename Adapter>
9424  int dim,
9425  adapter_scalar_t *lower,
9426  adapter_scalar_t *upper,
9427  size_t &nPartsFound,
9428  typename Adapter::part_t **partsFound) const
9429 {
9430  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9431  // TODO: complexity. Or at least do a search through the boxes, using
9432  // TODO: p x q x r x ... if possible.
9433 
9434  nPartsFound = 0;
9435  *partsFound = NULL;
9436 
9437  if(this->mj_keep_part_boxes) {
9438 
9439  // Get vector of part boxes
9440  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9441 
9442  size_t nBoxes = (*partBoxes).size();
9443  if(nBoxes == 0) {
9444  throw std::logic_error("no part boxes exist");
9445  }
9446 
9447  // Determine whether the box overlaps the globalBox at all
9448  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9449 
9450  if(globalBox->boxesOverlap(dim, lower, upper)) {
9451 
9452  std::vector<typename Adapter::part_t> partlist;
9453 
9454  // box overlaps the global box; find specific overlapping boxes
9455  for(size_t i = 0; i < nBoxes; i++) {
9456  try {
9457  if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9458  nPartsFound++;
9459  partlist.push_back((*partBoxes)[i].getpId());
9460  /*
9461  std::cout << "Given box (";
9462  for(int j = 0; j < dim; j++)
9463  std::cout << lower[j] << " ";
9464  std::cout << ") x (";
9465  for(int j = 0; j < dim; j++)
9466  std::cout << upper[j] << " ";
9467  std::cout << ") overlaps PartBox "
9468  << (*partBoxes)[i].getpId() << " (";
9469  for(int j = 0; j < dim; j++)
9470  std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9471  std::cout << ") x (";
9472  for(int j = 0; j < dim; j++)
9473  std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9474  std::cout << ")" << std::endl;
9475  */
9476  }
9477  }
9479  }
9480  if(nPartsFound) {
9481  *partsFound = new mj_part_t[nPartsFound];
9482  for(size_t i = 0; i < nPartsFound; i++)
9483  (*partsFound)[i] = partlist[i];
9484  }
9485  }
9486  else {
9487  // Box does not overlap the domain at all. Find the closest part
9488  // Not sure how to perform this operation for MJ without having the
9489  // cuts. With the RCB cuts, the concept of a part extending to
9490  // infinity was natural. With the boxes, it is much more difficult.
9491  // TODO: For now, return information indicating NO OVERLAP.
9492  }
9493  }
9494  else {
9495  throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9496  }
9497 }
9498 
9500 template <typename Adapter>
9502  int dim,
9503  adapter_scalar_t *point) const
9504 {
9505  // TODO: Implement with cuts rather than boxes to reduce algorithmic
9506  // TODO: complexity. Or at least do a search through the boxes, using
9507  // TODO: p x q x r x ... if possible.
9508 
9509  if(this->mj_keep_part_boxes) {
9510  typename Adapter::part_t foundPart = -1;
9511 
9512  // Get vector of part boxes
9513  RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9514 
9515  size_t nBoxes = (*partBoxes).size();
9516  if(nBoxes == 0) {
9517  throw std::logic_error("no part boxes exist");
9518  }
9519 
9520  // Determine whether the point is within the global domain
9521  RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9522 
9523  if(globalBox->pointInBox(dim, point)) {
9524 
9525  // point is in the global domain; determine in which part it is.
9526  size_t i;
9527  for(i = 0; i < nBoxes; i++) {
9528  try {
9529  if((*partBoxes)[i].pointInBox(dim, point)) {
9530  foundPart = (*partBoxes)[i].getpId();
9531  // std::cout << "Point (";
9532  // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9533  // std::cout << ") found in box " << i << " part " << foundPart
9534  // << std::endl;
9535  // (*partBoxes)[i].print();
9536  break;
9537  }
9538  }
9540  }
9541 
9542  if(i == nBoxes) {
9543  // This error should never occur
9544  std::ostringstream oss;
9545  oss << "Point (";
9546  for(int j = 0; j < dim; j++) oss << point[j] << " ";
9547  oss << ") not found in domain";
9548  throw std::logic_error(oss.str());
9549  }
9550  }
9551 
9552  else {
9553  // Point is outside the global domain.
9554  // Determine to which part it is closest.
9555  // TODO: with cuts, would not need this special case
9556 
9557  typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9558  size_t closestBox = 0;
9559  coord_t minDistance = std::numeric_limits<coord_t>::max();
9560  coord_t *centroid = new coord_t[dim];
9561  for(size_t i = 0; i < nBoxes; i++) {
9562  (*partBoxes)[i].computeCentroid(centroid);
9563  coord_t sum = 0.;
9564  coord_t diff;
9565  for(int j = 0; j < dim; j++) {
9566  diff = centroid[j] - point[j];
9567  sum += diff * diff;
9568  }
9569  if(sum < minDistance) {
9570  minDistance = sum;
9571  closestBox = i;
9572  }
9573  }
9574  foundPart = (*partBoxes)[closestBox].getpId();
9575  delete [] centroid;
9576  }
9577 
9578  return foundPart;
9579  }
9580  else {
9581  throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9582  }
9583 }
9584 
9585 template <typename Adapter>
9587  const PartitioningSolution<Adapter> *solution,
9588  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9589  ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9590 {
9591  if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9592  RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9593  mj_part_t ntasks = (*pBoxes).size();
9594  int dim = (*pBoxes)[0].getDim();
9595  GridHash grid(pBoxes, ntasks, dim);
9596  grid.getAdjArrays(comXAdj_, comAdj_);
9597  }
9598  comAdj = comAdj_;
9599  comXAdj = comXAdj_;
9600 }
9601 
9602 template <typename Adapter>
9603 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9605 {
9606  return this->mj_partitioner.get_kept_boxes();
9607 }
9608 } // namespace Zoltan2
9609 
9610 #endif
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
#define epsilon
Definition: nd.cpp:82
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.