49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50 #define _ZOLTAN2_ALGMultiJagged_HPP_
59 #include <Tpetra_Distributor.hpp>
60 #include <Teuchos_StandardParameterEntryValidators.hpp>
61 #include <Teuchos_ParameterList.hpp>
62 #include <Kokkos_Sort.hpp>
66 #include <unordered_map>
68 #ifdef ZOLTAN2_USEZOLTANCOMM
69 #ifdef HAVE_ZOLTAN2_MPI
70 #define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71 #include "zoltan_comm_cpp.h"
72 #include "zoltan_types.h"
81 template <
typename Ordinal,
typename T>
105 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const {
106 for(Ordinal i = 0; i < count; i++) {
108 inoutBuffer[i] = inBuffer[i];
124 template <
typename IT,
typename CT,
typename WT>
144 this->index = index_;
145 this->count = count_;
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->index = index_;
155 this->count = count_;
160 assert(this->count == other.
count);
161 for(CT i = 0; i < this->
count; ++i) {
163 if(std::abs(this->val[i] - other.
val[i]) < this->epsilon) {
167 if(this->val[i] < other.
val[i]) {
176 return this->index < other.
index;
182 template <
class IT,
class WT>
193 template <
class IT,
class WT>
197 IT i, ir=n, j, k, l=1;
198 IT jstack=0, istack[50];
205 for(j=l+1;j<=ir;j++) {
208 for(i=j-1;i>=1;i--) {
209 if(arr[i].val <= aval)
222 std::swap(arr[k],arr[l+1]);
223 if(arr[l+1].val > arr[ir].val) {
224 std::swap(arr[l+1],arr[ir]);
226 if(arr[l].val > arr[ir].val) {
227 std::swap(arr[l],arr[ir]);
229 if(arr[l+1].val > arr[l].val) {
230 std::swap(arr[l+1],arr[l]);
237 do i++;
while (arr[i].val < aval);
238 do j--;
while (arr[j].val > aval);
240 std::swap(arr[i],arr[j]);
245 if(jstack > NSTACK) {
246 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
263 template <
class IT,
class WT,
class SIGN>
271 if(this->signbit < rhs.
signbit) {
275 else if(this->signbit == rhs.
signbit) {
276 if(this->val < rhs.
val) {
280 else if(this->val > rhs.
val) {
295 return (this->val == rhs.
val && this->signbit == rhs.
signbit) || (*
this < rhs);
302 template <
class IT,
class WT,
class SIGN>
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[50];
313 for(j=l+1;j<=ir;j++) {
315 for(i=j-1;i>=1;i--) {
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
345 do i++;
while (arr[i] < a);
346 do j--;
while (a < arr[j]);
348 std::swap(arr[i],arr[j]);
353 if(jstack > NSTACK) {
354 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
390 static int counter = 0;
394 static int counter = 0;
401 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
402 typename mj_part_t,
typename mj_node_t>
406 typedef typename mj_node_t::device_type device_t;
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
413 static constexpr
size_t future_reduceall_cutoff = 1500000;
417 static constexpr mj_lno_t min_work_last_dim = 1000;
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr
int significance_mul = 1000;
422 std::string mj_timer_base_string;
424 RCP<const Environment> mj_env;
425 RCP<const Comm<int> > mj_problemComm;
426 RCP<Comm<int> > comm;
427 double imbalance_tolerance;
430 int num_weights_per_coord;
431 size_t initial_num_loc_coords;
433 mj_lno_t num_local_coords;
434 mj_gno_t num_global_coords;
435 mj_scalar_t sEpsilon;
438 bool distribute_points_on_cut_lines;
441 mj_part_t max_concurrent_part_calculation;
444 int mj_user_recursion_depth;
445 bool mj_keep_part_boxes;
448 int check_migrate_avoid_migration_option;
455 double minimum_migration_imbalance;
472 mj_part_t num_first_level_parts;
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
478 mj_part_t total_num_cut ;
479 mj_part_t total_num_part;
481 mj_part_t max_num_part_along_dim ;
482 mj_part_t max_num_cut_along_dim;
485 size_t max_num_total_part_along_dim;
487 mj_part_t total_dim_num_reduce_all;
491 mj_part_t last_dim_num_part;
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
512 size_t num_global_parts;
515 RCP<mj_partBoxVector_t> kept_boxes;
517 RCP<mj_partBox_t> global_box;
522 bool divide_to_prime_first;
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
607 Kokkos::View<double *, device_t>
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
625 Kokkos::View<mj_lno_t *, device_t>
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim;
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) /
static_cast<double>(expected) - 1.0;
660 void set_part_specifications();
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
754 bool mj_perform_migration(
755 mj_part_t in_num_parts,
756 mj_part_t &out_num_parts,
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
787 mj_gno_t *num_points_in_all_processor_parts);
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
876 void assign_send_destinations(
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
897 void assign_send_destinations2(
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
950 void mj_migrate_coords(
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
974 while (n % divisor == 0) {
976 largest_factor = divisor;
979 if(divisor * divisor > n) {
986 return largest_factor;
1019 const RCP<const Environment> &env,
1020 RCP<
const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1073 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1133 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1141 void allocate_set_work_memory();
1144 void compute_global_box();
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1423 bool longest_dim_part,
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1443 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1444 typename mj_part_t,
typename mj_node_t>
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
1451 sEpsilon(std::numeric_limits<mj_scalar_t>::
epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1515 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1516 typename mj_part_t,
typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1543 this->divide_to_prime_first = divide_to_prime_first_;
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1555 this->num_first_level_parts = num_first_level_parts_;
1557 this->first_level_distribution = first_level_distribution_;
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>(
"gids", this->num_local_coords);
1569 this->num_weights_per_coord = 0;
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) =
true;
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>(
"uniform parts", 1);
1580 this->mj_uniform_parts(0) =
true;
1582 this->set_part_specifications();
1584 this->allocate_set_work_memory();
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (
int dummy) {
1591 local_part_xadj(0) =
static_cast<mj_lno_t
>(num_selected_coords);
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1596 mj_part_t current_num_parts = 1;
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1601 mj_part_t future_num_parts = this->total_num_part;
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size(
"view_total_reduction_size", 1);
1625 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1637 next_future_num_parts_in_parts->clear();
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1648 t2, num_ranks_per_node);
1653 if(output_part_count_in_dimension == current_num_parts) {
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1661 std::string istring = std::to_string(rd);
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1670 mj_part_t output_part_index = 0;
1674 mj_part_t output_coordinate_end_index = 0;
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1679 mj_part_t obtained_part_index = 0;
1682 int coordInd = rd % this->coord_dim;
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1696 mj_part_t actual_work_part_count = 0;
1701 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(
int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1722 this->mj_get_local_min_max_coord_totW(
1724 current_concurrent_num_parts,
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1765 current_concurrent_num_parts,
1771 if(actual_work_part_count > 0) {
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1784 mj_part_t total_incomplete_cut_count = 0;
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1817 concurrent_part_cut_shift += partition_count - 1;
1819 concurrent_part_part_shift += partition_count;
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1825 total_incomplete_cut_count += partition_count - 1;
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1835 this->mj_get_initial_cut_coords_target_weights(
1838 partition_count - 1,
1839 global_total_weight,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1856 this->set_initial_coordinate_parts(
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1868 this->incomplete_cut_count(kk) = 0;
1870 obtained_part_index += partition_count;
1875 double used_imbalance = 0;
1879 mj_timer_base_string +
"mj_1D_part()");
1882 mj_current_dim_coords,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1892 mj_timer_base_string +
"mj_1D_part()");
1895 obtained_part_index += current_concurrent_num_parts;
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1904 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1950 process_cut_line_weight_to_put_left.size()));
1952 this->thread_part_weight_work =
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1967 this->create_consistent_chunks(
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (
int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce(
"Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2055 output_coordinate_end_index = get_single;
2057 output_part_index += num_parts;
2064 current_num_parts = output_part_count_in_dimension;
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj);
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2081 for(
size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2092 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2093 typename mj_part_t,
typename mj_node_t>
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2099 return this->global_box;
2104 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2105 typename mj_part_t,
typename mj_node_t>
2106 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2109 this->mj_keep_part_boxes =
true;
2119 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2120 typename mj_part_t,
typename mj_node_t>
2124 this->total_num_cut = 0;
2125 this->total_num_part = 1;
2126 this->max_num_part_along_dim = 0;
2127 this->total_dim_num_reduce_all = 0;
2128 this->last_dim_num_part = 1;
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2140 this->total_num_part = 1;
2141 for(
int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2145 mj_part_t track_max = 0;
2146 for(
int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2158 mj_part_t future_num_parts = this->num_global_parts;
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2169 for(
int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2187 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2196 this->num_global_parts * max_part / sum_first_level_dist;
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2209 future_num_parts = nfutureNumParts;
2211 this->total_num_part = this->num_global_parts;
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2222 for(
int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr <<
"Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2260 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2261 typename mj_part_t,
typename mj_node_t>
2262 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future,
double root)
2265 double fp = pow(num_total_future,
root);
2266 mj_part_t ip = mj_part_t(fp);
2294 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2295 typename mj_part_t,
typename mj_node_t>
2296 mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2318 if(current_part_no_array < 1) {
2319 std::cout <<
"Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration <<
"] is given as:" <<
2321 current_part_no_array << std::endl;
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout <<
"Current recursive iteration: " << current_iteration
2333 <<
" part_no_array[" << current_iteration <<
"] is given as: " <<
2334 current_part_no_array <<
" and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2384 future_num_parts = 1;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr <<
"ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim <<
" this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2426 output_num_parts = this->num_first_level_parts;
2429 future_num_parts /= this->num_first_level_parts;
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2436 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2448 for (
int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2453 else if (this->divide_to_prime_first) {
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2460 output_num_parts += num_partitions_in_current_dim;
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2504 ++num_future_parts_for_part_iii;
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2525 output_num_parts += num_partitions_in_current_dim;
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2543 ++num_future_parts_for_part_iii;
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>(
"test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(
size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2580 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2581 typename mj_part_t,
typename mj_node_t>
2582 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing(
"num_local_coords"),
2601 this->num_local_coords);
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing(
"assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing(
"assigned part ids"),
2608 this->num_local_coords);
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing(
"part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing(
"all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing(
"empty"), 0);
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing(
"process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing(
"global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing(
"cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing(
"target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing(
"cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing(
"cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing(
"global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing(
"is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing(
"device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing(
"thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing(
"thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing(
"thread_point_counts"),
2748 this->max_num_part_along_dim);
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_local_coords);
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2791 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2792 typename mj_part_t,
typename mj_node_t>
2793 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2797 mj_scalar_t *mins =
new mj_scalar_t[this->coord_dim];
2799 mj_scalar_t *gmins =
new mj_scalar_t[this->coord_dim];
2801 mj_scalar_t *maxs =
new mj_scalar_t[this->coord_dim];
2803 mj_scalar_t *gmaxs =
new mj_scalar_t[this->coord_dim];
2805 auto local_mj_coordinates = this->mj_coordinates;
2809 for(
int i = 0; i < this->coord_dim; ++i) {
2814 for(
int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce(
"MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce(
"MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2842 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2856 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2857 typename mj_part_t,
typename mj_node_t>
2858 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2870 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2873 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2884 bool bUniformWeights = mj_uniform_weights(0);
2886 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2899 if(coordinate_begin_index >= coordinate_end_index)
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2907 Kokkos::parallel_reduce(
"get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2916 Kokkos::parallel_reduce(
"get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce(
"get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (
int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2965 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2966 typename mj_part_t,
typename mj_node_t>
2967 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2975 if(this->comm->getSize() > 1) {
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2987 reduceAll<int, mj_scalar_t>(
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3039 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3040 typename mj_part_t,
typename mj_node_t>
3041 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts ,
3046 mj_scalar_t global_weight,
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts,
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3058 mj_scalar_t coord_range = max_coord - min_coord;
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr <<
"MJ does not support non uniform part weights beyond the first partition" << std::endl;
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3081 mj_scalar_t cumulative = 0;
3083 if(bUniformPartsCheck) {
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t
>((*future_num_part_in_parts)[concurrent_current_part]);
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight *
static_cast<mj_scalar_t
>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (
int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3113 Kokkos::parallel_for(
"Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3121 current_target_part_weights(num_cuts) = global_weight;
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3154 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3155 typename mj_part_t,
typename mj_node_t>
3156 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1;
3193 mj_part_ids[iii] = 2 * pp;
3212 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3213 typename mj_part_t,
typename mj_node_t>
3214 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3224 this->temp_cut_coords = current_cut_coordinates;
3227 *reductionOp = NULL;
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(
size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3237 <mj_part_t, mj_scalar_t>(
3240 current_concurrent_num_parts);
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (
int dummy) {
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) =
false;
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3317 mj_current_dim_coords,
3321 this->mj_combine_rightleft_and_weights(
3323 current_concurrent_num_parts);
3326 if(!bSingleProcess) {
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce(
"Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(
int dummy,
size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3360 mj_part_t cut_shift = 0;
3364 size_t tlr_shift = 0;
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count(
"save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3393 this->total_part_weight_left_right_closests.size()));
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3400 local_global_total_part_weight_left_right_closests.size()));
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3420 this->is_cut_line_determined.size()));
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3431 local_process_cut_line_weight_to_put_left.size()));
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3456 local_cut_lower_bound_coordinates.size()));
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(
int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(
int dummy) {
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3536 template<
class scalar_t>
3542 KOKKOS_INLINE_FUNCTION
3545 KOKKOS_INLINE_FUNCTION
3554 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3556 template<
class policy_t,
class scalar_t,
class part_t>
3567 scalar_t mj_max_scalar,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3577 KOKKOS_INLINE_FUNCTION
3582 KOKKOS_INLINE_FUNCTION
3585 dst.
ptr[n] += src.
ptr[n];
3590 if(src.
ptr[n] > dst.
ptr[n]) {
3593 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3594 dst.
ptr[n+1] = src.
ptr[n+1];
3599 KOKKOS_INLINE_FUNCTION
3602 dst.
ptr[n] += src.
ptr[n];
3607 if(src.
ptr[n] > dst.
ptr[n]) {
3610 if(src.
ptr[n+1] < dst.
ptr[n+1]) {
3611 dst.
ptr[n+1] = src.
ptr[n+1];
3632 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
3633 class device_t,
class array_t>
3638 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3661 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3707 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3716 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result =
sizeof(array_t) *
3720 int result =
sizeof(array_t) *
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3732 KOKKOS_INLINE_FUNCTION
3733 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3760 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3778 teamMember.team_barrier();
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3811 index_t part =
parts(i)/2;
3822 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3825 threadSum.
ptr[part*2] += w;
3831 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832 array_t new_value = (array_t) coord;
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3837 prev_value, new_value);
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3843 prev_value, new_value);
3861 if(coord < b + sEpsilon && coord > b -
sEpsilon) {
3865 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3870 threadSum.
ptr[part*2+1] += w;
3875 parts(i) = part*2+1;
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3892 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3897 threadSum.
ptr[part*2+1] += w;
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3914 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3919 threadSum.
ptr[part*2+1] += w;
3944 if(part == lower + 1) {
3949 part -= (part - lower)/2;
3952 else if(part == upper - 1) {
3957 part += (upper - part)/2;
3961 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3964 }, arraySumReducer);
3967 teamMember.team_barrier();
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3972 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973 Kokkos::atomic_add(¤t_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3976 teamSum[n] += array.ptr[n];
3980 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 int insert_left = 0;
3982 int insert_right = 0;
3987 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 ¤t_right_closest(insert_right), prev_value, new_value);
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 ¤t_left_closest(insert_left), prev_value, new_value);
4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4015 teamMember.team_barrier();
4018 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019 KOKKOS_INLINE_FUNCTION
4027 if(src[n] > dst[n]) {
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4036 KOKKOS_INLINE_FUNCTION
4044 if(src[n] > dst[n]) {
4047 if(src[n+1] < dst[n+1]) {
4048 dst[n+1] = src[n+1];
4074 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4075 typename mj_part_t,
typename mj_node_t>
4076 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4077 mj_1D_part_get_part_weights(
4080 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4083 auto local_is_cut_line_determined = is_cut_line_determined;
4084 auto local_thread_part_weights = thread_part_weights;
4085 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4086 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4090 auto local_sEpsilon = this->
sEpsilon;
4091 auto local_assigned_part_ids = this->assigned_part_ids;
4092 auto local_coordinate_permutations = this->coordinate_permutations;
4093 auto local_mj_weights = this->mj_weights;
4095 auto local_global_min_max_coord_total_weight =
4096 this->global_min_max_coord_total_weight;
4098 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4100 auto local_device_num_partitioning_in_current_dim =
4101 device_num_partitioning_in_current_dim;
4103 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4104 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4106 mj_part_t total_part_shift = 0;
4108 mj_part_t concurrent_cut_shifts = 0;
4110 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4111 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4112 concurrent_cut_shifts, temp_cut_coords.size()));
4114 mj_part_t num_parts =
4116 mj_part_t
num_cuts = num_parts - 1;
4117 mj_part_t total_part_count = num_parts +
num_cuts;
4118 mj_part_t weight_array_length =
num_cuts + num_parts;
4121 mj_part_t right_left_array_length = (
num_cuts + 2) * 2;
4123 if(this->incomplete_cut_count(kk) == 0) {
4124 total_part_shift += total_part_count;
4130 auto policy_ReduceWeightsFunctor = policy_t(
4131 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4133 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4134 int total_array_length =
4135 weight_array_length + right_left_array_length;
4142 typedef mj_scalar_t array_t;
4144 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4145 array_t * reduce_array =
4146 new array_t[
static_cast<size_t>(total_array_length)];
4149 int offset_cuts = 0;
4150 for(
int kk2 = 0; kk2 < kk; ++kk2) {
4154 Kokkos::View<double *, device_t> my_current_part_weights =
4155 Kokkos::subview(local_thread_part_weights,
4156 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4157 total_part_shift + total_part_count));
4158 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4159 Kokkos::subview(local_thread_cut_left_closest_point,
4160 std::pair<mj_lno_t, mj_lno_t>(
4162 local_thread_cut_left_closest_point.size()));
4163 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4164 Kokkos::subview(local_thread_cut_right_closest_point,
4165 std::pair<mj_lno_t, mj_lno_t>(
4167 local_thread_cut_right_closest_point.size()));
4169 array_t
max_scalar = std::numeric_limits<array_t>::max();
4171 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4173 Kokkos::parallel_for(
4174 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4175 KOKKOS_LAMBDA (
int dummy) {
4176 for(
int n = 0; n < weight_array_length; ++n) {
4177 my_current_part_weights(n) = 0;
4179 for(
int n = 0; n <
num_cuts; ++n) {
4190 typename mj_node_t::device_type, array_t>
4198 right_left_array_length,
4199 weight_array_length,
4200 coordinate_permutations,
4201 mj_current_dim_coords,
4204 local_temp_cut_coords,
4206 mj_uniform_weights(0),
4208 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4209 ,my_current_part_weights,
4210 my_current_left_closest,
4211 my_current_right_closest
4215 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4216 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4218 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4219 teamFunctor, reduce_array);
4222 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4223 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4225 for(
int i = 0; i < static_cast<int>(total_part_count); ++i) {
4226 hostArray(i) = reduce_array[i];
4229 Kokkos::deep_copy(my_current_part_weights, hostArray);
4231 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4232 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
4233 for(mj_part_t cut = 0; cut <
num_cuts; ++cut) {
4234 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4235 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4237 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4238 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4240 delete [] reduce_array;
4243 total_part_shift += total_part_count;
4247 auto local_temp_cut_coords = temp_cut_coords;
4249 Kokkos::parallel_for(
4250 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4252 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4254 mj_part_t
num_cuts = num_parts - 1;
4255 mj_part_t total_part_count = num_parts +
num_cuts;
4257 if(local_device_incomplete_cut_count(kk) > 0) {
4261 size_t offset_cuts = 0;
4262 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4263 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4265 offset += num_parts_kk2 * 2 - 1;
4266 offset_cuts += num_parts_kk2 - 1;
4269 for(mj_part_t i = 1; i < total_part_count; ++i) {
4273 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4274 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4275 local_temp_cut_coords(offset_cuts + i /2 - 1))
4280 local_thread_part_weights(offset + i)
4281 = local_thread_part_weights(offset + i-2);
4286 local_thread_part_weights(offset + i) +=
4287 local_thread_part_weights(offset + i-1);
4300 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4301 typename mj_part_t,
typename mj_node_t>
4302 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4303 mj_combine_rightleft_and_weights(
4307 auto local_thread_part_weights = this->thread_part_weights;
4308 auto local_is_cut_line_determined = this->is_cut_line_determined;
4309 auto local_thread_cut_left_closest_point =
4310 this->thread_cut_left_closest_point;
4311 auto local_thread_cut_right_closest_point =
4312 this->thread_cut_right_closest_point;
4313 auto local_total_part_weight_left_right_closests =
4314 this->total_part_weight_left_right_closests;
4315 auto local_device_num_partitioning_in_current_dim =
4316 device_num_partitioning_in_current_dim;
4317 Kokkos::parallel_for(
4318 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4319 KOKKOS_LAMBDA (
int dummy) {
4321 size_t tlr_array_shift = 0;
4322 mj_part_t cut_shift = 0;
4323 size_t total_part_array_shift = 0;
4329 mj_part_t num_parts_in_part =
4331 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4332 size_t num_total_part_in_part =
4333 num_parts_in_part + size_t (num_cuts_in_part);
4336 for(
int ii = 0; ii < num_cuts_in_part; ++ii) {
4337 mj_part_t next = tlr_array_shift + ii;
4338 mj_part_t cut_index = cut_shift + ii;
4340 if(!local_is_cut_line_determined(cut_index)) {
4341 mj_scalar_t left_closest_in_process =
4342 local_thread_cut_left_closest_point(cut_index);
4343 mj_scalar_t right_closest_in_process =
4344 local_thread_cut_right_closest_point(cut_index);
4347 local_total_part_weight_left_right_closests(
4348 num_total_part_in_part + next) = left_closest_in_process;
4350 local_total_part_weight_left_right_closests(
4351 num_total_part_in_part + num_cuts_in_part + next) =
4352 right_closest_in_process;
4356 for(
size_t j = 0; j < num_total_part_in_part; ++j) {
4357 mj_part_t cut_ind = j / 2 + cut_shift;
4363 if(j == num_total_part_in_part - 1 ||
4364 !local_is_cut_line_determined(cut_ind)) {
4365 double pwj = local_thread_part_weights(total_part_array_shift + j);
4366 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4371 cut_shift += num_cuts_in_part;
4372 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4373 total_part_array_shift += num_total_part_in_part;
4390 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4391 typename mj_part_t,
typename mj_node_t>
4392 KOKKOS_INLINE_FUNCTION
4393 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4394 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4395 mj_scalar_t cut_lower_bound,
4396 mj_scalar_t cut_upper_weight,
4397 mj_scalar_t cut_lower_weight,
4398 mj_scalar_t expected_weight,
4399 mj_scalar_t &new_cut_position,
4402 if(std::abs(cut_upper_bound - cut_lower_bound) <
sEpsilon) {
4403 new_cut_position = cut_upper_bound;
4406 if(std::abs(cut_upper_weight - cut_lower_weight) <
sEpsilon) {
4407 new_cut_position = cut_lower_bound;
4410 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4411 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4412 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4414 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4415 int scale_constant = 20;
4416 int shiftint= int (required_shift * scale_constant);
4417 if(shiftint == 0) shiftint = 1;
4418 required_shift = mj_scalar_t (shiftint) / scale_constant;
4419 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4422 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4424 template<
class policy_t,
class scalar_t>
4434 int mj_value_count) :
4439 KOKKOS_INLINE_FUNCTION
4444 KOKKOS_INLINE_FUNCTION
4447 dst.
ptr[n] += src.
ptr[n];
4451 KOKKOS_INLINE_FUNCTION
4454 dst.
ptr[n] += src.
ptr[n];
4468 template<
class policy_t,
class scalar_t,
class part_t,
class index_t,
4469 class device_t,
class array_t>
4474 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4486 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4487 Kokkos::View<int *, device_t> local_point_counts;
4491 part_t mj_concurrent_current_part,
4492 part_t mj_weight_array_size,
4493 Kokkos::View<index_t*, device_t> & mj_permutations,
4494 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4495 Kokkos::View<part_t*, device_t> & mj_parts,
4496 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4497 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4498 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4499 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4508 track_on_cuts(mj_track_on_cuts)
4509 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4510 ,local_point_counts(mj_local_point_counts)
4516 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4519 int result =
sizeof(array_t) * (
value_count) * team_size;
4523 int remainder = result % 8;
4524 if(remainder != 0) {
4525 result += 8 - remainder;
4530 KOKKOS_INLINE_FUNCTION
4531 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4540 index_t num_working_points = all_end - all_begin;
4541 int num_teams = teamMember.league_size();
4543 index_t stride = num_working_points / num_teams;
4544 if((num_working_points % num_teams) > 0) {
4548 index_t begin = all_begin + stride * teamMember.league_rank();
4549 index_t end = begin + stride;
4554 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4557 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4558 size_t sh_mem_size =
sizeof(array_t) * (
value_count);
4560 size_t sh_mem_size =
4561 sizeof(array_t) * (
value_count) * teamMember.team_size();
4564 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4567 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4569 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4574 teamMember.team_barrier();
4576 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4586 Kokkos::parallel_reduce(
4587 Kokkos::TeamThreadRange(teamMember, begin, end),
4594 if(place % 2 == 0) {
4595 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596 Kokkos::atomic_add(&shared_ptr[part], 1);
4598 threadSum.
ptr[part] += 1;
4601 parts(coordinate_index) = part;
4606 index_t set_index = Kokkos::atomic_fetch_add(
4607 &track_on_cuts(track_on_cuts_insert_index), 1);
4608 track_on_cuts(set_index) = ii;
4610 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4616 teamMember.team_barrier();
4619 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4621 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4622 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4624 teamSum[n] += array.ptr[n];
4629 teamMember.team_barrier();
4632 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4634 KOKKOS_INLINE_FUNCTION
4641 KOKKOS_INLINE_FUNCTION
4671 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4672 typename mj_part_t,
typename mj_node_t>
4673 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4674 mj_create_new_partitions(
4675 mj_part_t num_parts,
4676 mj_part_t current_concurrent_work_part,
4677 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4678 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4679 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4680 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4683 auto local_thread_part_weight_work = this->thread_part_weight_work;
4684 auto local_point_counts = this->thread_point_counts;
4685 auto local_distribute_points_on_cut_lines =
4686 this->distribute_points_on_cut_lines;
4687 auto local_thread_cut_line_weight_to_put_left =
4688 this->thread_cut_line_weight_to_put_left;
4689 auto local_sEpsilon = this->
sEpsilon;
4690 auto local_coordinate_permutations = this->coordinate_permutations;
4691 auto local_mj_weights = this->mj_weights;
4692 auto local_assigned_part_ids = this->assigned_part_ids;
4693 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4695 mj_part_t
num_cuts = num_parts - 1;
4697 Kokkos::parallel_for(
4698 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4699 KOKKOS_LAMBDA(
int dummy) {
4701 if(local_distribute_points_on_cut_lines) {
4702 for(
int i = 0; i <
num_cuts; ++i) {
4703 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4704 if(left_weight > local_sEpsilon) {
4706 mj_scalar_t thread_ii_weight_on_cut =
4707 local_thread_part_weight_work(i * 2 + 1) -
4708 local_thread_part_weight_work(i * 2);
4710 if(thread_ii_weight_on_cut < left_weight) {
4712 local_thread_cut_line_weight_to_put_left(i) =
4713 thread_ii_weight_on_cut;
4717 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4719 left_weight -= thread_ii_weight_on_cut;
4722 local_thread_cut_line_weight_to_put_left(i) = 0;
4728 for(mj_part_t i =
num_cuts - 1; i > 0 ; --i) {
4729 if(std::abs(current_concurrent_cut_coordinate(i) -
4730 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4731 local_thread_cut_line_weight_to_put_left(i) -=
4732 local_thread_cut_line_weight_to_put_left(i - 1);
4734 local_thread_cut_line_weight_to_put_left(i) =
4735 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4736 least_signifiance) * significance_mul) /
4737 static_cast<mj_scalar_t
>(significance_mul);
4741 for(mj_part_t i = 0; i < num_parts; ++i) {
4742 local_point_counts(i) = 0;
4746 mj_lno_t coordinate_begin_index =
4747 current_concurrent_work_part == 0 ? 0 :
4748 host_part_xadj(current_concurrent_work_part - 1);
4749 mj_lno_t coordinate_end_index =
4750 host_part_xadj(current_concurrent_work_part);
4752 mj_lno_t total_on_cut;
4753 Kokkos::parallel_reduce(
"Get total_on_cut",
4754 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4755 coordinate_begin_index, coordinate_end_index),
4756 KOKKOS_LAMBDA(
int ii, mj_lno_t & val) {
4757 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4758 mj_part_t coordinate_assigned_place =
4759 local_assigned_part_ids(coordinate_index);
4760 if(coordinate_assigned_place % 2 == 1) {
4765 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4766 if(total_on_cut > 0) {
4767 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4778 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4781 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4783 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4784 typedef int array_t;
4788 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4789 array_t * reduce_array =
new array_t[
static_cast<size_t>(num_parts)];
4792 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4793 typename mj_node_t::device_type, array_t>teamFunctor(
4794 current_concurrent_work_part,
4796 coordinate_permutations,
4797 mj_current_dim_coords,
4801 #
if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4806 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4807 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4809 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4812 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4813 for(mj_part_t part = 0; part < num_parts; ++part) {
4814 local_point_counts(part) = reduce_array[part];
4816 delete [] reduce_array;
4821 if(track_on_cuts.size() > 0) {
4822 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4823 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1));
4824 Kokkos::sort(track_on_cuts_sort);
4828 Kokkos::parallel_for(
4829 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4830 KOKKOS_LAMBDA (
int dummy) {
4832 for(
int j = 0; j < total_on_cut; ++j) {
4833 int ii = track_on_cuts(j);
4834 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4836 local_mj_weights(coordinate_index,0);
4837 mj_part_t coordinate_assigned_place =
4838 local_assigned_part_ids(coordinate_index);
4839 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4841 if(local_distribute_points_on_cut_lines &&
4842 local_thread_cut_line_weight_to_put_left(
4843 coordinate_assigned_part) > local_sEpsilon) {
4847 local_thread_cut_line_weight_to_put_left(
4848 coordinate_assigned_part) -= coordinate_weight;
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4856 std::abs(current_concurrent_cut_coordinate(
4857 coordinate_assigned_part+1) -
4858 current_concurrent_cut_coordinate(
4859 coordinate_assigned_part)) < local_sEpsilon)
4861 local_thread_cut_line_weight_to_put_left(
4862 coordinate_assigned_part + 1) +=
4863 local_thread_cut_line_weight_to_put_left(
4864 coordinate_assigned_part);
4866 ++local_point_counts(coordinate_assigned_part);
4867 local_assigned_part_ids(coordinate_index) =
4868 coordinate_assigned_part;
4873 ++coordinate_assigned_part;
4876 while(local_distribute_points_on_cut_lines &&
4877 coordinate_assigned_part <
num_cuts)
4880 if(std::abs(current_concurrent_cut_coordinate(
4881 coordinate_assigned_part) -
4882 current_concurrent_cut_coordinate(
4883 coordinate_assigned_part - 1)) < local_sEpsilon)
4886 if(local_thread_cut_line_weight_to_put_left(
4887 coordinate_assigned_part) > local_sEpsilon &&
4888 local_thread_cut_line_weight_to_put_left(
4889 coordinate_assigned_part) >=
4890 std::abs(local_thread_cut_line_weight_to_put_left(
4891 coordinate_assigned_part) - coordinate_weight))
4893 local_thread_cut_line_weight_to_put_left(
4894 coordinate_assigned_part) -= coordinate_weight;
4898 if(local_thread_cut_line_weight_to_put_left(
4899 coordinate_assigned_part) < 0 &&
4900 coordinate_assigned_part <
num_cuts - 1 &&
4901 std::abs(current_concurrent_cut_coordinate(
4902 coordinate_assigned_part+1) -
4903 current_concurrent_cut_coordinate(
4904 coordinate_assigned_part)) < local_sEpsilon)
4906 local_thread_cut_line_weight_to_put_left(
4907 coordinate_assigned_part + 1) +=
4908 local_thread_cut_line_weight_to_put_left(
4909 coordinate_assigned_part);
4917 ++coordinate_assigned_part;
4919 local_point_counts(coordinate_assigned_part) += 1;
4920 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4924 for(
int j = 0; j < num_parts; ++j) {
4925 out_part_xadj(j) = local_point_counts(j);
4926 local_point_counts(j) = 0;
4929 out_part_xadj(j) += out_part_xadj(j - 1);
4930 local_point_counts(j) += out_part_xadj(j - 1);
4938 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4943 Kokkos::parallel_for(
4944 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4945 coordinate_begin_index, coordinate_end_index),
4946 KOKKOS_LAMBDA (mj_lno_t ii) {
4947 mj_lno_t i = local_coordinate_permutations(ii);
4948 mj_part_t p = local_assigned_part_ids(i);
4949 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4950 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4955 #ifdef KOKKOS_ENABLE_OPENMP
4957 const int num_threads = 1;
4959 const int num_threads = 1;
4962 const int num_teams = 1;
4965 Kokkos::View<mj_lno_t*, device_t>
4966 point_counter(
"insert indices", num_teams * num_threads * num_parts);
4970 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4971 block_policy(num_teams, num_threads);
4972 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4974 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4975 mj_lno_t block_size = range / num_teams + 1;
4976 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(
member_type team_member) {
4977 int team = team_member.league_rank();
4978 int team_offset = team * num_threads * num_parts;
4979 mj_lno_t begin = coordinate_begin_index + team * block_size;
4980 mj_lno_t end = begin + block_size;
4981 if(end > coordinate_end_index) {
4982 end = coordinate_end_index;
4985 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4987 int thread = team_member.team_rank();
4988 mj_lno_t i = local_coordinate_permutations(ii);
4989 mj_part_t p = local_assigned_part_ids(i);
4990 int index = team_offset + thread * num_parts + p;
4991 ++point_counter(index);
4999 Kokkos::parallel_for(
5000 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5001 KOKKOS_LAMBDA (
int dummy) {
5002 int num_sets = point_counter.size() / num_parts;
5003 for(
int set = num_sets - 1; set >= 1; set -=1) {
5004 int base = set * num_parts;
5005 for(
int part = 0; part < num_parts; ++part) {
5006 point_counter(base + part) = point_counter(base + part - num_parts);
5010 for(
int part = 0; part < num_parts; ++part) {
5011 point_counter(part) = 0;
5014 for(
int set = 1; set < num_sets; ++set) {
5015 int base = set * num_parts;
5016 for(
int part = 0; part < num_parts; ++part) {
5017 point_counter(base + part) += point_counter(base + part - num_parts);
5023 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(
member_type team_member) {
5024 int team = team_member.league_rank();
5025 int team_offset = team * num_threads * num_parts;
5026 mj_lno_t begin = coordinate_begin_index + team * block_size;
5027 mj_lno_t end = begin + block_size;
5028 if(end > coordinate_end_index) {
5029 end = coordinate_end_index;
5031 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
5033 int thread = team_member.team_rank();
5034 mj_lno_t i = local_coordinate_permutations(ii);
5035 mj_part_t p = local_assigned_part_ids(i);
5036 int index = team_offset + thread * num_parts + p;
5037 int set_counter = (point_counter(index)++) + local_point_counts(p);
5038 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5087 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5088 typename mj_part_t,
typename mj_node_t>
5089 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5090 mj_node_t>::mj_get_new_cut_coordinates(
5094 const double &used_imbalance_tolerance,
5095 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5096 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5097 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5098 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5099 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5100 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5101 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5102 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5103 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5104 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5105 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5106 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5107 Kokkos::View<mj_scalar_t *, device_t> &
5108 current_part_cut_line_weight_to_put_left,
5109 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5111 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5113 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5115 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5116 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5117 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5118 auto local_global_min_max_coord_total_weight =
5119 global_min_max_coord_total_weight;
5121 const auto _sEpsilon = this->
sEpsilon;
5129 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5130 policy_one_team(1, Kokkos::AUTO());
5131 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5133 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(
member_type team_member) {
5135 mj_scalar_t min_coordinate =
5136 local_global_min_max_coord_total_weight(kk);
5137 mj_scalar_t max_coordinate =
5138 local_global_min_max_coord_total_weight(
5140 mj_scalar_t global_total_weight =
5141 local_global_min_max_coord_total_weight(
5144 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member,
num_cuts),
5149 current_global_left_closest_points(i) > local_sEpsilon) {
5150 current_global_left_closest_points(i) =
5151 current_cut_coordinates(i);
5153 if(current_global_right_closest_points(i) -
5154 max_coordinate > local_sEpsilon) {
5155 current_global_right_closest_points(i) =
5156 current_cut_coordinates(i);
5159 team_member.team_barrier();
5161 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member,
num_cuts),
5163 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5166 mj_scalar_t seen_weight_in_part = 0;
5168 mj_scalar_t expected_weight_in_part = 0;
5170 double imbalance_on_left = 0, imbalance_on_right = 0;
5171 if(local_distribute_points_on_cut_lines) {
5173 local_global_rectilinear_cut_weight(i) = 0;
5174 local_process_rectilinear_cut_weight(i) = 0;
5176 bool bContinue =
false;
5179 if(current_cut_line_determined(i)) {
5180 new_current_cut_coordinates(i) =
5181 current_cut_coordinates(i);
5186 seen_weight_in_part = current_global_part_weights(i * 2);
5189 expected_weight_in_part = current_part_target_weights(i);
5192 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5193 expected_weight_in_part);
5196 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5197 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5198 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5199 used_imbalance_tolerance < local_sEpsilon ;
5200 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5201 used_imbalance_tolerance < local_sEpsilon;
5203 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5204 current_cut_line_determined(i) =
true;
5205 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5206 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5208 else if(imbalance_on_left < 0) {
5210 if(local_distribute_points_on_cut_lines) {
5215 if(current_global_part_weights(i * 2 + 1) ==
5216 expected_weight_in_part) {
5218 current_cut_line_determined(i) =
true;
5219 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5222 new_current_cut_coordinates(i) =
5223 current_cut_coordinates(i);
5225 current_part_cut_line_weight_to_put_left(i) =
5226 current_local_part_weights(i * 2 + 1) -
5227 current_local_part_weights(i * 2);
5230 else if(current_global_part_weights(i * 2 + 1) >
5231 expected_weight_in_part) {
5234 current_cut_line_determined(i) =
true;
5235 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5239 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5240 new_current_cut_coordinates(i) =
5241 current_cut_coordinates(i);
5242 local_process_rectilinear_cut_weight[i] =
5243 current_local_part_weights(i * 2 + 1) -
5244 current_local_part_weights(i * 2);
5253 current_cut_lower_bounds(i) =
5254 current_global_right_closest_points(i);
5257 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5262 for(mj_part_t ii = i + 1; ii <
num_cuts ; ++ii) {
5263 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5264 mj_scalar_t line_weight =
5265 current_global_part_weights(ii * 2 + 1);
5266 if(p_weight >= expected_weight_in_part) {
5271 if(p_weight == expected_weight_in_part) {
5272 current_cut_upper_bounds(i) =
5273 current_cut_coordinates(ii);
5274 current_cut_upper_weights(i) = p_weight;
5275 current_cut_lower_bounds(i) =
5276 current_cut_coordinates(ii);
5277 current_cut_lower_bound_weights(i) = p_weight;
5278 }
else if(p_weight < current_cut_upper_weights(i)) {
5281 current_cut_upper_bounds(i) =
5282 current_global_left_closest_points(ii);
5283 current_cut_upper_weights(i) = p_weight;
5289 if(line_weight >= expected_weight_in_part) {
5293 current_cut_upper_bounds(i) =
5294 current_cut_coordinates(ii);
5295 current_cut_upper_weights(i) = line_weight;
5296 current_cut_lower_bounds(i) =
5297 current_cut_coordinates(ii);
5298 current_cut_lower_bound_weights(i) = p_weight;
5303 if(p_weight <= expected_weight_in_part && p_weight >=
5304 current_cut_lower_bound_weights(i)) {
5305 current_cut_lower_bounds(i) =
5306 current_global_right_closest_points(ii);
5307 current_cut_lower_bound_weights(i) = p_weight;
5311 mj_scalar_t new_cut_position = 0;
5312 algMJ_t::mj_calculate_new_cut_position(
5313 current_cut_upper_bounds(i),
5314 current_cut_lower_bounds(i),
5315 current_cut_upper_weights(i),
5316 current_cut_lower_bound_weights(i),
5317 expected_weight_in_part, new_cut_position,
5322 if(std::abs(current_cut_coordinates(i) -
5323 new_cut_position) < local_sEpsilon) {
5324 current_cut_line_determined(i) =
true;
5325 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5328 new_current_cut_coordinates(i) =
5329 current_cut_coordinates(i);
5331 new_current_cut_coordinates(i) = new_cut_position;
5337 current_cut_upper_bounds(i) =
5338 current_global_left_closest_points(i);
5339 current_cut_upper_weights(i) =
5340 seen_weight_in_part;
5343 for(
int ii = i - 1; ii >= 0; --ii) {
5344 mj_scalar_t p_weight =
5345 current_global_part_weights(ii * 2);
5346 mj_scalar_t line_weight =
5347 current_global_part_weights(ii * 2 + 1);
5348 if(p_weight <= expected_weight_in_part) {
5349 if(p_weight == expected_weight_in_part) {
5352 current_cut_upper_bounds(i) =
5353 current_cut_coordinates(ii);
5354 current_cut_upper_weights(i) = p_weight;
5355 current_cut_lower_bounds(i) =
5356 current_cut_coordinates(ii);
5357 current_cut_lower_bound_weights(i) = p_weight;
5359 else if(p_weight > current_cut_lower_bound_weights(i)) {
5362 current_cut_lower_bounds(i) =
5363 current_global_right_closest_points(ii);
5364 current_cut_lower_bound_weights(i) = p_weight;
5370 if(line_weight > expected_weight_in_part) {
5371 current_cut_upper_bounds(i) =
5372 current_global_right_closest_points(ii);
5373 current_cut_upper_weights(i) = line_weight;
5383 if(p_weight >= expected_weight_in_part &&
5384 (p_weight < current_cut_upper_weights(i) ||
5385 (p_weight == current_cut_upper_weights(i) &&
5386 current_cut_upper_bounds(i) >
5387 current_global_left_closest_points(ii)))) {
5388 current_cut_upper_bounds(i) =
5389 current_global_left_closest_points(ii);
5390 current_cut_upper_weights(i) = p_weight;
5393 mj_scalar_t new_cut_position = 0;
5394 algMJ_t::mj_calculate_new_cut_position(
5395 current_cut_upper_bounds(i),
5396 current_cut_lower_bounds(i),
5397 current_cut_upper_weights(i),
5398 current_cut_lower_bound_weights(i),
5399 expected_weight_in_part,
5404 if(std::abs(current_cut_coordinates(i) -
5405 new_cut_position) < local_sEpsilon) {
5406 current_cut_line_determined(i) =
true;
5407 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5409 new_current_cut_coordinates(i) =
5410 current_cut_coordinates(i);
5412 new_current_cut_coordinates(i) =
5419 team_member.team_barrier();
5423 mj_part_t rectilinear_cut_count;
5424 Kokkos::parallel_reduce(
"Read bDoingWork",
5425 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5426 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
5427 set_single = view_rectilinear_cut_count(0);
5428 }, rectilinear_cut_count);
5430 if(rectilinear_cut_count > 0) {
5431 auto host_local_process_rectilinear_cut_weight =
5432 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5433 local_process_rectilinear_cut_weight);
5434 auto host_local_global_rectilinear_cut_weight =
5435 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5436 local_global_rectilinear_cut_weight);
5437 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5438 local_process_rectilinear_cut_weight);
5439 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5440 local_global_rectilinear_cut_weight);
5441 Teuchos::scan<int,mj_scalar_t>(
5442 *comm, Teuchos::REDUCE_SUM,
5444 host_local_process_rectilinear_cut_weight.data(),
5445 host_local_global_rectilinear_cut_weight.data());
5446 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5447 host_local_process_rectilinear_cut_weight);
5448 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5449 host_local_global_rectilinear_cut_weight);
5451 Kokkos::parallel_for(
"finish up mj_get_new_cut_coordinates",
5452 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5453 KOKKOS_LAMBDA(
int dummy) {
5454 for(mj_part_t i = 0; i <
num_cuts; ++i) {
5456 if(local_global_rectilinear_cut_weight(i) > 0) {
5458 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5460 mj_scalar_t necessary_weight_on_line_for_left =
5461 expected_part_weight - current_global_part_weights(i * 2);
5464 mj_scalar_t my_weight_on_line =
5465 local_process_rectilinear_cut_weight(i);
5469 mj_scalar_t weight_on_line_upto_process_inclusive =
5470 local_global_rectilinear_cut_weight(i);
5474 mj_scalar_t space_to_put_left =
5475 necessary_weight_on_line_for_left -
5476 weight_on_line_upto_process_inclusive;
5479 mj_scalar_t space_left_to_me =
5480 space_to_put_left + my_weight_on_line;
5493 if(space_left_to_me < 0) {
5496 current_part_cut_line_weight_to_put_left(i) = 0;
5498 else if(space_left_to_me >= my_weight_on_line) {
5502 current_part_cut_line_weight_to_put_left(i) =
5509 current_part_cut_line_weight_to_put_left(i) =
5516 view_rectilinear_cut_count(0) = 0;
5520 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5532 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5533 typename mj_part_t,
typename mj_node_t>
5534 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5535 get_processor_num_points_in_parts(
5536 mj_part_t num_procs,
5537 mj_part_t num_parts,
5538 mj_gno_t *&num_points_in_all_processor_parts)
5541 size_t allocation_size = num_parts * (num_procs + 1);
5548 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5549 new mj_gno_t[allocation_size];
5553 mj_gno_t *my_local_points_to_reduce_sum =
5554 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5558 mj_gno_t *my_local_point_counts_in_each_part =
5559 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5562 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5563 sizeof(mj_gno_t)*allocation_size);
5565 auto local_new_part_xadj = this->new_part_xadj;
5566 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5567 Kokkos::ViewAllocateWithoutInitializing(
"points per part"), num_parts);
5568 Kokkos::parallel_for(
"get vals on device",
5569 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5570 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5571 points_per_part(i) =
5572 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5574 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5575 Kokkos::deep_copy(host_points_per_part, points_per_part);
5576 for(
int i = 0; i < num_parts; ++i) {
5577 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5582 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5583 sizeof(mj_gno_t) * (num_parts) );
5590 reduceAll<int, mj_gno_t>(
5592 Teuchos::REDUCE_SUM,
5594 num_local_points_in_each_part_to_reduce_sum,
5595 num_points_in_all_processor_parts);
5599 delete [] num_local_points_in_each_part_to_reduce_sum;
5617 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5618 typename mj_part_t,
typename mj_node_t>
5619 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5620 mj_check_to_migrate(
5621 size_t migration_reduce_all_population,
5622 mj_lno_t num_coords_for_last_dim_part,
5623 mj_part_t num_procs,
5624 mj_part_t num_parts,
5625 mj_gno_t *num_points_in_all_processor_parts)
5628 if(migration_reduce_all_population > future_reduceall_cutoff) {
5633 if(num_coords_for_last_dim_part < min_work_last_dim) {
5638 if(this->check_migrate_avoid_migration_option == 0) {
5639 double global_imbalance = 0;
5641 size_t global_shift = num_procs * num_parts;
5643 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5644 for(mj_part_t i = 0; i < num_parts; ++i) {
5645 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5646 / double(num_procs);
5648 global_imbalance += std::abs(ideal_num -
5649 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5652 global_imbalance /= num_parts;
5653 global_imbalance /= num_procs;
5655 if(global_imbalance <= this->minimum_migration_imbalance) {
5681 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5682 typename mj_part_t,
typename mj_node_t>
5683 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5684 assign_send_destinations(
5685 mj_part_t num_parts,
5686 mj_part_t *part_assignment_proc_begin_indices,
5687 mj_part_t *processor_chains_in_parts,
5688 mj_lno_t *send_count_to_each_proc,
5689 int *coordinate_destinations) {
5691 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5692 deep_copy(host_new_part_xadj, this->new_part_xadj);
5694 auto host_new_coordinate_permutations =
5695 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5696 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5698 for(mj_part_t p = 0; p < num_parts; ++p) {
5699 mj_lno_t part_begin = 0;
5700 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5701 mj_lno_t part_end = host_new_part_xadj(p);
5703 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5705 mj_lno_t num_total_send = 0;
5706 for(mj_lno_t j=part_begin; j < part_end; j++) {
5707 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5708 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5712 part_assignment_proc_begin_indices[p] =
5713 processor_chains_in_parts[proc_to_sent];
5715 processor_chains_in_parts[proc_to_sent] = -1;
5717 proc_to_sent = part_assignment_proc_begin_indices[p];
5720 coordinate_destinations[local_ind] = proc_to_sent;
5746 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5747 typename mj_part_t,
typename mj_node_t>
5748 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5749 mj_assign_proc_to_parts(
5750 mj_gno_t * num_points_in_all_processor_parts,
5751 mj_part_t num_parts,
5752 mj_part_t num_procs,
5753 mj_lno_t *send_count_to_each_proc,
5754 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5755 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5756 mj_part_t &out_part_index,
5757 mj_part_t &output_part_numbering_begin_index,
5758 int * coordinate_destinations) {
5759 mj_gno_t *global_num_points_in_parts =
5760 num_points_in_all_processor_parts + num_procs * num_parts;
5761 mj_part_t *num_procs_assigned_to_each_part =
new mj_part_t[num_parts];
5764 bool did_i_find_my_group =
false;
5766 mj_part_t num_free_procs = num_procs;
5767 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5769 double max_imbalance_difference = 0;
5770 mj_part_t max_differing_part = 0;
5773 for(mj_part_t i = 0; i < num_parts; i++) {
5776 double scalar_required_proc = num_procs *
5777 (double (global_num_points_in_parts[i]) /
5778 double (this->num_global_coords));
5781 mj_part_t required_proc =
5782 static_cast<mj_part_t
> (0.5 + scalar_required_proc);
5783 if(required_proc == 0) required_proc = 1;
5789 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5790 required_proc = num_free_procs -
5791 (minimum_num_procs_required_for_rest_of_parts);
5795 num_free_procs -= required_proc;
5799 --minimum_num_procs_required_for_rest_of_parts;
5802 num_procs_assigned_to_each_part[i] = required_proc;
5807 double imbalance_wrt_ideal =
5808 (scalar_required_proc - required_proc) / required_proc;
5809 if(imbalance_wrt_ideal > max_imbalance_difference) {
5810 max_imbalance_difference = imbalance_wrt_ideal;
5811 max_differing_part = i;
5817 if(num_free_procs > 0) {
5818 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5825 mj_part_t *part_assignment_proc_begin_indices =
new mj_part_t[num_parts];
5829 mj_part_t *processor_chains_in_parts =
new mj_part_t [num_procs];
5830 mj_part_t *processor_part_assignments =
new mj_part_t[num_procs];
5839 for(
int i = 0; i < num_procs; ++i ) {
5840 processor_part_assignments[i] = -1;
5841 processor_chains_in_parts[i] = -1;
5843 for(
int i = 0; i < num_parts; ++i ) {
5844 part_assignment_proc_begin_indices[i] = -1;
5850 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5851 sort_item_num_part_points_in_procs =
5852 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5854 for(mj_part_t i = 0; i < num_parts; ++i) {
5859 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5860 sort_item_num_part_points_in_procs[ii].id = ii;
5863 if(processor_part_assignments[ii] == -1) {
5864 sort_item_num_part_points_in_procs[ii].val =
5865 num_points_in_all_processor_parts[ii * num_parts + i];
5867 sort_item_num_part_points_in_procs[ii].signbit = 1;
5880 sort_item_num_part_points_in_procs[ii].val =
5881 num_points_in_all_processor_parts[ii * num_parts + i];
5882 sort_item_num_part_points_in_procs[ii].signbit = 0;
5887 uqSignsort<mj_part_t, mj_gno_t,char>
5888 (num_procs, sort_item_num_part_points_in_procs);
5900 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5901 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5902 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5903 ceil(total_num_points_in_part /
double (required_proc_count)));
5906 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5907 mj_part_t next_proc_to_send_id =
5908 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5909 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5910 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5914 for(mj_part_t ii = num_procs - 1;
5915 ii >= num_procs - required_proc_count; --ii) {
5916 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5918 processor_part_assignments[proc_id] = i;
5921 bool did_change_sign =
false;
5923 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5926 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5927 did_change_sign =
true;
5928 sort_item_num_part_points_in_procs[ii].signbit = 1;
5935 if(did_change_sign) {
5938 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5939 sort_item_num_part_points_in_procs);
5954 if(!did_i_find_my_group) {
5955 for(mj_part_t ii = num_procs - 1; ii >=
5956 num_procs - required_proc_count; --ii) {
5958 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5961 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5963 if(proc_id_to_assign == this->myRank) {
5965 did_i_find_my_group =
true;
5968 part_assignment_proc_begin_indices[i] = this->myRank;
5969 processor_chains_in_parts[this->myRank] = -1;
5973 send_count_to_each_proc[this->myRank] =
5974 sort_item_num_part_points_in_procs[ii].val;
5978 for(mj_part_t in = 0; in < i; ++in) {
5979 output_part_numbering_begin_index +=
5980 (*next_future_num_parts_in_parts)[in];
5988 if(!did_i_find_my_group) {
5989 processor_ranks_for_subcomm.clear();
5997 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5998 mj_part_t nonassigned_proc_id =
5999 sort_item_num_part_points_in_procs[ii].id;
6000 mj_lno_t num_points_to_sent =
6001 sort_item_num_part_points_in_procs[ii].val;
6007 if(num_points_to_sent < 0) {
6008 cout <<
"Migration - processor assignments - for part:" << i
6009 <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:"
6010 << num_points_to_sent << std::endl;
6015 switch (migration_type) {
6019 while (num_points_to_sent > 0) {
6021 if(num_points_to_sent <= space_left_in_sent_proc) {
6023 space_left_in_sent_proc -= num_points_to_sent;
6025 if(this->myRank == nonassigned_proc_id) {
6027 send_count_to_each_proc[next_proc_to_send_id] =
6032 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6033 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6034 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6036 num_points_to_sent = 0;
6040 if(space_left_in_sent_proc > 0) {
6041 num_points_to_sent -= space_left_in_sent_proc;
6044 if(this->myRank == nonassigned_proc_id) {
6046 send_count_to_each_proc[next_proc_to_send_id] =
6047 space_left_in_sent_proc;
6048 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6049 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6050 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6054 ++next_proc_to_send_index;
6057 if(next_part_to_send_index < nprocs - required_proc_count ) {
6058 cout <<
"Migration - processor assignments - for part:"
6060 <<
" next_part_to_send :" << next_part_to_send_index
6061 <<
" nprocs:" << nprocs
6062 <<
" required_proc_count:" << required_proc_count
6063 <<
" Error: next_part_to_send_index <" <<
6064 <<
" nprocs - required_proc_count" << std::endl;
6069 next_proc_to_send_id =
6070 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6072 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6073 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6084 if(this->myRank == nonassigned_proc_id) {
6086 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6090 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6091 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6092 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6094 num_points_to_sent = 0;
6095 ++next_proc_to_send_index;
6099 if(next_proc_to_send_index == num_procs) {
6100 next_proc_to_send_index = num_procs - required_proc_count;
6103 next_proc_to_send_id =
6104 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6106 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6107 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6120 this->assign_send_destinations(
6122 part_assignment_proc_begin_indices,
6123 processor_chains_in_parts,
6124 send_count_to_each_proc,
6125 coordinate_destinations);
6126 delete [] part_assignment_proc_begin_indices;
6127 delete [] processor_chains_in_parts;
6128 delete [] processor_part_assignments;
6129 delete [] sort_item_num_part_points_in_procs;
6130 delete [] num_procs_assigned_to_each_part;
6148 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6149 typename mj_part_t,
typename mj_node_t>
6150 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6151 assign_send_destinations2(
6152 mj_part_t num_parts,
6153 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6154 int *coordinate_destinations,
6155 mj_part_t &output_part_numbering_begin_index,
6156 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6158 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6159 mj_part_t previous_processor = -1;
6161 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6162 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6164 auto local_new_coordinate_permutations =
6165 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6166 Kokkos::deep_copy(local_new_coordinate_permutations,
6167 this->new_coordinate_permutations);
6169 for(mj_part_t i = 0; i < num_parts; ++i) {
6170 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6173 mj_lno_t part_begin_index = 0;
6176 part_begin_index = local_new_part_xadj(p - 1);
6179 mj_lno_t part_end_index = local_new_part_xadj(p);
6181 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6182 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6183 output_part_numbering_begin_index = part_shift_amount;
6185 previous_processor = assigned_proc;
6186 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6188 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6189 mj_lno_t localInd = local_new_coordinate_permutations(j);
6190 coordinate_destinations[localInd] = assigned_proc;
6216 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6217 typename mj_part_t,
typename mj_node_t>
6218 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6219 mj_assign_parts_to_procs(
6220 mj_gno_t * num_points_in_all_processor_parts,
6221 mj_part_t num_parts,
6222 mj_part_t num_procs,
6223 mj_lno_t *send_count_to_each_proc,
6224 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6225 mj_part_t &out_num_part,
6226 std::vector<mj_part_t> &out_part_indices,
6227 mj_part_t &output_part_numbering_begin_index,
6228 int *coordinate_destinations) {
6231 mj_gno_t *global_num_points_in_parts =
6232 num_points_in_all_processor_parts + num_procs * num_parts;
6233 out_part_indices.clear();
6237 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6238 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6239 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6240 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6244 mj_lno_t work_each =
6245 mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
6249 mj_lno_t *space_in_each_processor =
new mj_lno_t[num_procs];
6252 for(mj_part_t i = 0; i < num_procs; ++i) {
6253 space_in_each_processor[i] = work_each;
6260 mj_part_t *num_parts_proc_assigned =
new mj_part_t[num_procs];
6261 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
6262 int empty_proc_count = num_procs;
6266 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6267 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6272 for(mj_part_t i = 0; i < num_parts; ++i) {
6273 sort_item_point_counts_in_parts[i].id = i;
6274 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6278 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6283 for(mj_part_t j = 0; j < num_parts; ++j) {
6285 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6288 mj_gno_t load = global_num_points_in_parts[i];
6291 mj_part_t assigned_proc = -1;
6294 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6295 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6300 if(empty_proc_count < num_parts - j ||
6301 num_parts_proc_assigned[ii] == 0) {
6303 sort_item_num_points_of_proc_in_part_i[ii].val =
6304 num_points_in_all_processor_parts[ii * num_parts + i];
6307 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6311 uqsort<mj_part_t, mj_gno_t>(num_procs,
6312 sort_item_num_points_of_proc_in_part_i);
6315 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6316 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6317 if(assigned_proc == -1 ||
6318 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6321 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6322 if(ii < assigned_proc) {
6338 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6342 space_in_each_processor[assigned_proc] -= load;
6344 sort_item_part_to_proc_assignment[j].id = i;
6347 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6350 if(assigned_proc == this->myRank) {
6352 out_part_indices.push_back(i);
6358 send_count_to_each_proc[assigned_proc] +=
6359 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6362 delete [] num_parts_proc_assigned;
6363 delete [] sort_item_num_points_of_proc_in_part_i;
6364 delete [] sort_item_point_counts_in_parts;
6365 delete [] space_in_each_processor;
6368 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6371 this->assign_send_destinations2(
6373 sort_item_part_to_proc_assignment,
6374 coordinate_destinations,
6375 output_part_numbering_begin_index,
6376 next_future_num_parts_in_parts);
6378 delete [] sort_item_part_to_proc_assignment;
6405 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6406 typename mj_part_t,
typename mj_node_t>
6407 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6408 mj_migration_part_proc_assignment(
6409 mj_gno_t * num_points_in_all_processor_parts,
6410 mj_part_t num_parts,
6411 mj_part_t num_procs,
6412 mj_lno_t *send_count_to_each_proc,
6413 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6414 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6415 mj_part_t &out_num_part,
6416 std::vector<mj_part_t> &out_part_indices,
6417 mj_part_t &output_part_numbering_begin_index,
6418 int *coordinate_destinations)
6420 processor_ranks_for_subcomm.clear();
6422 if(num_procs > num_parts) {
6427 mj_part_t out_part_index = 0;
6429 this->mj_assign_proc_to_parts(
6430 num_points_in_all_processor_parts,
6433 send_count_to_each_proc,
6434 processor_ranks_for_subcomm,
6435 next_future_num_parts_in_parts,
6437 output_part_numbering_begin_index,
6438 coordinate_destinations
6442 out_part_indices.clear();
6443 out_part_indices.push_back(out_part_index);
6450 processor_ranks_for_subcomm.push_back(this->myRank);
6455 this->mj_assign_parts_to_procs(
6456 num_points_in_all_processor_parts,
6459 send_count_to_each_proc,
6460 next_future_num_parts_in_parts,
6463 output_part_numbering_begin_index,
6464 coordinate_destinations);
6481 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6482 typename mj_part_t,
typename mj_node_t>
6483 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6485 mj_part_t num_procs,
6486 mj_lno_t &num_new_local_points,
6487 std::string iteration,
6488 int *coordinate_destinations,
6489 mj_part_t num_parts)
6492 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6493 if(
sizeof(mj_lno_t) <=
sizeof(
int)) {
6497 ZOLTAN_COMM_OBJ *plan = NULL;
6498 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6499 int num_incoming_gnos = 0;
6500 int message_tag = 7859;
6503 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6504 int ierr = Zoltan_Comm_Create(
6506 int(this->num_local_coords),
6507 coordinate_destinations,
6510 &num_incoming_gnos);
6514 mj_timer_base_string +
"Migration Z1PlanCreating-" + iteration);
6517 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6523 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6524 Kokkos::HostSpace(), this->current_mj_gnos);
6525 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6526 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6527 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), num_incoming_gnos);
6528 auto host_dst_gnos = Kokkos::create_mirror_view(
6529 Kokkos::HostSpace(), dst_gnos);
6531 ierr = Zoltan_Comm_Do(
6534 (
char *) host_current_mj_gnos.data(),
6536 (
char *) host_dst_gnos.data());
6538 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6539 this->current_mj_gnos = dst_gnos;
6545 auto host_src_coordinates = Kokkos::create_mirror_view(
6546 Kokkos::HostSpace(), this->mj_coordinates);
6547 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6548 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6549 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
6550 num_incoming_gnos, this->coord_dim);
6551 auto host_dst_coordinates = Kokkos::create_mirror_view(
6552 Kokkos::HostSpace(), dst_coordinates);
6553 for(
int i = 0; i < this->coord_dim; ++i) {
6554 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6555 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6556 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6557 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6560 ierr = Zoltan_Comm_Do(
6563 (
char *) sub_host_src_coordinates.data(),
6564 sizeof(mj_scalar_t),
6565 (
char *) sub_host_dst_coordinates.data());
6568 deep_copy(dst_coordinates, host_dst_coordinates);
6569 this->mj_coordinates = dst_coordinates;
6574 auto host_src_weights = Kokkos::create_mirror_view(
6575 Kokkos::HostSpace(), this->mj_weights);
6576 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6577 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6578 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
6579 num_incoming_gnos, this->num_weights_per_coord);
6580 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6581 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6582 auto sub_host_src_weights
6583 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6584 auto sub_host_dst_weights
6585 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6586 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6588 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6589 sent_weight[n] = sub_host_src_weights(n);
6591 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6593 ierr = Zoltan_Comm_Do(
6596 (
char *) sent_weight.getRawPtr(),
6597 sizeof(mj_scalar_t),
6598 (
char *) received_weight.getRawPtr());
6601 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6602 sub_host_dst_weights(n) = received_weight[n];
6605 deep_copy(dst_weights, host_dst_weights);
6606 this->mj_weights = dst_weights;
6612 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6613 Kokkos::ViewAllocateWithoutInitializing(
"owner_of_coordinate"),
6616 ierr = Zoltan_Comm_Do(
6619 (
char *) owner_of_coordinate.data(),
6621 (
char *) dst_owners_of_coordinate.data());
6623 this->owner_of_coordinate = dst_owners_of_coordinate;
6630 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6631 Kokkos::HostSpace(), this->assigned_part_ids);
6632 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6633 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6634 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
6636 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6637 Kokkos::HostSpace(), dst_assigned_part_ids);
6638 mj_part_t *new_parts =
new mj_part_t[num_incoming_gnos];
6639 if(num_procs < num_parts) {
6641 ierr = Zoltan_Comm_Do(
6644 (
char *) host_src_assigned_part_ids.data(),
6646 (
char *) host_dst_assigned_part_ids.data());
6648 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6652 this->assigned_part_ids = dst_assigned_part_ids;
6655 ierr = Zoltan_Comm_Destroy(&plan);
6657 num_new_local_points = num_incoming_gnos;
6659 mj_timer_base_string +
"Migration Z1Migration-" + iteration);
6664 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6665 "Migration DistributorPlanCreating-" + iteration);
6667 Tpetra::Distributor distributor(this->comm);
6668 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6669 this->num_local_coords);
6670 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6671 this->mj_env->timerStop(
MACRO_TIMERS, mj_timer_base_string +
6672 "Migration DistributorPlanCreating-" + iteration);
6673 this->mj_env->timerStart(
MACRO_TIMERS, mj_timer_base_string +
6674 "Migration DistributorMigration-" + iteration);
6681 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
6682 auto src_host_current_mj_gnos =
6683 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
6684 Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
6685 ArrayView<mj_gno_t> sent_gnos(
6686 src_host_current_mj_gnos.data(), this->num_local_coords);
6687 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
6688 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6689 Kokkos::ViewAllocateWithoutInitializing(
"gids"), num_incoming_gnos);
6690 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6691 this->current_mj_gnos);
6692 memcpy(host_current_mj_gnos.data(),
6693 received_gnos.getRawPtr(), num_incoming_gnos *
sizeof(mj_gno_t));
6694 Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
6699 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6700 dst_coordinates(
"mj_coordinates", num_incoming_gnos, this->coord_dim);
6701 auto host_dst_coordinates = Kokkos::create_mirror_view(dst_coordinates);
6702 auto host_src_coordinates = Kokkos::create_mirror_view(
6703 Kokkos::HostSpace(), this->mj_coordinates);
6704 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6705 for(
int i = 0; i < this->coord_dim; ++i) {
6706 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6707 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6708 auto sub_host_dst_coordinates
6709 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6722 ArrayRCP<mj_scalar_t> sent_coord(this->num_local_coords);
6723 for(
int n = 0; n < this->num_local_coords; ++n) {
6724 sent_coord[n] = sub_host_src_coordinates[n];
6727 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
6728 distributor.doPostsAndWaits<mj_scalar_t>(
6729 sent_coord(), 1, received_coord());
6730 memcpy(sub_host_dst_coordinates.data(),
6731 received_coord.getRawPtr(), num_incoming_gnos *
sizeof(mj_scalar_t));
6733 deep_copy(dst_coordinates, host_dst_coordinates);
6734 this->mj_coordinates = dst_coordinates;
6737 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6738 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6739 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6740 auto host_src_weights = Kokkos::create_mirror_view(
6741 Kokkos::HostSpace(), this->mj_weights);
6742 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6743 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
6744 auto sub_host_src_weights
6745 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6746 auto sub_host_dst_weights
6747 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6748 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6754 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6755 sent_weight[n] = sub_host_src_weights(n);
6757 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6758 distributor.doPostsAndWaits<mj_scalar_t>(
6759 sent_weight(), 1, received_weight());
6762 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6763 sub_host_dst_weights(n) = received_weight[n];
6766 Kokkos::deep_copy(dst_weights, host_dst_weights);
6767 this->mj_weights = dst_weights;
6772 ArrayView<int> sent_owners(
6773 owner_of_coordinate.data(), this->num_local_coords);
6774 ArrayRCP<int> received_owners(num_incoming_gnos);
6775 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
6776 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>
6777 (
"owner_of_coordinate", num_incoming_gnos);
6778 memcpy(this->owner_of_coordinate.data(),
6779 received_owners.getRawPtr(), num_incoming_gnos *
sizeof(
int));
6785 if(num_procs < num_parts) {
6786 auto src_host_assigned_part_ids =
6787 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
6788 Kokkos::deep_copy(src_host_assigned_part_ids, assigned_part_ids);
6789 ArrayView<mj_part_t> sent_partids(
6790 src_host_assigned_part_ids.data(), this->num_local_coords);
6791 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
6792 distributor.doPostsAndWaits<mj_part_t>(
6793 sent_partids, 1, received_partids());
6794 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6795 (
"assigned_part_ids", num_incoming_gnos);
6796 auto host_assigned_part_ids = Kokkos::create_mirror_view(
6797 this->assigned_part_ids);
6799 host_assigned_part_ids.data(),
6800 received_partids.getRawPtr(),
6801 num_incoming_gnos *
sizeof(mj_part_t));
6802 Kokkos::deep_copy(this->assigned_part_ids, host_assigned_part_ids);
6805 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6806 (
"assigned_part_ids", num_incoming_gnos);
6808 this->mj_env->timerStop(
MACRO_TIMERS,
"" + mj_timer_base_string +
6809 "Migration DistributorMigration-" + iteration);
6811 num_new_local_points = num_incoming_gnos;
6820 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6821 typename mj_part_t,
typename mj_node_t>
6822 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6823 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6825 mj_part_t group_size = processor_ranks_for_subcomm.size();
6826 mj_part_t *ids =
new mj_part_t[group_size];
6827 for(mj_part_t i = 0; i < group_size; ++i) {
6828 ids[i] = processor_ranks_for_subcomm[i];
6830 ArrayView<const mj_part_t> idView(ids, group_size);
6831 this->comm = this->comm->createSubcommunicator(idView);
6840 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6841 typename mj_part_t,
typename mj_node_t>
6842 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6843 fill_permutation_array(
6844 mj_part_t output_num_parts,
6845 mj_part_t num_parts)
6848 if(output_num_parts == 1) {
6849 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6850 Kokkos::parallel_for(
6851 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6852 (0, this->num_local_coords),
6853 KOKKOS_LAMBDA(mj_lno_t i) {
6854 local_new_coordinate_permutations(i) = i;
6856 auto local_new_part_xadj = this->new_part_xadj;
6857 auto local_num_local_coords = this->num_local_coords;
6858 Kokkos::parallel_for(
6859 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6860 KOKKOS_LAMBDA(
int dummy) {
6861 local_new_part_xadj(0) = local_num_local_coords;
6865 auto local_num_local_coords = this->num_local_coords;
6866 auto local_assigned_part_ids = this->assigned_part_ids;
6867 auto local_new_part_xadj = this->new_part_xadj;
6868 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6871 Kokkos::View<mj_part_t*, device_t> part_shifts(
"part_shifts", num_parts);
6876 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6877 "num_points_in_parts", num_parts);
6879 Kokkos::parallel_for(
6880 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6881 KOKKOS_LAMBDA(
int dummy) {
6883 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6884 mj_part_t ii = local_assigned_part_ids(i);
6885 ++num_points_in_parts(ii);
6890 mj_lno_t prev_index = 0;
6891 for(mj_part_t i = 0; i < num_parts; ++i) {
6892 if(num_points_in_parts(i) > 0) {
6893 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6894 prev_index += num_points_in_parts(i);
6895 part_shifts(i) = p++;
6900 mj_part_t assigned_num_parts = p - 1;
6901 for(;p < num_parts; ++p) {
6902 local_new_part_xadj(p) =
6903 local_new_part_xadj(assigned_num_parts);
6905 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6906 num_points_in_parts(i) = local_new_part_xadj(i);
6912 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6914 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6915 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6945 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
6946 typename mj_part_t,
typename mj_node_t>
6947 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6948 mj_perform_migration(
6949 mj_part_t input_num_parts,
6950 mj_part_t &output_num_parts,
6951 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6952 mj_part_t &output_part_begin_index,
6953 size_t migration_reduce_all_population,
6954 mj_lno_t num_coords_for_last_dim_part,
6955 std::string iteration,
6956 RCP<mj_partBoxVector_t> &input_part_boxes,
6957 RCP<mj_partBoxVector_t> &output_part_boxes)
6959 mj_part_t num_procs = this->comm->getSize();
6960 this->myRank = this->comm->getRank();
6965 mj_gno_t *num_points_in_all_processor_parts =
6966 new mj_gno_t[input_num_parts * (num_procs + 1)];
6969 this->get_processor_num_points_in_parts(
6972 num_points_in_all_processor_parts);
6975 if(!this->mj_check_to_migrate(
6976 migration_reduce_all_population,
6977 num_coords_for_last_dim_part,
6980 num_points_in_all_processor_parts)) {
6981 delete [] num_points_in_all_processor_parts;
6985 mj_lno_t *send_count_to_each_proc = NULL;
6986 int *coordinate_destinations =
new int[this->num_local_coords];
6987 send_count_to_each_proc =
new mj_lno_t[num_procs];
6989 for(
int i = 0; i < num_procs; ++i) {
6990 send_count_to_each_proc[i] = 0;
6993 std::vector<mj_part_t> processor_ranks_for_subcomm;
6994 std::vector<mj_part_t> out_part_indices;
6997 this->mj_migration_part_proc_assignment(
6998 num_points_in_all_processor_parts,
7001 send_count_to_each_proc,
7002 processor_ranks_for_subcomm,
7003 next_future_num_parts_in_parts,
7006 output_part_begin_index,
7007 coordinate_destinations);
7009 delete [] send_count_to_each_proc;
7010 std::vector <mj_part_t> tmpv;
7012 std::sort (out_part_indices.begin(), out_part_indices.end());
7013 mj_part_t outP = out_part_indices.size();
7014 mj_gno_t new_global_num_points = 0;
7015 mj_gno_t *global_num_points_in_parts =
7016 num_points_in_all_processor_parts + num_procs * input_num_parts;
7018 if(this->mj_keep_part_boxes) {
7019 input_part_boxes->clear();
7024 for(mj_part_t i = 0; i < outP; ++i) {
7025 mj_part_t ind = out_part_indices[i];
7026 new_global_num_points += global_num_points_in_parts[ind];
7027 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7028 if(this->mj_keep_part_boxes) {
7029 input_part_boxes->push_back((*output_part_boxes)[ind]);
7034 if(this->mj_keep_part_boxes) {
7035 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7036 input_part_boxes = output_part_boxes;
7037 output_part_boxes = tmpPartBoxes;
7039 next_future_num_parts_in_parts->clear();
7040 for(mj_part_t i = 0; i < outP; ++i) {
7041 mj_part_t p = tmpv[i];
7042 next_future_num_parts_in_parts->push_back(p);
7045 delete [] num_points_in_all_processor_parts;
7047 mj_lno_t num_new_local_points = 0;
7049 this->mj_migrate_coords(
7051 num_new_local_points,
7053 coordinate_destinations,
7056 delete [] coordinate_destinations;
7057 if(this->num_local_coords != num_new_local_points) {
7058 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7059 (Kokkos::ViewAllocateWithoutInitializing(
"new_coordinate_permutations"),
7060 num_new_local_points);
7061 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7062 (Kokkos::ViewAllocateWithoutInitializing(
"coordinate_permutations"),
7063 num_new_local_points);
7065 this->num_local_coords = num_new_local_points;
7066 this->num_global_coords = new_global_num_points;
7069 this->create_sub_communicator(processor_ranks_for_subcomm);
7071 processor_ranks_for_subcomm.clear();
7074 this->fill_permutation_array(output_num_parts, input_num_parts);
7097 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7098 typename mj_part_t,
typename mj_node_t>
7099 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7100 create_consistent_chunks(
7101 mj_part_t num_parts,
7102 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7103 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7104 mj_lno_t coordinate_begin,
7105 mj_lno_t coordinate_end,
7106 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7107 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7109 bool longest_dim_part,
7110 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7120 mj_part_t no_cuts = num_parts - 1;
7124 if(this->distribute_points_on_cut_lines) {
7125 auto local_thread_cut_line_weight_to_put_left =
7126 this->thread_cut_line_weight_to_put_left;
7127 auto local_thread_part_weight_work =
7128 this->thread_part_weight_work;
7129 auto local_sEpsilon = this->
sEpsilon;
7131 Kokkos::parallel_for(
7132 Kokkos::RangePolicy<
typename mj_node_t::execution_space,
7133 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7135 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7136 if(left_weight > local_sEpsilon) {
7138 mj_scalar_t thread_ii_weight_on_cut =
7139 local_thread_part_weight_work(i * 2 + 1) -
7140 local_thread_part_weight_work(i * 2);
7141 if(thread_ii_weight_on_cut < left_weight) {
7142 local_thread_cut_line_weight_to_put_left(i) =
7143 thread_ii_weight_on_cut;
7146 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7150 local_thread_cut_line_weight_to_put_left(i) = 0;
7155 auto local_least_signifiance = least_signifiance;
7156 auto local_significance_mul = significance_mul;
7157 Kokkos::parallel_for(
7158 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7159 (0, 1), KOKKOS_LAMBDA (
int dummy) {
7163 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7164 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7165 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7166 mj_scalar_t delta = cut2 - cut1;
7167 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7168 if(abs_delta < local_sEpsilon) {
7169 local_thread_cut_line_weight_to_put_left(i) -=
7170 local_thread_cut_line_weight_to_put_left(i - 1);
7172 local_thread_cut_line_weight_to_put_left(i) =
7173 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7174 local_least_signifiance) * local_significance_mul) /
7175 static_cast<mj_scalar_t
>(local_significance_mul);
7181 auto local_thread_point_counts = this->thread_point_counts;
7182 Kokkos::parallel_for(
7183 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7184 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7185 local_thread_point_counts(i) = 0;
7196 mj_part_t *cut_map =
new mj_part_t[no_cuts];
7198 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7199 typedef std::vector< multiSItem > multiSVector;
7200 typedef std::vector<multiSVector> multiS2Vector;
7203 std::vector<mj_scalar_t *>allocated_memory;
7206 multiS2Vector sort_vector_points_on_cut;
7209 mj_part_t different_cut_count = 1;
7215 multiSVector tmpMultiSVector;
7216 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7218 auto local_current_concurrent_cut_coordinate =
7219 current_concurrent_cut_coordinate;
7220 auto host_current_concurrent_cut_coordinate =
7221 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7222 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7223 local_current_concurrent_cut_coordinate);
7225 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7228 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7229 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7230 cut_map[i] = cut_map[i-1];
7233 cut_map[i] = different_cut_count++;
7234 multiSVector tmp2MultiSVector;
7235 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7238 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7239 host_current_concurrent_cut_coordinate);
7242 auto host_coordinate_permutations =
7243 Kokkos::create_mirror_view(coordinate_permutations);
7244 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7246 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7247 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7249 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7250 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7252 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7253 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7255 auto local_coord_dim = this->coord_dim;
7257 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7258 mj_lno_t i = host_coordinate_permutations(ii);
7259 mj_part_t pp = host_assigned_part_ids(i);
7260 mj_part_t p = pp / 2;
7263 mj_scalar_t *vals =
new mj_scalar_t[local_coord_dim -1];
7264 allocated_memory.push_back(vals);
7269 if(longest_dim_part) {
7271 for(
int dim = local_coord_dim - 2; dim >= 0; --dim) {
7274 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7279 host_mj_coordinates(i,next_largest_coord_dim);
7283 for(
int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7284 vals[val_ind++] = host_mj_coordinates(i,dim);
7286 for(
int dim = 0; dim < coordInd; ++dim) {
7287 vals[val_ind++] = host_mj_coordinates(i,dim);
7291 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7293 mj_part_t cmap = cut_map[p];
7294 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7298 ++host_thread_point_counts(p);
7299 host_assigned_part_ids(i) = p;
7304 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7305 std::sort (sort_vector_points_on_cut[i].begin(),
7306 sort_vector_points_on_cut[i].end());
7309 mj_part_t previous_cut_map = cut_map[0];
7311 auto host_thread_cut_line_weight_to_put_left =
7312 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7313 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7314 thread_cut_line_weight_to_put_left);
7316 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7317 Kokkos::deep_copy(host_mj_weights, mj_weights);
7327 mj_scalar_t weight_stolen_from_previous_part = 0;
7328 for(mj_part_t p = 0; p < no_cuts; ++p) {
7329 mj_part_t mapped_cut = cut_map[p];
7333 if(previous_cut_map != mapped_cut) {
7334 mj_lno_t sort_vector_end = (mj_lno_t)
7335 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7336 for(; sort_vector_end >= 0; --sort_vector_end) {
7338 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7339 mj_lno_t i = t.index;
7340 ++host_thread_point_counts(p);
7341 host_assigned_part_ids(i) = p;
7343 sort_vector_points_on_cut[previous_cut_map].clear();
7347 mj_lno_t sort_vector_end = (mj_lno_t)
7348 sort_vector_points_on_cut[mapped_cut].size() - 1;
7354 for(; sort_vector_end >= 0; --sort_vector_end) {
7357 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7359 mj_lno_t i = t.index;
7360 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7361 this->mj_weights(i,0);
7363 if(host_thread_cut_line_weight_to_put_left(p) +
7364 weight_stolen_from_previous_part> this->sEpsilon &&
7365 host_thread_cut_line_weight_to_put_left(p) +
7366 weight_stolen_from_previous_part -
7367 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7368 weight_stolen_from_previous_part - w)> this->sEpsilon)
7370 host_thread_cut_line_weight_to_put_left(p) -= w;
7372 sort_vector_points_on_cut[mapped_cut].pop_back();
7374 ++host_thread_point_counts(p);
7375 host_assigned_part_ids(i) = p;
7379 if(p < no_cuts - 1 &&
7380 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7381 if(mapped_cut == cut_map[p + 1] ) {
7384 if(previous_cut_map != mapped_cut) {
7385 weight_stolen_from_previous_part =
7386 host_thread_cut_line_weight_to_put_left(p);
7391 weight_stolen_from_previous_part +=
7392 host_thread_cut_line_weight_to_put_left(p);
7396 weight_stolen_from_previous_part =
7397 -host_thread_cut_line_weight_to_put_left(p);
7406 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7407 if(previous_cut_map != mapped_cut) {
7408 weight_stolen_from_previous_part =
7409 host_thread_cut_line_weight_to_put_left(p);
7412 weight_stolen_from_previous_part +=
7413 host_thread_cut_line_weight_to_put_left(p);
7417 weight_stolen_from_previous_part =
7418 -host_thread_cut_line_weight_to_put_left(p);
7424 previous_cut_map = mapped_cut;
7429 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7430 previous_cut_map].size() - 1;
7436 for(; sort_vector_end >= 0; --sort_vector_end) {
7438 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7441 mj_lno_t i = t.index;
7442 ++host_thread_point_counts(no_cuts);
7443 host_assigned_part_ids(i) = no_cuts;
7446 sort_vector_points_on_cut[previous_cut_map].clear();
7450 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7451 for(mj_lno_t i = 0; i < vSize; ++i) {
7452 delete [] allocated_memory[i];
7455 auto local_out_part_xadj = out_part_xadj;
7456 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7457 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7460 for(mj_part_t j = 0; j < num_parts; ++j) {
7461 host_out_part_xadj(j) = host_thread_point_counts(j);
7462 host_thread_point_counts(j) = 0;
7466 for(mj_part_t j = 1; j < num_parts; ++j) {
7467 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7472 for(mj_part_t j = 1; j < num_parts; ++j) {
7473 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7476 auto host_new_coordinate_permutations =
7477 Kokkos::create_mirror_view(new_coordinate_permutations);
7478 Kokkos::deep_copy(host_new_coordinate_permutations,
7479 new_coordinate_permutations);
7483 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7484 mj_lno_t i = host_coordinate_permutations(ii);
7485 mj_part_t p = host_assigned_part_ids(i);
7486 host_new_coordinate_permutations(coordinate_begin +
7487 host_thread_point_counts(p)++) = i;
7490 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7491 Kokkos::deep_copy(new_coordinate_permutations,
7492 host_new_coordinate_permutations);
7493 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7505 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7506 typename mj_part_t,
typename mj_node_t>
7507 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7509 mj_part_t current_num_parts,
7510 mj_part_t output_part_begin_index,
7511 RCP<mj_partBoxVector_t> &output_part_boxes,
7512 bool is_data_ever_migrated)
7515 mj_timer_base_string +
"Part_Assignment");
7518 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7519 auto local_coordinate_permutations = coordinate_permutations;
7520 auto local_assigned_part_ids = assigned_part_ids;
7522 if(local_mj_keep_part_boxes) {
7523 for(
int i = 0; i < current_num_parts; ++i) {
7524 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7528 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7529 current_num_parts, Kokkos::AUTO());
7530 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7532 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(
member_type team_member) {
7533 int i = team_member.league_rank();
7534 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7535 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7537 mj_lno_t k = local_coordinate_permutations(ii);
7538 local_assigned_part_ids(k) = i + output_part_begin_index;
7542 if(is_data_ever_migrated) {
7543 #ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7544 if(
sizeof(mj_lno_t) <=
sizeof(int)) {
7551 ZOLTAN_COMM_OBJ *plan = NULL;
7552 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7555 int message_tag = 7856;
7558 mj_timer_base_string +
"Final Z1PlanCreating");
7561 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
7562 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7566 mj_timer_base_string +
"Final Z1PlanCreating" );
7569 mj_timer_base_string +
"Final Z1PlanComm");
7574 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7575 Kokkos::HostSpace(), this->current_mj_gnos);
7576 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7577 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7578 Kokkos::ViewAllocateWithoutInitializing(
"dst_gnos"), incoming);
7579 auto host_dst_gnos = Kokkos::create_mirror_view(
7580 Kokkos::HostSpace(), dst_gnos);
7582 ierr = Zoltan_Comm_Do( plan, message_tag,
7583 (
char *) host_current_mj_gnos.data(),
7584 sizeof(mj_gno_t), (
char *) host_dst_gnos.data());
7586 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7587 this->current_mj_gnos = dst_gnos;
7590 auto host_src_part_ids = Kokkos::create_mirror_view(
7591 Kokkos::HostSpace(), this->assigned_part_ids);
7592 deep_copy(host_src_part_ids, this->assigned_part_ids);
7593 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7594 Kokkos::ViewAllocateWithoutInitializing(
"dst_part_ids"), incoming);
7595 auto host_dst_part_ids = Kokkos::create_mirror_view(
7596 Kokkos::HostSpace(), dst_part_ids);
7598 ierr = Zoltan_Comm_Do( plan, message_tag,
7599 (
char *) host_src_part_ids.data(),
7600 sizeof(mj_part_t), (
char *) host_dst_part_ids.data());
7602 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7603 this->assigned_part_ids = dst_part_ids;
7605 ierr = Zoltan_Comm_Destroy(&plan);
7608 this->num_local_coords = incoming;
7611 mj_timer_base_string +
"Final Z1PlanComm");
7618 mj_timer_base_string +
"Final DistributorPlanCreating");
7619 Tpetra::Distributor distributor(this->mj_problemComm);
7620 ArrayView<const mj_part_t> owners_of_coords(
7621 this->owner_of_coordinate.data(), this->num_local_coords);
7622 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7624 mj_timer_base_string +
"Final DistributorPlanCreating" );
7627 mj_timer_base_string +
"Final DistributorPlanComm");
7632 auto src_host_current_mj_gnos =
7633 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->current_mj_gnos);
7634 Kokkos::deep_copy(src_host_current_mj_gnos, this->current_mj_gnos);
7635 ArrayRCP<mj_gno_t> received_gnos(incoming);
7636 ArrayView<mj_gno_t> sent_gnos(src_host_current_mj_gnos.data(),
7637 this->num_local_coords);
7638 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
7639 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7640 Kokkos::ViewAllocateWithoutInitializing(
"current_mj_gnos"), incoming);
7641 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7642 this->current_mj_gnos);
7643 memcpy(host_current_mj_gnos.data(),
7644 received_gnos.getRawPtr(), incoming *
sizeof(mj_gno_t));
7645 Kokkos::deep_copy(this->current_mj_gnos, host_current_mj_gnos);
7648 auto src_host_assigned_part_ids =
7649 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->assigned_part_ids);
7650 Kokkos::deep_copy(src_host_assigned_part_ids, this->assigned_part_ids);
7651 ArrayView<mj_part_t> sent_partids(src_host_assigned_part_ids.data(),
7652 this->num_local_coords);
7653 ArrayRCP<mj_part_t> received_partids(incoming);
7654 distributor.doPostsAndWaits<mj_part_t>(
7655 sent_partids, 1, received_partids());
7656 this->assigned_part_ids =
7657 Kokkos::View<mj_part_t*, device_t>(
7658 Kokkos::ViewAllocateWithoutInitializing(
"assigned_part_ids"),
7660 auto host_assigned_part_ids = Kokkos::create_mirror_view(
7661 this->assigned_part_ids);
7662 memcpy( host_assigned_part_ids.data(),
7663 received_partids.getRawPtr(), incoming *
sizeof(mj_part_t));
7664 deep_copy(this->assigned_part_ids, host_assigned_part_ids);
7665 this->num_local_coords = incoming;
7668 mj_timer_base_string +
"Final DistributorPlanComm");
7673 mj_timer_base_string +
"Part_Assignment");
7676 mj_timer_base_string +
"Solution_Part_Assignment");
7681 if(this->mj_keep_part_boxes) {
7682 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7686 mj_timer_base_string +
"Solution_Part_Assignment");
7701 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7702 typename mj_part_t,
typename mj_node_t>
7705 bool distribute_points_on_cut_lines_,
7706 int max_concurrent_part_calculation_,
7707 int check_migrate_avoid_migration_option_,
7708 double minimum_migration_imbalance_,
7709 int migration_type_)
7711 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7712 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7713 this->check_migrate_avoid_migration_option =
7714 check_migrate_avoid_migration_option_;
7715 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7716 this->migration_type = migration_type_;
7746 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7747 typename mj_part_t,
typename mj_node_t>
7750 const RCP<const Environment> &env,
7751 RCP<
const Comm<int> > &problemComm,
7752 double imbalance_tolerance_,
7754 size_t num_global_parts_,
7755 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7756 int recursion_depth_,
7758 mj_lno_t num_local_coords_,
7759 mj_gno_t num_global_coords_,
7760 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7762 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7763 int num_weights_per_coord_,
7764 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7765 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7766 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7767 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7768 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7773 this->mj_timer_base_string =
"MJ(" + std::to_string(execute_counter) +
") - ";
7776 this->mj_problemComm = problemComm;
7777 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7779 mj_timer_base_string +
"Total");
7780 this->mj_env->debug(3,
"In MultiJagged Jagged");
7781 this->imbalance_tolerance = imbalance_tolerance_;
7782 this->mj_num_teams = num_teams_;
7783 this->num_global_parts = num_global_parts_;
7784 this->part_no_array = part_no_array_;
7785 this->recursion_depth = recursion_depth_;
7786 this->coord_dim = coord_dim_;
7787 this->num_local_coords = num_local_coords_;
7788 this->num_global_coords = num_global_coords_;
7789 this->mj_coordinates = mj_coordinates_;
7790 this->initial_mj_gnos = initial_mj_gnos_;
7791 this->num_weights_per_coord = num_weights_per_coord_;
7792 this->mj_uniform_weights = mj_uniform_weights_;
7793 this->mj_weights = mj_weights_;
7794 this->mj_uniform_parts = mj_uniform_parts_;
7798 this->set_part_specifications();
7801 mj_timer_base_string +
"Allocate Views");
7802 this->allocate_set_work_memory();
7804 mj_timer_base_string +
"Allocate Views");
7808 this->comm = this->mj_problemComm->duplicate();
7811 if(comm->getRank() == 0) {
7812 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
7813 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
7814 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
7819 mj_part_t current_num_parts = 1;
7820 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7821 this->all_cut_coordinates;
7823 mj_timer_base_string +
"Problem_Partitioning");
7824 mj_part_t output_part_begin_index = 0;
7825 mj_part_t future_num_parts = this->total_num_part;
7826 bool is_data_ever_migrated =
false;
7828 std::vector<mj_part_t> *future_num_part_in_parts =
7829 new std::vector<mj_part_t> ();
7830 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7831 new std::vector<mj_part_t> ();
7833 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7835 RCP<mj_partBoxVector_t> input_part_boxes;
7836 RCP<mj_partBoxVector_t> output_part_boxes;
7838 if(this->mj_keep_part_boxes) {
7839 input_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7840 output_part_boxes = RCP<mj_partBoxVector_t>(
new mj_partBoxVector_t(),
true);
7841 compute_global_box();
7842 this->init_part_boxes(output_part_boxes);
7849 Kokkos::View<mj_part_t*, device_t>
7850 view_rectilinear_cut_count(
"view_rectilinear_cut_count", 1);
7851 Kokkos::View<size_t*, device_t>
7852 view_total_reduction_size(
"view_total_reduction_size", 1);
7854 for(
int i = 0; i < this->recursion_depth; ++i) {
7857 std::string istring = std::to_string(i);
7863 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7864 future_num_part_in_parts = next_future_num_parts_in_parts;
7865 next_future_num_parts_in_parts = tmpPartVect;
7869 next_future_num_parts_in_parts->clear();
7870 if(this->mj_keep_part_boxes) {
7871 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7872 input_part_boxes = output_part_boxes;
7873 output_part_boxes = tmpPartBoxes;
7874 output_part_boxes->clear();
7878 mj_part_t output_part_count_in_dimension =
7879 this->update_part_num_arrays(
7880 future_num_part_in_parts,
7881 next_future_num_parts_in_parts,
7886 output_part_boxes, 1);
7891 if(output_part_count_in_dimension == current_num_parts) {
7893 tmpPartVect= future_num_part_in_parts;
7894 future_num_part_in_parts = next_future_num_parts_in_parts;
7895 next_future_num_parts_in_parts = tmpPartVect;
7897 if(this->mj_keep_part_boxes) {
7898 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7899 input_part_boxes = output_part_boxes;
7900 output_part_boxes = tmpPartBoxes;
7906 int coordInd = i % this->coord_dim;
7908 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7909 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7912 mj_timer_base_string +
"Problem_Partitioning_" + istring);
7916 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7917 "new part xadj", output_part_count_in_dimension);
7920 mj_part_t output_part_index = 0;
7924 mj_part_t output_coordinate_end_index = 0;
7929 this->max_concurrent_part_calculation);
7931 mj_part_t obtained_part_index = 0;
7933 auto host_process_local_min_max_coord_total_weight =
7934 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7935 auto host_global_min_max_coord_total_weight =
7936 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7944 this->max_concurrent_part_calculation);
7947 auto local_device_num_partitioning_in_current_dim =
7948 device_num_partitioning_in_current_dim;
7949 Kokkos::parallel_reduce(
"Read bDoingWork",
7950 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7951 KOKKOS_LAMBDA(
int dummy,
int & set_single) {
7954 if(local_device_num_partitioning_in_current_dim(
7961 bool bDoingWork = (bDoingWork_int != 0) ?
true :
false;
7963 this->mj_get_local_min_max_coord_totW(
7966 mj_current_dim_coords);
7971 this->mj_get_global_min_max_coord_totW(
7973 this->process_local_min_max_coord_total_weight,
7974 this->global_min_max_coord_total_weight);
7978 mj_part_t total_incomplete_cut_count = 0;
7983 mj_part_t concurrent_part_cut_shift = 0;
7984 mj_part_t concurrent_part_part_shift = 0;
7988 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7989 global_min_max_coord_total_weight);
7991 mj_scalar_t min_coordinate =
7992 host_global_min_max_coord_total_weight(kk);
7993 mj_scalar_t max_coordinate =
7994 host_global_min_max_coord_total_weight(
7997 mj_scalar_t global_total_weight =
7998 host_global_min_max_coord_total_weight(
8003 mj_part_t partition_count = host_num_partitioning_in_current_dim(
8004 concurrent_current_part_index);
8006 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
8007 Kokkos::subview(current_cut_coordinates,
8008 std::pair<mj_lno_t, mj_lno_t>(
8009 concurrent_part_cut_shift, current_cut_coordinates.size()));
8010 Kokkos::View<mj_scalar_t *, device_t>
8011 current_target_part_weights =
8012 Kokkos::subview(target_part_weights,
8013 std::pair<mj_lno_t, mj_lno_t>(
8014 concurrent_part_part_shift, target_part_weights.size()));
8017 concurrent_part_cut_shift += partition_count - 1;
8019 concurrent_part_part_shift += partition_count;
8023 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8027 total_incomplete_cut_count += partition_count - 1;
8029 this->incomplete_cut_count(kk) = partition_count - 1;
8032 this->mj_get_initial_cut_coords_target_weights(
8035 partition_count - 1,
8036 global_total_weight,
8038 current_target_part_weights,
8039 future_num_part_in_parts,
8040 next_future_num_parts_in_parts,
8041 concurrent_current_part_index,
8042 obtained_part_index);
8044 mj_lno_t coordinate_end_index =
8045 host_part_xadj(concurrent_current_part_index);
8046 mj_lno_t coordinate_begin_index =
8047 concurrent_current_part_index==0 ? 0 :
8048 host_part_xadj(concurrent_current_part_index - 1);
8050 this->set_initial_coordinate_parts(
8053 coordinate_begin_index, coordinate_end_index,
8054 this->coordinate_permutations,
8055 mj_current_dim_coords,
8056 this->assigned_part_ids,
8062 this->incomplete_cut_count(kk) = 0;
8065 obtained_part_index += partition_count;
8070 double used_imbalance = 0;
8073 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8076 mj_current_dim_coords,
8080 current_cut_coordinates,
8081 total_incomplete_cut_count,
8082 view_rectilinear_cut_count,
8083 view_total_reduction_size);
8086 mj_timer_base_string +
"Problem_Partitioning Get Part Weights");
8091 mj_part_t output_array_shift = 0;
8092 mj_part_t cut_shift = 0;
8093 size_t tlr_shift = 0;
8094 size_t partweight_array_shift = 0;
8099 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8100 current_concurrent_work_part);
8103 int coordinateA_bigger_than_coordinateB =
8104 host_global_min_max_coord_total_weight(kk) >
8105 host_global_min_max_coord_total_weight(
8108 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8111 auto local_new_part_xadj = this->new_part_xadj;
8112 Kokkos::parallel_for(
8113 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8114 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8115 local_new_part_xadj(
8116 output_part_index + output_array_shift + jj) = 0;
8119 cut_shift += num_parts - 1;
8120 tlr_shift += (4 *(num_parts - 1) + 1);
8121 output_array_shift += num_parts;
8122 partweight_array_shift += (2 * (num_parts - 1) + 1);
8126 Kokkos::View<mj_scalar_t *, device_t>
8127 current_concurrent_cut_coordinate =
8128 Kokkos::subview(current_cut_coordinates,
8129 std::pair<mj_lno_t, mj_lno_t>(
8131 current_cut_coordinates.size()));
8132 Kokkos::View<mj_scalar_t *, device_t>
8133 used_local_cut_line_weight_to_left =
8134 Kokkos::subview(process_cut_line_weight_to_put_left,
8135 std::pair<mj_lno_t, mj_lno_t>(
8137 process_cut_line_weight_to_put_left.size()));
8139 this->thread_part_weight_work =
8141 this->thread_part_weights,
8142 std::pair<mj_lno_t, mj_lno_t>(
8143 partweight_array_shift,
8144 this->thread_part_weights.extent(0)));
8147 if(this->mj_keep_part_boxes) {
8149 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8150 mj_scalar_t temp_get_val;
8151 Kokkos::parallel_reduce(
"Read single",
8152 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8153 KOKKOS_LAMBDA(
int dummy, mj_scalar_t & set_single) {
8154 set_single = current_concurrent_cut_coordinate(j);
8156 (*output_part_boxes)
8157 [output_array_shift + output_part_index + j].
8158 updateMinMax(temp_get_val, 1 , coordInd);
8159 (*output_part_boxes)
8160 [output_array_shift + output_part_index + j + 1].
8161 updateMinMax(temp_get_val, 0 , coordInd);
8166 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8167 Kokkos::subview(this->new_part_xadj,
8168 std::pair<mj_lno_t, mj_lno_t>(
8169 output_part_index + output_array_shift,
8170 this->new_part_xadj.size()));
8172 this->mj_create_new_partitions(
8174 current_concurrent_work_part,
8175 mj_current_dim_coords,
8176 current_concurrent_cut_coordinate,
8177 used_local_cut_line_weight_to_left,
8182 mj_lno_t coordinate_end = host_part_xadj(
8183 current_concurrent_work_part);
8184 mj_lno_t coordinate_begin =
8185 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8186 current_concurrent_work_part - 1);
8190 mj_lno_t part_size = coordinate_end - coordinate_begin;
8194 auto local_new_part_xadj = this->new_part_xadj;
8195 Kokkos::parallel_for(
8196 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8197 (0, 1), KOKKOS_LAMBDA (
int dummy) {
8198 local_new_part_xadj(
8199 output_part_index + output_array_shift) = part_size;
8202 auto subview_new_coordinate_permutations =
8203 Kokkos::subview(this->new_coordinate_permutations,
8204 std::pair<mj_lno_t, mj_lno_t>(
8206 coordinate_begin + part_size));
8207 auto subview_coordinate_permutations =
8208 Kokkos::subview(this->coordinate_permutations,
8209 std::pair<mj_lno_t, mj_lno_t>(
8211 coordinate_begin + part_size));
8212 Kokkos::deep_copy(subview_new_coordinate_permutations,
8213 subview_coordinate_permutations);
8215 cut_shift += num_parts - 1;
8216 output_array_shift += num_parts;
8217 partweight_array_shift += (2 * (num_parts - 1) + 1);
8227 mj_part_t num_parts =
8232 auto local_new_part_xadj = this->new_part_xadj;
8233 Kokkos::parallel_for(
8234 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8235 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8236 local_new_part_xadj(output_part_index+ii) +=
8237 output_coordinate_end_index;
8242 Kokkos::parallel_reduce(
"Read single",
8243 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8244 KOKKOS_LAMBDA(
int dummy, mj_part_t & set_single) {
8246 local_new_part_xadj(output_part_index + num_parts - 1);
8248 output_coordinate_end_index = temp_get;
8250 output_part_index += num_parts;
8256 int current_world_size = this->comm->getSize();
8257 long migration_reduce_all_population =
8258 this->total_dim_num_reduce_all * current_world_size;
8259 bool is_migrated_in_current_dimension =
false;
8264 if(future_num_parts > 1 &&
8265 this->check_migrate_avoid_migration_option >= 0 &&
8266 current_world_size > 1) {
8268 mj_timer_base_string +
"Problem_Migration-" + istring);
8269 mj_part_t num_parts = output_part_count_in_dimension;
8271 if(this->mj_perform_migration(
8274 next_future_num_parts_in_parts,
8275 output_part_begin_index,
8276 migration_reduce_all_population,
8277 this->num_global_coords / (future_num_parts * current_num_parts),
8279 input_part_boxes, output_part_boxes) )
8281 is_migrated_in_current_dimension =
true;
8282 is_data_ever_migrated =
true;
8284 mj_timer_base_string +
"Problem_Migration-" + istring);
8287 this->total_dim_num_reduce_all /= num_parts;
8290 is_migrated_in_current_dimension =
false;
8292 mj_timer_base_string +
"Problem_Migration-" + istring);
8297 Kokkos::View<mj_lno_t*, device_t> tmp =
8298 this->coordinate_permutations;
8299 this->coordinate_permutations =
8300 this->new_coordinate_permutations;
8302 this->new_coordinate_permutations = tmp;
8303 if(!is_migrated_in_current_dimension) {
8304 this->total_dim_num_reduce_all -= current_num_parts;
8305 current_num_parts = output_part_count_in_dimension;
8309 this->part_xadj = this->new_part_xadj;
8310 local_part_xadj = this->new_part_xadj;
8311 this->host_part_xadj = Kokkos::create_mirror_view(
part_xadj);
8312 Kokkos::deep_copy(host_part_xadj,
part_xadj);
8314 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
"empty", 0);
8316 mj_timer_base_string +
"Problem_Partitioning_" + istring);
8321 delete future_num_part_in_parts;
8322 delete next_future_num_parts_in_parts;
8324 mj_timer_base_string +
"Problem_Partitioning");
8330 this->set_final_parts(
8332 output_part_begin_index,
8334 is_data_ever_migrated);
8336 result_assigned_part_ids_ = this->assigned_part_ids;
8337 result_mj_gnos_ = this->current_mj_gnos;
8339 mj_timer_base_string +
"Total");
8340 this->mj_env->debug(3,
"Out of MultiJagged");
8343 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8344 typename mj_part_t,
typename mj_node_t>
8345 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8350 if(this->mj_keep_part_boxes) {
8351 return this->kept_boxes;
8354 throw std::logic_error(
"Error: part boxes are not stored.");
8358 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
8359 typename mj_part_t,
typename mj_node_t>
8360 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8366 mj_part_t ntasks = this->num_global_parts;
8367 int dim = (*localPartBoxes)[0].getDim();
8368 coord_t *localPartBoundaries =
new coord_t[ntasks * 2 *dim];
8370 memset(localPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8372 coord_t *globalPartBoundaries =
new coord_t[ntasks * 2 *dim];
8373 memset(globalPartBoundaries, 0,
sizeof(coord_t) * ntasks * 2 *dim);
8375 coord_t *localPartMins = localPartBoundaries;
8376 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8378 coord_t *globalPartMins = globalPartBoundaries;
8379 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8381 mj_part_t boxCount = localPartBoxes->size();
8382 for(mj_part_t i = 0; i < boxCount; ++i) {
8383 mj_part_t pId = (*localPartBoxes)[i].getpId();
8387 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8388 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8390 for(
int j = 0; j < dim; ++j) {
8391 localPartMins[dim * pId + j] = lmins[j];
8392 localPartMaxs[dim * pId + j] = lmaxs[j];
8405 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8406 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8408 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
8409 for(mj_part_t i = 0; i < ntasks; ++i) {
8411 globalPartMins + dim * i,
8412 globalPartMaxs + dim * i);
8425 delete []localPartBoundaries;
8426 delete []globalPartBoundaries;
8433 template <
typename Adapter>
8439 #ifndef DOXYGEN_SHOULD_SKIP_THIS
8445 typedef typename Adapter::scalar_t adapter_scalar_t;
8448 typedef float default_mj_scalar_t;
8454 (std::is_same<adapter_scalar_t, float>::value ||
8455 std::is_same<adapter_scalar_t, double>::value),
8456 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8461 typedef typename Adapter::node_t mj_node_t;
8463 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8464 typedef typename mj_node_t::device_type device_t;
8469 RCP<const Environment> mj_env;
8470 RCP<const Comm<int> > mj_problemComm;
8471 RCP<const coordinateModel_t> mj_coords;
8474 double imbalance_tolerance;
8478 size_t num_global_parts;
8481 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8484 int recursion_depth;
8487 mj_lno_t num_local_coords;
8488 mj_gno_t num_global_coords;
8491 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8495 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8498 int num_weights_per_coord;
8501 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8504 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8507 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8517 mj_part_t num_first_level_parts;
8521 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8525 bool distribute_points_on_cut_lines;
8528 mj_part_t max_concurrent_part_calculation;
8531 int check_migrate_avoid_migration_option;
8539 double minimum_migration_imbalance;
8540 bool mj_keep_part_boxes;
8544 int mj_premigration_option;
8545 int min_coord_per_rank_for_premigration;
8548 ArrayRCP<mj_part_t> comXAdj_;
8551 ArrayRCP<mj_part_t> comAdj_;
8556 void set_input_parameters(
const Teuchos::ParameterList &p);
8558 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
8560 bool mj_premigrate_to_subset(
8562 int migration_selection_option,
8563 RCP<const Environment> mj_env_,
8564 RCP<
const Comm<int> > mj_problemComm_,
8566 mj_lno_t num_local_coords_,
8567 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8568 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8570 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8572 int num_weights_per_coord_,
8573 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8575 RCP<
const Comm<int> > &result_problemComm_,
8576 mj_lno_t & result_num_local_coords_,
8577 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8579 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8580 result_mj_coordinates_,
8581 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8582 int * &result_actual_owner_rank_);
8587 RCP<
const Comm<int> > &problemComm,
8588 const RCP<const coordinateModel_t> &coords) :
8591 mj_problemComm(problemComm),
8593 imbalance_tolerance(0),
8595 num_global_parts(1),
8598 num_local_coords(0),
8599 num_global_coords(0),
8600 num_weights_per_coord(0),
8601 num_first_level_parts(1),
8602 distribute_points_on_cut_lines(true),
8603 max_concurrent_part_calculation(1),
8604 check_migrate_avoid_migration_option(0),
8606 minimum_migration_imbalance(0.30),
8607 mj_keep_part_boxes(false),
8608 mj_run_as_rcb(false),
8609 mj_premigration_option(0),
8610 min_coord_per_rank_for_premigration(32000),
8624 const bool bUnsorted =
true;
8625 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8627 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning "
8628 "algorithm. As many as the dimension count.", mj_parts_Validator);
8630 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut "
8631 "coordinates will be calculated concurently.",
8634 pl.set(
"mj_minimum_migration_imbalance", 1.1,
8635 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8636 "processors to avoid migration",
8639 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8640 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8641 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision "
8642 "depending on the imbalance, 1 for forcing migration, 2 for "
8643 "avoiding migration", mj_migration_option_validator);
8645 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8646 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8647 pl.set(
"mj_migration_type", 0,
8648 "Migration type, 0 for migration to minimize the imbalance "
8649 "1 for migration to minimize messages exchanged the migration.",
8650 mj_migration_option_validator);
8653 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the "
8657 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
8660 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be "
8663 RCP<Teuchos::EnhancedNumberValidator<int>>
8664 mj_num_teams_validator =
8665 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(
8666 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8667 pl.set(
"mj_num_teams", 0,
8668 "How many teams for the main kernel loop"
8669 , mj_num_teams_validator);
8671 RCP<Teuchos::EnhancedNumberValidator<int>>
8672 mj_premigration_option_validator =
8673 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8675 pl.set(
"mj_premigration_option", 0,
8676 "Whether to do premigration or not. 0 for no migration "
8677 "x > 0 for migration to consecutive processors, "
8678 "the subset will be 0,x,2x,3x,...subset ranks."
8679 , mj_premigration_option_validator);
8681 pl.set(
"mj_premigration_coordinate_count", 32000,
"How many coordinate to "
8682 "assign each rank in multijagged after premigration"
8695 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8699 mj_part_t pointAssign(
int dim, adapter_scalar_t *point)
const;
8701 void boxAssign(
int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8702 size_t &nPartsFound, mj_part_t **partsFound)
const;
8706 void getCommunicationGraph(
8708 ArrayRCP<mj_part_t> &comXAdj,
8709 ArrayRCP<mj_part_t> &comAdj);
8711 void set_up_partitioning_data(
8715 std::string timer_base_string;
8723 template<
class dst_t,
class src_t>
8724 typename std::enable_if<std::is_same<
typename dst_t::value_type,
8725 typename src_t::value_type>::value>::type
8726 assign_if_same(dst_t & dst,
const src_t & src) {
8729 template<
class dst_t,
class src_t>
8730 typename std::enable_if<!std::is_same<
typename dst_t::value_type,
8731 typename src_t::value_type>::value>::type
8732 assign_if_same(dst_t & dst,
const src_t & src) {
8737 template <
typename Adapter>
8738 bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8740 int migration_selection_option,
8741 RCP<const Environment> mj_env_,
8742 RCP<
const Comm<int> > mj_problemComm_,
8744 mj_lno_t num_local_coords_,
8745 mj_gno_t num_global_coords_,
size_t num_global_parts_,
8746 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8748 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8749 int num_weights_per_coord_,
8750 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8752 RCP<
const Comm<int> > & result_problemComm_,
8753 mj_lno_t &result_num_local_coords_,
8754 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8756 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8757 result_mj_coordinates_,
8758 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8759 int * &result_actual_owner_rank_)
8762 timer_base_string +
"PreMigration DistributorPlanCreating");
8764 int myRank = mj_problemComm_->getRank();
8765 int worldSize = mj_problemComm_->getSize();
8767 mj_part_t groupsize = worldSize / used_num_ranks;
8769 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8771 mj_part_t i_am_sending_to = 0;
8772 bool am_i_a_receiver =
false;
8774 for(
int i = 0; i < used_num_ranks; ++i) {
8775 group_begins[i+ 1] = group_begins[i] + groupsize;
8776 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8777 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8778 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8779 i_am_sending_to = group_begins[i];
8781 if(myRank == group_begins[i]) {
8782 am_i_a_receiver =
true;
8786 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8787 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8789 Tpetra::Distributor distributor(mj_problemComm_);
8791 std::vector<mj_part_t>
8792 coordinate_destinations(num_local_coords_, i_am_sending_to);
8794 ArrayView<const mj_part_t>
8795 destinations(&(coordinate_destinations[0]), num_local_coords_);
8796 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8797 result_num_local_coords_ = num_incoming_gnos;
8799 timer_base_string +
"PreMigration DistributorPlanCreating");
8802 timer_base_string +
"PreMigration DistributorMigration");
8808 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
8809 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_initial_mj_gnos(
8810 Kokkos::ViewAllocateWithoutInitializing(
"host_initial_mj_gnos"),
8811 initial_mj_gnos_.size());
8812 Kokkos::deep_copy(host_initial_mj_gnos, initial_mj_gnos_);
8813 ArrayView<const mj_gno_t> sent_gnos(host_initial_mj_gnos.data(),
8815 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
8816 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8817 Kokkos::ViewAllocateWithoutInitializing(
"result_initial_mj_gnos_"),
8819 auto host_result_initial_mj_gnos_ = Kokkos::create_mirror_view(
8820 result_initial_mj_gnos_);
8821 memcpy(host_result_initial_mj_gnos_.data(),
8822 received_gnos.getRawPtr(), num_incoming_gnos *
sizeof(mj_gno_t));
8823 Kokkos::deep_copy(result_initial_mj_gnos_, host_result_initial_mj_gnos_);
8828 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8829 Kokkos::ViewAllocateWithoutInitializing(
"mj_coordinates"),
8830 num_incoming_gnos, this->coord_dim);
8831 auto host_dst_coordinates = Kokkos::create_mirror_view(
8833 auto host_src_coordinates =
8834 Kokkos::create_mirror_view(Kokkos::HostSpace(), this->mj_coordinates);
8835 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8836 for(
int i = 0; i < this->coord_dim; ++i) {
8837 auto sub_host_src_coordinates
8838 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8839 auto sub_host_dst_coordinates
8840 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
8842 ArrayView<mj_scalar_t> sent_coord(
8843 sub_host_src_coordinates.data(), this->num_local_coords);
8844 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
8845 distributor.doPostsAndWaits<mj_scalar_t>(
8846 sent_coord, 1, received_coord());
8847 memcpy(sub_host_dst_coordinates.data(),
8848 received_coord.getRawPtr(), num_incoming_gnos *
sizeof(mj_scalar_t));
8850 deep_copy(dst_coordinates, host_dst_coordinates);
8851 result_mj_coordinates_ = dst_coordinates;
8854 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8855 Kokkos::ViewAllocateWithoutInitializing(
"mj_weights"),
8856 num_incoming_gnos, this->num_weights_per_coord);
8857 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8858 auto host_src_weights = Kokkos::create_mirror_view(this->mj_weights);
8859 Kokkos::deep_copy(host_src_weights, this->mj_weights);
8860 for(
int i = 0; i < this->num_weights_per_coord; ++i) {
8861 auto sub_host_src_weights
8862 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8863 auto sub_host_dst_weights
8864 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8865 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
8873 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8874 sent_weight[n] = sub_host_src_weights(n);
8876 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
8877 distributor.doPostsAndWaits<mj_scalar_t>(
8878 sent_weight(), 1, received_weight());
8881 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8882 sub_host_dst_weights(n) = received_weight[n];
8885 Kokkos::deep_copy(dst_weights, host_dst_weights);
8886 result_mj_weights_ = dst_weights;
8890 std::vector<int> owner_of_coordinate(num_local_coords_, myRank);
8891 ArrayView<int> sent_owners(&(owner_of_coordinate[0]), num_local_coords_);
8892 ArrayRCP<int> received_owners(num_incoming_gnos);
8893 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
8894 result_actual_owner_rank_ =
new int[num_incoming_gnos];
8896 result_actual_owner_rank_,
8897 received_owners.getRawPtr(),
8898 num_incoming_gnos *
sizeof(
int));
8902 timer_base_string +
"PreMigration DistributorMigration");
8903 return am_i_a_receiver;
8913 template <
typename Adapter>
8922 int execute_counter =
8924 timer_base_string =
"partition(" + std::to_string(execute_counter) +
") - ";
8926 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"all");
8928 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"setup");
8930 this->set_up_partitioning_data(solution);
8932 this->set_input_parameters(this->mj_env->getParameters());
8933 if(this->mj_keep_part_boxes) {
8934 this->mj_partitioner.set_to_keep_part_boxes();
8937 this->mj_partitioner.set_partitioning_parameters(
8938 this->distribute_points_on_cut_lines,
8939 this->max_concurrent_part_calculation,
8940 this->check_migrate_avoid_migration_option,
8941 this->minimum_migration_imbalance, this->migration_type);
8943 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8944 mj_lno_t result_num_local_coords = this->num_local_coords;
8945 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8947 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8948 result_mj_coordinates = this->mj_coordinates;
8949 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8951 int *result_actual_owner_rank = NULL;
8953 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8954 this->initial_mj_gnos;
8972 int current_world_size = this->mj_problemComm->getSize();
8973 mj_lno_t threshold_num_local_coords =
8974 this->min_coord_per_rank_for_premigration;
8975 bool is_pre_migrated =
false;
8976 bool am_i_in_subset =
true;
8982 if(mj_premigration_option > 0 &&
8983 size_t (current_world_size) > this->num_global_parts &&
8984 this->num_global_coords < mj_gno_t (
8985 current_world_size * threshold_num_local_coords))
8987 if(this->mj_keep_part_boxes) {
8988 throw std::logic_error(
"Multijagged: mj_keep_part_boxes and "
8989 "mj_premigration_option are not supported together yet.");
8992 is_pre_migrated =
true;
8993 int migration_selection_option = mj_premigration_option;
8994 if(migration_selection_option * this->num_global_parts >
8995 (
size_t) (current_world_size)) {
8996 migration_selection_option =
8997 current_world_size / this->num_global_parts;
9000 int used_num_ranks = int (this->num_global_coords /
9001 float (threshold_num_local_coords) + 0.5);
9003 if(used_num_ranks == 0) {
9007 am_i_in_subset = this->mj_premigrate_to_subset(
9009 migration_selection_option,
9011 this->mj_problemComm,
9013 this->num_local_coords,
9014 this->num_global_coords,
9015 this->num_global_parts,
9016 this->initial_mj_gnos,
9017 this->mj_coordinates,
9018 this->num_weights_per_coord,
9022 result_num_local_coords,
9023 result_initial_mj_gnos,
9024 result_mj_coordinates,
9026 result_actual_owner_rank);
9028 result_initial_mj_gnos_ = result_initial_mj_gnos;
9031 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9032 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9034 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"setup");
9036 if(am_i_in_subset) {
9037 this->mj_partitioner.multi_jagged_part(
9040 this->imbalance_tolerance,
9042 this->num_global_parts,
9043 this->part_no_array,
9044 this->recursion_depth,
9046 result_num_local_coords,
9047 this->num_global_coords,
9048 result_initial_mj_gnos_,
9049 result_mj_coordinates,
9050 this->num_weights_per_coord,
9051 this->mj_uniform_weights,
9053 this->mj_uniform_parts,
9054 result_assigned_part_ids,
9059 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
"cleanup");
9062 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9063 localGidToLid.reserve(result_num_local_coords);
9064 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9065 Kokkos::ViewAllocateWithoutInitializing(
"host_result_initial_mj_gnos"),
9066 result_initial_mj_gnos_.size());
9067 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9068 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9069 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9072 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[result_num_local_coords],
9073 0, result_num_local_coords,
true);
9074 auto host_result_assigned_part_ids =
9075 Kokkos::create_mirror_view(result_assigned_part_ids);
9076 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9077 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9078 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9079 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9080 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9081 partId[origLID] = host_result_assigned_part_ids(i);
9086 if(is_pre_migrated) {
9087 this->mj_env->timerStart(
MACRO_TIMERS, timer_base_string +
9088 "PostMigration DistributorPlanCreating");
9089 Tpetra::Distributor distributor(this->mj_problemComm);
9090 ArrayView<const mj_part_t> actual_owner_destinations(
9091 result_actual_owner_rank , result_num_local_coords);
9092 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9093 actual_owner_destinations);
9094 if(num_incoming_gnos != this->num_local_coords) {
9095 throw std::logic_error(
"Zoltan2 - Multijagged Post Migration - "
9096 "num incoming is not equal to num local coords");
9100 "PostMigration DistributorPlanCreating");
9102 "PostMigration DistributorMigration");
9103 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
9104 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
9106 ArrayView<const mj_gno_t> sent_gnos(host_result_initial_mj_gnos.data(),
9107 result_num_local_coords);
9108 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
9112 ArrayView<mj_part_t> sent_partnos(partId());
9113 distributor.doPostsAndWaits<mj_part_t>(sent_partnos, 1,
9114 received_partids());
9117 partId = arcp(
new mj_part_t[this->num_local_coords],
9118 0, this->num_local_coords,
true);
9121 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9122 localGidToLid2.reserve(this->num_local_coords);
9123 auto host_initial_mj_gnos =
9124 Kokkos::create_mirror_view(this->initial_mj_gnos);
9125 Kokkos::deep_copy(host_initial_mj_gnos,
9126 this->initial_mj_gnos);
9127 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9128 localGidToLid2[host_initial_mj_gnos(i)] = i;
9131 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9133 partId[origLID] = received_partids[i];
9138 delete [] result_actual_owner_rank;
9141 timer_base_string +
"PostMigration DistributorMigration");
9143 solution->setParts(partId);
9144 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"cleanup");
9147 this->mj_env->timerStop(
MACRO_TIMERS, timer_base_string +
"all");
9152 template <
typename Adapter>
9157 this->coord_dim = this->mj_coords->getCoordinateDim();
9158 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
9159 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
9160 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
9161 int criteria_dim = (this->num_weights_per_coord ?
9162 this->num_weights_per_coord : 1);
9166 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9169 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9170 "uniform parts", criteria_dim);
9171 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9172 "uniform weights", criteria_dim);
9174 Kokkos::View<const mj_gno_t *, device_t> gnos;
9175 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9177 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9178 this->mj_coords->getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9180 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9181 Kokkos::View<mj_scalar_t **, device_t> wgts;
9185 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9188 assign_if_same(xyz, xyz_adapter);
9189 assign_if_same(wgts, wgts_adapter);
9194 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9195 (Kokkos::ViewAllocateWithoutInitializing(
9196 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9197 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9198 Kokkos::ViewAllocateWithoutInitializing(
"wgts"),
9199 wgts_adapter.extent(0), wgts_adapter.extent(1));
9201 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9202 Kokkos::parallel_for(
9203 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9204 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (
int i) {
9205 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9206 xyz(i, n) =
static_cast<mj_scalar_t
>(xyz_adapter(i, n));
9209 Kokkos::parallel_for(
9210 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9211 (0, wgts.extent(0)), KOKKOS_LAMBDA (
int i) {
9212 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9213 wgts(i, n) =
static_cast<mj_scalar_t
>(wgts_adapter(i, n));
9219 this->initial_mj_gnos = gnos;
9221 this->mj_coordinates = xyz;
9224 if(this->num_weights_per_coord == 0) {
9225 this->mj_uniform_weights(0) =
true;
9226 Kokkos::resize(this->mj_weights, 0, 0);
9229 this->mj_weights = wgts;
9230 for(
int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9231 this->mj_uniform_weights(wdim) =
false;
9235 for(
int wdim = 0; wdim < criteria_dim; ++wdim) {
9236 if(solution->criteriaHasUniformPartSizes(wdim)) {
9237 this->mj_uniform_parts(wdim) =
true;
9240 printf(
"Error: MJ does not support non uniform target part weights\n");
9249 template <
typename Adapter>
9251 const Teuchos::ParameterList &pl)
9253 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
9256 tol = pe->getValue(&tol);
9257 this->imbalance_tolerance = tol - 1.0;
9261 if(this->imbalance_tolerance <= 0) {
9262 this->imbalance_tolerance= 10e-4;
9266 Kokkos::resize(this->part_no_array, 0);
9269 this->recursion_depth = 0;
9271 if(pl.getPtr<
int>(
"mj_num_teams")) {
9272 this->num_teams = pl.get<
int>(
"mj_num_teams");
9275 if(pl.getPtr<Array <mj_part_t> >(
"mj_parts")) {
9276 auto mj_parts = pl.get<Array <mj_part_t> >(
"mj_parts");
9277 int mj_parts_size =
static_cast<int>(mj_parts.size());
9280 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9281 "part_no_array", mj_parts_size);
9282 for(
int i = 0; i < mj_parts_size; ++i) {
9283 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9286 this->recursion_depth = mj_parts_size - 1;
9287 this->mj_env->debug(2,
"mj_parts provided by user");
9291 this->distribute_points_on_cut_lines =
true;
9292 this->max_concurrent_part_calculation = 1;
9294 this->mj_run_as_rcb =
false;
9295 this->mj_premigration_option = 0;
9296 this->min_coord_per_rank_for_premigration = 32000;
9298 int mj_user_recursion_depth = -1;
9299 this->mj_keep_part_boxes =
false;
9300 this->check_migrate_avoid_migration_option = 0;
9301 this->migration_type = 0;
9302 this->minimum_migration_imbalance = 0.35;
9304 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
9307 imb = pe->getValue(&imb);
9308 this->minimum_migration_imbalance = imb - 1.0;
9311 pe = pl.getEntryPtr(
"mj_migration_option");
9313 this->check_migrate_avoid_migration_option =
9314 pe->getValue(&this->check_migrate_avoid_migration_option);
9316 this->check_migrate_avoid_migration_option = 0;
9318 if(this->check_migrate_avoid_migration_option > 1) {
9319 this->check_migrate_avoid_migration_option = -1;
9323 pe = pl.getEntryPtr(
"mj_migration_type");
9325 this->migration_type = pe->getValue(&this->migration_type);
9327 this->migration_type = 0;
9333 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
9335 this->max_concurrent_part_calculation =
9336 pe->getValue(&this->max_concurrent_part_calculation);
9338 this->max_concurrent_part_calculation = 1;
9341 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
9343 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9345 this->mj_keep_part_boxes =
false;
9356 if(this->mj_keep_part_boxes ==
false) {
9357 pe = pl.getEntryPtr(
"mapping_type");
9359 int mapping_type = -1;
9360 mapping_type = pe->getValue(&mapping_type);
9361 if(mapping_type == 0) {
9362 mj_keep_part_boxes =
true;
9368 pe = pl.getEntryPtr(
"mj_enable_rcb");
9370 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9372 this->mj_run_as_rcb =
false;
9375 pe = pl.getEntryPtr(
"mj_premigration_option");
9377 mj_premigration_option = pe->getValue(&mj_premigration_option);
9379 mj_premigration_option = 0;
9382 pe = pl.getEntryPtr(
"mj_premigration_coordinate_count");
9384 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9386 min_coord_per_rank_for_premigration = 32000;
9389 pe = pl.getEntryPtr(
"mj_recursion_depth");
9391 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9393 mj_user_recursion_depth = -1;
9397 pe = pl.getEntryPtr(
"rectilinear");
9399 val = pe->getValue(&val);
9402 this->distribute_points_on_cut_lines =
false;
9404 this->distribute_points_on_cut_lines =
true;
9407 if(this->mj_run_as_rcb) {
9408 mj_user_recursion_depth =
9409 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9411 if(this->recursion_depth < 1) {
9412 if(mj_user_recursion_depth > 0) {
9413 this->recursion_depth = mj_user_recursion_depth;
9416 this->recursion_depth = this->coord_dim;
9422 template <
typename Adapter>
9425 adapter_scalar_t *lower,
9426 adapter_scalar_t *upper,
9427 size_t &nPartsFound,
9437 if(this->mj_keep_part_boxes) {
9440 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9442 size_t nBoxes = (*partBoxes).size();
9444 throw std::logic_error(
"no part boxes exist");
9448 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9450 if(globalBox->boxesOverlap(dim, lower, upper)) {
9452 std::vector<typename Adapter::part_t> partlist;
9455 for(
size_t i = 0; i < nBoxes; i++) {
9457 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9459 partlist.push_back((*partBoxes)[i].getpId());
9481 *partsFound =
new mj_part_t[nPartsFound];
9482 for(
size_t i = 0; i < nPartsFound; i++)
9483 (*partsFound)[i] = partlist[i];
9495 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
9500 template <
typename Adapter>
9503 adapter_scalar_t *point)
const
9509 if(this->mj_keep_part_boxes) {
9513 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9515 size_t nBoxes = (*partBoxes).size();
9517 throw std::logic_error(
"no part boxes exist");
9521 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9523 if(globalBox->pointInBox(dim, point)) {
9527 for(i = 0; i < nBoxes; i++) {
9529 if((*partBoxes)[i].pointInBox(dim, point)) {
9530 foundPart = (*partBoxes)[i].getpId();
9544 std::ostringstream oss;
9546 for(
int j = 0; j < dim; j++) oss << point[j] <<
" ";
9547 oss <<
") not found in domain";
9548 throw std::logic_error(oss.str());
9558 size_t closestBox = 0;
9559 coord_t minDistance = std::numeric_limits<coord_t>::max();
9560 coord_t *centroid =
new coord_t[dim];
9561 for(
size_t i = 0; i < nBoxes; i++) {
9562 (*partBoxes)[i].computeCentroid(centroid);
9565 for(
int j = 0; j < dim; j++) {
9566 diff = centroid[j] - point[j];
9569 if(sum < minDistance) {
9574 foundPart = (*partBoxes)[closestBox].getpId();
9581 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
9585 template <
typename Adapter>
9591 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9592 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9593 mj_part_t ntasks = (*pBoxes).size();
9594 int dim = (*pBoxes)[0].getDim();
9595 GridHash grid(pBoxes, ntasks, dim);
9602 template <
typename Adapter>
9603 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9606 return this->mj_partitioner.get_kept_boxes();
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos redcution operators for the Multi-jagged algorthm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
void set_up_partitioning_data(const RCP< PartitioningSolution< Adapter > > &solution)
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
mj_part_t pointAssign(int dim, adapter_scalar_t *point) const
void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
map_t::local_ordinal_type lno_t
map_t::global_ordinal_type gno_t
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
int value_count_rightleft
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
ArrayCombinationReducer reducer
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
part_t concurrent_current_part
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, const volatile value_type src) const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
part_t concurrent_current_part
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
part_t current_concurrent_num_parts
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
policy_t::member_type member_type
int value_count_rightleft
static int get_counter_Zoltan2_AlgMJ()
static int get_counter_AlgMJ()
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.