49 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_ 50 #define _ZOLTAN2_ALGMultiJagged_HPP_ 57 #include <Teuchos_StandardParameterEntryValidators.hpp> 59 #include <Tpetra_Distributor.hpp> 60 #include <Teuchos_ParameterList.hpp> 67 #if defined(__cplusplus) && __cplusplus >= 201103L 68 #include <unordered_map> 70 #include <Teuchos_Hashtable.hpp> 71 #endif // C++11 is enabled 73 #ifdef ZOLTAN2_USEZOLTANCOMM 74 #ifdef HAVE_ZOLTAN2_MPI 75 #define ENABLE_ZOLTAN_MIGRATION 76 #include "zoltan_comm_cpp.h" 77 #include "zoltan_types.h" 81 #ifdef HAVE_ZOLTAN2_OMP 85 #define LEAST_SIGNIFICANCE 0.0001 86 #define SIGNIFICANCE_MUL 1000 91 #define FUTURE_REDUCEALL_CUTOFF 1500000 94 #define MIN_WORK_LAST_DIM 1000 99 #define ZOLTAN2_ABS(x) ((x) >= 0 ? (x) : -(x)) 101 #define imbalanceOf(Wachieved, totalW, expectedRatio) \ 102 (Wachieved) / ((totalW) * (expectedRatio)) - 1 103 #define imbalanceOf2(Wachieved, wExpected) \ 104 (Wachieved) / (wExpected) - 1 107 #define ZOLTAN2_ALGMULTIJAGGED_SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp; 116 template <
typename Ordinal,
typename T>
135 size(s_), _EPSILON (
std::numeric_limits<T>::
epsilon()){}
139 void reduce(
const Ordinal count,
const T inBuffer[], T inoutBuffer[])
const 141 for (Ordinal i=0; i < count; i++){
142 if (
Z2_ABS(inBuffer[i]) > _EPSILON){
143 inoutBuffer[i] = inBuffer[i];
155 template <
typename T>
160 throw "cannot allocate memory";
172 template <
typename T>
188 template <
typename IT,
typename CT,
typename WT>
209 this->index = index_;
210 this->count = count_;
216 this->index = other.
index;
217 this->count = other.
count;
218 this->val = other.
val;
226 void set(IT index_ ,CT count_, WT *vals_){
227 this->index = index_;
228 this->count = count_;
234 this->index = other.
index;
235 this->count = other.
count;
236 this->val = other.
val;
240 bool operator<(const uMultiSortItem<IT,CT,WT>& other)
const{
241 assert (this->count == other.count);
242 for(CT i = 0; i < this->count; ++i){
244 if (
ZOLTAN2_ABS(this->val[i] - other.val[i]) < this->_EPSILON){
248 if(this->val[i] < other.val[i]){
257 return this->index < other.index;
260 assert (this->count == other.
count);
261 for(CT i = 0; i < this->count; ++i){
267 if(this->val[i] > other.
val[i]){
277 return this->index > other.
index;
284 template <
class IT,
class WT>
295 template <
class IT,
class WT>
301 IT i, ir=n, j, k, l=1;
302 IT jstack=0, istack[50];
311 for (j=l+1;j<=ir;j++)
317 if (arr[i].val <= aval)
333 if (arr[l+1].val > arr[ir].val)
337 if (arr[l].val > arr[ir].val)
341 if (arr[l+1].val > arr[l].val)
351 do i++;
while (arr[i].val < aval);
352 do j--;
while (arr[j].val > aval);
359 if (jstack > NSTACK){
360 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
379 template <
class IT,
class WT,
class SIGN>
386 bool operator<(const uSignedSortItem<IT, WT, SIGN>& rhs)
const {
388 if (this->signbit < rhs.signbit){
392 else if (this->signbit == rhs.signbit){
394 if (this->val < rhs.val){
395 return this->signbit;
398 else if (this->val > rhs.val){
399 return !this->signbit;
414 if (this->signbit > rhs.
signbit){
418 else if (this->signbit == rhs.
signbit){
420 if (this->val < rhs.
val){
421 return !this->signbit;
424 else if (this->val > rhs.
val){
425 return this->signbit;
437 bool operator<=(const uSignedSortItem<IT, WT, SIGN>& rhs){
438 return !(*
this > rhs);}
440 return !(*
this < rhs);}
446 template <
class IT,
class WT,
class SIGN>
451 IT i, ir=n, j, k, l=1;
452 IT jstack=0, istack[50];
460 for (j=l+1;j<=ir;j++)
482 if (arr[l+1] > arr[ir])
486 if (arr[l] > arr[ir])
490 if (arr[l+1] > arr[l])
499 do i++;
while (arr[i] < a);
500 do j--;
while (arr[j] > a);
507 if (jstack > NSTACK){
508 std::cout <<
"uqsort: NSTACK too small in sort." << std::endl;
530 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
536 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
538 RCP<const Environment> mj_env;
539 RCP<const Comm<int> > mj_problemComm;
541 double imbalance_tolerance;
542 mj_part_t *part_no_array;
544 int coord_dim, num_weights_per_coord;
546 size_t initial_num_loc_coords;
549 mj_lno_t num_local_coords;
550 mj_gno_t num_global_coords;
552 mj_scalar_t **mj_coordinates;
553 mj_scalar_t **mj_weights;
554 bool *mj_uniform_parts;
555 mj_scalar_t **mj_part_sizes;
556 bool *mj_uniform_weights;
558 ArrayView<const mj_gno_t> mj_gnos;
559 size_t num_global_parts;
561 mj_gno_t *initial_mj_gnos;
562 mj_gno_t *current_mj_gnos;
563 int *owner_of_coordinate;
565 mj_lno_t *coordinate_permutations;
566 mj_lno_t *new_coordinate_permutations;
567 mj_part_t *assigned_part_ids;
570 mj_lno_t *new_part_xadj;
573 bool distribute_points_on_cut_lines;
574 mj_part_t max_concurrent_part_calculation;
577 int mj_user_recursion_depth;
578 bool mj_keep_part_boxes;
580 int check_migrate_avoid_migration_option;
583 mj_scalar_t minimum_migration_imbalance;
586 mj_part_t total_num_cut ;
587 mj_part_t total_num_part;
589 mj_part_t max_num_part_along_dim ;
590 mj_part_t max_num_cut_along_dim;
591 size_t max_num_total_part_along_dim;
593 mj_part_t total_dim_num_reduce_all;
594 mj_part_t last_dim_num_part;
598 RCP<Comm<int> > comm;
600 mj_scalar_t sEpsilon;
602 mj_scalar_t maxScalar_t;
603 mj_scalar_t minScalar_t;
605 mj_scalar_t *all_cut_coordinates;
606 mj_scalar_t *max_min_coords;
607 mj_scalar_t *process_cut_line_weight_to_put_left;
608 mj_scalar_t **thread_cut_line_weight_to_put_left;
614 mj_scalar_t *cut_coordinates_work_array;
617 mj_scalar_t *target_part_weights;
619 mj_scalar_t *cut_upper_bound_coordinates ;
620 mj_scalar_t *cut_lower_bound_coordinates ;
621 mj_scalar_t *cut_lower_bound_weights ;
622 mj_scalar_t *cut_upper_bound_weights ;
624 mj_scalar_t *process_local_min_max_coord_total_weight ;
625 mj_scalar_t *global_min_max_coord_total_weight ;
629 bool *is_cut_line_determined;
632 mj_part_t *my_incomplete_cut_count;
634 double **thread_part_weights;
636 double **thread_part_weight_work;
639 mj_scalar_t **thread_cut_left_closest_point;
641 mj_scalar_t **thread_cut_right_closest_point;
644 mj_lno_t **thread_point_counts;
646 mj_scalar_t *process_rectilinear_cut_weight;
647 mj_scalar_t *global_rectilinear_cut_weight;
653 mj_scalar_t *total_part_weight_left_right_closests ;
654 mj_scalar_t *global_total_part_weight_left_right_closests;
656 RCP<mj_partBoxVector_t> kept_boxes;
659 RCP<mj_partBox_t> global_box;
660 int myRank, myActualRank;
662 bool divide_to_prime_first;
671 void set_part_specifications();
678 inline mj_part_t get_part_count(
679 mj_part_t num_total_future,
685 void allocate_set_work_memory();
692 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
695 void compute_global_box();
711 mj_part_t update_part_num_arrays(
712 std::vector<mj_part_t> &num_partitioning_in_current_dim,
713 std::vector<mj_part_t> *future_num_part_in_parts,
714 std::vector<mj_part_t> *next_future_num_parts_in_parts,
715 mj_part_t &future_num_parts,
716 mj_part_t current_num_parts,
717 int current_iteration,
718 RCP<mj_partBoxVector_t> input_part_boxes,
719 RCP<mj_partBoxVector_t> output_part_boxes,
720 mj_part_t atomic_part_count);
733 void mj_get_local_min_max_coord_totW(
734 mj_lno_t coordinate_begin_index,
735 mj_lno_t coordinate_end_index,
736 mj_lno_t *mj_current_coordinate_permutations,
737 mj_scalar_t *mj_current_dim_coords,
738 mj_scalar_t &min_coordinate,
739 mj_scalar_t &max_coordinate,
740 mj_scalar_t &total_weight);
749 void mj_get_global_min_max_coord_totW(
750 mj_part_t current_concurrent_num_parts,
751 mj_scalar_t *local_min_max_total,
752 mj_scalar_t *global_min_max_total);
772 void mj_get_initial_cut_coords_target_weights(
773 mj_scalar_t min_coord,
774 mj_scalar_t max_coord,
776 mj_scalar_t global_weight,
777 mj_scalar_t *initial_cut_coords ,
778 mj_scalar_t *target_part_weights ,
780 std::vector <mj_part_t> *future_num_part_in_parts,
781 std::vector <mj_part_t> *next_future_num_parts_in_parts,
782 mj_part_t concurrent_current_part,
783 mj_part_t obtained_part_index);
797 void set_initial_coordinate_parts(
798 mj_scalar_t &max_coordinate,
799 mj_scalar_t &min_coordinate,
800 mj_part_t &concurrent_current_part_index,
801 mj_lno_t coordinate_begin_index,
802 mj_lno_t coordinate_end_index,
803 mj_lno_t *mj_current_coordinate_permutations,
804 mj_scalar_t *mj_current_dim_coords,
805 mj_part_t *mj_part_ids,
806 mj_part_t &partition_count);
819 mj_scalar_t *mj_current_dim_coords,
820 mj_scalar_t imbalanceTolerance,
821 mj_part_t current_work_part,
822 mj_part_t current_concurrent_num_parts,
823 mj_scalar_t *current_cut_coordinates,
824 mj_part_t total_incomplete_cut_count,
825 std::vector <mj_part_t> &num_partitioning_in_current_dim);
846 void mj_1D_part_get_thread_part_weights(
847 size_t total_part_count,
849 mj_scalar_t max_coord,
850 mj_scalar_t min_coord,
851 mj_lno_t coordinate_begin_index,
852 mj_lno_t coordinate_end_index,
853 mj_scalar_t *mj_current_dim_coords,
854 mj_scalar_t *temp_current_cut_coords,
855 bool *current_cut_status,
856 double *my_current_part_weights,
857 mj_scalar_t *my_current_left_closest,
858 mj_scalar_t *my_current_right_closest);
867 void mj_accumulate_thread_results(
868 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
869 mj_part_t current_work_part,
870 mj_part_t current_concurrent_num_parts);
902 void mj_get_new_cut_coordinates(
903 const size_t &num_total_part,
904 const mj_part_t &num_cuts,
905 const mj_scalar_t &max_coordinate,
906 const mj_scalar_t &min_coordinate,
907 const mj_scalar_t &global_total_weight,
908 const mj_scalar_t &used_imbalance_tolerance,
909 mj_scalar_t * current_global_part_weights,
910 const mj_scalar_t * current_local_part_weights,
911 const mj_scalar_t *current_part_target_weights,
912 bool *current_cut_line_determined,
913 mj_scalar_t *current_cut_coordinates,
914 mj_scalar_t *current_cut_upper_bounds,
915 mj_scalar_t *current_cut_lower_bounds,
916 mj_scalar_t *current_global_left_closest_points,
917 mj_scalar_t *current_global_right_closest_points,
918 mj_scalar_t * current_cut_lower_bound_weights,
919 mj_scalar_t * current_cut_upper_weights,
920 mj_scalar_t *new_current_cut_coordinates,
921 mj_scalar_t *current_part_cut_line_weight_to_put_left,
922 mj_part_t *rectilinear_cut_count,
923 mj_part_t &my_num_incomplete_cut);
934 void mj_calculate_new_cut_position (
935 mj_scalar_t cut_upper_bound,
936 mj_scalar_t cut_lower_bound,
937 mj_scalar_t cut_upper_weight,
938 mj_scalar_t cut_lower_weight,
939 mj_scalar_t expected_weight,
940 mj_scalar_t &new_cut_position);
952 void mj_create_new_partitions(
954 mj_scalar_t *mj_current_dim_coords,
955 mj_scalar_t *current_concurrent_cut_coordinate,
956 mj_lno_t coordinate_begin,
957 mj_lno_t coordinate_end,
958 mj_scalar_t *used_local_cut_line_weight_to_left,
959 double **used_thread_part_weight_work,
960 mj_lno_t *out_part_xadj);
984 bool mj_perform_migration(
985 mj_part_t in_num_parts,
986 mj_part_t &out_num_parts,
987 std::vector<mj_part_t> *next_future_num_parts_in_parts,
988 mj_part_t &output_part_begin_index,
989 size_t migration_reduce_all_population,
990 mj_lno_t num_coords_for_last_dim_part,
991 std::string iteration,
992 RCP<mj_partBoxVector_t> &input_part_boxes,
993 RCP<mj_partBoxVector_t> &output_part_boxes);
1004 void get_processor_num_points_in_parts(
1005 mj_part_t num_procs,
1006 mj_part_t num_parts,
1007 mj_gno_t *&num_points_in_all_processor_parts);
1021 bool mj_check_to_migrate(
1022 size_t migration_reduce_all_population,
1023 mj_lno_t num_coords_for_last_dim_part,
1024 mj_part_t num_procs,
1025 mj_part_t num_parts,
1026 mj_gno_t *num_points_in_all_processor_parts);
1046 void mj_migration_part_proc_assignment(
1047 mj_gno_t * num_points_in_all_processor_parts,
1048 mj_part_t num_parts,
1049 mj_part_t num_procs,
1050 mj_lno_t *send_count_to_each_proc,
1051 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1052 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1053 mj_part_t &out_num_part,
1054 std::vector<mj_part_t> &out_part_indices,
1055 mj_part_t &output_part_numbering_begin_index,
1056 int *coordinate_destinations);
1074 void mj_assign_proc_to_parts(
1075 mj_gno_t * num_points_in_all_processor_parts,
1076 mj_part_t num_parts,
1077 mj_part_t num_procs,
1078 mj_lno_t *send_count_to_each_proc,
1079 std::vector<mj_part_t> &processor_ranks_for_subcomm,
1080 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1081 mj_part_t &out_part_index,
1082 mj_part_t &output_part_numbering_begin_index,
1083 int *coordinate_destinations);
1095 void assign_send_destinations(
1096 mj_part_t num_parts,
1097 mj_part_t *part_assignment_proc_begin_indices,
1098 mj_part_t *processor_chains_in_parts,
1099 mj_lno_t *send_count_to_each_proc,
1100 int *coordinate_destinations);
1114 void assign_send_destinations2(
1115 mj_part_t num_parts,
1117 int *coordinate_destinations,
1118 mj_part_t &output_part_numbering_begin_index,
1119 std::vector<mj_part_t> *next_future_num_parts_in_parts);
1137 void mj_assign_parts_to_procs(
1138 mj_gno_t * num_points_in_all_processor_parts,
1139 mj_part_t num_parts,
1140 mj_part_t num_procs,
1141 mj_lno_t *send_count_to_each_proc,
1142 std::vector<mj_part_t> *next_future_num_parts_in_parts,
1143 mj_part_t &out_num_part,
1144 std::vector<mj_part_t> &out_part_indices,
1145 mj_part_t &output_part_numbering_begin_index,
1146 int *coordinate_destinations);
1160 void mj_migrate_coords(
1161 mj_part_t num_procs,
1162 mj_lno_t &num_new_local_points,
1163 std::string iteration,
1164 int *coordinate_destinations,
1165 mj_part_t num_parts);
1173 void create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm);
1181 void fill_permutation_array(
1182 mj_part_t output_num_parts,
1183 mj_part_t num_parts);
1193 void set_final_parts(
1194 mj_part_t current_num_parts,
1195 mj_part_t output_part_begin_index,
1196 RCP<mj_partBoxVector_t> &output_part_boxes,
1197 bool is_data_ever_migrated);
1200 void free_work_memory();
1214 void create_consistent_chunks(
1215 mj_part_t num_parts,
1216 mj_scalar_t *mj_current_dim_coords,
1217 mj_scalar_t *current_concurrent_cut_coordinate,
1218 mj_lno_t coordinate_begin,
1219 mj_lno_t coordinate_end,
1220 mj_scalar_t *used_local_cut_line_weight_to_left,
1221 mj_lno_t *out_part_xadj,
1228 mj_part_t find_largest_prime_factor(mj_part_t num_parts){
1229 mj_part_t largest_factor = 1;
1230 mj_part_t n = num_parts;
1231 mj_part_t divisor = 2;
1233 while (n % divisor == 0){
1235 largest_factor = divisor;
1238 if (divisor * divisor > n){
1245 return largest_factor;
1278 void multi_jagged_part(
1279 const RCP<const Environment> &env,
1280 RCP<
const Comm<int> > &problemComm,
1282 double imbalance_tolerance,
1283 size_t num_global_parts,
1284 mj_part_t *part_no_array,
1285 int recursion_depth,
1288 mj_lno_t num_local_coords,
1289 mj_gno_t num_global_coords,
1290 const mj_gno_t *initial_mj_gnos,
1291 mj_scalar_t **mj_coordinates,
1293 int num_weights_per_coord,
1294 bool *mj_uniform_weights,
1295 mj_scalar_t **mj_weights,
1296 bool *mj_uniform_parts,
1297 mj_scalar_t **mj_part_sizes,
1299 mj_part_t *&result_assigned_part_ids,
1300 mj_gno_t *&result_mj_gnos
1311 void set_partitioning_parameters(
1312 bool distribute_points_on_cut_lines_,
1313 int max_concurrent_part_calculation_,
1314 int check_migrate_avoid_migration_option_,
1315 mj_scalar_t minimum_migration_imbalance_,
int migration_type_ = 0);
1319 void set_to_keep_part_boxes();
1323 RCP<mj_partBox_t> get_global_box()
const;
1325 RCP<mj_partBoxVector_t> get_kept_boxes()
const;
1327 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1328 RCP<mj_partBoxVector_t> &localPartBoxes)
const;
1354 void sequential_task_partitioning(
1355 const RCP<const Environment> &env,
1356 mj_lno_t num_total_coords,
1357 mj_lno_t num_selected_coords,
1358 size_t num_target_part,
1360 mj_scalar_t **mj_coordinates,
1361 mj_lno_t *initial_selected_coords_output_permutation,
1362 mj_lno_t *output_xadj,
1363 int recursion_depth,
1364 const mj_part_t *part_no_array,
1365 bool partition_along_longest_dim,
1366 int num_ranks_per_node,
1367 bool divide_to_prime_first_);
1395 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1398 const RCP<const Environment> &env,
1399 mj_lno_t num_total_coords,
1400 mj_lno_t num_selected_coords,
1401 size_t num_target_part,
1403 mj_scalar_t **mj_coordinates_,
1404 mj_lno_t *inital_adjList_output_adjlist,
1405 mj_lno_t *output_xadj,
1407 const mj_part_t *part_no_array_,
1408 bool partition_along_longest_dim,
1409 int num_ranks_per_node,
1410 bool divide_to_prime_first_
1415 const RCP<Comm<int> > commN;
1416 this->mj_problemComm =
1417 Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1419 Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1420 this->myActualRank = this->myRank = 1;
1422 #ifdef HAVE_ZOLTAN2_OMP 1427 this->divide_to_prime_first = divide_to_prime_first_;
1432 this->imbalance_tolerance = 0;
1433 this->num_global_parts = num_target_part;
1434 this->part_no_array = (mj_part_t *)part_no_array_;
1435 this->recursion_depth = rd;
1437 this->coord_dim = coord_dim_;
1438 this->num_local_coords = num_total_coords;
1439 this->num_global_coords = num_total_coords;
1440 this->mj_coordinates = mj_coordinates_;
1444 this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
1446 this->num_weights_per_coord = 0;
1447 bool *tmp_mj_uniform_weights =
new bool[1];
1448 this->mj_uniform_weights = tmp_mj_uniform_weights ;
1449 this->mj_uniform_weights[0] =
true;
1451 mj_scalar_t **tmp_mj_weights =
new mj_scalar_t *[1];
1452 this->mj_weights = tmp_mj_weights;
1454 bool *tmp_mj_uniform_parts =
new bool[1];
1455 this->mj_uniform_parts = tmp_mj_uniform_parts;
1456 this->mj_uniform_parts[0] =
true;
1458 mj_scalar_t **tmp_mj_part_sizes =
new mj_scalar_t * [1];
1459 this->mj_part_sizes = tmp_mj_part_sizes;
1460 this->mj_part_sizes[0] = NULL;
1462 this->num_threads = 1;
1463 this->set_part_specifications();
1465 this->allocate_set_work_memory();
1467 this->part_xadj[0] =
static_cast<mj_lno_t
>(num_selected_coords);
1468 for(
size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
1469 this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
1472 mj_part_t current_num_parts = 1;
1474 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
1476 mj_part_t future_num_parts = this->total_num_part;
1478 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
1479 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
1480 next_future_num_parts_in_parts->push_back(this->num_global_parts);
1481 RCP<mj_partBoxVector_t> t1;
1482 RCP<mj_partBoxVector_t> t2;
1485 std::vector <uSignedSortItem<int, mj_scalar_t, char> > coord_dimension_range_sorted(this->coord_dim);
1487 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1488 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1490 for (
int i = 0; i < this->recursion_depth; ++i){
1494 std::vector <mj_part_t> num_partitioning_in_current_dim;
1508 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
1509 future_num_part_in_parts = next_future_num_parts_in_parts;
1510 next_future_num_parts_in_parts = tmpPartVect;
1515 next_future_num_parts_in_parts->clear();
1519 mj_part_t output_part_count_in_dimension =
1520 this->update_part_num_arrays(
1521 num_partitioning_in_current_dim,
1522 future_num_part_in_parts,
1523 next_future_num_parts_in_parts,
1528 t2, num_ranks_per_node);
1533 if(output_part_count_in_dimension == current_num_parts) {
1534 tmpPartVect= future_num_part_in_parts;
1535 future_num_part_in_parts = next_future_num_parts_in_parts;
1536 next_future_num_parts_in_parts = tmpPartVect;
1541 std::string istring = Teuchos::toString<int>(i);
1545 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
1548 mj_part_t output_part_index = 0;
1551 mj_part_t output_coordinate_end_index = 0;
1553 mj_part_t current_work_part = 0;
1554 mj_part_t current_concurrent_num_parts = 1;
1556 mj_part_t obtained_part_index = 0;
1559 int coordInd = i % this->coord_dim;
1560 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
1564 for (; current_work_part < current_num_parts;
1565 current_work_part += current_concurrent_num_parts){
1571 mj_part_t actual_work_part_count = 0;
1575 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1576 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
1580 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
1583 ++actual_work_part_count;
1584 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
1585 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
1595 if(partition_along_longest_dim){
1597 mj_scalar_t best_weight_coord = 0;
1598 for (
int coord_traverse_ind = 0; coord_traverse_ind < this->coord_dim; ++coord_traverse_ind){
1599 mj_scalar_t best_min_coord = 0;
1600 mj_scalar_t best_max_coord = 0;
1603 this->mj_get_local_min_max_coord_totW(
1604 coordinate_begin_index,
1605 coordinate_end_index,
1606 this->coordinate_permutations,
1607 this->mj_coordinates[coord_traverse_ind],
1613 coord_dim_mins[coord_traverse_ind] = best_min_coord;
1614 coord_dim_maxs[coord_traverse_ind] = best_max_coord;
1615 mj_scalar_t best_range = best_max_coord - best_min_coord;
1616 coord_dimension_range_sorted[coord_traverse_ind].id = coord_traverse_ind;
1617 coord_dimension_range_sorted[coord_traverse_ind].val = best_range;
1618 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1622 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1623 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].
id;
1634 mj_current_dim_coords = this->mj_coordinates[coordInd];
1636 this->process_local_min_max_coord_total_weight[kk] = coord_dim_mins[coordInd];
1637 this->process_local_min_max_coord_total_weight[kk+ current_concurrent_num_parts] = coord_dim_maxs[coordInd];
1638 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] = best_weight_coord;
1642 this->mj_get_local_min_max_coord_totW(
1643 coordinate_begin_index,
1644 coordinate_end_index,
1645 this->coordinate_permutations,
1646 mj_current_dim_coords,
1647 this->process_local_min_max_coord_total_weight[kk],
1648 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
1649 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]
1655 if (actual_work_part_count > 0){
1657 this->mj_get_global_min_max_coord_totW(
1658 current_concurrent_num_parts,
1659 this->process_local_min_max_coord_total_weight,
1660 this->global_min_max_coord_total_weight);
1664 mj_part_t total_incomplete_cut_count = 0;
1669 mj_part_t concurrent_part_cut_shift = 0;
1670 mj_part_t concurrent_part_part_shift = 0;
1673 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1674 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
1675 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
1676 current_concurrent_num_parts];
1677 mj_scalar_t global_total_weight =
1678 this->global_min_max_coord_total_weight[kk +
1679 2 * current_concurrent_num_parts];
1681 mj_part_t concurrent_current_part_index = current_work_part + kk;
1683 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
1685 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
1686 mj_scalar_t *current_target_part_weights = this->target_part_weights +
1687 concurrent_part_part_shift;
1689 concurrent_part_cut_shift += partition_count - 1;
1691 concurrent_part_part_shift += partition_count;
1695 if(partition_count > 1 && min_coordinate <= max_coordinate){
1699 total_incomplete_cut_count += partition_count - 1;
1702 this->my_incomplete_cut_count[kk] = partition_count - 1;
1705 this->mj_get_initial_cut_coords_target_weights(
1708 partition_count - 1,
1709 global_total_weight,
1711 current_target_part_weights,
1712 future_num_part_in_parts,
1713 next_future_num_parts_in_parts,
1714 concurrent_current_part_index,
1715 obtained_part_index);
1717 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
1718 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
1721 this->set_initial_coordinate_parts(
1724 concurrent_current_part_index,
1725 coordinate_begin_index, coordinate_end_index,
1726 this->coordinate_permutations,
1727 mj_current_dim_coords,
1728 this->assigned_part_ids,
1734 this->my_incomplete_cut_count[kk] = 0;
1736 obtained_part_index += partition_count;
1740 mj_scalar_t used_imbalance = 0;
1745 mj_current_dim_coords,
1748 current_concurrent_num_parts,
1749 current_cut_coordinates,
1750 total_incomplete_cut_count,
1751 num_partitioning_in_current_dim);
1754 obtained_part_index += current_concurrent_num_parts;
1760 mj_part_t output_array_shift = 0;
1761 mj_part_t cut_shift = 0;
1762 size_t tlr_shift = 0;
1763 size_t partweight_array_shift = 0;
1765 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
1766 mj_part_t current_concurrent_work_part = current_work_part + kk;
1767 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
1770 if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
1771 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
1773 for(mj_part_t jj = 0; jj < num_parts; ++jj){
1774 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
1776 cut_shift += num_parts - 1;
1777 tlr_shift += (4 *(num_parts - 1) + 1);
1778 output_array_shift += num_parts;
1779 partweight_array_shift += (2 * (num_parts - 1) + 1);
1783 mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
1784 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
1786 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
1787 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
1790 for(
int ii = 0; ii < this->num_threads; ++ii){
1791 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
1796 this->create_consistent_chunks(
1798 mj_current_dim_coords,
1799 current_concurrent_cut_coordinate,
1802 used_local_cut_line_weight_to_left,
1803 this->new_part_xadj + output_part_index + output_array_shift,
1805 partition_along_longest_dim,
1806 p_coord_dimension_range_sorted);
1811 mj_lno_t part_size = coordinate_end - coordinate_begin;
1812 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
1813 memcpy(this->new_coordinate_permutations + coordinate_begin,
1814 this->coordinate_permutations + coordinate_begin,
1815 part_size *
sizeof(mj_lno_t));
1820 cut_shift += num_parts - 1;
1821 tlr_shift += (4 *(num_parts - 1) + 1);
1822 output_array_shift += num_parts;
1823 partweight_array_shift += (2 * (num_parts - 1) + 1);
1832 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
1833 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
1834 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
1836 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
1838 mj_lno_t coordinate_end = this->new_part_xadj[output_part_index+ii];
1839 mj_lno_t coordinate_begin = this->new_part_xadj[output_part_index];
1841 for (mj_lno_t task_traverse = coordinate_begin; task_traverse < coordinate_end; ++task_traverse){
1842 mj_lno_t l = this->new_coordinate_permutations[task_traverse];
1844 mj_current_dim_coords[l] = -mj_current_dim_coords[l];
1849 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
1851 output_part_index += num_parts ;
1858 current_num_parts = output_part_count_in_dimension;
1861 mj_lno_t * tmp = this->coordinate_permutations;
1862 this->coordinate_permutations = this->new_coordinate_permutations;
1863 this->new_coordinate_permutations = tmp;
1865 freeArray<mj_lno_t>(this->part_xadj);
1866 this->part_xadj = this->new_part_xadj;
1867 this->new_part_xadj = NULL;
1870 for(mj_lno_t i = 0; i < num_total_coords; ++i){
1871 inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
1876 for(
size_t i = 0; i < this->num_global_parts ; ++i){
1877 output_xadj[i+1] = this->part_xadj[i];
1880 delete future_num_part_in_parts;
1881 delete next_future_num_parts_in_parts;
1884 freeArray<mj_part_t>(this->assigned_part_ids);
1885 freeArray<mj_gno_t>(this->initial_mj_gnos);
1886 freeArray<mj_gno_t>(this->current_mj_gnos);
1887 freeArray<bool>(tmp_mj_uniform_weights);
1888 freeArray<bool>(tmp_mj_uniform_parts);
1889 freeArray<mj_scalar_t *>(tmp_mj_weights);
1890 freeArray<mj_scalar_t *>(tmp_mj_part_sizes);
1892 this->free_work_memory();
1894 #ifdef HAVE_ZOLTAN2_OMP 1902 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1905 mj_env(), mj_problemComm(), imbalance_tolerance(0),
1906 part_no_array(NULL), recursion_depth(0), coord_dim(0),
1907 num_weights_per_coord(0), initial_num_loc_coords(0),
1908 initial_num_glob_coords(0),
1909 num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
1910 mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
1911 mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
1912 initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
1913 coordinate_permutations(NULL), new_coordinate_permutations(NULL),
1914 assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
1915 distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
1916 mj_run_as_rcb(false), mj_user_recursion_depth(0), mj_keep_part_boxes(false),
1917 check_migrate_avoid_migration_option(0), migration_type(0), minimum_migration_imbalance(0.30),
1918 num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1919 max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
1920 last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
1921 all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
1922 thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
1923 target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
1924 cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
1925 process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
1926 is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
1927 thread_part_weights(NULL), thread_part_weight_work(NULL),
1928 thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
1929 thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
1930 global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
1931 global_total_part_weight_left_right_closests(NULL),
1932 kept_boxes(),global_box(),
1933 myRank(0), myActualRank(0), divide_to_prime_first(false)
1938 this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
1939 this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();
1947 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1949 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
1952 return this->global_box;
1958 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1961 this->mj_keep_part_boxes =
true;
1972 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
1976 this->total_num_cut = 0;
1977 this->total_num_part = 1;
1978 this->max_num_part_along_dim = 0;
1979 this->total_dim_num_reduce_all = 0;
1980 this->last_dim_num_part = 1;
1983 this->max_num_cut_along_dim = 0;
1984 this->max_num_total_part_along_dim = 0;
1986 if (this->part_no_array){
1988 for (
int i = 0; i < this->recursion_depth; ++i){
1989 this->total_dim_num_reduce_all += this->total_num_part;
1990 this->total_num_part *= this->part_no_array[i];
1991 if(this->part_no_array[i] > this->max_num_part_along_dim) {
1992 this->max_num_part_along_dim = this->part_no_array[i];
1995 this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
1996 this->num_global_parts = this->total_num_part;
1998 mj_part_t future_num_parts = this->num_global_parts;
2001 for (
int i = 0; i < this->recursion_depth; ++i){
2003 mj_part_t maxNoPartAlongI = this->get_part_count(
2004 future_num_parts, 1.0f / (this->recursion_depth - i));
2006 if (maxNoPartAlongI > this->max_num_part_along_dim){
2007 this->max_num_part_along_dim = maxNoPartAlongI;
2010 mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
2011 if (future_num_parts % maxNoPartAlongI){
2014 future_num_parts = nfutureNumParts;
2016 this->total_num_part = this->num_global_parts;
2018 if (this->divide_to_prime_first){
2019 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2020 this->last_dim_num_part = this->num_global_parts;
2029 for (
int i = 0; i < this->recursion_depth; ++i){
2030 this->total_dim_num_reduce_all += p;
2031 p *= this->max_num_part_along_dim;
2034 if (p / this->max_num_part_along_dim > this->num_global_parts){
2035 this->last_dim_num_part = this->num_global_parts;
2038 this->last_dim_num_part = p / this->max_num_part_along_dim;
2044 this->total_num_cut = this->total_num_part - 1;
2045 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2046 this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
2050 if(this->max_concurrent_part_calculation > this->last_dim_num_part){
2051 if(this->mj_problemComm->getRank() == 0){
2052 std::cerr <<
"Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
2053 ") has been set bigger than maximum amount that can be used." <<
2054 " Setting to:" << this->last_dim_num_part <<
"." << std::endl;
2056 this->max_concurrent_part_calculation = this->last_dim_num_part;
2065 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2068 mj_part_t num_total_future,
2071 double fp = pow(num_total_future, root);
2072 mj_part_t ip = mj_part_t (fp);
2073 if (fp - ip < this->fEpsilon * 100){
2095 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2098 std::vector <mj_part_t> &num_partitioning_in_current_dim,
2099 std::vector<mj_part_t> *future_num_part_in_parts,
2100 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2101 mj_part_t &future_num_parts,
2102 mj_part_t current_num_parts,
2103 int current_iteration,
2104 RCP<mj_partBoxVector_t> input_part_boxes,
2105 RCP<mj_partBoxVector_t> output_part_boxes,
2106 mj_part_t atomic_part_count
2109 mj_part_t output_num_parts = 0;
2110 if(this->part_no_array){
2115 mj_part_t p = this->part_no_array[current_iteration];
2117 std::cout <<
"i:" << current_iteration <<
" p is given as:" << p << std::endl;
2121 return current_num_parts;
2124 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2125 num_partitioning_in_current_dim.push_back(p);
2138 future_num_parts /= num_partitioning_in_current_dim[0];
2139 output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];
2141 if (this->mj_keep_part_boxes){
2142 for (mj_part_t k = 0; k < current_num_parts; ++k){
2144 for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
2145 output_part_boxes->push_back((*input_part_boxes)[k]);
2153 for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
2154 next_future_num_parts_in_parts->push_back(future_num_parts);
2164 future_num_parts = 1;
2168 for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
2170 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2174 mj_part_t num_partitions_in_current_dim =
2175 this->get_part_count(
2176 future_num_parts_of_part_ii,
2177 1.0 / (this->recursion_depth - current_iteration)
2180 if (num_partitions_in_current_dim > this->max_num_part_along_dim){
2181 std::cerr <<
"ERROR: maxPartNo calculation is wrong. num_partitions_in_current_dim: " 2182 << num_partitions_in_current_dim <<
"this->max_num_part_along_dim:" 2183 << this->max_num_part_along_dim <<
2184 " this->recursion_depth:" << this->recursion_depth <<
2185 " current_iteration:" << current_iteration <<
2186 " future_num_parts_of_part_ii:" << future_num_parts_of_part_ii <<
2187 " might need to fix max part no calculation for largest_prime_first partitioning" <<
2192 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2194 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2195 if (this->divide_to_prime_first){
2198 output_num_parts += num_partitions_in_current_dim;
2199 if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2200 atomic_part_count = 1;
2203 largest_prime_factor = this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2208 if (largest_prime_factor < num_partitions_in_current_dim){
2209 largest_prime_factor = num_partitions_in_current_dim;
2213 mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2215 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2218 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2220 mj_part_t my_ideal_primescale = ideal_prime_scale;
2222 if (iii < (largest_prime_factor) % num_partitions_in_current_dim){
2223 ++my_ideal_primescale;
2226 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part * my_ideal_primescale;
2229 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor){
2231 ++num_future_parts_for_part_iii;
2234 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2237 if (this->mj_keep_part_boxes){
2238 output_part_boxes->push_back((*input_part_boxes)[ii]);
2242 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2251 output_num_parts += num_partitions_in_current_dim;
2255 if (future_num_parts_of_part_ii == atomic_part_count || future_num_parts_of_part_ii % atomic_part_count != 0){
2256 atomic_part_count = 1;
2259 mj_part_t ideal_num_future_parts_in_part = (future_num_parts_of_part_ii / atomic_part_count) / num_partitions_in_current_dim;
2260 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
2261 mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;
2264 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % num_partitions_in_current_dim){
2266 ++num_future_parts_for_part_iii;
2269 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2272 if (this->mj_keep_part_boxes){
2273 output_part_boxes->push_back((*input_part_boxes)[ii]);
2277 if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
2282 return output_num_parts;
2289 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2294 this->owner_of_coordinate = NULL;
2299 this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2301 #ifdef HAVE_ZOLTAN2_OMP 2302 #pragma omp parallel for 2304 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
2305 this->coordinate_permutations[i] = i;
2309 this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
2311 this->assigned_part_ids = NULL;
2312 if(this->num_local_coords > 0){
2313 this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
2319 this->part_xadj = allocMemory<mj_lno_t>(1);
2320 this->part_xadj[0] =
static_cast<mj_lno_t
>(this->num_local_coords);
2322 this->new_part_xadj = NULL;
2328 this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2330 this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);
2332 this->process_cut_line_weight_to_put_left = NULL;
2333 this->thread_cut_line_weight_to_put_left = NULL;
2335 if(this->distribute_points_on_cut_lines){
2336 this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2337 this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
2338 for(
int i = 0; i < this->num_threads; ++i){
2339 this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2341 this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2342 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
2350 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
2351 this->max_concurrent_part_calculation);
2355 this->target_part_weights = allocMemory<mj_scalar_t>(
2356 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2359 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2360 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2361 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2362 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2364 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2365 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);
2369 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2372 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
2374 this->thread_part_weights = allocMemory<double *>(this->num_threads);
2376 this->thread_part_weight_work = allocMemory<double *>(this->num_threads);
2379 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2381 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
2384 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);
2386 for(
int i = 0; i < this->num_threads; ++i){
2388 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2389 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2390 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2391 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim);
2397 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2398 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2401 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
2402 for (
int i=0; i < this->coord_dim; i++){
2403 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2404 #ifdef HAVE_ZOLTAN2_OMP 2405 #pragma omp parallel for 2407 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2408 coord[i][j] = this->mj_coordinates[i][j];
2410 this->mj_coordinates = coord;
2413 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
2414 mj_scalar_t **
weights = allocMemory<mj_scalar_t *>(criteria_dim);
2416 for (
int i=0; i < criteria_dim; i++){
2419 for (
int i=0; i < this->num_weights_per_coord; i++){
2420 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
2421 #ifdef HAVE_ZOLTAN2_OMP 2422 #pragma omp parallel for 2424 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2425 weights[i][j] = this->mj_weights[i][j];
2429 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
2430 #ifdef HAVE_ZOLTAN2_OMP 2431 #pragma omp parallel for 2433 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2434 this->current_mj_gnos[j] = this->initial_mj_gnos[j];
2436 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);
2438 #ifdef HAVE_ZOLTAN2_OMP 2439 #pragma omp parallel for 2441 for (mj_lno_t j=0; j < this->num_local_coords; j++)
2442 this->owner_of_coordinate[j] = this->myActualRank;
2447 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2452 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
2454 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
2456 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
2458 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
2460 for (
int i = 0; i < this->coord_dim; ++i){
2461 mj_scalar_t localMin = std::numeric_limits<mj_scalar_t>::max();
2462 mj_scalar_t localMax = -localMin;
2463 if (localMax > 0) localMax = 0;
2466 for (mj_lno_t j = 0; j < this->num_local_coords; ++j){
2467 if (this->mj_coordinates[i][j] < localMin){
2468 localMin = this->mj_coordinates[i][j];
2470 if (this->mj_coordinates[i][j] > localMax){
2471 localMax = this->mj_coordinates[i][j];
2480 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2481 this->coord_dim, mins, gmins
2485 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2486 this->coord_dim, maxs, gmaxs
2492 global_box = rcp(
new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2494 freeArray<mj_scalar_t>(mins);
2495 freeArray<mj_scalar_t>(gmins);
2496 freeArray<mj_scalar_t>(maxs);
2497 freeArray<mj_scalar_t>(gmaxs);
2505 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2508 RCP<mj_partBoxVector_t> & initial_partitioning_boxes
2512 initial_partitioning_boxes->push_back(tmp_box);
2525 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2528 mj_lno_t coordinate_begin_index,
2529 mj_lno_t coordinate_end_index,
2530 mj_lno_t *mj_current_coordinate_permutations,
2531 mj_scalar_t *mj_current_dim_coords,
2532 mj_scalar_t &min_coordinate,
2533 mj_scalar_t &max_coordinate,
2534 mj_scalar_t &total_weight){
2538 if(coordinate_begin_index >= coordinate_end_index)
2540 min_coordinate = this->maxScalar_t;
2541 max_coordinate = this->minScalar_t;
2545 mj_scalar_t my_total_weight = 0;
2546 #ifdef HAVE_ZOLTAN2_OMP 2547 #pragma omp parallel num_threads(this->num_threads) 2551 if (this->mj_uniform_weights[0]) {
2552 #ifdef HAVE_ZOLTAN2_OMP 2556 my_total_weight = coordinate_end_index - coordinate_begin_index;
2562 #ifdef HAVE_ZOLTAN2_OMP 2563 #pragma omp for reduction(+:my_total_weight) 2565 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2566 int i = mj_current_coordinate_permutations[ii];
2567 my_total_weight += this->mj_weights[0][i];
2571 int my_thread_id = 0;
2572 #ifdef HAVE_ZOLTAN2_OMP 2573 my_thread_id = omp_get_thread_num();
2575 mj_scalar_t my_thread_min_coord, my_thread_max_coord;
2576 my_thread_min_coord=my_thread_max_coord
2577 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
2580 #ifdef HAVE_ZOLTAN2_OMP 2583 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
2584 int i = mj_current_coordinate_permutations[j];
2585 if(mj_current_dim_coords[i] > my_thread_max_coord)
2586 my_thread_max_coord = mj_current_dim_coords[i];
2587 if(mj_current_dim_coords[i] < my_thread_min_coord)
2588 my_thread_min_coord = mj_current_dim_coords[i];
2590 this->max_min_coords[my_thread_id] = my_thread_min_coord;
2591 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
2593 #ifdef HAVE_ZOLTAN2_OMP 2596 #pragma omp single nowait 2599 min_coordinate = this->max_min_coords[0];
2600 for(
int i = 1; i < this->num_threads; ++i){
2601 if(this->max_min_coords[i] < min_coordinate)
2602 min_coordinate = this->max_min_coords[i];
2606 #ifdef HAVE_ZOLTAN2_OMP 2607 #pragma omp single nowait 2610 max_coordinate = this->max_min_coords[this->num_threads];
2611 for(
int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
2612 if(this->max_min_coords[i] > max_coordinate)
2613 max_coordinate = this->max_min_coords[i];
2617 total_weight = my_total_weight;
2629 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2632 mj_part_t current_concurrent_num_parts,
2633 mj_scalar_t *local_min_max_total,
2634 mj_scalar_t *global_min_max_total){
2639 if(this->comm->getSize() > 1){
2642 current_concurrent_num_parts,
2643 current_concurrent_num_parts,
2644 current_concurrent_num_parts);
2646 reduceAll<int, mj_scalar_t>(
2649 3 * current_concurrent_num_parts,
2650 local_min_max_total,
2651 global_min_max_total);
2656 mj_part_t s = 3 * current_concurrent_num_parts;
2657 for (mj_part_t i = 0; i < s; ++i){
2658 global_min_max_total[i] = local_min_max_total[i];
2683 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2686 mj_scalar_t min_coord,
2687 mj_scalar_t max_coord,
2688 mj_part_t num_cuts ,
2689 mj_scalar_t global_weight,
2690 mj_scalar_t *initial_cut_coords ,
2691 mj_scalar_t *current_target_part_weights ,
2693 std::vector <mj_part_t> *future_num_part_in_parts,
2694 std::vector <mj_part_t> *next_future_num_parts_in_parts,
2695 mj_part_t concurrent_current_part,
2696 mj_part_t obtained_part_index
2699 mj_scalar_t coord_range = max_coord - min_coord;
2700 if(this->mj_uniform_parts[0]){
2702 mj_part_t cumulative = 0;
2704 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);
2708 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
2714 for(mj_part_t i = 0; i < num_cuts; ++i){
2715 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];
2723 current_target_part_weights[i] = cumulative * unit_part_weight;
2727 initial_cut_coords[i] = min_coord + (coord_range * cumulative) / total_future_part_count_in_part;
2729 current_target_part_weights[num_cuts] = 1;
2733 if (this->mj_uniform_weights[0]){
2734 for(mj_part_t i = 0; i < num_cuts + 1; ++i){
2736 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
2741 std::cerr <<
"MJ does not support non uniform part weights" << std::endl;
2759 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2762 mj_scalar_t &max_coordinate,
2763 mj_scalar_t &min_coordinate,
2764 mj_part_t &concurrent_current_part_index,
2765 mj_lno_t coordinate_begin_index,
2766 mj_lno_t coordinate_end_index,
2767 mj_lno_t *mj_current_coordinate_permutations,
2768 mj_scalar_t *mj_current_dim_coords,
2769 mj_part_t *mj_part_ids,
2770 mj_part_t &partition_count
2772 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
2776 if(
ZOLTAN2_ABS(coordinate_range) < this->sEpsilon ){
2777 #ifdef HAVE_ZOLTAN2_OMP 2778 #pragma omp parallel for 2780 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2781 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
2788 mj_scalar_t slice = coordinate_range / partition_count;
2790 #ifdef HAVE_ZOLTAN2_OMP 2791 #pragma omp parallel for 2793 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
2795 mj_lno_t iii = mj_current_coordinate_permutations[ii];
2796 mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
2797 mj_part_ids[iii] = 2 * pp;
2813 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
2816 mj_scalar_t *mj_current_dim_coords,
2817 mj_scalar_t used_imbalance_tolerance,
2818 mj_part_t current_work_part,
2819 mj_part_t current_concurrent_num_parts,
2820 mj_scalar_t *current_cut_coordinates,
2821 mj_part_t total_incomplete_cut_count,
2822 std::vector <mj_part_t> &num_partitioning_in_current_dim
2826 mj_part_t rectilinear_cut_count = 0;
2827 mj_scalar_t *temp_cut_coords = current_cut_coordinates;
2830 *reductionOp = NULL;
2832 <mj_part_t, mj_scalar_t>(
2833 &num_partitioning_in_current_dim ,
2835 current_concurrent_num_parts);
2837 size_t total_reduction_size = 0;
2838 #ifdef HAVE_ZOLTAN2_OMP 2839 #pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count) num_threads(this->num_threads) 2843 #ifdef HAVE_ZOLTAN2_OMP 2844 me = omp_get_thread_num();
2846 double *my_thread_part_weights = this->thread_part_weights[me];
2847 mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
2848 mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];
2850 #ifdef HAVE_ZOLTAN2_OMP 2856 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
2858 mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
2859 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
2860 total_reduction_size += (4 * num_cut_in_dim + 1);
2862 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
2863 this->is_cut_line_determined[next] =
false;
2864 this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i];
2865 this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts];
2867 this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts];
2868 this->cut_lower_bound_weights[next] = 0;
2870 if(this->distribute_points_on_cut_lines){
2871 this->process_cut_line_weight_to_put_left[next] = 0;
2882 while (total_incomplete_cut_count != 0){
2884 mj_part_t concurrent_cut_shifts = 0;
2885 size_t total_part_shift = 0;
2887 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2888 mj_part_t num_parts = -1;
2889 num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2891 mj_part_t num_cuts = num_parts - 1;
2892 size_t total_part_count = num_parts + size_t (num_cuts) ;
2893 if (this->my_incomplete_cut_count[kk] > 0){
2896 bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
2897 double *my_current_part_weights = my_thread_part_weights + total_part_shift;
2898 mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
2899 mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;
2901 mj_part_t conccurent_current_part = current_work_part + kk;
2902 mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
2903 mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
2904 mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;
2906 mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
2907 mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2910 this->mj_1D_part_get_thread_part_weights(
2915 coordinate_begin_index,
2916 coordinate_end_index,
2917 mj_current_dim_coords,
2918 temp_current_cut_coords,
2920 my_current_part_weights,
2921 my_current_left_closest,
2922 my_current_right_closest);
2926 concurrent_cut_shifts += num_cuts;
2927 total_part_shift += total_part_count;
2931 this->mj_accumulate_thread_results(
2932 num_partitioning_in_current_dim,
2934 current_concurrent_num_parts);
2937 #ifdef HAVE_ZOLTAN2_OMP 2941 if(this->comm->getSize() > 1){
2942 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
2943 total_reduction_size,
2944 this->total_part_weight_left_right_closests,
2945 this->global_total_part_weight_left_right_closests);
2950 this->global_total_part_weight_left_right_closests,
2951 this->total_part_weight_left_right_closests,
2952 total_reduction_size *
sizeof(mj_scalar_t));
2957 mj_part_t cut_shift = 0;
2960 size_t tlr_shift = 0;
2961 for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
2962 mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
2963 mj_part_t num_cuts = num_parts - 1;
2964 size_t num_total_part = num_parts + size_t (num_cuts) ;
2969 if (this->my_incomplete_cut_count[kk] == 0) {
2970 cut_shift += num_cuts;
2971 tlr_shift += (num_total_part + 2 * num_cuts);
2975 mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
2976 mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
2977 mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part;
2978 mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts;
2979 mj_scalar_t *current_global_part_weights = current_global_tlr;
2980 bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;
2982 mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
2983 mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;
2985 mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
2986 mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
2987 mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
2988 mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
2989 mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
2990 mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
2991 mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;
2993 mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];
2996 this->mj_get_new_cut_coordinates(
3001 global_total_weight,
3002 used_imbalance_tolerance,
3003 current_global_part_weights,
3004 current_local_part_weights,
3005 current_part_target_weights,
3006 current_cut_line_determined,
3007 temp_cut_coords + cut_shift,
3008 current_cut_upper_bounds,
3009 current_cut_lower_bounds,
3010 current_global_left_closest_points,
3011 current_global_right_closest_points,
3012 current_cut_lower_bound_weights,
3013 current_cut_upper_weights,
3014 this->cut_coordinates_work_array +cut_shift,
3015 current_part_cut_line_weight_to_put_left,
3016 &rectilinear_cut_count,
3017 this->my_incomplete_cut_count[kk]);
3019 cut_shift += num_cuts;
3020 tlr_shift += (num_total_part + 2 * num_cuts);
3021 mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
3022 #ifdef HAVE_ZOLTAN2_OMP 3026 total_incomplete_cut_count -= iteration_complete_cut_count;
3031 #ifdef HAVE_ZOLTAN2_OMP 3037 mj_scalar_t *t = temp_cut_coords;
3038 temp_cut_coords = this->cut_coordinates_work_array;
3039 this->cut_coordinates_work_array = t;
3048 if (current_cut_coordinates != temp_cut_coords){
3049 #ifdef HAVE_ZOLTAN2_OMP 3054 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3055 mj_part_t num_parts = -1;
3056 num_parts = num_partitioning_in_current_dim[current_work_part + i];
3057 mj_part_t num_cuts = num_parts - 1;
3059 for(mj_part_t ii = 0; ii < num_cuts; ++ii){
3060 current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
3066 #ifdef HAVE_ZOLTAN2_OMP 3070 this->cut_coordinates_work_array = temp_cut_coords;
3097 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3100 size_t total_part_count,
3102 mj_scalar_t max_coord,
3103 mj_scalar_t min_coord,
3104 mj_lno_t coordinate_begin_index,
3105 mj_lno_t coordinate_end_index,
3106 mj_scalar_t *mj_current_dim_coords,
3107 mj_scalar_t *temp_current_cut_coords,
3108 bool *current_cut_status,
3109 double *my_current_part_weights,
3110 mj_scalar_t *my_current_left_closest,
3111 mj_scalar_t *my_current_right_closest){
3114 for (
size_t i = 0; i < total_part_count; ++i){
3115 my_current_part_weights[i] = 0;
3120 for(mj_part_t i = 0; i < num_cuts; ++i){
3121 my_current_left_closest[i] = min_coord - 1;
3122 my_current_right_closest[i] = max_coord + 1;
3125 mj_scalar_t minus_EPSILON = -this->sEpsilon;
3126 #ifdef HAVE_ZOLTAN2_OMP 3132 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
3133 int i = this->coordinate_permutations[ii];
3137 mj_part_t j = this->assigned_part_ids[i] / 2;
3143 mj_part_t lower_cut_index = 0;
3144 mj_part_t upper_cut_index = num_cuts - 1;
3146 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
3147 bool is_inserted =
false;
3148 bool is_on_left_of_cut =
false;
3149 bool is_on_right_of_cut =
false;
3150 mj_part_t last_compared_part = -1;
3152 mj_scalar_t coord = mj_current_dim_coords[i];
3154 while(upper_cut_index >= lower_cut_index)
3157 last_compared_part = -1;
3158 is_on_left_of_cut =
false;
3159 is_on_right_of_cut =
false;
3160 mj_scalar_t cut = temp_current_cut_coords[j];
3161 mj_scalar_t distance_to_cut = coord - cut;
3162 mj_scalar_t abs_distance_to_cut =
ZOLTAN2_ABS(distance_to_cut);
3165 if(abs_distance_to_cut < this->sEpsilon){
3167 my_current_part_weights[j * 2 + 1] += w;
3168 this->assigned_part_ids[i] = j * 2 + 1;
3171 my_current_left_closest[j] = coord;
3172 my_current_right_closest[j] = coord;
3175 mj_part_t kk = j + 1;
3176 while(kk < num_cuts){
3178 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3179 if(distance_to_cut < this->sEpsilon){
3180 my_current_part_weights[2 * kk + 1] += w;
3181 my_current_left_closest[kk] = coord;
3182 my_current_right_closest[kk] = coord;
3188 if(coord - my_current_left_closest[kk] > this->sEpsilon){
3189 my_current_left_closest[kk] = coord;
3199 distance_to_cut =
ZOLTAN2_ABS(temp_current_cut_coords[kk] - cut);
3200 if(distance_to_cut < this->sEpsilon){
3201 my_current_part_weights[2 * kk + 1] += w;
3203 this->assigned_part_ids[i] = kk * 2 + 1;
3204 my_current_left_closest[kk] = coord;
3205 my_current_right_closest[kk] = coord;
3211 if(my_current_right_closest[kk] - coord > this->sEpsilon){
3212 my_current_right_closest[kk] = coord;
3223 if (distance_to_cut < 0) {
3224 bool _break =
false;
3228 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
3229 if(distance_to_next_cut > this->sEpsilon){
3235 upper_cut_index = j - 1;
3237 is_on_left_of_cut =
true;
3238 last_compared_part = j;
3243 bool _break =
false;
3244 if(j < num_cuts - 1){
3247 mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
3248 if(distance_to_next_cut < minus_EPSILON){
3255 lower_cut_index = j + 1;
3257 is_on_right_of_cut =
true;
3258 last_compared_part = j;
3263 j = (upper_cut_index + lower_cut_index) / 2;
3266 if(is_on_right_of_cut){
3269 my_current_part_weights[2 * last_compared_part + 2] += w;
3270 this->assigned_part_ids[i] = 2 * last_compared_part + 2;
3273 if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
3274 my_current_right_closest[last_compared_part] = coord;
3277 if(last_compared_part+1 < num_cuts){
3279 if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
3280 my_current_left_closest[last_compared_part + 1] = coord;
3285 else if(is_on_left_of_cut){
3288 my_current_part_weights[2 * last_compared_part] += w;
3289 this->assigned_part_ids[i] = 2 * last_compared_part;
3293 if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
3294 my_current_left_closest[last_compared_part] = coord;
3298 if(last_compared_part-1 >= 0){
3299 if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
3300 my_current_right_closest[last_compared_part -1] = coord;
3309 for (
size_t i = 1; i < total_part_count; ++i){
3313 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
3314 ZOLTAN2_ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
3319 my_current_part_weights[i] = my_current_part_weights[i-2];
3323 my_current_part_weights[i] += my_current_part_weights[i-1];
3335 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3338 const std::vector <mj_part_t> &num_partitioning_in_current_dim,
3339 mj_part_t current_work_part,
3340 mj_part_t current_concurrent_num_parts){
3342 #ifdef HAVE_ZOLTAN2_OMP 3349 size_t tlr_array_shift = 0;
3350 mj_part_t cut_shift = 0;
3353 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3355 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3356 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3357 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3360 for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
3361 mj_part_t next = tlr_array_shift + ii;
3362 mj_part_t cut_index = cut_shift + ii;
3363 if(this->is_cut_line_determined[cut_index])
continue;
3364 mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
3365 right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];
3368 for (
int j = 1; j < this->num_threads; ++j){
3369 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
3370 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
3372 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
3373 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
3377 this->total_part_weight_left_right_closests[num_total_part_in_part +
3378 next] = left_closest_in_process;
3379 this->total_part_weight_left_right_closests[num_total_part_in_part +
3380 num_cuts_in_part + next] = right_closest_in_process;
3383 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
3384 cut_shift += num_cuts_in_part;
3387 tlr_array_shift = 0;
3389 size_t total_part_array_shift = 0;
3392 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
3394 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
3395 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
3396 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;
3398 for(
size_t j = 0; j < num_total_part_in_part; ++j){
3400 mj_part_t cut_ind = j / 2 + cut_shift;
3405 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind])
continue;
3407 for (
int k = 0; k < this->num_threads; ++k){
3408 pwj += this->thread_part_weights[k][total_part_array_shift + j];
3411 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
3413 cut_shift += num_cuts_in_part;
3414 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
3415 total_part_array_shift += num_total_part_in_part;
3433 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3436 mj_scalar_t cut_upper_bound,
3437 mj_scalar_t cut_lower_bound,
3438 mj_scalar_t cut_upper_weight,
3439 mj_scalar_t cut_lower_weight,
3440 mj_scalar_t expected_weight,
3441 mj_scalar_t &new_cut_position){
3443 if(
ZOLTAN2_ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
3444 new_cut_position = cut_upper_bound;
3448 if(
ZOLTAN2_ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
3449 new_cut_position = cut_lower_bound;
3452 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
3453 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
3454 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
3456 mj_scalar_t required_shift = (my_weight_diff / weight_range);
3457 int scale_constant = 20;
3458 int shiftint= int (required_shift * scale_constant);
3459 if (shiftint == 0) shiftint = 1;
3460 required_shift = mj_scalar_t (shiftint) / scale_constant;
3461 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
3475 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3478 mj_part_t num_parts,
3479 mj_scalar_t *mj_current_dim_coords,
3480 mj_scalar_t *current_concurrent_cut_coordinate,
3481 mj_lno_t coordinate_begin,
3482 mj_lno_t coordinate_end,
3483 mj_scalar_t *used_local_cut_line_weight_to_left,
3484 double **used_thread_part_weight_work,
3485 mj_lno_t *out_part_xadj){
3487 mj_part_t num_cuts = num_parts - 1;
3489 #ifdef HAVE_ZOLTAN2_OMP 3490 #pragma omp parallel 3494 #ifdef HAVE_ZOLTAN2_OMP 3495 me = omp_get_thread_num();
3498 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
3499 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
3503 if (this->distribute_points_on_cut_lines){
3504 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
3506 #ifdef HAVE_ZOLTAN2_OMP 3509 for (mj_part_t i = 0; i < num_cuts; ++i){
3511 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
3512 for(
int ii = 0; ii < this->num_threads; ++ii){
3513 if(left_weight > this->sEpsilon){
3515 mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
3516 if(thread_ii_weight_on_cut < left_weight){
3518 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
3522 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
3524 left_weight -= thread_ii_weight_on_cut;
3527 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
3535 for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
3536 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
3537 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
3540 / mj_scalar_t(SIGNIFICANCE_MUL);
3545 for(mj_part_t ii = 0; ii < num_parts; ++ii){
3546 thread_num_points_in_parts[ii] = 0;
3550 #ifdef HAVE_ZOLTAN2_OMP 3554 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3556 mj_lno_t coordinate_index = this->coordinate_permutations[ii];
3557 mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
3558 mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
3559 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
3560 if(coordinate_assigned_place % 2 == 1){
3562 if(this->distribute_points_on_cut_lines
3563 && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
3567 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3572 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
3573 && coordinate_assigned_part < num_cuts - 1
3574 &&
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3575 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3576 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3578 ++thread_num_points_in_parts[coordinate_assigned_part];
3579 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3583 ++coordinate_assigned_part;
3585 while(this->distribute_points_on_cut_lines &&
3586 coordinate_assigned_part < num_cuts){
3588 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
3589 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
3592 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
3594 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
3595 ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
3596 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
3599 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
3600 coordinate_assigned_part < num_cuts - 1 &&
3601 ZOLTAN2_ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
3602 current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
3603 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
3611 ++coordinate_assigned_part;
3613 ++thread_num_points_in_parts[coordinate_assigned_part];
3614 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3619 ++thread_num_points_in_parts[coordinate_assigned_part];
3620 this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
3629 #ifdef HAVE_ZOLTAN2_OMP 3632 for(mj_part_t j = 0; j < num_parts; ++j){
3633 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
3634 for (
int i = 0; i < this->num_threads; ++i){
3635 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
3637 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
3638 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
3641 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
3645 #ifdef HAVE_ZOLTAN2_OMP 3650 for(mj_part_t j = 1; j < num_parts; ++j){
3651 out_part_xadj[j] += out_part_xadj[j - 1];
3657 for(mj_part_t j = 1; j < num_parts; ++j){
3658 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
3664 #ifdef HAVE_ZOLTAN2_OMP 3667 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
3668 mj_lno_t i = this->coordinate_permutations[ii];
3669 mj_part_t p = this->assigned_part_ids[i];
3670 this->new_coordinate_permutations[coordinate_begin +
3671 thread_num_points_in_parts[p]++] = i;
3706 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
3709 const size_t &num_total_part,
3710 const mj_part_t &num_cuts,
3711 const mj_scalar_t &max_coordinate,
3712 const mj_scalar_t &min_coordinate,
3713 const mj_scalar_t &global_total_weight,
3714 const mj_scalar_t &used_imbalance_tolerance,
3715 mj_scalar_t * current_global_part_weights,
3716 const mj_scalar_t * current_local_part_weights,
3717 const mj_scalar_t *current_part_target_weights,
3718 bool *current_cut_line_determined,
3719 mj_scalar_t *current_cut_coordinates,
3720 mj_scalar_t *current_cut_upper_bounds,
3721 mj_scalar_t *current_cut_lower_bounds,
3722 mj_scalar_t *current_global_left_closest_points,
3723 mj_scalar_t *current_global_right_closest_points,
3724 mj_scalar_t * current_cut_lower_bound_weights,
3725 mj_scalar_t * current_cut_upper_weights,
3726 mj_scalar_t *new_current_cut_coordinates,
3727 mj_scalar_t *current_part_cut_line_weight_to_put_left,
3728 mj_part_t *rectilinear_cut_count,
3729 mj_part_t &my_num_incomplete_cut){
3732 mj_scalar_t seen_weight_in_part = 0;
3734 mj_scalar_t expected_weight_in_part = 0;
3736 mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;
3739 #ifdef HAVE_ZOLTAN2_OMP 3742 for (mj_part_t i = 0; i < num_cuts; i++){
3745 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
3746 current_global_left_closest_points[i] = current_cut_coordinates[i];
3747 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
3748 current_global_right_closest_points[i] = current_cut_coordinates[i];
3751 #ifdef HAVE_ZOLTAN2_OMP 3754 for (mj_part_t i = 0; i < num_cuts; i++){
3756 if(this->distribute_points_on_cut_lines){
3758 this->global_rectilinear_cut_weight[i] = 0;
3759 this->process_rectilinear_cut_weight[i] = 0;
3763 if(current_cut_line_determined[i]) {
3764 new_current_cut_coordinates[i] = current_cut_coordinates[i];
3769 seen_weight_in_part = current_global_part_weights[i * 2];
3778 expected_weight_in_part = current_part_target_weights[i];
3780 imbalance_on_left =
imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
3782 imbalance_on_right =
imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);
3784 bool is_left_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
3785 bool is_right_imbalance_valid =
ZOLTAN2_ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;
3788 if(is_left_imbalance_valid && is_right_imbalance_valid){
3789 current_cut_line_determined[i] =
true;
3790 #ifdef HAVE_ZOLTAN2_OMP 3793 my_num_incomplete_cut -= 1;
3794 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3797 else if(imbalance_on_left < 0){
3800 if(this->distribute_points_on_cut_lines){
3805 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
3807 current_cut_line_determined[i] =
true;
3808 #ifdef HAVE_ZOLTAN2_OMP 3811 my_num_incomplete_cut -= 1;
3814 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3818 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
3821 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){
3825 current_cut_line_determined[i] =
true;
3826 #ifdef HAVE_ZOLTAN2_OMP 3829 *rectilinear_cut_count += 1;
3832 #ifdef HAVE_ZOLTAN2_OMP 3835 my_num_incomplete_cut -= 1;
3836 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3837 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
3838 current_local_part_weights[i * 2];
3843 current_cut_lower_bounds[i] = current_global_right_closest_points[i];
3845 current_cut_lower_bound_weights[i] = seen_weight_in_part;
3849 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
3850 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3851 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3853 if(p_weight >= expected_weight_in_part){
3858 if(p_weight == expected_weight_in_part){
3859 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3860 current_cut_upper_weights[i] = p_weight;
3861 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3862 current_cut_lower_bound_weights[i] = p_weight;
3863 }
else if (p_weight < current_cut_upper_weights[i]){
3866 current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
3867 current_cut_upper_weights[i] = p_weight;
3873 if(line_weight >= expected_weight_in_part){
3876 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3877 current_cut_upper_weights[i] = line_weight;
3878 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3879 current_cut_lower_bound_weights[i] = p_weight;
3884 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
3885 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
3886 current_cut_lower_bound_weights[i] = p_weight;
3891 mj_scalar_t new_cut_position = 0;
3892 this->mj_calculate_new_cut_position(
3893 current_cut_upper_bounds[i],
3894 current_cut_lower_bounds[i],
3895 current_cut_upper_weights[i],
3896 current_cut_lower_bound_weights[i],
3897 expected_weight_in_part, new_cut_position);
3901 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3904 current_cut_line_determined[i] =
true;
3905 #ifdef HAVE_ZOLTAN2_OMP 3908 my_num_incomplete_cut -= 1;
3911 new_current_cut_coordinates [i] = current_cut_coordinates[i];
3913 new_current_cut_coordinates [i] = new_cut_position;
3919 current_cut_upper_bounds[i] = current_global_left_closest_points[i];
3920 current_cut_upper_weights[i] = seen_weight_in_part;
3923 for (
int ii = i - 1; ii >= 0; --ii){
3924 mj_scalar_t p_weight = current_global_part_weights[ii * 2];
3925 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
3926 if(p_weight <= expected_weight_in_part){
3927 if(p_weight == expected_weight_in_part){
3930 current_cut_upper_bounds[i] = current_cut_coordinates[ii];
3931 current_cut_upper_weights[i] = p_weight;
3932 current_cut_lower_bounds[i] = current_cut_coordinates[ii];
3933 current_cut_lower_bound_weights[i] = p_weight;
3935 else if (p_weight > current_cut_lower_bound_weights[i]){
3938 current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
3939 current_cut_lower_bound_weights[i] = p_weight;
3945 if(line_weight > expected_weight_in_part){
3946 current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
3947 current_cut_upper_weights[i] = line_weight;
3956 if (p_weight >= expected_weight_in_part &&
3957 (p_weight < current_cut_upper_weights[i] ||
3958 (p_weight == current_cut_upper_weights[i] &&
3959 current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
3963 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
3964 current_cut_upper_weights[i] = p_weight;
3967 mj_scalar_t new_cut_position = 0;
3968 this->mj_calculate_new_cut_position(
3969 current_cut_upper_bounds[i],
3970 current_cut_lower_bounds[i],
3971 current_cut_upper_weights[i],
3972 current_cut_lower_bound_weights[i],
3973 expected_weight_in_part,
3977 if (
ZOLTAN2_ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
3979 current_cut_line_determined[i] =
true;
3980 #ifdef HAVE_ZOLTAN2_OMP 3983 my_num_incomplete_cut -= 1;
3985 new_current_cut_coordinates [ i] = current_cut_coordinates[i];
3987 new_current_cut_coordinates [ i] = new_cut_position;
3996 #ifdef HAVE_ZOLTAN2_OMP 4001 if(*rectilinear_cut_count > 0){
4004 Teuchos::scan<int,mj_scalar_t>(
4005 *comm, Teuchos::REDUCE_SUM,
4007 this->process_rectilinear_cut_weight,
4008 this->global_rectilinear_cut_weight
4013 for (mj_part_t i = 0; i < num_cuts; ++i){
4015 if(this->global_rectilinear_cut_weight[i] > 0) {
4017 mj_scalar_t expected_part_weight = current_part_target_weights[i];
4019 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
4021 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
4023 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
4026 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
4028 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;
4038 if(space_left_to_me < 0){
4040 current_part_cut_line_weight_to_put_left[i] = 0;
4042 else if(space_left_to_me >= my_weight_on_line){
4045 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
4050 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;
4057 *rectilinear_cut_count = 0;
4072 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4075 mj_part_t num_procs,
4076 mj_part_t num_parts,
4077 mj_gno_t *&num_points_in_all_processor_parts){
4080 size_t allocation_size = num_parts * (num_procs + 1);
4087 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
4092 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
4095 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
4098 memset(num_local_points_in_each_part_to_reduce_sum, 0,
sizeof(mj_gno_t)*allocation_size);
4101 for (mj_part_t i = 0; i < num_parts; ++i){
4102 mj_lno_t part_begin_index = 0;
4104 part_begin_index = this->new_part_xadj[i - 1];
4106 mj_lno_t part_end_index = this->new_part_xadj[i];
4107 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
4112 memcpy (my_local_point_counts_in_each_art,
4113 my_local_points_to_reduce_sum,
4114 sizeof(mj_gno_t) * (num_parts) );
4122 reduceAll<int, mj_gno_t>(
4124 Teuchos::REDUCE_SUM,
4126 num_local_points_in_each_part_to_reduce_sum,
4127 num_points_in_all_processor_parts);
4130 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
4147 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4150 size_t migration_reduce_all_population,
4151 mj_lno_t num_coords_for_last_dim_part,
4152 mj_part_t num_procs,
4153 mj_part_t num_parts,
4154 mj_gno_t *num_points_in_all_processor_parts){
4162 if (this->check_migrate_avoid_migration_option == 0){
4163 double global_imbalance = 0;
4165 size_t global_shift = num_procs * num_parts;
4167 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4168 for (mj_part_t i = 0; i < num_parts; ++i){
4169 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
4170 / double(num_procs);
4173 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
4176 global_imbalance /= num_parts;
4177 global_imbalance /= num_procs;
4185 if(global_imbalance <= this->minimum_migration_imbalance){
4208 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4211 mj_part_t num_parts,
4212 mj_part_t *part_assignment_proc_begin_indices,
4213 mj_part_t *processor_chains_in_parts,
4214 mj_lno_t *send_count_to_each_proc,
4215 int *coordinate_destinations){
4217 for (mj_part_t p = 0; p < num_parts; ++p){
4218 mj_lno_t part_begin = 0;
4219 if (p > 0) part_begin = this->new_part_xadj[p - 1];
4220 mj_lno_t part_end = this->new_part_xadj[p];
4223 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
4225 mj_lno_t num_total_send = 0;
4226 for (mj_lno_t j=part_begin; j < part_end; j++){
4227 mj_lno_t local_ind = this->new_coordinate_permutations[j];
4228 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
4232 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
4234 processor_chains_in_parts[proc_to_sent] = -1;
4236 proc_to_sent = part_assignment_proc_begin_indices[p];
4239 coordinate_destinations[local_ind] = proc_to_sent;
4259 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4262 mj_gno_t * num_points_in_all_processor_parts,
4263 mj_part_t num_parts,
4264 mj_part_t num_procs,
4265 mj_lno_t *send_count_to_each_proc,
4266 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4267 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4268 mj_part_t &out_part_index,
4269 mj_part_t &output_part_numbering_begin_index,
4270 int *coordinate_destinations){
4273 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4274 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
4277 bool did_i_find_my_group =
false;
4279 mj_part_t num_free_procs = num_procs;
4280 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
4282 double max_imbalance_difference = 0;
4283 mj_part_t max_differing_part = 0;
4286 for (mj_part_t i=0; i < num_parts; i++){
4289 double scalar_required_proc = num_procs *
4290 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
4293 mj_part_t required_proc =
static_cast<mj_part_t
> (0.5 + scalar_required_proc);
4297 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
4298 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
4302 num_free_procs -= required_proc;
4304 --minimum_num_procs_required_for_rest_of_parts;
4307 num_procs_assigned_to_each_part[i] = required_proc;
4312 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc;
4313 if (imbalance_wrt_ideal > max_imbalance_difference){
4314 max_imbalance_difference = imbalance_wrt_ideal;
4315 max_differing_part = i;
4320 if (num_free_procs > 0){
4321 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
4328 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
4330 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
4331 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
4340 for (
int i = 0; i < num_procs; ++i ){
4341 processor_part_assignments[i] = -1;
4342 processor_chains_in_parts[i] = -1;
4344 for (
int i = 0; i < num_parts; ++i ){
4345 part_assignment_proc_begin_indices[i] = -1;
4352 for(mj_part_t i = 0; i < num_parts; ++i){
4356 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4357 sort_item_num_part_points_in_procs[ii].
id = ii;
4360 if (processor_part_assignments[ii] == -1){
4361 sort_item_num_part_points_in_procs[ii].
val = num_points_in_all_processor_parts[ii * num_parts + i];
4362 sort_item_num_part_points_in_procs[ii].
signbit = 1;
4372 sort_item_num_part_points_in_procs[ii].
val = num_points_in_all_processor_parts[ii * num_parts + i];
4373 sort_item_num_part_points_in_procs[ii].
signbit = 0;
4377 uqSignsort<mj_part_t, mj_gno_t,char>(num_procs, sort_item_num_part_points_in_procs);
4387 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
4388 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
4389 mj_gno_t ideal_num_points_in_a_proc =
4390 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part /
double (required_proc_count)));
4393 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
4394 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4395 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4399 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4400 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].
id;
4402 processor_part_assignments[proc_id] = i;
4405 bool did_change_sign =
false;
4407 for(mj_part_t ii = 0; ii < num_procs; ++ii){
4410 if (sort_item_num_part_points_in_procs[ii].signbit == 0){
4411 did_change_sign =
true;
4412 sort_item_num_part_points_in_procs[ii].
signbit = 1;
4418 if(did_change_sign){
4420 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
4432 if (!did_i_find_my_group){
4433 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
4435 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].
id;
4437 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
4439 if(proc_id_to_assign == this->myRank){
4441 did_i_find_my_group =
true;
4443 part_assignment_proc_begin_indices[i] = this->myRank;
4444 processor_chains_in_parts[this->myRank] = -1;
4447 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].
val;
4450 for (mj_part_t in = 0; in < i; ++in){
4451 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
4458 if (!did_i_find_my_group){
4459 processor_ranks_for_subcomm.clear();
4467 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
4468 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].
id;
4469 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].
val;
4474 if (num_points_to_sent < 0) {
4475 cout <<
"Migration - processor assignments - for part:" << i <<
"from proc:" << nonassigned_proc_id <<
" num_points_to_sent:" << num_points_to_sent << std::endl;
4480 switch (migration_type){
4484 while (num_points_to_sent > 0){
4486 if (num_points_to_sent <= space_left_in_sent_proc){
4488 space_left_in_sent_proc -= num_points_to_sent;
4490 if (this->myRank == nonassigned_proc_id){
4492 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4495 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4496 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4497 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4499 num_points_to_sent = 0;
4503 if(space_left_in_sent_proc > 0){
4504 num_points_to_sent -= space_left_in_sent_proc;
4507 if (this->myRank == nonassigned_proc_id){
4509 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
4510 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4511 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4512 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4517 ++next_proc_to_send_index;
4520 if(next_part_to_send_index < nprocs - required_proc_count ){
4521 cout <<
"Migration - processor assignments - for part:" 4523 <<
" next_part_to_send :" << next_part_to_send_index
4524 <<
" nprocs:" << nprocs
4525 <<
" required_proc_count:" << required_proc_count
4526 <<
" Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl;
4532 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4534 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4543 if (this->myRank == nonassigned_proc_id){
4545 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
4548 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
4549 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
4550 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
4552 num_points_to_sent = 0;
4553 ++next_proc_to_send_index;
4556 if (next_proc_to_send_index == num_procs){
4557 next_proc_to_send_index = num_procs - required_proc_count;
4560 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].
id;
4562 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].
val;
4575 this->assign_send_destinations(
4577 part_assignment_proc_begin_indices,
4578 processor_chains_in_parts,
4579 send_count_to_each_proc,
4580 coordinate_destinations);
4582 freeArray<mj_part_t>(part_assignment_proc_begin_indices);
4583 freeArray<mj_part_t>(processor_chains_in_parts);
4584 freeArray<mj_part_t>(processor_part_assignments);
4585 freeArray<uSignedSortItem<mj_part_t, mj_gno_t, char> > (sort_item_num_part_points_in_procs);
4586 freeArray<mj_part_t > (num_procs_assigned_to_each_part);
4603 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4606 mj_part_t num_parts,
4608 int *coordinate_destinations,
4609 mj_part_t &output_part_numbering_begin_index,
4610 std::vector<mj_part_t> *next_future_num_parts_in_parts){
4612 mj_part_t part_shift_amount = output_part_numbering_begin_index;
4613 mj_part_t previous_processor = -1;
4614 for(mj_part_t i = 0; i < num_parts; ++i){
4615 mj_part_t p = sort_item_part_to_proc_assignment[i].
id;
4617 mj_lno_t part_begin_index = 0;
4618 if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
4619 mj_lno_t part_end_index = this->new_part_xadj[p];
4621 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].
val;
4622 if (this->myRank == assigned_proc && previous_processor != assigned_proc){
4623 output_part_numbering_begin_index = part_shift_amount;
4625 previous_processor = assigned_proc;
4626 part_shift_amount += (*next_future_num_parts_in_parts)[p];
4628 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
4629 mj_lno_t localInd = this->new_coordinate_permutations[j];
4630 coordinate_destinations[localInd] = assigned_proc;
4652 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4655 mj_gno_t * num_points_in_all_processor_parts,
4656 mj_part_t num_parts,
4657 mj_part_t num_procs,
4658 mj_lno_t *send_count_to_each_proc,
4659 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4660 mj_part_t &out_num_part,
4661 std::vector<mj_part_t> &out_part_indices,
4662 mj_part_t &output_part_numbering_begin_index,
4663 int *coordinate_destinations){
4666 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
4667 out_part_indices.clear();
4676 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (
double (num_procs)) + 0.5f);
4678 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
4680 for (mj_part_t i = 0; i < num_procs; ++i){
4681 space_in_each_processor[i] = work_each;
4688 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
4689 memset(num_parts_proc_assigned, 0,
sizeof(mj_part_t) * num_procs);
4690 int empty_proc_count = num_procs;
4698 for (mj_part_t i = 0; i < num_parts; ++i){
4699 sort_item_point_counts_in_parts[i].
id = i;
4700 sort_item_point_counts_in_parts[i].
val = global_num_points_in_parts[i];
4703 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
4709 for (mj_part_t j = 0; j < num_parts; ++j){
4711 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].
id;
4713 mj_gno_t load = global_num_points_in_parts[i];
4716 mj_part_t assigned_proc = -1;
4718 mj_part_t best_proc_to_assign = 0;
4722 for (mj_part_t ii = 0; ii < num_procs; ++ii){
4723 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
4728 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
4730 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i];
4733 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
4736 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);
4739 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
4740 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
4741 mj_lno_t left_space = space_in_each_processor[ii] - load;
4743 if(left_space >= 0 ){
4748 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
4749 best_proc_to_assign = ii;
4754 if (assigned_proc == -1){
4755 assigned_proc = best_proc_to_assign;
4758 if (num_parts_proc_assigned[assigned_proc]++ == 0){
4761 space_in_each_processor[assigned_proc] -= load;
4763 sort_item_part_to_proc_assignment[j].
id = i;
4764 sort_item_part_to_proc_assignment[j].
val = assigned_proc;
4768 if (assigned_proc == this->myRank){
4770 out_part_indices.push_back(i);
4774 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
4776 freeArray<mj_part_t>(num_parts_proc_assigned);
4777 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
4778 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
4779 freeArray<mj_lno_t >(space_in_each_processor);
4783 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
4787 this->assign_send_destinations2(
4789 sort_item_part_to_proc_assignment,
4790 coordinate_destinations,
4791 output_part_numbering_begin_index,
4792 next_future_num_parts_in_parts);
4794 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
4815 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4818 mj_gno_t * num_points_in_all_processor_parts,
4819 mj_part_t num_parts,
4820 mj_part_t num_procs,
4821 mj_lno_t *send_count_to_each_proc,
4822 std::vector<mj_part_t> &processor_ranks_for_subcomm,
4823 std::vector<mj_part_t> *next_future_num_parts_in_parts,
4824 mj_part_t &out_num_part,
4825 std::vector<mj_part_t> &out_part_indices,
4826 mj_part_t &output_part_numbering_begin_index,
4827 int *coordinate_destinations){
4831 processor_ranks_for_subcomm.clear();
4833 if (num_procs > num_parts){
4838 mj_part_t out_part_index = 0;
4839 this->mj_assign_proc_to_parts(
4840 num_points_in_all_processor_parts,
4843 send_count_to_each_proc,
4844 processor_ranks_for_subcomm,
4845 next_future_num_parts_in_parts,
4847 output_part_numbering_begin_index,
4848 coordinate_destinations
4852 out_part_indices.clear();
4853 out_part_indices.push_back(out_part_index);
4860 processor_ranks_for_subcomm.push_back(this->myRank);
4864 this->mj_assign_parts_to_procs(
4865 num_points_in_all_processor_parts,
4868 send_count_to_each_proc,
4869 next_future_num_parts_in_parts,
4872 output_part_numbering_begin_index,
4873 coordinate_destinations);
4889 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
4892 mj_part_t num_procs,
4893 mj_lno_t &num_new_local_points,
4894 std::string iteration,
4895 int *coordinate_destinations,
4896 mj_part_t num_parts)
4898 #ifdef ENABLE_ZOLTAN_MIGRATION 4899 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
4905 ZOLTAN_COMM_OBJ *plan = NULL;
4906 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
4907 int num_incoming_gnos = 0;
4908 int message_tag = 7859;
4910 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4911 int ierr = Zoltan_Comm_Create(
4913 int(this->num_local_coords),
4914 coordinate_destinations,
4917 &num_incoming_gnos);
4919 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1PlanCreating-" + iteration);
4921 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
4922 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
4926 ierr = Zoltan_Comm_Do(
4929 (
char *) this->current_mj_gnos,
4931 (
char *) incoming_gnos);
4934 freeArray<mj_gno_t>(this->current_mj_gnos);
4935 this->current_mj_gnos = incoming_gnos;
4939 for (
int i = 0; i < this->coord_dim; ++i){
4941 mj_scalar_t *coord = this->mj_coordinates[i];
4943 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4944 ierr = Zoltan_Comm_Do(
4948 sizeof(mj_scalar_t),
4949 (
char *) this->mj_coordinates[i]);
4951 freeArray<mj_scalar_t>(coord);
4955 for (
int i = 0; i < this->num_weights_per_coord; ++i){
4957 mj_scalar_t *weight = this->mj_weights[i];
4959 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
4960 ierr = Zoltan_Comm_Do(
4964 sizeof(mj_scalar_t),
4965 (
char *) this->mj_weights[i]);
4967 freeArray<mj_scalar_t>(weight);
4972 int *coord_own = allocMemory<int>(num_incoming_gnos);
4974 ierr = Zoltan_Comm_Do(
4977 (
char *) this->owner_of_coordinate,
4978 sizeof(
int), (
char *) coord_own);
4980 freeArray<int>(this->owner_of_coordinate);
4981 this->owner_of_coordinate = coord_own;
4987 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
4988 if(num_procs < num_parts){
4990 ierr = Zoltan_Comm_Do(
4993 (
char *) this->assigned_part_ids,
4995 (
char *) new_parts);
4998 freeArray<mj_part_t>(this->assigned_part_ids);
4999 this->assigned_part_ids = new_parts;
5001 ierr = Zoltan_Comm_Destroy(&plan);
5003 num_new_local_points = num_incoming_gnos;
5004 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration Z1Migration-" + iteration);
5009 #endif // ENABLE_ZOLTAN_MIGRATION 5011 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5012 Tpetra::Distributor distributor(this->comm);
5013 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
5014 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
5015 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorPlanCreating-" + iteration);
5017 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5020 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
5021 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5022 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5023 freeArray<mj_gno_t>(this->current_mj_gnos);
5024 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
5026 this->current_mj_gnos,
5027 received_gnos.getRawPtr(),
5028 num_incoming_gnos *
sizeof(mj_gno_t));
5031 for (
int i = 0; i < this->coord_dim; ++i){
5033 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
5034 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
5035 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
5036 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5037 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5039 this->mj_coordinates[i],
5040 received_coord.getRawPtr(),
5041 num_incoming_gnos *
sizeof(mj_scalar_t));
5045 for (
int i = 0; i < this->num_weights_per_coord; ++i){
5047 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
5048 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
5049 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
5050 freeArray<mj_scalar_t>(this->mj_weights[i]);
5051 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
5053 this->mj_weights[i],
5054 received_weight.getRawPtr(),
5055 num_incoming_gnos *
sizeof(mj_scalar_t));
5060 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
5061 ArrayRCP<int> received_owners(num_incoming_gnos);
5062 distributor.doPostsAndWaits<
int>(sent_owners, 1, received_owners());
5063 freeArray<int>(this->owner_of_coordinate);
5064 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
5066 this->owner_of_coordinate,
5067 received_owners.getRawPtr(),
5068 num_incoming_gnos *
sizeof(int));
5074 if(num_procs < num_parts){
5075 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5076 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
5077 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5078 freeArray<mj_part_t>(this->assigned_part_ids);
5079 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
5081 this->assigned_part_ids,
5082 received_partids.getRawPtr(),
5083 num_incoming_gnos *
sizeof(mj_part_t));
5086 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
5087 freeArray<mj_part_t>(this->assigned_part_ids);
5088 this->assigned_part_ids = new_parts;
5090 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Migration DistributorMigration-" + iteration);
5091 num_new_local_points = num_incoming_gnos;
5102 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5105 mj_part_t group_size = processor_ranks_for_subcomm.size();
5106 mj_part_t *ids = allocMemory<mj_part_t>(group_size);
5107 for(mj_part_t i = 0; i < group_size; ++i) {
5108 ids[i] = processor_ranks_for_subcomm[i];
5110 ArrayView<const mj_part_t> idView(ids, group_size);
5111 this->comm = this->comm->createSubcommunicator(idView);
5112 freeArray<mj_part_t>(ids);
5121 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5124 mj_part_t output_num_parts,
5125 mj_part_t num_parts){
5127 if (output_num_parts == 1){
5128 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5129 this->new_coordinate_permutations[i] = i;
5131 this->new_part_xadj[0] = this->num_local_coords;
5138 mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
5140 mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
5142 memset(num_points_in_parts, 0,
sizeof(mj_lno_t) * num_parts);
5144 for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
5145 mj_part_t ii = this->assigned_part_ids[i];
5146 ++num_points_in_parts[ii];
5151 mj_lno_t prev_index = 0;
5152 for(mj_part_t i = 0; i < num_parts; ++i){
5153 if(num_points_in_parts[i] > 0) {
5154 this->new_part_xadj[p] = prev_index + num_points_in_parts[i];
5155 prev_index += num_points_in_parts[i];
5156 part_shifts[i] = p++;
5161 mj_part_t assigned_num_parts = p - 1;
5162 for (;p < num_parts; ++p){
5163 this->new_part_xadj[p] = this->new_part_xadj[assigned_num_parts];
5165 for(mj_part_t i = 0; i < output_num_parts; ++i){
5166 num_points_in_parts[i] = this->new_part_xadj[i];
5172 for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
5173 mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
5174 this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
5177 freeArray<mj_lno_t>(num_points_in_parts);
5178 freeArray<mj_part_t>(part_shifts);
5205 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5208 mj_part_t input_num_parts,
5209 mj_part_t &output_num_parts,
5210 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5211 mj_part_t &output_part_begin_index,
5212 size_t migration_reduce_all_population,
5213 mj_lno_t num_coords_for_last_dim_part,
5214 std::string iteration,
5215 RCP<mj_partBoxVector_t> &input_part_boxes,
5216 RCP<mj_partBoxVector_t> &output_part_boxes
5219 mj_part_t num_procs = this->comm->getSize();
5220 this->myRank = this->comm->getRank();
5226 mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
5229 this->get_processor_num_points_in_parts(
5232 num_points_in_all_processor_parts);
5236 if (!this->mj_check_to_migrate(
5237 migration_reduce_all_population,
5238 num_coords_for_last_dim_part,
5241 num_points_in_all_processor_parts)){
5242 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5247 mj_lno_t *send_count_to_each_proc = NULL;
5248 int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
5249 send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
5250 for (
int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
5252 std::vector<mj_part_t> processor_ranks_for_subcomm;
5253 std::vector<mj_part_t> out_part_indices;
5256 this->mj_migration_part_proc_assignment(
5257 num_points_in_all_processor_parts,
5260 send_count_to_each_proc,
5261 processor_ranks_for_subcomm,
5262 next_future_num_parts_in_parts,
5265 output_part_begin_index,
5266 coordinate_destinations);
5271 freeArray<mj_lno_t>(send_count_to_each_proc);
5272 std::vector <mj_part_t> tmpv;
5274 std::sort (out_part_indices.begin(), out_part_indices.end());
5275 mj_part_t outP = out_part_indices.size();
5277 mj_gno_t new_global_num_points = 0;
5278 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
5280 if (this->mj_keep_part_boxes){
5281 input_part_boxes->clear();
5286 for (mj_part_t i = 0; i < outP; ++i){
5287 mj_part_t ind = out_part_indices[i];
5288 new_global_num_points += global_num_points_in_parts[ind];
5289 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
5290 if (this->mj_keep_part_boxes){
5291 input_part_boxes->push_back((*output_part_boxes)[ind]);
5295 if (this->mj_keep_part_boxes){
5296 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
5297 input_part_boxes = output_part_boxes;
5298 output_part_boxes = tmpPartBoxes;
5300 next_future_num_parts_in_parts->clear();
5301 for (mj_part_t i = 0; i < outP; ++i){
5302 mj_part_t p = tmpv[i];
5303 next_future_num_parts_in_parts->push_back(p);
5306 freeArray<mj_gno_t>(num_points_in_all_processor_parts);
5308 mj_lno_t num_new_local_points = 0;
5312 this->mj_migrate_coords(
5314 num_new_local_points,
5316 coordinate_destinations,
5320 freeArray<int>(coordinate_destinations);
5322 if(this->num_local_coords != num_new_local_points){
5323 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5324 freeArray<mj_lno_t>(this->coordinate_permutations);
5326 this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5327 this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
5329 this->num_local_coords = num_new_local_points;
5330 this->num_global_coords = new_global_num_points;
5335 this->create_sub_communicator(processor_ranks_for_subcomm);
5336 processor_ranks_for_subcomm.clear();
5339 this->fill_permutation_array(
5359 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5362 mj_part_t num_parts,
5363 mj_scalar_t *mj_current_dim_coords,
5364 mj_scalar_t *current_concurrent_cut_coordinate,
5365 mj_lno_t coordinate_begin,
5366 mj_lno_t coordinate_end,
5367 mj_scalar_t *used_local_cut_line_weight_to_left,
5368 mj_lno_t *out_part_xadj,
5372 mj_part_t no_cuts = num_parts - 1;
5377 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
5378 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
5383 if (this->distribute_points_on_cut_lines){
5385 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
5386 for (mj_part_t i = 0; i < no_cuts; ++i){
5388 mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
5390 for(
int ii = 0; ii < this->num_threads; ++ii){
5391 if(left_weight > this->sEpsilon){
5393 mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
5394 if(thread_ii_weight_on_cut < left_weight){
5395 this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
5398 this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
5400 left_weight -= thread_ii_weight_on_cut;
5403 this->thread_cut_line_weight_to_put_left[ii][i] = 0;
5411 for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
5412 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5413 my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
5416 / mj_scalar_t(SIGNIFICANCE_MUL);
5421 for(mj_part_t ii = 0; ii < num_parts; ++ii){
5422 thread_num_points_in_parts[ii] = 0;
5434 mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
5438 typedef std::vector< multiSItem > multiSVector;
5439 typedef std::vector<multiSVector> multiS2Vector;
5442 std::vector<mj_scalar_t *>allocated_memory;
5445 multiS2Vector sort_vector_points_on_cut;
5448 mj_part_t different_cut_count = 1;
5453 multiSVector tmpMultiSVector;
5454 sort_vector_points_on_cut.push_back(tmpMultiSVector);
5456 for (mj_part_t i = 1; i < no_cuts ; ++i){
5459 if(
ZOLTAN2_ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
5460 cut_map[i] = cut_map[i-1];
5463 cut_map[i] = different_cut_count++;
5464 multiSVector tmp2MultiSVector;
5465 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
5471 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5473 mj_lno_t i = this->coordinate_permutations[ii];
5475 mj_part_t pp = this->assigned_part_ids[i];
5476 mj_part_t p = pp / 2;
5479 mj_scalar_t *
vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
5480 allocated_memory.push_back(vals);
5485 if (longest_dim_part){
5487 for(
int dim = this->coord_dim - 2; dim >= 0; --dim){
5489 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].
id;
5491 vals[val_ind++] = this->mj_coordinates[next_largest_coord_dim][i];
5495 for(
int dim = coordInd + 1; dim < this->coord_dim; ++dim){
5496 vals[val_ind++] = this->mj_coordinates[dim][i];
5498 for(
int dim = 0; dim < coordInd; ++dim){
5499 vals[val_ind++] = this->mj_coordinates[dim][i];
5502 multiSItem tempSortItem(i, this->coord_dim -1, vals);
5504 mj_part_t cmap = cut_map[p];
5505 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
5509 ++thread_num_points_in_parts[p];
5510 this->assigned_part_ids[i] = p;
5515 for (mj_part_t i = 0; i < different_cut_count; ++i){
5516 std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
5520 mj_part_t previous_cut_map = cut_map[0];
5529 mj_scalar_t weight_stolen_from_previous_part = 0;
5530 for (mj_part_t p = 0; p < no_cuts; ++p){
5532 mj_part_t mapped_cut = cut_map[p];
5536 if (previous_cut_map != mapped_cut){
5537 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5538 for (; sort_vector_end >= 0; --sort_vector_end){
5539 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5540 mj_lno_t i = t.index;
5541 ++thread_num_points_in_parts[p];
5542 this->assigned_part_ids[i] = p;
5544 sort_vector_points_on_cut[previous_cut_map].clear();
5548 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
5553 for (; sort_vector_end >= 0; --sort_vector_end){
5556 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
5558 mj_lno_t i = t.index;
5559 mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
5563 if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
5564 my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part -
ZOLTAN2_ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
5567 my_local_thread_cut_weights_to_put_left[p] -= w;
5568 sort_vector_points_on_cut[mapped_cut].pop_back();
5569 ++thread_num_points_in_parts[p];
5570 this->assigned_part_ids[i] = p;
5573 if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
5574 if(mapped_cut == cut_map[p + 1] ){
5577 if (previous_cut_map != mapped_cut){
5578 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5583 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5587 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5596 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
5597 if (previous_cut_map != mapped_cut){
5598 weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
5601 weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
5605 weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
5611 previous_cut_map = mapped_cut;
5616 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
5621 for (; sort_vector_end >= 0; --sort_vector_end){
5624 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
5626 mj_lno_t i = t.index;
5627 ++thread_num_points_in_parts[no_cuts];
5628 this->assigned_part_ids[i] = no_cuts;
5630 sort_vector_points_on_cut[previous_cut_map].clear();
5631 freeArray<mj_part_t> (cut_map);
5634 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
5635 for(mj_lno_t i = 0; i < vSize; ++i){
5636 freeArray<mj_scalar_t> (allocated_memory[i]);
5640 for(mj_part_t j = 0; j < num_parts; ++j){
5641 mj_lno_t num_points_in_part_j_upto_thread_i = 0;
5642 for (
int i = 0; i < this->num_threads; ++i){
5643 mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
5644 this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
5645 num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
5648 out_part_xadj[j] = num_points_in_part_j_upto_thread_i;
5652 for(mj_part_t j = 1; j < num_parts; ++j){
5653 out_part_xadj[j] += out_part_xadj[j - 1];
5659 for(mj_part_t j = 1; j < num_parts; ++j){
5660 thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
5665 for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
5666 mj_lno_t i = this->coordinate_permutations[ii];
5667 mj_part_t p = this->assigned_part_ids[i];
5668 this->new_coordinate_permutations[coordinate_begin +
5669 thread_num_points_in_parts[p]++] = i;
5684 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5687 mj_part_t current_num_parts,
5688 mj_part_t output_part_begin_index,
5689 RCP<mj_partBoxVector_t> &output_part_boxes,
5690 bool is_data_ever_migrated)
5692 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5694 #ifdef HAVE_ZOLTAN2_OMP 5695 #pragma omp parallel for 5697 for(mj_part_t i = 0; i < current_num_parts;++i){
5700 mj_lno_t end = this->part_xadj[i];
5702 if(i > 0) begin = this->part_xadj[i -1];
5703 mj_part_t part_to_set_index = i + output_part_begin_index;
5704 if (this->mj_keep_part_boxes){
5705 (*output_part_boxes)[i].setpId(part_to_set_index);
5707 for (mj_lno_t ii = begin; ii < end; ++ii){
5708 mj_lno_t k = this->coordinate_permutations[ii];
5709 this->assigned_part_ids[k] = part_to_set_index;
5714 if(!is_data_ever_migrated){
5721 #ifdef ENABLE_ZOLTAN_MIGRATION 5722 if (
sizeof(mj_lno_t) <=
sizeof(
int)) {
5729 ZOLTAN_COMM_OBJ *plan = NULL;
5730 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
5733 int message_tag = 7856;
5735 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating");
5736 int ierr = Zoltan_Comm_Create( &plan,
int(this->num_local_coords),
5737 this->owner_of_coordinate, mpi_comm, message_tag,
5740 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanCreating" );
5742 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
5745 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5746 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->current_mj_gnos,
5747 sizeof(mj_gno_t), (
char *) incoming_gnos);
5750 freeArray<mj_gno_t>(this->current_mj_gnos);
5751 this->current_mj_gnos = incoming_gnos;
5753 mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
5756 ierr = Zoltan_Comm_Do( plan, message_tag, (
char *) this->assigned_part_ids,
5757 sizeof(mj_part_t), (
char *) incoming_partIds);
5759 freeArray<mj_part_t>(this->assigned_part_ids);
5760 this->assigned_part_ids = incoming_partIds;
5762 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final Z1PlanComm");
5763 ierr = Zoltan_Comm_Destroy(&plan);
5766 this->num_local_coords = incoming;
5771 #endif // !ENABLE_ZOLTAN_MIGRATION 5774 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating");
5775 Tpetra::Distributor distributor(this->mj_problemComm);
5776 ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
5777 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
5778 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanCreating" );
5780 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5782 ArrayRCP<mj_gno_t> received_gnos(incoming);
5783 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
5784 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
5785 freeArray<mj_gno_t>(this->current_mj_gnos);
5786 this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
5787 memcpy( this->current_mj_gnos,
5788 received_gnos.getRawPtr(),
5789 incoming *
sizeof(mj_gno_t));
5792 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
5793 ArrayRCP<mj_part_t> received_partids(incoming);
5794 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
5795 freeArray<mj_part_t>(this->assigned_part_ids);
5796 this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
5797 memcpy( this->assigned_part_ids,
5798 received_partids.getRawPtr(),
5799 incoming *
sizeof(mj_part_t));
5801 this->num_local_coords = incoming;
5802 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Final DistributorPlanComm");
5807 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Part_Assignment");
5809 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5814 if (this->mj_keep_part_boxes){
5819 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Solution_Part_Assignment");
5824 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5827 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5829 for (
int i=0; i < this->coord_dim; i++){
5830 freeArray<mj_scalar_t>(this->mj_coordinates[i]);
5832 freeArray<mj_scalar_t *>(this->mj_coordinates);
5834 for (
int i=0; i < this->num_weights_per_coord; i++){
5835 freeArray<mj_scalar_t>(this->mj_weights[i]);
5837 freeArray<mj_scalar_t *>(this->mj_weights);
5839 freeArray<int>(this->owner_of_coordinate);
5841 for(
int i = 0; i < this->num_threads; ++i){
5842 freeArray<mj_lno_t>(this->thread_point_counts[i]);
5845 freeArray<mj_lno_t *>(this->thread_point_counts);
5846 freeArray<double *> (this->thread_part_weight_work);
5848 if(this->distribute_points_on_cut_lines){
5849 freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
5850 for(
int i = 0; i < this->num_threads; ++i){
5851 freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
5853 freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
5854 freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
5855 freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
5858 freeArray<mj_part_t>(this->my_incomplete_cut_count);
5860 freeArray<mj_scalar_t>(this->max_min_coords);
5862 freeArray<mj_lno_t>(this->part_xadj);
5864 freeArray<mj_lno_t>(this->coordinate_permutations);
5866 freeArray<mj_lno_t>(this->new_coordinate_permutations);
5868 freeArray<mj_scalar_t>(this->all_cut_coordinates);
5870 freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
5872 freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
5874 freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
5876 freeArray<mj_scalar_t>(this->target_part_weights);
5878 freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
5880 freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
5882 freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
5883 freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
5884 freeArray<bool>(this->is_cut_line_determined);
5885 freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
5886 freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);
5888 for(
int i = 0; i < this->num_threads; ++i){
5889 freeArray<double>(this->thread_part_weights[i]);
5890 freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
5891 freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
5894 freeArray<double *>(this->thread_part_weights);
5895 freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
5896 freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);
5898 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Free");
5909 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5912 bool distribute_points_on_cut_lines_,
5913 int max_concurrent_part_calculation_,
5914 int check_migrate_avoid_migration_option_,
5915 mj_scalar_t minimum_migration_imbalance_,
5916 int migration_type_ ){
5917 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
5918 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
5919 this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
5920 this->minimum_migration_imbalance = minimum_migration_imbalance_;
5921 this->migration_type = migration_type_;
5954 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
5958 const RCP<const Environment> &env,
5959 RCP<
const Comm<int> > &problemComm,
5961 double imbalance_tolerance_,
5962 size_t num_global_parts_,
5963 mj_part_t *part_no_array_,
5964 int recursion_depth_,
5967 mj_lno_t num_local_coords_,
5968 mj_gno_t num_global_coords_,
5969 const mj_gno_t *initial_mj_gnos_,
5970 mj_scalar_t **mj_coordinates_,
5972 int num_weights_per_coord_,
5973 bool *mj_uniform_weights_,
5974 mj_scalar_t **mj_weights_,
5975 bool *mj_uniform_parts_,
5976 mj_scalar_t **mj_part_sizes_,
5978 mj_part_t *&result_assigned_part_ids_,
5979 mj_gno_t *&result_mj_gnos_
5984 if(comm->getRank() == 0){
5985 std::cout <<
"size of gno:" <<
sizeof(mj_gno_t) << std::endl;
5986 std::cout <<
"size of lno:" <<
sizeof(mj_lno_t) << std::endl;
5987 std::cout <<
"size of mj_scalar_t:" <<
sizeof(mj_scalar_t) << std::endl;
5992 this->mj_problemComm = problemComm;
5993 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
5995 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Total");
5996 this->mj_env->debug(3,
"In MultiJagged Jagged");
5999 this->imbalance_tolerance = imbalance_tolerance_;
6000 this->num_global_parts = num_global_parts_;
6001 this->part_no_array = part_no_array_;
6002 this->recursion_depth = recursion_depth_;
6004 this->coord_dim = coord_dim_;
6005 this->num_local_coords = num_local_coords_;
6006 this->num_global_coords = num_global_coords_;
6007 this->mj_coordinates = mj_coordinates_;
6008 this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_;
6010 this->num_weights_per_coord = num_weights_per_coord_;
6011 this->mj_uniform_weights = mj_uniform_weights_;
6012 this->mj_weights = mj_weights_;
6013 this->mj_uniform_parts = mj_uniform_parts_;
6014 this->mj_part_sizes = mj_part_sizes_;
6016 this->num_threads = 1;
6017 #ifdef HAVE_ZOLTAN2_OMP 6018 #pragma omp parallel 6021 this->num_threads = omp_get_num_threads();
6026 this->set_part_specifications();
6028 this->allocate_set_work_memory();
6032 this->comm = this->mj_problemComm->duplicate();
6035 mj_part_t current_num_parts = 1;
6036 mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;
6038 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6040 mj_part_t output_part_begin_index = 0;
6041 mj_part_t future_num_parts = this->total_num_part;
6042 bool is_data_ever_migrated =
false;
6044 std::vector<mj_part_t> *future_num_part_in_parts =
new std::vector<mj_part_t> ();
6045 std::vector<mj_part_t> *next_future_num_parts_in_parts =
new std::vector<mj_part_t> ();
6046 next_future_num_parts_in_parts->push_back(this->num_global_parts);
6048 RCP<mj_partBoxVector_t> input_part_boxes(
new mj_partBoxVector_t(),
true) ;
6049 RCP<mj_partBoxVector_t> output_part_boxes(
new mj_partBoxVector_t(),
true);
6051 compute_global_box();
6052 if(this->mj_keep_part_boxes){
6053 this->init_part_boxes(output_part_boxes);
6056 for (
int i = 0; i < this->recursion_depth; ++i){
6059 std::vector <mj_part_t> num_partitioning_in_current_dim;
6073 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
6074 future_num_part_in_parts = next_future_num_parts_in_parts;
6075 next_future_num_parts_in_parts = tmpPartVect;
6080 next_future_num_parts_in_parts->clear();
6082 if(this->mj_keep_part_boxes){
6083 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6084 input_part_boxes = output_part_boxes;
6085 output_part_boxes = tmpPartBoxes;
6086 output_part_boxes->clear();
6090 mj_part_t output_part_count_in_dimension =
6091 this->update_part_num_arrays(
6092 num_partitioning_in_current_dim,
6093 future_num_part_in_parts,
6094 next_future_num_parts_in_parts,
6099 output_part_boxes, 1);
6104 if(output_part_count_in_dimension == current_num_parts) {
6106 tmpPartVect= future_num_part_in_parts;
6107 future_num_part_in_parts = next_future_num_parts_in_parts;
6108 next_future_num_parts_in_parts = tmpPartVect;
6110 if(this->mj_keep_part_boxes){
6111 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
6112 input_part_boxes = output_part_boxes;
6113 output_part_boxes = tmpPartBoxes;
6120 int coordInd = i % this->coord_dim;
6121 mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
6124 std::string istring = Teuchos::toString<int>(i);
6125 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6129 this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
6132 mj_part_t output_part_index = 0;
6135 mj_part_t output_coordinate_end_index = 0;
6137 mj_part_t current_work_part = 0;
6138 mj_part_t current_concurrent_num_parts =
6139 std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
6141 mj_part_t obtained_part_index = 0;
6144 for (; current_work_part < current_num_parts;
6145 current_work_part += current_concurrent_num_parts){
6147 current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
6148 this->max_concurrent_part_calculation);
6150 mj_part_t actual_work_part_count = 0;
6154 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6155 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
6159 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
6162 ++actual_work_part_count;
6163 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
6164 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
6172 this->mj_get_local_min_max_coord_totW(
6173 coordinate_begin_index,
6174 coordinate_end_index,
6175 this->coordinate_permutations,
6176 mj_current_dim_coords,
6177 this->process_local_min_max_coord_total_weight[kk],
6178 this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts],
6179 this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]);
6184 if (actual_work_part_count > 0){
6186 this->mj_get_global_min_max_coord_totW(
6187 current_concurrent_num_parts,
6188 this->process_local_min_max_coord_total_weight,
6189 this->global_min_max_coord_total_weight);
6193 mj_part_t total_incomplete_cut_count = 0;
6198 mj_part_t concurrent_part_cut_shift = 0;
6199 mj_part_t concurrent_part_part_shift = 0;
6200 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6201 mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
6202 mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
6203 current_concurrent_num_parts];
6205 mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
6206 2 * current_concurrent_num_parts];
6208 mj_part_t concurrent_current_part_index = current_work_part + kk;
6210 mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
6212 mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
6213 mj_scalar_t *current_target_part_weights = this->target_part_weights +
6214 concurrent_part_part_shift;
6216 concurrent_part_cut_shift += partition_count - 1;
6218 concurrent_part_part_shift += partition_count;
6223 if(partition_count > 1 && min_coordinate <= max_coordinate){
6227 total_incomplete_cut_count += partition_count - 1;
6230 this->my_incomplete_cut_count[kk] = partition_count - 1;
6233 this->mj_get_initial_cut_coords_target_weights(
6236 partition_count - 1,
6237 global_total_weight,
6239 current_target_part_weights,
6240 future_num_part_in_parts,
6241 next_future_num_parts_in_parts,
6242 concurrent_current_part_index,
6243 obtained_part_index);
6245 mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
6246 mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
6250 this->set_initial_coordinate_parts(
6253 concurrent_current_part_index,
6254 coordinate_begin_index, coordinate_end_index,
6255 this->coordinate_permutations,
6256 mj_current_dim_coords,
6257 this->assigned_part_ids,
6262 this->my_incomplete_cut_count[kk] = 0;
6264 obtained_part_index += partition_count;
6271 mj_scalar_t used_imbalance = 0;
6276 mj_current_dim_coords,
6279 current_concurrent_num_parts,
6280 current_cut_coordinates,
6281 total_incomplete_cut_count,
6282 num_partitioning_in_current_dim);
6287 mj_part_t output_array_shift = 0;
6288 mj_part_t cut_shift = 0;
6289 size_t tlr_shift = 0;
6290 size_t partweight_array_shift = 0;
6292 for(
int kk = 0; kk < current_concurrent_num_parts; ++kk){
6293 mj_part_t current_concurrent_work_part = current_work_part + kk;
6294 mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
6297 if((num_parts != 1 )
6299 this->global_min_max_coord_total_weight[kk] >
6300 this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
6304 for(mj_part_t jj = 0; jj < num_parts; ++jj){
6305 this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
6307 cut_shift += num_parts - 1;
6308 tlr_shift += (4 *(num_parts - 1) + 1);
6309 output_array_shift += num_parts;
6310 partweight_array_shift += (2 * (num_parts - 1) + 1);
6314 mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
6315 mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
6316 current_concurrent_work_part -1];
6317 mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
6318 mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
6323 for(
int ii = 0; ii < this->num_threads; ++ii){
6324 this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
6328 if(this->mj_keep_part_boxes){
6330 for (mj_part_t j = 0; j < num_parts - 1; ++j){
6331 (*output_part_boxes)[output_array_shift + output_part_index +
6332 j].updateMinMax(current_concurrent_cut_coordinate[j], 1
6335 (*output_part_boxes)[output_array_shift + output_part_index + j +
6336 1].updateMinMax(current_concurrent_cut_coordinate[j], 0
6342 this->mj_create_new_partitions(
6344 mj_current_dim_coords,
6345 current_concurrent_cut_coordinate,
6348 used_local_cut_line_weight_to_left,
6349 this->thread_part_weight_work,
6350 this->new_part_xadj + output_part_index + output_array_shift
6357 mj_lno_t part_size = coordinate_end - coordinate_begin;
6358 *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
6360 this->new_coordinate_permutations + coordinate_begin,
6361 this->coordinate_permutations + coordinate_begin,
6362 part_size *
sizeof(mj_lno_t));
6364 cut_shift += num_parts - 1;
6365 tlr_shift += (4 *(num_parts - 1) + 1);
6366 output_array_shift += num_parts;
6367 partweight_array_shift += (2 * (num_parts - 1) + 1);
6377 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
6378 mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
6379 for (mj_part_t ii = 0;ii < num_parts ; ++ii){
6381 this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
6384 output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
6386 output_part_index += num_parts ;
6393 int current_world_size = this->comm->getSize();
6394 long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
6397 bool is_migrated_in_current_dimension =
false;
6402 if (future_num_parts > 1 &&
6403 this->check_migrate_avoid_migration_option >= 0 &&
6404 current_world_size > 1){
6406 this->mj_env->timerStart(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6407 mj_part_t num_parts = output_part_count_in_dimension;
6408 if ( this->mj_perform_migration(
6411 next_future_num_parts_in_parts,
6412 output_part_begin_index,
6413 migration_reduce_all_population,
6414 this->num_local_coords / (future_num_parts * current_num_parts),
6416 input_part_boxes, output_part_boxes) ) {
6417 is_migrated_in_current_dimension =
true;
6418 is_data_ever_migrated =
true;
6419 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" +
6422 this->total_dim_num_reduce_all /= num_parts;
6425 is_migrated_in_current_dimension =
false;
6426 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Migration-" + istring);
6431 mj_lno_t * tmp = this->coordinate_permutations;
6432 this->coordinate_permutations = this->new_coordinate_permutations;
6433 this->new_coordinate_permutations = tmp;
6435 if(!is_migrated_in_current_dimension){
6436 this->total_dim_num_reduce_all -= current_num_parts;
6437 current_num_parts = output_part_count_in_dimension;
6439 freeArray<mj_lno_t>(this->part_xadj);
6440 this->part_xadj = this->new_part_xadj;
6441 this->new_part_xadj = NULL;
6442 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning_" + istring);
6446 delete future_num_part_in_parts;
6447 delete next_future_num_parts_in_parts;
6449 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Problem_Partitioning");
6456 this->set_final_parts(
6458 output_part_begin_index,
6460 is_data_ever_migrated);
6462 result_assigned_part_ids_ = this->assigned_part_ids;
6463 result_mj_gnos_ = this->current_mj_gnos;
6465 this->free_work_memory();
6466 this->mj_env->timerStop(
MACRO_TIMERS,
"MultiJagged - Total");
6467 this->mj_env->debug(3,
"Out of MultiJagged");
6475 template <
typename Adapter>
6480 #ifndef DOXYGEN_SHOULD_SKIP_THIS 6483 typedef typename Adapter::scalar_t mj_scalar_t;
6484 typedef typename Adapter::gno_t mj_gno_t;
6485 typedef typename Adapter::lno_t mj_lno_t;
6486 typedef typename Adapter::node_t mj_node_t;
6489 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
6493 RCP<const Environment> mj_env;
6494 RCP<const Comm<int> > mj_problemComm;
6495 RCP<const coordinateModel_t> mj_coords;
6498 double imbalance_tolerance;
6499 size_t num_global_parts;
6500 mj_part_t *part_no_array;
6501 int recursion_depth;
6504 mj_lno_t num_local_coords;
6505 mj_gno_t num_global_coords;
6506 const mj_gno_t *initial_mj_gnos;
6507 mj_scalar_t **mj_coordinates;
6509 int num_weights_per_coord;
6510 bool *mj_uniform_weights;
6511 mj_scalar_t **mj_weights;
6512 bool *mj_uniform_parts;
6513 mj_scalar_t **mj_part_sizes;
6515 bool distribute_points_on_cut_lines;
6516 mj_part_t max_concurrent_part_calculation;
6517 int check_migrate_avoid_migration_option;
6520 mj_scalar_t minimum_migration_imbalance;
6521 bool mj_keep_part_boxes;
6527 ArrayRCP<mj_part_t> comXAdj_;
6528 ArrayRCP<mj_part_t> comAdj_;
6534 ArrayRCP<const mj_scalar_t> * coordinate_ArrayRCP_holder;
6536 void set_up_partitioning_data(
6539 void set_input_parameters(
const Teuchos::ParameterList &p);
6541 void free_work_memory();
6543 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries()
const;
6548 RCP<
const Comm<int> > &problemComm,
6549 const RCP<const coordinateModel_t> &coords) :
6550 mj_partitioner(), mj_env(env),
6551 mj_problemComm(problemComm),
6553 imbalance_tolerance(0),
6554 num_global_parts(1), part_no_array(NULL),
6556 coord_dim(0),num_local_coords(0), num_global_coords(0),
6557 initial_mj_gnos(NULL), mj_coordinates(NULL),
6558 num_weights_per_coord(0),
6559 mj_uniform_weights(NULL), mj_weights(NULL),
6560 mj_uniform_parts(NULL),
6561 mj_part_sizes(NULL),
6562 distribute_points_on_cut_lines(true),
6563 max_concurrent_part_calculation(1),
6564 check_migrate_avoid_migration_option(0), migration_type(0),
6565 minimum_migration_imbalance(0.30),
6566 mj_keep_part_boxes(false), num_threads(1), mj_run_as_rcb(false),
6567 comXAdj_(), comAdj_(), coordinate_ArrayRCP_holder (NULL)
6570 if (coordinate_ArrayRCP_holder != NULL){
6571 delete [] this->coordinate_ArrayRCP_holder;
6572 this->coordinate_ArrayRCP_holder = NULL;
6580 const bool bUnsorted =
true;
6581 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
6583 pl.set(
"mj_parts",
"0",
"list of parts for multiJagged partitioning " 6584 "algorithm. As many as the dimension count.", mj_parts_Validator);
6586 pl.set(
"mj_concurrent_part_count", 1,
"The number of parts whose cut " 6589 pl.set(
"mj_minimum_migration_imbalance", 1.1,
6590 "mj_minimum_migration_imbalance, the minimum imbalance of the " 6591 "processors to avoid migration",
6594 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
6595 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 2) );
6596 pl.set(
"mj_migration_option", 1,
"Migration option, 0 for decision " 6597 "depending on the imbalance, 1 for forcing migration, 2 for " 6598 "avoiding migration", mj_migration_option_validator);
6601 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
6602 Teuchos::rcp(
new Teuchos::EnhancedNumberValidator<int>(0, 1) );
6603 pl.set(
"mj_migration_type", 0,
"Migration type, 0 for migration to minimize the imbalance " 6604 "1 for migration to minimize messages exchanged the migration." ,
6605 mj_migration_option_validator);
6608 pl.set(
"mj_keep_part_boxes",
false,
"Keep the part boundaries of the " 6612 pl.set(
"mj_enable_rcb",
false,
"Use MJ as RCB.",
6615 pl.set(
"mj_recursion_depth", -1,
"Recursion depth for MJ: Must be " 6631 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
6635 mj_part_t pointAssign(
int dim, mj_scalar_t *point)
const;
6637 void boxAssign(
int dim, mj_scalar_t *lower, mj_scalar_t *upper,
6638 size_t &nPartsFound, mj_part_t **partsFound)
const;
6643 void getCommunicationGraph(
6645 ArrayRCP<mj_part_t> &comXAdj,
6646 ArrayRCP<mj_part_t> &comAdj);
6659 template <
typename Adapter>
6664 this->set_up_partitioning_data(solution);
6665 this->set_input_parameters(this->mj_env->getParameters());
6666 if (this->mj_keep_part_boxes){
6667 this->mj_partitioner.set_to_keep_part_boxes();
6669 this->mj_partitioner.set_partitioning_parameters(
6670 this->distribute_points_on_cut_lines,
6671 this->max_concurrent_part_calculation,
6672 this->check_migrate_avoid_migration_option,
6673 this->minimum_migration_imbalance, this->migration_type);
6675 mj_part_t *result_assigned_part_ids = NULL;
6676 mj_gno_t *result_mj_gnos = NULL;
6677 this->mj_partitioner.multi_jagged_part(
6679 this->mj_problemComm,
6681 this->imbalance_tolerance,
6682 this->num_global_parts,
6683 this->part_no_array,
6684 this->recursion_depth,
6687 this->num_local_coords,
6688 this->num_global_coords,
6689 this->initial_mj_gnos,
6690 this->mj_coordinates,
6692 this->num_weights_per_coord,
6693 this->mj_uniform_weights,
6695 this->mj_uniform_parts,
6696 this->mj_part_sizes,
6698 result_assigned_part_ids,
6703 #if defined(__cplusplus) && __cplusplus >= 201103L 6704 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
6705 localGidToLid.reserve(this->num_local_coords);
6706 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6707 localGidToLid[this->initial_mj_gnos[i]] = i;
6709 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6710 0, this->num_local_coords,
true);
6712 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6713 mj_lno_t origLID = localGidToLid[result_mj_gnos[i]];
6714 partId[origLID] = result_assigned_part_ids[i];
6718 Teuchos::Hashtable<mj_gno_t, mj_lno_t>
6719 localGidToLid(this->num_local_coords);
6720 for (mj_lno_t i = 0; i < this->num_local_coords; i++)
6721 localGidToLid.put(this->initial_mj_gnos[i], i);
6723 ArrayRCP<mj_part_t> partId = arcp(
new mj_part_t[this->num_local_coords],
6724 0, this->num_local_coords,
true);
6726 for (mj_lno_t i = 0; i < this->num_local_coords; i++) {
6727 mj_lno_t origLID = localGidToLid.get(result_mj_gnos[i]);
6728 partId[origLID] = result_assigned_part_ids[i];
6731 #endif // C++11 is enabled 6733 delete [] result_mj_gnos;
6734 delete [] result_assigned_part_ids;
6736 solution->setParts(partId);
6737 this->free_work_memory();
6742 template <
typename Adapter>
6744 freeArray<mj_scalar_t *>(this->mj_coordinates);
6745 freeArray<mj_scalar_t *>(this->mj_weights);
6746 freeArray<bool>(this->mj_uniform_parts);
6747 freeArray<mj_scalar_t *>(this->mj_part_sizes);
6748 freeArray<bool>(this->mj_uniform_weights);
6754 template <
typename Adapter>
6759 this->coord_dim = this->mj_coords->getCoordinateDim();
6760 this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
6761 this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
6762 this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
6763 int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
6768 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
6771 this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
6772 this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
6775 this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
6778 this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
6780 this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
6783 ArrayView<const mj_gno_t> gnos;
6784 ArrayView<input_t> xyz;
6785 ArrayView<input_t> wgts;
6788 this->coordinate_ArrayRCP_holder =
new ArrayRCP<const mj_scalar_t> [this->coord_dim + this->num_weights_per_coord];
6790 this->mj_coords->getCoordinates(gnos, xyz, wgts);
6792 ArrayView<const mj_gno_t> mj_gnos = gnos;
6793 this->initial_mj_gnos = mj_gnos.getRawPtr();
6796 for (
int dim=0; dim < this->coord_dim; dim++){
6797 ArrayRCP<const mj_scalar_t> ar;
6799 this->coordinate_ArrayRCP_holder[dim] = ar;
6802 this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
6806 if (this->num_weights_per_coord == 0){
6807 this->mj_uniform_weights[0] =
true;
6808 this->mj_weights[0] = NULL;
6812 for (
int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
6813 ArrayRCP<const mj_scalar_t> ar;
6814 wgts[wdim].getInputArray(ar);
6815 this->coordinate_ArrayRCP_holder[this->coord_dim + wdim] = ar;
6816 this->mj_uniform_weights[wdim] =
false;
6817 this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
6821 for (
int wdim = 0; wdim < criteria_dim; wdim++){
6822 if (solution->criteriaHasUniformPartSizes(wdim)){
6823 this->mj_uniform_parts[wdim] =
true;
6824 this->mj_part_sizes[wdim] = NULL;
6827 std::cerr <<
"MJ does not support non uniform target part weights" << std::endl;
6836 template <
typename Adapter>
6839 const Teuchos::ParameterEntry *pe = pl.getEntryPtr(
"imbalance_tolerance");
6842 tol = pe->getValue(&tol);
6843 this->imbalance_tolerance = tol - 1.0;
6847 if (this->imbalance_tolerance <= 0)
6848 this->imbalance_tolerance= 10e-4;
6851 this->part_no_array = NULL;
6853 this->recursion_depth = 0;
6855 if (pl.getPtr<Array <mj_part_t> >(
"mj_parts")){
6856 this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >(
"mj_parts")->getRawPtr();
6857 this->recursion_depth = pl.getPtr<Array <mj_part_t> >(
"mj_parts")->size() - 1;
6858 this->mj_env->debug(2,
"mj_parts provided by user");
6862 this->distribute_points_on_cut_lines =
true;
6863 this->max_concurrent_part_calculation = 1;
6865 this->mj_run_as_rcb =
false;
6866 int mj_user_recursion_depth = -1;
6867 this->mj_keep_part_boxes =
false;
6868 this->check_migrate_avoid_migration_option = 0;
6869 this->migration_type = 0;
6870 this->minimum_migration_imbalance = 0.35;
6872 pe = pl.getEntryPtr(
"mj_minimum_migration_imbalance");
6875 imb = pe->getValue(&imb);
6876 this->minimum_migration_imbalance = imb - 1.0;
6879 pe = pl.getEntryPtr(
"mj_migration_option");
6881 this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
6883 this->check_migrate_avoid_migration_option = 0;
6885 if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;
6888 pe = pl.getEntryPtr(
"mj_migration_type");
6890 this->migration_type = pe->getValue(&this->migration_type);
6892 this->migration_type = 0;
6897 pe = pl.getEntryPtr(
"mj_concurrent_part_count");
6899 this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
6901 this->max_concurrent_part_calculation = 1;
6904 pe = pl.getEntryPtr(
"mj_keep_part_boxes");
6906 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
6908 this->mj_keep_part_boxes =
false;
6920 if (this->mj_keep_part_boxes ==
false){
6921 pe = pl.getEntryPtr(
"mapping_type");
6923 int mapping_type = -1;
6924 mapping_type = pe->getValue(&mapping_type);
6925 if (mapping_type == 0){
6926 mj_keep_part_boxes =
true;
6932 pe = pl.getEntryPtr(
"mj_enable_rcb");
6934 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
6936 this->mj_run_as_rcb =
false;
6939 pe = pl.getEntryPtr(
"mj_recursion_depth");
6941 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
6943 mj_user_recursion_depth = -1;
6947 pe = pl.getEntryPtr(
"rectilinear");
6948 if (pe) val = pe->getValue(&val);
6950 this->distribute_points_on_cut_lines =
false;
6952 this->distribute_points_on_cut_lines =
true;
6955 if (this->mj_run_as_rcb){
6956 mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
6958 if (this->recursion_depth < 1){
6959 if (mj_user_recursion_depth > 0){
6960 this->recursion_depth = mj_user_recursion_depth;
6963 this->recursion_depth = this->coord_dim;
6967 this->num_threads = 1;
6968 #ifdef HAVE_ZOLTAN2_OMP 6969 #pragma omp parallel 6971 this->num_threads = omp_get_num_threads();
6978 template <
typename Adapter>
6981 typename Adapter::scalar_t *lower,
6982 typename Adapter::scalar_t *upper,
6983 size_t &nPartsFound,
6993 if (this->mj_keep_part_boxes) {
6996 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
6998 size_t nBoxes = (*partBoxes).size();
7000 throw std::logic_error(
"no part boxes exist");
7004 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7006 if (globalBox->boxesOverlap(dim, lower, upper)) {
7008 std::vector<typename Adapter::part_t> partlist;
7011 for (
size_t i = 0; i < nBoxes; i++) {
7013 if ((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
7015 partlist.push_back((*partBoxes)[i].getpId());
7036 *partsFound =
new mj_part_t[nPartsFound];
7037 for (
size_t i = 0; i < nPartsFound; i++)
7038 (*partsFound)[i] = partlist[i];
7051 throw std::logic_error(
"need to use keep_cuts parameter for boxAssign");
7056 template <
typename Adapter>
7059 typename Adapter::scalar_t *point)
const 7066 if (this->mj_keep_part_boxes) {
7070 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
7072 size_t nBoxes = (*partBoxes).size();
7074 throw std::logic_error(
"no part boxes exist");
7078 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
7080 if (globalBox->pointInBox(dim, point)) {
7084 for (i = 0; i < nBoxes; i++) {
7086 if ((*partBoxes)[i].pointInBox(dim, point)) {
7087 foundPart = (*partBoxes)[i].getpId();
7101 std::ostringstream oss;
7103 for (
int j = 0; j < dim; j++) oss << point[j] <<
" ";
7104 oss <<
") not found in domain";
7105 throw std::logic_error(oss.str());
7114 size_t closestBox = 0;
7115 mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
7116 mj_scalar_t *centroid =
new mj_scalar_t[dim];
7117 for (
size_t i = 0; i < nBoxes; i++) {
7118 (*partBoxes)[i].computeCentroid(centroid);
7119 mj_scalar_t sum = 0.;
7121 for (
int j = 0; j < dim; j++) {
7122 diff = centroid[j] - point[j];
7125 if (sum < minDistance) {
7130 foundPart = (*partBoxes)[closestBox].getpId();
7137 throw std::logic_error(
"need to use keep_cuts parameter for pointAssign");
7141 template <
typename Adapter>
7147 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL){
7148 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
7149 mj_part_t ntasks = (*pBoxes).size();
7150 int dim = (*pBoxes)[0].getDim();
7159 template <
typename Adapter>
7160 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
7163 return this->mj_partitioner.get_kept_boxes();
7167 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7169 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7172 if (this->mj_keep_part_boxes)
7173 return this->kept_boxes;
7175 throw std::logic_error(
"Error: part boxes are not stored.");
7178 template <
typename mj_scalar_t,
typename mj_lno_t,
typename mj_gno_t,
7180 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
7182 RCP<mj_partBoxVector_t> &localPartBoxes
7185 mj_part_t ntasks = this->num_global_parts;
7186 int dim = (*localPartBoxes)[0].getDim();
7187 mj_scalar_t *localPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
7189 memset(localPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
7191 mj_scalar_t *globalPartBoundaries =
new mj_scalar_t[ntasks * 2 *dim];
7192 memset(globalPartBoundaries, 0,
sizeof(mj_scalar_t) * ntasks * 2 *dim);
7194 mj_scalar_t *localPartMins = localPartBoundaries;
7195 mj_scalar_t *localPartMaxs = localPartBoundaries + ntasks * dim;
7197 mj_scalar_t *globalPartMins = globalPartBoundaries;
7198 mj_scalar_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
7200 mj_part_t boxCount = localPartBoxes->size();
7201 for (mj_part_t i = 0; i < boxCount; ++i){
7202 mj_part_t pId = (*localPartBoxes)[i].getpId();
7205 mj_scalar_t *lmins = (*localPartBoxes)[i].getlmins();
7206 mj_scalar_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
7208 for (
int j = 0; j < dim; ++j){
7209 localPartMins[dim * pId + j] = lmins[j];
7210 localPartMaxs[dim * pId + j] = lmaxs[j];
7222 reduceAll<int, mj_scalar_t>(*mj_problemComm, reductionOp,
7223 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
7224 RCP<mj_partBoxVector_t> pB(
new mj_partBoxVector_t(),
true);
7225 for (mj_part_t i = 0; i < ntasks; ++i){
7227 globalPartMins + dim * i,
7228 globalPartMaxs + dim * i);
7240 delete []localPartBoundaries;
7241 delete []globalPartBoundaries;
#define MIN_WORK_LAST_DIM
GridHash Class, Hashing Class for part boxes.
Time an algorithm (or other entity) as a whole.
bool operator>(const uMultiSortItem< IT, CT, WT > &other) const
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
Defines Parameter related enumerators, declares functions.
static RCP< Teuchos::BoolParameterEntryValidator > getBoolValidator()
Exists to make setting up validators less cluttered.
bool operator>(const uSignedSortItem< IT, WT, SIGN > &rhs) const
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Sort items for quick sort function.
mj_part_t pointAssign(int dim, mj_scalar_t *point) const
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals...
#define imbalanceOf2(Wachieved, wExpected)
void freeArray(T *&array)
Frees the given array.
Class for sorting items with multiple values. First sorting with respect to val[0], then val[1] then ... val[count-1]. The last tie breaking is done with index values. Used for task mapping partitioning where the points on a cut line needs to be distributed consistently.
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, size_t num_global_parts, mj_part_t *part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, const mj_gno_t *initial_mj_gnos, mj_scalar_t **mj_coordinates, int num_weights_per_coord, bool *mj_uniform_weights, mj_scalar_t **mj_weights, bool *mj_uniform_parts, mj_scalar_t **mj_part_sizes, mj_part_t *&result_assigned_part_ids, mj_gno_t *&result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyDoubleValidator()
Exists to make setting up validators less cluttered.
void partition(const RCP< PartitioningSolution< Adapter > > &solution)
Multi Jagged coordinate partitioning algorithm.
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
A ParameterList validator for integer range lists.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
void getCommunicationGraph(const PartitioningSolution< Adapter > *solution, ArrayRCP< mj_part_t > &comXAdj, ArrayRCP< mj_part_t > &comAdj)
returns communication graph resulting from MJ partitioning.
bool operator>=(const uSignedSortItem< IT, WT, SIGN > &rhs)
SparseMatrixAdapter_t::part_t part_t
#define FUTURE_REDUCEALL_CUTOFF
Multi Jagged coordinate partitioning algorithm.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
T * allocMemory(size_t size)
Allocates memory for the given size.
uMultiSortItem< IT, CT, WT > operator=(const uMultiSortItem< IT, CT, WT > &other)
RCP< mj_partBox_t > get_global_box() const
Return the global bounding box: min/max coords of global domain.
A PartitioningSolution is a solution to a partitioning problem.
Zoltan2_BoxBoundaries()
Default Constructor.
uMultiSortItem(const uMultiSortItem< IT, CT, WT > &other)
#define LEAST_SIGNIFICANCE
RCP< mj_partBoxVector_t > get_kept_boxes() const
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
Zoltan2_BoxBoundaries is a reduction operation to all reduce the all box boundaries.
The StridedData class manages lists of weights or coordinates.
Algorithm defines the base class for all algorithms.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
static RCP< Teuchos::AnyNumberParameterEntryValidator > getAnyIntValidator()
Exists to make setting up validators less cluttered.
uMultiSortItem(IT index_, CT count_, WT *vals_)
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Define IntegerRangeList validator.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, mj_scalar_t minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
Defines the CoordinateModel classes.
Tpetra::global_size_t global_size_t
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
A gathering of useful namespace methods.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const coordinateModel_t > &coords)
Contains Teuchos redcution operators for the Multi-jagged algorthm.
void boxAssign(int dim, mj_scalar_t *lower, mj_scalar_t *upper, size_t &nPartsFound, mj_part_t **partsFound) const
void getInputArray(ArrayRCP< const T > &array) const
Create a contiguous array of the required type, perhaps for a TPL.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
Multi Jagged coordinate partitioning algorithm.
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.