Zoltan2
Zoltan2_TaskMapping.hpp
Go to the documentation of this file.
1 #ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
2 #define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
3 
4 #include <fstream>
5 #include <ctime>
6 #include <vector>
7 #include <set>
8 #include <tuple>
9 
11 #include "Teuchos_ArrayViewDecl.hpp"
14 #include "Teuchos_ReductionOp.hpp"
15 
17 
18 #include "Zoltan2_GraphModel.hpp"
19 #include <zoltan_dd.h>
20 #include <Zoltan2_TPLTraits.hpp>
21 
22 #include "Teuchos_Comm.hpp"
23 #ifdef HAVE_ZOLTAN2_MPI
24 #include "Teuchos_DefaultMpiComm.hpp"
25 #endif // HAVE_ZOLTAN2_MPI
26 #include <Teuchos_DefaultSerialComm.hpp>
27 
28 //#define gnuPlot
30 
31 namespace Teuchos{
32 
// Teuchos reduction operator used to pick the globally best mapping among
// ranks.  Each buffer holds a pair: element [0] is a quality metric (lower
// is better) and element [1] is presumably the proposing rank used as a
// tie-breaker -- TODO confirm against the reduceAll call site.
35 template <typename Ordinal, typename T>
36 class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
37 {
38 private:
 // tolerance for treating two metric values as equal
39  T _EPSILON;
40 
41 public:
 // initializes the comparison tolerance to machine epsilon for T
44  Zoltan2_ReduceBestMapping ():_EPSILON (std::numeric_limits<T>::epsilon()){}
45 
 // MPI reduction body: keeps inBuffer's pair when its metric is strictly
 // smaller (beyond _EPSILON), or when the metrics tie within _EPSILON and
 // inBuffer's second entry is no larger.
48  void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
49  {
50 
 // NOTE(review): the loop body only ever touches indices 0 and 1, so
 // iterations beyond the first are redundant; 'count' never indexes the
 // buffers -- confirm this matches the caller's intent before changing.
51  for (Ordinal i=0; i < count; i++){
52  if (inBuffer[0] - inoutBuffer[0] < -_EPSILON){
 // incoming metric is strictly better: take the whole pair
53  inoutBuffer[0] = inBuffer[0];
54  inoutBuffer[1] = inBuffer[1];
55  } else if(inBuffer[0] - inoutBuffer[0] < _EPSILON &&
56  inBuffer[1] - inoutBuffer[1] < _EPSILON){
 // metrics equal within tolerance: tie-break on element [1]
57  inoutBuffer[0] = inBuffer[0];
58  inoutBuffer[1] = inBuffer[1];
59  }
60  }
61  }
62 };
63 } // namespace Teuchos
64 
65 
66 namespace Zoltan2{
67 
// Computes x! recursively.
//
// \param x  value whose factorial is requested
// \return   x! for x >= 1; 1 for x <= 0
//
// The base case is x <= 1 (the original tested x == 1 only, so z2Fact(0)
// -- or any non-positive value with a signed type -- recursed past zero
// forever).  Behavior for x >= 1 is unchanged.
template <typename it>
inline it z2Fact(it x) {
  return (x <= 1 ? it(1) : it(x * z2Fact<it>(x - 1)));
}
72 
73 template <typename gno_t, typename part_t>
75 public:
76  gno_t gno;
78 };
79 
// Writes the i-th (0-based, lexicographic) permutation of {0, ..., n-1}
// into perm, using the factorial number system (Lehmer code).
//
// \param n     number of elements being permuted; nothing is written when
//              n <= 0 (the original indexed fact[0] out of bounds for n == 0)
// \param i     index of the requested permutation, 0 <= i < n!
// \param perm  output array of length n receiving the permutation
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm)
{
  if (n <= 0)
    return;

  // factorial numbers: fact[k] = k!  (std::vector replaces the raw
  // new[]/delete[] pair, so no leak is possible on early exit)
  std::vector<IT> fact(n);
  fact[0] = 1;
  for (IT k = 1; k < n; ++k)
    fact[k] = fact[k - 1] * k;

  // factorial code of i: digit k is i / (n-1-k)!, then reduce i
  for (IT k = 0; k < n; ++k)
  {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // readjust values to obtain the permutation: walk from the end and bump
  // perm[k] once for every preceding digit that is <= it.
  // (the j-- > 0 form is safe even when IT is an unsigned type, unlike the
  // original j >= 0 countdown)
  for (IT k = n - 1; k > 0; --k)
    for (IT j = k; j-- > 0; )
      if (perm[j] <= perm[k])
        perm[k]++;
}
109 
110 template <typename part_t>
111 void getGridCommunicationGraph(part_t taskCount, part_t *&task_comm_xadj, part_t *&task_comm_adj, std::vector <int> grid_dims){
112  int dim = grid_dims.size();
113  int neighborCount = 2 * dim;
114  task_comm_xadj = allocMemory<part_t>(taskCount+1);
115  task_comm_adj = allocMemory<part_t>(taskCount * neighborCount);
116 
117  part_t neighBorIndex = 0;
118  task_comm_xadj[0] = 0;
119  for (part_t i = 0; i < taskCount; ++i){
120  part_t prevDimMul = 1;
121  for (int j = 0; j < dim; ++j){
122  part_t lNeighbor = i - prevDimMul;
123  part_t rNeighbor = i + prevDimMul;
124  prevDimMul *= grid_dims[j];
125  if (lNeighbor >= 0 && lNeighbor/ prevDimMul == i / prevDimMul && lNeighbor < taskCount){
126  task_comm_adj[neighBorIndex++] = lNeighbor;
127  }
128  if (rNeighbor >= 0 && rNeighbor/ prevDimMul == i / prevDimMul && rNeighbor < taskCount){
129  task_comm_adj[neighBorIndex++] = rNeighbor;
130  }
131  }
132  task_comm_xadj[i+1] = neighBorIndex;
133  }
134 
135 }
136 //returns the center of the parts.
137 template <typename Adapter, typename scalar_t, typename part_t>
139  const Environment *envConst,
140  const Teuchos::Comm<int> *comm,
142  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
143  const part_t *parts,
144  int coordDim,
145  part_t ntasks,
146  scalar_t **partCenters){
147 
148  typedef typename Adapter::lno_t lno_t;
149  typedef typename Adapter::gno_t gno_t;
150 
151  typedef StridedData<lno_t, scalar_t> input_t;
152  ArrayView<const gno_t> gnos;
153  ArrayView<input_t> xyz;
154  ArrayView<input_t> wgts;
155  coords->getCoordinates(gnos, xyz, wgts);
156 
157  //local and global num coordinates.
158  lno_t numLocalCoords = coords->getLocalNumCoordinates();
159  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();
160 
161 
162 
163  //local number of points in each part.
164  gno_t *point_counts = allocMemory<gno_t>(ntasks);
165  memset(point_counts, 0, sizeof(gno_t) * ntasks);
166 
167  //global number of points in each part.
168  gno_t *global_point_counts = allocMemory<gno_t>(ntasks);
169 
170 
171  scalar_t **multiJagged_coordinates = allocMemory<scalar_t *>(coordDim);
172 
173  for (int dim=0; dim < coordDim; dim++){
174  ArrayRCP<const scalar_t> ar;
175  xyz[dim].getInputArray(ar);
176  //multiJagged coordinate values assignment
177  multiJagged_coordinates[dim] = (scalar_t *)ar.getRawPtr();
178  memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
179  }
180 
181  //get parts with parallel gnos.
182  //const part_t *parts = soln_->getPartListView();
183  /*
184  for (lno_t i=0; i < numLocalCoords; i++){
185  cout << "me:" << comm->getRank() << " gno:" << soln_gnos[i] << " tmp.part :" << parts[i]<< endl;
186  }
187  */
188 
189 
190  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");
191 
192 
193  for (lno_t i=0; i < numLocalCoords; i++){
194  part_t p = parts[i];
195  //add up all coordinates in each part.
196  for(int j = 0; j < coordDim; ++j){
197  scalar_t c = multiJagged_coordinates[j][i];
198  partCenters[j][p] += c;
199  }
200  ++point_counts[p];
201  }
202 
203  //get global number of points in each part.
204  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
205  ntasks, point_counts, global_point_counts
206  );
207 
208  for(int j = 0; j < coordDim; ++j){
209  for (part_t i=0; i < ntasks; ++i){
210  partCenters[j][i] /= global_point_counts[i];
211  }
212  }
213 
214  scalar_t *tmpCoords = allocMemory<scalar_t>(ntasks);
215  for(int j = 0; j < coordDim; ++j){
216  reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
217  ntasks, partCenters[j], tmpCoords
218  );
219 
220  scalar_t *tmp = partCenters[j];
221  partCenters[j] = tmpCoords;
222  tmpCoords = tmp;
223  }
224  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");
225 
226  freeArray<gno_t> (point_counts);
227  freeArray<gno_t> (global_point_counts);
228 
229  freeArray<scalar_t> (tmpCoords);
230  freeArray<scalar_t *>(multiJagged_coordinates);
231 }
232 
233 //returns the coarsend part graph.
234 template <typename Adapter, typename scalar_t, typename part_t>
236  const Environment *envConst,
237  const Teuchos::Comm<int> *comm,
239  //const Zoltan2::PartitioningSolution<Adapter> *soln_,
240  part_t np,
241  const part_t *parts,
242  ArrayRCP<part_t> &g_part_xadj,
243  ArrayRCP<part_t> &g_part_adj,
244  ArrayRCP<scalar_t> &g_part_ew
245  ){
246 
247  typedef typename Adapter::lno_t t_lno_t;
248  typedef typename Adapter::gno_t t_gno_t;
249  typedef typename Adapter::scalar_t t_scalar_t;
251 
252  //int numRanks = comm->getSize();
253  //int myRank = comm->getRank();
254 
255  //get parts with parallel gnos.
256  /*
257  const part_t *parts = soln_->getPartListView();
258 
259  part_t np = soln_->getActualGlobalNumberOfParts();
260  if (part_t (soln_->getTargetGlobalNumberOfParts()) > np){
261  np = soln_->getTargetGlobalNumberOfParts();
262  }
263  */
264 
265 
266  t_lno_t localNumVertices = graph->getLocalNumVertices();
267  t_lno_t localNumEdges = graph->getLocalNumEdges();
268 
269  //get the vertex global ids, and weights
270  ArrayView<const t_gno_t> Ids;
271  ArrayView<t_input_t> v_wghts;
272  graph->getVertexList(Ids, v_wghts);
273 
274  //get the edge ids, and weights
275  ArrayView<const t_gno_t> edgeIds;
276  ArrayView<const t_lno_t> offsets;
277  ArrayView<t_input_t> e_wgts;
278  graph->getEdgeList(edgeIds, offsets, e_wgts);
279 
280 
281  std::vector <t_scalar_t> edge_weights;
282  int numWeightPerEdge = graph->getNumWeightsPerEdge();
283 
284  if (numWeightPerEdge > 0){
285  edge_weights = std::vector <t_scalar_t> (localNumEdges);
286  for (t_lno_t i = 0; i < localNumEdges; ++i){
287  edge_weights[i] = e_wgts[0][i];
288  }
289  }
290 
291  //create a zoltan dictionary to get the parts of the vertices
292  //at the other end of edges
293  std::vector <part_t> e_parts (localNumEdges);
294 #ifdef HAVE_ZOLTAN2_MPI
295  if (comm->getSize() > 1)
296  {
297  Zoltan_DD_Struct *dd = NULL;
298 
299  MPI_Comm mpicomm = Teuchos::getRawMpiComm(*comm);
301 
302  int debug_level = 0;
303  Zoltan_DD_Create(&dd, mpicomm,
304  size_gnot, 0,
305  sizeof(part_t), localNumVertices, debug_level);
306 
307  ZOLTAN_ID_PTR ddnotneeded = NULL; // Local IDs not needed
308  Zoltan_DD_Update(
309  dd,
310  (ZOLTAN_ID_PTR) Ids.getRawPtr(),
311  ddnotneeded,
312  (char *) parts,
313  NULL,
314  int(localNumVertices));
315 
316  Zoltan_DD_Find(
317  dd,
318  (ZOLTAN_ID_PTR) edgeIds.getRawPtr(),
319  ddnotneeded,
320  (char *)&(e_parts[0]),
321  NULL,
322  localNumEdges,
323  NULL
324  );
325  Zoltan_DD_Destroy(&dd);
326  } else
327 #endif
328  {
329 
330  /*
331  std::cout << "localNumVertices:" << localNumVertices
332  << " np:" << np
333  << " globalNumVertices:" << graph->getGlobalNumVertices()
334  << " localNumEdges:" << localNumEdges << std::endl;
335  */
336 
337  for (t_lno_t i = 0; i < localNumEdges; ++i){
338  t_gno_t ei = edgeIds[i];
339  part_t p = parts[ei];
340  e_parts[i] = p;
341  }
342 
343  //get the vertices in each part in my part.
344  std::vector <t_lno_t> part_begins(np, -1);
345  std::vector <t_lno_t> part_nexts(localNumVertices, -1);
346 
347  //cluster vertices according to their parts.
348  //create local part graph.
349  for (t_lno_t i = 0; i < localNumVertices; ++i){
350  part_t ap = parts[i];
351  part_nexts[i] = part_begins[ap];
352  part_begins[ap] = i;
353  }
354 
355 
356  g_part_xadj = ArrayRCP<part_t> (np + 1);
357  g_part_adj = ArrayRCP<part_t> (localNumEdges);
358  g_part_ew = ArrayRCP<t_scalar_t> (localNumEdges);
359  part_t nindex = 0;
360  g_part_xadj[0] = 0;
361  std::vector <part_t> part_neighbors (np);
362  std::vector <t_scalar_t> part_neighbor_weights(np, 0);
363  std::vector <t_scalar_t> part_neighbor_weights_ordered(np);
364 
365  //coarsen for all vertices in my part in order with parts.
366  for (t_lno_t i = 0; i < np; ++i){
367  part_t num_neighbor_parts = 0;
368  t_lno_t v = part_begins[i];
369  //get part i, and first vertex in this part v.
370  while (v != -1){
371  //now get the neightbors of v.
372  for (t_lno_t j = offsets[v]; j < offsets[v+1]; ++j){
373  //get the part of the second vertex.
374  part_t ep = e_parts[j];
375 
376  t_scalar_t ew = 1;
377  if (numWeightPerEdge > 0){
378  ew = edge_weights[j];
379  }
380  //std::cout << "part:" << i << " v:" << v << " part2:" << ep << " v2:" << edgeIds[j] << " w:" << ew << std::endl;
381  //add it to my local part neighbors for part i.
382  if (part_neighbor_weights[ep] < 0.00001){
383  part_neighbors[num_neighbor_parts++] = ep;
384  }
385  part_neighbor_weights[ep] += ew;
386  }
387  v = part_nexts[v];
388  }
389 
390 
391  //now get the part list.
392  for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
393  part_t neighbor_part = part_neighbors[j];
394  g_part_adj[nindex] = neighbor_part;
395  g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
396  part_neighbor_weights[neighbor_part] = 0;
397  }
398  g_part_xadj[i + 1] = nindex;
399  }
400  return;
401  }
402 
403  RCP<const Teuchos::Comm<int> > tcomm = rcpFromRef(*comm);
404  typedef Tpetra::Map<>::node_type t_node_t;
405  typedef Tpetra::Map<part_t, part_t, t_node_t> t_map_t;
406  Teuchos::RCP<const t_map_t> map = Teuchos::rcp (new t_map_t (np, 0, tcomm));
407  typedef Tpetra::CrsMatrix<t_scalar_t, part_t, part_t, t_node_t> tcrsMatrix_t;
408  Teuchos::RCP<tcrsMatrix_t> tMatrix(new tcrsMatrix_t (map, 0));
409 
410 
411 
412  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
413  {
414  //get the vertices in each part in my part.
415  std::vector <t_lno_t> part_begins(np, -1);
416  std::vector <t_lno_t> part_nexts(localNumVertices, -1);
417 
418  //cluster vertices according to their parts.
419  //create local part graph.
420  for (t_lno_t i = 0; i < localNumVertices; ++i){
421  part_t ap = parts[i];
422  part_nexts[i] = part_begins[ap];
423  part_begins[ap] = i;
424  }
425 
426  std::vector <part_t> part_neighbors (np);
427  std::vector <t_scalar_t> part_neighbor_weights(np, 0);
428  std::vector <t_scalar_t> part_neighbor_weights_ordered(np);
429 
430  //coarsen for all vertices in my part in order with parts.
431  for (t_lno_t i = 0; i < np; ++i){
432  part_t num_neighbor_parts = 0;
433  t_lno_t v = part_begins[i];
434  //get part i, and first vertex in this part v.
435  while (v != -1){
436  //now get the neightbors of v.
437  for (t_lno_t j = offsets[v]; j < offsets[v+1]; ++j){
438  //get the part of the second vertex.
439  part_t ep = e_parts[j];
440 
441  t_scalar_t ew = 1;
442  if (numWeightPerEdge > 0){
443  ew = edge_weights[j];
444  }
445  //add it to my local part neighbors for part i.
446  if (part_neighbor_weights[ep] < 0.00001){
447  part_neighbors[num_neighbor_parts++] = ep;
448  }
449  part_neighbor_weights[ep] += ew;
450  }
451  v = part_nexts[v];
452  }
453 
454  //now get the part list.
455  for (t_lno_t j = 0; j < num_neighbor_parts; ++j){
456  part_t neighbor_part = part_neighbors[j];
457  part_neighbor_weights_ordered[j] = part_neighbor_weights[neighbor_part];
458  part_neighbor_weights[neighbor_part] = 0;
459  }
460 
461  //insert it to tpetra crsmatrix.
462  if (num_neighbor_parts > 0){
463  Teuchos::ArrayView<const part_t> destinations(
464  &(part_neighbors[0]), num_neighbor_parts);
465  Teuchos::ArrayView<const t_scalar_t>
466  vals(&(part_neighbor_weights_ordered[0]), num_neighbor_parts);
467  tMatrix->insertGlobalValues (i,destinations, vals);
468  }
469  }
470  }
471  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");
472  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE fillComplete");
473  tMatrix->fillComplete ();
474  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE fillComplete");
475 
476 
477  std::vector <part_t> part_indices(np);
478 
479  for (part_t i = 0; i < np; ++i) part_indices[i] = i;
480 
481  Teuchos::ArrayView<const part_t>
482  global_ids( &(part_indices[0]), np);
483 
484  //create a map where all processors own all rows.
485  //so that we do a gatherAll for crsMatrix.
486  Teuchos::RCP<const t_map_t> gatherRowMap(new t_map_t (
487  Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid(), global_ids, 0, tcomm));
488 
489  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Import");
490  //create the importer for gatherAll
491  Teuchos::RCP<tcrsMatrix_t> A_gather =
492  Teuchos::rcp (new tcrsMatrix_t (gatherRowMap, 0));
493  typedef Tpetra::Import<typename t_map_t::local_ordinal_type,
494  typename t_map_t::global_ordinal_type,
495  typename t_map_t::node_type> import_type;
496  import_type import (map, gatherRowMap);
497  A_gather->doImport (*tMatrix, import, Tpetra::INSERT);
498  A_gather->fillComplete ();
499  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Import");
500 
501  //create the output part arrays.
502  //all processors owns whole copy.
503  g_part_xadj = ArrayRCP<part_t> (np + 1);
504  g_part_adj = ArrayRCP<part_t> (A_gather->getNodeNumEntries ());
505  g_part_ew = ArrayRCP<t_scalar_t> (A_gather->getNodeNumEntries ());
506 
507  part_t *taskidx = g_part_xadj.getRawPtr();
508  part_t *taskadj = g_part_adj.getRawPtr();
509  t_scalar_t *taskadjwgt = g_part_ew.getRawPtr();
510 
511  taskidx[0] = 0;
512 
513  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Import Copy");
514  for (part_t i = 0; i < np; i++) {
515  part_t length = A_gather->getNumEntriesInLocalRow(i); // Use Global to get same
516  size_t nentries;
517  taskidx[i+1] = taskidx[i] + length;
518  //get the indices
519  Teuchos::ArrayView<part_t> Indices(taskadj + taskidx[i], length);
520  Teuchos::ArrayView<t_scalar_t> Values(taskadjwgt + taskidx[i], length);
521  A_gather->getLocalRowCopy(i, Indices, Values, nentries);
522  }
523  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Import Copy");
524 }
525 
526 
529 template <class IT, class WT>
531  IT heapSize;
532  IT *indices;
533  WT *values;
534  WT _EPSILON;
535 
536 
537 public:
 // Allocates the fixed-capacity heap: parallel arrays of point indices and
 // point distances, each of length heapsize_.  The arrays are released by
 // the destructor (freeArray calls below), not here.
538  void setHeapsize(IT heapsize_){
539  this->heapSize = heapsize_;
540  this->indices = allocMemory<IT>(heapsize_ );
541  this->values = allocMemory<WT>(heapsize_ );
 // tolerance used later when deciding whether a center actually moved
542  this->_EPSILON = std::numeric_limits<WT>::epsilon();
543  }
544 
546  freeArray<IT>(this->indices);
547  freeArray<WT>(this->values);
548  }
549 
550 
551  void addPoint(IT index, WT distance){
552  WT maxVal = this->values[0];
553  //add only the distance is smaller than the maximum distance.
554  //cout << "indeX:" << index << "distance:" <<distance << " maxVal:" << maxVal << endl;
555  if (distance >= maxVal) return;
556  else {
557  this->values[0] = distance;
558  this->indices[0] = index;
559  this->push_down(0);
560  }
561  }
562 
563  //heap push down operation
 // Max-heap sift-down: restores the heap property below index_on_heap
 // after its value has been replaced.  values[] orders the heap (largest
 // distance at the root); indices[] is kept in lockstep.
 // NOTE(review): the -1 sentinel below assumes IT is a signed type --
 // confirm; with an unsigned IT the >= 0 guard would always pass.
564  void push_down(IT index_on_heap){
565  IT child_index1 = 2 * index_on_heap + 1;
566  IT child_index2 = 2 * index_on_heap + 2;
567 
 // index of the larger existing child, or -1 when this node is a leaf
568  IT biggerIndex = -1;
569  if(child_index1 < this->heapSize && child_index2 < this->heapSize){
570 
571  if (this->values[child_index1] < this->values[child_index2]){
572  biggerIndex = child_index2;
573  }
574  else {
575  biggerIndex = child_index1;
576  }
577  }
578  else if(child_index1 < this->heapSize){
579  biggerIndex = child_index1;
580 
581  }
582  else if(child_index2 < this->heapSize){
 // NOTE(review): unreachable in a packed heap (child2 present implies
 // child1 present); kept for symmetry.
583  biggerIndex = child_index2;
584  }
 // swap with the larger child and recurse while the child is bigger
585  if (biggerIndex >= 0 && this->values[biggerIndex] > this->values[index_on_heap]){
586  WT tmpVal = this->values[biggerIndex];
587  this->values[biggerIndex] = this->values[index_on_heap];
588  this->values[index_on_heap] = tmpVal;
589 
590  IT tmpIndex = this->indices[biggerIndex];
591  this->indices[biggerIndex] = this->indices[index_on_heap];
592  this->indices[index_on_heap] = tmpIndex;
593  this->push_down(biggerIndex);
594  }
595  }
596 
597  void initValues(){
598  WT MAXVAL = std::numeric_limits<WT>::max();
599  for(IT i = 0; i < this->heapSize; ++i){
600  this->values[i] = MAXVAL;
601  this->indices[i] = -1;
602  }
603  }
604 
605  //returns the total distance to center in the cluster.
607 
608  WT nc = 0;
609  for(IT j = 0; j < this->heapSize; ++j){
610  nc += this->values[j];
611 
612  //cout << "index:" << this->indices[j] << " distance:" << this->values[j] << endl;
613  }
614  return nc;
615  }
616 
617  //returns the new center of the cluster.
 // Recomputes the center as the per-dimension mean of the points currently
 // kept in the heap.  Returns true when any coordinate of the center moved
 // by more than _EPSILON.
 // NOTE(review): assumes the heap is full of valid indices; if fewer than
 // heapSize points were ever added, indices[j] may still be the -1
 // sentinel and coords[i][k] would read out of bounds -- confirm callers
 // always add at least heapSize points first.
618  bool getNewCenters(WT *center, WT **coords, int dimension){
619  bool moved = false;
620  for(int i = 0; i < dimension; ++i){
 // average the i-th coordinate over all kept points
621  WT nc = 0;
622  for(IT j = 0; j < this->heapSize; ++j){
623  IT k = this->indices[j];
624  //cout << "i:" << i << " dim:" << dimension << " k:" << k << " heapSize:" << heapSize << endl;
625  nc += coords[i][k];
626  }
627  nc /= this->heapSize;
 // center moved if any coordinate changed beyond machine epsilon
628  moved = (ZOLTAN2_ABS(center[i] - nc) > this->_EPSILON || moved );
629  center[i] = nc;
630 
631  }
632  return moved;
633  }
634 
635  void copyCoordinates(IT *permutation){
636  for(IT i = 0; i < this->heapSize; ++i){
637  permutation[i] = this->indices[i];
638  }
639  }
640 };
641 
644 template <class IT, class WT>
646 
647  int dimension;
648  KmeansHeap<IT,WT> closestPoints;
649 
650 public:
651  WT *center;
653  freeArray<WT>(center);
654  }
655 
 // Sets the coordinate dimension, allocates the center vector (released in
 // the destructor), and sizes the closest-point heap to 'heapsize' entries.
656  void setParams(int dimension_, int heapsize){
657  this->dimension = dimension_;
658  this->center = allocMemory<WT>(dimension_);
659  this->closestPoints.setHeapsize(heapsize);
660  }
661 
 // Resets the closest-point heap so a fresh assignment pass can begin.
662  void clearHeap(){
663  this->closestPoints.initValues();
664  }
665 
 // Recomputes this cluster's center from its kept closest points;
 // returns true when the center moved beyond machine epsilon.
666  bool getNewCenters( WT **coords){
667  return this->closestPoints.getNewCenters(center, coords, dimension);
668  }
669 
670  //returns the distance of the coordinate to the center.
671  //also adds it to the heap.
 // Computes the distance from this cluster's center to the point at
 // 'index', records the point in the closest-point heap as a side effect,
 // and returns the distance.
672  WT getDistance(IT index, WT **elementCoords){
673  WT distance = 0;
674  for (int i = 0; i < this->dimension; ++i){
675  WT d = (center[i] - elementCoords[i][index]);
676  distance += d * d;
677  }
 // NOTE(review): this takes the dimension-th root of the squared-difference
 // sum, i.e. it is the Euclidean distance only when dimension == 2 --
 // confirm the scaling is intentional for other dimensions.
678  distance = pow(distance, WT(1.0 / this->dimension));
679  closestPoints.addPoint(index, distance);
680  return distance;
681  }
682 
684  return closestPoints.getTotalDistance();
685  }
686 
 // Writes the indices of this cluster's kept closest points into
 // 'permutation'.
687  void copyCoordinates(IT *permutation){
688  closestPoints.copyCoordinates(permutation);
689  }
690 };
691 
695 template <class IT, class WT>
697 
698  int dim;
699  IT numElements;
700  WT **elementCoords;
701  IT numClusters;
702  IT required_elements;
703  KMeansCluster <IT,WT> *clusters;
704  WT *maxCoordinates;
705  WT *minCoordinates;
706 public:
708  freeArray<KMeansCluster <IT,WT> >(clusters);
709  freeArray<WT>(maxCoordinates);
710  freeArray<WT>(minCoordinates);
711  }
712 
716  int dim_ ,
717  IT numElements_,
718  WT **elementCoords_,
719  IT required_elements_):
720  dim(dim_),
721  numElements(numElements_),
722  elementCoords(elementCoords_),
723  numClusters ((1 << dim_) + 1),
724  required_elements(required_elements_)
725  {
726  this->clusters = allocMemory<KMeansCluster <IT,WT> >(this->numClusters);
727  //set dimension and the number of required elements for all clusters.
728  for (int i = 0; i < numClusters; ++i){
729  this->clusters[i].setParams(this->dim, this->required_elements);
730  }
731 
732  this->maxCoordinates = allocMemory <WT> (this->dim);
733  this->minCoordinates = allocMemory <WT> (this->dim);
734 
735  //obtain the min and max coordiantes for each dimension.
736  for (int j = 0; j < dim; ++j){
737  this->minCoordinates[j] = this->maxCoordinates[j] = this->elementCoords[j][0];
738  for(IT i = 1; i < numElements; ++i){
739  WT t = this->elementCoords[j][i];
740  if(t > this->maxCoordinates[j]){
741  this->maxCoordinates[j] = t;
742  }
743  if (t < minCoordinates[j]){
744  this->minCoordinates[j] = t;
745  }
746  }
747  }
748 
749 
750  //assign initial cluster centers.
751  for (int j = 0; j < dim; ++j){
752  int mod = (1 << (j+1));
753  for (int i = 0; i < numClusters - 1; ++i){
754  WT c = 0;
755  if ( (i % mod) < mod / 2){
756  c = this->maxCoordinates[j];
757  //cout << "i:" << i << " j:" << j << " setting max:" << c << endl;
758  }
759  else {
760  c = this->minCoordinates[j];
761  }
762  this->clusters[i].center[j] = c;
763  }
764  }
765 
766  //last cluster center is placed to middle.
767  for (int j = 0; j < dim; ++j){
768  this->clusters[numClusters - 1].center[j] = (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
769  }
770 
771 
772  /*
773  for (int i = 0; i < numClusters; ++i){
774  //cout << endl << "cluster:" << i << endl << "\t";
775  for (int j = 0; j < dim; ++j){
776  cout << this->clusters[i].center[j] << " ";
777  }
778  }
779  */
780  }
781 
782  //performs kmeans clustering of coordinates.
783  void kmeans(){
784  for(int it = 0; it < 10; ++it){
785  //cout << "it:" << it << endl;
786  for (IT j = 0; j < this->numClusters; ++j){
787  this->clusters[j].clearHeap();
788  }
789  for (IT i = 0; i < this->numElements; ++i){
790  //cout << "i:" << i << " numEl:" << this->numElements << endl;
791  for (IT j = 0; j < this->numClusters; ++j){
792  //cout << "j:" << j << " numClusters:" << this->numClusters << endl;
793  this->clusters[j].getDistance(i,this->elementCoords);
794  }
795  }
796  bool moved = false;
797  for (IT j = 0; j < this->numClusters; ++j){
798  moved =(this->clusters[j].getNewCenters(this->elementCoords) || moved );
799  }
800  if (!moved){
801  break;
802  }
803  }
804 
805 
806  }
807 
808  //finds the cluster in which the coordinates are the closest to each other.
809  void getMinDistanceCluster(IT *procPermutation){
810 
811  WT minDistance = this->clusters[0].getDistanceToCenter();
812  IT minCluster = 0;
813  //cout << "j:" << 0 << " minDistance:" << minDistance << " minTmpDistance:" << minDistance<< " minCluster:" << minCluster << endl;
814  for (IT j = 1; j < this->numClusters; ++j){
815  WT minTmpDistance = this->clusters[j].getDistanceToCenter();
816  //cout << "j:" << j << " minDistance:" << minDistance << " minTmpDistance:" << minTmpDistance<< " minCluster:" << minCluster << endl;
817  if(minTmpDistance < minDistance){
818  minDistance = minTmpDistance;
819  minCluster = j;
820  }
821  }
822 
823  //cout << "minCluster:" << minCluster << endl;
824  this->clusters[minCluster].copyCoordinates(procPermutation);
825  }
826 };
827 
828 
829 
830 #define MINOF(a,b) (((a)<(b))?(a):(b))
831 
// Fills arr with either a constant or the identity sequence:
//   val == NULL  ->  arr[k] = k for k in [0, arrSize)
//   val != NULL  ->  arr[k] = *val for every k
// The loops are parallelized with OpenMP when Zoltan2 is built with it.
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val){
  if(val != NULL){
    T fillValue = *val;  // read once; each OpenMP thread uses the local copy
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for(size_t k = 0; k < arrSize; ++k){
      arr[k] = fillValue;
    }
  }
  else {
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for(size_t k = 0; k < arrSize; ++k){
      arr[k] = k;  // identity: arr[k] == k
    }
  }
}
861 
864 template <typename part_t, typename pcoord_t>
866 protected:
867  double commCost;
868 public:
869 
870  part_t no_procs; //the number of processors
871  part_t no_tasks; //the number of taks.
872  CommunicationModel(): commCost(),no_procs(0), no_tasks(0){}
873  CommunicationModel(part_t no_procs_, part_t no_tasks_):
874  commCost(),
875  no_procs(no_procs_),
876  no_tasks(no_tasks_){}
878  part_t getNProcs() const{
879  return this->no_procs;
880  }
882  return this->no_tasks;
883  }
884 
885 
887  part_t *task_to_proc,
888  part_t *task_communication_xadj,
889  part_t *task_communication_adj,
890  pcoord_t *task_communication_edge_weight){
891 
892  double totalCost = 0;
893 
894  part_t commCount = 0;
895  for (part_t task = 0; task < this->no_tasks; ++task){
896  int assigned_proc = task_to_proc[task];
897  //cout << "task:" << task << endl;
898  part_t task_adj_begin = task_communication_xadj[task];
899  part_t task_adj_end = task_communication_xadj[task+1];
900 
901  commCount += task_adj_end - task_adj_begin;
902  //cout << "task:" << task << " proc:" << assigned_proc << endl;
903  for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2){
904 
905  //cout << "task2:" << task2 << endl;
906  part_t neighborTask = task_communication_adj[task2];
907  //cout << "neighborTask :" << neighborTask << endl;
908  int neighborProc = task_to_proc[neighborTask];
909 
910  double distance = getProcDistance(assigned_proc, neighborProc);
911 
912  if (task_communication_edge_weight == NULL){
913  totalCost += distance ;
914  }
915  else {
916  totalCost += distance * task_communication_edge_weight[task2];
917 
918  /*
919  std::cout << "\ttask:" << task << " assigned_proc:" << assigned_proc <<
920  "task2:" << task << " neighborProc:" << neighborProc <<
921  " d:" << distance << " task_communication_edge_weight[task2]:" << task_communication_edge_weight[task2] <<
922  " wh:" << distance * task_communication_edge_weight[task2] <<
923  std::endl;
924  */
925  }
926  }
927  }
928 
929  this->commCost = totalCost;// commCount;
930  }
931 
933  return this->commCost;
934  }
935 
936  virtual double getProcDistance(int procId1, int procId2) const = 0;
937 
944  virtual void getMapping(
945  int myRank,
946  const RCP<const Environment> &env,
947  ArrayRCP <part_t> &proc_to_task_xadj, // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
948  ArrayRCP <part_t> &proc_to_task_adj, // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
949  ArrayRCP <part_t> &task_to_proc //allocMemory<part_t>(this->no_tasks); //holds the processors mapped to tasks.
950  ,const Teuchos::RCP <const Teuchos::Comm<int> > comm_
951  ) const = 0;
952 };
956 template <typename pcoord_t, typename tcoord_t, typename part_t>
957 class CoordinateCommunicationModel:public CommunicationModel<part_t, pcoord_t> {
958 public:
959  //private:
960  int proc_coord_dim; //dimension of the processors
961  pcoord_t **proc_coords; //the processor coordinates. allocated outside of the class.
962  int task_coord_dim; //dimension of the tasks coordinates.
963  tcoord_t **task_coords; //the task coordinates allocated outside of the class.
966 
970 
973 
974  //public:
976  CommunicationModel<part_t, pcoord_t>(),
977  proc_coord_dim(0),
978  proc_coords(0),
979  task_coord_dim(0),
980  task_coords(0),
981  partArraySize(-1),
982  partNoArray(NULL),
983  machine_extent(NULL),
984  machine_extent_wrap_around(NULL),
985  machine(NULL),
986  num_ranks_per_node(1),
987  divide_to_prime_first(false){}
988 
990 
1000  int pcoord_dim_,
1001  pcoord_t **pcoords_,
1002  int tcoord_dim_,
1003  tcoord_t **tcoords_,
1004  part_t no_procs_,
1005  part_t no_tasks_,
1006  int *machine_extent_,
1007  bool *machine_extent_wrap_around_,
1008  const MachineRepresentation<pcoord_t,part_t> *machine_ = NULL
1009  ):
1010  CommunicationModel<part_t, pcoord_t>(no_procs_, no_tasks_),
1011  proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
1012  task_coord_dim(tcoord_dim_), task_coords(tcoords_),
1013  partArraySize(-1),
1014  partNoArray(NULL),
1015  machine_extent(machine_extent_),
1016  machine_extent_wrap_around(machine_extent_wrap_around_),
1017  machine(machine_),
1018  num_ranks_per_node(1),
1019  divide_to_prime_first(false){
1020  }
1021 
1022 
 // Sets the length of the per-level part-count array (used as the
 // recursion depth of the recursive partitioner).
1023  void setPartArraySize(int psize){
1024  this->partArraySize = psize;
1025  }
 // Stores the per-level part counts.  Only the pointer is kept -- no copy
 // or free is visible here, so the caller retains ownership of the array.
1026  void setPartArray(part_t *pNo){
1027  this->partNoArray = pNo;
1028  }
1029 
1036  void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const{
1037  //currently returns a random subset.
1038 
1039  part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
1041  minCoordDim, nprocs,
1042  this->proc_coords, ntasks);
1043 
1044  kma.kmeans();
1045  kma.getMinDistanceCluster(proc_permutation);
1046 
1047  for(int i = ntasks; i < nprocs; ++i){
1048  proc_permutation[i] = -1;
1049  }
1050  /*
1051  //fill array.
1052  fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
1053  int _u_umpa_seed = 847449649;
1054  srand (time(NULL));
1055  int a = rand() % 1000 + 1;
1056  _u_umpa_seed -= a;
1057  //permute array randomly.
1058  update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
1059  */
1060  }
1061 
1062  //temporary, necessary for random permutation.
1063  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
1064  {
1065  int a = 16807;
1066  int m = 2147483647;
1067  int q = 127773;
1068  int r = 2836;
1069  int lo, hi, test;
1070  double d;
1071 
1072  lo = _u_umpa_seed % q;
1073  hi = _u_umpa_seed / q;
1074  test = (a*lo)-(r*hi);
1075  if (test>0)
1076  _u_umpa_seed = test;
1077  else
1078  _u_umpa_seed = test + m;
1079  d = (double) ((double) _u_umpa_seed / (double) m);
1080  return (part_t) (d*(double)l);
1081  }
1082 
 // Distance between two processors.  Without a machine model this is the
 // L1 (Manhattan) distance between their coordinates, taking the shorter
 // way around on wrap-around (torus) dimensions; with a machine model it
 // is the hop count reported by the machine.
1083  virtual double getProcDistance(int procId1, int procId2) const{
1084  pcoord_t distance = 0;
1085  if (machine == NULL){
1086  for (int i = 0 ; i < this->proc_coord_dim; ++i){
1087  double d = ZOLTAN2_ABS(proc_coords[i][procId1] - proc_coords[i][procId2]);
 // on a wrapped dimension the shorter direction around the torus counts
1088  if (machine_extent_wrap_around && machine_extent_wrap_around[i]){
1089  if (machine_extent[i] - d < d){
1090  d = machine_extent[i] - d;
1091  }
1092  }
1093  distance += d;
1094  }
1095  }
1096  else {
 // delegate to the machine model; getHopCount fills 'distance' in place
1097  this->machine->getHopCount(procId1, procId2, distance);
1098  }
1099 
1100  return distance;
1101  }
1102 
1103 
1104  //temporary, does random permutation.
 // Temporary helper: randomly permutes visitOrder in place, driven by the
 // umpa_uRandom LCG (so results are reproducible from _u_umpa_seed).
 // rndm != 0: swaps randomly chosen 4-element groups across the array.
 // rndm == 0: NOTE(review) despite the flag name this branch also
 // shuffles, swapping random entries into the last quarter of the array --
 // confirm the intended distinction between the two modes.
1105  void update_visit_order(part_t* visitOrder, part_t n, int &_u_umpa_seed, part_t rndm) {
1106  part_t *a = visitOrder;
1107 
1108 
1109  if (rndm){
1110  part_t i, u, v, tmp;
1111 
 // too small to shuffle in blocks of four
1112  if (n <= 4)
1113  return;
1114 
1115  //srand ( time(NULL) );
1116 
1117  //_u_umpa_seed = _u_umpa_seed1 - (rand()%100);
1118  for (i=0; i<n; i+=16)
1119  {
 // pick two block starts in [0, n-4) so the 4-element swaps stay in bounds
1120  u = umpa_uRandom(n-4, _u_umpa_seed);
1121  v = umpa_uRandom(n-4, _u_umpa_seed);
1122 
1123  // FIXME (mfh 30 Sep 2015) This requires including Zoltan2_AlgMultiJagged.hpp.
1124 
1125  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
1126  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+1], a[u+1], tmp);
1127  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+2], a[u+2], tmp);
1128  ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v+3], a[u+3], tmp);
1129 
1130  }
1131  }
1132  else {
1133  part_t i, end = n / 4;
1134 
 // swap a random earlier element into slot n-i (partial Fisher-Yates)
1135  for (i=1; i<end; i++)
1136  {
1137  part_t j=umpa_uRandom(n-i, _u_umpa_seed);
1138  part_t t=a[j];
1139  a[j] = a[n-i];
1140  a[n-i] = t;
1141  }
1142  }
1143  //PermuteInPlace(visitOrder, n);
1144  }
1145 
1146 
1147 
  // Computes the processor-to-task mapping.
  //
  // Both the processors and the tasks are partitioned into
  // num_parts = min(no_procs, no_tasks) parts with the sequential
  // multijagged partitioner; the tasks falling into part i are then all
  // assigned to the single processor in part i. Each calling rank uses a
  // different permutation of the coordinate dimensions (decoded from
  // myRank), so different ranks produce alternative mappings whose
  // communication costs can be compared afterwards.
  //
  // Outputs (allocated here):
  //   rcp_proc_to_task_xadj - CSR offsets, no_procs+1 entries.
  //   rcp_proc_to_task_adj  - task ids grouped per processor.
  //   rcp_task_to_proc      - processor assigned to each task.
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP <part_t> &rcp_proc_to_task_xadj, // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
      ArrayRCP <part_t> &rcp_proc_to_task_adj, // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
      ArrayRCP <part_t> &rcp_task_to_proc //allocMemory<part_t>(this->no_tasks); //holds the processors mapped to tasks.
      ,const Teuchos::RCP <const Teuchos::Comm<int> > comm_
      ) const{

    // Allocate the output arrays and grab raw views of them.
    rcp_proc_to_task_xadj = ArrayRCP <part_t> (this->no_procs+1);
    rcp_proc_to_task_adj = ArrayRCP <part_t> (this->no_tasks);
    rcp_task_to_proc = ArrayRCP <part_t> (this->no_tasks);

    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr(); //holds the pointer to the task array
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr(); //holds the indices of tasks wrt to proc_to_task_xadj array.
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr(); //holds the processors mapped to tasks.);


    part_t invalid = 0;
    fillContinousArray<part_t> (proc_to_task_xadj, this->no_procs+1, &invalid);

    //obtain the number of parts that should be divided.
    part_t num_parts = MINOF(this->no_procs, this->no_tasks);
    //obtain the min coordinate dim.
    //No more want to do min coord dim. If machine dimension > task_dim,
    //we end up with a long line.
    //part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);

    // If no explicit part array size was given, estimate the recursion
    // depth from log2 of the processor count.
    int recursion_depth = partArraySize;
    //if(partArraySize < minCoordDim) recursion_depth = minCoordDim;
    if (partArraySize == -1){

      if (divide_to_prime_first){
        //it is difficult to estimate the number of steps in this case as each branch will have different depth.
        //The worst case happens when all prime factors are 3s. P = 3^n, n recursion depth will divide parts to 2x and x
        //and n recursion depth with divide 2x into x and x.
        //set it to upperbound here.
        //we could calculate the exact value here as well, but the partitioning algorithm skips further ones anyways.
        recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
      }
      else {
        recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
      }
    }

    // Count the mapping variants: every (task-dim, proc-dim) permutation
    // pair, plus task-only / proc-only orderings combined with
    // longest-dimension partitioning, plus the both-longest variant.
    int taskPerm = z2Fact<int>(this->task_coord_dim); //get the number of different permutations for task dimension ordering
    int procPerm = z2Fact<int>(this->proc_coord_dim); //get the number of different permutations for proc dimension ordering

    int permutations = taskPerm * procPerm; //total number of permutations
    //now add the ones, where we divide the processors with longest dimension,
    //but task with order.
    permutations += taskPerm;
    //and divide tasks with longest dimension, and processors with order.
    permutations += procPerm; //total number of permutations
    //and both with longest dimension.
    permutations += 1;
    //add one also that partitions based the longest dimension.

    //holds the pointers to proc_adjList
    part_t *proc_xadj = allocMemory<part_t> (num_parts+1);
    //holds the processors in parts according to the result of partitioning algorithm.
    //the processors assigned to part x is at proc_adjList[ proc_xadj[x] : proc_xadj[x+1] ]
    part_t *proc_adjList = allocMemory<part_t>(this->no_procs);


    // With more processors than tasks, only a closest subset of the
    // processors is used; otherwise all processors participate.
    part_t used_num_procs = this->no_procs;
    if(this->no_procs > this->no_tasks){
      //obtain the subset of the processors that are closest to each other.
      this->getClosestSubset(proc_adjList, this->no_procs, this->no_tasks);
      used_num_procs = this->no_tasks;
    }
    else {
      fillContinousArray<part_t>(proc_adjList,this->no_procs, NULL);
    }

    // Decode which variant this rank runs from its rank id.
    int myPermutation = myRank % permutations; //the index of the permutation
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;


    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0){
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm){
        proc_partition_along_longest_dim = true;
        myTaskPerm = myPermutation; // the index of the task permutation
      }
      else{
        myPermutation -= taskPerm;
        if (myPermutation < procPerm){
          task_partition_along_longest_dim = true;
          myProcPerm = myPermutation; // the index of the task permutation
        }
        else {
          myPermutation -= procPerm;
          myProcPerm = myPermutation % procPerm; // the index of the proc permutation
          myTaskPerm = myPermutation / procPerm; // the index of the task permutation
        }
      }
    }




    /*
    if (task_partition_along_longest_dim && proc_partition_along_longest_dim){
      std::cout <<"me:" << myRank << " task:longest proc:longest" << " numPerms:" << permutations << std::endl;
    }
    else if (proc_partition_along_longest_dim){
      std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:longest" << " numPerms:" << permutations << std::endl;
    }
    else if (task_partition_along_longest_dim){
      std::cout <<"me:" << myRank << " task: longest" << " proc:" << myProcPerm << " numPerms:" << permutations << std::endl;
    }
    else {
      std::cout <<"me:" << myRank << " task:" << myTaskPerm << " proc:" << myProcPerm << " numPerms:" << permutations << std::endl;
    }
    */



    // Scratch array large enough for either dimension permutation.
    int *permutation = allocMemory<int> ((this->proc_coord_dim > this->task_coord_dim)
        ? this->proc_coord_dim : this->task_coord_dim);

    //get the permutation order from the proc permutation index.
    ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);

    /*
    //reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *> (this->proc_coord_dim);
    for(int i = 0; i < this->proc_coord_dim; ++i){
      pcoords[i] = this->proc_coords[permutation[i]];
      //cout << permutation[i] << " ";
    }
    */
    // NOTE(review): the proc permutation computed above is currently not
    // applied -- the original coordinate order is used; see the
    // commented-out reordering block. Confirm this is intentional.
    int procdim = this->proc_coord_dim;
    pcoord_t **pcoords = this->proc_coords;
    /*
    int procdim = this->proc_coord_dim;
    procdim = 6;
    //reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *> (procdim);
    for(int i = 0; i < procdim; ++i){
      pcoords[i] = new pcoord_t[used_num_procs] ;//this->proc_coords[permutation[i]];
    }

    for (int k = 0; k < used_num_procs ; k++){
      pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
      pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8 ;

      pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
      pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
      pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
      pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;

      //if (this->proc_coords[0][k] == 40 && this->proc_coords[1][k] == 8 && this->proc_coords[2][k] == 48){
      if (this->proc_coords[0][k] == 5 && this->proc_coords[1][k] == 0 && this->proc_coords[2][k] == 10){
        std::cout << "pcoords[0][k]:" << pcoords[0][k] <<
            "pcoords[1][k]:" << pcoords[1][k] <<
            "pcoords[2][k]:" << pcoords[2][k] <<
            "pcoords[3][k]:" << pcoords[3][k] <<
            "pcoords[4][k]:" << pcoords[4][k] <<
            "pcoords[5][k]:" << pcoords[5][k] << std::endl;
      }
      else if ( pcoords[0][k] == 64 && pcoords[1][k] == 0 && pcoords[2][k] == 160 &&
          pcoords[3][k]==16 && pcoords[4][k] == 0 && pcoords[5][k] == 10){
        std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k] <<
            "this->proc_coords[1][k]:" << this->proc_coords[1][k] <<
            "this->proc_coords[2][k]:" << this->proc_coords[2][k] << std::endl;
      }

    }
    */


    //if (partNoArray == NULL) std::cout << "partNoArray is null" << std::endl;
    //std::cout << "recursion_depth:" << recursion_depth << " partArraySize:" << partArraySize << std::endl;
    //do the partitioning and renumber the parts.
    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");

    // Partition the (subset of) processors into num_parts parts.
    mj_partitioner.sequential_task_partitioning(
        env,
        this->no_procs,
        used_num_procs,
        num_parts,
        procdim,
        //minCoordDim,
        pcoords,//this->proc_coords,
        proc_adjList,
        proc_xadj,
        recursion_depth,
        partNoArray,
        proc_partition_along_longest_dim//, false
        ,num_ranks_per_node
        ,divide_to_prime_first
    );
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
    //comm_->barrier();
    //std::cout << "mj_partitioner.for procs over" << std::endl;


    //freeArray<pcoord_t *> (pcoords);


    part_t *task_xadj = allocMemory<part_t> (num_parts+1);
    part_t *task_adjList = allocMemory<part_t>(this->no_tasks);
    //fill task_adjList st: task_adjList[i] <- i.
    fillContinousArray<part_t>(task_adjList,this->no_tasks, NULL);

    //get the permutation order from the task permutation index.
    ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);

    //reorder task coordinate dimensions.
    tcoord_t **tcoords = allocMemory<tcoord_t *> (this->task_coord_dim);
    for(int i = 0; i < this->task_coord_dim; ++i){
      tcoords[i] = this->task_coords[permutation[i]];
    }


    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");
    //partitioning of tasks
    // Partition the tasks into the same number of parts.
    mj_partitioner.sequential_task_partitioning(
        env,
        this->no_tasks,
        this->no_tasks,
        num_parts,
        this->task_coord_dim,
        //minCoordDim,
        tcoords, //this->task_coords,
        task_adjList,
        task_xadj,
        recursion_depth,
        partNoArray,
        task_partition_along_longest_dim
        ,num_ranks_per_node
        ,divide_to_prime_first
        //,"task_partitioning"
        //, false//(myRank == 6)
    );
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");

    //std::cout << "myrank:" << myRank << std::endl;
    //comm_->barrier();
    //std::cout << "mj_partitioner.sequential_task_partitioning over" << std::endl;

    // NOTE(review): tcoords was allocated as tcoord_t* but is freed as
    // pcoord_t*; this is only safe when the two coordinate types are the
    // same -- confirm.
    freeArray<pcoord_t *> (tcoords);
    freeArray<int> (permutation);


    //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
    // First pass: each part must contain exactly one processor; record
    // how many tasks that processor receives.
    for(part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i+1];
      part_t task_end_index = task_xadj[i+1];


      if(proc_index_end - proc_index_begin != 1){
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
        exit(1);
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }


    //holds the pointer to the task array
    //convert proc_to_task_xadj to CSR index array
    // Prefix-sum the per-processor counts into CSR offsets; the work
    // array keeps the end offset of each processor's slot for filling.
    part_t *proc_to_task_xadj_work = allocMemory<part_t> (this->no_procs);
    part_t sum = 0;
    for(part_t i = 0; i < this->no_procs; ++i){
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;

    // Second pass: fill task_to_proc and scatter the task ids into
    // proc_to_task_adj (backwards, via the shrinking work offsets).
    for(part_t i = 0; i < num_parts; ++i){

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i+1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j){
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj [ --proc_to_task_xadj_work[assigned_proc] ] = taskId;
      }
    }

    /*
    if (myPermutation == 0){
      std::ofstream gnuPlotCode ("mymapping.out", std::ofstream::out);


      for(part_t i = 0; i < num_parts; ++i){

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i+1];


        if(proc_index_end - proc_index_begin != 1){
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " " <<
            this->proc_coords[0][assigned_proc] << " " << this->proc_coords[1][assigned_proc] << " " << this->proc_coords[2][assigned_proc] <<
            " " << pcoords[0][assigned_proc] << " " << pcoords[1][assigned_proc] <<
            " " << pcoords[2][assigned_proc] << " " << pcoords[3][assigned_proc] <<
            std::endl;

      }


      gnuPlotCode << "Machine Extent:" << std::endl;
      //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
      for(part_t i = 0; i < num_parts; ++i){

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i+1];


        if(proc_index_end - proc_index_begin != 1){
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to " << proc_index_end - proc_index_begin << " processors." << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " " << this->proc_coords[0][assigned_proc] << " " << this->proc_coords[1][assigned_proc] << " " << this->proc_coords[2][assigned_proc] << std::endl;

      }
      gnuPlotCode.close();
    }
    */

    freeArray<part_t>(proc_to_task_xadj_work);
    freeArray<part_t>(task_xadj);
    freeArray<part_t>(task_adjList);
    freeArray<part_t>(proc_xadj);
    freeArray<part_t>(proc_adjList);
  }
1514 
1515 };
1516 
1517 template <typename Adapter, typename part_t>
1519 protected:
1520 
1521 #ifndef DOXYGEN_SHOULD_SKIP_THIS
1522 
1523  typedef typename Adapter::scalar_t pcoord_t;
1524  typedef typename Adapter::scalar_t tcoord_t;
1525  typedef typename Adapter::scalar_t scalar_t;
1526  typedef typename Adapter::lno_t lno_t;
1527 
1528 #endif
1529 
  //RCP<const Environment> env;
  ArrayRCP<part_t> proc_to_task_xadj; // CSR offsets into proc_to_task_adj; the tasks of processor p occupy entries [xadj[p], xadj[p+1]).
  ArrayRCP<part_t> proc_to_task_adj; // task ids, grouped per processor according to proc_to_task_xadj.
  ArrayRCP<part_t> task_to_proc; // for each task, the processor it is mapped to.
  ArrayRCP<part_t> local_task_to_rank; // for each local coordinate, the rank its assigned part/task is mapped to.

  ArrayRCP<part_t> task_communication_xadj; // CSR offsets of the task communication graph.
  ArrayRCP<part_t> task_communication_adj; // neighbor task ids of the task communication graph.
1543 
1544 
1547  void doMapping(int myRank, const Teuchos::RCP <const Teuchos::Comm<int> > comm_){
1548 
1549  if(this->proc_task_comm){
1550  this->proc_task_comm->getMapping(
1551  myRank,
1552  this->env,
1553  this->proc_to_task_xadj, // = allocMemory<part_t> (this->no_procs+1); //holds the pointer to the task array
1554  this->proc_to_task_adj, // = allocMemory<part_t>(this->no_tasks); //holds the indices of tasks wrt to proc_to_task_xadj array.
1555  this->task_to_proc //allocMemory<part_t>(this->no_procs); //holds the processors mapped to tasks.);
1556  ,comm_
1557  );
1558  }
1559  else {
1560  std::cerr << "communicationModel is not specified in the Mapper" << std::endl;
1561  exit(1);
1562  }
1563  }
1564 
1565 
1568  RCP<Comm<int> > create_subCommunicator(){
1569  int procDim = this->proc_task_comm->proc_coord_dim;
1570  int taskDim = this->proc_task_comm->task_coord_dim;
1571 
1572  int taskPerm = z2Fact<int>(procDim); //get the number of different permutations for task dimension ordering
1573  int procPerm = z2Fact<int>(taskDim); //get the number of different permutations for proc dimension ordering
1574  int idealGroupSize = taskPerm * procPerm; //total number of permutations
1575 
1576  idealGroupSize += taskPerm + procPerm + 1; //for the one that does longest dimension partitioning.
1577 
1578  int myRank = this->comm->getRank();
1579  int commSize = this->comm->getSize();
1580 
1581  int myGroupIndex = myRank / idealGroupSize;
1582 
1583  int prevGroupBegin = (myGroupIndex - 1)* idealGroupSize;
1584  if (prevGroupBegin < 0) prevGroupBegin = 0;
1585  int myGroupBegin = myGroupIndex * idealGroupSize;
1586  int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
1587  int nextGroupEnd = (myGroupIndex + 2)* idealGroupSize;
1588 
1589  if (myGroupEnd > commSize){
1590  myGroupBegin = prevGroupBegin;
1591  myGroupEnd = commSize;
1592  }
1593  if (nextGroupEnd > commSize){
1594  myGroupEnd = commSize;
1595  }
1596  int myGroupSize = myGroupEnd - myGroupBegin;
1597 
1598  part_t *myGroup = allocMemory<part_t>(myGroupSize);
1599  for (int i = 0; i < myGroupSize; ++i){
1600  myGroup[i] = myGroupBegin + i;
1601  }
1602  //cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin << " myGroupEnd:" << myGroupEnd << endl;
1603 
1604  ArrayView<const part_t> myGroupView(myGroup, myGroupSize);
1605 
1606  RCP<Comm<int> > subComm = this->comm->createSubcommunicator(myGroupView);
1607  freeArray<part_t>(myGroup);
1608  return subComm;
1609  }
1610 
1611 
    //create the sub group.
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    //calculate cost.
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
    //std::cout << "me:" << this->comm->getRank() << " myCost:" << myCost << std::endl;
    // Pack (cost, rank) pairs so the custom reduction can select the rank
    // owning the minimum cost (ties broken toward the smaller pair).
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    reduceAll<int, double>(*subComm, reduceBest,
        2, localCost, globalCost);

    // Rank within subComm that produced the cheapest mapping.
    int sender = int(globalCost[1]);

    /*
    if ( this->comm->getRank() == 0){
      std::cout << "me:" << localCost[1] <<
          " localcost:" << localCost[0]<<
          " bestcost:" << globalCost[0] <<
          " Sender:" << sender <<
          " procDim" << proc_task_comm->proc_coord_dim <<
          " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
    }
    */
    //cout << "me:" << localCost[1] << " localcost:" << localCost[0]<< " bestcost:" << globalCost[0] << endl;
    //cout << "me:" << localCost[1] << " proc:" << globalCost[1] << endl;
    // Winner broadcasts its mapping arrays to the rest of the group.
    // NOTE(review): proc_to_task_xadj holds nprocs+1 entries but only
    // nprocs are broadcast here -- confirm the final offset is unused or
    // reconstructed on the receiving ranks.
    broadcast (*subComm, sender, this->ntasks, this->task_to_proc.getRawPtr());
    broadcast (*subComm, sender, this->nprocs, this->proc_to_task_xadj.getRawPtr());
    broadcast (*subComm, sender, this->ntasks, this->proc_to_task_adj.getRawPtr());
  }
1648 
1649 
1650 
  //write mapping to gnuPlot code to visualize.
  // Emits one "<i>_mapping.txt" data file per processor -- the processor
  // coordinate followed by the coordinates of its assigned tasks
  // (truncated to the smaller of the proc/task coordinate dimensions) --
  // and a "gnuPlot.plot" driver script that plots every file and draws an
  // arrow from each processor to each of its tasks.
    std::ofstream gnuPlotCode ("gnuPlot.plot", std::ofstream::out);

    int mindim = MINOF(proc_task_comm->proc_coord_dim, proc_task_comm->task_coord_dim);
    std::string ss = "";
    for(part_t i = 0; i < this->nprocs; ++i){

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0){
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile (procFile.c_str(), std::ofstream::out);

      // Write the processor coordinate and start the arrow source string.
      std::string gnuPlotArrow = "set arrow from ";
      for(int j = 0; j < mindim; ++j){
        if (j == mindim - 1){
          inpFile << proc_task_comm->proc_coords[j][i];
          gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);

        }
        else {
          inpFile << proc_task_comm->proc_coords[j][i] << " ";
          gnuPlotArrow += Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]) +",";
        }
      }
      gnuPlotArrow += " to ";


      inpFile << std::endl;
      // One data line and one arrow per task assigned to this processor.
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);
      for(int k = 0; k < a.size(); ++k){
        int j = a[k];
        //cout << "i:" << i << " j:"
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for(int z = 0; z < mindim; ++z){
          if(z == mindim - 1){

            //cout << "z:" << z << " j:" << j << " " << proc_task_comm->task_coords[z][j] << endl;
            inpFile << proc_task_comm->task_coords[z][j];
            gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
          }
          else{
            inpFile << proc_task_comm->task_coords[z][j] << " ";
            gnuPlotArrow2 += Teuchos::toString<float>(proc_task_comm->task_coords[z][j]) +",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << ss;
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();

  }
1712 
1713 
  //write mapping to gnuPlot code to visualize.
  // Writes a per-rank "gnuPlot<rank>.plot" script plus "procPlot.plot"
  // (the processor coordinates) and adds one arrow for every inter-node
  // task communication edge, de-duplicated through a set of coordinate
  // tuples. Only works for 3-dimensional machine coordinates.
  void writeMapping2(int myRank){
    std::string rankStr = Teuchos::toString<int>(myRank);
    std::string gnuPlots = "gnuPlot", extentionS = ".plot";
    std::string outF = gnuPlots + rankStr+ extentionS;
    std::ofstream gnuPlotCode ( outF.c_str(), std::ofstream::out);

    // NOTE(review): the declaration of tmpproc_task_comm (the downcast
    // target assigned from this cast) sits on the preceding source line.
    static_cast <CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t> * > (proc_task_comm);
    //int mindim = MINOF(tmpproc_task_comm->proc_coord_dim, tmpproc_task_comm->task_coord_dim);
    int mindim = tmpproc_task_comm->proc_coord_dim;
    if (mindim != 3) {
      std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
      return;
    }
    std::string ss = "";
    std::string procs = "";

    // Set of already-drawn arrows, keyed by (origin xyz, neighbor xyz),
    // so each node-to-node edge is emitted only once.
    std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
    for(part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank){
      ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
      if (a.size() == 0){
        continue;
      }

      // Record this processor's coordinate and the arrow source string.
      std::string gnuPlotArrow = "set arrow from ";
      for(int j = 0; j < mindim; ++j){
        if (j == mindim - 1){
          gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]);
          procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]);

        }
        else {
          gnuPlotArrow += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank]) +",";
          procs += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][origin_rank])+ " ";
        }
      }
      procs += "\n";

      gnuPlotArrow += " to ";


      // Walk the communication edges of every task on this processor.
      for(int k = 0; k < a.size(); ++k){
        int origin_task = a[k];

        for (int nind = task_communication_xadj[origin_task]; nind < task_communication_xadj[origin_task + 1]; ++nind){
          int neighbor_task = task_communication_adj[nind];

          bool differentnode = false;

          int neighbor_rank = this->getAssignedProcForTask(neighbor_task);

          // Only draw arrows between processors on different coordinates.
          for(int j = 0; j < mindim; ++j){
            if (int (tmpproc_task_comm->proc_coords[j][origin_rank]) != int (tmpproc_task_comm->proc_coords[j][neighbor_rank])){
              differentnode = true; break;
            }
          }
          std::tuple<int,int,int, int, int, int> foo (
              int (tmpproc_task_comm->proc_coords[0][origin_rank]),
              int (tmpproc_task_comm->proc_coords[1][origin_rank]),
              int (tmpproc_task_comm->proc_coords[2][origin_rank]),
              int (tmpproc_task_comm->proc_coords[0][neighbor_rank]),
              int (tmpproc_task_comm->proc_coords[1][neighbor_rank]),
              int (tmpproc_task_comm->proc_coords[2][neighbor_rank]));


          if (differentnode && my_arrows.find(foo) == my_arrows.end()){
            my_arrows.insert(foo);

            std::string gnuPlotArrow2 = "";
            for(int j = 0; j < mindim; ++j){
              if(j == mindim - 1){
                gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][neighbor_rank]);
              }
              else{
                gnuPlotArrow2 += Teuchos::toString<float>(tmpproc_task_comm->proc_coords[j][neighbor_rank]) +",";
              }
            }
            ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
          }
        }
      }

    }


    std::ofstream procFile ("procPlot.plot", std::ofstream::out);
    procFile << procs << "\n";
    procFile.close();

    //gnuPlotCode << ss;
    if(mindim == 2){
      gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
    } else {
      gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
    }

    gnuPlotCode << ss << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
1814 
1815 
// KDD Need to provide access to algorithm for getPartBoxes
#ifdef gnuPlot
  // Writes the part boxes of the partitioning solution to "2d.txt" and a
  // per-rank gnuplot script that plots them, drawing an arrow (style 5)
  // from each communicating part's center to each of its neighbors'
  // centers according to task_communication_xadj/adj.
  // NOTE(review): one parameter (the partitioning solution, used as soln_
  // below) sits on a preceding source line not shown here.
  void writeGnuPlot(
      const Teuchos::Comm<int> *comm_,
      int coordDim,
      tcoord_t **partCenters
  ){
    std::string file = "gggnuPlot";
    std::string exten = ".plot";
    std::ofstream mm("2d.txt");
    file += Teuchos::toString<int>(comm_->getRank()) + exten;
    std::ofstream ff(file.c_str());
    //ff.seekg (0, ff.end);
    std::vector <Zoltan2::coordinateModelPartBox <tcoord_t, part_t> > outPartBoxes = ((Zoltan2::PartitioningSolution<Adapter> *)soln_)->getPartBoxesView();

    // Dump every part box, then plot them in 2D or 3D as appropriate.
    for (part_t i = 0; i < this->ntasks;++i){
      outPartBoxes[i].writeGnuPlot(ff, mm);
    }
    if (coordDim == 2){
      ff << "plot \"2d.txt\"" << std::endl;
      //ff << "\n pause -1" << endl;
    }
    else {
      ff << "splot \"2d.txt\"" << std::endl;
      //ff << "\n pause -1" << endl;
    }
    mm.close();

    // One arrow per directed communication edge, drawn from the
    // neighbor's center to this part's center.
    ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1" << std::endl;
    for (part_t i = 0; i < this->ntasks;++i){
      part_t pb = task_communication_xadj[i];
      part_t pe = task_communication_xadj[i+1];
      for (part_t p = pb; p < pe; ++p){
        part_t n = task_communication_adj[p];

        //cout << "i:" << i << " n:" << n << endl;
        std::string arrowline = "set arrow from ";
        for (int j = 0; j < coordDim - 1; ++j){
          arrowline += Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
        }
        arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim -1][n]) + " to ";


        for (int j = 0; j < coordDim - 1; ++j){
          arrowline += Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
        }
        arrowline += Teuchos::toString<tcoord_t>(partCenters[coordDim -1][i]) + " as 5\n";

        //cout << "arrow:" << arrowline << endl;
        ff << arrowline;
      }
    }

    ff << "replot\n pause -1" << std::endl;
    ff.close();
  }
#endif // gnuPlot
1874 
1875 public:
1876 
1877  void getProcTask(part_t* &proc_to_task_xadj_, part_t* &proc_to_task_adj_){
1878  proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
1879  proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
1880  }
1881 
  // Copies the precomputed mapping into the given MappingSolution.
  // The mapping itself was produced earlier; this only publishes the
  // per-local-element rank array to the solution object.
  virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {

    // Mapping was already computed in the constructor; we need to store it
    // in the solution.
    mappingsoln->setMap_RankForLocalElements(local_task_to_rank);

    // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
    // KDDKDD is not needed once we use MappingSolution to answer queries
    // KDDKDD instead of this algorithm.
    // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
    // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
  }
1894 
1895 
    //freeArray<part_t> (proc_to_task_xadj);
    //freeArray<part_t> (proc_to_task_adj);
    //freeArray<part_t> (task_to_proc);
    // The ArrayRCP members above release themselves; only the
    // communication model is held as a raw pointer and must be deleted
    // here when this mapper created (and therefore owns) it.
    if(this->isOwnerofModel){
      delete this->proc_task_comm;
    }
  }
1904 
      const lno_t num_local_coords,
      const part_t *local_coord_parts,
      const ArrayRCP<part_t> task_to_proc_){
    // Builds local_task_to_rank: for every local coordinate, look up the
    // part it was assigned to and record the rank that part (task) was
    // mapped to.
    //   num_local_coords  - number of local coordinates/elements.
    //   local_coord_parts - part id of each local coordinate.
    //   task_to_proc_     - rank assigned to each part/task.
    local_task_to_rank = ArrayRCP <part_t> (num_local_coords);

    for (lno_t i = 0; i < num_local_coords; ++i){
      part_t local_coord_part = local_coord_parts[i];
      part_t rank_index = task_to_proc_[local_coord_part];
      local_task_to_rank[i] = rank_index;
    }
  }
1917 
1918 
1919 
1931  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
1932  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> > machine_,
1933  const Teuchos::RCP <const Adapter> input_adapter_,
1934  const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> > soln_,
1935  const Teuchos::RCP <const Environment> envConst,
1936  bool is_input_adapter_distributed = true,
1937  int num_ranks_per_node = 1,
1938  bool divide_to_prime_first = false, bool reduce_best_mapping = true):
1939  PartitionMapping<Adapter> (comm_, machine_, input_adapter_, soln_, envConst),
1940  proc_to_task_xadj(0),
1941  proc_to_task_adj(0),
1942  task_to_proc(0),
1943  isOwnerofModel(true),
1944  proc_task_comm(0),
1945  task_communication_xadj(0),
1946  task_communication_adj(0),
1947  task_communication_edge_weight(0){
1948  using namespace Teuchos;
1949  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
1950 
1951  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
1952  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
1953 
1954  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
1955  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
1956  if (!is_input_adapter_distributed){
1957  ia_comm = Teuchos::createSerialComm<int>();
1958  }
1959 
1960  RCP<const Environment> envConst_ = envConst;
1961 
1962  RCP<const ctm_base_adapter_t> baseInputAdapter_ (
1963  rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()), false));
1964 
1965  modelFlag_t coordFlags_, graphFlags_;
1966 
1967  //create coordinate model
1968  //since this is coordinate task mapper,
1969  //the adapter has to have the coordinates
1970  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
1971  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
1972 
1973  //if the adapter has also graph model, we will use graph model
1974  //to calculate the cost mapping.
1975  BaseAdapterType inputType_ = input_adapter_->adapterType();
1976  if (inputType_ == MatrixAdapterType ||
1977  inputType_ == GraphAdapterType ||
1978  inputType_ == MeshAdapterType)
1979  {
1980  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
1981  baseInputAdapter_, envConst_, ia_comm,
1982  graphFlags_));
1983  }
1984 
1985  if (!machine_->hasMachineCoordinates()) {
1986  throw std::runtime_error("Existing machine does not provide coordinates "
1987  "for coordinate task mapping");
1988  }
1989 
1990  //if mapping type is 0 then it is coordinate mapping
1991  int procDim = machine_->getMachineDim();
1992  this->nprocs = machine_->getNumRanks();
1993 
1994  //get processor coordinates.
1995  pcoord_t **procCoordinates = NULL;
1996  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
1997  throw std::runtime_error("Existing machine does not implement "
1998  "getAllMachineCoordinatesView");
1999  }
2000 
2001  //get the machine extent.
2002  //if we have machine extent,
2003  //if the machine has wrap-around links, we would like to shift the coordinates,
2004  //so that the largest hap would be the wrap-around.
2005  std::vector <int> machine_extent_vec (procDim);
2006  //std::vector <bool> machine_extent_wrap_around_vec(procDim, 0);
2007  int *machine_extent = &(machine_extent_vec[0]);
2008  bool *machine_extent_wrap_around = new bool[procDim];
2009  for (int i = 0; i < procDim; ++i)machine_extent_wrap_around[i] = false;
2010  machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2011 
2012 
2013 
2014  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2015  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2016  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2017  // MD: Yes, I ADDED BELOW:
2018  if (machine_->getMachineExtent(machine_extent)) {
2019  procCoordinates =
2020  this->shiftMachineCoordinates (
2021  procDim,
2022  machine_extent,
2023  machine_extent_wrap_around,
2024  this->nprocs,
2025  procCoordinates);
2026  }
2027 
2028 
2029  //get the tasks information, such as coordinate dimension,
2030  //number of parts.
2031  int coordDim = coordinateModel_->getCoordinateDim();
2032 
2033 
2034  this->ntasks = soln_->getActualGlobalNumberOfParts();
2035  if (part_t (soln_->getTargetGlobalNumberOfParts()) > this->ntasks){
2036  this->ntasks = soln_->getTargetGlobalNumberOfParts();
2037  }
2038  this->solution_parts = soln_->getPartListView();
2039 
2040  //we need to calculate the center of parts.
2041  tcoord_t **partCenters = NULL;
2042  partCenters = allocMemory<tcoord_t *>(coordDim);
2043  for (int i = 0; i < coordDim; ++i){
2044  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2045  }
2046 
2047  typedef typename Adapter::scalar_t t_scalar_t;
2048 
2049 
2050  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2051 
2052  //get centers for the parts.
2053  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2054  envConst.getRawPtr(),
2055  ia_comm.getRawPtr(),
2056  coordinateModel_.getRawPtr(),
2057  this->solution_parts,
2058  //soln_->getPartListView();
2059  //this->soln.getRawPtr(),
2060  coordDim,
2061  ntasks,
2062  partCenters);
2063 
2064  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2065 
2066 
2067  //create the part graph
2068  if (graph_model_.getRawPtr() != NULL){
2069  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t> (
2070  envConst.getRawPtr(),
2071  ia_comm.getRawPtr(),
2072  graph_model_.getRawPtr(),
2073  this->ntasks,
2074  this->solution_parts,
2075  //soln_->getPartListView(),
2076  //this->soln.getRawPtr(),
2077  task_communication_xadj,
2078  task_communication_adj,
2079  task_communication_edge_weight
2080  );
2081  }
2082 
2083  //create coordinate communication model.
2084  this->proc_task_comm =
2086  procDim,
2087  procCoordinates,
2088  coordDim,
2089  partCenters,
2090  this->nprocs,
2091  this->ntasks,
2092  machine_extent,
2093  machine_extent_wrap_around,
2094  machine_.getRawPtr());
2095 
2096  int myRank = comm_->getRank();
2097  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node ;
2098  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2099 
2100 
2101  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2102  this->doMapping(myRank, comm_);
2103  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2104 
2105 
2106  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2107 
2108  /*soln_->getCommunicationGraph(task_communication_xadj,
2109  task_communication_adj);
2110  */
2111 
2112  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2113  #ifdef gnuPlot1
2114  if (comm_->getRank() == 0){
2115 
2116  part_t taskCommCount = task_communication_xadj.size();
2117  std::cout << " TotalComm:" << task_communication_xadj[taskCommCount] << std::endl;
2118  part_t maxN = task_communication_xadj[0];
2119  for (part_t i = 1; i <= taskCommCount; ++i){
2120  part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
2121  if (maxN < nc) maxN = nc;
2122  }
2123  std::cout << " maxNeighbor:" << maxN << std::endl;
2124  }
2125 
2126  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2127  #endif
2128 
2129  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2130 
2131  if (reduce_best_mapping && task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2132  this->proc_task_comm->calculateCommunicationCost(
2133  task_to_proc.getRawPtr(),
2134  task_communication_xadj.getRawPtr(),
2135  task_communication_adj.getRawPtr(),
2136  task_communication_edge_weight.getRawPtr()
2137  );
2138  }
2139 
2140  //std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2141 
2142  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2143 
2144  //processors are divided into groups of size procDim! * coordDim!
2145  //each processor in the group obtains a mapping with a different rotation
2146  //and best one is broadcasted all processors.
2147  this->getBestMapping();
2148  this->create_local_task_to_rank(
2149  coordinateModel_->getLocalNumCoordinates(),
2150  this->solution_parts,
2151  this->task_to_proc);
2152  /*
2153  {
2154  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr())
2155  this->proc_task_comm->calculateCommunicationCost(
2156  task_to_proc.getRawPtr(),
2157  task_communication_xadj.getRawPtr(),
2158  task_communication_adj.getRawPtr(),
2159  task_communication_edge_weight.getRawPtr()
2160  );
2161  std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2162  }
2163  */
2164 
2165 
2166  #ifdef gnuPlot
2167  this->writeMapping2(comm_->getRank());
2168  #endif
2169 
2170  delete []machine_extent_wrap_around;
2171  if (machine_->getMachineExtent(machine_extent)){
2172  for (int i = 0; i < procDim; ++i){
2173  delete [] procCoordinates[i];
2174  }
2175  delete [] procCoordinates;
2176  }
2177 
2178  for (int i = 0; i < coordDim; ++i){
2179  freeArray<tcoord_t>(partCenters[i]);
2180  }
2181  freeArray<tcoord_t *>(partCenters);
2182 
2183  }
2184 
2185 
2197  const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2198  const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> > machine_,
2199  const Teuchos::RCP <const Adapter> input_adapter_,
2200  const part_t num_parts_,
2201  const part_t *result_parts,
2202  const Teuchos::RCP <const Environment> envConst,
2203  bool is_input_adapter_distributed = true,
2204  int num_ranks_per_node = 1,
2205  bool divide_to_prime_first = false, bool reduce_best_mapping = true):
2206  PartitionMapping<Adapter> (comm_, machine_, input_adapter_, num_parts_, result_parts, envConst),
2207  proc_to_task_xadj(0),
2208  proc_to_task_adj(0),
2209  task_to_proc(0),
2210  isOwnerofModel(true),
2211  proc_task_comm(0),
2212  task_communication_xadj(0),
2213  task_communication_adj(0),
2214  task_communication_edge_weight(0){
2215  using namespace Teuchos;
2216  typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2217 
2218  RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2219  RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2220 
2221  RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2222  RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2223  if (!is_input_adapter_distributed){
2224  ia_comm = Teuchos::createSerialComm<int>();
2225  }
2226  RCP<const Environment> envConst_ = envConst;
2227 
2228  RCP<const ctm_base_adapter_t> baseInputAdapter_ (
2229  rcp(dynamic_cast<const ctm_base_adapter_t *>(input_adapter_.getRawPtr()), false));
2230 
2231  modelFlag_t coordFlags_, graphFlags_;
2232 
2233  //create coordinate model
2234  //since this is coordinate task mapper,
2235  //the adapter has to have the coordinates
2236  coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2237  baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2238 
2239  //if the adapter has also graph model, we will use graph model
2240  //to calculate the cost mapping.
2241  BaseAdapterType inputType_ = input_adapter_->adapterType();
2242  if (inputType_ == MatrixAdapterType ||
2243  inputType_ == GraphAdapterType ||
2244  inputType_ == MeshAdapterType)
2245  {
2246  graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2247  baseInputAdapter_, envConst_, ia_comm,
2248  graphFlags_));
2249  }
2250 
2251  if (!machine_->hasMachineCoordinates()) {
2252  throw std::runtime_error("Existing machine does not provide coordinates "
2253  "for coordinate task mapping");
2254  }
2255 
2256  //if mapping type is 0 then it is coordinate mapping
2257  int procDim = machine_->getMachineDim();
2258  this->nprocs = machine_->getNumRanks();
2259 
2260  //get processor coordinates.
2261  pcoord_t **procCoordinates = NULL;
2262  if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2263  throw std::runtime_error("Existing machine does not implement "
2264  "getAllMachineCoordinatesView");
2265  }
2266 
2267  //get the machine extent.
2268  //if we have machine extent,
2269  //if the machine has wrap-around links, we would like to shift the coordinates,
2270  //so that the largest hap would be the wrap-around.
2271  std::vector <int> machine_extent_vec (procDim);
2272  //std::vector <bool> machine_extent_wrap_around_vec(procDim, 0);
2273  int *machine_extent = &(machine_extent_vec[0]);
2274  bool *machine_extent_wrap_around = new bool[procDim];
2275  machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2276 
2277 
2278 
2279  // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2280  // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2281  // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2282  // MD: Yes, I ADDED BELOW:
2283  if (machine_->getMachineExtent(machine_extent)) {
2284  procCoordinates =
2285  this->shiftMachineCoordinates (
2286  procDim,
2287  machine_extent,
2288  machine_extent_wrap_around,
2289  this->nprocs,
2290  procCoordinates);
2291  }
2292 
2293 
2294  //get the tasks information, such as coordinate dimension,
2295  //number of parts.
2296  int coordDim = coordinateModel_->getCoordinateDim();
2297 
2298 
2299  this->ntasks = num_parts_;
2300  this->solution_parts = result_parts;
2301 
2302  //we need to calculate the center of parts.
2303  tcoord_t **partCenters = NULL;
2304  partCenters = allocMemory<tcoord_t *>(coordDim);
2305  for (int i = 0; i < coordDim; ++i){
2306  partCenters[i] = allocMemory<tcoord_t>(this->ntasks);
2307  }
2308 
2309  typedef typename Adapter::scalar_t t_scalar_t;
2310 
2311 
2312  envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2313 
2314  //get centers for the parts.
2315  getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2316  envConst.getRawPtr(),
2317  ia_comm.getRawPtr(),
2318  coordinateModel_.getRawPtr(),
2319  this->solution_parts,
2320  //soln_->getPartListView();
2321  //this->soln.getRawPtr(),
2322  coordDim,
2323  ntasks,
2324  partCenters);
2325 
2326  envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2327 
2328  envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2329  //create the part graph
2330  if (graph_model_.getRawPtr() != NULL){
2331  getCoarsenedPartGraph<Adapter, t_scalar_t, part_t> (
2332  envConst.getRawPtr(),
2333  ia_comm.getRawPtr(),
2334  graph_model_.getRawPtr(),
2335  this->ntasks,
2336  this->solution_parts,
2337  //soln_->getPartListView(),
2338  //this->soln.getRawPtr(),
2339  task_communication_xadj,
2340  task_communication_adj,
2341  task_communication_edge_weight
2342  );
2343  }
2344  envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2345 
2346 
2347  envConst->timerStart(MACRO_TIMERS, "CoordinateCommunicationModel Create");
2348  //create coordinate communication model.
2349  this->proc_task_comm =
2351  procDim,
2352  procCoordinates,
2353  coordDim,
2354  partCenters,
2355  this->nprocs,
2356  this->ntasks,
2357  machine_extent,
2358  machine_extent_wrap_around,
2359  machine_.getRawPtr());
2360 
2361  envConst->timerStop(MACRO_TIMERS, "CoordinateCommunicationModel Create");
2362 
2363 
2364  this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2365  this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2366 
2367  int myRank = comm_->getRank();
2368 
2369 
2370  envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2371  this->doMapping(myRank, comm_);
2372  envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2373 
2374 
2375  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2376 
2377  /*soln_->getCommunicationGraph(task_communication_xadj,
2378  task_communication_adj);
2379  */
2380 
2381  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2382  #ifdef gnuPlot1
2383  if (comm_->getRank() == 0){
2384 
2385  part_t taskCommCount = task_communication_xadj.size();
2386  std::cout << " TotalComm:" << task_communication_xadj[taskCommCount] << std::endl;
2387  part_t maxN = task_communication_xadj[0];
2388  for (part_t i = 1; i <= taskCommCount; ++i){
2389  part_t nc = task_communication_xadj[i] - task_communication_xadj[i-1];
2390  if (maxN < nc) maxN = nc;
2391  }
2392  std::cout << " maxNeighbor:" << maxN << std::endl;
2393  }
2394 
2395  this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2396  #endif
2397 
2398  envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2399 
2400  if (reduce_best_mapping && task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
2401  this->proc_task_comm->calculateCommunicationCost(
2402  task_to_proc.getRawPtr(),
2403  task_communication_xadj.getRawPtr(),
2404  task_communication_adj.getRawPtr(),
2405  task_communication_edge_weight.getRawPtr()
2406  );
2407  }
2408 
2409  //std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2410 
2411  envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2412 
2413  //processors are divided into groups of size procDim! * coordDim!
2414  //each processor in the group obtains a mapping with a different rotation
2415  //and best one is broadcasted all processors.
2416  this->getBestMapping();
2417 
2418  this->create_local_task_to_rank(
2419  coordinateModel_->getLocalNumCoordinates(),
2420  this->solution_parts,
2421  this->task_to_proc);
2422  /*
2423 
2424  {
2425  if (task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr())
2426  this->proc_task_comm->calculateCommunicationCost(
2427  task_to_proc.getRawPtr(),
2428  task_communication_xadj.getRawPtr(),
2429  task_communication_adj.getRawPtr(),
2430  task_communication_edge_weight.getRawPtr()
2431  );
2432  std::cout << "me: " << comm_->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2433  }
2434  */
2435 
2436 
2437 
2438  #ifdef gnuPlot
2439  this->writeMapping2(comm_->getRank());
2440  #endif
2441 
2442  delete []machine_extent_wrap_around;
2443  if (machine_->getMachineExtent(machine_extent)){
2444  for (int i = 0; i < procDim; ++i){
2445  delete [] procCoordinates[i];
2446  }
2447  delete [] procCoordinates;
2448  }
2449 
2450  for (int i = 0; i < coordDim; ++i){
2451  freeArray<tcoord_t>(partCenters[i]);
2452  }
2453  freeArray<tcoord_t *>(partCenters);
2454 
2455  }
2456 
  /*! \brief Raw-array constructor: maps num_tasks task coordinates onto
   *  num_processors machine coordinates without any adapter/model layer.
   *
   *  NOTE(review): the doc extraction stripped the signature line and the
   *  "new CoordinateCommunicationModel" line; both restored from the
   *  Zoltan2 upstream source.
   *
   *  \param env_const_ environment (timers, parameters).
   *  \param problemComm communicator for the whole mapping problem.
   *  \param proc_dim dimension of the machine coordinates.
   *  \param num_processors number of ranks to map onto.
   *  \param machine_coords proc_dim arrays of num_processors coordinates.
   *  \param task_dim dimension of the task coordinates.
   *  \param num_tasks number of tasks to map.
   *  \param task_coords task_dim arrays of num_tasks coordinates.
   *  \param task_comm_xadj,task_comm_adj optional CSR task graph.
   *  \param task_communication_edge_weight_ optional edge weights, same
   *         size as task_comm_adj.
   *  \param recursion_depth,part_no_array forwarded to the MJ partitioner.
   *  \param machine_dimensions if non-NULL, machine extents used to shift
   *         coordinates for wrap-around links (all dims assumed toroidal).
   *  \param num_ranks_per_node,divide_to_prime_first mapper hints.
   *  \param reduce_best_mapping if true, evaluate the cost and reduce to
   *         the best mapping over rotation groups.
   */
  CoordinateTaskMapper(
      const Environment *env_const_,
      const Teuchos::Comm<int> *problemComm,
      int proc_dim,
      int num_processors,
      pcoord_t **machine_coords,

      int task_dim,
      part_t num_tasks,
      tcoord_t **task_coords,
      ArrayRCP<part_t>task_comm_xadj,
      ArrayRCP<part_t>task_comm_adj,
      pcoord_t *task_communication_edge_weight_,
      int recursion_depth,
      part_t *part_no_array,
      const part_t *machine_dimensions,
      int num_ranks_per_node = 1,
      bool divide_to_prime_first = false, bool reduce_best_mapping = true
  ): PartitionMapping<Adapter>(
      Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
      Teuchos::rcpFromRef<const Environment> (*env_const_)),
    proc_to_task_xadj(0),
    proc_to_task_adj(0),
    task_to_proc(0),
    isOwnerofModel(true),
    proc_task_comm(0),
    task_communication_xadj(task_comm_xadj),
    task_communication_adj(task_comm_adj){

    // If mapping type is 0 then it is coordinate mapping.
    pcoord_t ** virtual_machine_coordinates = machine_coords;
    // No machine info here, so every dimension is assumed wrap-around.
    bool *wrap_arounds = new bool [proc_dim];
    for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;

    if (machine_dimensions){
      // Returns a freshly allocated, shifted copy of the coordinates.
      virtual_machine_coordinates =
        this->shiftMachineCoordinates (
          proc_dim,
          machine_dimensions,
          wrap_arounds,
          num_processors,
          machine_coords);
    }

    this->nprocs = num_processors;

    int coordDim = task_dim;
    this->ntasks = num_tasks;

    // Task coordinates are used directly as the "part centers".
    tcoord_t **partCenters = task_coords;

    // Create the coordinate communication model (no machine extent:
    // the coordinates were already shifted above if needed).
    // NOTE(review): the "new ..." line below was restored; it was stripped
    // by the doc extraction.
    this->proc_task_comm =
      new CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t>(
        proc_dim,
        virtual_machine_coordinates,
        coordDim,
        partCenters,
        this->nprocs,
        this->ntasks, NULL, NULL
        );

    this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
    this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;

    this->proc_task_comm->setPartArraySize(recursion_depth);
    this->proc_task_comm->setPartArray(part_no_array);

    int myRank = problemComm->getRank();

    this->doMapping(myRank, this->comm);
#ifdef gnuPlot
    this->writeMapping2(myRank);
#endif

    // Only evaluate/reduce when a task graph was provided.
    if (reduce_best_mapping && task_communication_xadj.getRawPtr() && task_communication_adj.getRawPtr()){
      this->proc_task_comm->calculateCommunicationCost(
        task_to_proc.getRawPtr(),
        task_communication_xadj.getRawPtr(),
        task_communication_adj.getRawPtr(),
        task_communication_edge_weight_
        );


      this->getBestMapping();

/*
      if (myRank == 0){
        this->proc_task_comm->calculateCommunicationCost(
          task_to_proc.getRawPtr(),
          task_communication_xadj.getRawPtr(),
          task_communication_adj.getRawPtr(),
          task_communication_edge_weight_
          );
        cout << "me: " << problemComm->getRank() << " cost:" << this->proc_task_comm->getCommunicationCostMetric() << endl;
      }
*/

    }
    delete [] wrap_arounds;

    // Free the shifted copy allocated by shiftMachineCoordinates.
    if (machine_dimensions){
      for (int i = 0; i < proc_dim; ++i){
        delete [] virtual_machine_coordinates[i];
      }
      delete [] virtual_machine_coordinates;
    }
#ifdef gnuPlot
    if(problemComm->getRank() == 0)
      this->writeMapping2(-1);
#endif
  }
2616 
2617 
2618  /*
2619  double getCommunicationCostMetric(){
2620  return this->proc_task_comm->getCommCost();
2621  }
2622  */
2623 
  /*! \brief Returns the number of parts to be assigned to this process.
   *  The task mapper does not create parts of its own, so this is always 0.
   */
  virtual size_t getLocalNumberOfParts() const{
    return 0;
  }
2629 
  /*! \brief Using the machine dimensions and wrap-around flags, shift the
   *  processor coordinates so that, along each dimension, the largest empty
   *  gap between occupied coordinates becomes the wrap-around point. This
   *  makes Euclidean distances on the shifted coordinates reflect hop
   *  distances on a torus.
   *
   *  NOTE(review): the signature line was stripped by the doc extraction;
   *  restored from the Zoltan2 upstream source (return/argument types match
   *  the call sites in the constructors above).
   *
   *  \param machine_dim dimension of the machine coordinates.
   *  \param machine_dimensions extent (size) of each machine dimension.
   *  \param machine_extent_wrap_around per-dimension wrap-around flags.
   *  \param numProcs number of ranks (length of each coordinate array).
   *  \param mCoords input coordinates, machine_dim arrays of numProcs.
   *  \return freshly allocated machine_dim x numProcs array of shifted
   *          coordinates; the caller owns and must delete[] it.
   */
  pcoord_t **shiftMachineCoordinates(
      int machine_dim,
      const part_t *machine_dimensions,
      bool *machine_extent_wrap_around,
      part_t numProcs,
      pcoord_t **mCoords){
    pcoord_t **result_machine_coords = NULL;
    result_machine_coords = new pcoord_t*[machine_dim];
    for (int i = 0; i < machine_dim; ++i){
      result_machine_coords[i] = new pcoord_t [numProcs];
    }

    for (int i = 0; i < machine_dim; ++i){
      part_t numMachinesAlongDim = machine_dimensions[i];
      // machineCounts[c] = number of ranks whose i-th coordinate is c.
      part_t *machineCounts= new part_t[numMachinesAlongDim];
      memset(machineCounts, 0, sizeof(part_t) *numMachinesAlongDim);

      // Sorted list of coordinates that are actually occupied.
      int *filledCoordinates= new int[numMachinesAlongDim];

      pcoord_t *coords = mCoords[i];
      for(part_t j = 0; j < numProcs; ++j){
        part_t mc = (part_t) coords[j];
        ++machineCounts[mc];
      }

      part_t filledCoordinateCount = 0;
      for(part_t j = 0; j < numMachinesAlongDim; ++j){
        if (machineCounts[j] > 0){
          filledCoordinates[filledCoordinateCount++] = j;
        }
      }

      part_t firstProcCoord = filledCoordinates[0];
      part_t firstProcCount = machineCounts[firstProcCoord];

      part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
      part_t lastProcCount = machineCounts[lastProcCoord];

      // Gap that already wraps around (from the last occupied coordinate,
      // through the boundary, to the first occupied one).
      part_t firstLastGap = numMachinesAlongDim - lastProcCoord + firstProcCoord;
      part_t firstLastGapProc = lastProcCount + firstProcCount;

      part_t leftSideProcCoord = firstProcCoord;
      part_t leftSideProcCount = firstProcCount;
      part_t biggestGap = 0;
      part_t biggestGapProc = numProcs;

      // Scan consecutive occupied coordinates for the largest interior gap.
      part_t shiftBorderCoordinate = -1;
      for(part_t j = 1; j < filledCoordinateCount; ++j){
        part_t rightSideProcCoord= filledCoordinates[j];
        part_t rightSideProcCount = machineCounts[rightSideProcCoord];

        part_t gap = rightSideProcCoord - leftSideProcCoord;
        part_t gapProc = rightSideProcCount + leftSideProcCount;

        /* Pick the largest gap in this dimension. Use fewer process on either side
           of the largest gap to break the tie. An easy addition to this would
           be to weight the gap by the number of processes. */
        if (gap > biggestGap || (gap == biggestGap && biggestGapProc > gapProc)){
          shiftBorderCoordinate = rightSideProcCoord;
          biggestGapProc = gapProc;
          biggestGap = gap;
        }
        leftSideProcCoord = rightSideProcCoord;
        leftSideProcCount = rightSideProcCount;
      }

      // If the existing boundary gap is at least as good, do not shift.
      if (!(biggestGap > firstLastGap || (biggestGap == firstLastGap && biggestGapProc < firstLastGapProc))){
        shiftBorderCoordinate = -1;
      }

      for(part_t j = 0; j < numProcs; ++j){
        // Coordinates left of the chosen border move past the end, turning
        // the chosen interior gap into the new wrap-around.
        if (machine_extent_wrap_around[i] && coords[j] < shiftBorderCoordinate){
          result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;

        }
        else {
          result_machine_coords[i][j] = coords[j];
        }
      }
      delete [] machineCounts;
      delete [] filledCoordinates;
    }

    return result_machine_coords;

  }
2727 
2734  virtual void getProcsForPart(part_t taskId, part_t &numProcs, part_t *&procs) const{
2735  numProcs = 1;
2736  procs = this->task_to_proc.getRawPtr() + taskId;
2737  }
  /*! \brief Returns the rank to which the given task was assigned.
   *  NOTE(review): the signature line was stripped by the doc extraction;
   *  restored from the member summary ("part_t getAssignedProcForTask(part_t taskId)").
   */
  part_t getAssignedProcForTask(part_t taskId){
    return this->task_to_proc[taskId];
  }
2750  virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const{
2751 
2752  part_t task_begin = this->proc_to_task_xadj[procId];
2753  part_t taskend = this->proc_to_task_xadj[procId+1];
2754  parts = this->proc_to_task_adj.getRawPtr() + task_begin;
2755  numParts = taskend - task_begin;
2756  }
2757 
2758  ArrayView<part_t> getAssignedTasksForProc(part_t procId){
2759  part_t task_begin = this->proc_to_task_xadj[procId];
2760  part_t taskend = this->proc_to_task_xadj[procId+1];
2761 
2762  /*
2763  cout << "part_t:" << procId << " taskCount:" << taskend - task_begin << endl;
2764  for(part_t i = task_begin; i < taskend; ++i){
2765  cout << "part_t:" << procId << " task:" << proc_to_task_adj[i] << endl;
2766  }
2767  */
2768  if (taskend - task_begin > 0){
2769  ArrayView <part_t> assignedParts(this->proc_to_task_adj.getRawPtr() + task_begin, taskend - task_begin);
2770  return assignedParts;
2771  }
2772  else {
2773  ArrayView <part_t> assignedParts;
2774  return assignedParts;
2775  }
2776  }
2777 
2778 };
2779 
2780 
2781 
/*! \brief Plain-array convenience interface to CoordinateTaskMapper:
 *  builds the mapper from raw coordinate/graph arrays and copies the
 *  resulting rank-to-task CSR into the caller's output arrays.
 *
 *  NOTE(review): the doc extraction stripped the function-name line and
 *  the "CoordinateTaskMapper ... *ctm = new ..." lines; restored from the
 *  Zoltan2 upstream source.
 *
 *  \param proc_to_task_xadj [out] caller-allocated, num_processors+1 entries.
 *  \param proc_to_task_adj [out] caller-allocated, num_tasks entries.
 *  Remaining parameters are forwarded to the raw-array constructor.
 */
template <typename part_t, typename pcoord_t, typename tcoord_t>
void coordinateTaskMapperInterface(
    RCP<const Teuchos::Comm<int> > problemComm,
    int proc_dim,
    int num_processors,
    pcoord_t **machine_coords,

    int task_dim,
    part_t num_tasks,
    tcoord_t **task_coords,
    part_t *task_comm_xadj,
    part_t *task_comm_adj,
    pcoord_t *task_communication_edge_weight_, /*float-like, same size with task_communication_adj_ weight of the corresponding edge.*/
    part_t *proc_to_task_xadj, /*output*/
    part_t *proc_to_task_adj, /*output*/
    int recursion_depth,
    part_t *part_no_array,
    const part_t *machine_dimensions,
    int num_ranks_per_node = 1,
    bool divide_to_prime_first = false
)
{

  const Environment *envConst_ = new Environment(problemComm);

  // mfh 03 Mar 2015: It's OK to omit the Node template
  // parameter in Tpetra, if you're just going to use the
  // default Node.
  typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;

  // Non-owning wrappers around the caller's CSR arrays.
  Teuchos::ArrayRCP<part_t> task_communication_xadj (task_comm_xadj, 0, num_tasks+1, false);

  Teuchos::ArrayRCP<part_t> task_communication_adj;
  if (task_comm_xadj){
    Teuchos::ArrayRCP<part_t> tmp_task_communication_adj (task_comm_adj, 0, task_comm_xadj[num_tasks], false);
    task_communication_adj = tmp_task_communication_adj;
  }

  // NOTE(review): the next two lines were restored; stripped by extraction.
  CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
    new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
      envConst_,
      problemComm.getRawPtr(),
      proc_dim,
      num_processors,
      machine_coords,//machine_coords_,

      task_dim,
      num_tasks,
      task_coords,

      task_communication_xadj,
      task_communication_adj,
      task_communication_edge_weight_,
      recursion_depth,
      part_no_array,
      machine_dimensions,
      num_ranks_per_node,
      divide_to_prime_first
      );

  // Copy the computed CSR mapping into the caller's output arrays.
  part_t* proc_to_task_xadj_;
  part_t* proc_to_task_adj_;

  ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);

  for (part_t i = 0; i <= num_processors; ++i){
    proc_to_task_xadj[i] = proc_to_task_xadj_[i];
  }
  for (part_t i = 0; i < num_tasks; ++i){
    proc_to_task_adj[i] = proc_to_task_adj_[i];
  }
  delete ctm;
  delete envConst_;

}
2926 
2927 template <typename proc_coord_t, typename v_lno_t>
2928 inline void visualize_mapping(int myRank,
2929  const int machine_coord_dim, const int num_ranks, proc_coord_t **machine_coords,
2930  const v_lno_t num_tasks, const v_lno_t *task_communication_xadj, const v_lno_t *task_communication_adj, const int *task_to_rank){
2931 
2932  std::string rankStr = Teuchos::toString<int>(myRank);
2933  std::string gnuPlots = "gnuPlot", extentionS = ".plot";
2934  std::string outF = gnuPlots + rankStr+ extentionS;
2935  std::ofstream gnuPlotCode ( outF.c_str(), std::ofstream::out);
2936 
2937  if (machine_coord_dim != 3) {
2938  std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
2939  return;
2940  }
2941  std::string ss = "";
2942  std::string procs = "";
2943 
2944  std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2945  for(v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task){
2946  int origin_rank = task_to_rank[origin_task];
2947  std::string gnuPlotArrow = "set arrow from ";
2948 
2949  for(int j = 0; j < machine_coord_dim; ++j){
2950  if (j == machine_coord_dim - 1){
2951  gnuPlotArrow += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
2952  procs += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
2953 
2954  }
2955  else {
2956  gnuPlotArrow += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]) +",";
2957  procs += Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])+ " ";
2958  }
2959  }
2960  procs += "\n";
2961 
2962  gnuPlotArrow += " to ";
2963 
2964 
2965  for (int nind = task_communication_xadj[origin_task]; nind < task_communication_xadj[origin_task + 1]; ++nind){
2966  int neighbor_task = task_communication_adj[nind];
2967 
2968  bool differentnode = false;
2969  int neighbor_rank = task_to_rank[neighbor_task];
2970 
2971  for(int j = 0; j < machine_coord_dim; ++j){
2972  if (int (machine_coords[j][origin_rank]) != int (machine_coords[j][neighbor_rank])){
2973  differentnode = true; break;
2974  }
2975  }
2976  std::tuple<int,int,int, int, int, int> foo (
2977  (int) (machine_coords[0][origin_rank]),
2978  (int) (machine_coords[1][origin_rank]),
2979  (int) (machine_coords[2][origin_rank]),
2980  (int) (machine_coords[0][neighbor_rank]),
2981  (int) (machine_coords[1][neighbor_rank]),
2982  (int) (machine_coords[2][neighbor_rank]));
2983 
2984 
2985  if (differentnode && my_arrows.find(foo) == my_arrows.end()){
2986  my_arrows.insert(foo);
2987 
2988  std::string gnuPlotArrow2 = "";
2989  for(int j = 0; j < machine_coord_dim; ++j){
2990  if(j == machine_coord_dim - 1){
2991  gnuPlotArrow2 += Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
2992  }
2993  else{
2994  gnuPlotArrow2 += Teuchos::toString<float>(machine_coords[j][neighbor_rank]) +",";
2995  }
2996  }
2997  ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2998  }
2999  }
3000  }
3001 
3002  std::ofstream procFile ("procPlot.plot", std::ofstream::out);
3003  procFile << procs << "\n";
3004  procFile.close();
3005 
3006  //gnuPlotCode << ss;
3007  if(machine_coord_dim == 2){
3008  gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3009  } else {
3010  gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3011  }
3012 
3013  gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
3014  gnuPlotCode.close();
3015 }
3016 
3017 }// namespace Zoltan2
3018 
3019 #endif
void setParams(int dimension_, int heapsize)
CommunicationModel Base Class that performs mapping between the coordinate partitioning result...
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
void getBestMapping()
finds the lowest cost mapping and broadcasts solution to everyone.
Tpetra::CrsMatrix< zscalar_t, zlno_t, zgno_t, znode_t > tcrsMatrix_t
Definition: GraphModel.cpp:84
void ithPermutation(const IT n, IT i, IT *perm)
CommunicationModel(part_t no_procs_, part_t no_tasks_)
CoordinateTaskMapper(const Teuchos::RCP< const Teuchos::Comm< int > > comm_, const Teuchos::RCP< const MachineRepresentation< pcoord_t, part_t > > machine_, const Teuchos::RCP< const Adapter > input_adapter_, const part_t num_parts_, const part_t *result_parts, const Teuchos::RCP< const Environment > envConst, bool is_input_adapter_distributed=true, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor. Instead of Solution we have two parameters, numparts When this constructor is called...
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
Time an algorithm (or other entity) as a whole.
WT getDistance(IT index, WT **elementCoords)
virtual size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
bool getHopCount(int rank1, int rank2, pcoord_t &hops) const
CoordinateCommunicationModel(int pcoord_dim_, pcoord_t **pcoords_, int tcoord_dim_, tcoord_t **tcoords_, part_t no_procs_, part_t no_tasks_, int *machine_extent_, bool *machine_extent_wrap_around_, const MachineRepresentation< pcoord_t, part_t > *machine_=NULL)
Class Constructor:
void getCoarsenedPartGraph(const Environment *envConst, const Teuchos::Comm< int > *comm, const Zoltan2::GraphModel< typename Adapter::base_adapter_t > *graph, part_t np, const part_t *parts, ArrayRCP< part_t > &g_part_xadj, ArrayRCP< part_t > &g_part_adj, ArrayRCP< scalar_t > &g_part_ew)
void getClosestSubset(part_t *proc_permutation, part_t nprocs, part_t ntasks) const
Function is called whenever nprocs > no_task. Function returns only the subset of processors that are...
void visualize_mapping(int myRank, const int machine_coord_dim, const int num_ranks, proc_coord_t **machine_coords, const v_lno_t num_tasks, const v_lno_t *task_communication_xadj, const v_lno_t *task_communication_adj, const int *task_to_rank)
PartitionMapping maps a solution or an input distribution to ranks.
void getSolutionCenterCoordinates(const Environment *envConst, const Teuchos::Comm< int > *comm, const Zoltan2::CoordinateModel< typename Adapter::base_adapter_t > *coords, const part_t *parts, int coordDim, part_t ntasks, scalar_t **partCenters)
size_t getLocalNumVertices() const
Returns the number vertices on this process.
bool getNewCenters(WT **coords)
void timerStop(TimerType tt, const char *timerName) const
Stop a named timer.
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
size_t getLocalNumEdges() const
Returns the number of edges on this process. In global or subset graphs, includes off-process edges...
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, mj_scalar_t **mj_coordinates, mj_lno_t *initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth, const mj_part_t *part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_)
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void calculateCommunicationCost(part_t *task_to_proc, part_t *task_communication_xadj, part_t *task_communication_adj, pcoord_t *task_communication_edge_weight)
ArrayView< part_t > getAssignedTasksForProc(part_t procId)
const MachineRepresentation< pcoord_t, part_t > * machine
size_t getEdgeList(ArrayView< const gno_t > &edgeIds, ArrayView< const lno_t > &offsets, ArrayView< input_t > &wgts) const
Sets pointers to this process' edge (neighbor) global Ids, including off-process edges.
part_t getAssignedProcForTask(part_t taskId)
getAssignedProcForTask function, returns the assigned processor id for the given task ...
void getMinDistanceCluster(IT *procPermutation)
void doMapping(int myRank, const Teuchos::RCP< const Teuchos::Comm< int > > comm_)
doMapping function, calls getMapping function of communicationModel object.
virtual void getPartsForProc(int procId, part_t &numParts, part_t *&parts) const
getPartsForProc function, returns the parts assigned to the given process with the number of parts.
Defines the XpetraMultiVectorAdapter.
SparseMatrixAdapter_t::part_t part_t
Multi Jagged coordinate partitioning algorithm.
virtual void getProcsForPart(part_t taskId, part_t &numProcs, part_t *&procs) const
getProcsForPart function, returns the processors assigned to the given task with the number of processors.
void create_local_task_to_rank(const lno_t num_local_coords, const part_t *local_coord_parts, const ArrayRCP< part_t > task_to_proc_)
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
virtual void map(const RCP< MappingSolution< Adapter > > &mappingsoln)
Mapping method.
int getNumWeightsPerEdge() const
Returns the number (0 or greater) of weights per edge.
Contains the Multi-jagged algorithm.
PartitionMapping maps a solution or an input distribution to ranks.
void getProcTask(part_t *&proc_to_task_xadj_, part_t *&proc_to_task_adj_)
dictionary vals
Definition: xml2dox.py:186
A PartitioningSolution is a solution to a partitioning problem.
#define ZOLTAN2_ABS(x)
size_t getCoordinates(ArrayView< const gno_t > &Ids, ArrayView< input_t > &xyz, ArrayView< input_t > &wgts) const
Returns the coordinate ids, values and optional weights.
void copyCoordinates(IT *permutation)
void getGridCommunicationGraph(part_t taskCount, part_t *&task_comm_xadj, part_t *&task_comm_adj, std::vector< int > grid_dims)
Zoltan2_ReduceBestMapping Class, reduces the minimum cost mapping, breaks ties with minimum proc id...
Adapter::part_t part_t
KMeansCluster Class.
BaseAdapterType
An enum to identify general types of adapters.
virtual void getMapping(int myRank, const RCP< const Environment > &env, ArrayRCP< part_t > &rcp_proc_to_task_xadj, ArrayRCP< part_t > &rcp_proc_to_task_adj, ArrayRCP< part_t > &rcp_task_to_proc, const Teuchos::RCP< const Teuchos::Comm< int > > comm_) const
getMapping function; computes the assignment between tasks and processors, filling the proc_to_task and task_to_proc arrays.
The StridedData class manages lists of weights or coordinates.
CoordinateTaskMapper(const Environment *env_const_, const Teuchos::Comm< int > *problemComm, int proc_dim, int num_processors, pcoord_t **machine_coords, int task_dim, part_t num_tasks, tcoord_t **task_coords, ArrayRCP< part_t >task_comm_xadj, ArrayRCP< part_t >task_comm_adj, pcoord_t *task_communication_edge_weight_, int recursion_depth, part_t *part_no_array, const part_t *machine_dimensions, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor The mapping constructor which will also perform the mapping operation. The result mapping can be obtained by --getAssignedProcForTask function: which returns the assigned processor id for the given task --getPartsForProc: which returns the assigned tasks with the number of tasks.
The user parameters, debug, timing and memory profiling output objects, and error checking methods...
static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
void addPoint(IT index, WT distance)
bool getNewCenters(WT *center, WT **coords, int dimension)
MachineRepresentation Class Base class for representing machine coordinates, networks, etc.
size_t getVertexList(ArrayView< const gno_t > &Ids, ArrayView< input_t > &wgts) const
Sets pointers to this process' vertex Ids and their weights.
CoordinateTaskMapper(const Teuchos::RCP< const Teuchos::Comm< int > > comm_, const Teuchos::RCP< const MachineRepresentation< pcoord_t, part_t > > machine_, const Teuchos::RCP< const Adapter > input_adapter_, const Teuchos::RCP< const Zoltan2::PartitioningSolution< Adapter > > soln_, const Teuchos::RCP< const Environment > envConst, bool is_input_adapter_distributed=true, int num_ranks_per_node=1, bool divide_to_prime_first=false, bool reduce_best_mapping=true)
Constructor. When this constructor is called, in order to calculate the communication metric...
CoordinateModelInput Class that performs mapping between the coordinate partitioning result and mpi r...
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t...
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
pcoord_t ** shiftMachineCoordinates(int machine_dim, const part_t *machine_dimensions, bool *machine_extent_wrap_around, part_t numProcs, pcoord_t **mCoords)
Using the machine dimensions provided, create virtual machine coordinates by assigning the largest ga...
Defines the MappingSolution class.
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
GraphModel defines the interface required for graph models.
Tpetra::MultiVector< double, int, int > tMVector_t
void update_visit_order(part_t *visitOrder, part_t n, int &_u_umpa_seed, part_t rndm)
Zoltan2_ReduceBestMapping()
Default Constructor.
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
void timerStart(TimerType tt, const char *timerName) const
Start a named timer.
KmeansHeap Class, max heap, but holds the minimum values.
Defines the GraphModel interface.
void fillContinousArray(T *arr, size_t arrSize, T *val)
fillContinousArray function
#define ZOLTAN2_ALGMULTIJAGGED_SWAP(a, b, temp)
void push_down(IT index_on_heap)
ArrayRCP< scalar_t > task_communication_edge_weight
#define epsilon
RCP< Comm< int > > create_subCommunicator()
creates and returns the subcommunicator for the processor group.
KMeansAlgorithm Class that performs clustering of the coordinates, and returns the closest set of coo...
KMeansAlgorithm(int dim_, IT numElements_, WT **elementCoords_, IT required_elements_)
KMeansAlgorithm Constructor.
virtual double getProcDistance(int procId1, int procId2) const
void copyCoordinates(IT *permutation)
#define MINOF(a, b)
void coordinateTaskMapperInterface(RCP< const Teuchos::Comm< int > > problemComm, int proc_dim, int num_processors, pcoord_t **machine_coords, int task_dim, part_t num_tasks, tcoord_t **task_coords, part_t *task_comm_xadj, part_t *task_comm_adj, pcoord_t *task_communication_edge_weight_, part_t *proc_to_task_xadj, part_t *proc_to_task_adj, int recursion_depth, part_t *part_no_array, const part_t *machine_dimensions, int num_ranks_per_node=1, bool divide_to_prime_first=false)
Constructor The interface function that calls CoordinateTaskMapper which will also perform the mappin...
void setHeapsize(IT heapsize_)
CoordinateCommunicationModel< pcoord_t, tcoord_t, part_t > * proc_task_comm