Zoltan2
Zoltan2_PartitioningSolution.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // Zoltan2: A package of combinatorial algorithms for scientific computing
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Karen Devine (kddevin@sandia.gov)
39 // Erik Boman (egboman@sandia.gov)
40 // Siva Rajamanickam (srajama@sandia.gov)
41 //
42 // ***********************************************************************
43 //
44 // @HEADER
45 
50 #ifndef _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
51 #define _ZOLTAN2_PARTITIONINGSOLUTION_HPP_
52 
53 namespace Zoltan2 {
54 template <typename Adapter>
55 class PartitioningSolution;
56 }
57 
58 #include <Zoltan2_Environment.hpp>
59 #include <Zoltan2_Solution.hpp>
60 #include <Zoltan2_GreedyMWM.hpp>
61 #include <Zoltan2_Algorithm.hpp>
63 #include <cmath>
64 #include <algorithm>
65 #include <vector>
66 #include <limits>
67 
68 #ifdef _MSC_VER
69 #define NOMINMAX
70 #include <windows.h>
71 #endif
72 
73 
74 namespace Zoltan2 {
75 
90 template <typename Adapter>
92 {
93 public:
94 
95 #ifndef DOXYGEN_SHOULD_SKIP_THIS
96  typedef typename Adapter::gno_t gno_t;
97  typedef typename Adapter::scalar_t scalar_t;
98  typedef typename Adapter::lno_t lno_t;
99  typedef typename Adapter::part_t part_t;
100  typedef typename Adapter::user_t user_t;
101 #endif
102 
120  PartitioningSolution( const RCP<const Environment> &env,
121  const RCP<const Comm<int> > &comm,
122  int nUserWeights,
123  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
124 
154  PartitioningSolution(const RCP<const Environment> &env,
155  const RCP<const Comm<int> > &comm,
156  int nUserWeights, ArrayView<ArrayRCP<part_t> > reqPartIds,
157  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
158  const RCP<Algorithm<Adapter> > &algorithm = Teuchos::null);
159 
161  // Information that the algorithm may wish to query.
162 
165  size_t getTargetGlobalNumberOfParts() const { return nGlobalParts_; }
166 
169  size_t getActualGlobalNumberOfParts() const { return nGlobalPartsSolution_; }
170 
173  size_t getLocalNumberOfParts() const { return nLocalParts_; }
174 
182  scalar_t getLocalFractionOfPart() const { return localFraction_; }
183 
193  bool oneToOnePartDistribution() const { return onePartPerProc_; }
194 
214  const int *getPartDistribution() const {
215  if (partDist_.size() > 0) return &partDist_[0];
216  else return NULL;
217  }
218 
235  const part_t *getProcDistribution() const {
236  if (procDist_.size() > 0) return &procDist_[0];
237  else return NULL;
238  }
239 
243  int getNumberOfCriteria() const { return nWeightsPerObj_; }
244 
245 
252  bool criteriaHasUniformPartSizes(int idx) const { return pSizeUniform_[idx];}
253 
266  scalar_t getCriteriaPartSize(int idx, part_t part) const {
267  if (pSizeUniform_[idx])
268  return 1.0 / nGlobalParts_;
269  else if (pCompactIndex_[idx].size())
270  return pSize_[idx][pCompactIndex_[idx][part]];
271  else
272  return pSize_[idx][part];
273  }
274 
288  bool criteriaHaveSamePartSizes(int c1, int c2) const;
289 
291  // Method used by the algorithm to set results.
292 
319  void setParts(ArrayRCP<part_t> &partList);
320 
322 
334  void RemapParts();
335 
337  /* Return the weight of objects staying with a given remap.
338  * If remap is NULL, compute weight of objects staying with given partition
339  */
340  long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt,
341  part_t nrhs, part_t nlhs)
342  {
343  long staying = 0;
344  for (part_t i = 0; i < nrhs; i++) {
345  part_t k = (remap ? remap[i] : i);
346  for (part_t j = idx[k]; j < idx[k+1]; j++) {
347  if (i == (adj[j]-nlhs)) {
348  staying += wgt[j];
349  break;
350  }
351  }
352  }
353  return staying;
354  }
355 
357  // Results that may be queried by the user, by migration methods,
358  // or by metric calculation methods.
359  // We return raw pointers so users don't have to learn about our
360  // pointer wrappers.
361 
364  inline const RCP<const Comm<int> > &getCommunicator() const { return comm_;}
365 
368  inline const RCP<const Environment> &getEnvironment() const { return env_;}
369 
372  const part_t *getPartListView() const {
373  if (parts_.size() > 0) return parts_.getRawPtr();
374  else return NULL;
375  }
376 
381  const int *getProcListView() const {
382  if (procs_.size() > 0) return procs_.getRawPtr();
383  else return NULL;
384  }
385 
388  virtual bool isPartitioningTreeBinary() const
389  {
390  if (this->algorithm_ == Teuchos::null)
391  throw std::logic_error("no partitioning algorithm has been run yet");
392  return this->algorithm_->isPartitioningTreeBinary();
393  }
394 
397  void getPartitionTree(part_t & numTreeVerts,
398  std::vector<part_t> & permPartNums,
399  std::vector<part_t> & splitRangeBeg,
400  std::vector<part_t> & splitRangeEnd,
401  std::vector<part_t> & treeVertParents) const {
402 
403  part_t numParts = static_cast<part_t>(getTargetGlobalNumberOfParts());
404 
405  if (this->algorithm_ == Teuchos::null)
406  throw std::logic_error("no partitioning algorithm has been run yet");
407  this->algorithm_->getPartitionTree(
408  numParts, // may want to change how this is passed through
409  numTreeVerts,
410  permPartNums,
411  splitRangeBeg,
412  splitRangeEnd,
413  treeVertParents);
414  }
415 
418  std::vector<Zoltan2::coordinateModelPartBox> &
420  {
421  return this->algorithm_->getPartBoxesView();
422  }
423 
425  // when a point lies on a part boundary, the lowest part
426  // number on that boundary is returned.
427  // Note that not all partitioning algorithms will support
428  // this method.
429  //
430  // \param dim : the number of dimensions specified for the point in space
431  // \param point : the coordinates of the point in space; array of size dim
432  // \return the part number of a part overlapping the given point
433  part_t pointAssign(int dim, scalar_t *point) const
434  {
435  part_t p;
436  try {
437  if (this->algorithm_ == Teuchos::null)
438  throw std::logic_error("no partitioning algorithm has been run yet");
439 
440  p = this->algorithm_->pointAssign(dim, point);
441  }
443  return p;
444  }
445 
447  // This method allocates memory for the return argument, but does not
448  // control that memory. The user is responsible for freeing the
449  // memory.
450  //
451  // \param dim : (in) the number of dimensions specified for the box
452  // \param lower : (in) the coordinates of the lower corner of the box;
453  // array of size dim
454  // \param upper : (in) the coordinates of the upper corner of the box;
455  // array of size dim
456  // \param nPartsFound : (out) the number of parts overlapping the box
457  // \param partsFound : (out) array of parts overlapping the box
458  void boxAssign(int dim, scalar_t *lower, scalar_t *upper,
459  size_t &nPartsFound, part_t **partsFound) const
460  {
461  try {
462  if (this->algorithm_ == Teuchos::null)
463  throw std::logic_error("no partitioning algorithm has been run yet");
464 
465  this->algorithm_->boxAssign(dim, lower, upper, nPartsFound, partsFound);
466  }
468  }
469 
470 
473  void getCommunicationGraph(ArrayRCP <part_t> &comXAdj,
474  ArrayRCP <part_t> &comAdj) const
475  {
476  try {
477  if (this->algorithm_ == Teuchos::null)
478  throw std::logic_error("no partitioning algorithm has been run yet");
479 
480  this->algorithm_->getCommunicationGraph(this, comXAdj, comAdj);
481  }
483  }
484 
502  void getPartsForProc(int procId, double &numParts, part_t &partMin,
503  part_t &partMax) const
504  {
505  env_->localInputAssertion(__FILE__, __LINE__, "invalid process id",
506  procId >= 0 && procId < comm_->getSize(), BASIC_ASSERTION);
507 
508  procToPartsMap(procId, numParts, partMin, partMax);
509  }
510 
522  void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
523  {
524  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
525  partId >= 0 && partId < nGlobalParts_, BASIC_ASSERTION);
526 
527  partToProcsMap(partId, procMin, procMax);
528  }
529 
530 private:
531  void partToProc(bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
532  int numLocalParts, int numGlobalParts);
533 
534  void procToPartsMap(int procId, double &numParts, part_t &partMin,
535  part_t &partMax) const;
536 
537  void partToProcsMap(part_t partId, int &procMin, int &procMax) const;
538 
539  void setPartDistribution();
540 
541  void setPartSizes(ArrayView<ArrayRCP<part_t> > reqPartIds,
542  ArrayView<ArrayRCP<scalar_t> > reqPartSizes);
543 
544  void computePartSizes(int widx, ArrayView<part_t> ids,
545  ArrayView<scalar_t> sizes);
546 
547  void broadcastPartSizes(int widx);
548 
549 
550  RCP<const Environment> env_; // has application communicator
551  const RCP<const Comm<int> > comm_; // the problem communicator
552 
553  //part box boundaries as a result of geometric partitioning algorithm.
554  RCP<std::vector<Zoltan2::coordinateModelPartBox> > partBoxes;
555 
556  part_t nGlobalParts_;// target global number of parts
557  part_t nLocalParts_; // number of parts to be on this process
558 
559  scalar_t localFraction_; // approx fraction of a part on this process
560  int nWeightsPerObj_; // if user has no weights, this is 1 TODO: WHY???
561 
562  // If process p is to be assigned part p for all p, then onePartPerProc_
563  // is true. Otherwise it is false, and either procDist_ or partDist_
564  // describes the allocation of parts to processes.
565  //
566  // If parts are never split across processes, then procDist_ is defined
567  // as follows:
568  //
569  // partId = procDist_[procId]
570  // partIdNext = procDist_[procId+1]
571  // globalNumberOfParts = procDist_[numProcs]
572  //
573  // meaning that the parts assigned to process procId range from
574  // [partId, partIdNext). If partIdNext is the same as partId, then
575  // process procId has no parts.
576  //
577  // If the number parts is less than the number of processes, and the
578  // user did not specify "num_local_parts" for each of the processes, then
579  // parts are split across processes, and partDist_ is defined rather than
580  // procDist_.
581  //
582  // procId = partDist_[partId]
583  // procIdNext = partDist_[partId+1]
584  // globalNumberOfProcs = partDist_[numParts]
585  //
586  // which implies that the part partId is shared by processes in the
587  // the range [procId, procIdNext).
588  //
589  // We use std::vector so we can use upper_bound algorithm
590 
591  bool onePartPerProc_; // either this is true...
592  std::vector<int> partDist_; // or this is defined ...
593  std::vector<part_t> procDist_; // or this is defined.
594  bool procDistEquallySpread_; // if procDist_ is used and
595  // #parts > #procs and
596  // num_local_parts is not specified,
597  // parts are evenly distributed to procs
598 
599  // In order to minimize the storage required for part sizes, we
600  // have three different representations.
601  //
602  // If the part sizes for weight index w are all the same, then:
603  // pSizeUniform_[w] = true
604  // pCompactIndex_[w].size() = 0
605  // pSize_[w].size() = 0
606  //
607  // and the size for part p is 1.0 / nparts.
608  //
609  // If part sizes differ for each part in weight index w, but there
610  // are no more than 64 distinct sizes:
611  // pSizeUniform_[w] = false
612  // pCompactIndex_[w].size() = number of parts
613  // pSize_[w].size() = number of different sizes
614  //
615  // and the size for part p is pSize_[pCompactIndex_[p]].
616  //
617  // If part sizes differ for each part in weight index w, and there
618  // are more than 64 distinct sizes:
619  // pSizeUniform_[w] = false
620  // pCompactIndex_[w].size() = 0
621  // pSize_[w].size() = nparts
622  //
623  // and the size for part p is pSize_[p].
624  //
625  // NOTE: If we expect to have similar cases, i.e. a long list of scalars
626  // where it is highly possible that just a few unique values appear,
627  // then we may want to make this a class. The advantage is that we
628  // save a long list of 1-byte indices instead of a long list of scalars.
629 
630  ArrayRCP<bool> pSizeUniform_;
631  ArrayRCP<ArrayRCP<unsigned char> > pCompactIndex_;
632  ArrayRCP<ArrayRCP<scalar_t> > pSize_;
633 
635  // The algorithm sets these values upon completion.
636 
637  ArrayRCP<part_t> parts_; // part number assigned to localid[i]
638 
639  bool haveSolution_;
640 
641  part_t nGlobalPartsSolution_; // global number of parts in solution
642 
644  // The solution calculates this from the part assignments,
645  // unless onePartPerProc_.
646 
647  ArrayRCP<int> procs_; // process rank assigned to localid[i]
648 
650  // Algorithm used to compute the solution;
651  // needed for post-processing with pointAssign or getCommunicationGraph
652  const RCP<Algorithm<Adapter> > algorithm_; //
653 };
654 
656 // Definitions
658 
659 template <typename Adapter>
661  const RCP<const Environment> &env,
662  const RCP<const Comm<int> > &comm,
663  int nUserWeights,
664  const RCP<Algorithm<Adapter> > &algorithm)
665  : env_(env), comm_(comm),
666  partBoxes(),
667  nGlobalParts_(0), nLocalParts_(0),
668  localFraction_(0), nWeightsPerObj_(),
669  onePartPerProc_(false), partDist_(), procDist_(),
670  procDistEquallySpread_(false),
671  pSizeUniform_(), pCompactIndex_(), pSize_(),
672  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
673  procs_(), algorithm_(algorithm)
674 {
675  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
676 
677  setPartDistribution();
678 
679  // We must call setPartSizes() because part sizes may have
680  // been provided by the user on other processes.
681 
682  ArrayRCP<part_t> *noIds = new ArrayRCP<part_t> [nWeightsPerObj_];
683  ArrayRCP<scalar_t> *noSizes = new ArrayRCP<scalar_t> [nWeightsPerObj_];
684  ArrayRCP<ArrayRCP<part_t> > ids(noIds, 0, nWeightsPerObj_, true);
685  ArrayRCP<ArrayRCP<scalar_t> > sizes(noSizes, 0, nWeightsPerObj_, true);
686 
687  setPartSizes(ids.view(0, nWeightsPerObj_), sizes.view(0, nWeightsPerObj_));
688 
689  env_->memory("After construction of solution");
690 }
691 
692 template <typename Adapter>
694  const RCP<const Environment> &env,
695  const RCP<const Comm<int> > &comm,
696  int nUserWeights,
697  ArrayView<ArrayRCP<part_t> > reqPartIds,
698  ArrayView<ArrayRCP<scalar_t> > reqPartSizes,
699  const RCP<Algorithm<Adapter> > &algorithm)
700  : env_(env), comm_(comm),
701  partBoxes(),
702  nGlobalParts_(0), nLocalParts_(0),
703  localFraction_(0), nWeightsPerObj_(),
704  onePartPerProc_(false), partDist_(), procDist_(),
705  procDistEquallySpread_(false),
706  pSizeUniform_(), pCompactIndex_(), pSize_(),
707  parts_(), haveSolution_(false), nGlobalPartsSolution_(0),
708  procs_(), algorithm_(algorithm)
709 {
710  nWeightsPerObj_ = (nUserWeights ? nUserWeights : 1); // TODO: WHY?? WHY NOT ZERO?
711 
712  setPartDistribution();
713 
714  setPartSizes(reqPartIds, reqPartSizes);
715 
716  env_->memory("After construction of solution");
717 }
718 
719 template <typename Adapter>
721 {
722  // Did the caller define num_global_parts and/or num_local_parts?
723 
724  const ParameterList &pl = env_->getParameters();
725  size_t haveGlobalNumParts=0, haveLocalNumParts=0;
726  int numLocal=0, numGlobal=0;
727 
728  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("num_global_parts");
729 
730  if (pe){
731  haveGlobalNumParts = 1;
732  nGlobalParts_ = part_t(pe->getValue(&nGlobalParts_));
733  numGlobal = nGlobalParts_;
734  }
735 
736  pe = pl.getEntryPtr("num_local_parts");
737 
738  if (pe){
739  haveLocalNumParts = 1;
740  nLocalParts_ = part_t(pe->getValue(&nLocalParts_));
741  numLocal = nLocalParts_;
742  }
743 
744  try{
745  // Sets onePartPerProc_, partDist_, and procDist_
746 
747  partToProc(true, haveLocalNumParts, haveGlobalNumParts,
748  numLocal, numGlobal);
749  }
751 
752  int nprocs = comm_->getSize();
753  int rank = comm_->getRank();
754 
755  if (onePartPerProc_){
756  nGlobalParts_ = nprocs;
757  nLocalParts_ = 1;
758  }
759  else if (partDist_.size() > 0){ // more procs than parts
760  nGlobalParts_ = partDist_.size() - 1;
761  int pstart = partDist_[0];
762  for (part_t i=1; i <= nGlobalParts_; i++){
763  int pend = partDist_[i];
764  if (rank >= pstart && rank < pend){
765  int numOwners = pend - pstart;
766  nLocalParts_ = 1;
767  localFraction_ = 1.0 / numOwners;
768  break;
769  }
770  pstart = pend;
771  }
772  }
773  else if (procDist_.size() > 0){ // more parts than procs
774  nGlobalParts_ = procDist_[nprocs];
775  nLocalParts_ = procDist_[rank+1] - procDist_[rank];
776  }
777  else {
778  throw std::logic_error("partToProc error");
779  }
780 
781 }
782 
783 template <typename Adapter>
784  void PartitioningSolution<Adapter>::setPartSizes(
785  ArrayView<ArrayRCP<part_t> > ids, ArrayView<ArrayRCP<scalar_t> > sizes)
786 {
787  int widx = nWeightsPerObj_;
788  bool fail=false;
789 
790  size_t *countBuf = new size_t [widx*2];
791  ArrayRCP<size_t> counts(countBuf, 0, widx*2, true);
792 
793  fail = ((ids.size() != widx) || (sizes.size() != widx));
794 
795  for (int w=0; !fail && w < widx; w++){
796  counts[w] = ids[w].size();
797  if (ids[w].size() != sizes[w].size()) fail=true;
798  }
799 
800  env_->globalBugAssertion(__FILE__, __LINE__, "bad argument arrays", fail==0,
801  COMPLEX_ASSERTION, comm_);
802 
803  // Are all part sizes the same? This is the common case.
804 
805  ArrayRCP<scalar_t> *emptySizes= new ArrayRCP<scalar_t> [widx];
806  pSize_ = arcp(emptySizes, 0, widx);
807 
808  ArrayRCP<unsigned char> *emptyIndices= new ArrayRCP<unsigned char> [widx];
809  pCompactIndex_ = arcp(emptyIndices, 0, widx);
810 
811  bool *info = new bool [widx];
812  pSizeUniform_ = arcp(info, 0, widx);
813  for (int w=0; w < widx; w++)
814  pSizeUniform_[w] = true;
815 
816  if (nGlobalParts_ == 1){
817  return; // there's only one part in the whole problem
818  }
819 
820  size_t *ptr1 = counts.getRawPtr();
821  size_t *ptr2 = counts.getRawPtr() + widx;
822 
823  try{
824  reduceAll<int, size_t>(*comm_, Teuchos::REDUCE_MAX, widx, ptr1, ptr2);
825  }
826  Z2_THROW_OUTSIDE_ERROR(*env_);
827 
828  bool zero = true;
829 
830  for (int w=0; w < widx; w++)
831  if (counts[widx+w] > 0){
832  zero = false;
833  pSizeUniform_[w] = false;
834  }
835 
836  if (zero) // Part sizes for all criteria are uniform.
837  return;
838 
839  // Compute the part sizes for criteria for which part sizes were
840  // supplied. Normalize for each criteria so part sizes sum to one.
841 
842  int nprocs = comm_->getSize();
843  int rank = comm_->getRank();
844 
845  for (int w=0; w < widx; w++){
846  if (pSizeUniform_[w]) continue;
847 
848  // Send all ids and sizes to one process.
849  // (There is no simple gather method in Teuchos.)
850 
851  part_t length = ids[w].size();
852  part_t *allLength = new part_t [nprocs];
853  Teuchos::gatherAll<int, part_t>(*comm_, 1, &length,
854  nprocs, allLength);
855 
856  if (rank == 0){
857  int total = 0;
858  for (int i=0; i < nprocs; i++)
859  total += allLength[i];
860 
861  part_t *partNums = new part_t [total];
862  scalar_t *partSizes = new scalar_t [total];
863 
864  ArrayView<part_t> idArray(partNums, total);
865  ArrayView<scalar_t> sizeArray(partSizes, total);
866 
867  if (length > 0){
868  for (int i=0; i < length; i++){
869  *partNums++ = ids[w][i];
870  *partSizes++ = sizes[w][i];
871  }
872  }
873 
874  for (int p=1; p < nprocs; p++){
875  if (allLength[p] > 0){
876  Teuchos::receive<int, part_t>(*comm_, p,
877  allLength[p], partNums);
878  Teuchos::receive<int, scalar_t>(*comm_, p,
879  allLength[p], partSizes);
880  partNums += allLength[p];
881  partSizes += allLength[p];
882  }
883  }
884 
885  delete [] allLength;
886 
887  try{
888  computePartSizes(w, idArray, sizeArray);
889  }
891 
892  delete [] idArray.getRawPtr();
893  delete [] sizeArray.getRawPtr();
894  }
895  else{
896  delete [] allLength;
897  if (length > 0){
898  Teuchos::send<int, part_t>(*comm_, length, ids[w].getRawPtr(), 0);
899  Teuchos::send<int, scalar_t>(*comm_, length, sizes[w].getRawPtr(), 0);
900  }
901  }
902 
903  broadcastPartSizes(w);
904  }
905 }
906 
907 template <typename Adapter>
908  void PartitioningSolution<Adapter>::broadcastPartSizes(int widx)
909 {
910  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
911  pSize_.size()>widx &&
912  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
914 
915  int rank = comm_->getRank();
916  int nprocs = comm_->getSize();
917  part_t nparts = nGlobalParts_;
918 
919  if (nprocs < 2)
920  return;
921 
922  char flag=0;
923 
924  if (rank == 0){
925  if (pSizeUniform_[widx] == true)
926  flag = 1;
927  else if (pCompactIndex_[widx].size() > 0)
928  flag = 2;
929  else
930  flag = 3;
931  }
932 
933  try{
934  Teuchos::broadcast<int, char>(*comm_, 0, 1, &flag);
935  }
936  Z2_THROW_OUTSIDE_ERROR(*env_);
937 
938  if (flag == 1){
939  if (rank > 0)
940  pSizeUniform_[widx] = true;
941 
942  return;
943  }
944 
945  if (flag == 2){
946 
947  // broadcast the indices into the size list
948 
949  unsigned char *idxbuf = NULL;
950 
951  if (rank > 0){
952  idxbuf = new unsigned char [nparts];
953  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, idxbuf);
954  }
955  else{
956  env_->localBugAssertion(__FILE__, __LINE__, "index list size",
957  pCompactIndex_[widx].size() == nparts, COMPLEX_ASSERTION);
958  idxbuf = pCompactIndex_[widx].getRawPtr();
959  }
960 
961  try{
962  // broadcast of unsigned char is not supported
963  Teuchos::broadcast<int, char>(*comm_, 0, nparts,
964  reinterpret_cast<char *>(idxbuf));
965  }
966  Z2_THROW_OUTSIDE_ERROR(*env_);
967 
968  if (rank > 0)
969  pCompactIndex_[widx] = arcp(idxbuf, 0, nparts, true);
970 
971  // broadcast the list of different part sizes
972 
973  unsigned char maxIdx=0;
974  for (part_t p=0; p < nparts; p++)
975  if (idxbuf[p] > maxIdx) maxIdx = idxbuf[p];
976 
977  int numSizes = maxIdx + 1;
978 
979  scalar_t *sizeList = NULL;
980 
981  if (rank > 0){
982  sizeList = new scalar_t [numSizes];
983  env_->localMemoryAssertion(__FILE__, __LINE__, numSizes, sizeList);
984  }
985  else{
986  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
987  numSizes == pSize_[widx].size(), COMPLEX_ASSERTION);
988 
989  sizeList = pSize_[widx].getRawPtr();
990  }
991 
992  try{
993  Teuchos::broadcast<int, scalar_t>(*comm_, 0, numSizes, sizeList);
994  }
995  Z2_THROW_OUTSIDE_ERROR(*env_);
996 
997  if (rank > 0)
998  pSize_[widx] = arcp(sizeList, 0, numSizes, true);
999 
1000  return;
1001  }
1002 
1003  if (flag == 3){
1004 
1005  // broadcast the size of each part
1006 
1007  scalar_t *sizeList = NULL;
1008 
1009  if (rank > 0){
1010  sizeList = new scalar_t [nparts];
1011  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, sizeList);
1012  }
1013  else{
1014  env_->localBugAssertion(__FILE__, __LINE__, "wrong number of sizes",
1015  nparts == pSize_[widx].size(), COMPLEX_ASSERTION);
1016 
1017  sizeList = pSize_[widx].getRawPtr();
1018  }
1019 
1020  try{
1021  Teuchos::broadcast<int, scalar_t >(*comm_, 0, nparts, sizeList);
1022  }
1023  Z2_THROW_OUTSIDE_ERROR(*env_);
1024 
1025  if (rank > 0)
1026  pSize_[widx] = arcp(sizeList, 0, nparts);
1027 
1028  return;
1029  }
1030 }
1031 
1032 template <typename Adapter>
1033  void PartitioningSolution<Adapter>::computePartSizes(int widx,
1034  ArrayView<part_t> ids, ArrayView<scalar_t> sizes)
1035 {
1036  int len = ids.size();
1037 
1038  if (len == 0){
1039  pSizeUniform_[widx] = true;
1040  return;
1041  }
1042 
1043  env_->localBugAssertion(__FILE__, __LINE__, "bad array sizes",
1044  len>0 && sizes.size()==len, COMPLEX_ASSERTION);
1045 
1046  env_->localBugAssertion(__FILE__, __LINE__, "bad index",
1047  widx>=0 && widx<nWeightsPerObj_, COMPLEX_ASSERTION);
1048 
1049  env_->localBugAssertion(__FILE__, __LINE__, "preallocations",
1050  pSize_.size()>widx &&
1051  pSizeUniform_.size()>widx && pCompactIndex_.size()>widx,
1053 
1054  // Check ids and sizes and find min, max and average sizes.
1055  // If sizes are very close to uniform, call them uniform parts.
1056 
1057  part_t nparts = nGlobalParts_;
1058  unsigned char *buf = new unsigned char [nparts];
1059  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, buf);
1060  memset(buf, 0, nparts);
1061  ArrayRCP<unsigned char> partIdx(buf, 0, nparts, true);
1062 
1063  scalar_t epsilon = 10e-5 / nparts;
1064  scalar_t min=sizes[0], max=sizes[0], sum=0;
1065 
1066  for (int i=0; i < len; i++){
1067  part_t id = ids[i];
1068  scalar_t size = sizes[i];
1069 
1070  env_->localInputAssertion(__FILE__, __LINE__, "invalid part id",
1071  id>=0 && id<nparts, BASIC_ASSERTION);
1072 
1073  env_->localInputAssertion(__FILE__, __LINE__, "invalid part size", size>=0,
1074  BASIC_ASSERTION);
1075 
1076  // TODO: we could allow users to specify multiple sizes for the same
1077  // part if we add a parameter that says what we are to do with them:
1078  // add them or take the max.
1079 
1080  env_->localInputAssertion(__FILE__, __LINE__,
1081  "multiple sizes provided for one part", partIdx[id]==0, BASIC_ASSERTION);
1082 
1083  partIdx[id] = 1; // mark that we have a size for this part
1084 
1085  if (size < min) min = size;
1086  if (size > max) max = size;
1087  sum += size;
1088  }
1089 
1090  if (sum == 0){
1091 
1092  // User has given us a list of parts of size 0, we'll set
1093  // the rest to them to equally sized parts.
1094 
1095  scalar_t *allSizes = new scalar_t [2];
1096  env_->localMemoryAssertion(__FILE__, __LINE__, 2, allSizes);
1097 
1098  ArrayRCP<scalar_t> sizeArray(allSizes, 0, 2, true);
1099 
1100  int numNonZero = nparts - len;
1101 
1102  allSizes[0] = 0.0;
1103  allSizes[1] = 1.0 / numNonZero;
1104 
1105  for (part_t p=0; p < nparts; p++)
1106  buf[p] = 1; // index to default part size
1107 
1108  for (int i=0; i < len; i++)
1109  buf[ids[i]] = 0; // index to part size zero
1110 
1111  pSize_[widx] = sizeArray;
1112  pCompactIndex_[widx] = partIdx;
1113 
1114  return;
1115  }
1116 
1117  if (max - min <= epsilon){
1118  pSizeUniform_[widx] = true;
1119  return;
1120  }
1121 
1122  // A size for parts that were not specified:
1123  scalar_t avg = sum / nparts;
1124 
1125  // We are going to merge part sizes that are very close. This takes
1126  // computation time now, but can save considerably in the storage of
1127  // all part sizes on each process. For example, a common case may
1128  // be some parts are size 1 and all the rest are size 2.
1129 
1130  scalar_t *tmp = new scalar_t [len];
1131  env_->localMemoryAssertion(__FILE__, __LINE__, len, tmp);
1132  memcpy(tmp, sizes.getRawPtr(), sizeof(scalar_t) * len);
1133  ArrayRCP<scalar_t> partSizes(tmp, 0, len, true);
1134 
1135  std::sort(partSizes.begin(), partSizes.end());
1136 
1137  // create a list of sizes that are unique within epsilon
1138 
1139  Array<scalar_t> nextUniqueSize;
1140  nextUniqueSize.push_back(partSizes[len-1]); // largest
1141  scalar_t curr = partSizes[len-1];
1142  int avgIndex = len;
1143  bool haveAvg = false;
1144  if (curr - avg <= epsilon)
1145  avgIndex = 0;
1146 
1147  for (int i=len-2; i >= 0; i--){
1148  scalar_t val = partSizes[i];
1149  if (curr - val > epsilon){
1150  nextUniqueSize.push_back(val); // the highest in the group
1151  curr = val;
1152  if (avgIndex==len && val > avg && val - avg <= epsilon){
1153  // the average would be in this group
1154  avgIndex = nextUniqueSize.size() - 1;
1155  haveAvg = true;
1156  }
1157  }
1158  }
1159 
1160  partSizes.clear();
1161 
1162  size_t numSizes = nextUniqueSize.size();
1163  int sizeArrayLen = numSizes;
1164 
1165  if (numSizes < 64){
1166  // We can store the size for every part in a compact way.
1167 
1168  // Create a list of all sizes in increasing order
1169 
1170  if (!haveAvg) sizeArrayLen++; // need to include average
1171 
1172  scalar_t *allSizes = new scalar_t [sizeArrayLen];
1173  env_->localMemoryAssertion(__FILE__, __LINE__, sizeArrayLen, allSizes);
1174  ArrayRCP<scalar_t> sizeArray(allSizes, 0, sizeArrayLen, true);
1175 
1176  int newAvgIndex = sizeArrayLen;
1177 
1178  for (int i=numSizes-1, idx=0; i >= 0; i--){
1179 
1180  if (newAvgIndex == sizeArrayLen){
1181 
1182  if (haveAvg && i==avgIndex)
1183  newAvgIndex = idx;
1184 
1185  else if (!haveAvg && avg < nextUniqueSize[i]){
1186  newAvgIndex = idx;
1187  allSizes[idx++] = avg;
1188  }
1189  }
1190 
1191  allSizes[idx++] = nextUniqueSize[i];
1192  }
1193 
1194  env_->localBugAssertion(__FILE__, __LINE__, "finding average in list",
1195  newAvgIndex < sizeArrayLen, COMPLEX_ASSERTION);
1196 
1197  for (int i=0; i < nparts; i++){
1198  buf[i] = newAvgIndex; // index to default part size
1199  }
1200 
1201  sum = (nparts - len) * allSizes[newAvgIndex];
1202 
1203  for (int i=0; i < len; i++){
1204  int id = ids[i];
1205  scalar_t size = sizes[i];
1206  int index;
1207 
1208  // Find the first size greater than or equal to this size.
1209 
1210  if (size < avg && avg - size <= epsilon)
1211  index = newAvgIndex;
1212  else{
1213  typename ArrayRCP<scalar_t>::iterator found =
1214  std::lower_bound(sizeArray.begin(), sizeArray.end(), size);
1215 
1216  env_->localBugAssertion(__FILE__, __LINE__, "size array",
1217  found != sizeArray.end(), COMPLEX_ASSERTION);
1218 
1219  index = found - sizeArray.begin();
1220  }
1221 
1222  buf[id] = index;
1223  sum += allSizes[index];
1224  }
1225 
1226  for (int i=0; i < sizeArrayLen; i++){
1227  sizeArray[i] /= sum;
1228  }
1229 
1230  pCompactIndex_[widx] = partIdx;
1231  pSize_[widx] = sizeArray;
1232  }
1233  else{
1234  // To have access to part sizes, we must store nparts scalar_ts on
1235  // every process. We expect this is a rare case.
1236 
1237  tmp = new scalar_t [nparts];
1238  env_->localMemoryAssertion(__FILE__, __LINE__, nparts, tmp);
1239 
1240  sum += ((nparts - len) * avg);
1241 
1242  for (int i=0; i < nparts; i++){
1243  tmp[i] = avg/sum;
1244  }
1245 
1246  for (int i=0; i < len; i++){
1247  tmp[ids[i]] = sizes[i]/sum;
1248  }
1249 
1250  pSize_[widx] = arcp(tmp, 0, nparts);
1251  }
1252 }
1253 
1254 template <typename Adapter>
1255  void PartitioningSolution<Adapter>::setParts(ArrayRCP<part_t> &partList)
1256 {
1257  env_->debug(DETAILED_STATUS, "Entering setParts");
1258 
1259  size_t len = partList.size();
1260 
1261  // Find the actual number of parts in the solution, which can
1262  // be more or less than the nGlobalParts_ target.
1263  // (We may want to compute the imbalance of a given solution with
1264  // respect to a desired solution. This solution may have more or
1265  // fewer parts that the desired solution.)
1266 
1267  part_t lMax = -1;
1268  part_t lMin = (len > 0 ? std::numeric_limits<part_t>::max() : 0);
1269  part_t gMax, gMin;
1270 
1271  for (size_t i = 0; i < len; i++) {
1272  if (partList[i] < lMin) lMin = partList[i];
1273  if (partList[i] > lMax) lMax = partList[i];
1274  }
1275  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MIN, 1, &lMin, &gMin);
1276  Teuchos::reduceAll<int, part_t>(*comm_, Teuchos::REDUCE_MAX, 1, &lMax, &gMax);
1277 
1278  nGlobalPartsSolution_ = gMax - gMin + 1;
1279  parts_ = partList;
1280 
1281  // Now determine which process gets each object, if not one-to-one.
1282 
1283  if (!onePartPerProc_){
1284 
1285  int *procs = new int [len];
1286  env_->localMemoryAssertion(__FILE__, __LINE__, len, procs);
1287  procs_ = arcp<int>(procs, 0, len);
1288 
1289  if (len > 0) {
1290  part_t *parts = partList.getRawPtr();
1291 
1292  if (procDist_.size() > 0){ // parts are not split across procs
1293 
1294  int procId;
1295  for (size_t i=0; i < len; i++){
1296  partToProcsMap(parts[i], procs[i], procId);
1297  }
1298  }
1299  else{ // harder - we need to split the parts across multiple procs
1300 
1301  lno_t *partCounter = new lno_t [nGlobalPartsSolution_];
1302  env_->localMemoryAssertion(__FILE__, __LINE__, nGlobalPartsSolution_,
1303  partCounter);
1304 
1305  int numProcs = comm_->getSize();
1306 
1307  //MD NOTE: there was no initialization for partCounter.
1308  //I added the line below, correct me if I am wrong.
1309  memset(partCounter, 0, sizeof(lno_t) * nGlobalPartsSolution_);
1310 
1311  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++)
1312  partCounter[parts[i]]++;
1313 
1314  lno_t *procCounter = new lno_t [numProcs];
1315  env_->localMemoryAssertion(__FILE__, __LINE__, numProcs, procCounter);
1316 
1317  int proc1;
1318  int proc2 = partDist_[0];
1319 
1320  for (part_t part=1; part < nGlobalParts_; part++){
1321  proc1 = proc2;
1322  proc2 = partDist_[part+1];
1323  int numprocs = proc2 - proc1;
1324 
1325  double dNum = partCounter[part];
1326  double dProcs = numprocs;
1327 
1328  //std::cout << "dNum:" << dNum << " dProcs:" << dProcs << std::endl;
1329  double each = floor(dNum/dProcs);
1330  double extra = fmod(dNum,dProcs);
1331 
1332  for (int proc=proc1, i=0; proc<proc2; proc++, i++){
1333  if (i < extra)
1334  procCounter[proc] = lno_t(each) + 1;
1335  else
1336  procCounter[proc] = lno_t(each);
1337  }
1338  }
1339 
1340  delete [] partCounter;
1341 
1342  for (typename ArrayRCP<part_t>::size_type i=0; i < partList.size(); i++){
1343  if (partList[i] >= nGlobalParts_){
1344  // Solution has more parts that targeted. These
1345  // objects just remain on this process.
1346  procs[i] = comm_->getRank();
1347  continue;
1348  }
1349  part_t partNum = parts[i];
1350  proc1 = partDist_[partNum];
1351  proc2 = partDist_[partNum + 1];
1352 
1353  int proc;
1354  for (proc=proc1; proc < proc2; proc++){
1355  if (procCounter[proc] > 0){
1356  procs[i] = proc;
1357  procCounter[proc]--;
1358  break;
1359  }
1360  }
1361  env_->localBugAssertion(__FILE__, __LINE__, "part to proc",
1362  proc < proc2, COMPLEX_ASSERTION);
1363  }
1364 
1365  delete [] procCounter;
1366  }
1367  }
1368  }
1369 
1370  // Now that parts_ info is back on home process, remap the parts.
1371  // TODO: The parts will be inconsistent with the proc assignments after
1372  // TODO: remapping. This problem will go away after we separate process
1373  // TODO: mapping from setParts. But for MueLu's use case, the part
1374  // TODO: remapping is all that matters; they do not use the process mapping.
1375  bool doRemap = false;
1376  const Teuchos::ParameterEntry *pe =
1377  env_->getParameters().getEntryPtr("remap_parts");
1378  if (pe) doRemap = pe->getValue(&doRemap);
1379  if (doRemap) RemapParts();
1380 
1381  haveSolution_ = true;
1382 
1383  env_->memory("After Solution has processed algorithm's answer");
1384  env_->debug(DETAILED_STATUS, "Exiting setParts");
1385 }
1386 
1387 
1388 template <typename Adapter>
1390  double &numParts, part_t &partMin, part_t &partMax) const
1391 {
1392  if (onePartPerProc_){
1393  numParts = 1.0;
1394  partMin = partMax = procId;
1395  }
1396  else if (procDist_.size() > 0){
1397  partMin = procDist_[procId];
1398  partMax = procDist_[procId+1] - 1;
1399  numParts = procDist_[procId+1] - partMin;
1400  }
1401  else{
1402  // find the first p such that partDist_[p] > procId
1403 
1404  std::vector<int>::const_iterator entry;
1405  entry = std::upper_bound(partDist_.begin(), partDist_.end(), procId);
1406 
1407  size_t partIdx = entry - partDist_.begin();
1408  int numProcs = partDist_[partIdx] - partDist_[partIdx-1];
1409  partMin = partMax = int(partIdx) - 1;
1410  numParts = 1.0 / numProcs;
1411  }
1412 }
1413 
1414 template <typename Adapter>
1415  void PartitioningSolution<Adapter>::partToProcsMap(part_t partId,
1416  int &procMin, int &procMax) const
1417 {
1418  if (partId >= nGlobalParts_){
1419  // setParts() may be given an initial solution which uses a
1420  // different number of parts than the desired solution. It is
1421  // still a solution. We keep it on this process.
1422  procMin = procMax = comm_->getRank();
1423  }
1424  else if (onePartPerProc_){
1425  procMin = procMax = int(partId);
1426  }
1427  else if (procDist_.size() > 0){
1428  if (procDistEquallySpread_) {
1429  // Avoid binary search.
1430  double fProcs = comm_->getSize();
1431  double fParts = nGlobalParts_;
1432  double each = fParts / fProcs;
1433  procMin = int(partId / each);
1434  while (procDist_[procMin] > partId) procMin--;
1435  while (procDist_[procMin+1] <= partId) procMin++;
1436  procMax = procMin;
1437  }
1438  else {
1439  // find the first p such that procDist_[p] > partId.
1440  // For now, do a binary search.
1441 
1442  typename std::vector<part_t>::const_iterator entry;
1443  entry = std::upper_bound(procDist_.begin(), procDist_.end(), partId);
1444 
1445  size_t procIdx = entry - procDist_.begin();
1446  procMin = procMax = int(procIdx) - 1;
1447  }
1448  }
1449  else{
1450  procMin = partDist_[partId];
1451  procMax = partDist_[partId+1] - 1;
1452  }
1453 }
1454 
1455 template <typename Adapter>
1457  int c1, int c2) const
1458 {
1459  if (c1 < 0 || c1 >= nWeightsPerObj_ || c2 < 0 || c2 >= nWeightsPerObj_ )
1460  throw std::logic_error("criteriaHaveSamePartSizes error");
1461 
1462  bool theSame = false;
1463 
1464  if (c1 == c2)
1465  theSame = true;
1466 
1467  else if (pSizeUniform_[c1] == true && pSizeUniform_[c2] == true)
1468  theSame = true;
1469 
1470  else if (pCompactIndex_[c1].size() == pCompactIndex_[c2].size()){
1471  theSame = true;
1472  bool useIndex = pCompactIndex_[c1].size() > 0;
1473  if (useIndex){
1474  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1475  if (pSize_[c1][pCompactIndex_[c1][p]] !=
1476  pSize_[c2][pCompactIndex_[c2][p]])
1477  theSame = false;
1478  }
1479  else{
1480  for (part_t p=0; theSame && p < nGlobalParts_; p++)
1481  if (pSize_[c1][p] != pSize_[c2][p])
1482  theSame = false;
1483  }
1484  }
1485 
1486  return theSame;
1487 }
1488 
1504 template <typename Adapter>
1506  bool doCheck, bool haveNumLocalParts, bool haveNumGlobalParts,
1507  int numLocalParts, int numGlobalParts)
1508 {
1509 #ifdef _MSC_VER
1510  typedef SSIZE_T ssize_t;
1511 #endif
1512  int nprocs = comm_->getSize();
1513  ssize_t reducevals[4];
1514  ssize_t sumHaveGlobal=0, sumHaveLocal=0;
1515  ssize_t sumGlobal=0, sumLocal=0;
1516  ssize_t maxGlobal=0, maxLocal=0;
1517  ssize_t vals[4] = {haveNumGlobalParts, haveNumLocalParts,
1518  numGlobalParts, numLocalParts};
1519 
1520  partDist_.clear();
1521  procDist_.clear();
1522 
1523  if (doCheck){
1524 
1525  try{
1526  reduceAll<int, ssize_t>(*comm_, Teuchos::REDUCE_SUM, 4, vals, reducevals);
1527  }
1528  Z2_THROW_OUTSIDE_ERROR(*env_);
1529 
1530  sumHaveGlobal = reducevals[0];
1531  sumHaveLocal = reducevals[1];
1532  sumGlobal = reducevals[2];
1533  sumLocal = reducevals[3];
1534 
1535  env_->localInputAssertion(__FILE__, __LINE__,
1536  "Either all procs specify num_global/local_parts or none do",
1537  (sumHaveGlobal == 0 || sumHaveGlobal == nprocs) &&
1538  (sumHaveLocal == 0 || sumHaveLocal == nprocs),
1539  BASIC_ASSERTION);
1540  }
1541  else{
1542  if (haveNumLocalParts)
1543  sumLocal = numLocalParts * nprocs;
1544  if (haveNumGlobalParts)
1545  sumGlobal = numGlobalParts * nprocs;
1546 
1547  sumHaveGlobal = haveNumGlobalParts ? nprocs : 0;
1548  sumHaveLocal = haveNumLocalParts ? nprocs : 0;
1549 
1550  maxLocal = numLocalParts;
1551  maxGlobal = numGlobalParts;
1552  }
1553 
1554  if (!haveNumLocalParts && !haveNumGlobalParts){
1555  onePartPerProc_ = true; // default if user did not specify
1556  return;
1557  }
1558 
1559  if (haveNumGlobalParts){
1560  if (doCheck){
1561  vals[0] = numGlobalParts;
1562  vals[1] = numLocalParts;
1563  try{
1564  reduceAll<int, ssize_t>(
1565  *comm_, Teuchos::REDUCE_MAX, 2, vals, reducevals);
1566  }
1567  Z2_THROW_OUTSIDE_ERROR(*env_);
1568 
1569  maxGlobal = reducevals[0];
1570  maxLocal = reducevals[1];
1571 
1572  env_->localInputAssertion(__FILE__, __LINE__,
1573  "Value for num_global_parts is different on different processes.",
1574  maxGlobal * nprocs == sumGlobal, BASIC_ASSERTION);
1575  }
1576 
1577  if (sumLocal){
1578  env_->localInputAssertion(__FILE__, __LINE__,
1579  "Sum of num_local_parts does not equal requested num_global_parts",
1580  sumLocal == numGlobalParts, BASIC_ASSERTION);
1581 
1582  if (sumLocal == nprocs && maxLocal == 1){
1583  onePartPerProc_ = true; // user specified one part per proc
1584  return;
1585  }
1586  }
1587  else{
1588  if (maxGlobal == nprocs){
1589  onePartPerProc_ = true; // user specified num parts is num procs
1590  return;
1591  }
1592  }
1593  }
1594 
1595  // If we are here, we do not have #parts == #procs.
1596 
1597  if (sumHaveLocal == nprocs){
1598  //
1599  // We will go by the number of local parts specified.
1600  //
1601 
1602  try{
1603  procDist_.resize(nprocs+1);
1604  }
1605  catch (std::exception &e){
1606  throw(std::bad_alloc());
1607  }
1608 
1609  part_t *procArray = &procDist_[0];
1610 
1611  try{
1612  part_t tmp = part_t(numLocalParts);
1613  gatherAll<int, part_t>(*comm_, 1, &tmp, nprocs, procArray + 1);
1614  }
1615  Z2_THROW_OUTSIDE_ERROR(*env_);
1616 
1617  procArray[0] = 0;
1618 
1619  for (int proc=0; proc < nprocs; proc++)
1620  procArray[proc+1] += procArray[proc];
1621  }
1622  else{
1623  //
1624  // We will allocate global number of parts to the processes.
1625  //
1626  double fParts = numGlobalParts;
1627  double fProcs = nprocs;
1628 
1629  if (fParts < fProcs){
1630 
1631  try{
1632  partDist_.resize(size_t(fParts+1));
1633  }
1634  catch (std::exception &e){
1635  throw(std::bad_alloc());
1636  }
1637 
1638  int *partArray = &partDist_[0];
1639 
1640  double each = floor(fProcs / fParts);
1641  double extra = fmod(fProcs, fParts);
1642  partDist_[0] = 0;
1643 
1644  for (part_t part=0; part < numGlobalParts; part++){
1645  int numOwners = int(each + ((part<extra) ? 1 : 0));
1646  partArray[part+1] = partArray[part] + numOwners;
1647  }
1648 
1649  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1650  partDist_[numGlobalParts] == nprocs, COMPLEX_ASSERTION, comm_);
1651  }
1652  else if (fParts > fProcs){
1653 
1654  // User did not specify local number of parts per proc;
1655  // Distribute the parts evenly among the procs.
1656 
1657  procDistEquallySpread_ = true;
1658 
1659  try{
1660  procDist_.resize(size_t(fProcs+1));
1661  }
1662  catch (std::exception &e){
1663  throw(std::bad_alloc());
1664  }
1665 
1666  part_t *procArray = &procDist_[0];
1667 
1668  double each = floor(fParts / fProcs);
1669  double extra = fmod(fParts, fProcs);
1670  procArray[0] = 0;
1671 
1672  for (int proc=0; proc < nprocs; proc++){
1673  part_t numParts = part_t(each + ((proc<extra) ? 1 : 0));
1674  procArray[proc+1] = procArray[proc] + numParts;
1675  }
1676 
1677  env_->globalBugAssertion(__FILE__, __LINE__, "#parts != #procs",
1678  procDist_[nprocs] == numGlobalParts, COMPLEX_ASSERTION, comm_);
1679  }
1680  else{
1681  env_->globalBugAssertion(__FILE__, __LINE__,
1682  "should never get here", 1, COMPLEX_ASSERTION, comm_);
1683  }
1684  }
1685 }
1686 
1688 // Remap a new part assignment vector for maximum overlap with an input
1689 // part assignment.
1690 //
1691 // Assumptions for this version:
1692 // input part assignment == processor rank for every local object.
1693 // assuming nGlobalParts_ <= num ranks
1694 // TODO: Write a version that takes the input part number as input;
1695 // this change requires input parts in adapters to be provided in
1696 // the Adapter.
1697 // TODO: For repartitioning, compare to old remapping results; see Zoltan1.
1698 
1699 template <typename Adapter>
1701 {
1702  size_t len = parts_.size();
1703 
1704  part_t me = comm_->getRank();
1705  int np = comm_->getSize();
1706 
1707  if (np < nGlobalParts_) {
1708  if (me == 0)
1709  std::cout << "Remapping not yet supported for "
1710  << "num_global_parts " << nGlobalParts_
1711  << " > num procs " << np << std::endl;
1712  return;
1713  }
1714  // Build edges of a bipartite graph with np + nGlobalParts_ vertices,
1715  // and edges between a process vtx and any parts to which that process'
1716  // objects are assigned.
1717  // Weight edge[parts_[i]] by the number of objects that are going from
1718  // this rank to parts_[i].
1719  // We use a std::map, assuming the number of unique parts in the parts_ array
1720  // is small to keep the binary search efficient.
1721  // TODO We use the count of objects to move; should change to SIZE of objects
1722  // to move; need SIZE function in Adapter.
1723 
1724  std::map<part_t, long> edges;
1725  long lstaying = 0; // Total num of local objects staying if we keep the
1726  // current mapping. TODO: change to SIZE of local objs
1727  long gstaying = 0; // Total num of objects staying in the current partition
1728 
1729  for (size_t i = 0; i < len; i++) {
1730  edges[parts_[i]]++; // TODO Use obj size instead of count
1731  if (parts_[i] == me) lstaying++; // TODO Use obj size instead of count
1732  }
1733 
1734  Teuchos::reduceAll<int, long>(*comm_, Teuchos::REDUCE_SUM, 1,
1735  &lstaying, &gstaying);
1736 //TODO if (gstaying == Adapter::getGlobalNumObjs()) return; // Nothing to do
1737 
1738  part_t *remap = NULL;
1739 
1740  int nedges = edges.size();
1741 
1742  // Gather the graph to rank 0.
1743  part_t tnVtx = np + nGlobalParts_; // total # vertices
1744  int *idx = NULL; // Pointer index into graph adjacencies
1745  int *sizes = NULL; // nedges per rank
1746  if (me == 0) {
1747  idx = new int[tnVtx+1];
1748  sizes = new int[np];
1749  sizes[0] = nedges;
1750  }
1751  if (np > 1)
1752  Teuchos::gather<int, int>(&nedges, 1, sizes, 1, 0, *comm_);
1753 
1754  // prefix sum to build the idx array
1755  if (me == 0) {
1756  idx[0] = 0;
1757  for (int i = 0; i < np; i++)
1758  idx[i+1] = idx[i] + sizes[i];
1759  }
1760 
1761  // prepare to send edges
1762  int cnt = 0;
1763  part_t *bufv = NULL;
1764  long *bufw = NULL;
1765  if (nedges) {
1766  bufv = new part_t[nedges];
1767  bufw = new long[nedges];
1768  // Create buffer with edges (me, part[i]) and weight edges[parts_[i]].
1769  for (typename std::map<part_t, long>::iterator it = edges.begin();
1770  it != edges.end(); it++) {
1771  bufv[cnt] = it->first; // target part
1772  bufw[cnt] = it->second; // weight
1773  cnt++;
1774  }
1775  }
1776 
1777  // Prepare to receive edges on rank 0
1778  part_t *adj = NULL;
1779  long *wgt = NULL;
1780  if (me == 0) {
1781 //SYM adj = new part_t[2*idx[np]]; // need 2x space to symmetrize later
1782 //SYM wgt = new long[2*idx[np]]; // need 2x space to symmetrize later
1783  adj = new part_t[idx[np]];
1784  wgt = new long[idx[np]];
1785  }
1786 
1787  Teuchos::gatherv<int, part_t>(bufv, cnt, adj, sizes, idx, 0, *comm_);
1788  Teuchos::gatherv<int, long>(bufw, cnt, wgt, sizes, idx, 0, *comm_);
1789  delete [] bufv;
1790  delete [] bufw;
1791 
1792  // Now have constructed graph on rank 0.
1793  // Call the matching algorithm
1794 
1795  int doRemap;
1796  if (me == 0) {
1797  // We have the "LHS" vertices of the bipartite graph; need to create
1798  // "RHS" vertices.
1799  for (int i = 0; i < idx[np]; i++) {
1800  adj[i] += np; // New RHS vertex number; offset by num LHS vertices
1801  }
1802 
1803  // Build idx for RHS vertices
1804  for (part_t i = np; i < tnVtx; i++) {
1805  idx[i+1] = idx[i]; // No edges for RHS vertices
1806  }
1807 
1808 #ifdef KDDKDD_DEBUG
1809  std::cout << "IDX ";
1810  for (part_t i = 0; i <= tnVtx; i++) std::cout << idx[i] << " ";
1811  std::cout << std::endl;
1812 
1813  std::cout << "ADJ ";
1814  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << adj[i] << " ";
1815  std::cout << std::endl;
1816 
1817  std::cout << "WGT ";
1818  for (part_t i = 0; i < idx[tnVtx]; i++) std::cout << wgt[i] << " ";
1819  std::cout << std::endl;
1820 #endif
1821 
1822  // Perform matching on the graph
1823  part_t *match = new part_t[tnVtx];
1824  for (part_t i = 0; i < tnVtx; i++) match[i] = i;
1825  part_t nmatches =
1826  Zoltan2::GreedyMWM<part_t, long>(idx, adj, wgt, tnVtx, match);
1827 
1828 #ifdef KDDKDD_DEBUG
1829  std::cout << "After matching: " << nmatches << " found" << std::endl;
1830  for (part_t i = 0; i < tnVtx; i++)
1831  std::cout << "match[" << i << "] = " << match[i]
1832  << ((match[i] != i &&
1833  (i < np && match[i] != i+np))
1834  ? " *" : " ")
1835  << std::endl;
1836 #endif
1837 
1838  // See whether there were nontrivial changes in the matching.
1839  bool nontrivial = false;
1840  if (nmatches) {
1841  for (part_t i = 0; i < np; i++) {
1842  if ((match[i] != i) && (match[i] != (i+np))) {
1843  nontrivial = true;
1844  break;
1845  }
1846  }
1847  }
1848 
1849  // Process the matches
1850  if (nontrivial) {
1851  remap = new part_t[nGlobalParts_];
1852  for (part_t i = 0; i < nGlobalParts_; i++) remap[i] = -1;
1853 
1854  bool *used = new bool[np];
1855  for (part_t i = 0; i < np; i++) used[i] = false;
1856 
1857  // First, process all matched parts
1858  for (part_t i = 0; i < nGlobalParts_; i++) {
1859  part_t tmp = i + np;
1860  if (match[tmp] != tmp) {
1861  remap[i] = match[tmp];
1862  used[match[tmp]] = true;
1863  }
1864  }
1865 
1866  // Second, process unmatched parts; keep same part number if possible
1867  for (part_t i = 0; i < nGlobalParts_; i++) {
1868  if (remap[i] > -1) continue;
1869  if (!used[i]) {
1870  remap[i] = i;
1871  used[i] = true;
1872  }
1873  }
1874 
1875  // Third, process unmatched parts; give them the next unused part
1876  for (part_t i = 0, uidx = 0; i < nGlobalParts_; i++) {
1877  if (remap[i] > -1) continue;
1878  while (used[uidx]) uidx++;
1879  remap[i] = uidx;
1880  used[uidx] = true;
1881  }
1882  delete [] used;
1883  }
1884  delete [] match;
1885 
1886 #ifdef KDDKDD_DEBUG
1887  std::cout << "Remap vector: ";
1888  for (part_t i = 0; i < nGlobalParts_; i++) std::cout << remap[i] << " ";
1889  std::cout << std::endl;
1890 #endif
1891 
1892  long newgstaying = measure_stays(remap, idx, adj, wgt,
1893  nGlobalParts_, np);
1894  doRemap = (newgstaying > gstaying);
1895  std::cout << "gstaying " << gstaying << " measure(input) "
1896  << measure_stays(NULL, idx, adj, wgt, nGlobalParts_, np)
1897  << " newgstaying " << newgstaying
1898  << " nontrivial " << nontrivial
1899  << " doRemap " << doRemap << std::endl;
1900  }
1901  delete [] idx;
1902  delete [] sizes;
1903  delete [] adj;
1904  delete [] wgt;
1905 
1906  Teuchos::broadcast<int, int>(*comm_, 0, 1, &doRemap);
1907 
1908  if (doRemap) {
1909  if (me != 0) remap = new part_t[nGlobalParts_];
1910  Teuchos::broadcast<int, part_t>(*comm_, 0, nGlobalParts_, remap);
1911  for (size_t i = 0; i < len; i++) {
1912  parts_[i] = remap[parts_[i]];
1913  }
1914  }
1915 
1916  delete [] remap; // TODO May want to keep for repartitioning as in Zoltan
1917 }
1918 
1919 
1920 } // namespace Zoltan2
1921 
1922 #endif
Zoltan2::BasicUserTypes< zscalar_t, zlno_t, zgno_t > user_t
Definition: Metric.cpp:74
Defines the Environment class.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Greedy Maximal Weight Matching.
Defines the Solution base class.
Algorithm defines the base class for all algorithms.
A PartitioningSolution is a solution to a partitioning problem.
virtual bool isPartitioningTreeBinary() const
calculate if partition tree is binary.
const RCP< const Comm< int > > & getCommunicator() const
Return the communicator associated with the solution.
const part_t * getPartListView() const
Returns the part list corresponding to the global ID list.
int getNumberOfCriteria() const
Get the number of criteria (object weights)
scalar_t getLocalFractionOfPart() const
If parts are divided across processes, return the fraction of a part on this process.
void RemapParts()
Remap a new partition for maximum overlap with an input partition.
std::vector< Zoltan2::coordinateModelPartBox > & getPartBoxesView() const
returns the part box boundary list.
size_t getActualGlobalNumberOfParts() const
Returns the actual global number of parts provided in setParts().
size_t getLocalNumberOfParts() const
Returns the number of parts to be assigned to this process.
PartitioningSolution(const RCP< const Environment > &env, const RCP< const Comm< int > > &comm, int nUserWeights, const RCP< Algorithm< Adapter > > &algorithm=Teuchos::null)
Constructor when part sizes are not supplied.
const int * getProcListView() const
Returns the process list corresponding to the global ID list.
void getPartitionTree(part_t &numTreeVerts, std::vector< part_t > &permPartNums, std::vector< part_t > &splitRangeBeg, std::vector< part_t > &splitRangeEnd, std::vector< part_t > &treeVertParents) const
get the partition tree - fill the relevant arrays
scalar_t getCriteriaPartSize(int idx, part_t part) const
Get the size for a given weight index and a given part.
void boxAssign(int dim, scalar_t *lower, scalar_t *upper, size_t &nPartsFound, part_t **partsFound) const
Return an array of all parts overlapping a given box in space.
void setParts(ArrayRCP< part_t > &partList)
The algorithm uses setParts to set the solution.
const int * getPartDistribution() const
Return a distribution by part.
long measure_stays(part_t *remap, int *idx, part_t *adj, long *wgt, part_t nrhs, part_t nlhs)
bool oneToOnePartDistribution() const
Is the part-to-process distribution is one-to-one.
bool criteriaHaveSamePartSizes(int c1, int c2) const
Return true if the two weight indices have the same part size information.
const part_t * getProcDistribution() const
Return a distribution by process.
part_t pointAssign(int dim, scalar_t *point) const
Return the part overlapping a given point in space;.
void getProcsForPart(part_t partId, part_t &procMin, part_t &procMax) const
Get the processes containing a part.
void getPartsForProc(int procId, double &numParts, part_t &partMin, part_t &partMax) const
Get the parts belonging to a process.
size_t getTargetGlobalNumberOfParts() const
Returns the global number of parts desired in the solution.
void getCommunicationGraph(ArrayRCP< part_t > &comXAdj, ArrayRCP< part_t > &comAdj) const
returns communication graph resulting from geometric partitioning.
const RCP< const Environment > & getEnvironment() const
Return the environment associated with the solution.
bool criteriaHasUniformPartSizes(int idx) const
Determine if balancing criteria has uniform part sizes. (User can specify differing part sizes....
Just a placeholder for now.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
Created by mbenlioglu on Aug 31, 2020.
static const std::string fail
@ DETAILED_STATUS
sub-steps, each method's entry and exit
@ BASIC_ASSERTION
fast typical checks for valid arguments
@ COMPLEX_ASSERTION
more involved, like validate a graph
#define epsilon
Definition: nd.cpp:82
SparseMatrixAdapter_t::part_t part_t