Zoltan2
Zoltan2_AlgHybridD1.hpp
Go to the documentation of this file.
1 #ifndef _ZOLTAN2_ALGHYBRIDD1_HPP_
2 #define _ZOLTAN2_ALGHYBRIDD1_HPP_
3 
4 #include <vector>
5 #include <unordered_map>
6 #include <iostream>
7 #include <fstream>
8 #include <queue>
9 #ifdef _WIN32
10 #include <time.h>
11 #else
12 #include <sys/time.h>
13 #endif
14 #include <algorithm>
15 
16 #include "Zoltan2_Algorithm.hpp"
17 #include "Zoltan2_GraphModel.hpp"
19 #include "Zoltan2_Util.hpp"
20 #include "Zoltan2_TPLTraits.hpp"
21 #include "Zoltan2_AlltoAll.hpp"
22 
23 #include "Tpetra_Core.hpp"
24 #include "Teuchos_RCP.hpp"
25 #include "Tpetra_Import.hpp"
26 #include "Tpetra_FEMultiVector.hpp"
27 
28 #include "KokkosKernels_Handle.hpp"
29 #include "KokkosKernels_IOUtils.hpp"
30 #include "KokkosGraph_Distance1Color.hpp"
31 #include "KokkosGraph_Distance1ColorHandle.hpp"
32 #include <stdlib.h>
33 
37 
38 namespace Zoltan2 {
39 
40 template <typename Adapter>
41 class AlgDistance1 : public Algorithm<Adapter>
42 {
43  public:
44 
45  using lno_t = typename Adapter::lno_t;
46  using gno_t = typename Adapter::gno_t;
47  using offset_t = typename Adapter::offset_t;
48  using scalar_t = typename Adapter::scalar_t;
50  using map_t = Tpetra::Map<lno_t, gno_t>;
51  using femv_scalar_t = int;
52  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
53  using device_type = Tpetra::Map<>::device_type;
54  using execution_space = Tpetra::Map<>::execution_space;
55  using memory_space = Tpetra::Map<>::memory_space;
56  using host_exec = typename Kokkos::View<device_type>::HostMirror::execution_space;
57  using host_mem = typename Kokkos::View<device_type>::HostMirror::memory_space;
58  double timer() {
59  struct timeval tp;
60  gettimeofday(&tp, NULL);
61  return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
62  }
63 
64  private:
65 
66  void buildModel(modelFlag_t &flags);
67 
68  //function to invoke KokkosKernels distance-1 coloring
69  //
70  //OUTPUT ARG:
71  //
72  // femv: FEMultiVector containing dual-view of colors.
73  // After this call each vertex will have a locally-valid color.
74  //
75  //INPUT ARGS:
76  //
77  // nVtx: number of local owned vertices
78  //
79  // adjs_view: CSR adjacencies for the local graph
80  //
81  // offset_view: CSR offsets for indexing adjs_view
82  //
83  // vertex_list: list of local IDs of vertices that
84  // need to be recolored
85  //
86  // vertex_list_size: 0 means no vertex list given,
87  // otherwise it simply is the size
88  // of the vertex_list
89  //
90  // recolor: switches between VB_BIT and EB KokkosKernels
91  // algorithms
92  //
93  template <class ExecutionSpace, typename MemorySpace>
94  void colorInterior(const size_t nVtx,
95  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace,MemorySpace> > adjs_view,
96  Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > offset_view,
97  Teuchos::RCP<femv_t> femv,
98  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace> > vertex_list,
99  size_t vertex_list_size = 0,
100  bool recolor=false){
101 
102  //set up types to be used by KokkosKernels
103  using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle
104  <offset_t, lno_t, lno_t, ExecutionSpace, MemorySpace,
105  MemorySpace>;
106  using lno_row_view_t = Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
107  using lno_nnz_view_t = Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>>;
108 
109  KernelHandle kh;
110 
111  //pick which KokkosKernels algorithm to use.
112  //VBBIT is more efficient for inputs with max degree < ~6000
113  //EB is more efficient for inputs with max degree > ~6000
114  if(recolor){
115  kh.create_graph_coloring_handle(KokkosGraph::COLORING_VBBIT);
116  } else {
117  kh.create_graph_coloring_handle(KokkosGraph::COLORING_EB);
118  }
119 
120  //vertex_list_size indicates whether we have provided a list of vertices to recolor.
121  //Only makes a difference if the algorithm to be used is VBBIT.
122  if(vertex_list_size != 0){
123  kh.get_graph_coloring_handle()->set_vertex_list(vertex_list,vertex_list_size);
124  }
125 
126  kh.set_verbose(verbose);
127 
128  //set the initial coloring of the kh.get_graph_coloring_handle() to be
129  //the data view from the femv.
130  auto femvColors = femv->template getLocalView<Kokkos::Device<ExecutionSpace,MemorySpace> >(Tpetra::Access::ReadWrite);
131  auto sv = subview(femvColors, Kokkos::ALL, 0);
132  kh.get_graph_coloring_handle()->set_vertex_colors(sv);
133  kh.get_graph_coloring_handle()->set_tictoc(verbose);
134  KokkosGraph::Experimental::graph_color_symbolic<KernelHandle, lno_row_view_t, lno_nnz_view_t>
135  (&kh, nVtx, nVtx, offset_view, adjs_view);
136  //numColors = kh.get_graph_coloring_handle()->get_num_colors();
137 
138  if(verbose){
139  std::cout<<"\nKokkosKernels Coloring: "<<kh.get_graph_coloring_handle()->get_overall_coloring_time()<<" iterations: "<<kh.get_graph_coloring_handle()->get_num_phases()<<"\n\n";
140  }
141  }
142 
143  public:
144  //contains all distance-1 conflict detection logic
145  //
146  //OUTPUT ARGS:
147  //
148  // femv_colors: This function uncolors vertices that have
149  // distance-1 conflicts, so these colors will
150  // change if there are any conflicts present
151  //
152  //INPUT ARGS:
153  //
154  // n_local: number of locally owned vertices
155  //
156  // n_ghosts: number of ghosts on this process
157  //
158  // dist_offsets: CSR offsets of the local graph
159  //
160  // dist_adjs: CSR adjacencies of the local graph
161  //
162  // verts_to_send_view: Used to construct a list of verts to send on device.
163  //
164  // verts_to_send_size_atomic: atomic version of the verts_to_send_size view
165  // Used to construct a list on device,
166  // the atomic is necessary for correctness.
167  //
168  // recoloringSize: holds the total amount of recoloring that will be done
169  // locally. Does not need to be atomic.
170  //
171  // rand: view that holds the tie-breaking random numbers indexed by LID.
172  //
173  // gid: view that holds GIDs, for tie breaking in the case that rand
174  // numbers are the same for two vertices.
175  //
176  // ghost_degrees: view that holds degrees only for ghost vertices.
177  // LID n_local corresponds to ghost_degrees(0).
178  //
179  // recolor_degrees: if true, we factor degrees into the conflict detection
180  // if false, we resolve using only consistent random numbers.
181  //
182  template <class ExecutionSpace, typename MemorySpace>
183  void detectConflicts(const size_t n_local, const size_t n_ghosts,
184  Kokkos::View<offset_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_offsets,
185  Kokkos::View<lno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> dist_adjs,
186  Kokkos::View<int*, Kokkos::Device<ExecutionSpace, MemorySpace>> femv_colors,
187  Kokkos::View<lno_t*,
188  Kokkos::Device<ExecutionSpace, MemorySpace>> verts_to_send_view,
189  Kokkos::View<size_t*,
190  Kokkos::Device<ExecutionSpace, MemorySpace>,
191  Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
192  Kokkos::View<gno_t*, Kokkos::Device<ExecutionSpace, MemorySpace>> recoloringSize,
193  Kokkos::View<int*,
194  Kokkos::Device<ExecutionSpace, MemorySpace>> rand,
195  Kokkos::View<gno_t*,
196  Kokkos::Device<ExecutionSpace, MemorySpace>> gid,
197  Kokkos::View<gno_t*,
198  Kokkos::Device<ExecutionSpace, MemorySpace>> ghost_degrees,
199  bool recolor_degrees){
200  gno_t local_recoloring_size;
201  Kokkos::parallel_reduce("Conflict Detection",
202  Kokkos::RangePolicy<ExecutionSpace>(0,n_ghosts),
203  KOKKOS_LAMBDA(const int& i,
204  gno_t& recoloring_size){
205  lno_t lid = i+n_local;
206  int currColor = femv_colors(lid);
207  int currDegree = ghost_degrees(i);
208  for(offset_t j = dist_offsets(lid); j < dist_offsets(lid+1); j++){
209  int nborColor = femv_colors(dist_adjs(j));
210  int nborDegree = 0;
211  if((size_t)dist_adjs(j) < n_local) nborDegree = dist_offsets(dist_adjs(j)+1) - dist_offsets(dist_adjs(j));
212  else nborDegree = ghost_degrees(dist_adjs(j) - n_local);
213  if(currColor == nborColor ){
214  if(currDegree < nborDegree && recolor_degrees){
215  femv_colors(lid) = 0;
216  recoloring_size++;
217  }else if (nborDegree < currDegree && recolor_degrees){
218  femv_colors(dist_adjs(j)) = 0;
219  recoloring_size++;
220  }else if(rand(lid) > rand(dist_adjs(j))){
221  femv_colors(lid) = 0;
222  recoloring_size++;
223  break;
224  }else if (rand(dist_adjs(j)) > rand(lid)){
225  femv_colors(dist_adjs(j)) = 0;
226  recoloring_size++;
227  } else {
228  if (gid(lid) >= gid(dist_adjs(j))){
229  femv_colors(lid) = 0;
230  recoloring_size++;
231  break;
232  } else {
233  femv_colors(dist_adjs(j)) = 0;
234  recoloring_size++;
235  }
236  }
237  }
238  }
239  },local_recoloring_size);
240  Kokkos::deep_copy(recoloringSize, local_recoloring_size);
241  Kokkos::fence();
242  Kokkos::parallel_for("Rebuild verts_to_send_view",
243  Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
244  KOKKOS_LAMBDA(const int& i){
245  if(femv_colors(i) == 0){
246  verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
247  }
248  });
249  Kokkos::fence();
250  }
251 
252  private:
253  //Communicates owned vertex colors to ghost copies.
254  //
255  //RETURN VALUE:
256  //
257  // returns the time it took for the communication call to complete
258  //
259  //OUTPUT ARGS:
260  //
261  // femv: FEMultivector that holds the dual-view of the colors
262  // After this call, ghost colors will be up-to-date.
263  //
264  // recv: returns the size of the recv buffer
265  //
266  // send: returns the size of the send buffer
267  //
268  //INPUT ARGS:
269  //
270  // mapOwnedPlusGhosts: a Tpetra map that translates between
271  // LID and GID for any vertex on this process.
272  //
273  // nVtx: the number of locally owned vertices.
274  //
275  // verts_to_send: hostmirror of verts to send. This function sends
276  // all vertices in this list to their ghost copies.
277  //
278  // verts_to_send_size: hostmirror of verts_to_send_size, holds the
279  // size of verts_to_send.
280  //
281  // procs_to_send: map that translates LID into a list of processes that
282  // have a ghost copy of the vertex.
283  //
284  double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
285  size_t nVtx,
286  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send,
287  typename Kokkos::View<size_t*, device_type>::HostMirror& verts_to_send_size,
288  Teuchos::RCP<femv_t> femv,
289  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
290  gno_t& recv, gno_t& send){
291  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
292  auto colors = subview(femvColors, Kokkos::ALL, 0);
293  int nprocs = comm->getSize();
294 
295  std::vector<int> sendcnts(comm->getSize(), 0);
296  std::vector<gno_t> sdispls(comm->getSize()+1, 0);
297  for(size_t i = 0; i < verts_to_send_size(0); i++){
298  for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
299  sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
300  }
301  }
302 
303  sdispls[0] = 0;
304  gno_t sendsize = 0;
305  std::vector<int> sentcount(nprocs, 0);
306 
307  for(int i = 1; i < comm->getSize()+1; i++){
308  sdispls[i] = sdispls[i-1] + sendcnts[i-1];
309  sendsize += sendcnts[i-1];
310  }
311  send = sendsize;
312  std::vector<gno_t> sendbuf(sendsize,0);
313 
314  for(size_t i = 0; i < verts_to_send_size(0); i++){
315  std::vector<int> procs = procs_to_send.at(verts_to_send(i));
316  for(size_t j = 0; j < procs.size(); j++){
317  size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
318  sentcount[procs[j]] += 2;
319  sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
320  sendbuf[idx] = colors(verts_to_send(i));
321  }
322  }
323 
324  Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
325  Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
326  Teuchos::ArrayRCP<gno_t> recvbuf;
327  std::vector<int> recvcnts(comm->getSize(), 0);
328  Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);
329 
330  //if we're reporting timings, remove the computation imbalance from the comm timer.
331  if(timing) comm->barrier();
332  double comm_total = 0.0;
333  double comm_temp = timer();
334 
335  AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
336  comm_total += timer() - comm_temp;
337 
338  gno_t recvsize = 0;
339 
340  for(int i = 0; i < recvcnts_view.size(); i++){
341  recvsize += recvcnts_view[i];
342  }
343  recv = recvsize;
344  for(int i = 0; i < recvsize; i+=2){
345  size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
346  if(lid<nVtx && verbose) std::cout<<comm->getRank()<<": received a locally owned vertex, somehow\n";
347  colors(lid) = recvbuf[i+1];
348  }
349 
350  return comm_total;
351  }
352 
353  RCP<const base_adapter_t> adapter;
354  RCP<GraphModel<base_adapter_t> > model;
355  RCP<Teuchos::ParameterList> pl;
356  RCP<Environment> env;
357  RCP<const Teuchos::Comm<int> > comm;
358  bool verbose;
359  bool timing;
360  public:
361  //constructor for the hybrid distributed distance-1 algorithm
363  const RCP<const base_adapter_t> &adapter_,
364  const RCP<Teuchos::ParameterList> &pl_,
365  const RCP<Environment> &env_,
366  const RCP<const Teuchos::Comm<int> > &comm_)
367  : adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
368  verbose = pl->get<bool>("verbose",false);
369  timing = pl->get<bool>("timing", false);
370  if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
371  modelFlag_t flags;
372  flags.reset();
373  buildModel(flags);
374  if(verbose) std::cout<<comm->getRank()<<": done constructing coloring class\n";
375  }
376 
377 
378  //Main entry point for graph coloring
379  void color( const RCP<ColoringSolution<Adapter> > &solution ) {
380  if(verbose) std::cout<<comm->getRank()<<": inside coloring function\n";
381  //this will color the global graph in a manner similar to Zoltan
382 
383  //get vertex GIDs in a locally indexed array
384  if(verbose) std::cout<<comm->getRank()<<": getting owned vtxIDs\n";
385  ArrayView<const gno_t> vtxIDs;
386  ArrayView<StridedData<lno_t, scalar_t> > vwgts;
387  size_t nVtx = model->getVertexList(vtxIDs, vwgts);
388  //we do not use weights at this point
389  if(verbose) std::cout<<comm->getRank()<<": getting edge list\n";
390  //get edge information from the model
391  ArrayView<const gno_t> adjs;
392  ArrayView<const offset_t> offsets;
393  ArrayView<StridedData<lno_t, scalar_t> > ewgts;
394  model->getEdgeList(adjs, offsets, ewgts);
395  //again, weights are not used
396 
397  RCP<const map_t> mapOwned;
398  RCP<const map_t> mapWithCopies;
399 
400  std::vector<gno_t> finalGIDs;
401  std::vector<offset_t> finalOffset_vec;
402  std::vector<lno_t> finalAdjs_vec;
403 
404  std::vector<lno_t> reorderToLocal;
405  for(size_t i = 0; i< nVtx; i++) reorderToLocal.push_back(i);
406  if(verbose) std::cout<<comm->getRank()<<": Setting up local datastructures\n";
407  //Set up a typical local mapping here.
408  std::unordered_map<gno_t,lno_t> globalToLocal;
409  std::vector<gno_t> ownedPlusGhosts;
410  for (gno_t i = 0; i < vtxIDs.size(); i++){
411  if(vtxIDs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative GID\n";
412  globalToLocal[vtxIDs[i]] = i;
413  ownedPlusGhosts.push_back(vtxIDs[i]);
414  }
415  gno_t nghosts = 0;
416  for (int i = 0; i < adjs.size(); i++){
417  if(globalToLocal.count(adjs[i]) == 0){
418  //new unique ghost found
419  if(adjs[i] < 0 && verbose) std::cout<<comm->getRank()<<": found a negative adjacency\n";
420  ownedPlusGhosts.push_back(adjs[i]);
421  globalToLocal[adjs[i]] = vtxIDs.size() + nghosts;
422  nghosts++;
423 
424  }
425  }
426  if(verbose) std::cout<<comm->getRank()<<": vec.max_size() = "<<finalAdjs_vec.max_size()<<", adjs.size() = "<<adjs.size()<<"\n";
427  finalAdjs_vec.resize(adjs.size());
428  for(size_t i = 0; i < finalAdjs_vec.size();i++){
429  finalAdjs_vec[i] = globalToLocal[adjs[i]];
430  }
431  for(int i = 0; i < offsets.size(); i++) finalOffset_vec.push_back(offsets[i]);
432  finalGIDs = ownedPlusGhosts;
433 
434 
435  if(verbose) std::cout<<comm->getRank()<<": creating Tpetra Maps\n";
436  Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
437  <Tpetra::global_size_t>::invalid();
438  mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));
439 
440  dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
441  mapWithCopies = rcp(new map_t(dummy,
442  Teuchos::arrayViewFromVector(ownedPlusGhosts),
443  0, comm));
444 
445  //create the FEMultiVector for the distributed communication.
446  //We also use the views from this datastructure as arguments to
447  //KokkosKernels coloring functions.
448  if(verbose)std::cout<<comm->getRank()<<": creating FEMultiVector\n";
449  typedef Tpetra::Import<lno_t, gno_t> import_t;
450  Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
451  mapWithCopies));
452  Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
453  importer, 1, true));
454  //Get color array to fill
455  ArrayRCP<int> colors = solution->getColorsRCP();
456  for(size_t i=0; i<nVtx; i++){
457  colors[i] = 0;
458  }
459 
460  //Create random numbers seeded on global IDs so that we don't
461  //need to communicate for consistency. These numbers determine
462  //which vertex gets recolored in the event of a conflict.
463  //taken directly from the Zoltan coloring implementation
464  std::vector<int> rand(finalGIDs.size());
465  for(size_t i = 0; i < finalGIDs.size(); i++){
466  std::srand(finalGIDs[i]);
467  rand[i] = std::rand();
468  }
469 
470  //find out who owns the ghosts on this process
471  std::vector<int> ghostOwners(finalGIDs.size() - nVtx);
472  std::vector<gno_t> ghosts(finalGIDs.size() - nVtx);
473  for(size_t i = nVtx; i < finalGIDs.size(); i++) ghosts[i-nVtx] = finalGIDs[i];
474  ArrayView<int> owners = Teuchos::arrayViewFromVector(ghostOwners);
475  ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts);
476  mapOwned->getRemoteIndexList(ghostGIDs, owners);
477 
478  //create const views of the CSR representation of the local graph
479  ArrayView<const offset_t> finalOffsets = Teuchos::arrayViewFromVector(finalOffset_vec);
480  ArrayView<const lno_t> finalAdjs = Teuchos::arrayViewFromVector(finalAdjs_vec);
481  ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
482  ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(finalGIDs);
483 
484  //find out which remote processes have a ghost copy of a local owned vertex.
485  Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
486  Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();
487 
488  //create a quick lookup from LID -> remote processes with a ghost copy
489  std::unordered_map<lno_t, std::vector<int>> procs_to_send;
490  for(int i = 0; i < exportLIDs.size(); i++){
491  procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
492  }
493 
494  // call coloring function
495  hybridGMB(nVtx, finalAdjs, finalOffsets,femv,gid_view,rand_view,owners,mapWithCopies, procs_to_send);
496 
497  //copy colors to the output array.
498  auto femvdata = femv->getData(0);
499  for(int i = 0; i < colors.size(); i++){
500  colors[reorderToLocal[i]] = femvdata[i];
501  }
502 
503  }
504 
505  //This function contains all coloring logic.
506  //
507  //OUTPUT ARGS:
508  //
509  // femv: FEMultiVector with a dual-view of the vertex colors.
510  // The colors will be a valid distance-1 coloring after this
511  // function returns.
512  //
513  //INPUT ARGS:
514  //
515  // nVtx: number of locally owned vertices
516  //
517  // adjs: CSR adjacencies for the local graph
518  //
519  // offsets: CSR offsets into adjs for the local graph
520  //
521  // gids: global IDs for every vertex on this process.
522  // indexed by local ID
523  //
524  // rand: random number generated for every vertex on
525  // this process. Indexed by local ID
526  //
527  // owners: owners of each ghost vertex.
528  // owners[i] = the owning process for vertex with GID gids[i+nVtx]
529  //
530  // mapOwnedPlusGhosts: Tpetra map that translates between LIDs and GIDs
531  //
532  // procs_to_send: maps from LIDs to Process IDs.
533  // procs_to_send[LID] gives a list of processes that have
534  // a ghost copy of LID.
535  //
536  void hybridGMB(const size_t nVtx,
537  const Teuchos::ArrayView<const lno_t>& adjs,
538  const Teuchos::ArrayView<const offset_t>& offsets,
539  const Teuchos::RCP<femv_t>& femv,
540  const Teuchos::ArrayView<const gno_t>& gids,
541  const Teuchos::ArrayView<const int>& rand,
542  const Teuchos::ArrayView<const int>& owners,
543  RCP<const map_t> mapOwnedPlusGhosts,
544  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
545  if(verbose) std::cout<<comm->getRank()<<": inside coloring algorithm\n";
546 
547  //initialize timers
548  double total_time = 0.0;
549  double interior_time = 0.0;
550  double comm_time = 0.0;
551  double comp_time = 0.0;
552  double recoloring_time = 0.0;
553  double conflict_detection = 0.0;
554 
555  const int numStatisticRecordingRounds = 100;
556 
557  //Put together local and remote degrees
558  //1. Communicate remote GIDs to owning processes
559  std::vector<int> deg_send_cnts(comm->getSize(),0);
560  std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
561  for(int i = 0; i < owners.size(); i++){
562  deg_send_cnts[owners[i]]++;
563  }
564  deg_sdispls[0] = 0;
565  gno_t deg_sendsize = 0;
566  std::vector<int> deg_sentcount(comm->getSize(),0);
567  for(int i = 1; i < comm->getSize()+1; i++){
568  deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
569  deg_sendsize += deg_send_cnts[i-1];
570  }
571  std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
572  for(int i = 0; i < owners.size(); i++){
573  size_t idx = deg_sdispls[owners[i]] + deg_sentcount[owners[i]];
574  deg_sentcount[owners[i]]++;
575  deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+nVtx);
576  }
577  Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
578  Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
579  Teuchos::ArrayRCP<gno_t> deg_recvbuf;
580  std::vector<int> deg_recvcnts(comm->getSize(),0);
581  Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
582  AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
583 
584  //2. replace GID with local degree
585  for(int i = 0; i < deg_recvbuf.size(); i++){
586  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
587  deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
588  }
589  //3. send modified buffer back
590  ArrayRCP<gno_t> ghost_degrees;
591  AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
592  //determine max degree locally and globally
593  Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
594  typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
595  for(int i = 0; i < ghost_degrees.size(); i++){
596  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
597  ghost_degrees_host(lid-nVtx) = ghost_degrees[i];
598  }
599  Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
600 
601  offset_t local_max_degree = 0;
602  offset_t global_max_degree = 0;
603  for(size_t i = 0; i < nVtx; i++){
604  offset_t curr_degree = offsets[i+1] - offsets[i];
605  if(curr_degree > local_max_degree){
606  local_max_degree = curr_degree;
607  }
608  }
609  Teuchos::reduceAll<int, offset_t>(*comm,Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
610  if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
611  if(verbose)std::cout<<comm->getRank()<<": creating Kokkos Views\n";
612 
613  Kokkos::View<offset_t*, device_type> dist_degrees("Owned+Ghost degree view",rand.size());
614  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees);
615  //set degree counts for ghosts
616  for(int i = 0; i < adjs.size(); i++){
617  if((size_t)adjs[i] < nVtx) continue;
618  dist_degrees_host(adjs[i])++;
619  }
620  //set degree counts for owned verts
621  for(int i = 0; i < offsets.size()-1; i++){
622  dist_degrees_host(i) = offsets[i+1] - offsets[i];
623  }
624 
625  Kokkos::View<offset_t*, device_type> dist_offsets("Owned+Ghost Offset view", rand.size()+1);
626  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets);
627 
628  //set offsets and total # of adjacencies
629  dist_offsets_host(0) = 0;
630  uint64_t total_adjs = 0;
631  for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
632  dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
633  total_adjs+= dist_degrees_host(i-1);
634  }
635 
636  Kokkos::View<lno_t*, device_type> dist_adjs("Owned+Ghost adjacency view", total_adjs);
637  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs);
638  //now, use the degree view as a counter
639  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
640  dist_degrees_host(i) = 0;
641  }
642  for(int i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
643  if(comm->getSize() > 1){
644  for(size_t i = 0; i < nVtx; i++){
645  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
646  //if the adjacency is a ghost
647  if( (size_t)adjs[j] >= nVtx){
648  //add the symmetric edge to its adjacency list (already accounted for by offsets)
649  dist_adjs_host(dist_offsets_host(adjs[j]) + dist_degrees_host(adjs[j])) = i;
650  dist_degrees_host(adjs[j])++;
651  }
652  }
653  }
654  }
655 
656  if(verbose) std::cout<<comm->getRank()<<": copying host mirrors to device views\n";
657  //copy the host data to device memory
658  Kokkos::deep_copy(dist_degrees, dist_degrees_host); //may be unnecessary
659  Kokkos::deep_copy(dist_offsets, dist_offsets_host);
660  Kokkos::deep_copy(dist_adjs, dist_adjs_host);
661  if(verbose) std::cout<<comm->getRank()<<": done copying to device\n";
662 
663  //counter in UVM memory for how many vertices need recoloring.
664  Kokkos::View<gno_t*, device_type> recoloringSize("Recoloring Queue Size",1);
665  typename Kokkos::View<gno_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
666  recoloringSize_host(0) = 0;
667  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
668 
669  //device copy of the random tie-breakers
670  Kokkos::View<int*,device_type> rand_dev("randVec",rand.size());
671  typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
672  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
673  rand_host(i) = rand[i];
674  }
675 
676  //device copy of global IDs
677  Kokkos::View<gno_t*, device_type> gid_dev("GIDs",gids.size());
678  typename Kokkos::View<gno_t*,device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
679  for(Teuchos_Ordinal i = 0; i < gids.size(); i++){
680  gid_host(i) = gids[i];
681  }
682 
683  //copy host views to device memory
684  Kokkos::deep_copy(rand_dev,rand_host);
685  Kokkos::deep_copy(gid_dev, gid_host);
686 
687  if(verbose)std::cout<<comm->getRank()<<": done creating recoloring datastructures\n";
688  //count boundary size to allocate list of vertices to recolor.
689  offset_t boundary_size = 0;
690  for(size_t i = 0; i < nVtx; i++){
691  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
692  if((size_t)adjs[j] >= nVtx) {
693  boundary_size++;
694  break;
695  }
696  }
697  }
698  if(verbose)std::cout<<comm->getRank()<<": creating send views\n";
699 
700  //list of vertices to send to remote processes
701  Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send",boundary_size);
702  Kokkos::parallel_for(boundary_size, KOKKOS_LAMBDA(const int& i){
703  verts_to_send_view(i) = -1;
704  });
705 
706  //size information for the list of vertices to send. Also includes an atomic copy
707  Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
708  Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic> > verts_to_send_size_atomic = verts_to_send_size;
709  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
710  typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
711  //initialize the device view with a value of zero
712  verts_to_send_size_host(0) = 0;
713  deep_copy(verts_to_send_size, verts_to_send_size_host);
714 
715  if(verbose)std::cout<<comm->getRank()<<": Done creating send views, initializing...\n";
716  if(verbose)std::cout<<comm->getRank()<<": boundary_size = "<<boundary_size<<" verts_to_send_size_atomic(0) = "<<verts_to_send_size_atomic(0)<<"\n";
717  //initially the verts to send include all boundary vertices.
718  Kokkos::parallel_for("Initialize verts_to_send",nVtx, KOKKOS_LAMBDA(const int&i){
719  for(offset_t j = dist_offsets(i); j < dist_offsets(i+1); j++){
720  if((size_t)dist_adjs(j) >= nVtx){
721  verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
722  break;
723  }
724  }
725  });
726  Kokkos::fence();
727 
728 
729  //used to replace the incorrectly recolored ghosts
730  Kokkos::View<int*, device_type> ghost_colors("ghost color backups", rand.size()-nVtx);
731  if(verbose)std::cout<<comm->getRank()<<": Done initializing\n";
732  gno_t sentPerRound[numStatisticRecordingRounds];
733  gno_t recvPerRound[numStatisticRecordingRounds];
734 
735  if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
736  //initialize interior and total timers, barrier to prevent any imbalance from setup.
737  //Only use a barrier if timing is happening.
738  if(timing) comm->barrier();
739  interior_time = timer();
740  total_time = timer();
741  //call the KokkosKernels coloring function with the Tpetra default spaces.
742  bool use_vbbit = (global_max_degree < 6000);
743  this->colorInterior<execution_space,memory_space>
744  (nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
745  if(timing){
746  interior_time = timer() - interior_time;
747  comp_time = interior_time;
748  }
749  if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
750  bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);
751 
752  //if there is more than a single process, check distributed conflicts and recolor
753  if(comm->getSize() > 1){
754 
755  if(verbose)std::cout<<comm->getRank()<<": going to communicate\n";
756 
757  //communicate
758  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
759  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
760  gno_t recv, sent;
761  comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
762  nVtx,
763  verts_to_send_host,
764  verts_to_send_size_host,
765  femv,
766  procs_to_send,
767  recv,
768  sent);
769  sentPerRound[0] = sent;
770  recvPerRound[0] = recv;
771  if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
772  verts_to_send_size_host(0) = 0;
773  deep_copy(verts_to_send_size, verts_to_send_size_host);
774  //set the old ghost colors
775  //get the colors from the femv
776  Kokkos::View<int**, Kokkos::LayoutLeft, device_type> femvColors =
777  femv->template getLocalView<device_type>(Tpetra::Access::ReadWrite); // Partial write
778  Kokkos::View<int*, device_type> femv_colors = subview(femvColors, Kokkos::ALL, 0);
779  Kokkos::parallel_for(rand.size()-nVtx,KOKKOS_LAMBDA(const int& i){
780  ghost_colors(i) = femv_colors(i+nVtx);
781  });
782  Kokkos::fence();
783  //detect conflicts on the device, uncolor conflicts consistently.
784  double temp = timer();
785  detectConflicts<execution_space, memory_space>(nVtx,
786  rand.size()-nVtx,
787  dist_offsets,
788  dist_adjs,
789  femv_colors,
790  verts_to_send_view,
791  verts_to_send_size_atomic,
792  recoloringSize,
793  rand_dev,
794  gid_dev,
795  ghost_degrees_dev,
796  recolor_degrees);
797  deep_copy(recoloringSize_host, recoloringSize);
798  conflict_detection += timer() - temp;
799  comp_time += conflict_detection;
800  }
801  //end the initial recoloring
802  if(verbose)std::cout<<comm->getRank()<<": done initial recoloring, begin recoloring loop\n";
803  double totalPerRound[numStatisticRecordingRounds];
804  double commPerRound[numStatisticRecordingRounds];
805  double compPerRound[numStatisticRecordingRounds];
806  double recoloringPerRound[numStatisticRecordingRounds];
807  double conflictDetectionPerRound[numStatisticRecordingRounds];
808  double serialRecoloringPerRound[numStatisticRecordingRounds];
809  int vertsPerRound[numStatisticRecordingRounds];
810  bool done = false; //We're only done when all processors are done
811  if(comm->getSize() == 1) done = true;
812  totalPerRound[0] = interior_time + comm_time + conflict_detection;
813  recoloringPerRound[0] = 0;
814  commPerRound[0] = comm_time;
815  compPerRound[0] = interior_time + conflict_detection;
816  conflictDetectionPerRound[0] = conflict_detection;
817  recoloringPerRound[0] = 0;
818  vertsPerRound[0] = 0;
819  int distributedRounds = 1; //this is the same across all processors
820  int serial_threshold = this->pl->template get<int>("serial_threshold",0);
821 
822  Kokkos::View<lno_t*, device_type> verts_to_recolor("verts_to_recolor", boundary_size);
823  typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
824  //while the queue is not empty
825  while(recoloringSize_host(0) > 0 || !done){
826  if(recoloringSize_host(0) < serial_threshold) break;
827  //get a subview of the colors:
828  auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
829  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
830  //color everything in the recoloring queue, put everything on conflict queue
831  if(distributedRounds < numStatisticRecordingRounds) {
832  vertsPerRound[distributedRounds] = recoloringSize_host(0);
833  }
834 
835  //copying the send view to the recolor view is necessary because
836  //KokkosKernels can change the view passed in, and we need the send view
837  //intact for communication.
838  Kokkos::deep_copy(verts_to_recolor, verts_to_send_view);
839 
840  double recolor_temp = timer();
841  //use KokkosKernels to recolor the conflicting vertices.
842  deep_copy(verts_to_send_size_host, verts_to_send_size);
843  if(verts_to_send_size_host(0) > 0){
844  this->colorInterior<execution_space,
845  memory_space>(femv_colors.size(),
846  dist_adjs,dist_offsets,
847  femv,
848  verts_to_recolor,
849  verts_to_send_size_host(0),
850  use_vbbit);
851  }
852 
853  if(distributedRounds < numStatisticRecordingRounds){
854  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
855  recoloring_time += recoloringPerRound[distributedRounds];
856  comp_time += recoloringPerRound[distributedRounds];
857  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
858  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
859  } else if(timing) {
860  double recolor_round_time = timer() - recolor_temp;
861  recoloring_time += recolor_round_time;
862  comp_time += recolor_round_time;
863  }
864 
865  //reset the recoloringSize device host and device views
866  //to zero
867  recoloringSize_host(0) = 0;
868  Kokkos::deep_copy(recoloringSize,recoloringSize_host);
869 
870  Kokkos::parallel_for(rand.size()-nVtx, KOKKOS_LAMBDA(const int& i){
871  femv_colors(i+nVtx) = ghost_colors(i);
872  });
873  Kokkos::fence();
874  //communicate
875  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
876  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
877  gno_t sent,recv;
878  // Reset device views
879  femvColors = decltype(femvColors)();
880  femv_colors = decltype(femv_colors)();
881 
882  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
883  nVtx,
884  verts_to_send_host,
885  verts_to_send_size_host,
886  femv,
887  procs_to_send,
888  recv,
889  sent);
890  comm_time += curr_comm_time;
891  if(distributedRounds < numStatisticRecordingRounds){
892  commPerRound[distributedRounds] = curr_comm_time;
893  sentPerRound[distributedRounds] = sent;
894  recvPerRound[distributedRounds] = recv;
895  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
896  }
897  //detect conflicts in parallel. For a detected conflict,
898  //reset the vertex-to-be-recolored's color to 0, in order to
899  //allow KokkosKernels to recolor correctly.
900 
901  femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
902  femv_colors = subview(femvColors, Kokkos::ALL, 0);
903  Kokkos::parallel_for(rand.size()-nVtx, KOKKOS_LAMBDA(const int& i){
904  ghost_colors(i) = femv_colors(i+nVtx);
905  });
906  Kokkos::fence();
907  verts_to_send_size_host(0) = 0;
908  deep_copy(verts_to_send_size, verts_to_send_size_host);
909  double detection_temp = timer();
910  detectConflicts<execution_space, memory_space>(nVtx,
911  rand.size()-nVtx,
912  dist_offsets,
913  dist_adjs,
914  femv_colors,
915  verts_to_send_view,
916  verts_to_send_size_atomic,
917  recoloringSize,
918  rand_dev,
919  gid_dev,
920  ghost_degrees_dev,
921  recolor_degrees);
922 
923  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
924 
925  if(distributedRounds < numStatisticRecordingRounds){
926  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
927  conflict_detection += conflictDetectionPerRound[distributedRounds];
928  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
929  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
930  comp_time += conflictDetectionPerRound[distributedRounds];
931  } else if(timing){
932  double conflict_detection_round_time = timer()- detection_temp;
933  conflict_detection += conflict_detection_round_time;
934  comp_time += conflict_detection_round_time;
935  }
936  //do a reduction to determine if we're done
937  int globalDone = 0;
938  int localDone = recoloringSize_host(0);
939  Teuchos::reduceAll<int, int>(*comm,Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
940  //We're only allowed to stop once everyone has no work to do.
941  //collectives will hang if one process exits.
942  distributedRounds++;
943  done = !globalDone;
944  }
945 
946  if(recoloringSize_host(0) > 0 || !done){
947  ghost_colors_host = Kokkos::create_mirror_view(ghost_colors);
948  deep_copy(ghost_colors_host, ghost_colors);
949  deep_copy(verts_to_send_host, verts_to_send_view);
950  deep_copy(verts_to_send_size_host, verts_to_send_size);
951  }
952 
953 
954  //finish the local coloring in serial on the host
955  while(recoloringSize_host(0) > 0 || !done){
956  //Use non-templated call to get the Host view
957  auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
958  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
959  /*Kokkos::View<int*, Kokkos::Device<Kokkos::Serial, Kokkos::HostSpace>>femv_colors_host = create_mirror(femv_colors);
960  Kokkos::deep_copy(femv_colors_host, femv_colors);*/
961  if(distributedRounds < 100){
962  vertsPerRound[distributedRounds] = recoloringSize_host(0);
963  }
964 
965  double recolor_temp = timer();
966  //use KokkosKernels to recolor the conflicting vertices
967  if(verts_to_send_size_host(0) > 0){
968  this->colorInterior<host_exec,
969  host_mem>
970  (femv_colors.size(), dist_adjs_host, dist_offsets_host, femv, verts_to_send_host, verts_to_send_size_host(0), true);
971  }
972 
973  if(distributedRounds < numStatisticRecordingRounds){
974  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
975  recoloring_time += recoloringPerRound[distributedRounds];
976  comp_time += recoloringPerRound[distributedRounds];
977  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
978  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
979  } else if(timing){
980  double recolor_serial_round_time = timer() - recolor_temp;
981  recoloring_time += recolor_serial_round_time;
982  comp_time += recolor_serial_round_time;
983  }
984 
985  recoloringSize_host(0) = 0;
986 
987  for(size_t i = 0; i < rand.size() -nVtx; i++){
988  femv_colors(i+nVtx) = ghost_colors_host(i);
989  }
990 
991  gno_t sent,recv;
992  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,
993  nVtx,
994  verts_to_send_host,
995  verts_to_send_size_host,
996  femv,
997  procs_to_send,
998  recv,
999  sent);
1000  comm_time += curr_comm_time;
1001 
1002  if(distributedRounds < numStatisticRecordingRounds){
1003  commPerRound[distributedRounds] = curr_comm_time;
1004  sentPerRound[distributedRounds] = sent;
1005  recvPerRound[distributedRounds] = recv;
1006  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1007  }
1008  for(size_t i = 0; i < rand.size()-nVtx; i++){
1009  ghost_colors_host(i) = femv_colors(i+nVtx);
1010  }
1011 
1012  verts_to_send_size_host(0) = 0;
1013  double detection_temp = timer();
1014  detectConflicts<host_exec, host_mem>(nVtx,
1015  rand.size()-nVtx,
1016  dist_offsets_host,
1017  dist_adjs_host,
1018  femv_colors,
1019  verts_to_send_host,
1020  verts_to_send_size_host,
1021  recoloringSize_host,
1022  rand_host,
1023  gid_host,
1024  ghost_degrees_host,
1025  recolor_degrees);
1026  if(distributedRounds < numStatisticRecordingRounds){
1027  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1028  conflict_detection += conflictDetectionPerRound[distributedRounds];
1029  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1030  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1031  comp_time += conflictDetectionPerRound[distributedRounds];
1032  } else if(timing){
1033  double conflict_detection_serial_round_time = timer() - detection_temp;
1034  conflict_detection += conflict_detection_serial_round_time;
1035  comp_time += conflict_detection_serial_round_time;
1036  }
1037  //do a reduction to determine if we're done
1038  int globalDone = 0;
1039  int localDone = recoloringSize_host(0);
1040  Teuchos::reduceAll<int, int>(*comm, Teuchos::REDUCE_SUM,1, &localDone, &globalDone);
1041  distributedRounds++;
1042  done = !globalDone;
1043  }
1044  total_time = timer() - total_time;
1045 
1046  //only take time to compute statistics if the user wants them
1047  if(verbose){
1048  std::cout<<comm->getRank()<<": done recoloring loop, computing statistics\n";
1049  int localBoundaryVertices = 0;
1050  for(size_t i = 0; i < nVtx; i++){
1051  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1052  if((size_t)adjs[j] >= nVtx){
1053  localBoundaryVertices++;
1054  break;
1055  }
1056  }
1057  }
1058  //print how many rounds of speculating/correcting happened (this should be the same for all ranks):
1059  if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1060  int totalBoundarySize = 0;
1061  int totalVertsPerRound[numStatisticRecordingRounds];
1062  double finalTotalPerRound[numStatisticRecordingRounds];
1063  double maxRecoloringPerRound[numStatisticRecordingRounds];
1064  double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
1065  double minRecoloringPerRound[numStatisticRecordingRounds];
1066  double finalCommPerRound[numStatisticRecordingRounds];
1067  double finalCompPerRound[numStatisticRecordingRounds];
1068  double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1069  gno_t finalRecvPerRound[numStatisticRecordingRounds];
1070  gno_t finalSentPerRound[numStatisticRecordingRounds];
1071  for(int i = 0; i < numStatisticRecordingRounds; i++) {
1072  totalVertsPerRound[i] = 0;
1073  finalTotalPerRound[i] = 0.0;
1074  maxRecoloringPerRound[i] = 0.0;
1075  minRecoloringPerRound[i] = 0.0;
1076  finalCommPerRound[i] = 0.0;
1077  finalCompPerRound[i] = 0.0;
1078  finalConflictDetectionPerRound[i] = 0.0;
1079  finalSentPerRound[i] = 0;
1080  finalRecvPerRound[i] = 0;
1081  }
1082  Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,1, &localBoundaryVertices,&totalBoundarySize);
1083  Teuchos::reduceAll<int,int>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1084  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound,finalTotalPerRound);
1085  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1086  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1087  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,serialRecoloringPerRound,finalSerialRecoloringPerRound);
1088  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1089  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1090  Teuchos::reduceAll<int,double>(*comm,
1091  Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1092  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound, finalRecvPerRound);
1093  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);
1094 
1095  printf("Rank %d: boundary size: %d\n",comm->getRank(),localBoundaryVertices);
1096  if(comm->getRank()==0) printf("Total boundary size: %d\n",totalBoundarySize);
1097  for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
1098  printf("Rank %d: recolor %d vertices in round %d\n",comm->getRank(),vertsPerRound[i],i);
1099  if(comm->getRank()==0) printf("recolored %d vertices in round %d\n",totalVertsPerRound[i],i);
1100  if(comm->getRank()==0) printf("total time in round %d: %f\n",i,finalTotalPerRound[i]);
1101  if(comm->getRank()==0) printf("recoloring time in round %d: %f\n",i,maxRecoloringPerRound[i]);
1102  if(comm->getRank()==0) printf("serial recoloring time in round %d: %f\n",i,finalSerialRecoloringPerRound[i]);
1103  if(comm->getRank()==0) printf("min recoloring time in round %d: %f\n",i,minRecoloringPerRound[i]);
1104  if(comm->getRank()==0) printf("conflict detection time in round %d: %f\n",i,finalConflictDetectionPerRound[i]);
1105  if(comm->getRank()==0) printf("comm time in round %d: %f\n",i,finalCommPerRound[i]);
1106  if(comm->getRank()==0) printf("total sent in round %d: %lld\n",i,finalSentPerRound[i]);
1107  if(comm->getRank()==0) printf("total recv in round %d: %lld\n",i,finalRecvPerRound[i]);
1108  if(comm->getRank()==0) printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
1109  }
1110  } else if(timing){
1111  double global_total_time = 0.0;
1112  double global_recoloring_time=0.0;
1113  double global_min_recoloring_time=0.0;
1114  double global_conflict_detection=0.0;
1115  double global_comm_time=0.0;
1116  double global_comp_time=0.0;
1117  double global_interior_time = 0.0;
1118  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1119  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1120  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1121  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1122  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1123  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1124  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1125  comm->barrier();
1126  fflush(stdout);
1127  if(comm->getRank()==0){
1128  printf("Total Time: %f\n",global_total_time);
1129  printf("Interior Time: %f\n",global_interior_time);
1130  printf("Recoloring Time: %f\n",global_recoloring_time);
1131  printf("Min Recoloring Time: %f\n",global_min_recoloring_time);
1132  printf("Conflict Detection Time: %f\n",global_conflict_detection);
1133  printf("Comm Time: %f\n",global_comm_time);
1134  printf("Comp Time: %f\n",global_comp_time);
1135  }
1136  }
1137  if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
1138  }
1139 };
1140 
1141 template <typename Adapter>
1142 void AlgDistance1<Adapter>::buildModel(modelFlag_t &flags){
1143  flags.set(REMOVE_SELF_EDGES);
1144 
1145  this->env->debug(DETAILED_STATUS, " building graph model");
1146  if(verbose) std::cout<<comm->getRank()<<": starting to construct graph model\n";
1147  this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
1148  this->comm, flags));
1149  if(verbose) std::cout<<comm->getRank()<<": done constructing graph model\n";
1150  this->env->debug(DETAILED_STATUS, " graph model built");
1151 }
1152 
1153 
1154 }//end namespace Zoltan2
1155 #endif
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
Tpetra::Map< lno_t, gno_t > map_t
void hybridGMB(const size_t nVtx, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int >> &procs_to_send)
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
typename Kokkos::View< device_type >::HostMirror::memory_space host_mem
typename Adapter::base_adapter_t base_adapter_t
typename Adapter::lno_t lno_t
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
AlgDistance1(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
Tpetra::Map<>::memory_space memory_space
Tpetra::Map<>::device_type device_type
typename Kokkos::View< device_type >::HostMirror::execution_space host_exec
typename Adapter::offset_t offset_t
Tpetra::Map<>::execution_space execution_space
void detectConflicts(const size_t n_local, const size_t n_ghosts, Kokkos::View< offset_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> dist_offsets, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> dist_adjs, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace >> femv_colors, Kokkos::View< lno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> verts_to_send_view, Kokkos::View< size_t *, Kokkos::Device< ExecutionSpace, MemorySpace >, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> recoloringSize, Kokkos::View< int *, Kokkos::Device< ExecutionSpace, MemorySpace >> rand, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> gid, Kokkos::View< gno_t *, Kokkos::Device< ExecutionSpace, MemorySpace >> ghost_degrees, bool recolor_degrees)
Algorithm defines the base class for all algorithms.
The class containing coloring solution.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ DETAILED_STATUS
sub-steps, each method's entry and exit
@ REMOVE_SELF_EDGES
algorithm requires no self edges