Zoltan2
Zoltan2_AlgHybrid2GL.hpp
Hybrid distance-1/distance-2 graph-coloring support built on two layers of ghost vertices.
#ifndef _ZOLTAN2_2GHOSTLAYER_HPP_
#define _ZOLTAN2_2GHOSTLAYER_HPP_

#include <vector>
#include <unordered_map>
#include <iostream>
#include <queue>
#include <chrono>
#include <climits>
#ifdef _WIN32
#include <time.h>
#else
#include <sys/time.h>
#endif

#include "Zoltan2_Algorithm.hpp"
#include "Zoltan2_GraphModel.hpp"
#include "Zoltan2_ColoringSolution.hpp"
#include "Zoltan2_Util.hpp"
#include "Zoltan2_TPLTraits.hpp"
#include "Zoltan2_AlltoAll.hpp"


#include "Tpetra_Core.hpp"
#include "Teuchos_RCP.hpp"
#include "Tpetra_Import.hpp"
#include "Tpetra_FEMultiVector.hpp"

#include "KokkosKernels_Handle.hpp"
#include "KokkosKernels_IOUtils.hpp"
#include "KokkosGraph_Distance1Color.hpp"
#include "KokkosGraph_Distance1ColorHandle.hpp"
// This file defines the AlgTwoGhostLayer base class
// for methods that use two layers of ghosts.
36 
37 
38 namespace Zoltan2{
39 
40 template <typename Adapter>
41 class AlgTwoGhostLayer : public Algorithm<Adapter> {
42 
43  public:
44 
45  using lno_t = typename Adapter::lno_t;
46  using gno_t = typename Adapter::gno_t;
47  using offset_t = typename Adapter::offset_t;
48  using scalar_t = typename Adapter::scalar_t;
50  using map_t = Tpetra::Map<lno_t,gno_t>;
51  using femv_scalar_t = int;
52  using femv_t = Tpetra::FEMultiVector<femv_scalar_t, lno_t, gno_t>;
53  using device_type = Tpetra::Map<>::device_type;
54  using execution_space = Tpetra::Map<>::execution_space;
55  using memory_space = Tpetra::Map<>::memory_space;
56  using host_exec = typename Kokkos::View<device_type>::HostMirror::execution_space;
57  using host_mem = typename Kokkos::View<device_type>::HostMirror::memory_space;
58 
59  double timer(){
60  struct timeval tp;
61  gettimeofday(&tp, NULL);
62  return ((double) (tp.tv_sec) + 1e-6 * tp.tv_usec);
63  }
64  private:
65 
66  void buildModel(modelFlag_t &flags);
67 
68  //Entry point for parallel local coloring
69  // nVtx is the number of vertices owned by the current process
70  //
71  // adjs_view is the adjacencies, indexed by offset_view
72  //
73  // offset_view is the CSR row map, used to index the adjs_view
74  //
75  // femv is the FEMultiVector that holds the colors for the vertices
76  // the colors will change in this function.
77  //
78  // vertex_list is a list of vertices to recolor
79  //
80  // vertex_list_size is the size of the list of vertices to recolor
81  // vertex_list_size = 0 means recolor all uncolored vertices
82  //
83  // recolor decides which KokkosKernels algorithm to use.
84  //
  // Pure-virtual hook: device-parallel coloring of the local subgraph.
  // Derived algorithms supply the concrete (KokkosKernels-based) coloring;
  // the argument semantics are described in the comment block above.
  virtual void colorInterior(const size_t nVtx,
                             Kokkos::View<lno_t*, device_type > adjs_view,
                             Kokkos::View<offset_t*,device_type > offset_view,
                             Teuchos::RCP<femv_t> femv,
                             Kokkos::View<lno_t*, device_type> vertex_list,
                             size_t vertex_list_size = 0,
                             bool recolor=false) = 0;
92 
  //Entry point for serial local coloring.
  // Host-side counterpart of colorInterior: identical contract, but all
  // views are HostMirrors and the work runs serially on the host.
  virtual void colorInterior_serial(const size_t nVtx,
                                    typename Kokkos::View<lno_t*, device_type >::HostMirror adjs_view,
                                    typename Kokkos::View<offset_t*,device_type >::HostMirror offset_view,
                                    Teuchos::RCP<femv_t> femv,
                                    typename Kokkos::View<lno_t*, device_type>::HostMirror vertex_list,
                                    size_t vertex_list_size = 0,
                                    bool recolor=false) = 0;
101  public:
102  //Entry point for parallel conflict detection
103  //INPUT ARGS
104  // n_local is the number of vertices owned by the current process
105  //
106  // dist_offsets_dev is the device view that holds the CSR offsets
107  // It holds offsets for all vertices, owned and ghosts.
108  // It is organized with owned first, and then ghost layers:
109  // -----------------------------------------------
110  // | owned | first | second |
111  // | vtx | ghost | ghost |
112  // | offsets | layer | layer |
113  // -----------------------------------------------
114  // The allocated length is equal to the sum of owned
115  // and ghost vertices + 1. Any vertex with LID >= n_local
116  // is a ghost.
117  //
118  // dist_adjs_dev is the device view that holds CSR adjacencies
119  //
120  // boundary_verts_view holds the local IDs of vertices in the boundary.
121  // By definition, boundary vertices are owned vertices
122  // that have a ghost in their neighborhood.
123  // (distance-1 coloring = 1-hop neighborhood)
124  // (distance-2 coloring = 2-hop neighborhood)
125  // Note: for D1-2GL we do not use this argument.
126  // It is possible to detect all D1 conflicts by only
127  // checking the ghost vertices, without constructing
128  // the boundary.
129  //
130  // rand is a device view that holds random numbers generated from GIDs,
131  // they are consistently generated across processes
132  //
133  // gid is a device view that holds the GIDs of each vertex on this process.
134  // It is indexable by local ID. It stores both owned and ghost
135  // vertices, and LIDs are ordered so that the structure looks like:
136  // -----------------------------------------
137  // | | first | second | owned LIDs are smallest
138  // | owned | ghost | ghost | ghost LIDs increase based
139  // | vtx | layer | layer | on layer (1 < 2)
140  // -----------------------------------------
141  // The allocated size is dist_offsets_dev.extent(0)-1.
142  //
143  //
144  // ghost_degrees is a device view that holds the degrees of ghost vertices only.
145  // A ghost with local ID n_local will be the first entry in this view.
146  // So, ghost_degrees[i] is the degree of the vtx with
147  // GID = gid[n_local+i].
148  //
149  // recolor_degrees is a boolean that determines whether or not we factor in vertex
150  // degrees on recoloring
151  //
152  //OUTPUT ARGS
153  // femv_colors is the device color view.
154  // After this function call, conflicts between vertices will
155  // be resolved via setting a vertex's color to 0. The vertex
156  // to be uncolored is determined by rules that are consistent
157  // across multiple processes.
158  //
159  // verts_to_recolor_view is a device view that holds the list
160  // of vertices to recolor. Any vertex
161  // uncolored by this function will appear in this
162  // view after the function returns.
163  //
164  // verts_to_recolor_size_atomic is an atomic device view that holds the
165  // size of verts_to_recolor_view.
166  // Effectively it represents how many
167  // vertices need to be recolored after
168  // conflict detection.
169  // It differs in size from the allocated size of
170  // verts_to_recolor_view.
171  // Atomicity is required for building
172  // verts_to_recolor_view in parallel, which
173  // is the reason this is a view instead of
174  // just an integer type.
175  //
176  // verts_to_send_view is a device view that holds the list of vertices
177  // that will need to be sent to remotes after recoloring
178  // Note: Only locally owned vertices should ever be in
179  // this view. A process cannot color ghosts correctly.
180  //
181  // verts_to_send_size_atomic is an atomic device view that holds the size of
182  // verts_to_send_view. It differs in size from the
183  // allocated size of verts_to_send_view.
184  // Atomicity is required for building
185  // verts_to_send_view in parallel,
186  // which is the reason this is a view instead of
187  // just an integer type.
188  //
189  // recoloringSize is a device view that stores the total number of
190  // vertices that were uncolored by the conflict detection.
191  //
  // Pure-virtual hook: device-parallel conflict detection.
  // Implementations uncolor conflicting vertices (set color 0) using
  // process-consistent tie-breaking, and populate the recolor/send lists.
  // Full argument semantics are documented in the comment block above.
  virtual void detectConflicts(const size_t n_local,
                               Kokkos::View<offset_t*, device_type > dist_offsets_dev,
                               Kokkos::View<lno_t*, device_type > dist_adjs_dev,
                               Kokkos::View<int*,device_type > femv_colors,
                               Kokkos::View<lno_t*, device_type > boundary_verts_view,
                               Kokkos::View<lno_t*,
                                            device_type > verts_to_recolor_view,
                               Kokkos::View<int*,
                                            device_type,
                                            Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic,
                               Kokkos::View<lno_t*,
                                            device_type > verts_to_send_view,
                               Kokkos::View<size_t*,
                                            device_type,
                                            Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic,
                               Kokkos::View<size_t*, device_type> recoloringSize,
                               Kokkos::View<int*,
                                            device_type> rand,
                               Kokkos::View<gno_t*,
                                            device_type> gid,
                               Kokkos::View<gno_t*,
                                            device_type> ghost_degrees,
                               bool recolor_degrees) = 0;
215 
  //Entry point for serial conflict detection.
  // Host-side counterpart of detectConflicts: same contract, but all views
  // are HostMirrors and the size arguments are plain (non-atomic) views
  // since the serial path has no concurrent writers.
  virtual void detectConflicts_serial(const size_t n_local,
                                      typename Kokkos::View<offset_t*, device_type >::HostMirror dist_offsets_host,
                                      typename Kokkos::View<lno_t*, device_type >::HostMirror dist_adjs_host,
                                      typename Kokkos::View<int*,device_type >::HostMirror femv_colors,
                                      typename Kokkos::View<lno_t*, device_type >::HostMirror boundary_verts_view,
                                      typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_recolor_view,
                                      typename Kokkos::View<int*,device_type>::HostMirror verts_to_recolor_size_atomic,
                                      typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send_view,
                                      typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size_atomic,
                                      typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize,
                                      typename Kokkos::View<int*, device_type>::HostMirror rand,
                                      typename Kokkos::View<gno_t*,device_type>::HostMirror gid,
                                      typename Kokkos::View<gno_t*,device_type>::HostMirror ghost_degrees,
                                      bool recolor_degrees) = 0;
231  //Entry point for the construction of the boundary
232  //INPUT ARGS
233  // n_local is the number of vertices owned by the current process
234  //
235  // dist_offsets_dev is the device view that holds the CSR offsets
236  //
237  // dist_adjs_dev is the device view that holds CSR adjacencies
238  //
239  // dist_offsets_host is the hostmirror that holds the CSR offsets
240  //
241  // dist_adjs_host is the hostmirror that holds the CSR adjacencies
242  //
243  //OUTPUT ARGS
244  // boundary_verts is an unallocated device view that will hold the
245  // list of boundary vertices.
246  //
247  // verts_to_send_view will hold the list of vertices to send
248  //
249  // verts_to_send_size_atomic will hold the number of vertices to
250  // send currently held in verts_to_send_view.
251  // verts_to_send_size_atomic differs
252  // from the allocated size of verts_to_send_view
253  // Atomicity is required for building
254  // verts_to_send_view in parallel, which is
255  // the reason this is a view instead of an
256  // integer type.
257  //
  // Pure-virtual hook: builds the boundary-vertex list (owned vertices with
  // a ghost in their coloring neighborhood) and seeds the verts-to-send list.
  // boundary_verts is passed by reference because the implementation
  // allocates it. Argument details are in the comment block above.
  virtual void constructBoundary(const size_t n_local,
                                 Kokkos::View<offset_t*, device_type> dist_offsets_dev,
                                 Kokkos::View<lno_t*, device_type> dist_adjs_dev,
                                 typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host,
                                 typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host,
                                 Kokkos::View<lno_t*, device_type>& boundary_verts,
                                 Kokkos::View<lno_t*,
                                              device_type > verts_to_send_view,
                                 Kokkos::View<size_t*,
                                              device_type,
                                              Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic) = 0;
269 
270  protected:
271  RCP<const base_adapter_t> adapter;
272  RCP<GraphModel<base_adapter_t> > model;
273  RCP<Teuchos::ParameterList> pl;
274  RCP<Environment> env;
275  RCP<const Teuchos::Comm<int> > comm;
276  bool verbose;
277  bool timing;
278 
279  private:
280  //This function constructs a CSR with complete adjacency information for
281  //first-layer ghost vertices. This CSR can contain edges from:
282  // first-layer ghosts to owned;
283  // first-layer ghosts to first-layer ghosts;
284  // first-layer ghosts to second-layer ghosts;
285  //1) Each process sends a list of ghost GIDs back to their owning process
286  // to request full adjacency information for that vertex.
287  //
288  //2) Initially each process sends back degree information to the requesting
289  // process.
290  //
291  //3) Then, each process can construct send buffers and receive schedules for
292  // the adjacency information for each requested GID it received,
293  // in addition to building the 2GL offsets and relabeling ghost LIDs
294  // to make the final CSR construction easier.
295  //
296  // 3a) We can construct the 2GL offsets here because each process has
297  // already received the degree information for each first-layer
298  // ghost vertex.
299  //
300  // 3b) The original ghost LIDs may not correspond to the order in which
301  // we will receive the adjacency information. Instead of reordering
302  // the received adjacencies, we relabel the GIDs of first-layer
303  // ghosts with new LIDs that correspond to the order in which we
304  // receive the adjacency information. Only first-layer ghost LIDs
305  // are changed.
306  //
307  //4) Due to limitations on the size of an MPI message, we split sending and
308  // receiving into rounds to accommodate arbitrarily large graphs.
309  //
310  //5) As send rounds progress, we construct the 2GL adjacency structure.
311  // Because we relabel ghost LIDs, we do not need to reorder the
312  // data after we receive it.
313  //
314  //
315  //
316  //OUTPUT ARGS
317  // ownedPlusGhosts Initially holds the list of GIDs for owned and ghost
318  // vertices. The only hard constraint on the ordering is
319  // that owned vertices come before ghost vertices.
320  // This function can re-assign the LIDs of ghost vertices
321  // in order to make the final construction of the 2GL
322  // CSR easier. After this function call, some ghost
323  // LIDs may be changed, but they will still be greater
324  // than owned LIDs. No owned LIDs will be changed.
325  //
326  // adjs_2GL holds the second ghost layer CSR adjacencies. The 2GL CSR
327  // contains complete adjacency information for the first-layer
328  // ghosts. These adjacencies can hold both owned vertices and
329  // second-layer ghosts, as well as first-layer ghosts.
330  //
331  // offsets_2GL holds CSR offsets for vertices in the first ghost layer only.
332  // That is, a vertex with GID = gid[n_local] will be the first
333  // vertex with information in this offset structure.
334  //
335  //
336  //INPUT ARGS
337  // owners holds the owning proc for a given vertex, indexed by local ID.
338  //
339  // adjs is the CSR adjacencies of the local graph with a single ghost layer.
340  //
341  // offsets is the CSR offsets of the local graph with a single ghost layer.
342  //
343  // mapOwned translates from Owned GID to LID. We only need this translation
344  // for owned vertices.
345  //
346  //TODO: This function uses many vectors of size comm->getSize();
347  // consider changes to increase memory scalability.
  // Builds the second ghost layer: a CSR (adjs_2GL / offsets_2GL) holding the
  // complete adjacency of every first-layer ghost. See the large comment
  // block above for the full protocol and argument contract. May relabel the
  // LIDs of first-layer ghosts inside ownedPlusGhosts (owned LIDs unchanged).
  void constructSecondGhostLayer(std::vector<gno_t>& ownedPlusGhosts,
                                 const std::vector<int>& owners,
                                 ArrayView<const gno_t> adjs,
                                 ArrayView<const offset_t> offsets,
                                 RCP<const map_t> mapOwned,
                                 std::vector< gno_t>& adjs_2GL,
                                 std::vector< offset_t>& offsets_2GL) {

    //first, we send ghost GIDs back to owners to receive the
    //degrees of each first-layer ghost.
    std::vector<int> sendcounts(comm->getSize(),0);
    std::vector<size_t> sdispls(comm->getSize()+1,0);
    //loop through owners, count how many vertices we'll send to each processor
    //We send each ghost GID back to its owning process.
    //(owners[i] == -1 marks a vertex with no known owner; those are skipped)
    if(verbose) std::cout<<comm->getRank()<<": building sendcounts\n";
    for(size_t i = 0; i < owners.size(); i++){
      if(owners[i] != comm->getRank()&& owners[i] !=-1) sendcounts[owners[i]]++;
    }
    //construct sdispls (for building sendbuf), and sum the total sendcount
    if(verbose) std::cout<<comm->getRank()<<": building sdispls\n";
    size_t sendcount = 0;
    for(int i = 1; i < comm->getSize()+1; i++){
      sdispls[i] = sdispls[i-1] + sendcounts[i-1];
      sendcount += sendcounts[i-1];
    }

    // idx[p] is the next free slot in sendbuf for process p; starts at the
    // displacement for p and is bumped as GIDs are placed.
    if(verbose) std::cout<<comm->getRank()<<": building idx\n";
    std::vector<gno_t> idx(comm->getSize(),0);
    for(int i = 0; i < comm->getSize(); i++){
      idx[i]=sdispls[i];
    }
    //construct sendbuf to send GIDs to owning processes
    //(ghost LIDs start at offsets.size()-1, i.e. just past the owned range)
    if(verbose) std::cout<<comm->getRank()<<": building sendbuf\n";

    std::vector<gno_t> sendbuf(sendcount,0);
    for(size_t i = offsets.size()-1; i < owners.size(); i++){
      if(owners[i] != comm->getRank() && owners[i] != -1){
        sendbuf[idx[owners[i]]++] = ownedPlusGhosts[i];
      }
    }

    //communicate GIDs to owners
    if(verbose) std::cout<<comm->getRank()<<": requesting GIDs from owners\n";
    Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
    Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
    Teuchos::ArrayRCP<gno_t> recvbuf;
    std::vector<int> recvcounts(comm->getSize(),0);
    Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
    Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);

    if(verbose) std::cout<<comm->getRank()<<": done communicating\n";
    //replace entries in recvGIDs with their degrees

    if(verbose) std::cout<<comm->getRank()<<": building rdispls\n";
    gno_t recvcounttotal = 0;
    std::vector<int> rdispls(comm->getSize()+1,0);
    for(size_t i = 1; i<recvcounts.size()+1; i++){
      rdispls[i] = rdispls[i-1] + recvcounts[i-1];
      recvcounttotal += recvcounts[i-1];
    }
    //send back the degrees to the requesting processes,
    //build the adjacency counts
    std::vector<offset_t> sendDegrees(recvcounttotal,0);
    // NOTE(review): adj_len is accumulated below but never read afterwards
    // in this function.
    gno_t adj_len = 0;
    std::vector<int> adjsendcounts(comm->getSize(),0);
    if(verbose) std::cout<<comm->getRank()<<": building adjacency counts\n";
    for(int i = 0; i < comm->getSize(); i++){
      adjsendcounts[i] = 0;
      for(int j = rdispls[i]; j < rdispls[i+1]; j++){
        // recvbuf[j] is a GID this rank owns; look up its local degree.
        lno_t lid = mapOwned->getLocalElement(recvbuf[j]);
        offset_t degree = offsets[lid+1] - offsets[lid];
        sendDegrees[j] = degree;
        adj_len +=degree;
        adjsendcounts[i] += degree;
      }
    }
    //communicate the degrees back to the requesting processes
    //(reply uses recvcounts as its sendcounts: one degree per requested GID)
    if(verbose) std::cout<<comm->getRank()<<": sending degrees back to requestors\n";
    Teuchos::ArrayView<offset_t> sendDegrees_view = Teuchos::arrayViewFromVector(sendDegrees);
    Teuchos::ArrayRCP<offset_t> recvDegrees;
    std::vector<int> recvDegreesCount(comm->getSize(),0);
    Teuchos::ArrayView<int> recvDegreesCount_view = Teuchos::arrayViewFromVector(recvDegreesCount);
    Zoltan2::AlltoAllv<offset_t>(*comm, *env, sendDegrees_view, recvcounts_view, recvDegrees, recvDegreesCount_view);

    //calculate number of rounds of AlltoAllv's that are necessary on this process
    //(a round's payload to any process must fit in an int-sized MPI count,
    // hence the INT_MAX byte limit; integer division is intentional)

    if(verbose) std::cout<<comm->getRank()<<": determining number of rounds necessary\n";
    int rounds = 1;
    for(int i = 0; i < comm->getSize(); i++){
      if(adjsendcounts[i]*sizeof(gno_t)/ INT_MAX > (size_t)rounds){
        rounds = (adjsendcounts[i]*sizeof(gno_t)/INT_MAX)+1;
      }
    }

    //see what the max number of rounds will be globally
    int max_rounds = 0;
    Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_MAX, 1, &rounds, &max_rounds);

    if(verbose) std::cout<<comm->getRank()<<": building per_proc sums\n";
    //compute send and receive schedules to and from each process
    //(sized max_rounds+1 so the greedy fill below can spill into one
    // extra round without bounds checks)
    std::vector<std::vector<uint64_t>> per_proc_round_adj_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));
    std::vector<std::vector<uint64_t>> per_proc_round_vtx_sums(max_rounds+1,std::vector<uint64_t>(comm->getSize(),0));

    if(verbose) std::cout<<comm->getRank()<<": filling per_proc sums\n";
    //fill out the send schedules
    //greedily pack each requested vertex's adjacency into the current round
    //until the next one would exceed INT_MAX bytes, then advance the round.
    for(int proc_to_send = 0; proc_to_send < comm->getSize(); proc_to_send++){
      int curr_round = 0;
      for(size_t j = sdispls[proc_to_send]; j < sdispls[proc_to_send+1]; j++){
        if((per_proc_round_adj_sums[curr_round][proc_to_send] + recvDegrees[j])*sizeof(gno_t) > INT_MAX){
          curr_round++;
        }
        per_proc_round_adj_sums[curr_round][proc_to_send] += recvDegrees[j];
        per_proc_round_vtx_sums[curr_round][proc_to_send]++;
      }
    }

    if(verbose) std::cout<<comm->getRank()<<": building recv GID schedule\n";

    //A 3D vector to hold the GIDs so we can see how this process will receive
    //the vertices in each round, from each process. This way we can reorder things
    //locally so that the data we receive will automatically construct the CSR correctly
    //without any other computation. We do the reordering before we start receiving.
    std::vector<std::vector<std::vector<gno_t>>> recv_GID_per_proc_per_round(
        max_rounds+1,std::vector<std::vector<gno_t>>(
          comm->getSize(),std::vector<gno_t>(0)));
    for(int i = 0; i < max_rounds; i++){
      for(int j = 0; j < comm->getSize(); j++){
        recv_GID_per_proc_per_round[i][j] = std::vector<gno_t>(sendcounts[j],0);
      }
    }

    if(verbose) std::cout<<comm->getRank()<<": filling out recv GID schedule\n";
    //stores sendbuf indices (not GIDs directly); the reorder loop below
    //dereferences them through sendbuf/recvDegrees.
    for(int i = 0; i < comm->getSize(); i++){
      int curr_round = 0;
      size_t curr_idx = 0;
      // NOTE(review): the '>' comparison lets curr_idx reach
      // per_proc_round_vtx_sums[curr_round][i]+1 before advancing the round;
      // '>=' would match the send-side round split exactly. Verify against
      // the greedy fill above before changing.
      for(size_t j = sdispls[i]; j < sdispls[i+1]; j++){
        if(curr_idx > per_proc_round_vtx_sums[curr_round][i]){
          curr_round++;
          curr_idx = 0;
        }
        recv_GID_per_proc_per_round[curr_round][i][curr_idx++] = j;
      }
    }

    if(verbose) std::cout<<comm->getRank()<<": reordering gids and degrees in the order they'll be received\n";
    //reorder the GIDs and degrees locally:
    // - The way we previously stored GIDs and degrees had no relation
    //   to the order that we'll be receiving the adjacency data.
    //   For the CSR to be complete (and usable), we reorder the LIDs of
    //   ghosts so that they are in the order we will receive the adjacency
    //   data.
    //
    // - For graphs that are not large enough to trigger sending in multiple
    //   rounds of communication, the LIDs of the ghosts will be ordered
    //   by owning process. If multiple rounds are used, the LIDs of
    //   ghosts will be ordered by rounds in addition to owning process.
    //
    // - final_gid_vec and final_degree_vec hold the reorganized gids and
    //   degrees, the final re-mapping happens further down
    std::vector<gno_t> final_gid_vec(sendcount, 0);
    std::vector<offset_t> final_degree_vec(sendcount,0);
    gno_t reorder_idx = 0;
    for(int i = 0; i < max_rounds; i++){
      for(int j = 0; j < comm->getSize(); j++){
        for(size_t k = 0; k < per_proc_round_vtx_sums[i][j]; k++){
          final_gid_vec[reorder_idx] = sendbuf[recv_GID_per_proc_per_round[i][j][k]];
          final_degree_vec[reorder_idx++] = recvDegrees[recv_GID_per_proc_per_round[i][j][k]];
        }
      }
    }

    //check to see if the reorganization has happened correctly
    if(verbose){
      //do a quick check to see if we ended up reorganizing anything
      bool reorganized = false;
      for(size_t i = 0; i < sendcount; i++){
        if(final_gid_vec[i] != sendbuf[i]) reorganized = true;
      }

      //if we have more than a single round of communication, we need to reorganize.
      //this alerts of unnecessary reorganization, and a failure to perform necessary
      //reorganization.
      if(!reorganized && (max_rounds > 1)) std::cout<<comm->getRank()<<": did not reorgainze GIDs, but probably should have\n";
      if(reorganized && (max_rounds == 1)) std::cout<<comm->getRank()<<": reorganized GIDs, but probably should not have\n";
    }
    //remap the GIDs so we receive the adjacencies in the same order as the current processes LIDs
    //  originally, the GID->LID mapping has no relevance to how we'll be receiving adj data from
    //  remote processes. Here we make it so that the GID->LID mapping does correspond to the
    //  order we have received degree info and will receive adjacencies. ( LID n_local
    //  corresponds to the first GID whose adjacency info we will receive, LID n_local+1 the
    //  second, etc.)
    //  That way, we don't need to reorder the adjacencies after we receive them.
    //
    //  This next loop is the final mapping step.

    for (size_t i = 0; i < sendcount; i++){
      ownedPlusGhosts[i+offsets.size()-1] = final_gid_vec[i];
    }

    //status report
    if(verbose) {
      std::cout<<comm->getRank()<<": done remapping\n";
      std::cout<<comm->getRank()<<": building ghost offsets\n";
    }
    //start building the second ghost layer's offsets
    //(prefix sum over the reordered degrees)
    std::vector<offset_t> ghost_offsets(sendcount+1,0);
    for(size_t i = 1; i < sendcount+1; i++){
      ghost_offsets[i] = ghost_offsets[i-1] + final_degree_vec[i-1];
    }


    if(verbose) std::cout<<comm->getRank()<<": going through the sending rounds\n";
    //set up counters to keep track of where we are in the sending order
    //(curr_idx_per_proc persists across rounds: each round resumes where
    // the previous round's INT_MAX cutoff stopped)
    std::vector<uint64_t> curr_idx_per_proc(comm->getSize(),0);
    for(int i = 0; i < comm->getSize(); i++) curr_idx_per_proc[i] = rdispls[i];
    for(int round = 0; round < max_rounds; round++){
      //send buffers
      std::vector<gno_t> send_adj;
      std::vector<int> send_adj_counts(comm->getSize(),0);
      if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", constructing send_adj\n";
      //construct the adjacencies to send for this round
      for(int curr_proc = 0; curr_proc < comm->getSize(); curr_proc++){
        uint64_t curr_adj_sum = 0;
        //keep going through adjacencies to send to this process until we're done
        while( curr_idx_per_proc[curr_proc] < (size_t)rdispls[curr_proc+1]){
          lno_t lid = mapOwned->getLocalElement(recvbuf[curr_idx_per_proc[curr_proc]++]);

          //if the next adjacency would push us over the MPI message size max,
          //stop for this round
          if((curr_adj_sum + (offsets[lid+1]-offsets[lid]))*sizeof(gno_t) >= INT_MAX){
            break;
          }

          //add the adjacencies to the send buffer
          curr_adj_sum += (offsets[lid+1] - offsets[lid]);
          for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
            send_adj.push_back(adjs[j]);
          }
        }
        //update the send counts for this round
        send_adj_counts[curr_proc] = curr_adj_sum;
      }
      if(verbose) std::cout<<comm->getRank()<<": round "<<round<<", sending...\n";
      //do the sending...
      Teuchos::ArrayView<gno_t> send_adjs_view = Teuchos::arrayViewFromVector(send_adj);
      Teuchos::ArrayView<int> adjsendcounts_view = Teuchos::arrayViewFromVector(send_adj_counts);
      Teuchos::ArrayRCP<gno_t> ghost_adjs;
      // NOTE(review): recv_adjs_count is computed but not used after this
      // loop in the visible code.
      uint64_t recv_adjs_count = 0;
      for(int i = 0; i < comm->getSize(); i++){
        recv_adjs_count += per_proc_round_adj_sums[round][i];
      }
      std::vector<int> adjrecvcounts(comm->getSize(),0);
      Teuchos::ArrayView<int> adjsrecvcounts_view = Teuchos::arrayViewFromVector(adjrecvcounts);
      Zoltan2::AlltoAllv<gno_t>(*comm, *env, send_adjs_view, adjsendcounts_view, ghost_adjs, adjsrecvcounts_view);
      //Because of the previous reordering, these adjacencies are
      //in the correct order as they arrive on the process.
      for(offset_t i = 0; i< (offset_t)ghost_adjs.size(); i++){
        adjs_2GL.push_back(ghost_adjs[i]);
      }
    }
    if(verbose) std::cout<<comm->getRank()<<": constructing offsets\n";
    //put the offsets we computed into the output argument.
    for(size_t i = 0; i < sendcount+1; i++){
      offsets_2GL.push_back(ghost_offsets[i]);
    }
    if(verbose) std::cout<<comm->getRank()<<": done building 2nd ghost layer\n";
  }
615 
616  //Communicates owned vertex colors to remote ghost copies.
617  //
618  //returns: the amount of time the communication call took.
619  //
620  //OUTPUT ARGS:
621  // colors: the owned vertices' colors are not changed,
622  // ghost vertex colors are updated from their owners.
623  //
624  // total_sent: reports the total size of the send buffer
625  //
626  // total_recv: reports the total size of the recv buffer
627  //
628  //INPUT ARGS:
629  //
630  // mapOwnedPlusGhosts: maps global IDs to local IDs and vice-versa.
631  //
632  // nVtx: the number of owned vertices on this process
633  //
634  // verts_to_send: hostmirror of the list of vertices to send.
635  // This list will only contain local vertices
636  // that are ghosted on a remote process.
637  //
638  // verts_to_send_size: hostmirror of the size of verts_to_send
639  //
640  // procs_to_send: map that takes a local ID and gives a vector of
641  // process IDs which have a ghost copy of that vertex.
642  //
  // Sends each owned vertex's (GID, color) pair to every process holding a
  // ghost copy of it, then applies the pairs received from other owners to
  // the local ghost entries of femv. Returns the time spent inside the
  // AlltoAllv call (timed with timer()). See the comment block above for
  // full argument documentation.
  // NOTE(review): the nVtx parameter is not used in this body.
  double doOwnedToGhosts(RCP<const map_t> mapOwnedPlusGhosts,
                         size_t nVtx,
                         typename Kokkos::View<lno_t*,device_type>::HostMirror verts_to_send,
                         typename Kokkos::View<size_t*,device_type>::HostMirror verts_to_send_size,
                         Teuchos::RCP<femv_t> femv,
                         const std::unordered_map<lno_t, std::vector<int>>& procs_to_send,
                         gno_t& total_sent, gno_t& total_recvd){

    // Host view of the color vector; ghost entries are written below.
    auto femvColors = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
    auto colors = subview(femvColors, Kokkos::ALL, 0);
    //create vectors to hold send information
    int nprocs = comm->getSize();
    std::vector<int> sendcnts(comm->getSize(), 0);
    std::vector<gno_t> sdispls(comm->getSize()+1, 0);

    //calculate how much data we're sending to each process
    //(+2 per destination: one slot for the GID, one for the color)
    for(size_t i = 0; i < verts_to_send_size(0); i++){
      for(size_t j = 0; j < procs_to_send.at(verts_to_send(i)).size(); j++){
        sendcnts[procs_to_send.at(verts_to_send(i))[j]] += 2;
      }
    }
    //calculate sendsize and sdispls
    sdispls[0] = 0;
    gno_t sendsize = 0;
    // sentcount[p] tracks how many entries have been placed in p's segment.
    std::vector<int> sentcount(nprocs, 0);

    for(int i = 1; i < comm->getSize()+1; i++){
      sdispls[i] = sdispls[i-1] + sendcnts[i-1];
      sendsize += sendcnts[i-1];
    }
    total_sent = sendsize;
    std::vector<gno_t> sendbuf(sendsize,0);
    //construct sendbuf, send each owned vertex's GID
    //and its color to the processes that have a
    //ghost copy of that vertex. If a vertex is not ghosted,
    //it does not get sent anywhere.
    for(size_t i = 0; i < verts_to_send_size(0); i++){
      // NOTE(review): this copies the destination vector each iteration;
      // a const reference would avoid the copy.
      std::vector<int> procs = procs_to_send.at(verts_to_send(i));
      for(size_t j = 0; j < procs.size(); j++){
        size_t idx = sdispls[procs[j]] + sentcount[procs[j]];
        sentcount[procs[j]] += 2;
        sendbuf[idx++] = mapOwnedPlusGhosts->getGlobalElement(verts_to_send(i));
        // Colors are ints stored in the gno_t-typed buffer alongside GIDs.
        sendbuf[idx] = colors(verts_to_send(i));
      }
    }

    Teuchos::ArrayView<int> sendcnts_view = Teuchos::arrayViewFromVector(sendcnts);
    Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
    Teuchos::ArrayRCP<gno_t> recvbuf;
    std::vector<int> recvcnts(comm->getSize(), 0);
    Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);

    //if we're reporting times, remove the computation imbalance from the comm timer
    if(timing) comm->barrier();
    double comm_total = 0.0;
    double comm_temp = timer();

    Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcnts_view, recvbuf, recvcnts_view);
    comm_total += timer() - comm_temp;

    //compute total recvsize for updating local ghost colors
    gno_t recvsize = 0;
    for(int i = 0; i < recvcnts_view.size(); i++){
      recvsize += recvcnts_view[i];
    }
    total_recvd = recvsize;
    //update the local ghost copies with the color we just received.
    //(recvbuf alternates GID, color — mirror of the send layout)
    for(int i = 0; i < recvsize; i+=2){
      size_t lid = mapOwnedPlusGhosts->getLocalElement(recvbuf[i]);
      colors(lid) = recvbuf[i+1];
    }

    return comm_total;
  }
717 
718  public:
719  //constructor
721  const RCP<const base_adapter_t> &adapter_,
722  const RCP<Teuchos::ParameterList> &pl_,
723  const RCP<Environment> &env_,
724  const RCP<const Teuchos::Comm<int> > &comm_)
725  : adapter(adapter_), pl(pl_), env(env_), comm(comm_){
726  verbose = pl->get<bool>("verbose",false);
727  timing = pl->get<bool>("timing", false);
728  modelFlag_t flags;
729  flags.reset();
730  buildModel(flags);
731  }
  //Main entry point for graph coloring.
  //
  //Builds the local view of the distributed graph (owned vertices plus two
  //layers of ghosts), constructs the Tpetra map/importer/FEMultiVector used
  //to communicate colors between processes, and then calls twoGhostLayer()
  //to perform the distributed coloring.
  //
  // \param solution [out] on return, solution->getColorsRCP() holds a color
  //        for each locally owned vertex (indexed by local ID).
  void color( const RCP<ColoringSolution<Adapter> > &solution){
    //convert from global graph to local graph
    ArrayView<const gno_t> vtxIDs;
    ArrayView<StridedData<lno_t, scalar_t> > vwgts;
    size_t nVtx = model->getVertexList(vtxIDs, vwgts);
    // the weights are not used at this point.

    //get the adjacencies in a view that holds GIDs
    ArrayView<const gno_t> adjs;
    //get the CSR offsets
    ArrayView<const offset_t> offsets;
    ArrayView<StridedData<lno_t, scalar_t> > ewgts;
    model->getEdgeList(adjs, offsets, ewgts);

    //This map is used to map GID to LID initially
    std::unordered_map<gno_t,lno_t> globalToLocal;
    //this vector holds GIDs for owned and ghost vertices
    //The final order of the gids is:
    // -----------------------------------
    // |        | first   | second  |
    // | owned  | layer   | layer   |
    // |        | ghosts  | ghosts  |
    // -----------------------------------
    //
    // constructSecondGhostLayer reorders the first layer ghost
    // GIDs in the particular order that the communication requires.

    std::vector<gno_t> ownedPlusGhosts;

    //this vector holds the owning process for
    //owned vertices and the first ghost layer.
    //owners[i] gives the processor owning ownedPlusGhosts[i],
    //for owned vertices and first layer ghosts.
    //
    //This should only be used BEFORE the call to constructSecondGhostLayer
    std::vector<int> owners;

    //fill these structures using owned vertices;
    //owned vertex i gets LID i, and is owned by this rank.
    for(int i = 0; i < vtxIDs.size(); i++){
      globalToLocal[vtxIDs[i]] = i;
      ownedPlusGhosts.push_back(vtxIDs[i]);
      owners.push_back(comm->getRank());
    }

    //count the initial number of ghosts,
    //map them to a local ID.
    //The globalToLocal map has first layer ghosts
    //from here on.
    //Ghost LIDs are assigned consecutively after the owned LIDs.
    int nGhosts = 0;
    std::vector<lno_t> local_adjs;
    for(int i = 0; i < adjs.size(); i++){
      if(globalToLocal.count(adjs[i])==0){
        ownedPlusGhosts.push_back(adjs[i]);
        globalToLocal[adjs[i]] = vtxIDs.size()+nGhosts;
        nGhosts++;
      }
      local_adjs.push_back(globalToLocal[adjs[i]]);
    }

    //construct a Tpetra map for the FEMultiVector
    //(the "invalid" global count tells Tpetra to compute it from the
    //local element counts)
    Tpetra::global_size_t dummy = Teuchos::OrdinalTraits
                                  <Tpetra::global_size_t>::invalid();
    RCP<const map_t> mapOwned = rcp(new map_t(dummy, vtxIDs, 0, comm));

    //GID and owner lookup for ghost vertices;
    //owner entries start at -1 ("unknown") and are filled in below.
    std::vector<gno_t> ghosts;
    std::vector<int> ghostowners;
    for(size_t i = nVtx; i < nVtx+nGhosts; i++){
      ghosts.push_back(ownedPlusGhosts[i]);
      ghostowners.push_back(-1);
    }

    //get the owners of the ghost vertices
    ArrayView<int> owningProcs = Teuchos::arrayViewFromVector(ghostowners);
    ArrayView<const gno_t> gids = Teuchos::arrayViewFromVector(ghosts);
    mapOwned->getRemoteIndexList(gids, owningProcs);

    //add ghost owners to the total owner vector
    for(size_t i = 0; i < ghostowners.size(); i++){
      owners.push_back(ghostowners[i]);
    }

    //construct the second ghost layer.
    //NOTE: this may reorder the LIDs of the ghosts.
    //
    //these vectors hold the CSR representation of the
    // first ghost layer. This CSR contains:
    //   - edges from first layer ghosts to:
    //       - owned vertices
    //       - first layer ghosts
    //       - second layer ghosts
    //
    //   - first_layer_ghost_adjs uses GIDs that need
    //     to be translated to LIDs.
    //
    //   - first_layer_ghost_offsets are indexed by LID,
    //     first_layer_ghost_offsets[i] corresponds to
    //     the vertex with GID = ownedPlusGhosts[i+nVtx].
    //     first_layer_ghost_offsets.size() = #first layer ghosts + 1
    //
    std::vector< gno_t> first_layer_ghost_adjs;
    std::vector< offset_t> first_layer_ghost_offsets;
    constructSecondGhostLayer(ownedPlusGhosts,owners, adjs, offsets, mapOwned,
                              first_layer_ghost_adjs, first_layer_ghost_offsets);

    //we potentially reordered the local IDs of the ghost vertices, so we need
    //to re-insert the GIDs into the global to local ID mapping.
    globalToLocal.clear();
    for(size_t i = 0; i < ownedPlusGhosts.size(); i++){
      globalToLocal[ownedPlusGhosts[i]] = i;
    }

    //use updated globalToLocal map to translate
    //adjacency GIDs to LIDs
    for(int i = 0 ; i < adjs.size(); i++){
      local_adjs[i] = globalToLocal[adjs[i]];
    }

    //at this point, we have ownedPlusGhosts with 1layer ghosts' GIDs.
    //Need to map the second ghost layer GIDs to new local IDs,
    //and add them to the map, as well as translating 2layer ghost
    //adjacencies to use those new LIDs.
    size_t n2Ghosts = 0;

    //local_ghost_adjs is the LID translation of first_layer_ghost_adjs.
    //Second-layer ghosts get LIDs starting at nVtx + nGhosts.
    std::vector<lno_t> local_ghost_adjs;
    for(size_t i = 0; i< first_layer_ghost_adjs.size(); i++ ){
      if(globalToLocal.count(first_layer_ghost_adjs[i]) == 0){
        ownedPlusGhosts.push_back(first_layer_ghost_adjs[i]);
        globalToLocal[first_layer_ghost_adjs[i]] = vtxIDs.size() + nGhosts + n2Ghosts;
        n2Ghosts++;
      }
      local_ghost_adjs.push_back(globalToLocal[first_layer_ghost_adjs[i]]);
    }

    //construct data structures necessary for FEMultiVector:
    //a map that includes owned vertices and all ghost copies, and an
    //importer between the owned map and that overlapping map.
    if(verbose) std::cout<<comm->getRank()<<": constructing Tpetra map with copies\n";
    dummy = Teuchos::OrdinalTraits <Tpetra::global_size_t>::invalid();
    RCP<const map_t> mapWithCopies = rcp(new map_t(dummy,
                                    Teuchos::arrayViewFromVector(ownedPlusGhosts),
                                    0, comm));
    if(verbose) std::cout<<comm->getRank()<<": done constructing map with copies\n";

    using import_t = Tpetra::Import<lno_t, gno_t>;
    Teuchos::RCP<import_t> importer = rcp(new import_t(mapOwned,
                                                       mapWithCopies));
    if(verbose) std::cout<<comm->getRank()<<": done constructing importer\n";
    //single-column FEMultiVector that will hold the vertex colors
    Teuchos::RCP<femv_t> femv = rcp(new femv_t(mapOwned,
                                               importer, 1, true));
    if(verbose) std::cout<<comm->getRank()<<": done constructing femv\n";

    //Create random numbers seeded on global IDs to resolve conflicts
    //consistently between processes.
    //Seeding std::srand with the GID means every process computes the
    //same pseudo-random value for a given vertex.
    std::vector<int> rand(ownedPlusGhosts.size());
    for(size_t i = 0; i < rand.size(); i++){
      std::srand(ownedPlusGhosts[i]);
      rand[i] = std::rand();
    }

    // get up-to-date owners for all ghosts
    // (constructSecondGhostLayer may have reordered them, and the
    // second-layer ghosts were not in the earlier owner lookup).
    std::vector<int> ghostOwners2(ownedPlusGhosts.size() -nVtx);
    std::vector<gno_t> ghosts2(ownedPlusGhosts.size() - nVtx);
    for(size_t i = nVtx; i < ownedPlusGhosts.size(); i++) ghosts2[i-nVtx] = ownedPlusGhosts[i];
    Teuchos::ArrayView<int> owners2 = Teuchos::arrayViewFromVector(ghostOwners2);
    Teuchos::ArrayView<const gno_t> ghostGIDs = Teuchos::arrayViewFromVector(ghosts2);
    mapOwned->getRemoteIndexList(ghostGIDs,owners2);
    if(verbose) std::cout<<comm->getRank()<<": done getting ghost owners\n";

    //calculations for how many 2GL verts are in the boundary of another process, only
    //do this if it's going to be displayed.
    //This block is purely diagnostic; it does not affect the coloring.
    if(verbose) {
      std::cout<<comm->getRank()<<": calculating 2GL stats...\n";

      std::vector<int> sendcounts(comm->getSize(),0);
      std::vector<gno_t> sdispls(comm->getSize()+1,0);
      //loop through owners, count how many vertices we'll send to each processor.
      //Indices [0, nGhosts) of ghostGIDs are first-layer ghosts; starting at
      //nGhosts restricts this to second-layer ghosts only.
      //NOTE(review): loop index is int while ghostGIDs.size() is unsigned —
      //fine for realistic sizes, but inconsistent with the size_t loop below.
      for(int i = nGhosts; i < ghostGIDs.size(); i++){
        if(owners2[i] != comm->getRank()&& owners2[i] !=-1) sendcounts[owners2[i]]++;
      }
      //construct sdispls (for building sendbuf), and sum the total sendcount
      gno_t sendcount = 0;
      for(int i = 1; i < comm->getSize()+1; i++){
        sdispls[i] = sdispls[i-1] + sendcounts[i-1];
        sendcount += sendcounts[i-1];
      }
      //idx[p] is the next free slot in sendbuf for process p
      std::vector<gno_t> idx(comm->getSize(),0);
      for(int i = 0; i < comm->getSize(); i++){
        idx[i]=sdispls[i];
      }
      //construct sendbuf to send GIDs to owning processes
      std::vector<gno_t> sendbuf(sendcount,0);
      for(size_t i = nGhosts; i < (size_t)ghostGIDs.size(); i++){
        if(owners2[i] != comm->getRank() && owners2[i] != -1){
          sendbuf[idx[owners2[i]]++] = ghostGIDs[i];
        }
      }
      //do the communication: each process receives the GIDs of its owned
      //vertices that appear in some other process's second ghost layer.
      Teuchos::ArrayView<int> sendcounts_view = Teuchos::arrayViewFromVector(sendcounts);
      Teuchos::ArrayView<gno_t> sendbuf_view = Teuchos::arrayViewFromVector(sendbuf);
      Teuchos::ArrayRCP<gno_t>  recvbuf;
      std::vector<int> recvcounts(comm->getSize(),0);
      Teuchos::ArrayView<int> recvcounts_view = Teuchos::arrayViewFromVector(recvcounts);
      Zoltan2::AlltoAllv<gno_t>(*comm, *env, sendbuf_view, sendcounts_view, recvbuf, recvcounts_view);
      std::vector<int> is_bndry_send(recvbuf.size(),0);

      //send back how many received vertices are in the boundary
      //(a vertex is "boundary" if it has any neighbor that is not owned).
      for(int i = 0; i < recvbuf.size(); i++){
        size_t lid = mapWithCopies->getLocalElement(recvbuf[i]);
        is_bndry_send[i] = 0;
        if(lid < nVtx){
          for(offset_t j = offsets[lid]; j < offsets[lid+1]; j++){
            if((size_t)local_adjs[j] >= nVtx) is_bndry_send[i] = 1;
          }
        } else{
          //NOTE(review): first_layer_ghost_offsets is documented above as
          //being indexed by (LID - nVtx); indexing it with the raw lid here
          //looks suspicious — confirm whether this should be lid - nVtx.
          for(offset_t j = first_layer_ghost_offsets[lid]; j < first_layer_ghost_offsets[lid+1]; j++){
            if((size_t)local_ghost_adjs[j] >= nVtx) is_bndry_send[i] = 1;
          }
        }
      }

      //send back the boundary flags, using the receive counts from the
      //previous exchange as this exchange's send counts.
      Teuchos::ArrayView<int> is_bndry_send_view = Teuchos::arrayViewFromVector(is_bndry_send);
      Teuchos::ArrayRCP<int> is_bndry_recv;
      std::vector<int> bndry_recvcounts(comm->getSize(),0);
      Teuchos::ArrayView<int> bndry_recvcounts_view = Teuchos::arrayViewFromVector(bndry_recvcounts);
      Zoltan2::AlltoAllv<int> (*comm, *env, is_bndry_send_view, recvcounts_view, is_bndry_recv, bndry_recvcounts_view);

      //add together the flags to compute how many boundary vertices are in the second ghost layer
      int boundaryverts = 0;
      for(int i = 0; i < is_bndry_recv.size(); i++){
        boundaryverts += is_bndry_recv[i];
      }
      //this cout is guarded by a verbose check.
      std::cout<<comm->getRank()<<": "<<boundaryverts<<" boundary verts out of "<<n2Ghosts<<" verts in 2GL\n";
    }


    //local CSR representation for the owned vertices:
    Teuchos::ArrayView<const lno_t> local_adjs_view = Teuchos::arrayViewFromVector(local_adjs);
    //NOTE: the offset ArrayView was constructed earlier, and is up-to-date.
    //
    //first ghost layer CSR representation:
    Teuchos::ArrayView<const offset_t> ghost_offsets = Teuchos::arrayViewFromVector(first_layer_ghost_offsets);
    Teuchos::ArrayView<const lno_t> ghost_adjacencies = Teuchos::arrayViewFromVector(local_ghost_adjs);
    Teuchos::ArrayView<const int> rand_view = Teuchos::arrayViewFromVector(rand);
    Teuchos::ArrayView<const gno_t> gid_view = Teuchos::arrayViewFromVector(ownedPlusGhosts);
    //these ArrayViews contain LIDs that are ghosted remotely,
    //and the Process IDs that have those ghost copies.
    //An LID may appear in the exportLIDs more than once.
    Teuchos::ArrayView<const lno_t> exportLIDs = importer->getExportLIDs();
    Teuchos::ArrayView<const int> exportPIDs = importer->getExportPIDs();

    //construct a quick lookup datastructure to map from LID to a
    //list of PIDs we need to send data to.
    std::unordered_map<lno_t, std::vector<int>> procs_to_send;
    for(int i = 0; i < exportLIDs.size(); i++){
      procs_to_send[exportLIDs[i]].push_back(exportPIDs[i]);
    }

    //call the coloring algorithm
    twoGhostLayer(nVtx, nVtx+nGhosts, local_adjs_view, offsets, ghost_adjacencies, ghost_offsets,
                  femv, gid_view, rand_view, owners2, mapWithCopies, procs_to_send);

    //copy colors to the output array; only owned vertices are reported.
    ArrayRCP<int> colors = solution->getColorsRCP();
    auto femvdata = femv->getData(0);
    for(size_t i=0; i<nVtx; i++){
      colors[i] = femvdata[i];
    }

  }
1004 // private:
1005 
1006  //This is the coloring logic for all 2GL-based methods.
1007  //
1008  //INPUT ARGS:
1009  //
1010  // n_local: the number of local owned vertices
1011  //
1012  // n_total: the number of local owned vertices plus first layer ghosts
1013  //
1014  // adjs: the CSR adjacencies for the graph, only including owned vertices
1015  // and first layer ghosts
1016  //
1017  // offsets: the CSR offsets for the graph, has owned offsets into adjacencies
1018  //
1019  // ghost_adjs: the adjacency information for first-layer ghost vertices.
1020  // Includes all neighbors (owned, first-layer ghost,
1021  // second-layer ghost) for each first-layer ghost.
1022  //
1023  //
1024  // ghost_offsets: the offsets into ghost_adjs, first layer ghost LIDs
1025  // minus n_local are used to index this
1026  // datastructure. I.E. ghost_offsets(0)
1027  // corresponds to the ghost with LID n_local
1028  //
1029  // gids: a vector that holds GIDs for all vertices on this process
1030  // (includes owned, and all ghosts, indexable by LID)
1031  // The GIDs are ordered like this:
1032  // ----------------------------------
1033  // | | first | second |
1034  // | owned | layer | layer |
1035  // | | ghosts | ghosts |
1036  // ----------------------------------
1037  // ^ ^
1038  // n_local n_total
1039  //
1040  // rand: a vector that holds random numbers generated on GID for tie
1041  // breaking. Indexed by LID.
1042  //
1043  // ghost_owners: contains the process ID for the owner of each remote vertex.
1044  // Indexable by LID. owners[i] = the owning process for vertex
1045  // with GID = gids[i+n_local]
1046  //
1047  // mapOwnedPlusGhosts: map that can translate between GID and LID
1048  //
1049  // procs_to_send: for each vertex that is copied on a remote process,
1050  // this map contains the list of processes that have
1051  // a copy of a given vertex. Input LID, get the list
1052  // of PIDs that have a ghost copy of that LID.
1053  //
1054  //OUTPUT ARGS:
1055  //
1056  // femv: an FEMultiVector that holds a dual view of the colors.
1057  // After this call, femv contains updated colors.
1058  //
1059  void twoGhostLayer(const size_t n_local, const size_t n_total,
1060  const Teuchos::ArrayView<const lno_t>& adjs,
1061  const Teuchos::ArrayView<const offset_t>& offsets,
1062  const Teuchos::ArrayView<const lno_t>& ghost_adjs,
1063  const Teuchos::ArrayView<const offset_t>& ghost_offsets,
1064  const Teuchos::RCP<femv_t>& femv,
1065  const Teuchos::ArrayView<const gno_t>& gids,
1066  const Teuchos::ArrayView<const int>& rand,
1067  const Teuchos::ArrayView<const int>& ghost_owners,
1068  RCP<const map_t> mapOwnedPlusGhosts,
1069  const std::unordered_map<lno_t, std::vector<int>>& procs_to_send){
1070  //initialize timing variables
1071  double total_time = 0.0;
1072  double interior_time = 0.0;
1073  double comm_time = 0.0;
1074  double comp_time = 0.0;
1075  double recoloring_time=0.0;
1076  double conflict_detection = 0.0;
1077 
1078  //Number of rounds we are saving statistics for
1079  //100 is a decent default. Reporting requires --verbose argument.
1080  const int numStatisticRecordingRounds = 100;
1081 
1082  //includes all ghosts, including the second layer.
1083  const size_t n_ghosts = rand.size() - n_local;
1084 
1085 
1086  //Get the degrees of all ghost vertices
1087  //This is necessary for recoloring based on degrees,
1088  //we need ghost vertex degrees for consistency.
1089  std::vector<int> deg_send_cnts(comm->getSize(),0);
1090  std::vector<gno_t> deg_sdispls(comm->getSize()+1,0);
1091  for(int i = 0; i < ghost_owners.size(); i++){
1092  deg_send_cnts[ghost_owners[i]]++;
1093  }
1094  deg_sdispls[0] = 0;
1095  gno_t deg_sendsize = 0;
1096  std::vector<int> deg_sentcount(comm->getSize(),0);
1097  for(int i = 1; i < comm->getSize()+1; i++){
1098  deg_sdispls[i] = deg_sdispls[i-1] + deg_send_cnts[i-1];
1099  deg_sendsize += deg_send_cnts[i-1];
1100  }
1101  std::vector<gno_t> deg_sendbuf(deg_sendsize,0);
1102  for(int i = 0; i < ghost_owners.size(); i++){
1103  size_t idx = deg_sdispls[ghost_owners[i]] + deg_sentcount[ghost_owners[i]];
1104  deg_sentcount[ghost_owners[i]]++;
1105  deg_sendbuf[idx] = mapOwnedPlusGhosts->getGlobalElement(i+n_local);
1106  }
1107  Teuchos::ArrayView<int> deg_send_cnts_view = Teuchos::arrayViewFromVector(deg_send_cnts);
1108  Teuchos::ArrayView<gno_t> deg_sendbuf_view = Teuchos::arrayViewFromVector(deg_sendbuf);
1109  Teuchos::ArrayRCP<gno_t> deg_recvbuf;
1110  std::vector<int> deg_recvcnts(comm->getSize(),0);
1111  Teuchos::ArrayView<int> deg_recvcnts_view = Teuchos::arrayViewFromVector(deg_recvcnts);
1112  Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_sendbuf_view, deg_send_cnts_view, deg_recvbuf, deg_recvcnts_view);
1113 
1114  //replace GID with the degree the owning process has for this vertex.
1115  //(this will include all edges present for this vertex globally)
1116  //This is safe to do, since we sent ghosts to their owners.
1117  for(int i = 0; i < deg_recvbuf.size(); i++){
1118  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_recvbuf[i]);
1119  deg_recvbuf[i] = offsets[lid+1] - offsets[lid];
1120  }
1121  //send modified buffer back
1122  ArrayRCP<gno_t> ghost_degrees;
1123  Zoltan2::AlltoAllv<gno_t>(*comm, *env, deg_recvbuf(), deg_recvcnts_view, ghost_degrees, deg_send_cnts_view);
1124 
1125  //create the ghost degree views, for use on host and device.
1126  Kokkos::View<gno_t*, device_type> ghost_degrees_dev("ghost degree view",ghost_degrees.size());
1127  typename Kokkos::View<gno_t*, device_type>::HostMirror ghost_degrees_host = Kokkos::create_mirror(ghost_degrees_dev);
1128  for(int i = 0; i < ghost_degrees.size(); i++){
1129  lno_t lid = mapOwnedPlusGhosts->getLocalElement(deg_sendbuf[i]);
1130  ghost_degrees_host(lid-n_local) = ghost_degrees[i];
1131  }
1132  Kokkos::deep_copy(ghost_degrees_dev, ghost_degrees_host);
1133 
1134  //track the size of sends and receives through coloring rounds.
1135  gno_t recvPerRound[numStatisticRecordingRounds];
1136  gno_t sentPerRound[numStatisticRecordingRounds];
1137 
1138  //find global max degree to determine which
1139  //coloring algorithm will be the most efficient
1140  //(For D1-2GL this is important, D2 and PD2 should always use NBBIT
1141  offset_t local_max_degree = 0;
1142  offset_t global_max_degree = 0;
1143  for(size_t i = 0; i < n_local; i++){
1144  offset_t curr_degree = offsets[i+1] - offsets[i];
1145  if(curr_degree > local_max_degree){
1146  local_max_degree = curr_degree;
1147  }
1148  }
1149  Teuchos::reduceAll<int, offset_t>(*comm, Teuchos::REDUCE_MAX,1, &local_max_degree, &global_max_degree);
1150  if(comm->getRank() == 0 && verbose) std::cout<<"Input has max degree "<<global_max_degree<<"\n";
1151 
1152  if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for initial coloring\n";
1153 
1154  //the initial coloring is able to use a standard CSR representation, so we construct that here.
1155  //This is a direct translation of the offsets and adjs arguments into Kokkos::Views.
1156  Kokkos::View<offset_t*, device_type> offsets_dev("Host Offset View", offsets.size());
1157  typename Kokkos::View<offset_t*, device_type>::HostMirror offsets_host = Kokkos::create_mirror(offsets_dev);
1158  Kokkos::View<lno_t*, device_type> adjs_dev("Host Adjacencies View", adjs.size());
1159  typename Kokkos::View<lno_t*, device_type>::HostMirror adjs_host = Kokkos::create_mirror(adjs_dev);
1160  for(Teuchos_Ordinal i = 0; i < offsets.size(); i++) offsets_host(i) = offsets[i];
1161  for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) adjs_host(i) = adjs[i];
1162  Kokkos::deep_copy(offsets_dev,offsets_host);
1163  Kokkos::deep_copy(adjs_dev, adjs_host);
1164 
1165 
1166  //create the graph structures which allow KokkosKernels to recolor the conflicting vertices
1167  if(verbose) std::cout<<comm->getRank()<<": constructing Kokkos Views for recoloring\n";
1168 
1169  //in order to correctly recolor, all ghost vertices on this process need an entry in the CSR offsets.
1170  //Otherwise, the color of the ghosts will be ignored, and the coloring will not converge.
1171  Kokkos::View<offset_t*, device_type> dist_degrees_dev("Owned+Ghost degree view",rand.size());
1172  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_degrees_host = Kokkos::create_mirror(dist_degrees_dev);
1173 
1174  //calculate the local degrees for the owned, first layer ghosts, and second layer ghosts
1175  //to be used to construct the CSR representation of the local graph.
1176  //owned
1177  for(Teuchos_Ordinal i = 0; i < offsets.size()-1; i++) dist_degrees_host(i) = offsets[i+1] - offsets[i];
1178  //first layer ghosts
1179  for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++) dist_degrees_host(i+n_local) = ghost_offsets[i+1] - ghost_offsets[i];
1180  //second layer ghosts
1181  for(Teuchos_Ordinal i = 0; i < ghost_adjs.size(); i++){
1182  //second layer ghosts have LID >= n_total
1183  if((size_t)ghost_adjs[i] >= n_total ){
1184  dist_degrees_host(ghost_adjs[i])++;
1185  }
1186  }
1187 
1188  //The structure of the final CSR constructed here looks like:
1189  //
1190  // Index by LID--v
1191  // --------------------------------------------
1192  // | | first |second |
1193  // dist_offsets_dev/host |owned verts | layer |layer |
1194  // | | ghosts |ghosts |
1195  // --------------------------------------------
1196  // |indexes
1197  // v
1198  // --------------------------------------------
1199  // | |first |second |
1200  // dist_adjs_dev/host |owned adjs |layer |layer |
1201  // | |ghost adjs |ghost adjs |
1202  // --------------------------------------------
1203  //
1204  // We end up with a CSR that has adjacency information for all the vertices on
1205  // this process. We include only edges on the process, so ghosts have only partial
1206  // adjacency information.
1207  //
1208  // We symmetrize the local graph representation as well, for
1209  // KokkosKernels to behave as we need it to. This means that edges to
1210  // second layer ghosts must be symmetrized, so we end up with offsets
1211  // that correspond to second layer ghosts.
1212  Kokkos::View<offset_t*, device_type> dist_offsets_dev("Owned+Ghost Offset view", rand.size()+1);
1213  typename Kokkos::View<offset_t*, device_type>::HostMirror dist_offsets_host = Kokkos::create_mirror(dist_offsets_dev);
1214 
1215  //we can construct the entire offsets using the degrees we computed above
1216  dist_offsets_host(0) = 0;
1217  offset_t total_adjs = 0;
1218  for(Teuchos_Ordinal i = 1; i < rand.size()+1; i++){
1219  dist_offsets_host(i) = dist_degrees_host(i-1) + dist_offsets_host(i-1);
1220  total_adjs += dist_degrees_host(i-1);
1221  }
1222  Kokkos::View<lno_t*, device_type> dist_adjs_dev("Owned+Ghost adjacency view", total_adjs);
1223  typename Kokkos::View<lno_t*, device_type>::HostMirror dist_adjs_host = Kokkos::create_mirror(dist_adjs_dev);
1224 
1225  //zero out the degrees to use it as a counter.
1226  //The offsets now allow us to compute degrees,
1227  //so we aren't losing anything.
1228  for(Teuchos_Ordinal i = 0; i < rand.size(); i++){
1229  dist_degrees_host(i) = 0;
1230  }
1231  //We can just copy the adjacencies for the owned verts and first layer ghosts
1232  for(Teuchos_Ordinal i = 0; i < adjs.size(); i++) dist_adjs_host(i) = adjs[i];
1233  for(Teuchos_Ordinal i = adjs.size(); i < adjs.size() + ghost_adjs.size(); i++) dist_adjs_host(i) = ghost_adjs[i-adjs.size()];
1234 
1235  //We have to symmetrize edges that involve a second layer ghost.
1236  //Add edges from the second layer ghosts to their neighbors.
1237  for(Teuchos_Ordinal i = 0; i < ghost_offsets.size()-1; i++){
1238  //loop through each first layer ghost
1239  for(offset_t j = ghost_offsets[i]; j < ghost_offsets[i+1]; j++){
1240  //if an adjacency is a second layer ghost
1241  if((size_t)ghost_adjs[j] >= n_total){
1242  //compute the offset to place the symmetric edge, and place it.
1243  dist_adjs_host(dist_offsets_host(ghost_adjs[j]) + dist_degrees_host(ghost_adjs[j])) = i + n_local;
1244  //increment the counter to keep track of how many adjacencies
1245  //have been placed.
1246  dist_degrees_host(ghost_adjs[j])++;
1247  }
1248  }
1249  }
1250  //copy the host views back to the device views
1251  Kokkos::deep_copy(dist_degrees_dev,dist_degrees_host);
1252  Kokkos::deep_copy(dist_offsets_dev,dist_offsets_host);
1253  Kokkos::deep_copy(dist_adjs_dev, dist_adjs_host);
1254 
1255  //this view represents how many conflicts were found
1256  Kokkos::View<size_t*, device_type> recoloringSize("Recoloring Queue Size",1);
1257  typename Kokkos::View<size_t*, device_type>::HostMirror recoloringSize_host = Kokkos::create_mirror(recoloringSize);
1258  recoloringSize_host(0) = 0;
1259  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1260 
1261  //create a view for the tie-breaking random numbers.
1262  if(verbose) std::cout<<comm->getRank()<<": constructing rand and GIDs views\n";
1263  Kokkos::View<int*, device_type> rand_dev("Random View", rand.size());
1264  typename Kokkos::View<int*, device_type>::HostMirror rand_host = Kokkos::create_mirror(rand_dev);
1265  for(Teuchos_Ordinal i = 0; i < rand.size(); i ++) rand_host(i) = rand[i];
1266  Kokkos::deep_copy(rand_dev,rand_host);
1267 
1268  //create a view for the global IDs of each vertex.
1269  Kokkos::View<gno_t*, device_type> gid_dev("GIDs", gids.size());
1270  typename Kokkos::View<gno_t*, device_type>::HostMirror gid_host = Kokkos::create_mirror(gid_dev);
1271  for(Teuchos_Ordinal i = 0; i < gids.size(); i++) gid_host(i) = gids[i];
1272  Kokkos::deep_copy(gid_dev,gid_host);
1273 
1274  //These variables will be initialized by constructBoundary()
1275  //
1276  //boundary_verts_dev holds all owned vertices that could possibly conflict
1277  //with a remote vertex. Stores LIDs.
1278  Kokkos::View<lno_t*, device_type> boundary_verts_dev;
1279  //this is the number of vertices in the boundary.
1280  if(verbose) std::cout<<comm->getRank()<<": constructing communication and recoloring lists\n";
1281 
1282  //We keep track of the vertices that need to get recolored
1283  //this list can contain ghost vertices, so it needs to be initialized
1284  //to the number of all vertices on the local process.
1285  //rand has an entry for each vertex, so its size corresponds to what is needed.
1286  Kokkos::View<lno_t*, device_type> verts_to_recolor_view("verts to recolor", rand.size());
1287  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_recolor_host = create_mirror(verts_to_recolor_view);
1288 
1289  //This view keeps track of the size of the list of vertices to recolor.
1290  Kokkos::View<int*, device_type> verts_to_recolor_size("verts to recolor size",1);
1291  Kokkos::View<int*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_recolor_size_atomic = verts_to_recolor_size;
1292  typename Kokkos::View<int*, device_type>::HostMirror verts_to_recolor_size_host = create_mirror(verts_to_recolor_size);
1293 
1294  //initialize the host view
1295  verts_to_recolor_size_host(0) = 0;
1296  //copy to device
1297  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1298 
1299  //verts to send can only include locally owned vertices,
1300  //so we can safely allocate the view to size n_local.
1301  //
1302  //This view is a list of local vertices that are going to be
1303  //recolored, and need to be sent to their remote copies.
1304  Kokkos::View<lno_t*, device_type> verts_to_send_view("verts to send", n_local);
1305  typename Kokkos::View<lno_t*, device_type>::HostMirror verts_to_send_host = create_mirror(verts_to_send_view);
1306 
1307  //this view keeps track of the size of verts_to_send.
1308  Kokkos::View<size_t*, device_type> verts_to_send_size("verts to send size",1);
1309  Kokkos::View<size_t*, device_type, Kokkos::MemoryTraits<Kokkos::Atomic>> verts_to_send_size_atomic = verts_to_send_size;
1310  typename Kokkos::View<size_t*, device_type>::HostMirror verts_to_send_size_host = create_mirror(verts_to_send_size);
1311 
1312  verts_to_send_size_host(0) = 0;
1313  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1314 
1315  if(verbose) std::cout<<comm->getRank()<<": Constructing the boundary\n";
1316 
1317  //this function allocates and initializes the boundary_verts_dev view,
1318  //sets the boundary_size accordingly, and adds vertices to the
1319  //verts_to_send_atomic view, updating the size view as well.
1320  constructBoundary(n_local, dist_offsets_dev, dist_adjs_dev, dist_offsets_host, dist_adjs_host, boundary_verts_dev,
1321  verts_to_send_view, verts_to_send_size_atomic);
1322 
1323  //this boolean chooses which KokkosKernels algorithm to use.
1324  //It was experimentally chosen for distance-1 coloring.
1325  //Should be disregarded for distance-2 colorings.
1326  bool use_vbbit = (global_max_degree < 6000);
1327  //Done initializing, start coloring!
1328 
1329  //use a barrier if we are reporting timing info
1330  if(timing) comm->barrier();
1331  interior_time = timer();
1332  total_time = timer();
1333  //give the entire local graph to KokkosKernels to color
1334  this->colorInterior(n_local, adjs_dev, offsets_dev, femv,adjs_dev,0,use_vbbit);
1335  interior_time = timer() - interior_time;
1336  comp_time = interior_time;
1337 
1338  //ghost_colors holds the colors of only ghost vertices.
1339  //ghost_colors(0) holds the color of a vertex with LID n_local.
1340  //To index this with LIDs, subtract n_local. If an LID is < n_local,
1341  //it will not have a color stored in this view.
1342  Kokkos::View<int*,device_type> ghost_colors("ghost color backups", n_ghosts);
1343 
1344  //communicate the initial coloring.
1345  if(verbose) std::cout<<comm->getRank()<<": communicating\n";
1346 
1347  //update the host views for the verts to send,
1348  //doOwnedToGhosts needs host memory views, but doesn't
1349  //change the values in them, so no need to copy afterwards
1350  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1351  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1352  gno_t recv,sent;
1353  comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1354  sentPerRound[0] = sent;
1355  recvPerRound[0] = recv;
1356 
1357  //store ghost colors so we can restore them after recoloring.
1358  //the local process can't color ghosts correctly, so we
1359  //reset the colors to avoid consistency issues.
1360  //get the color view from the FEMultiVector
1361  auto femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1362  auto femv_colors = subview(femvColors, Kokkos::ALL, 0);
1363  Kokkos::parallel_for(n_ghosts, KOKKOS_LAMBDA(const int& i){
1364  ghost_colors(i) = femv_colors(i+n_local);
1365  });
1366  Kokkos::fence();
1367 
1368  double temp = timer();
1369  //detect conflicts only for ghost vertices
1370  bool recolor_degrees = this->pl->template get<bool>("recolor_degrees",false);
1371  if(verbose) std::cout<<comm->getRank()<<": detecting conflicts\n";
1372 
1373  //these sizes will be updated by detectConflicts, zero them out beforehand
1374  verts_to_send_size_host(0) = 0;
1375  verts_to_recolor_size_host(0) = 0;
1376  recoloringSize_host(0) = 0;
1377  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1378  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1379  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1380 
1381  detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev, femv_colors, boundary_verts_dev,
1382  verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1383  recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1384 
1385  //these sizes were updated by detectConflicts,
1386  //copy the device views back into host memory
1387  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1388  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1389  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1390  Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1391 
1392  if(comm->getSize() > 1){
1393  conflict_detection = timer() - temp;
1394  comp_time += conflict_detection;
1395  }
1396  //all conflicts detected!
1397  if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";
1398  //variables for statistics
1399  double totalPerRound[numStatisticRecordingRounds];
1400  double commPerRound[numStatisticRecordingRounds];
1401  double compPerRound[numStatisticRecordingRounds];
1402  double recoloringPerRound[numStatisticRecordingRounds];
1403  double conflictDetectionPerRound[numStatisticRecordingRounds];
1404  uint64_t vertsPerRound[numStatisticRecordingRounds];
1405  uint64_t incorrectGhostsPerRound[numStatisticRecordingRounds];
1406  int distributedRounds = 1;
1407  totalPerRound[0] = interior_time + comm_time + conflict_detection;
1408  recoloringPerRound[0] = 0;
1409  commPerRound[0] = comm_time;
1410  compPerRound[0] = interior_time + conflict_detection;
1411  conflictDetectionPerRound[0] = conflict_detection;
1412  recoloringPerRound[0] = 0;
1413  vertsPerRound[0] = 0;
1414  incorrectGhostsPerRound[0]=0;
1415  typename Kokkos::View<int*, device_type>::HostMirror ghost_colors_host;
1416  typename Kokkos::View<lno_t*, device_type>::HostMirror boundary_verts_host;
1417  size_t serial_threshold = this->pl->template get<int>("serial_threshold",0);
1418  //see if recoloring is necessary.
1419  size_t totalConflicts = 0;
1420  size_t localConflicts = recoloringSize_host(0);
1421  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localConflicts, &totalConflicts);
1422  bool done = !totalConflicts;
1423  if(comm->getSize()==1) done = true;
1424 
1425  //recolor until no conflicts are left
1426  while(!done){
1427  //if the number of vertices left to recolor is less than
1428  //the serial threshold set by the user, finish the coloring
1429  //only using host views in a serial execution space.
1430  if(recoloringSize_host(0) < serial_threshold) break;
1431  if(distributedRounds < numStatisticRecordingRounds) {
1432  vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
1433  }
1434 
1435  if(timing) comm->barrier();
1436  double recolor_temp = timer();
1437  //recolor using KokkosKernels' coloring function
1438  if(verts_to_recolor_size_host(0) > 0){
1439  this->colorInterior(femv_colors.size(), dist_adjs_dev, dist_offsets_dev,femv,verts_to_recolor_view,verts_to_recolor_size_host(0),use_vbbit);
1440  }
1441 
1442  if(distributedRounds < numStatisticRecordingRounds){
1443  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1444  recoloring_time += recoloringPerRound[distributedRounds];
1445  comp_time += recoloringPerRound[distributedRounds];
1446  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1447  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1448  } else if(timing){
1449  double recoloring_round_time = timer() - recolor_temp;
1450  recoloring_time += recoloring_round_time;
1451  comp_time += recoloring_round_time;
1452  }
1453 
1454  //reset the ghost colors to what they were before recoloring
1455  //the recoloring does not have enough information to color
1456  //ghosts correctly, so we set the colors to what they were before
1457  //to avoid consistency issues.
1458  Kokkos::parallel_for(n_ghosts, KOKKOS_LAMBDA(const int& i){
1459  femv_colors(i+n_local) = ghost_colors(i);
1460  });
1461  Kokkos::fence();
1462 
1463  //send views are up-to-date, they were copied after conflict detection.
1464  //communicate the new colors
1465 
1466  // Reset device views
1467  femvColors = decltype(femvColors)();
1468  femv_colors = decltype(femv_colors)();
1469  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts,n_local,verts_to_send_host,verts_to_send_size_host,femv,procs_to_send,sent,recv);
1470  comm_time += curr_comm_time;
1471 
1472  if(distributedRounds < numStatisticRecordingRounds){
1473  commPerRound[distributedRounds] = curr_comm_time;
1474  recvPerRound[distributedRounds] = recv;
1475  sentPerRound[distributedRounds] = sent;
1476  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1477  }
1478 
1479  //store ghost colors before we uncolor and recolor them.
1480  //this process doesn't have enough info to correctly color
1481  //ghosts, so we set them back to what they were before to
1482  //remove consistency issues.
1483  femvColors = femv->getLocalViewDevice(Tpetra::Access::ReadWrite);
1484  femv_colors = subview(femvColors, Kokkos::ALL, 0);
1485  Kokkos::parallel_for(n_ghosts, KOKKOS_LAMBDA(const int& i){
1486  ghost_colors(i) = femv_colors(i+n_local);
1487  });
1488  Kokkos::fence();
1489 
1490 
1491  //these views will change in detectConflicts, they need
1492  //to be zeroed out beforehand
1493  verts_to_send_size_host(0) = 0;
1494  verts_to_recolor_size_host(0) = 0;
1495  recoloringSize_host(0) = 0;
1496  Kokkos::deep_copy(verts_to_send_size, verts_to_send_size_host);
1497  Kokkos::deep_copy(verts_to_recolor_size, verts_to_recolor_size_host);
1498  Kokkos::deep_copy(recoloringSize, recoloringSize_host);
1499 
1500  //check for further conflicts
1501  double detection_temp = timer();
1502 
1503  detectConflicts(n_local, dist_offsets_dev, dist_adjs_dev,femv_colors, boundary_verts_dev,
1504  verts_to_recolor_view, verts_to_recolor_size_atomic, verts_to_send_view, verts_to_send_size_atomic,
1505  recoloringSize, rand_dev, gid_dev, ghost_degrees_dev, recolor_degrees);
1506 
1507  //copy the updated device views back into host memory where necessary
1508  Kokkos::deep_copy(verts_to_send_host, verts_to_send_view);
1509  Kokkos::deep_copy(verts_to_send_size_host, verts_to_send_size);
1510  //we only use the list of verts to recolor on device, no need to copy to host.
1511  Kokkos::deep_copy(verts_to_recolor_size_host, verts_to_recolor_size);
1512  Kokkos::deep_copy(recoloringSize_host, recoloringSize);
1513 
1514  if(distributedRounds < numStatisticRecordingRounds){
1515  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1516  conflict_detection += conflictDetectionPerRound[distributedRounds];
1517  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1518  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1519  comp_time += conflictDetectionPerRound[distributedRounds];
1520  } else if(timing){
1521  double conflict_detection_round_time = timer() - detection_temp;
1522  conflict_detection += conflict_detection_round_time;
1523  comp_time += conflict_detection_round_time;
1524  }
1525 
1526  distributedRounds++;
1527  size_t localDone = recoloringSize_host(0);
1528  size_t globalDone = 0;
1529  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1530  done = !globalDone;
1531 
1532  }//end device coloring
1533 
1534 
1535  //If we reach this point and still have vertices to color,
1536  //the rest of the coloring is happening in serial on the host.
1537  //
1538  //First, we need to copy device-only views to host mirrors
1539  if(recoloringSize_host(0) > 0 || !done){
1540  ghost_colors_host = Kokkos::create_mirror_view_and_copy(host_mem(),ghost_colors,"ghost_colors host mirror");
1541  boundary_verts_host = Kokkos::create_mirror_view_and_copy(host_mem(),boundary_verts_dev,"boundary_verts host mirror");
1542  }
1543 
1544  //Now we do a similar coloring loop to before,
1545  //but using only host views in a serial execution space.
1546  // Reset device views
1547  femvColors = decltype(femvColors)();
1548  femv_colors = decltype(femv_colors)();
1549  while(recoloringSize_host(0) > 0 || !done){
1550  auto femvColors_host = femv->getLocalViewHost(Tpetra::Access::ReadWrite);
1551  auto colors_host = subview(femvColors_host, Kokkos::ALL, 0);
1552  if(distributedRounds < numStatisticRecordingRounds){
1553  vertsPerRound[distributedRounds] = recoloringSize_host(0);
1554  }
1555  if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
1556  if(timing) comm->barrier();
1557 
1558  double recolor_temp = timer();
1559  if(verts_to_recolor_size_host(0) > 0){
1560  this->colorInterior_serial(colors_host.size(), dist_adjs_host, dist_offsets_host, femv,
1561  verts_to_recolor_host, verts_to_recolor_size_host(0), true);
1562  }
1563  if(distributedRounds < numStatisticRecordingRounds){
1564  recoloringPerRound[distributedRounds] = timer() - recolor_temp;
1565  recoloring_time += recoloringPerRound[distributedRounds];
1566  comp_time += recoloringPerRound[distributedRounds];
1567  compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1568  totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
1569  } else if(timing){
1570  double recoloring_serial_round_time = timer() - recolor_temp;
1571  recoloring_time += recoloring_serial_round_time;
1572  comp_time += recoloring_serial_round_time;
1573  }
1574 
1575  //reset the ghost colors to their previous values to avoid
1576  //consistency issues.
1577  for(size_t i = 0; i < n_ghosts; i++){
1578  colors_host(i+n_local) = ghost_colors_host(i);
1579  }
1580 
1581  double curr_comm_time = doOwnedToGhosts(mapOwnedPlusGhosts, n_local,verts_to_send_host, verts_to_send_size_host, femv, procs_to_send, sent,recv);
1582  comm_time += curr_comm_time;
1583 
1584  if(distributedRounds < numStatisticRecordingRounds){
1585  commPerRound[distributedRounds] = curr_comm_time;
1586  recvPerRound[distributedRounds] = recv;
1587  sentPerRound[distributedRounds] = sent;
1588  totalPerRound[distributedRounds] += commPerRound[distributedRounds];
1589  }
1590 
1591  //store updated ghost colors we received from their owners
1592  //before conflict detection and recoloring changes them locally.
1593  for(size_t i = 0; i < n_ghosts; i++){
1594  ghost_colors_host(i) = colors_host(i+n_local);
1595  }
1596 
1597  if(timing) comm->barrier();
1598  double detection_temp = timer();
1599 
1600  //zero these out, they'll be updated by detectConflicts_serial
1601  verts_to_recolor_size_host(0) = 0;
1602  verts_to_send_size_host(0) = 0;
1603  recoloringSize_host(0) = 0;
1604 
1605  detectConflicts_serial(n_local,dist_offsets_host, dist_adjs_host, colors_host, boundary_verts_host,
1606  verts_to_recolor_host, verts_to_recolor_size_host, verts_to_send_host, verts_to_send_size_host,
1607  recoloringSize_host, rand_host, gid_host, ghost_degrees_host, recolor_degrees);
1608 
1609  //no need to update the host views, we're only using host views now.
1610 
1611  if(distributedRounds < numStatisticRecordingRounds){
1612  conflictDetectionPerRound[distributedRounds] = timer() - detection_temp;
1613  conflict_detection += conflictDetectionPerRound[distributedRounds];
1614  compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1615  totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
1616  comp_time += conflictDetectionPerRound[distributedRounds];
1617  } else if(timing){
1618  double conflict_detection_serial_round_time = timer() - detection_temp;
1619  conflict_detection += conflict_detection_serial_round_time;
1620  comp_time += conflict_detection_serial_round_time;
1621  }
1622 
1623  size_t globalDone = 0;
1624  size_t localDone = recoloringSize_host(0);
1625  Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
1626  distributedRounds++;
1627  done = !globalDone;
1628  }
1629 
1630  total_time = timer() - total_time;
1631  //only compute stats if they will be displayed
1632  if(verbose){
1633  uint64_t localBoundaryVertices = 0;
1634  for(size_t i = 0; i < n_local; i++){
1635  for(offset_t j = offsets[i]; j < offsets[i+1]; j++){
1636  if((size_t)adjs[j] >= n_local){
1637  localBoundaryVertices++;
1638  break;
1639  }
1640  }
1641  }
1642 
1643  if(comm->getRank() == 0) printf("did %d rounds of distributed coloring\n", distributedRounds);
1644  uint64_t totalVertsPerRound[numStatisticRecordingRounds];
1645  uint64_t totalBoundarySize = 0;
1646  uint64_t totalIncorrectGhostsPerRound[numStatisticRecordingRounds];
1647  double finalTotalPerRound[numStatisticRecordingRounds];
1648  double maxRecoloringPerRound[numStatisticRecordingRounds];
1649  double minRecoloringPerRound[numStatisticRecordingRounds];
1650  double finalCommPerRound[numStatisticRecordingRounds];
1651  double finalCompPerRound[numStatisticRecordingRounds];
1652  double finalConflictDetectionPerRound[numStatisticRecordingRounds];
1653  gno_t finalRecvPerRound[numStatisticRecordingRounds];
1654  gno_t finalSentPerRound[numStatisticRecordingRounds];
1655  for(int i = 0; i < numStatisticRecordingRounds; i++){
1656  totalVertsPerRound[i] = 0;
1657  finalTotalPerRound[i] = 0.0;
1658  maxRecoloringPerRound[i] = 0.0;
1659  minRecoloringPerRound[i] = 0.0;
1660  finalCommPerRound[i] = 0.0;
1661  finalCompPerRound[i] = 0.0;
1662  finalConflictDetectionPerRound[i] = 0.0;
1663  finalRecvPerRound[i] = 0;
1664  finalSentPerRound[i] = 0;
1665  }
1666  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,1,&localBoundaryVertices, &totalBoundarySize);
1667  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,vertsPerRound,totalVertsPerRound);
1668  Teuchos::reduceAll<int,uint64_t>(*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,incorrectGhostsPerRound,totalIncorrectGhostsPerRound);
1669  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,totalPerRound, finalTotalPerRound);
1670  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,recoloringPerRound,maxRecoloringPerRound);
1671  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,numStatisticRecordingRounds,recoloringPerRound,minRecoloringPerRound);
1672  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,commPerRound,finalCommPerRound);
1673  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,numStatisticRecordingRounds,compPerRound,finalCompPerRound);
1674  Teuchos::reduceAll<int,double>(*comm,
1675  Teuchos::REDUCE_MAX,numStatisticRecordingRounds,conflictDetectionPerRound,finalConflictDetectionPerRound);
1676  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
1677  Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
1678  printf("Rank %d: boundary size: %ld\n",comm->getRank(),localBoundaryVertices);
1679  if(comm->getRank() == 0) printf("Total boundary size: %ld\n",totalBoundarySize);
1680  for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
1681  printf("Rank %d: recolor %ld vertices in round %d\n",comm->getRank(), vertsPerRound[i],i);
1682  printf("Rank %d: sentbuf had %lld entries in round %d\n", comm->getRank(), sentPerRound[i],i);
1683  if(comm->getRank()==0){
1684  printf("recolored %ld vertices in round %d\n",totalVertsPerRound[i], i);
1685  printf("%ld inconsistent ghosts in round %d\n",totalIncorrectGhostsPerRound[i],i);
1686  printf("total time in round %d: %f\n",i,finalTotalPerRound[i]);
1687  printf("recoloring time in round %d: %f\n",i,maxRecoloringPerRound[i]);
1688  printf("min recoloring time in round %d: %f\n",i,minRecoloringPerRound[i]);
1689  printf("conflict detection time in round %d: %f\n",i,finalConflictDetectionPerRound[i]);
1690  printf("comm time in round %d: %f\n",i,finalCommPerRound[i]);
1691  printf("recvbuf size in round %d: %lld\n",i,finalRecvPerRound[i]);
1692  printf("sendbuf size in round %d: %lld\n",i,finalSentPerRound[i]);
1693  printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
1694  }
1695  }
1696  } else if (timing){
1697  double global_total_time = 0.0;
1698  double global_recoloring_time = 0.0;
1699  double global_min_recoloring_time = 0.0;
1700  double global_conflict_detection=0.0;
1701  double global_comm_time=0.0;
1702  double global_comp_time=0.0;
1703  double global_interior_time=0.0;
1704  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&total_time,&global_total_time);
1705  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&recoloring_time,&global_recoloring_time);
1706  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MIN,1,&recoloring_time,&global_min_recoloring_time);
1707  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&conflict_detection,&global_conflict_detection);
1708  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comm_time,&global_comm_time);
1709  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&comp_time,&global_comp_time);
1710  Teuchos::reduceAll<int,double>(*comm, Teuchos::REDUCE_MAX,1,&interior_time,&global_interior_time);
1711  comm->barrier();
1712  fflush(stdout);
1713  if(comm->getRank()==0){
1714  printf("Total Time: %f\n",global_total_time);
1715  printf("Interior Time: %f\n",global_interior_time);
1716  printf("Recoloring Time: %f\n",global_recoloring_time);
1717  printf("Min Recoloring Time: %f\n",global_min_recoloring_time);
1718  printf("Conflict Detection Time: %f\n",global_conflict_detection);
1719  printf("Comm Time: %f\n",global_comm_time);
1720  printf("Comp Time: %f\n",global_comp_time);
1721  }
1722  }
1723  }
1724 }; //end class
1725 
// Build the GraphModel this coloring algorithm runs on.
//
// \param flags  [in/out] model construction flags; this method ORs in
//               REMOVE_SELF_EDGES before handing the flags to the model.
//
// Side effects: populates this->model with a new GraphModel built from
// this->adapter, this->env, and this->comm.
template <typename Adapter>
void AlgTwoGhostLayer<Adapter>::buildModel(modelFlag_t &flags){
  // A vertex never conflicts with itself, so self-edges carry no
  // information for coloring; ask the model to strip them.
  flags.set(REMOVE_SELF_EDGES);

  this->env->debug(DETAILED_STATUS, " building graph model");
  // The model provides the distributed CSR view (owned + ghost adjacency)
  // that the coloring loops traverse.
  this->model = rcp(new GraphModel<base_adapter_t>(this->adapter, this->env,
                                                   this->comm, flags));
  this->env->debug(DETAILED_STATUS, " graph model built");
}
1735 
1736 }//end namespace Zoltan2
1737 
1738 #endif
AlltoAll communication methods.
Defines the ColoringSolution class.
Defines the GraphModel interface.
Traits class to handle conversions between gno_t/lno_t and TPL data types (e.g., ParMETIS's idx_t,...
A gathering of useful namespace methods.
Tpetra::Map<>::memory_space memory_space
typename Kokkos::View< device_type >::HostMirror::execution_space host_exec
Tpetra::Map<>::device_type device_type
Tpetra::Map<>::execution_space execution_space
typename Adapter::base_adapter_t base_adapter_t
typename Adapter::gno_t gno_t
typename Adapter::scalar_t scalar_t
void twoGhostLayer(const size_t n_local, const size_t n_total, const Teuchos::ArrayView< const lno_t > &adjs, const Teuchos::ArrayView< const offset_t > &offsets, const Teuchos::ArrayView< const lno_t > &ghost_adjs, const Teuchos::ArrayView< const offset_t > &ghost_offsets, const Teuchos::RCP< femv_t > &femv, const Teuchos::ArrayView< const gno_t > &gids, const Teuchos::ArrayView< const int > &rand, const Teuchos::ArrayView< const int > &ghost_owners, RCP< const map_t > mapOwnedPlusGhosts, const std::unordered_map< lno_t, std::vector< int >> &procs_to_send)
typename Adapter::offset_t offset_t
Tpetra::FEMultiVector< femv_scalar_t, lno_t, gno_t > femv_t
void color(const RCP< ColoringSolution< Adapter > > &solution)
Coloring method.
typename Kokkos::View< device_type >::HostMirror::memory_space host_mem
virtual void detectConflicts_serial(const size_t n_local, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, typename Kokkos::View< int *, device_type >::HostMirror femv_colors, typename Kokkos::View< lno_t *, device_type >::HostMirror boundary_verts_view, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_recolor_view, typename Kokkos::View< int *, device_type >::HostMirror verts_to_recolor_size_atomic, typename Kokkos::View< lno_t *, device_type >::HostMirror verts_to_send_view, typename Kokkos::View< size_t *, device_type >::HostMirror verts_to_send_size_atomic, typename Kokkos::View< size_t *, device_type >::HostMirror recoloringSize, typename Kokkos::View< int *, device_type >::HostMirror rand, typename Kokkos::View< gno_t *, device_type >::HostMirror gid, typename Kokkos::View< gno_t *, device_type >::HostMirror ghost_degrees, bool recolor_degrees)=0
virtual void detectConflicts(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, Kokkos::View< int *, device_type > femv_colors, Kokkos::View< lno_t *, device_type > boundary_verts_view, Kokkos::View< lno_t *, device_type > verts_to_recolor_view, Kokkos::View< int *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_recolor_size_atomic, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic, Kokkos::View< size_t *, device_type > recoloringSize, Kokkos::View< int *, device_type > rand, Kokkos::View< gno_t *, device_type > gid, Kokkos::View< gno_t *, device_type > ghost_degrees, bool recolor_degrees)=0
RCP< Teuchos::ParameterList > pl
RCP< GraphModel< base_adapter_t > > model
virtual void constructBoundary(const size_t n_local, Kokkos::View< offset_t *, device_type > dist_offsets_dev, Kokkos::View< lno_t *, device_type > dist_adjs_dev, typename Kokkos::View< offset_t *, device_type >::HostMirror dist_offsets_host, typename Kokkos::View< lno_t *, device_type >::HostMirror dist_adjs_host, Kokkos::View< lno_t *, device_type > &boundary_verts, Kokkos::View< lno_t *, device_type > verts_to_send_view, Kokkos::View< size_t *, device_type, Kokkos::MemoryTraits< Kokkos::Atomic >> verts_to_send_size_atomic)=0
RCP< const base_adapter_t > adapter
AlgTwoGhostLayer(const RCP< const base_adapter_t > &adapter_, const RCP< Teuchos::ParameterList > &pl_, const RCP< Environment > &env_, const RCP< const Teuchos::Comm< int > > &comm_)
typename Adapter::lno_t lno_t
Tpetra::Map< lno_t, gno_t > map_t
RCP< const Teuchos::Comm< int > > comm
Algorithm defines the base class for all algorithms.
The class containing coloring solution.
map_t::local_ordinal_type lno_t
Definition: mapRemotes.cpp:17
map_t::global_ordinal_type gno_t
Definition: mapRemotes.cpp:18
Zoltan2::BaseAdapter< userTypes_t > base_adapter_t
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ DETAILED_STATUS
sub-steps, each method's entry and exit
@ REMOVE_SELF_EDGES
algorithm requires no self edges