MueLu_TentativePFactory_kokkos_def.hpp
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 
51 #include "Kokkos_UnorderedMap.hpp"
52 
53 #include "MueLu_TentativePFactory_kokkos_decl.hpp"
54 
55 #include "MueLu_Aggregates_kokkos.hpp"
56 #include "MueLu_AmalgamationFactory.hpp"
57 #include "MueLu_AmalgamationInfo.hpp"
58 #include "MueLu_CoarseMapFactory_kokkos.hpp"
59 #include "MueLu_NullspaceFactory_kokkos.hpp"
60 #include "MueLu_PerfUtils.hpp"
61 #include "MueLu_Monitor.hpp"
62 #include "MueLu_Utilities_kokkos.hpp"
63 
64 namespace MueLu {
65 
66  namespace { // anonymous
67 
68  template<class LocalOrdinal, class RowType>
69  class ScanFunctor {
70  public:
71  ScanFunctor(RowType rows) : rows_(rows) { }
72 
73  KOKKOS_INLINE_FUNCTION
74  void operator()(const LocalOrdinal i, LocalOrdinal& upd, const bool& final) const {
75  upd += rows_(i);
76  if (final)
77  rows_(i) = upd;
78  }
79 
80  private:
81  RowType rows_;
82  };
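 // Note (added description, not part of the original source): ScanFunctor is used with
 // Kokkos::parallel_scan to turn per-entry counts into CRS-style offsets. For example, if the
 // view initially holds the counts {0, 2, 1, 3} (entry 0 is a zero placeholder, counts are stored
 // shifted by one), the scan overwrites it with the running sums {0, 2, 3, 6}, which are exactly
 // the offsets used below for "sizes" and "rows".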
83 
84  // only limited support for lambdas with CUDA 7.5
85  // see https://github.com/kokkos/kokkos/issues/763
86  // This functor probably can go away when using CUDA 8.0
87  template<class LocalOrdinal, class rowType, class NSDimType>
88  class FillRowAuxArrayFunctor {
89  public:
90  FillRowAuxArrayFunctor(rowType rows, NSDimType nsDim) : rows_(rows), nsDim_(nsDim) { }
91 
92  KOKKOS_INLINE_FUNCTION
93  void operator()(const LocalOrdinal row) const {
94  rows_(row) = row*nsDim_;
95  }
96 
97  private:
98  rowType rows_;
99  NSDimType nsDim_;
100  };
101 
102  // only limited support for lambdas with CUDA 7.5
103  // see https://github.com/kokkos/kokkos/issues/763
104  // This functor probably can go away when using CUDA 8.0
105  template<class SC, class LO, class colType, class valType>
106  class FillColAuxArrayFunctor {
107  public:
108  FillColAuxArrayFunctor(SC zero, LO invalid, colType cols, valType vals) : zero_(zero), invalid_(invalid), cols_(cols), vals_(vals) { }
109 
110  KOKKOS_INLINE_FUNCTION
111  void operator()(const LO col) const {
112  cols_(col) = invalid_;
113  vals_(col) = zero_;
114  }
115 
116  private:
117  SC zero_;
118  LO invalid_;
119  colType cols_;
120  valType vals_;
121  };
122 
123  // collect aggregate sizes (number of dofs associated with all nodes in aggregate)
124  // aggSizesType needs to be atomic
125  template<class aggSizesType, class vertex2AggIdType, class procWinnerType, class nodeGlobalEltsType, class isNodeGlobalEltsType, class LOType, class GOType>
126  class CollectAggregateSizeFunctor {
127  private:
128  typedef LOType LO;
129  typedef GOType GO;
130 
131  aggSizesType aggSizes; //< view containing size of aggregate
132  vertex2AggIdType vertex2AggId; //< view containing vertex2AggId information
133  procWinnerType procWinner; //< view containing processor ids
134  nodeGlobalEltsType nodeGlobalElts; //< view containing global node ids of current proc
135  isNodeGlobalEltsType isNodeGlobalElement; //< unordered map storing whether (global) node id is owned by current proc
136 
137  int myPid;
138  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
139  GO indexBase, globalOffset;
140  public:
141  CollectAggregateSizeFunctor(aggSizesType aggSizes_, vertex2AggIdType vertex2AggId_, procWinnerType procWinner_, nodeGlobalEltsType nodeGlobalElts_, isNodeGlobalEltsType isNodeGlobalElement_, LO fullBlockSize_, LOType blockID_, LOType stridingOffset_, LOType stridedBlockSize_, GOType indexBase_, GOType globalOffset_, int myPID_ ) :
142  aggSizes(aggSizes_),
143  vertex2AggId(vertex2AggId_),
144  procWinner(procWinner_),
145  nodeGlobalElts(nodeGlobalElts_),
146  isNodeGlobalElement(isNodeGlobalElement_),
147  fullBlockSize(fullBlockSize_),
148  blockID(blockID_),
149  stridingOffset(stridingOffset_),
150  stridedBlockSize(stridedBlockSize_),
151  indexBase(indexBase_),
152  globalOffset(globalOffset_),
153  myPid(myPID_) {
154  }
155 
156  KOKKOS_INLINE_FUNCTION
157  void operator()(const LO lnode) const {
158  if(stridedBlockSize == 1) {
159  if(procWinner(lnode,0) == myPid) {
160  auto myAgg = vertex2AggId(lnode,0);
161  aggSizes(myAgg+1)++; // atomic access
162  }
163  } else {
164  if(procWinner(lnode,0) == myPid) {
165  auto myAgg = vertex2AggId(lnode,0);
166  GO gnodeid = nodeGlobalElts(lnode);
167  for (LO k = 0; k< stridedBlockSize; k++) {
168  GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
169  if(isNodeGlobalElement.value_at(isNodeGlobalElement.find(gDofIndex)) == true) {
170  aggSizes(myAgg+1)++; // atomic access
171  }
172  }
173  }
174  }
175  }
176  };
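 // Note (added description, not part of the original source): the gDofIndex formula above maps a
 // node GID to its DOF GIDs under striding. For example, with globalOffset = 0, indexBase = 0,
 // fullBlockSize = stridedBlockSize = 3 and stridingOffset = 0, node GID 5 owns the DOF GIDs
 // 15, 16 and 17 (k = 0, 1, 2).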
177 
178  // CreateAgg2RowMapLOFunctor<decltype(agg2RowMapLO), decltype(sizes), decltype(vertex2AggId), decltype(procWinner), decltype(nodeGlobalElts), decltype(isNodeGlobalElement), LO, GO> createAgg2RowMap (agg2RowMapLO, sizes, vertex2AggId, procWinner, nodeGlobalElts, isNodeGlobalElement, fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase, globalOffset, myPid );
179 
180  template<class agg2RowMapType, class aggSizesType, class vertex2AggIdType, class procWinnerType, class nodeGlobalEltsType, class isNodeGlobalEltsType, class LOType, class GOType>
181  class CreateAgg2RowMapLOFunctor {
182  private:
183  typedef LOType LO;
184  typedef GOType GO;
185 
186  agg2RowMapType agg2RowMap; //< view containing row map entries associated with aggregate
187  aggSizesType aggSizes; //< view containing size of aggregate
188  aggSizesType aggDofCount; //< view containing current count of "found" vertices in aggregate
189  vertex2AggIdType vertex2AggId; //< view containing vertex2AggId information
190  procWinnerType procWinner; //< view containing processor ids
191  nodeGlobalEltsType nodeGlobalElts; //< view containing global node ids of current proc
192  isNodeGlobalEltsType isNodeGlobalElement; //< unordered map storing whether (global) node id is owned by current proc
193 
194  int myPid;
195  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
196  GO indexBase, globalOffset;
197  public:
198  CreateAgg2RowMapLOFunctor(agg2RowMapType agg2RowMap_, aggSizesType aggSizes_, aggSizesType aggDofCount_, vertex2AggIdType vertex2AggId_, procWinnerType procWinner_, nodeGlobalEltsType nodeGlobalElts_, isNodeGlobalEltsType isNodeGlobalElement_, LO fullBlockSize_, LOType blockID_, LOType stridingOffset_, LOType stridedBlockSize_, GOType indexBase_, GOType globalOffset_, int myPID_ ) :
199  agg2RowMap(agg2RowMap_),
200  aggSizes(aggSizes_),
201  aggDofCount(aggDofCount_),
202  vertex2AggId(vertex2AggId_),
203  procWinner(procWinner_),
204  nodeGlobalElts(nodeGlobalElts_),
205  isNodeGlobalElement(isNodeGlobalElement_),
206  fullBlockSize(fullBlockSize_),
207  blockID(blockID_),
208  stridingOffset(stridingOffset_),
209  stridedBlockSize(stridedBlockSize_),
210  indexBase(indexBase_),
211  globalOffset(globalOffset_),
212  myPid(myPID_) {
213  }
214 
215  KOKKOS_INLINE_FUNCTION
216  void operator()(const LO lnode) const {
217  if(stridedBlockSize == 1) {
218  if(procWinner(lnode,0) == myPid) {
219  auto myAgg = vertex2AggId(lnode,0);
220  LO myAggDofStart = aggSizes(myAgg);
221  auto idx = Kokkos::atomic_fetch_add( &aggDofCount(myAgg), 1 );
222  agg2RowMap(myAggDofStart + idx) = lnode;
223  }
224  } else {
225  if(procWinner(lnode,0) == myPid) {
226  auto myAgg = vertex2AggId(lnode,0);
227  LO myAggDofStart = aggSizes(myAgg);
228  auto idx = Kokkos::atomic_fetch_add( &aggDofCount(myAgg), stridedBlockSize );
229  for (LO k = 0; k< stridedBlockSize; k++) {
230  agg2RowMap(myAggDofStart + idx + k) = lnode * stridedBlockSize +k;
231  }
232  }
233  }
234  }
235  };
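 // Note (added description, not part of the original source): Kokkos::atomic_fetch_add returns the
 // previous counter value, so each node (or block of stridedBlockSize DOFs) claims a unique,
 // contiguous slot inside its aggregate's [aggSizes(myAgg), aggSizes(myAgg+1)) range of agg2RowMap.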
236 
237  // local QR decomposition (scalar case, no real QR decomposition necessary)
238 // Uses exactly the same template types as the general LocalQRDecompFunctor below; the only one
239 // that is not strictly necessary here is maxAggDofSizeType, used for reserving temporary scratch space.
240  template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
241  class LocalScalarQRDecompFunctor {
242  private:
243  typedef LOType LO;
244  typedef GOType GO;
245  typedef SCType SC;
246 
247  typedef typename DeviceType::execution_space execution_space;
248 
249  private:
250 
251  NspType fineNSRandom;
252  NspType coarseNS;
253  aggRowsType aggRows;
254  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
255  agg2RowMapLOType agg2RowMapLO;
256  statusType statusAtomic;
257  rowsType rows;
258  rowsAuxType rowsAux;
259  colsAuxType colsAux;
260  valsAuxType valsAux;
261  public:
262  LocalScalarQRDecompFunctor(NspType fineNSRandom_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_) :
263  fineNSRandom(fineNSRandom_),
264  coarseNS(coarseNS_),
265  aggRows(aggRows_),
266  maxAggDofSize(maxAggDofSize_),
267  agg2RowMapLO(agg2RowMapLO_),
268  statusAtomic(statusAtomic_),
269  rows(rows_),
270  rowsAux(rowsAux_),
271  colsAux(colsAux_),
272  valsAux(valsAux_)
273  { }
274 
275  KOKKOS_INLINE_FUNCTION
276  void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& rowNnz) const {
277  auto agg = thread.league_rank();
278 
279  // size of aggregate: number of DOFs in aggregate
280  LO aggSize = aggRows(agg+1) - aggRows(agg);
281 
282  // Extract the piece of the nullspace corresponding to the aggregate, and
283  // put it in the flat array, "localQR" (in column major format) for the
284  // QR routine. Trivial in 1D.
285 
286  // Calculate QR by hand
287  typedef Kokkos::ArithTraits<SC> ATS;
288  typedef typename ATS::magnitudeType Magnitude;
289 
290  Magnitude norm = Kokkos::ArithTraits<Magnitude>::zero();
291  for (decltype(aggSize) k = 0; k < aggSize; k++) {
292  Magnitude dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
293  norm += dnorm*dnorm;
294  }
295  norm = sqrt(norm);
296 
297  if (norm == ATS::zero()) {
298  // zero column; terminate the execution
299  statusAtomic(1) = true;
300  return;
301  }
302 
303  // R = norm
304  coarseNS(agg, 0) = norm;
305 
306  // Q = localQR(:,0)/norm
307  for (LO k = 0; k < aggSize; k++) {
308  LO localRow = agg2RowMapLO(aggRows(agg)+k);
309  SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
310 
311  size_t rowStart = rowsAux(localRow);
312  colsAux(rowStart) = agg;
313  valsAux(rowStart) = localVal;
314 
315  // Store true number of nonzeros per row
316  rows(localRow+1) = 1;
317  rowNnz += 1;
318  }
319  }
320  };
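 // Note (added description, not part of the original source): for a single nullspace vector the
 // local "QR decomposition" of an aggregate degenerates to a normalization, q = v / ||v||_2 and
 // r = ||v||_2. E.g. for the local nullspace piece v = (3, 4)^T one gets r = 5 and
 // q = (0.6, 0.8)^T, which is what the functor above stores in coarseNS and valsAux.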
321 
322  // local QR decomposition
323  template<class LOType, class GOType, class SCType,class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
324  class LocalQRDecompFunctor {
325  private:
326  typedef LOType LO;
327  typedef GOType GO;
328  typedef SCType SC;
329 
330  typedef typename DeviceType::execution_space execution_space;
331 
332  typedef Kokkos::View<SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
333  typedef Kokkos::View<SC*,  typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
334 
335  private:
336 
337  NspType fineNS;
338  NspType coarseNS;
339  aggRowsType aggRows;
340  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
341  agg2RowMapLOType agg2RowMapLO;
342  statusType statusAtomic;
343  rowsType rows;
344  rowsAuxType rowsAux;
345  colsAuxType colsAux;
346  valsAuxType valsAux;
347  public:
348  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_) :
349  fineNS(fineNS_),
350  coarseNS(coarseNS_),
351  aggRows(aggRows_),
352  maxAggDofSize(maxAggDofSize_),
353  agg2RowMapLO(agg2RowMapLO_),
354  statusAtomic(statusAtomic_),
355  rows(rows_),
356  rowsAux(rowsAux_),
357  colsAux(colsAux_),
358  valsAux(valsAux_)
359  { }
360 
361  KOKKOS_INLINE_FUNCTION
362  void operator() ( const typename Kokkos::TeamPolicy<execution_space>::member_type & thread, size_t& nnz) const {
363  auto agg = thread.league_rank();
364 
365  // size of aggregate: number of DOFs in aggregate
366  auto aggSize = aggRows(agg+1) - aggRows(agg);
367 
368  typedef Kokkos::ArithTraits<SC> ATS;
369  SC one = ATS::one();
370  SC zero = ATS::zero();
371 
372  // Extract the piece of the nullspace corresponding to the aggregate, and
373  // put it in the flat array, "localQR" (in column major format) for the
374  // QR routine.
375  shared_matrix localQR(thread.team_shmem(),aggSize,fineNS.dimension_1());
376  for (size_t j = 0; j < fineNS.dimension_1(); j++)
377  for (LO k = 0; k < aggSize; k++)
378  localQR(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
379 
380  // Test for zero columns
381  for (size_t j = 0; j < fineNS.dimension_1(); j++) {
382  bool bIsZeroNSColumn = true;
383 
384  for (LO k = 0; k < aggSize; k++)
385  if (localQR(k,j) != zero)
386  bIsZeroNSColumn = false;
387 
388  if (bIsZeroNSColumn) {
389  statusAtomic(1) = true;
390  return;
391  }
392  }
393  // calculate row offset for coarse nullspace
394  Xpetra::global_size_t offset = agg * fineNS.dimension_1();
395 
396  // Reserve shared memory for local QR decomposition and results (r and q)
397  shared_matrix q (thread.team_shmem(),aggSize,aggSize); // memory containing q part in the end
398 
399  // Calculate QR decomposition (standard)
400  if (aggSize >= decltype(aggSize)( fineNS.dimension_1() )) {
401 
402  // Reserve temporary shared memory for local QR decomposition
403  shared_matrix r(thread.team_shmem(),aggSize,fineNS.dimension_1()); // memory containing the r part in the end
404  shared_matrix z(thread.team_shmem(),aggSize,fineNS.dimension_1()); // helper matrix (containing parts of localQR)
405  shared_vector e(thread.team_shmem(),aggSize); //unit vector
406  shared_matrix qk(thread.team_shmem(),aggSize,aggSize); // memory containing one Householder reflection part
407  shared_matrix qt (thread.team_shmem(),aggSize,aggSize); // temporary
408  shared_vector x(thread.team_shmem(),aggSize);
409 
410  // standard case
411  // do local QR decomposition
412  matrix_copy(localQR,z);
413 
414  typedef typename ATS::magnitudeType Magnitude;
415  Magnitude zeroMagnitude = Kokkos::ArithTraits<Magnitude>::zero();
416 
417  for(decltype(localQR.dimension_0()) k = 0; k < localQR.dimension_0() && k < localQR.dimension_1()/*-1*/; k++) {
418  // extract minor parts from mat
419  matrix_clear(r); // zero out temporary matrix (there is some potential to speed this up by avoiding this)
420  matrix_minor(z,r,k);
421 
422  // extract k-th column from current minor
423  //auto x = subview(r, Kokkos::ALL (), k);
424  for(decltype(x.dimension_0()) i = 0; i < x.dimension_0(); i++)
425  x(i) = r(i,k);
426  SC xn = vnorm(x); // calculate 2-norm of current column vector
427  if(ATS::magnitude(localQR(k,k)) > zeroMagnitude) xn = -xn;
428 
429  // build k-th unit vector
430  for(decltype(e.dimension_0()) i = 0; i < e.dimension_0(); i++)
431  e(i) = (i==k) ? one : zero;
432  vmadd(e, x, xn); // e = x + xn * e;
433  SC en = vnorm(e);
434  vdiv(e,en); // scale vector e
435 
436  // build Q(k) and Q matrix
437  if (k == 0) {
438  vmul ( e, q);
439  matrix_mul(q, r, z);
440  }
441  else {
442  matrix_clear(qk); // zero out old qk
443  vmul ( e, qk);
444 
445  matrix_mul ( qk, q, qt); // TODO can we avoid qt?
446  matrix_copy(qt,q);
447  matrix_mul(qk, r, z);
448  }
449  }
450 
451  // build R part
452  matrix_mul ( q, localQR, r);
453 
454  // upper triangular part of R builds the coarse NS
455  for(decltype(fineNS.dimension_1()) j = 0; j < fineNS.dimension_1(); j++)
456  for(decltype(j) k = 0; k <= j; k++)
457  coarseNS(offset+k,j) = r(k,j);
458  // Don't forget to transpose q
459  matrix_transpose(q);
460  } else {
461  // Special handling for aggSize < NSDim (i.e. single node aggregates in structural mechanics)
462 
463  // The local QR decomposition is not possible in the "overconstrained"
464  // case (i.e. number of columns in localQR > number of rows), which
465  // corresponds to #DOFs in Aggregate < NSDim. For usual problems this
466  // is only possible for single node aggregates in structural mechanics.
467  // (Similar problems may arise in discontinuous Galerkin problems...)
468  // We bypass the QR decomposition and use an identity block in the
469  // tentative prolongator for the single node aggregate and transfer the
470  // corresponding fine level null space information 1-to-1 to the coarse
471  // level null space part.
472 
473  // NOTE: The resulting tentative prolongation operator has
474  // (aggSize*DofsPerNode-NSDim) zero columns leading to a singular
475  // coarse level operator A. To deal with that one has the following
476  // options:
477  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
478  // false) to add some identity block to the diagonal of the zero rows
479  // in the coarse level operator A, such that standard level smoothers
480  // can be used again.
481  // - Use special (projection-based) level smoothers, which can deal
482  // with singular matrices (very application specific)
483  // - Adapt the code below to avoid zero columns. However, we do not
484  // support a variable number of DOFs per node in MueLu/Xpetra which
485  // makes the implementation really hard.
486 
487  // R = extended (by adding identity rows) localQR
488  for (decltype(fineNS.dimension_1()) j = 0; j < fineNS.dimension_1(); j++)
489  for (decltype(fineNS.dimension_1()) k = 0; k < fineNS.dimension_1(); k++)
490  if (k < decltype(k)(aggSize))
491  coarseNS(offset+k,j) = localQR(k,j);
492  else
493  coarseNS(offset+k,j) = (k == j ? one : zero);
494 
495  // Q = I (rectangular)
496  for (decltype(fineNS.dimension_1()) i = 0; i < decltype(fineNS.dimension_1()) (aggSize); i++)
497  for (decltype(fineNS.dimension_1()) j = 0; j < fineNS.dimension_1(); j++)
498  q(i,j) = (j == i ? one : zero);
499  }
500 
501  // Process each row in the local Q factor and fill helper arrays to assemble P
502  for (LO j = 0; j < aggSize; j++) {
503  LO localRow = agg2RowMapLO(aggRows(agg)+j);
504  size_t rowStart = rowsAux(localRow);
505  size_t lnnz = 0;
506  for (decltype(fineNS.dimension_1()) k = 0; k < fineNS.dimension_1(); k++) {
507  // skip zeros
508  if(q(j,k) != zero) {
509  colsAux(rowStart+lnnz) = offset + k;
510  valsAux(rowStart+lnnz) = q(j,k);
511  lnnz++;
512  }
513  }
514  rows(localRow+1) = lnnz;
515  nnz += lnnz;
516  }
517 
518  // debug
519  /*printf("R\n");
520  for(int i=0; i<aggSize; i++) {
521  for(int j=0; j<fineNS.dimension_1(); j++) {
522  printf(" %.3g ",coarseNS(i,j));
523  }
524  printf("\n");
525  }
526  printf("Q\n");
527 
528  for(int i=0; i<aggSize; i++) {
529  for(int j=0; j<aggSize; j++) {
530  printf(" %.3g ",q(i,j));
531  }
532  printf("\n");
533  }*/
534  // end debug
535 
536  }
537 
538  KOKKOS_INLINE_FUNCTION
539  void matrix_clear ( shared_matrix & m) const {
540  typedef Kokkos::ArithTraits<SC> ATS;
541  SC zero = ATS::zero();
542  for(decltype(m.dimension_0()) i = 0; i < m.dimension_0(); i++) {
543  for(decltype(m.dimension_1()) j = 0; j < m.dimension_1(); j++) {
544  m(i,j) = zero;
545  }
546  }
547  }
548 
549  KOKKOS_INLINE_FUNCTION
550  void matrix_copy (const shared_matrix & mi, shared_matrix & mo) const {
551  for(decltype(mi.dimension_0()) i = 0; i < mi.dimension_0(); i++) {
552  for(decltype(mi.dimension_1()) j = 0; j < mi.dimension_1(); j++) {
553  mo(i,j) = mi(i,j);
554  }
555  }
556  }
557 
558  KOKKOS_FUNCTION
559  void matrix_transpose ( shared_matrix & m) const {
560  for(decltype(m.dimension_0()) i = 0; i < m.dimension_0(); i++) {
561  for(decltype(m.dimension_1()) j = 0; j < i; j++) {
562  SC t = m(i,j);
563  m(i,j) = m(j,i);
564  m(j,i) = t;
565  }
566  }
567  }
568 
569  KOKKOS_FUNCTION
570  void matrix_mul ( const shared_matrix & m1, const shared_matrix & m2, shared_matrix & m1m2) const {
571  typedef Kokkos::ArithTraits<SC> ATS;
572  SC zero = ATS::zero();
573  for(decltype(m1.dimension_0()) i = 0; i < m1.dimension_0(); i++) {
574  for(decltype(m1.dimension_1()) j = 0; j < m1.dimension_1(); j++) {
575  m1m2(i,j) = zero;
576  for(decltype(m1.dimension_1()) k = 0; k < m1.dimension_1(); k++) {
577  m1m2(i,j) += m1(i,k) * m2(k,j);
578  }
579  }
580  }
581  }
582 
583  KOKKOS_FUNCTION
584  void matrix_minor ( const shared_matrix & mat, shared_matrix & matminor, LO d) const {
585  typedef Kokkos::ArithTraits<SC> ATS;
586  SC one = ATS::one();
587 
588  for (decltype(d) i = 0; i < d; i++) {
589  matminor(i,i) = one;
590  }
591  for (decltype(mat.dimension_0()) i = d; i < mat.dimension_0(); i++) {
592  for (decltype(mat.dimension_1()) j=d; j < mat.dimension_1(); j++) {
593  matminor(i,j) = mat(i,j);
594  }
595  }
596  }
597 
601  KOKKOS_FUNCTION
602  void vmul ( const shared_vector & v, shared_matrix & vmuldata) const {
603  typedef Kokkos::ArithTraits<SC> ATS;
604  SC one = ATS::one();
605  for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++) {
606  for(decltype(v.dimension_0()) j = 0; j < v.dimension_0(); j++) {
607  vmuldata(i,j) = -2. * v(i) * v(j);
608  }
609  }
610  for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++) {
611  vmuldata(i,i) += one;
612  }
613  }
614 
618  KOKKOS_INLINE_FUNCTION
619  SC vnorm(const shared_vector & v) const {
620  SC sum = 0;
621  for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++)
622  sum += v(i) * v(i);
623  return sqrt(sum);
624  }
625 
629  KOKKOS_INLINE_FUNCTION
630  void vdiv(shared_vector & v, SC d) const {
631  for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++)
632  v(i) = v(i) / d;
633  }
634 
639  KOKKOS_INLINE_FUNCTION
640  void vmadd(shared_vector & v, const shared_vector & va, SC d) const {
641  for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++)
642  v(i) = va(i) + d * v(i);
643  }
644 
645  // amount of shared memory
646  size_t team_shmem_size( int team_size ) const {
647  return 3 * Kokkos::View<double**,Kokkos::MemoryUnmanaged>::shmem_size(maxAggDofSize,fineNS.dimension_1()) + // mat + matminor + z
648  3 * Kokkos::View<double**,Kokkos::MemoryUnmanaged>::shmem_size(maxAggDofSize,maxAggDofSize) + // qk and q and qt
649  2 * Kokkos::View<double*,Kokkos::MemoryUnmanaged>::shmem_size(maxAggDofSize); // e and x
650  }
651  };
652 
653 
654  // only limited support for lambdas with CUDA 7.5
655  // see https://github.com/kokkos/kokkos/issues/763
656  // This functor probably can go away when using CUDA 8.0
657  template<class LO, class rowType, class colType, class valType>
658  class CompressArraysFunctor {
659  public:
660  CompressArraysFunctor(LO invalid, rowType rowsAux, colType colsAux, valType valsAux, rowType rows, colType cols, valType vals) : invalid_(invalid), rowsAux_(rowsAux), colsAux_(colsAux), valsAux_(valsAux), rows_(rows), cols_(cols), vals_(vals) { }
661 
662  KOKKOS_INLINE_FUNCTION
663  void operator()(const LO i) const {
664  LO rowStart = rows_(i);
665 
666  size_t lnnz = 0;
667  for (decltype(rowsAux_(i)) j = rowsAux_(i); j < rowsAux_(i+1); j++)
668  if (colsAux_(j) != invalid_) {
669  cols_(rowStart+lnnz) = colsAux_(j);
670  vals_(rowStart+lnnz) = valsAux_(j);
671  lnnz++;
672  }
673  }
674 
675  private:
676  LO invalid_;
677  rowType rows_;
678  colType cols_;
679  valType vals_;
680  rowType rowsAux_;
681  colType colsAux_;
682  valType valsAux_;
683  };
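 // Note (added description, not part of the original source): the compression removes the INVALID
 // placeholders left in the auxiliary arrays. For example, with rowsAux = {0, 3, 6},
 // colsAux = {2, INVALID, INVALID, 5, 7, INVALID} and rows = {0, 1, 3} (after the scan), row 0
 // keeps only column 2 and row 1 keeps columns 5 and 7, written contiguously into cols/vals.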
684  }
685 
686  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
687  RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
688  RCP<ParameterList> validParamList = rcp(new ParameterList());
689 
690  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
691  validParamList->set< RCP<const FactoryBase> >("Aggregates", Teuchos::null, "Generating factory of the aggregates");
692  validParamList->set< RCP<const FactoryBase> >("Nullspace", Teuchos::null, "Generating factory of the nullspace");
693  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
694  validParamList->set< RCP<const FactoryBase> >("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
695 
696  return validParamList;
697  }
698 
699  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
700  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& coarseLevel) const {
701  Input(fineLevel, "A");
702  Input(fineLevel, "Aggregates");
703  Input(fineLevel, "Nullspace");
704  Input(fineLevel, "UnAmalgamationInfo");
705  Input(fineLevel, "CoarseMap");
706  }
707 
708  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
709  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel) const {
710  return BuildP(fineLevel, coarseLevel);
711  }
712 
713  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
714  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel) const {
715  FactoryMonitor m(*this, "Build", coarseLevel);
716 
717  auto A = Get< RCP<Matrix> > (fineLevel, "A");
718  auto aggregates = Get< RCP<Aggregates_kokkos> >(fineLevel, "Aggregates");
719  auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel, "UnAmalgamationInfo");
720  auto fineNullspace = Get< RCP<MultiVector> > (fineLevel, "Nullspace");
721  auto coarseMap = Get< RCP<const Map> > (fineLevel, "CoarseMap");
722 
723  RCP<Matrix> Ptentative;
724  RCP<MultiVector> coarseNullspace;
725  if (!aggregates->AggregatesCrossProcessors())
726  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
727  else
728  BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
729 
730  // If available, use striding information of fine level matrix A for range
731  // map and coarseMap as domain map; otherwise use plain range map of
732  // Ptent = plain range map of A for range map and coarseMap as domain map.
733  // NOTE:
734  // The latter is not really safe, since there is no striding information
735  // for the range map. This is not really a problem, since striding
736  // information is always available on the intermediate levels and the
737  // coarsest levels.
738  if (A->IsView("stridedMaps") == true)
739  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
740  else
741  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
742 
743  Set(coarseLevel, "Nullspace", coarseNullspace);
744  Set(coarseLevel, "P", Ptentative);
745 
746  if (IsPrint(Statistics1)) {
747  RCP<ParameterList> params = rcp(new ParameterList());
748  params->set("printLoadBalancingInfo", true);
749  GetOStream(Statistics1) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
750  }
751  }
752 
753  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
754  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
755  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
756  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const {
757  RCP<const Map> rowMap = A->getRowMap();
758  RCP<const Map> colMap = A->getColMap();
759 
760  const size_t numRows = rowMap->getNodeNumElements();
761  const size_t NSDim = fineNullspace->getNumVectors();
762 
763  typedef Teuchos::ScalarTraits<SC> STS;
764  const SC zero = STS::zero();
765  const SC one = STS::one();
766  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
767 
768  auto aggGraph = aggregates->GetGraph();
769  auto aggRows = aggGraph.row_map;
770  auto aggCols = aggGraph.entries;
771  //const GO numAggs = aggregates->GetNumAggregates();
772 
773  // Aggregates map is based on the amalgamated column map
774  // We can skip global-to-local conversion if LIDs in row map are
775  // the same as LIDs in column map
776  bool goodMap = isGoodMap(*rowMap, *colMap);
777 
778  // STEP 1: do unamalgamation
779  // In contrast to the non-Kokkos version, which uses member functions of the AmalgamationInfo
780  // container class to unamalgamate the data, the Kokkos version of TentativePFactory does the
781  // unamalgamation here and only uses the raw data of the AmalgamationInfo container class.
782 
783  // Extract information for unamalgamation
784  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
785  GO indexBase;
786  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
787  GO globalOffset = amalgInfo->GlobalOffset();
788 
789  // Store overlapping local node ids belonging to aggregates on the current processor in a view
790  // This has to be done in serial on the host
791  Teuchos::ArrayView<const GO> nodeGlobalEltsView = aggregates->GetMap()->getNodeElementList();
792  Kokkos::View<GO*, DeviceType> nodeGlobalElts("nodeGlobalElts", nodeGlobalEltsView.size());
793  for(size_t i = 0; i < nodeGlobalElts.size(); i++)
794  nodeGlobalElts(i) = nodeGlobalEltsView[i];
795 
796  // Extract aggregation info (already in Kokkos host views)
797  auto procWinner = aggregates->GetProcWinner()->getHostLocalView();
798  auto vertex2AggId = aggregates->GetVertex2AggId()->getHostLocalView();
799  const GO numAggregates = aggregates->GetNumAggregates();
800 
801  // Create a Kokkos::UnorderedMap to store the mapping globalDofIndex -> bool (isNodeGlobalElement)
802  // We want to use that information within parallel kernels and cannot call Xpetra::Map routines in the
803  // parallel kernels
804  typedef Kokkos::UnorderedMap<GO, bool, DeviceType> map_type;
805  map_type isNodeGlobalElement(colMap->getNodeNumElements());
806 
807  int myPid = aggregates->GetMap()->getComm()->getRank();
808 
809  // create an unordered map GID -> isGlobalElement in colMap of A (available from above)
810  // This has to be done on the host
811  for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
812  auto myAgg = vertex2AggId(lnode,0);
813  if(procWinner(lnode,0) == myPid) {
814  GO gnodeid = nodeGlobalElts[lnode];
815  for (LO k = 0; k< stridedBlockSize; k++) {
816  GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
817  bool bIsInColumnMap = colMap->isNodeGlobalElement(gDofIndex);
818  isNodeGlobalElement.insert(gDofIndex, bIsInColumnMap);
819  }
820  }
821  }
822 
823  // Create a Kokkos::View (on the device) to store the aggregate dof sizes
824  // Later used to get aggregate dof offsets
825  // NOTE: This zeros itself on construction
826  typedef Kokkos::View<LO*, DeviceType> aggSizeType;
827  aggSizeType sizes("agg_dof_sizes", numAggregates + 1);
828  // sizes(0) = 0;
829 
830 #if 1
831 
832  {
833  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
834 
835  // atomic data access for sizes view
836  typename AppendTrait<aggSizeType, Kokkos::Atomic>::type sizesAtomic = sizes;
837 
838  // loop over all nodes
839  CollectAggregateSizeFunctor <decltype(sizesAtomic), decltype(vertex2AggId), decltype(procWinner), decltype(nodeGlobalElts), decltype(isNodeGlobalElement), LO, GO> collectAggregateSizes(sizesAtomic, vertex2AggId, procWinner, nodeGlobalElts, isNodeGlobalElement, fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase, globalOffset, myPid );
840  Kokkos::parallel_for("MueLu:TentativePF:Build:getaggsizes", range_type(0,vertex2AggId.dimension_0()), collectAggregateSizes);
841  }
842 #else
843  // we have to avoid race conditions when parallelizing the loops below
844  if(stridedBlockSize == 1) {
845  // loop over all nodes
846  for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
847  if(procWinner(lnode,0) == myPid) {
848  auto myAgg = vertex2AggId(lnode,0);
849  sizes(myAgg+1)++;
850  }
851  }
852  } else {
853  // loop over all nodes
854  for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
855  if(procWinner(lnode,0) == myPid) {
856  auto myAgg = vertex2AggId(lnode,0);
857  GO gnodeid = nodeGlobalElts[lnode];
858 
859  for (LO k = 0; k< stridedBlockSize; k++) {
860  GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
861  if(isNodeGlobalElement.value_at(isNodeGlobalElement.find(gDofIndex)) == true) {
862  sizes(myAgg+1)++;
863  }
864  }
865  }
866  }
867  }
868 #endif
869 
870  // Find maximum dof size for aggregates
871  // Later used to reserve enough scratch space for local QR decompositions
872  // TODO can be done with a parallel reduce
873  LO maxAggSize = 0;
874  for(decltype(sizes.dimension_0()) i = 0; i < sizes.dimension_0(); i++) {
875  if(sizes(i) > maxAggSize) maxAggSize = sizes(i);
876  }
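 // A possible sketch (added, not part of the original source) for the TODO above: the serial
 // max loop could be replaced by a parallel reduction with a Max reducer, e.g.
 //   Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size",
 //       range_type(0, sizes.dimension_0()),
 //       KOKKOS_LAMBDA(const LO i, LO& lmax) { if (sizes(i) > lmax) lmax = sizes(i); },
 //       Kokkos::Max<LO>(maxAggSize));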
877 
878  // parallel_scan (exclusive)
879  // The Kokkos::View sizes then contains the aggregate dof offsets
880  ScanFunctor<LO,decltype(sizes)> scanFunctorAggSizes(sizes);
881  Kokkos::parallel_scan("MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1), scanFunctorAggSizes);
882  // Create Kokkos::View on the device to store mapping between (local) aggregate id and row map ids (LIDs)
883  Kokkos::View<LO*, DeviceType> agg2RowMapLO("agg2row_map_LO", numRows); // initialized to 0
884 
885 #if 1
886 
887  {
888  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
889  // NOTE: This zeros itself on construction
890  aggSizeType aggDofCount("aggDofCount", numAggregates);
891 
892 
893  // atomic data access for agg2RowMapLO view
894  //typename AppendTrait<decltype(agg2RowMapLO), Kokkos::Atomic>::type agg2RowMapLOAtomic = agg2RowMapLO;
895 
896  CreateAgg2RowMapLOFunctor<decltype(agg2RowMapLO), decltype(sizes), decltype(vertex2AggId), decltype(procWinner), decltype(nodeGlobalElts), decltype(isNodeGlobalElement), LO, GO> createAgg2RowMap (agg2RowMapLO, sizes, aggDofCount,vertex2AggId, procWinner, nodeGlobalElts, isNodeGlobalElement, fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase, globalOffset, myPid );
897  Kokkos::parallel_for("MueLu:TentativePF:Build:createAgg2RowMap", range_type(0,vertex2AggId.dimension_0()), createAgg2RowMap);
898  }
899 #else
900  Kokkos::View<LO*, DeviceType> numDofs("countDofsPerAggregate", numAggregates); // helper view. We probably don't need that
901  if(stridedBlockSize == 1) {
902  // loop over all nodes
903  for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
904  if(procWinner(lnode,0) == myPid) {
905  auto myAgg = vertex2AggId(lnode,0);
906  agg2RowMapLO(sizes(myAgg) + numDofs(myAgg)) = lnode;
907  numDofs(myAgg)++;
908  }
909  }
910  } else {
911  // loop over all nodes
912  for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
913  if(procWinner(lnode,0) == myPid) {
914  auto myAgg = vertex2AggId(lnode,0);
915  GO gnodeid = nodeGlobalElts[lnode];
916 
917  for (LO k = 0; k< stridedBlockSize; k++) {
918  GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
919  if(isNodeGlobalElement.value_at(isNodeGlobalElement.find(gDofIndex)) == true) {
920  agg2RowMapLO(sizes(myAgg) + numDofs(myAgg)) = lnode * stridedBlockSize + k;
921  numDofs(myAgg)++;
922  }
923  }
924  }
925  }
926  }
927 #endif
928 
929  // STEP 2: prepare local QR decomposition
930  // Reserve memory for tentative prolongation operator
931 
932  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
933 
934  // Pull out the nullspace vectors so that we can have random access (on the device)
935  auto fineNS = fineNullspace ->template getLocalView<DeviceType>();
936  auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
937 
938  size_t nnzEstimate = numRows * NSDim; // maximum number of possible nnz
939  size_t nnz = 0; // actual number of nnz
940 
941  typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
942  typedef typename local_matrix_type::row_map_type rows_type;
943  typedef typename local_matrix_type::index_type cols_type;
944  typedef typename local_matrix_type::values_type vals_type;
945 
946  // Stage 0: initialize auxiliary arrays
947  // The main thing to notice is the initialization of colsAux with INVALID. These
948  // values will later be used to compress the arrays
949  typename rows_type::non_const_type rowsAux("Ptent_aux_rows", numRows+1), rows("Ptent_rows", numRows+1);
950  typename cols_type::non_const_type colsAux("Ptent_aux_cols", nnzEstimate);
951  typename vals_type::non_const_type valsAux("Ptent_aux_vals", nnzEstimate);
952 
953 #if 1
954  // only limited support for lambdas with CUDA 7.5
955  // see https://github.com/kokkos/kokkos/issues/763
956  FillRowAuxArrayFunctor<LO, decltype(rowsAux), decltype(NSDim)> rauxf(rowsAux, NSDim);
957  Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0,numRows+1), rauxf);
958  FillColAuxArrayFunctor<SC, LO, decltype(colsAux), decltype(valsAux)> cauxf(zero, INVALID, colsAux, valsAux);
959  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", range_type(0,nnzEstimate), cauxf);
960 #else
961  Kokkos::parallel_for("MueLu:TentativePF:BuildPuncoupled:for1", range_type(0,numRows+1), KOKKOS_LAMBDA(const LO row) {
962  rowsAux(row) = row*NSDim;
963  });
964  Kokkos::parallel_for("MueLu:TentativePF:BuildUncoupled:for2", nnzEstimate, KOKKOS_LAMBDA(const LO j) {
965  colsAux(j) = INVALID;
966  valsAux(j) = zero;
967  });
968 #endif
969 
970  // Device View for status (error messages...)
971  typedef Kokkos::View<int[10], DeviceType> status_type;
972  status_type status("status");
973 
974  // Stage 1: construct auxiliary arrays.
975  // The constructed arrays may have gaps in them (vals(j) == INVALID)
976  // Run one thread per aggregate.
977  typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
978  typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
979 
980  TEUCHOS_TEST_FOR_EXCEPTION(goodMap == false, Exceptions::RuntimeError, "Only works for non-overlapping aggregates (goodMap == true)");
981 
982  {
983  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
984 
985  if (NSDim == 1) {
986 
987  #if 1
988  // only limited support for lambdas with CUDA 7.5
989  // see https://github.com/kokkos/kokkos/issues/763
990  //
991  // Set up team policy with numAggregates teams and one thread per team.
992  // Each team handles a slice of the data associated with one aggregate
993  // and performs a local QR decomposition (in this case no real QR decomp
994  // is necessary)
995  const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1);
996  Kokkos::parallel_reduce(policy, LocalScalarQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom), decltype(sizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO), decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux), decltype(valsAux)>(fineNSRandom,coarseNS,sizes,maxAggSize,agg2RowMapLO,statusAtomic,rows,rowsAux,colsAux,valsAux),nnz);
997  #else
998  // Andrey's original implementation as a lambda
999  // 1D is special, as it is the easiest. We don't even need to do the QR,
1000  // just normalize an array. Plus, no worries about small aggregates.
1001  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_loop", numAggregates, KOKKOS_LAMBDA(const GO agg, size_t& rowNnz) {
1002  LO aggSize = aggRows(agg+1) - aggRows(agg);
1003 
1004  // Extract the piece of the nullspace corresponding to the aggregate, and
1005  // put it in the flat array, "localQR" (in column major format) for the
1006  // QR routine. Trivial in 1D.
1007  if (goodMap) {
1008  // Calculate QR by hand
1009  typedef Kokkos::ArithTraits<SC> ATS;
1010  typedef typename ATS::magnitudeType Magnitude;
1011 
1012  Magnitude norm = ATS::magnitude(zero);
1013  for (size_t k = 0; k < aggSize; k++) {
1014  Magnitude dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
1015  norm += dnorm*dnorm;
1016  }
1017  norm = sqrt(norm);
1018 
1019  if (norm == zero) {
1020  // zero column; terminate the execution
1021  statusAtomic(1) = true;
1022  return;
1023  }
1024 
1025  // R = norm
1026  coarseNS(agg, 0) = norm;
1027 
1028  // Q = localQR(:,0)/norm
1029  for (LO k = 0; k < aggSize; k++) {
1030  LO localRow = agg2RowMapLO(aggRows(agg)+k);
1031  SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
1032 
1033  size_t rowStart = rowsAux(localRow);
1034  colsAux(rowStart) = agg;
1035  valsAux(rowStart) = localVal;
1036 
1037  // Store true number of nonzeros per row
1038  rows(localRow+1) = 1;
1039  rowNnz += 1;
1040  }
1041 
1042  } else {
1043  // FIXME: implement non-standard map QR
1044  // Look at the original TentativeP for how to do that
1045  statusAtomic(0) = true;
1046  return;
1047  }
1048  }, nnz);
1049  #endif
1050 
1051  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
1052  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
1053  if (statusHost(i)) {
1054  std::ostringstream oss;
1055  oss << "MueLu::TentativePFactory::MakeTentative: ";
1056  switch(i) {
1057  case 0: oss << "!goodMap is not implemented"; break;
1058  case 1: oss << "fine level NS part has a zero column"; break;
1059  }
1060  throw Exceptions::RuntimeError(oss.str());
1061  }
1062 
1063  } else { // NSdim > 1
1064 
1065  // Set up team policy with numAggregates teams and one thread per team.
1066  // Each team handles a slice of the data associated with one aggregate
1067  // and performs a local QR decomposition
1068  //const Kokkos::TeamPolicy<> policy( numAggregates, 1); // numAggregates teams a 1 thread
1069  const Kokkos::TeamPolicy<execution_space> policy(numAggregates,1);
1070  Kokkos::parallel_reduce(policy, LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom), decltype(sizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO), decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux), decltype(valsAux)>(fineNSRandom,coarseNS,sizes,maxAggSize,agg2RowMapLO,statusAtomic,rows,rowsAux,colsAux,valsAux),nnz);
1071 
1072  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
1073  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
1074  if (statusHost(i)) {
1075  std::ostringstream oss;
1076  oss << "MueLu::TentativePFactory::MakeTentative: ";
1077  switch(i) {
1078  case 0: oss << "!goodMap is not implemented"; break;
1079  case 1: oss << "fine level NS part has a zero column"; break;
1080  }
1081  throw Exceptions::RuntimeError(oss.str());
1082  }
1083  }
1084 
1085  } // subtime monitor stage 1
1086 
1087  {
1088  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
1089  // Stage 2: compress the arrays
1090  ScanFunctor<LO,decltype(rows)> scanFunctor(rows);
1091  Kokkos::parallel_scan("MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1), scanFunctor);
1092  }
1093 
1094  // The real cols and vals are constructed using calculated (not estimated) nnz
1095  typename cols_type::non_const_type cols("Ptent_cols", nnz);
1096  typename vals_type::non_const_type vals("Ptent_vals", nnz);
1097 
1098 #if 1
1099  {
1100  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
1101 
1102  CompressArraysFunctor<LO, decltype(rows),decltype(cols),decltype(vals)> comprArr(INVALID, rowsAux, colsAux, valsAux, rows, cols, vals);
1103  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows), comprArr);
1104  }
1105 #else
1106  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols_vals", numRows, KOKKOS_LAMBDA(const LO i) {
1107  LO rowStart = rows(i);
1108 
1109  size_t lnnz = 0;
1110  for (LO j = rowsAux(i); j < rowsAux(i+1); j++)
1111  if (colsAux(j) != INVALID) {
1112  cols(rowStart+lnnz) = colsAux(j);
1113  vals(rowStart+lnnz) = valsAux(j);
1114  lnnz++;
1115  }
1116  });
1117 #endif
1118 
1119  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
1120 
1121  // Stage 3: construct Xpetra::Matrix
1122 #if 1
1123  // create local CrsMatrix
1124  {
1125  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
1126  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
1127  Teuchos::RCP<CrsMatrix> PtentCrs = CrsMatrixFactory::Build(rowMap,coarseMap,lclMatrix);
1128  PtentCrs->resumeFill(); // we need that for rectangular matrices
1129  PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap());
1130  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
1131  }
1132 #else
1133  Ptentative = rcp(new CrsMatrixWrap(rowMap, coarseMap, 0, Xpetra::StaticProfile));
1134  RCP<CrsMatrix> PtentCrs = rcp_dynamic_cast<CrsMatrixWrap>(Ptentative)->getCrsMatrix();
1135 
1136  ArrayRCP<size_t> iaPtent;
1137  ArrayRCP<LO> jaPtent;
1138  ArrayRCP<SC> valPtent;
1139 
1140  PtentCrs->allocateAllValues(nnz, iaPtent, jaPtent, valPtent);
1141  ArrayView<size_t> ia = iaPtent();
1142  ArrayView<LO> ja = jaPtent();
1143  ArrayView<SC> val = valPtent();
1144 
1145  // Copy values
1146  typename rows_type::HostMirror rowsHost = Kokkos::create_mirror_view(rows);
1147  typename cols_type::HostMirror colsHost = Kokkos::create_mirror_view(cols);
1148  typename vals_type::HostMirror valsHost = Kokkos::create_mirror_view(vals);
1149 
1150  // copy data from device to host
1151  // this shouldn't do anything if we are already on the host
1152  Kokkos::deep_copy (rowsHost, rows);
1153  Kokkos::deep_copy (colsHost, cols);
1154  Kokkos::deep_copy (valsHost, vals);
1155 
1156  for (LO i = 0; i < rowsHost.size(); i++)
1157  ia[i] = rowsHost(i);
1158  for (LO j = 0; j < colsHost.size(); j++) {
1159  ja [j] = colsHost(j);
1160  val[j] = valsHost(j);
1161  }
1162  PtentCrs->setAllValues(iaPtent, jaPtent, valPtent);
1163  PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap());
1164 #endif
1165  }
1166 
1167  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1168  void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
1169  BuildPcoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
1170  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace) const {
1171  throw Exceptions::RuntimeError("Construction of coupled tentative P is not implemented");
1172  }
1173 
1174  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
1175  bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::isGoodMap(const Map& rowMap, const Map& colMap) const {
1176  ArrayView<const GO> rowElements = rowMap.getNodeElementList();
1177  ArrayView<const GO> colElements = colMap.getNodeElementList();
1178 
1179  const size_t numElements = rowElements.size();
1180 
1181  bool goodMap = true;
1182  for (size_t i = 0; i < numElements; i++)
1183  if (rowElements[i] != colElements[i]) {
1184  goodMap = false;
1185  break;
1186  }
1187 
1188  return goodMap;
1189  }
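 // Note (added description, not part of the original source): isGoodMap checks that the owned
 // row GIDs appear first and in the same order in the column map, so a row LID can be used
 // directly as a column LID without a global-to-local lookup.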
1190 
1191 } //namespace MueLu
1192 
1193 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1194 #endif // HAVE_MUELU_KOKKOS_REFACTOR
1195 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP