46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP 49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR 55 #include "MueLu_Aggregates_kokkos.hpp" 56 #include "MueLu_AmalgamationFactory.hpp" 57 #include "MueLu_AmalgamationInfo.hpp" 58 #include "MueLu_CoarseMapFactory_kokkos.hpp" 59 #include "MueLu_NullspaceFactory_kokkos.hpp" 60 #include "MueLu_PerfUtils.hpp" 62 #include "MueLu_Utilities_kokkos.hpp" 68 template<
class LocalOrdinal,
class RowType>
71 ScanFunctor(RowType rows) : rows_(rows) { }
73 KOKKOS_INLINE_FUNCTION
74 void operator()(
const LocalOrdinal i, LocalOrdinal& upd,
const bool&
final)
const {
87 template<
class LocalOrdinal,
class rowType,
class NSDimType>
88 class FillRowAuxArrayFunctor {
90 FillRowAuxArrayFunctor(rowType rows, NSDimType nsDim) : rows_(rows), nsDim_(nsDim) { }
92 KOKKOS_INLINE_FUNCTION
93 void operator()(
const LocalOrdinal row)
const {
94 rows_(row) = row*nsDim_;
105 template<
class SC,
class LO,
class colType,
class valType>
106 class FillColAuxArrayFunctor {
108 FillColAuxArrayFunctor(SC zero, LO invalid, colType cols, valType vals) : zero_(zero), invalid_(invalid), cols_(cols), vals_(vals) { }
110 KOKKOS_INLINE_FUNCTION
111 void operator()(
const LO col)
const {
112 cols_(col) = invalid_;
125 template<
class aggSizesType,
class vertex2AggIdType,
class procWinnerType,
class nodeGlobalEltsType,
class isNodeGlobalEltsType,
class LOType,
class GOType>
126 class CollectAggregateSizeFunctor {
131 aggSizesType aggSizes;
132 vertex2AggIdType vertex2AggId;
133 procWinnerType procWinner;
134 nodeGlobalEltsType nodeGlobalElts;
135 isNodeGlobalEltsType isNodeGlobalElement;
138 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
139 GO indexBase, globalOffset;
141 CollectAggregateSizeFunctor(aggSizesType aggSizes_, vertex2AggIdType vertex2AggId_, procWinnerType procWinner_, nodeGlobalEltsType nodeGlobalElts_, isNodeGlobalEltsType isNodeGlobalElement_, LO fullBlockSize_, LOType blockID_, LOType stridingOffset_, LOType stridedBlockSize_, GOType indexBase_, GOType globalOffset_,
int myPID_ ) :
143 vertex2AggId(vertex2AggId_),
144 procWinner(procWinner_),
145 nodeGlobalElts(nodeGlobalElts_),
146 isNodeGlobalElement(isNodeGlobalElement_),
147 fullBlockSize(fullBlockSize_),
149 stridingOffset(stridingOffset_),
150 stridedBlockSize(stridedBlockSize_),
151 indexBase(indexBase_),
152 globalOffset(globalOffset_),
156 KOKKOS_INLINE_FUNCTION
157 void operator()(
const LO lnode)
const {
158 if(stridedBlockSize == 1) {
159 if(procWinner(lnode,0) == myPid) {
160 auto myAgg = vertex2AggId(lnode,0);
164 if(procWinner(lnode,0) == myPid) {
165 auto myAgg = vertex2AggId(lnode,0);
166 GO gnodeid = nodeGlobalElts(lnode);
167 for (LO k = 0; k< stridedBlockSize; k++) {
168 GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
169 if(isNodeGlobalElement.value_at(isNodeGlobalElement.find(gDofIndex)) ==
true) {
180 template<
class agg2RowMapType,
class aggSizesType,
class vertex2AggIdType,
class procWinnerType,
class nodeGlobalEltsType,
class isNodeGlobalEltsType,
class LOType,
class GOType>
181 class CreateAgg2RowMapLOFunctor {
186 agg2RowMapType agg2RowMap;
187 aggSizesType aggSizes;
188 aggSizesType aggDofCount;
189 vertex2AggIdType vertex2AggId;
190 procWinnerType procWinner;
191 nodeGlobalEltsType nodeGlobalElts;
192 isNodeGlobalEltsType isNodeGlobalElement;
195 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
196 GO indexBase, globalOffset;
198 CreateAgg2RowMapLOFunctor(agg2RowMapType agg2RowMap_, aggSizesType aggSizes_, aggSizesType aggDofCount_, vertex2AggIdType vertex2AggId_, procWinnerType procWinner_, nodeGlobalEltsType nodeGlobalElts_, isNodeGlobalEltsType isNodeGlobalElement_, LO fullBlockSize_, LOType blockID_, LOType stridingOffset_, LOType stridedBlockSize_, GOType indexBase_, GOType globalOffset_,
int myPID_ ) :
199 agg2RowMap(agg2RowMap_),
201 aggDofCount(aggDofCount_),
202 vertex2AggId(vertex2AggId_),
203 procWinner(procWinner_),
204 nodeGlobalElts(nodeGlobalElts_),
205 isNodeGlobalElement(isNodeGlobalElement_),
206 fullBlockSize(fullBlockSize_),
208 stridingOffset(stridingOffset_),
209 stridedBlockSize(stridedBlockSize_),
210 indexBase(indexBase_),
211 globalOffset(globalOffset_),
215 KOKKOS_INLINE_FUNCTION
216 void operator()(
const LO lnode)
const {
217 if(stridedBlockSize == 1) {
218 if(procWinner(lnode,0) == myPid) {
219 auto myAgg = vertex2AggId(lnode,0);
220 LO myAggDofStart = aggSizes(myAgg);
221 auto idx = Kokkos::atomic_fetch_add( &aggDofCount(myAgg), 1 );
222 agg2RowMap(myAggDofStart + idx) = lnode;
225 if(procWinner(lnode,0) == myPid) {
226 auto myAgg = vertex2AggId(lnode,0);
227 LO myAggDofStart = aggSizes(myAgg);
228 auto idx = Kokkos::atomic_fetch_add( &aggDofCount(myAgg), stridedBlockSize );
229 for (LO k = 0; k< stridedBlockSize; k++) {
230 agg2RowMap(myAggDofStart + idx + k) = lnode * stridedBlockSize +k;
240 template<
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
241 class LocalScalarQRDecompFunctor {
247 typedef typename DeviceType::execution_space execution_space;
251 NspType fineNSRandom;
254 maxAggDofSizeType maxAggDofSize;
255 agg2RowMapLOType agg2RowMapLO;
256 statusType statusAtomic;
262 LocalScalarQRDecompFunctor(NspType fineNSRandom_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_) :
263 fineNSRandom(fineNSRandom_),
266 maxAggDofSize(maxAggDofSize_),
267 agg2RowMapLO(agg2RowMapLO_),
268 statusAtomic(statusAtomic_),
275 KOKKOS_INLINE_FUNCTION
277 auto agg = thread.league_rank();
280 LO aggSize = aggRows(agg+1) - aggRows(agg);
287 typedef Kokkos::ArithTraits<SC> ATS;
288 typedef typename ATS::magnitudeType Magnitude;
290 Magnitude norm = Kokkos::ArithTraits<Magnitude>::zero();
291 for (decltype(aggSize) k = 0; k < aggSize; k++) {
292 Magnitude dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
297 if (norm == ATS::zero()) {
299 statusAtomic(1) =
true;
304 coarseNS(agg, 0) = norm;
307 for (LO k = 0; k < aggSize; k++) {
308 LO localRow = agg2RowMapLO(aggRows(agg)+k);
309 SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
311 size_t rowStart = rowsAux(localRow);
312 colsAux(rowStart) = agg;
313 valsAux(rowStart) = localVal;
316 rows(localRow+1) = 1;
323 template<
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
324 class LocalQRDecompFunctor {
330 typedef typename DeviceType::execution_space execution_space;
340 maxAggDofSizeType maxAggDofSize;
341 agg2RowMapLOType agg2RowMapLO;
342 statusType statusAtomic;
348 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_) :
352 maxAggDofSize(maxAggDofSize_),
353 agg2RowMapLO(agg2RowMapLO_),
354 statusAtomic(statusAtomic_),
361 KOKKOS_INLINE_FUNCTION
363 auto agg = thread.league_rank();
366 auto aggSize = aggRows(agg+1) - aggRows(agg);
368 typedef Kokkos::ArithTraits<SC> ATS;
370 SC zero = ATS::zero();
375 shared_matrix localQR(thread.team_shmem(),aggSize,fineNS.dimension_1());
376 for (
size_t j = 0; j < fineNS.dimension_1(); j++)
377 for (LO k = 0; k < aggSize; k++)
378 localQR(k,j) = fineNS(agg2RowMapLO(aggRows(agg)+k),j);
381 for (
size_t j = 0; j < fineNS.dimension_1(); j++) {
382 bool bIsZeroNSColumn =
true;
384 for (LO k = 0; k < aggSize; k++)
385 if (localQR(k,j) != zero)
386 bIsZeroNSColumn =
false;
388 if (bIsZeroNSColumn) {
389 statusAtomic(1) =
true;
394 Xpetra::global_size_t offset = agg * fineNS.dimension_1();
397 shared_matrix q (thread.team_shmem(),aggSize,aggSize);
400 if (aggSize >= decltype(aggSize)( fineNS.dimension_1() )) {
403 shared_matrix r(thread.team_shmem(),aggSize,fineNS.dimension_1());
404 shared_matrix z(thread.team_shmem(),aggSize,fineNS.dimension_1());
405 shared_vector e(thread.team_shmem(),aggSize);
406 shared_matrix qk(thread.team_shmem(),aggSize,aggSize);
407 shared_matrix qt (thread.team_shmem(),aggSize,aggSize);
408 shared_vector x(thread.team_shmem(),aggSize);
412 matrix_copy(localQR,z);
414 typedef typename ATS::magnitudeType Magnitude;
415 Magnitude zeroMagnitude = Kokkos::ArithTraits<Magnitude>::zero();
417 for(decltype(localQR.dimension_0()) k = 0; k < localQR.dimension_0() && k < localQR.dimension_1(); k++) {
424 for(decltype(x.dimension_0()) i = 0; i < x.dimension_0(); i++)
427 if(ATS::magnitude(localQR(k,k)) > zeroMagnitude) xn = -xn;
430 for(decltype(e.dimension_0()) i = 0; i < e.dimension_0(); i++)
431 e(i) = (i==k) ? one : zero;
445 matrix_mul ( qk, q, qt);
447 matrix_mul(qk, r, z);
452 matrix_mul ( q, localQR, r);
455 for(decltype(fineNS.dimension_1()) j = 0; j < fineNS.dimension_1(); j++)
456 for(decltype(j) k = 0; k <= j; k++)
457 coarseNS(offset+k,j) = r(k,j);
488 for (decltype(fineNS.dimension_1()) j = 0; j < fineNS.dimension_1(); j++)
489 for (decltype(fineNS.dimension_1()) k = 0; k < fineNS.dimension_1(); k++)
490 if (k < decltype(k)(aggSize))
491 coarseNS(offset+k,j) = localQR(k,j);
493 coarseNS(offset+k,j) = (k == j ? one : zero);
496 for (decltype(fineNS.dimension_1()) i = 0; i < decltype(fineNS.dimension_1()) (aggSize); i++)
497 for (decltype(fineNS.dimension_1()) j = 0; j < fineNS.dimension_1(); j++)
498 q(i,j) = (j == i ? one : zero);
502 for (LO j = 0; j < aggSize; j++) {
503 LO localRow = agg2RowMapLO(aggRows(agg)+j);
504 size_t rowStart = rowsAux(localRow);
506 for (decltype(fineNS.dimension_1()) k = 0; k < fineNS.dimension_1(); k++) {
509 colsAux(rowStart+lnnz) = offset + k;
510 valsAux(rowStart+lnnz) = q(j,k);
514 rows(localRow+1) = lnnz;
538 KOKKOS_INLINE_FUNCTION
539 void matrix_clear ( shared_matrix & m)
const {
540 typedef Kokkos::ArithTraits<SC> ATS;
541 SC zero = ATS::zero();
542 for(decltype(m.dimension_0()) i = 0; i < m.dimension_0(); i++) {
543 for(decltype(m.dimension_1()) j = 0; j < m.dimension_1(); j++) {
549 KOKKOS_INLINE_FUNCTION
550 void matrix_copy (
const shared_matrix & mi, shared_matrix & mo)
const {
551 for(decltype(mi.dimension_0()) i = 0; i < mi.dimension_0(); i++) {
552 for(decltype(mi.dimension_1()) j = 0; j < mi.dimension_1(); j++) {
559 void matrix_transpose ( shared_matrix & m)
const {
560 for(decltype(m.dimension_0()) i = 0; i < m.dimension_0(); i++) {
561 for(decltype(m.dimension_1()) j = 0; j < i; j++) {
570 void matrix_mul (
const shared_matrix & m1,
const shared_matrix & m2, shared_matrix & m1m2)
const {
571 typedef Kokkos::ArithTraits<SC> ATS;
572 SC zero = ATS::zero();
573 for(decltype(m1.dimension_0()) i = 0; i < m1.dimension_0(); i++) {
574 for(decltype(m1.dimension_1()) j = 0; j < m1.dimension_1(); j++) {
576 for(decltype(m1.dimension_1()) k = 0; k < m1.dimension_1(); k++) {
577 m1m2(i,j) += m1(i,k) * m2(k,j);
584 void matrix_minor (
const shared_matrix & mat, shared_matrix & matminor, LO d)
const {
585 typedef Kokkos::ArithTraits<SC> ATS;
588 for (decltype(d) i = 0; i < d; i++) {
591 for (decltype(mat.dimension_0()) i = d; i < mat.dimension_0(); i++) {
592 for (decltype(mat.dimension_1()) j=d; j < mat.dimension_1(); j++) {
593 matminor(i,j) = mat(i,j);
602 void vmul (
const shared_vector & v, shared_matrix & vmuldata)
const {
603 typedef Kokkos::ArithTraits<SC> ATS;
605 for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++) {
606 for(decltype(v.dimension_0()) j = 0; j < v.dimension_0(); j++) {
607 vmuldata(i,j) = -2. * v(i) * v(j);
610 for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++) {
611 vmuldata(i,i) += one;
618 KOKKOS_INLINE_FUNCTION
619 SC vnorm(
const shared_vector & v)
const {
621 for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++)
629 KOKKOS_INLINE_FUNCTION
630 void vdiv(shared_vector & v, SC d)
const {
631 for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++)
639 KOKKOS_INLINE_FUNCTION
640 void vmadd(shared_vector & v,
const shared_vector & va, SC d)
const {
641 for(decltype(v.dimension_0()) i = 0; i < v.dimension_0(); i++)
642 v(i) = va(i) + d * v(i);
646 size_t team_shmem_size(
int team_size )
const {
657 template<
class LO,
class rowType,
class colType,
class valType>
658 class CompressArraysFunctor {
660 CompressArraysFunctor(LO invalid, rowType rowsAux, colType colsAux, valType valsAux, rowType rows, colType cols, valType vals) : invalid_(invalid), rowsAux_(rowsAux), colsAux_(colsAux), valsAux_(valsAux), rows_(rows), cols_(cols), vals_(vals) { }
662 KOKKOS_INLINE_FUNCTION
663 void operator()(
const LO i)
const {
664 LO rowStart = rows_(i);
667 for (decltype(rowsAux_(i)) j = rowsAux_(i); j < rowsAux_(i+1); j++)
668 if (colsAux_(j) != invalid_) {
669 cols_(rowStart+lnnz) = colsAux_(j);
670 vals_(rowStart+lnnz) = valsAux_(j);
686 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
687 RCP<const ParameterList> TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList()
const {
688 RCP<ParameterList> validParamList = rcp(
new ParameterList());
690 validParamList->set< RCP<const FactoryBase> >(
"A", Teuchos::null,
"Generating factory of the matrix A");
691 validParamList->set< RCP<const FactoryBase> >(
"Aggregates", Teuchos::null,
"Generating factory of the aggregates");
692 validParamList->set< RCP<const FactoryBase> >(
"Nullspace", Teuchos::null,
"Generating factory of the nullspace");
693 validParamList->set< RCP<const FactoryBase> >(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
694 validParamList->set< RCP<const FactoryBase> >(
"CoarseMap", Teuchos::null,
"Generating factory of the coarse map");
696 return validParamList;
699 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
700 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level& fineLevel, Level& coarseLevel)
const {
701 Input(fineLevel,
"A");
702 Input(fineLevel,
"Aggregates");
703 Input(fineLevel,
"Nullspace");
704 Input(fineLevel,
"UnAmalgamationInfo");
705 Input(fineLevel,
"CoarseMap");
708 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
709 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::Build(Level& fineLevel, Level& coarseLevel)
const {
710 return BuildP(fineLevel, coarseLevel);
713 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
714 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::BuildP(Level& fineLevel, Level& coarseLevel)
const {
715 FactoryMonitor m(*
this,
"Build", coarseLevel);
717 auto A = Get< RCP<Matrix> > (fineLevel,
"A");
718 auto aggregates = Get< RCP<Aggregates_kokkos> >(fineLevel,
"Aggregates");
719 auto amalgInfo = Get< RCP<AmalgamationInfo> > (fineLevel,
"UnAmalgamationInfo");
720 auto fineNullspace = Get< RCP<MultiVector> > (fineLevel,
"Nullspace");
721 auto coarseMap = Get< RCP<const Map> > (fineLevel,
"CoarseMap");
723 RCP<Matrix> Ptentative;
724 RCP<MultiVector> coarseNullspace;
725 if (!aggregates->AggregatesCrossProcessors())
726 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
728 BuildPcoupled (A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
738 if (A->IsView(
"stridedMaps") ==
true)
739 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
741 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
743 Set(coarseLevel,
"Nullspace", coarseNullspace);
744 Set(coarseLevel,
"P", Ptentative);
747 RCP<ParameterList> params = rcp(
new ParameterList());
748 params->set(
"printLoadBalancingInfo",
true);
753 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
754 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
755 BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
756 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace)
const {
757 RCP<const Map> rowMap = A->getRowMap();
758 RCP<const Map> colMap = A->getColMap();
760 const size_t numRows = rowMap->getNodeNumElements();
761 const size_t NSDim = fineNullspace->getNumVectors();
763 typedef Teuchos::ScalarTraits<SC> STS;
764 const SC zero = STS::zero();
765 const SC one = STS::one();
766 const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
768 auto aggGraph = aggregates->GetGraph();
769 auto aggRows = aggGraph.row_map;
770 auto aggCols = aggGraph.entries;
776 bool goodMap = isGoodMap(*rowMap, *colMap);
784 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
786 amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
787 GO globalOffset = amalgInfo->GlobalOffset();
791 Teuchos::ArrayView<const GO> nodeGlobalEltsView = aggregates->GetMap()->getNodeElementList();
793 for(
size_t i = 0; i < nodeGlobalElts.size(); i++)
794 nodeGlobalElts(i) = nodeGlobalEltsView[i];
797 auto procWinner = aggregates->GetProcWinner()->getHostLocalView();
798 auto vertex2AggId = aggregates->GetVertex2AggId()->getHostLocalView();
799 const GO numAggregates = aggregates->GetNumAggregates();
805 map_type isNodeGlobalElement(colMap->getNodeNumElements());
807 int myPid = aggregates->GetMap()->getComm()->getRank();
811 for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
812 auto myAgg = vertex2AggId(lnode,0);
813 if(procWinner(lnode,0) == myPid) {
814 GO gnodeid = nodeGlobalElts[lnode];
815 for (LO k = 0; k< stridedBlockSize; k++) {
816 GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
817 bool bIsInColumnMap = colMap->isNodeGlobalElement(gDofIndex);
818 isNodeGlobalElement.insert(gDofIndex, bIsInColumnMap);
827 aggSizeType sizes(
"agg_dof_sizes", numAggregates + 1);
833 SubFactoryMonitor m2(*
this,
"Calc AggSizes", coarseLevel);
836 typename AppendTrait<aggSizeType, Kokkos::Atomic>::type sizesAtomic = sizes;
839 CollectAggregateSizeFunctor <decltype(sizesAtomic), decltype(vertex2AggId), decltype(procWinner), decltype(nodeGlobalElts), decltype(isNodeGlobalElement), LO, GO> collectAggregateSizes(sizesAtomic, vertex2AggId, procWinner, nodeGlobalElts, isNodeGlobalElement, fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase, globalOffset, myPid );
840 Kokkos::parallel_for(
"MueLu:TentativePF:Build:getaggsizes", range_type(0,vertex2AggId.dimension_0()), collectAggregateSizes);
844 if(stridedBlockSize == 1) {
846 for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
847 if(procWinner(lnode,0) == myPid) {
848 auto myAgg = vertex2AggId(lnode,0);
854 for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
855 if(procWinner(lnode,0) == myPid) {
856 auto myAgg = vertex2AggId(lnode,0);
857 GO gnodeid = nodeGlobalElts[lnode];
859 for (LO k = 0; k< stridedBlockSize; k++) {
860 GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
861 if(isNodeGlobalElement.value_at(isNodeGlobalElement.find(gDofIndex)) ==
true) {
874 for(decltype(sizes.dimension_0()) i = 0; i < sizes.dimension_0(); i++) {
875 if(sizes(i) > maxAggSize) maxAggSize = sizes(i);
880 ScanFunctor<LO,decltype(sizes)> scanFunctorAggSizes(sizes);
881 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0,numAggregates+1), scanFunctorAggSizes);
888 SubFactoryMonitor m2(*
this,
"Create Agg2RowMap", coarseLevel);
890 aggSizeType aggDofCount(
"aggDofCount", numAggregates);
896 CreateAgg2RowMapLOFunctor<decltype(agg2RowMapLO), decltype(sizes), decltype(vertex2AggId), decltype(procWinner), decltype(nodeGlobalElts), decltype(isNodeGlobalElement), LO, GO> createAgg2RowMap (agg2RowMapLO, sizes, aggDofCount,vertex2AggId, procWinner, nodeGlobalElts, isNodeGlobalElement, fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase, globalOffset, myPid );
897 Kokkos::parallel_for(
"MueLu:TentativePF:Build:createAgg2RowMap", range_type(0,vertex2AggId.dimension_0()), createAgg2RowMap);
901 if(stridedBlockSize == 1) {
903 for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
904 if(procWinner(lnode,0) == myPid) {
905 auto myAgg = vertex2AggId(lnode,0);
906 agg2RowMapLO(sizes(myAgg) + numDofs(myAgg)) = lnode;
912 for (decltype(vertex2AggId.dimension_0()) lnode = 0; lnode < vertex2AggId.dimension_0(); lnode++) {
913 if(procWinner(lnode,0) == myPid) {
914 auto myAgg = vertex2AggId(lnode,0);
915 GO gnodeid = nodeGlobalElts[lnode];
917 for (LO k = 0; k< stridedBlockSize; k++) {
918 GO gDofIndex = globalOffset + (gnodeid - indexBase) * fullBlockSize + stridingOffset + k + indexBase;
919 if(isNodeGlobalElement.value_at(isNodeGlobalElement.find(gDofIndex)) ==
true) {
920 agg2RowMapLO(sizes(myAgg) + numDofs(myAgg)) = lnode * stridedBlockSize + k;
932 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
935 auto fineNS = fineNullspace ->template getLocalView<DeviceType>();
936 auto coarseNS = coarseNullspace->template getLocalView<DeviceType>();
938 size_t nnzEstimate = numRows * NSDim;
941 typedef typename Xpetra::Matrix<SC,LO,GO,NO>::local_matrix_type local_matrix_type;
942 typedef typename local_matrix_type::row_map_type rows_type;
943 typedef typename local_matrix_type::index_type cols_type;
944 typedef typename local_matrix_type::values_type vals_type;
949 typename rows_type::non_const_type rowsAux(
"Ptent_aux_rows", numRows+1), rows(
"Ptent_rows", numRows+1);
950 typename cols_type::non_const_type colsAux(
"Ptent_aux_cols", nnzEstimate);
951 typename vals_type::non_const_type valsAux(
"Ptent_aux_vals", nnzEstimate);
956 FillRowAuxArrayFunctor<LO, decltype(rowsAux), decltype(NSDim)> rauxf(rowsAux, NSDim);
958 FillColAuxArrayFunctor<SC, LO, decltype(colsAux), decltype(valsAux)> cauxf(zero, INVALID, colsAux, valsAux);
959 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:for2", range_type(0,nnzEstimate), cauxf);
961 Kokkos::parallel_for(
"MueLu:TentativePF:BuildPuncoupled:for1", range_type(0,numRows+1), KOKKOS_LAMBDA(
const LO row) {
962 rowsAux(row) = row*NSDim;
964 Kokkos::parallel_for(
"MueLu:TentativePF:BuildUncoupled:for2", nnzEstimate, KOKKOS_LAMBDA(
const LO j) {
965 colsAux(j) = INVALID;
972 status_type status(
"status");
977 typename AppendTrait<decltype(fineNS), Kokkos::RandomAccess>::type fineNSRandom = fineNS;
978 typename AppendTrait<status_type, Kokkos::Atomic> ::type statusAtomic = status;
980 TEUCHOS_TEST_FOR_EXCEPTION(goodMap ==
false, Exceptions::RuntimeError,
"Only works for non-overlapping aggregates (goodMap == true)");
983 SubFactoryMonitor m2(*
this,
"Stage 1 (LocalQR)", coarseLevel);
996 Kokkos::parallel_reduce(policy, LocalScalarQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom), decltype(sizes ), decltype(maxAggSize), decltype(agg2RowMapLO), decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux), decltype(valsAux)>(fineNSRandom,coarseNS,sizes,maxAggSize,agg2RowMapLO,statusAtomic,rows,rowsAux,colsAux,valsAux),nnz);
1001 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_loop", numAggregates, KOKKOS_LAMBDA(
const GO agg,
size_t& rowNnz) {
1002 LO aggSize = aggRows(agg+1) - aggRows(agg);
1009 typedef Kokkos::ArithTraits<SC> ATS;
1010 typedef typename ATS::magnitudeType Magnitude;
1012 Magnitude norm = ATS::magnitude(zero);
1013 for (
size_t k = 0; k < aggSize; k++) {
1014 Magnitude dnorm = ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0));
1015 norm += dnorm*dnorm;
1021 statusAtomic(1) =
true;
1026 coarseNS(agg, 0) = norm;
1029 for (LO k = 0; k < aggSize; k++) {
1030 LO localRow = agg2RowMapLO(aggRows(agg)+k);
1031 SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg)+k),0) / norm;
1033 size_t rowStart = rowsAux(localRow);
1034 colsAux(rowStart) = agg;
1035 valsAux(rowStart) = localVal;
1038 rows(localRow+1) = 1;
1045 statusAtomic(0) =
true;
1051 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
1052 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
1053 if (statusHost(i)) {
1054 std::ostringstream oss;
1055 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
1057 case 0: oss <<
"!goodMap is not implemented";
break;
1058 case 1: oss <<
"fine level NS part has a zero column";
break;
1060 throw Exceptions::RuntimeError(oss.str());
1070 Kokkos::parallel_reduce(policy, LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom), decltype(sizes ), decltype(maxAggSize), decltype(agg2RowMapLO), decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux), decltype(valsAux)>(fineNSRandom,coarseNS,sizes,maxAggSize,agg2RowMapLO,statusAtomic,rows,rowsAux,colsAux,valsAux),nnz);
1072 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
1073 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
1074 if (statusHost(i)) {
1075 std::ostringstream oss;
1076 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
1078 case 0: oss <<
"!goodMap is not implemented";
break;
1079 case 1: oss <<
"fine level NS part has a zero column";
break;
1081 throw Exceptions::RuntimeError(oss.str());
1088 SubFactoryMonitor m2(*
this,
"Stage 2 (CompressRows)", coarseLevel);
1090 ScanFunctor<LO,decltype(rows)> scanFunctor(rows);
1091 Kokkos::parallel_scan(
"MueLu:TentativePF:Build:compress_rows", range_type(0,numRows+1), scanFunctor);
1095 typename cols_type::non_const_type cols(
"Ptent_cols", nnz);
1096 typename vals_type::non_const_type vals(
"Ptent_vals", nnz);
1100 SubFactoryMonitor m2(*
this,
"Stage 2 (CompressCols)", coarseLevel);
1102 CompressArraysFunctor<LO, decltype(rows),decltype(cols),decltype(vals)> comprArr(INVALID, rowsAux, colsAux, valsAux, rows, cols, vals);
1103 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compress_cols_vals", range_type(0,numRows), comprArr);
1106 Kokkos::parallel_for(
"MueLu:TentativePF:Build:compress_cols_vals", numRows, KOKKOS_LAMBDA(
const LO i) {
1107 LO rowStart = rows(i);
1110 for (LO j = rowsAux(i); j < rowsAux(i+1); j++)
1111 if (colsAux(j) != INVALID) {
1112 cols(rowStart+lnnz) = colsAux(j);
1113 vals(rowStart+lnnz) = valsAux(j);
1119 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
1125 SubFactoryMonitor m2(*
this,
"Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
1126 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getNodeNumElements(), nnz, vals, rows, cols);
1127 Teuchos::RCP<CrsMatrix> PtentCrs = CrsMatrixFactory::Build(rowMap,coarseMap,lclMatrix);
1128 PtentCrs->resumeFill();
1129 PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap());
1130 Ptentative = rcp(
new CrsMatrixWrap(PtentCrs));
1133 Ptentative = rcp(
new CrsMatrixWrap(rowMap, coarseMap, 0, Xpetra::StaticProfile));
1134 RCP<CrsMatrix> PtentCrs = rcp_dynamic_cast<CrsMatrixWrap>(Ptentative)->getCrsMatrix();
1136 ArrayRCP<size_t> iaPtent;
1137 ArrayRCP<LO> jaPtent;
1138 ArrayRCP<SC> valPtent;
1140 PtentCrs->allocateAllValues(nnz, iaPtent, jaPtent, valPtent);
1141 ArrayView<size_t> ia = iaPtent();
1142 ArrayView<LO> ja = jaPtent();
1143 ArrayView<SC> val = valPtent();
1146 typename rows_type::HostMirror rowsHost = Kokkos::create_mirror_view(rows);
1147 typename cols_type::HostMirror colsHost = Kokkos::create_mirror_view(cols);
1148 typename vals_type::HostMirror valsHost = Kokkos::create_mirror_view(vals);
1156 for (LO i = 0; i < rowsHost.size(); i++)
1157 ia[i] = rowsHost(i);
1158 for (LO j = 0; j < colsHost.size(); j++) {
1159 ja [j] = colsHost(j);
1160 val[j] = valsHost(j);
1162 PtentCrs->setAllValues(iaPtent, jaPtent, valPtent);
1163 PtentCrs->expertStaticFillComplete(coarseMap, A->getDomainMap());
1167 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
1168 void TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
1169 BuildPcoupled(RCP<Matrix> A, RCP<Aggregates_kokkos> aggregates, RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
1170 RCP<const Map> coarseMap, RCP<Matrix>& Ptentative, RCP<MultiVector>& coarseNullspace)
const {
1171 throw Exceptions::RuntimeError(
"Construction of coupled tentative P is not implemented");
1174 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class DeviceType>
1175 bool TentativePFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::isGoodMap(
const Map& rowMap,
const Map& colMap)
const {
1176 ArrayView<const GO> rowElements = rowMap.getNodeElementList();
1177 ArrayView<const GO> colElements = colMap.getNodeElementList();
1179 const size_t numElements = rowElements.size();
1181 bool goodMap =
true;
1182 for (
size_t i = 0; i < numElements; i++)
1183 if (rowElements[i] != colElements[i]) {
1193 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT 1194 #endif // HAVE_MUELU_KOKKOS_REFACTOR 1195 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Namespace for MueLu classes and methods.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type *=0)
KOKKOS_INLINE_FUNCTION Kokkos::complex< RealType > sqrt(const complex< RealType > &x)
void deep_copy(const View< DT, DP... > &dst, typename ViewTraits< DT, DP... >::const_value_type &value, typename std::enable_if< std::is_same< typename ViewTraits< DT, DP... >::specialize, void >::value >::type *=0)
Description of what is happening (more verbose)