41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_Details_makeColMap.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 #include "Kokkos_Bitset.hpp"
82 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
84 #include "TpetraExt_MatrixMatrix_HIP.hpp"
88 namespace MatrixMatrix{
96 template <
class Scalar,
106 bool call_FillComplete_on_result,
107 const std::string& label,
108 const Teuchos::RCP<Teuchos::ParameterList>& params)
114 typedef LocalOrdinal LO;
115 typedef GlobalOrdinal GO;
123 #ifdef HAVE_TPETRA_MMM_TIMINGS
124 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
125 using Teuchos::TimeMonitor;
128 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Setup"))));
131 const std::string prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
136 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
137 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
142 RCP<const crs_matrix_type> Aprime =
null;
146 RCP<const crs_matrix_type> Bprime =
null;
156 const bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
158 bool use_optimized_ATB =
false;
159 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160 use_optimized_ATB =
true;
162 #ifdef USE_OLD_TRANSPOSE
163 use_optimized_ATB =
false;
166 using Teuchos::ParameterList;
167 RCP<ParameterList> transposeParams (
new ParameterList);
168 transposeParams->set (
"sort",
false);
170 if (!use_optimized_ATB && transposeA) {
171 transposer_type transposer (rcpFromRef (A));
172 Aprime = transposer.createTranspose (transposeParams);
175 Aprime = rcpFromRef(A);
179 transposer_type transposer (rcpFromRef (B));
180 Bprime = transposer.createTranspose (transposeParams);
183 Bprime = rcpFromRef(B);
193 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
195 "must match for matrix-matrix product. op(A) is "
196 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
202 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
203 prefix <<
"ERROR, dimensions of result C must "
205 <<
" rows, should have at least " << Aouter << std::endl);
217 int numProcs = A.
getComm()->getSize();
221 crs_matrix_struct_type Aview;
222 crs_matrix_struct_type Bview;
224 RCP<const map_type> targetMap_A = Aprime->getRowMap();
225 RCP<const map_type> targetMap_B = Bprime->getRowMap();
227 #ifdef HAVE_TPETRA_MMM_TIMINGS
229 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All I&X")));
235 if (!use_optimized_ATB) {
236 RCP<const import_type> dummyImporter;
237 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
243 targetMap_B = Aprime->getColMap();
246 if (!use_optimized_ATB)
247 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
252 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Multiply"))));
256 if (use_optimized_ATB) {
257 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
259 }
else if (call_FillComplete_on_result && newFlag) {
260 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
262 }
else if (call_FillComplete_on_result) {
263 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
269 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
271 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All FillComplete")));
284 C.
fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
289 template <
class Scalar,
298 bool call_FillComplete_on_result,
299 const std::string& label,
300 const Teuchos::RCP<Teuchos::ParameterList>& params)
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
312 #ifdef HAVE_TPETRA_MMM_TIMINGS
313 std::string prefix_mmm = std::string(
"TpetraExt ")+ label + std::string(
": ");
314 using Teuchos::TimeMonitor;
315 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string(
"Jacobi All Setup")));
318 const std::string prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
323 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
324 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
326 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
336 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
338 "must match for matrix-matrix product. op(A) is "
339 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
345 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
346 prefix <<
"ERROR, dimensions of result C must "
348 <<
" rows, should have at least "<< Aouter << std::endl);
360 int numProcs = A.
getComm()->getSize();
364 crs_matrix_struct_type Aview;
365 crs_matrix_struct_type Bview;
367 RCP<const map_type> targetMap_A = Aprime->getRowMap();
368 RCP<const map_type> targetMap_B = Bprime->getRowMap();
370 #ifdef HAVE_TPETRA_MMM_TIMINGS
371 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All I&X")));
377 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(
new Teuchos::ParameterList);
378 if(!params.is_null()) {
379 importParams1->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
380 int mm_optimization_core_count=0;
381 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
383 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
386 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
387 auto & ip1slist = importParams1->sublist(
"matrixmatrix: kernel params",
false);
388 ip1slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
390 ip1slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
394 RCP<const import_type> dummyImporter;
395 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label,importParams1);
400 targetMap_B = Aprime->getColMap();
405 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(
new Teuchos::ParameterList);
406 if(!params.is_null()) {
407 importParams2->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
409 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
411 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
412 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
413 auto & ip2slist = importParams2->sublist(
"matrixmatrix: kernel params",
false);
414 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416 ip2slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
418 ip2slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
421 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
425 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All Multiply")));
429 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
432 bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
434 if (call_FillComplete_on_result && newFlag) {
435 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
437 }
else if (call_FillComplete_on_result) {
438 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
441 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
"jacobi_A_B_general not implemented");
464 template <
class Scalar,
475 using Teuchos::Array;
479 typedef LocalOrdinal LO;
480 typedef GlobalOrdinal GO;
485 const std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
487 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error,
488 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489 "(Result matrix B is not required to be isFillComplete()).");
490 TEUCHOS_TEST_FOR_EXCEPTION(B.
isFillComplete() , std::runtime_error,
491 prefix <<
"ERROR, input matrix B must not be fill complete!");
492 TEUCHOS_TEST_FOR_EXCEPTION(B.
isStaticGraph() , std::runtime_error,
493 prefix <<
"ERROR, input matrix B must not have static graph!");
495 prefix <<
"ERROR, input matrix B must not be locally indexed!");
497 using Teuchos::ParameterList;
498 RCP<ParameterList> transposeParams (
new ParameterList);
499 transposeParams->set (
"sort",
false);
501 RCP<const crs_matrix_type> Aprime =
null;
503 transposer_type transposer (rcpFromRef (A));
504 Aprime = transposer.createTranspose (transposeParams);
507 Aprime = rcpFromRef(A);
515 if (scalarB != Teuchos::ScalarTraits<SC>::one())
520 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521 for (LO i = 0; (size_t)i < numMyRows; ++i) {
522 row = B.
getRowMap()->getGlobalElement(i);
523 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
525 if (scalarA != Teuchos::ScalarTraits<SC>::one())
526 for (
size_t j = 0; j < a_numEntries; ++j)
527 a_vals[j] *= scalarA;
530 B.
sumIntoGlobalValues(row, a_numEntries,
reinterpret_cast<Scalar *
>(a_vals.data()), a_inds.data());
532 B.
insertGlobalValues(row, a_numEntries,
reinterpret_cast<Scalar *
>(a_vals.data()), a_inds.data());
537 template <
class Scalar,
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
543 const bool transposeA,
546 const bool transposeB,
550 const Teuchos::RCP<Teuchos::ParameterList>& params)
553 using Teuchos::rcpFromRef;
555 using Teuchos::ParameterList;
557 if(!params.is_null())
559 TEUCHOS_TEST_FOR_EXCEPTION(
560 params->isParameter(
"Call fillComplete") && !params->get<
bool>(
"Call fillComplete"),
561 std::invalid_argument,
562 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564 params->set(
"Call fillComplete",
true);
568 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
575 TEUCHOS_TEST_FOR_EXCEPTION
576 (! A.
isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578 RCP<crs_matrix_type> C = rcp(
new crs_matrix_type(Brcp->getRowMap(), 0));
580 add(alpha, transposeA, A, beta,
false, *Brcp, *C, domainMap, rangeMap, params);
588 template<
class LO,
class GO,
class LOView,
class GOView,
class LocalMap>
589 struct ConvertGlobalToLocalFunctor
591 ConvertGlobalToLocalFunctor(LOView& lids_,
const GOView& gids_,
const LocalMap localColMap_)
592 : lids(lids_), gids(gids_), localColMap(localColMap_)
595 KOKKOS_FUNCTION
void operator() (
const GO i)
const
597 lids(i) = localColMap.getLocalElement(gids(i));
602 const LocalMap localColMap;
605 template <
class Scalar,
611 const bool transposeA,
614 const bool transposeB,
619 const Teuchos::RCP<Teuchos::ParameterList>& params)
623 using Teuchos::rcpFromRef;
624 using Teuchos::rcp_implicit_cast;
625 using Teuchos::rcp_dynamic_cast;
626 using Teuchos::TimeMonitor;
628 using LO = LocalOrdinal;
629 using GO = GlobalOrdinal;
637 using exec_space =
typename crs_graph_type::execution_space;
638 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639 const char* prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
640 constexpr
bool debug =
false;
642 #ifdef HAVE_TPETRA_MMM_TIMINGS
643 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"transpose"))));
647 std::ostringstream os;
648 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
649 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
650 std::cerr << os.str ();
653 TEUCHOS_TEST_FOR_EXCEPTION
655 prefix_mmm <<
"C must be a 'new' matrix (neither locally nor globally indexed).");
656 TEUCHOS_TEST_FOR_EXCEPTION
658 prefix_mmm <<
"A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG
662 const bool domainMapsSame =
663 (! transposeA && ! transposeB &&
665 (! transposeA && transposeB &&
667 ( transposeA && ! transposeB &&
669 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
672 const bool rangeMapsSame =
673 (! transposeA && ! transposeB &&
675 (! transposeA && transposeB &&
677 ( transposeA && ! transposeB &&
679 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
684 using Teuchos::ParameterList;
686 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
688 transposer_type transposer (Aprime);
689 Aprime = transposer.createTranspose ();
692 #ifdef HAVE_TPETRA_DEBUG
693 TEUCHOS_TEST_FOR_EXCEPTION
694 (Aprime.is_null (), std::logic_error,
695 prefix_mmm <<
"Failed to compute Op(A). "
696 "Please report this bug to the Tpetra developers.");
700 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
703 std::ostringstream os;
704 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
705 <<
"Form explicit xpose of B" << std::endl;
706 std::cerr << os.str ();
708 transposer_type transposer (Bprime);
709 Bprime = transposer.createTranspose ();
711 #ifdef HAVE_TPETRA_DEBUG
712 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714 TEUCHOS_TEST_FOR_EXCEPTION(
715 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716 prefix_mmm <<
"Aprime and Bprime must both be fill complete. "
717 "Please report this bug to the Tpetra developers.");
719 RCP<const map_type> CDomainMap = domainMap;
720 RCP<const map_type> CRangeMap = rangeMap;
721 if(CDomainMap.is_null())
723 CDomainMap = Bprime->getDomainMap();
725 if(CRangeMap.is_null())
727 CRangeMap = Bprime->getRangeMap();
729 assert(!(CDomainMap.is_null()));
730 assert(!(CRangeMap.is_null()));
731 typedef typename AddKern::values_array values_array;
732 typedef typename AddKern::row_ptrs_array row_ptrs_array;
733 typedef typename AddKern::col_inds_array col_inds_array;
734 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
737 row_ptrs_array rowptrs;
738 col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS
741 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"rowmap check/import"))));
743 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
746 auto import = rcp(
new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
751 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *
import, Bprime->getDomainMap(), Bprime->getRangeMap());
753 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754 bool sorted = AGraphSorted && BGraphSorted;
755 RCP<const import_type> Cimport = Teuchos::null;
756 RCP<export_type> Cexport = Teuchos::null;
757 bool doFillComplete =
true;
758 if(Teuchos::nonnull(params) && params->isParameter(
"Call fillComplete"))
760 doFillComplete = params->get<
bool>(
"Call fillComplete");
762 auto Alocal = Aprime->getLocalMatrixDevice();
763 auto Blocal = Bprime->getLocalMatrixDevice();
764 LO numLocalRows = Alocal.numRows();
765 if(numLocalRows == 0)
772 rowptrs = row_ptrs_array(
"C rowptrs", 0);
774 auto Acolmap = Aprime->getColMap();
775 auto Bcolmap = Bprime->getColMap();
778 using global_col_inds_array =
typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
781 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mismatched col map full kernel"))));
784 auto AlocalColmap = Acolmap->getLocalMap();
785 auto BlocalColmap = Bcolmap->getLocalMap();
786 global_col_inds_array globalColinds;
788 std::ostringstream os;
789 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
790 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791 std::cerr << os.str ();
793 AddKern::convertToGlobalAndAdd(
794 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795 vals, rowptrs, globalColinds);
797 std::ostringstream os;
798 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
799 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800 std::cerr << os.str ();
802 #ifdef HAVE_TPETRA_MMM_TIMINGS
804 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Constructing graph"))));
806 RCP<const map_type> CcolMap;
807 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808 (CcolMap, CDomainMap, globalColinds);
810 col_inds_array localColinds(
"C colinds", globalColinds.extent(0));
811 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813 col_inds_array, global_col_inds_array,
814 typename map_type::local_map_type>
815 (localColinds, globalColinds, CcolMap->getLocalMap()));
816 KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
825 auto Avals = Alocal.values;
826 auto Bvals = Blocal.values;
827 auto Arowptrs = Alocal.graph.row_map;
828 auto Browptrs = Blocal.graph.row_map;
829 auto Acolinds = Alocal.graph.entries;
830 auto Bcolinds = Blocal.graph.entries;
834 #ifdef HAVE_TPETRA_MMM_TIMINGS
836 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"sorted entries full kernel"))));
839 std::ostringstream os;
840 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
841 <<
"Call AddKern::addSorted(...)" << std::endl;
842 std::cerr << os.str ();
844 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
851 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mm add unsorted entries full kernel"))));
854 std::ostringstream os;
855 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
856 <<
"Call AddKern::addUnsorted(...)" << std::endl;
857 std::cerr << os.str ();
859 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
862 RCP<const map_type> Ccolmap = Bcolmap;
867 #ifdef HAVE_TPETRA_MMM_TIMINGS
869 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs expertStaticFillComplete"))));
871 if(!CDomainMap->isSameAs(*Ccolmap))
874 std::ostringstream os;
875 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
876 <<
"Create Cimport" << std::endl;
877 std::cerr << os.str ();
879 Cimport = rcp(
new import_type(CDomainMap, Ccolmap));
884 std::ostringstream os;
885 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
886 <<
"Create Cexport" << std::endl;
887 std::cerr << os.str ();
889 Cexport = rcp(
new export_type(C.
getRowMap(), CRangeMap));
893 std::ostringstream os;
894 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
895 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
896 std::cerr << os.str ();
903 template <
class Scalar,
916 using Teuchos::Array;
917 using Teuchos::ArrayRCP;
918 using Teuchos::ArrayView;
921 using Teuchos::rcp_dynamic_cast;
922 using Teuchos::rcpFromRef;
923 using Teuchos::tuple;
926 typedef Teuchos::ScalarTraits<Scalar> STS;
934 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
936 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937 prefix <<
"The case C == null does not actually work. Fixing this will require an interface change.");
939 TEUCHOS_TEST_FOR_EXCEPTION(
941 prefix <<
"Both input matrices must be fill complete before calling this function.");
943 #ifdef HAVE_TPETRA_DEBUG
945 const bool domainMapsSame =
949 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
952 const bool rangeMapsSame =
956 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
961 using Teuchos::ParameterList;
962 RCP<ParameterList> transposeParams (
new ParameterList);
963 transposeParams->set (
"sort",
false);
966 RCP<const crs_matrix_type> Aprime;
968 transposer_type theTransposer (rcpFromRef (A));
969 Aprime = theTransposer.createTranspose (transposeParams);
972 Aprime = rcpFromRef (A);
975 #ifdef HAVE_TPETRA_DEBUG
976 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
981 RCP<const crs_matrix_type> Bprime;
983 transposer_type theTransposer (rcpFromRef (B));
984 Bprime = theTransposer.createTranspose (transposeParams);
987 Bprime = rcpFromRef (B);
990 #ifdef HAVE_TPETRA_DEBUG
991 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
996 if (! C.is_null ()) {
997 C->setAllToScalar (STS::zero ());
1002 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), 0));
1005 #ifdef HAVE_TPETRA_DEBUG
1006 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1014 Array<RCP<const crs_matrix_type> > Mat =
1015 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1019 for (
int k = 0; k < 2; ++k) {
1020 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1021 typename crs_matrix_type::nonconst_values_host_view_type Values;
1029 #ifdef HAVE_TPETRA_DEBUG
1030 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1033 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG
1035 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1039 const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040 for (
size_t i = 0; i < localNumRows; ++i) {
1041 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043 if (numEntries > 0) {
1044 Kokkos::resize(Indices,numEntries);
1045 Kokkos::resize(Values,numEntries);
1046 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1048 if (scalar[k] != STS::one ()) {
1049 for (
size_t j = 0; j < numEntries; ++j) {
1050 Values[j] *= scalar[k];
1054 if (C->isFillComplete ()) {
1055 C->sumIntoGlobalValues (globalRow, numEntries,
1056 reinterpret_cast<Scalar *
>(Values.data()), Indices.data());
1058 C->insertGlobalValues (globalRow, numEntries,
1059 reinterpret_cast<Scalar *
>(Values.data()), Indices.data());
1068 namespace MMdetails{
1072 template <
class TransferType>
1073 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer,
const std::string &label) {
1074 if (Transfer.is_null())
1077 const Distributor & Distor = Transfer->getDistributor();
1078 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1080 size_t rows_send = Transfer->getNumExportIDs();
1081 size_t rows_recv = Transfer->getNumRemoteIDs();
1083 size_t round1_send = Transfer->getNumExportIDs() *
sizeof(size_t);
1084 size_t round1_recv = Transfer->getNumRemoteIDs() *
sizeof(size_t);
1085 size_t num_send_neighbors = Distor.getNumSends();
1086 size_t num_recv_neighbors = Distor.getNumReceives();
1087 size_t round2_send, round2_recv;
1088 Distor.getLastDoStatistics(round2_send,round2_recv);
1090 int myPID = Comm->getRank();
1091 int NumProcs = Comm->getSize();
1098 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1099 size_t gstats_min[8], gstats_max[8];
1101 double lstats_avg[8], gstats_avg[8];
1102 for(
int i=0; i<8; i++)
1103 lstats_avg[i] = ((
double)lstats[i])/NumProcs;
1105 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1106 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1107 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1110 printf(
"%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1111 (
int)gstats_min[0],gstats_avg[0],(
int)gstats_max[0], (
int)gstats_min[2],gstats_avg[2],(
int)gstats_max[2],
1112 (
int)gstats_min[4],gstats_avg[4],(
int)gstats_max[4], (
int)gstats_min[6],gstats_avg[6],(
int)gstats_max[6]);
1113 printf(
"%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1114 (
int)gstats_min[1],gstats_avg[1],(
int)gstats_max[1], (
int)gstats_min[3],gstats_avg[3],(
int)gstats_max[3],
1115 (
int)gstats_min[5],gstats_avg[5],(
int)gstats_max[5], (
int)gstats_min[7],gstats_avg[7],(
int)gstats_max[7]);
1120 template<
class Scalar,
1122 class GlobalOrdinal,
1124 void mult_AT_B_newmatrix(
1125 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1126 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1127 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1128 const std::string & label,
1129 const Teuchos::RCP<Teuchos::ParameterList>& params)
1134 typedef LocalOrdinal LO;
1135 typedef GlobalOrdinal GO;
1137 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1138 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1140 #ifdef HAVE_TPETRA_MMM_TIMINGS
1141 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1142 using Teuchos::TimeMonitor;
1143 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1144 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1150 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1152 using Teuchos::ParameterList;
1153 RCP<ParameterList> transposeParams (
new ParameterList);
1154 transposeParams->set (
"sort",
false);
1155 if(! params.is_null ()) {
1156 transposeParams->set (
"compute global constants",
1157 params->get (
"compute global constants: temporaries",
1160 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1161 transposer.createTransposeLocal (transposeParams);
1166 #ifdef HAVE_TPETRA_MMM_TIMINGS
1168 MM = rcp (
new TimeMonitor
1169 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1173 crs_matrix_struct_type Aview;
1174 crs_matrix_struct_type Bview;
1175 RCP<const Import<LO, GO, NO> > dummyImporter;
1178 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1179 if (! params.is_null ()) {
1180 importParams1->set (
"compute global constants",
1181 params->get (
"compute global constants: temporaries",
1183 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1184 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1185 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1186 int mm_optimization_core_count =
1188 mm_optimization_core_count =
1189 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190 int mm_optimization_core_count2 =
1191 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1192 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1193 mm_optimization_core_count = mm_optimization_core_count2;
1195 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1196 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1197 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1198 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1201 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1202 Aview, dummyImporter,
true,
1203 label, importParams1);
1205 RCP<ParameterList> importParams2 (
new ParameterList);
1206 if (! params.is_null ()) {
1207 importParams2->set (
"compute global constants",
1208 params->get (
"compute global constants: temporaries",
1210 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1211 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1212 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1213 int mm_optimization_core_count =
1215 mm_optimization_core_count =
1216 slist.get (
"MM_TAFC_OptimizationCoreCount",
1217 mm_optimization_core_count);
1218 int mm_optimization_core_count2 =
1219 params->get (
"MM_TAFC_OptimizationCoreCount",
1220 mm_optimization_core_count);
1221 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1222 mm_optimization_core_count = mm_optimization_core_count2;
1224 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1225 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1226 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1227 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1230 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1231 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1234 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1237 #ifdef HAVE_TPETRA_MMM_TIMINGS
1239 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1242 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1245 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1246 if (needs_final_export) {
1250 Ctemp = rcp (&C,
false);
1253 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1258 #ifdef HAVE_TPETRA_MMM_TIMINGS
1260 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1263 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1265 if (needs_final_export) {
1266 ParameterList labelList;
1267 labelList.set(
"Timer Label", label);
1268 if(!params.is_null()) {
1269 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1270 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1272 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1273 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1274 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1275 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1276 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1277 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1279 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1280 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1281 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1282 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1284 Ctemp->exportAndFillComplete (Crcp,
1285 *Ctemp->getGraph ()->getExporter (),
1288 rcp (&labelList,
false));
1290 #ifdef HAVE_TPETRA_MMM_STATISTICS
1291 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1297 template<
class Scalar,
1299 class GlobalOrdinal,
1302 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1303 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1304 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1305 const std::string& ,
1306 const Teuchos::RCP<Teuchos::ParameterList>& )
1308 using Teuchos::Array;
1309 using Teuchos::ArrayRCP;
1310 using Teuchos::ArrayView;
1311 using Teuchos::OrdinalTraits;
1312 using Teuchos::null;
1314 typedef Teuchos::ScalarTraits<Scalar> STS;
1316 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1317 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1319 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1320 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1322 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1323 ArrayView<const GlobalOrdinal> bcols_import =
null;
1324 if (Bview.importColMap !=
null) {
1325 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1326 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1328 bcols_import = Bview.importColMap->getNodeElementList();
1331 size_t C_numCols = C_lastCol - C_firstCol +
1332 OrdinalTraits<LocalOrdinal>::one();
1333 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1334 OrdinalTraits<LocalOrdinal>::one();
1336 if (C_numCols_import > C_numCols)
1337 C_numCols = C_numCols_import;
1339 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1340 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1341 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1343 Array<Scalar> C_row_i = dwork;
1344 Array<GlobalOrdinal> C_cols = iwork;
1345 Array<size_t> c_index = iwork2;
1346 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1347 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1349 size_t C_row_i_length, j, k, last_index;
1352 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1353 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1354 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1355 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1357 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1358 Aview.colMap->getMaxLocalIndex(); i++)
1363 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1364 Aview.colMap->getMaxLocalIndex(); i++) {
1365 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1366 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1367 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1368 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1378 ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1379 ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1380 ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1381 ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1382 ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1383 ArrayView<const Scalar> Avals, Bvals, Ivals;
1384 Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1385 Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1386 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1387 Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1388 if(!Bview.importMatrix.is_null()) {
1389 Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1390 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1393 bool C_filled = C.isFillComplete();
1395 for (
size_t i = 0; i < C_numCols; i++)
1396 c_index[i] = OrdinalTraits<size_t>::invalid();
1399 size_t Arows = Aview.rowMap->getNodeNumElements();
1400 for(
size_t i=0; i<Arows; ++i) {
1404 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1410 C_row_i_length = OrdinalTraits<size_t>::zero();
1412 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1413 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1414 const Scalar Aval = Avals[k];
1415 if (Aval == STS::zero())
1418 if (Ak == LO_INVALID)
1421 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1422 LocalOrdinal col = Bcolind[j];
1425 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1428 C_row_i[C_row_i_length] = Aval*Bvals[j];
1429 C_cols[C_row_i_length] = col;
1430 c_index[col] = C_row_i_length;
1434 C_row_i[c_index[col]] += Aval*Bvals[j];
1439 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1440 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1441 C_cols[ii] = bcols[C_cols[ii]];
1442 combined_index[ii] = C_cols[ii];
1443 combined_values[ii] = C_row_i[ii];
1445 last_index = C_row_i_length;
1451 C_row_i_length = OrdinalTraits<size_t>::zero();
1453 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1454 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1455 const Scalar Aval = Avals[k];
1456 if (Aval == STS::zero())
1459 if (Ak!=LO_INVALID)
continue;
1461 Ak = Acol2Irow[Acolind[k]];
1462 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1463 LocalOrdinal col = Icolind[j];
1466 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1469 C_row_i[C_row_i_length] = Aval*Ivals[j];
1470 C_cols[C_row_i_length] = col;
1471 c_index[col] = C_row_i_length;
1476 C_row_i[c_index[col]] += Aval*Ivals[j];
1481 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1482 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1483 C_cols[ii] = bcols_import[C_cols[ii]];
1484 combined_index[last_index] = C_cols[ii];
1485 combined_values[last_index] = C_row_i[ii];
1492 C.sumIntoGlobalValues(
1494 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1495 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1497 C.insertGlobalValues(
1499 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1500 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1506 template<
class Scalar,
1508 class GlobalOrdinal,
1510 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1511 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1512 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1514 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1515 Mview.maxNumRowEntries = Mview.indices[0].size();
1517 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1518 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1519 Mview.maxNumRowEntries = Mview.indices[i].size();
1524 template<
class CrsMatrixType>
1525 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1527 size_t Aest = 100, Best=100;
1528 if (A.getNodeNumEntries() >= A.getNodeNumRows())
1529 Aest = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1530 if (B.getNodeNumEntries() >= B.getNodeNumRows())
1531 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1533 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1534 nnzperrow *= nnzperrow;
1536 return (
size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1546 template<
class Scalar,
1548 class GlobalOrdinal,
1550 void mult_A_B_newmatrix(
1551 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1552 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1553 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1554 const std::string& label,
1555 const Teuchos::RCP<Teuchos::ParameterList>& params)
1557 using Teuchos::Array;
1558 using Teuchos::ArrayRCP;
1559 using Teuchos::ArrayView;
1564 typedef LocalOrdinal LO;
1565 typedef GlobalOrdinal GO;
1567 typedef Import<LO,GO,NO> import_type;
1568 typedef Map<LO,GO,NO> map_type;
1571 typedef typename map_type::local_map_type local_map_type;
1573 typedef typename KCRS::StaticCrsGraphType graph_t;
1574 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1575 typedef typename NO::execution_space execution_space;
1576 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1577 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1579 #ifdef HAVE_TPETRA_MMM_TIMINGS
1580 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1581 using Teuchos::TimeMonitor;
1582 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1584 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1587 RCP<const import_type> Cimport;
1588 RCP<const map_type> Ccolmap;
1589 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1590 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1591 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1592 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1593 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1594 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1595 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1596 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1603 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1605 if (Bview.importMatrix.is_null()) {
1608 Ccolmap = Bview.colMap;
1609 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getNodeNumElements());
1611 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1612 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1613 KOKKOS_LAMBDA(
const LO i) {
1626 if (!Bimport.is_null() && !Iimport.is_null()) {
1627 Cimport = Bimport->setUnion(*Iimport);
1629 else if (!Bimport.is_null() && Iimport.is_null()) {
1630 Cimport = Bimport->setUnion();
1632 else if (Bimport.is_null() && !Iimport.is_null()) {
1633 Cimport = Iimport->setUnion();
1636 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1638 Ccolmap = Cimport->getTargetMap();
1643 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1644 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1651 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1652 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1653 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1654 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1656 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1657 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1666 C.replaceColMap(Ccolmap);
1684 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1685 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1687 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
1688 GO aidx = Acolmap_local.getGlobalElement(i);
1689 LO B_LID = Browmap_local.getLocalElement(aidx);
1690 if (B_LID != LO_INVALID) {
1691 targetMapToOrigRow(i) = B_LID;
1692 targetMapToImportRow(i) = LO_INVALID;
1694 LO I_LID = Irowmap_local.getLocalElement(aidx);
1695 targetMapToOrigRow(i) = LO_INVALID;
1696 targetMapToImportRow(i) = I_LID;
1703 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1710 template<
class Scalar,
1712 class GlobalOrdinal,
1714 class LocalOrdinalViewType>
1715 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1716 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1717 const LocalOrdinalViewType & targetMapToOrigRow,
1718 const LocalOrdinalViewType & targetMapToImportRow,
1719 const LocalOrdinalViewType & Bcol2Ccol,
1720 const LocalOrdinalViewType & Icol2Ccol,
1721 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1722 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1723 const std::string& label,
1724 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1725 #ifdef HAVE_TPETRA_MMM_TIMINGS
1726 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1727 using Teuchos::TimeMonitor;
1728 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
1731 using Teuchos::Array;
1732 using Teuchos::ArrayRCP;
1733 using Teuchos::ArrayView;
1738 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1739 typedef typename KCRS::StaticCrsGraphType graph_t;
1740 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1741 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1742 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1743 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1746 typedef LocalOrdinal LO;
1747 typedef GlobalOrdinal GO;
1749 typedef Map<LO,GO,NO> map_type;
1750 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1751 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1752 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1755 RCP<const map_type> Ccolmap = C.getColMap();
1756 size_t m = Aview.origMatrix->getNodeNumRows();
1757 size_t n = Ccolmap->getNodeNumElements();
1758 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1761 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1762 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
1764 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1765 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1766 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1768 c_lno_view_t Irowptr;
1769 lno_nnz_view_t Icolind;
1770 scalar_view_t Ivals;
1771 if(!Bview.importMatrix.is_null()) {
1772 auto lclB = Bview.importMatrix->getLocalMatrixHost();
1773 Irowptr = lclB.graph.row_map;
1774 Icolind = lclB.graph.entries;
1775 Ivals = lclB.values;
1776 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1779 #ifdef HAVE_TPETRA_MMM_TIMINGS
1780 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
1790 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1791 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
1792 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
1793 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
1803 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1804 std::vector<size_t> c_status(n, ST_INVALID);
1814 size_t CSR_ip = 0, OLD_ip = 0;
1815 for (
size_t i = 0; i < m; i++) {
1818 Crowptr[i] = CSR_ip;
1821 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1822 LO Aik = Acolind[k];
1823 const SC Aval = Avals[k];
1824 if (Aval == SC_ZERO)
1827 if (targetMapToOrigRow[Aik] != LO_INVALID) {
1834 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
1837 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1838 LO Bkj = Bcolind[j];
1839 LO Cij = Bcol2Ccol[Bkj];
1841 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1843 c_status[Cij] = CSR_ip;
1844 Ccolind[CSR_ip] = Cij;
1845 Cvals[CSR_ip] = Aval*Bvals[j];
1849 Cvals[c_status[Cij]] += Aval*Bvals[j];
1860 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
1861 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1862 LO Ikj = Icolind[j];
1863 LO Cij = Icol2Ccol[Ikj];
1865 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1867 c_status[Cij] = CSR_ip;
1868 Ccolind[CSR_ip] = Cij;
1869 Cvals[CSR_ip] = Aval*Ivals[j];
1872 Cvals[c_status[Cij]] += Aval*Ivals[j];
1879 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1881 Kokkos::resize(Ccolind,CSR_alloc);
1882 Kokkos::resize(Cvals,CSR_alloc);
1887 Crowptr[m] = CSR_ip;
1890 Kokkos::resize(Ccolind,CSR_ip);
1891 Kokkos::resize(Cvals,CSR_ip);
1893 #ifdef HAVE_TPETRA_MMM_TIMINGS
1895 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
1899 if (params.is_null() || params->get(
"sort entries",
true))
1900 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1901 C.setAllValues(Crowptr,Ccolind, Cvals);
1904 #ifdef HAVE_TPETRA_MMM_TIMINGS
1906 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
1918 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1919 labelList->set(
"Timer Label",label);
1920 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1921 RCP<const Export<LO,GO,NO> > dummyExport;
1922 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1923 #ifdef HAVE_TPETRA_MMM_TIMINGS
1925 MM2 = Teuchos::null;
1935 template<
class Scalar,
1937 class GlobalOrdinal,
1939 void mult_A_B_reuse(
1940 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1941 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1942 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1943 const std::string& label,
1944 const Teuchos::RCP<Teuchos::ParameterList>& params)
1946 using Teuchos::Array;
1947 using Teuchos::ArrayRCP;
1948 using Teuchos::ArrayView;
1953 typedef LocalOrdinal LO;
1954 typedef GlobalOrdinal GO;
1956 typedef Import<LO,GO,NO> import_type;
1957 typedef Map<LO,GO,NO> map_type;
1960 typedef typename map_type::local_map_type local_map_type;
1962 typedef typename KCRS::StaticCrsGraphType graph_t;
1963 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1964 typedef typename NO::execution_space execution_space;
1965 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1966 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1968 #ifdef HAVE_TPETRA_MMM_TIMINGS
1969 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1970 using Teuchos::TimeMonitor;
1971 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
1973 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1976 RCP<const import_type> Cimport = C.getGraph()->getImporter();
1977 RCP<const map_type> Ccolmap = C.getColMap();
1978 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1979 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1980 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1981 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1982 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1983 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1986 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1990 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1991 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1994 if (!Bview.importMatrix.is_null()) {
1995 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1996 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1998 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1999 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2000 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2006 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2007 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2008 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2009 GO aidx = Acolmap_local.getGlobalElement(i);
2010 LO B_LID = Browmap_local.getLocalElement(aidx);
2011 if (B_LID != LO_INVALID) {
2012 targetMapToOrigRow(i) = B_LID;
2013 targetMapToImportRow(i) = LO_INVALID;
2015 LO I_LID = Irowmap_local.getLocalElement(aidx);
2016 targetMapToOrigRow(i) = LO_INVALID;
2017 targetMapToImportRow(i) = I_LID;
2024 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2028 template<
class Scalar,
2030 class GlobalOrdinal,
2032 class LocalOrdinalViewType>
2033 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2034 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2035 const LocalOrdinalViewType & targetMapToOrigRow,
2036 const LocalOrdinalViewType & targetMapToImportRow,
2037 const LocalOrdinalViewType & Bcol2Ccol,
2038 const LocalOrdinalViewType & Icol2Ccol,
2039 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2040 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2041 const std::string& label,
2042 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2043 #ifdef HAVE_TPETRA_MMM_TIMINGS
2044 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2045 using Teuchos::TimeMonitor;
2046 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2047 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2056 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2057 typedef typename KCRS::StaticCrsGraphType graph_t;
2058 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2059 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2060 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2063 typedef LocalOrdinal LO;
2064 typedef GlobalOrdinal GO;
2066 typedef Map<LO,GO,NO> map_type;
2067 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2068 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2069 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2072 RCP<const map_type> Ccolmap = C.getColMap();
2073 size_t m = Aview.origMatrix->getNodeNumRows();
2074 size_t n = Ccolmap->getNodeNumElements();
2077 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2078 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2079 const KCRS & Cmat = C.getLocalMatrixHost();
2081 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2082 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2083 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2084 scalar_view_t Cvals = Cmat.values;
2086 c_lno_view_t Irowptr;
2087 lno_nnz_view_t Icolind;
2088 scalar_view_t Ivals;
2089 if(!Bview.importMatrix.is_null()) {
2090 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2091 Irowptr = lclB.graph.row_map;
2092 Icolind = lclB.graph.entries;
2093 Ivals = lclB.values;
2096 #ifdef HAVE_TPETRA_MMM_TIMINGS
2097 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2109 std::vector<size_t> c_status(n, ST_INVALID);
2112 size_t CSR_ip = 0, OLD_ip = 0;
2113 for (
size_t i = 0; i < m; i++) {
2116 OLD_ip = Crowptr[i];
2117 CSR_ip = Crowptr[i+1];
2118 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2119 c_status[Ccolind[k]] = k;
2125 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2126 LO Aik = Acolind[k];
2127 const SC Aval = Avals[k];
2128 if (Aval == SC_ZERO)
2131 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2133 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2135 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2136 LO Bkj = Bcolind[j];
2137 LO Cij = Bcol2Ccol[Bkj];
2139 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2140 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2141 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2143 Cvals[c_status[Cij]] += Aval * Bvals[j];
2148 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2149 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2150 LO Ikj = Icolind[j];
2151 LO Cij = Icol2Ccol[Ikj];
2153 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2154 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2155 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2157 Cvals[c_status[Cij]] += Aval * Ivals[j];
2163 #ifdef HAVE_TPETRA_MMM_TIMINGS
2164 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2167 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2173 template<
class Scalar,
2175 class GlobalOrdinal,
2177 void jacobi_A_B_newmatrix(
2179 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2180 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2181 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2182 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2183 const std::string& label,
2184 const Teuchos::RCP<Teuchos::ParameterList>& params)
2186 using Teuchos::Array;
2187 using Teuchos::ArrayRCP;
2188 using Teuchos::ArrayView;
2192 typedef LocalOrdinal LO;
2193 typedef GlobalOrdinal GO;
2196 typedef Import<LO,GO,NO> import_type;
2197 typedef Map<LO,GO,NO> map_type;
2198 typedef typename map_type::local_map_type local_map_type;
2202 typedef typename KCRS::StaticCrsGraphType graph_t;
2203 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2204 typedef typename NO::execution_space execution_space;
2205 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2206 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2209 #ifdef HAVE_TPETRA_MMM_TIMINGS
2210 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2211 using Teuchos::TimeMonitor;
2212 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2214 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2217 RCP<const import_type> Cimport;
2218 RCP<const map_type> Ccolmap;
2219 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2220 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2221 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2222 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2223 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2224 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2225 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2226 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2233 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2235 if (Bview.importMatrix.is_null()) {
2238 Ccolmap = Bview.colMap;
2242 Kokkos::RangePolicy<execution_space, LO> range (0,
static_cast<LO
> (Bview.colMap->getNodeNumElements ()));
2243 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2244 Bcol2Ccol(i) =
static_cast<LO
> (i);
2255 if (!Bimport.is_null() && !Iimport.is_null()){
2256 Cimport = Bimport->setUnion(*Iimport);
2257 Ccolmap = Cimport->getTargetMap();
2259 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2260 Cimport = Bimport->setUnion();
2262 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2263 Cimport = Iimport->setUnion();
2266 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2268 Ccolmap = Cimport->getTargetMap();
2270 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2271 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2278 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2279 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2280 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2281 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2283 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2284 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2293 C.replaceColMap(Ccolmap);
2311 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2312 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2313 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2314 GO aidx = Acolmap_local.getGlobalElement(i);
2315 LO B_LID = Browmap_local.getLocalElement(aidx);
2316 if (B_LID != LO_INVALID) {
2317 targetMapToOrigRow(i) = B_LID;
2318 targetMapToImportRow(i) = LO_INVALID;
2320 LO I_LID = Irowmap_local.getLocalElement(aidx);
2321 targetMapToOrigRow(i) = LO_INVALID;
2322 targetMapToImportRow(i) = I_LID;
2329 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2338 template<
class Scalar,
2340 class GlobalOrdinal,
2342 class LocalOrdinalViewType>
2343 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2344 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2345 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2346 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2347 const LocalOrdinalViewType & targetMapToOrigRow,
2348 const LocalOrdinalViewType & targetMapToImportRow,
2349 const LocalOrdinalViewType & Bcol2Ccol,
2350 const LocalOrdinalViewType & Icol2Ccol,
2351 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2352 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2353 const std::string& label,
2354 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2356 #ifdef HAVE_TPETRA_MMM_TIMINGS
2357 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2358 using Teuchos::TimeMonitor;
2359 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2362 using Teuchos::Array;
2363 using Teuchos::ArrayRCP;
2364 using Teuchos::ArrayView;
2369 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2370 typedef typename KCRS::StaticCrsGraphType graph_t;
2371 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2372 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2373 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2374 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2377 typedef typename scalar_view_t::memory_space scalar_memory_space;
2380 typedef LocalOrdinal LO;
2381 typedef GlobalOrdinal GO;
2384 typedef Map<LO,GO,NO> map_type;
2385 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2386 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2389 RCP<const map_type> Ccolmap = C.getColMap();
2390 size_t m = Aview.origMatrix->getNodeNumRows();
2391 size_t n = Ccolmap->getNodeNumElements();
2392 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2395 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2396 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2398 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2399 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2400 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2402 c_lno_view_t Irowptr;
2403 lno_nnz_view_t Icolind;
2404 scalar_view_t Ivals;
2405 if(!Bview.importMatrix.is_null()) {
2406 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2407 Irowptr = lclB.graph.row_map;
2408 Icolind = lclB.graph.entries;
2409 Ivals = lclB.values;
2410 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2415 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2423 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2424 Array<size_t> c_status(n, ST_INVALID);
2433 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2434 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2435 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2436 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2437 size_t CSR_ip = 0, OLD_ip = 0;
2439 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2453 for (
size_t i = 0; i < m; i++) {
2456 Crowptr[i] = CSR_ip;
2457 SC minusOmegaDval = -omega*Dvals(i,0);
2460 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2461 Scalar Bval = Bvals[j];
2462 if (Bval == SC_ZERO)
2464 LO Bij = Bcolind[j];
2465 LO Cij = Bcol2Ccol[Bij];
2468 c_status[Cij] = CSR_ip;
2469 Ccolind[CSR_ip] = Cij;
2470 Cvals[CSR_ip] = Bvals[j];
2475 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2476 LO Aik = Acolind[k];
2477 const SC Aval = Avals[k];
2478 if (Aval == SC_ZERO)
2481 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2483 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2485 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2486 LO Bkj = Bcolind[j];
2487 LO Cij = Bcol2Ccol[Bkj];
2489 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2491 c_status[Cij] = CSR_ip;
2492 Ccolind[CSR_ip] = Cij;
2493 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2497 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2503 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2504 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2505 LO Ikj = Icolind[j];
2506 LO Cij = Icol2Ccol[Ikj];
2508 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2510 c_status[Cij] = CSR_ip;
2511 Ccolind[CSR_ip] = Cij;
2512 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2515 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2522 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2524 Kokkos::resize(Ccolind,CSR_alloc);
2525 Kokkos::resize(Cvals,CSR_alloc);
2529 Crowptr[m] = CSR_ip;
2532 Kokkos::resize(Ccolind,CSR_ip);
2533 Kokkos::resize(Cvals,CSR_ip);
2536 #ifdef HAVE_TPETRA_MMM_TIMINGS
2537 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2544 C.replaceColMap(Ccolmap);
2551 if (params.is_null() || params->get(
"sort entries",
true))
2552 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2553 C.setAllValues(Crowptr,Ccolind, Cvals);
2556 #ifdef HAVE_TPETRA_MMM_TIMINGS
2557 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2568 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2569 labelList->set(
"Timer Label",label);
2570 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2571 RCP<const Export<LO,GO,NO> > dummyExport;
2572 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2580 template<
class Scalar,
2582 class GlobalOrdinal,
2584 void jacobi_A_B_reuse(
2586 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2587 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2588 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2589 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2590 const std::string& label,
2591 const Teuchos::RCP<Teuchos::ParameterList>& params)
2593 using Teuchos::Array;
2594 using Teuchos::ArrayRCP;
2595 using Teuchos::ArrayView;
2599 typedef LocalOrdinal LO;
2600 typedef GlobalOrdinal GO;
2603 typedef Import<LO,GO,NO> import_type;
2604 typedef Map<LO,GO,NO> map_type;
2607 typedef typename map_type::local_map_type local_map_type;
2609 typedef typename KCRS::StaticCrsGraphType graph_t;
2610 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2611 typedef typename NO::execution_space execution_space;
2612 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2613 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2615 #ifdef HAVE_TPETRA_MMM_TIMINGS
2616 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2617 using Teuchos::TimeMonitor;
2618 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2620 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2623 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2624 RCP<const map_type> Ccolmap = C.getColMap();
2625 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2626 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2627 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2628 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2629 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2630 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2633 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2637 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2638 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2641 if (!Bview.importMatrix.is_null()) {
2642 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2643 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2645 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2646 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2647 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2653 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2654 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2655 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2656 GO aidx = Acolmap_local.getGlobalElement(i);
2657 LO B_LID = Browmap_local.getLocalElement(aidx);
2658 if (B_LID != LO_INVALID) {
2659 targetMapToOrigRow(i) = B_LID;
2660 targetMapToImportRow(i) = LO_INVALID;
2662 LO I_LID = Irowmap_local.getLocalElement(aidx);
2663 targetMapToOrigRow(i) = LO_INVALID;
2664 targetMapToImportRow(i) = I_LID;
2669 #ifdef HAVE_TPETRA_MMM_TIMINGS
2675 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2681 template<
class Scalar,
2683 class GlobalOrdinal,
2685 class LocalOrdinalViewType>
2686 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2687 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2688 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2689 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2690 const LocalOrdinalViewType & targetMapToOrigRow,
2691 const LocalOrdinalViewType & targetMapToImportRow,
2692 const LocalOrdinalViewType & Bcol2Ccol,
2693 const LocalOrdinalViewType & Icol2Ccol,
2694 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2695 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2696 const std::string& label,
2697 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2698 #ifdef HAVE_TPETRA_MMM_TIMINGS
2699 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2700 using Teuchos::TimeMonitor;
2701 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
2702 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2710 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2711 typedef typename KCRS::StaticCrsGraphType graph_t;
2712 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2713 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2714 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2715 typedef typename scalar_view_t::memory_space scalar_memory_space;
2718 typedef LocalOrdinal LO;
2719 typedef GlobalOrdinal GO;
2721 typedef Map<LO,GO,NO> map_type;
2722 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2723 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2724 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2727 RCP<const map_type> Ccolmap = C.getColMap();
2728 size_t m = Aview.origMatrix->getNodeNumRows();
2729 size_t n = Ccolmap->getNodeNumElements();
2732 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2733 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2734 const KCRS & Cmat = C.getLocalMatrixHost();
2736 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2737 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2738 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2739 scalar_view_t Cvals = Cmat.values;
2741 c_lno_view_t Irowptr;
2742 lno_nnz_view_t Icolind;
2743 scalar_view_t Ivals;
2744 if(!Bview.importMatrix.is_null()) {
2745 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2746 Irowptr = lclB.graph.row_map;
2747 Icolind = lclB.graph.entries;
2748 Ivals = lclB.values;
2753 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2755 #ifdef HAVE_TPETRA_MMM_TIMINGS
2756 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
2764 std::vector<size_t> c_status(n, ST_INVALID);
2767 size_t CSR_ip = 0, OLD_ip = 0;
2768 for (
size_t i = 0; i < m; i++) {
2772 OLD_ip = Crowptr[i];
2773 CSR_ip = Crowptr[i+1];
2774 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2775 c_status[Ccolind[k]] = k;
2781 SC minusOmegaDval = -omega*Dvals(i,0);
2784 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2785 Scalar Bval = Bvals[j];
2786 if (Bval == SC_ZERO)
2788 LO Bij = Bcolind[j];
2789 LO Cij = Bcol2Ccol[Bij];
2791 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2792 std::runtime_error,
"Trying to insert a new entry into a static graph");
2794 Cvals[c_status[Cij]] = Bvals[j];
2798 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2799 LO Aik = Acolind[k];
2800 const SC Aval = Avals[k];
2801 if (Aval == SC_ZERO)
2804 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2806 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2808 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2809 LO Bkj = Bcolind[j];
2810 LO Cij = Bcol2Ccol[Bkj];
2812 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2813 std::runtime_error,
"Trying to insert a new entry into a static graph");
2815 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2820 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2821 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2822 LO Ikj = Icolind[j];
2823 LO Cij = Icol2Ccol[Ikj];
2825 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2826 std::runtime_error,
"Trying to insert a new entry into a static graph");
2828 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2834 #ifdef HAVE_TPETRA_MMM_TIMINGS
2835 MM2 = Teuchos::null;
2836 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
2839 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2840 #ifdef HAVE_TPETRA_MMM_TIMINGS
2841 MM2 = Teuchos::null;
2850 template<
class Scalar,
2852 class GlobalOrdinal,
2854 void import_and_extract_views(
2855 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2856 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2857 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2858 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2859 bool userAssertsThereAreNoRemotes,
2860 const std::string& label,
2861 const Teuchos::RCP<Teuchos::ParameterList>& params)
2863 using Teuchos::Array;
2864 using Teuchos::ArrayView;
2867 using Teuchos::null;
2870 typedef LocalOrdinal LO;
2871 typedef GlobalOrdinal GO;
2874 typedef Map<LO,GO,NO> map_type;
2875 typedef Import<LO,GO,NO> import_type;
2876 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2878 #ifdef HAVE_TPETRA_MMM_TIMINGS
2879 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2880 using Teuchos::TimeMonitor;
2881 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
2889 Aview.deleteContents();
2891 Aview.origMatrix = rcp(&A,
false);
2892 Aview.origRowMap = A.getRowMap();
2893 Aview.rowMap = targetMap;
2894 Aview.colMap = A.getColMap();
2895 Aview.domainMap = A.getDomainMap();
2896 Aview.importColMap =
null;
2897 RCP<const map_type> rowMap = A.getRowMap();
2898 const int numProcs = rowMap->getComm()->getSize();
2901 if (userAssertsThereAreNoRemotes || numProcs < 2)
2904 RCP<const import_type> importer;
2905 if (params !=
null && params->isParameter(
"importer")) {
2906 importer = params->get<RCP<const import_type> >(
"importer");
2909 #ifdef HAVE_TPETRA_MMM_TIMINGS
2911 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
2916 RCP<const map_type> remoteRowMap;
2917 size_t numRemote = 0;
2919 if (!prototypeImporter.is_null() &&
2920 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2921 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2923 #ifdef HAVE_TPETRA_MMM_TIMINGS
2924 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode1"));
2926 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2927 numRemote = prototypeImporter->getNumRemoteIDs();
2929 Array<GO> remoteRows(numRemote);
2930 for (
size_t i = 0; i < numRemote; i++)
2931 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2933 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2934 rowMap->getIndexBase(), rowMap->getComm()));
2937 }
else if (prototypeImporter.is_null()) {
2939 #ifdef HAVE_TPETRA_MMM_TIMINGS
2940 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode2"));
2942 ArrayView<const GO> rows = targetMap->getNodeElementList();
2943 size_t numRows = targetMap->getNodeNumElements();
2945 Array<GO> remoteRows(numRows);
2946 for(
size_t i = 0; i < numRows; ++i) {
2947 const LO mlid = rowMap->getLocalElement(rows[i]);
2949 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2950 remoteRows[numRemote++] = rows[i];
2952 remoteRows.resize(numRemote);
2953 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2954 rowMap->getIndexBase(), rowMap->getComm()));
2963 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2964 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2973 #ifdef HAVE_TPETRA_MMM_TIMINGS
2975 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
2979 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2981 if (globalMaxNumRemote > 0) {
2982 #ifdef HAVE_TPETRA_MMM_TIMINGS
2984 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
2988 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2990 importer = rcp(
new import_type(rowMap, remoteRowMap));
2992 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
2996 params->set(
"importer", importer);
2999 if (importer !=
null) {
3000 #ifdef HAVE_TPETRA_MMM_TIMINGS
3002 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
3006 Teuchos::ParameterList labelList;
3007 labelList.set(
"Timer Label", label);
3008 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3011 bool overrideAllreduce =
false;
3014 Teuchos::ParameterList params_sublist;
3015 if(!params.is_null()) {
3016 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3017 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3018 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3019 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3020 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3021 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3022 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3024 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3025 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3026 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3028 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3029 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3035 sprintf(str,
"import_matrix.%d.dat",count);
3040 #ifdef HAVE_TPETRA_MMM_STATISTICS
3041 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3045 #ifdef HAVE_TPETRA_MMM_TIMINGS
3047 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3051 Aview.importColMap = Aview.importMatrix->getColMap();
3052 #ifdef HAVE_TPETRA_MMM_TIMINGS
3064 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3066 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3067 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3068 const LocalOrdinalViewType & Acol2Brow,
3069 const LocalOrdinalViewType & Acol2Irow,
3070 const LocalOrdinalViewType & Bcol2Ccol,
3071 const LocalOrdinalViewType & Icol2Ccol,
3072 const size_t mergedNodeNumCols) {
3076 typedef typename KCRS::StaticCrsGraphType graph_t;
3077 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3078 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3079 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3081 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3082 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3085 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3089 RCP<const KCRS> Ik_;
3090 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3091 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3093 if(Ik!=0) Iks = *Ik;
3094 size_t merge_numrows = Ak.numCols();
3096 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3098 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3101 typedef typename Node::execution_space execution_space;
3102 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3103 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3104 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3105 if(
final) Mrowptr(i) = update;
3108 if(Acol2Brow(i)!=LO_INVALID)
3109 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3111 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3114 if(
final && i+1==merge_numrows)
3115 Mrowptr(i+1)=update;
3119 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3120 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3121 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3124 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3125 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3126 if(Acol2Brow(i)!=LO_INVALID) {
3127 size_t row = Acol2Brow(i);
3128 size_t start = Bk.graph.row_map(row);
3129 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3131 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3135 size_t row = Acol2Irow(i);
3136 size_t start = Iks.graph.row_map(row);
3137 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3138 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3139 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3144 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3153 template<
typename SC,
typename LO,
typename GO,
typename NO>
3154 void AddKernels<SC, LO, GO, NO>::
3156 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3157 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3158 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3159 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3160 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3161 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3162 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3163 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3164 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3165 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3166 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3168 using Teuchos::TimeMonitor;
3169 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3170 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3171 auto nrows = Arowptrs.extent(0) - 1;
3172 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3173 typename AddKern::KKH handle;
3174 handle.create_spadd_handle(
true);
3175 auto addHandle = handle.get_spadd_handle();
3176 #ifdef HAVE_TPETRA_MMM_TIMINGS
3177 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3179 KokkosSparse::Experimental::spadd_symbolic
3181 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3182 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3183 row_ptrs_array, col_inds_array>
3184 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3186 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3187 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3188 #ifdef HAVE_TPETRA_MMM_TIMINGS
3190 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
3192 KokkosSparse::Experimental::spadd_numeric(&handle,
3193 Arowptrs, Acolinds, Avals, scalarA,
3194 Browptrs, Bcolinds, Bvals, scalarB,
3195 Crowptrs, Ccolinds, Cvals);
3198 template<
typename SC,
typename LO,
typename GO,
typename NO>
3199 void AddKernels<SC, LO, GO, NO>::
3201 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3202 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3203 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3204 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3205 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3206 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3207 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3208 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3210 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3211 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3212 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3214 using Teuchos::TimeMonitor;
3215 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3216 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3217 auto nrows = Arowptrs.extent(0) - 1;
3218 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3219 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3220 typename AddKern::KKH handle;
3221 handle.create_spadd_handle(
false);
3222 auto addHandle = handle.get_spadd_handle();
3223 #ifdef HAVE_TPETRA_MMM_TIMINGS
3224 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3226 KokkosSparse::Experimental::spadd_symbolic
3228 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3229 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3230 row_ptrs_array, col_inds_array>
3231 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3233 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3234 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3235 #ifdef HAVE_TPETRA_MMM_TIMINGS
3237 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3239 KokkosSparse::Experimental::spadd_numeric(&handle,
3240 Arowptrs, Acolinds, Avals, scalarA,
3241 Browptrs, Bcolinds, Bvals, scalarB,
3242 Crowptrs, Ccolinds, Cvals);
3245 template<
typename GO,
3246 typename LocalIndicesType,
3247 typename GlobalIndicesType,
3248 typename ColMapType>
3249 struct ConvertLocalToGlobalFunctor
3251 ConvertLocalToGlobalFunctor(
3252 const LocalIndicesType& colindsOrig_,
3253 const GlobalIndicesType& colindsConverted_,
3254 const ColMapType& colmap_) :
3255 colindsOrig (colindsOrig_),
3256 colindsConverted (colindsConverted_),
3259 KOKKOS_INLINE_FUNCTION
void
3260 operator() (
const GO i)
const
3262 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3264 LocalIndicesType colindsOrig;
3265 GlobalIndicesType colindsConverted;
3269 template<
typename SC,
typename LO,
typename GO,
typename NO>
3270 void MMdetails::AddKernels<SC, LO, GO, NO>::
3271 convertToGlobalAndAdd(
3272 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3273 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3274 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3275 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3276 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3277 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3278 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3279 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3280 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3282 using Teuchos::TimeMonitor;
3285 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3286 typename NO::execution_space,
typename NO::memory_space,
typename NO::memory_space>;
3288 const values_array Avals = A.values;
3289 const values_array Bvals = B.values;
3290 const col_inds_array Acolinds = A.graph.entries;
3291 const col_inds_array Bcolinds = B.graph.entries;
3292 auto Arowptrs = A.graph.row_map;
3293 auto Browptrs = B.graph.row_map;
3294 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"A colinds (converted)"), Acolinds.extent(0));
3295 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"B colinds (converted)"), Bcolinds.extent(0));
3296 #ifdef HAVE_TPETRA_MMM_TIMINGS
3297 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
3299 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3300 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3301 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3302 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3304 handle.create_spadd_handle(
false);
3305 auto addHandle = handle.get_spadd_handle();
3306 #ifdef HAVE_TPETRA_MMM_TIMINGS
3308 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3310 auto nrows = Arowptrs.extent(0) - 1;
3311 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3312 KokkosSparse::Experimental::spadd_symbolic
3313 <KKH_GO,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3314 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3315 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3316 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3317 #ifdef HAVE_TPETRA_MMM_TIMINGS
3319 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3321 KokkosSparse::Experimental::spadd_numeric(&handle,
3322 Arowptrs, AcolindsConverted, Avals, scalarA,
3323 Browptrs, BcolindsConverted, Bvals, scalarB,
3324 Crowptrs, Ccolinds, Cvals);
3340 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3342 void MatrixMatrix::Multiply( \
3343 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3345 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3347 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3348 bool call_FillComplete_on_result, \
3349 const std::string & label, \
3350 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3353 void MatrixMatrix::Jacobi( \
3355 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3356 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3357 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3358 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3359 bool call_FillComplete_on_result, \
3360 const std::string & label, \
3361 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3364 void MatrixMatrix::Add( \
3365 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3368 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3371 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3374 void MatrixMatrix::Add( \
3375 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3378 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3382 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3383 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3384 (const SCALAR & alpha, \
3385 const bool transposeA, \
3386 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3387 const SCALAR & beta, \
3388 const bool transposeB, \
3389 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3390 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3391 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3392 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3396 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3397 (const SCALAR & alpha, \
3398 const bool transposeA, \
3399 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3400 const SCALAR& beta, \
3401 const bool transposeB, \
3402 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3403 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3404 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3405 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3406 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3408 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed dense vector.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.