41#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42#define TPETRA_MATRIXMATRIX_DEF_HPP
43#include "Tpetra_ConfigDefs.hpp"
45#include "Teuchos_VerboseObject.hpp"
46#include "Teuchos_Array.hpp"
48#include "Tpetra_CrsMatrix.hpp"
50#include "Tpetra_RowMatrixTransposer.hpp"
53#include "Tpetra_Details_makeColMap.hpp"
54#include "Tpetra_ConfigDefs.hpp"
55#include "Tpetra_Map.hpp"
56#include "Tpetra_Export.hpp"
61#include "Teuchos_FancyOStream.hpp"
63#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
66#include "KokkosSparse_spgemm.hpp"
67#include "KokkosSparse_spadd.hpp"
68#include "Kokkos_Bitset.hpp"
82#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83#include "TpetraExt_MatrixMatrix_Cuda.hpp"
84#include "TpetraExt_MatrixMatrix_HIP.hpp"
88namespace MatrixMatrix{
96template <
class Scalar,
107 const std::string&
label,
108 const Teuchos::RCP<Teuchos::ParameterList>&
params)
123#ifdef HAVE_TPETRA_MMM_TIMINGS
124 std::string
prefix_mmm = std::string(
"TpetraExt ") +
label + std::string(
": ");
125 using Teuchos::TimeMonitor;
131 const std::string
prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
156 const bool newFlag = !
C.getGraph()->isLocallyIndexed() && !
C.getGraph()->isGloballyIndexed();
162#ifdef USE_OLD_TRANSPOSE
166 using Teuchos::ParameterList;
194 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
195 "must match for matrix-matrix product. op(A) is "
203 prefix <<
"ERROR, dimensions of result C must "
204 "match dimensions of op(A) * op(B). C has " <<
C.getGlobalNumRows()
205 <<
" rows, should have at least " <<
Aouter << std::endl);
213 if (!
C.isFillActive())
C.resumeFill();
227#ifdef HAVE_TPETRA_MMM_TIMINGS
249#ifdef HAVE_TPETRA_MMM_TIMINGS
274#ifdef HAVE_TPETRA_MMM_TIMINGS
284 C.fillComplete(
Bprime->getDomainMap(),
Aprime->getRangeMap());
299 const std::string&
label,
300 const Teuchos::RCP<Teuchos::ParameterList>&
params)
312#ifdef HAVE_TPETRA_MMM_TIMINGS
313 std::string
prefix_mmm = std::string(
"TpetraExt ")+
label + std::string(
": ");
314 using Teuchos::TimeMonitor;
318 const std::string
prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
337 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
338 "must match for matrix-matrix product. op(A) is "
346 prefix <<
"ERROR, dimensions of result C must "
347 "match dimensions of op(A) * op(B). C has "<<
C.getGlobalNumRows()
348 <<
" rows, should have at least "<<
Aouter << std::endl);
370#ifdef HAVE_TPETRA_MMM_TIMINGS
379 importParams1->set(
"compute global constants",
params->get(
"compute global constants: temporaries",
false));
381 auto slist =
params->sublist(
"matrixmatrix: kernel params",
false);
385 bool isMM =
slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
389 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",
isMM);
407 importParams2->set(
"compute global constants",
params->get(
"compute global constants: temporaries",
false));
409 auto slist =
params->sublist(
"matrixmatrix: kernel params",
false);
411 bool isMM =
slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
417 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",
isMM);
423#ifdef HAVE_TPETRA_MMM_TIMINGS
432 bool newFlag = !
C.getGraph()->isLocallyIndexed() && !
C.getGraph()->isGloballyIndexed();
475 using Teuchos::Array;
485 const std::string
prefix =
"TpetraExt::MatrixMatrix::Add(): ";
488 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489 "(Result matrix B is not required to be isFillComplete()).");
491 prefix <<
"ERROR, input matrix B must not be fill complete!");
493 prefix <<
"ERROR, input matrix B must not have static graph!");
495 prefix <<
"ERROR, input matrix B must not be locally indexed!");
497 using Teuchos::ParameterList;
511 typename crs_matrix_type::nonconst_global_inds_host_view_type
a_inds(
"a_inds",
A.getNodeMaxNumRowEntries());
512 typename crs_matrix_type::nonconst_values_host_view_type
a_vals(
"a_vals",
A.getNodeMaxNumRowEntries());
515 if (
scalarB != Teuchos::ScalarTraits<SC>::one())
520 if (
scalarA != Teuchos::ScalarTraits<SC>::zero()) {
522 row =
B.getRowMap()->getGlobalElement(
i);
525 if (
scalarA != Teuchos::ScalarTraits<SC>::one())
541Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
550 const Teuchos::RCP<Teuchos::ParameterList>&
params)
553 using Teuchos::rcpFromRef;
555 using Teuchos::ParameterList;
560 params->isParameter(
"Call fillComplete") && !
params->get<
bool>(
"Call fillComplete"),
561 std::invalid_argument,
562 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564 params->set(
"Call fillComplete",
true);
576 (!
A.isFillComplete () || !
Brcp->isFillComplete (), std::invalid_argument,
577 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
588template<
class LO,
class GO,
class LOView,
class GOView,
class LocalMap>
589struct ConvertGlobalToLocalFunctor
595 KOKKOS_FUNCTION
void operator() (
const GO i)
const
597 lids(i) = localColMap.getLocalElement(gids(i));
602 const LocalMap localColMap;
605template <
class Scalar,
619 const Teuchos::RCP<Teuchos::ParameterList>&
params)
623 using Teuchos::rcpFromRef;
624 using Teuchos::rcp_implicit_cast;
625 using Teuchos::rcp_dynamic_cast;
626 using Teuchos::TimeMonitor;
637 using exec_space =
typename crs_graph_type::execution_space;
638 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639 const char*
prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
640 constexpr bool debug =
false;
642#ifdef HAVE_TPETRA_MMM_TIMINGS
647 std::ostringstream
os;
648 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
649 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
650 std::cerr <<
os.str ();
654 (
C.isLocallyIndexed() ||
C.isGloballyIndexed(), std::invalid_argument,
655 prefix_mmm <<
"C must be a 'new' matrix (neither locally nor globally indexed).");
657 (!
A.isFillComplete () || !
B.isFillComplete (), std::invalid_argument,
658 prefix_mmm <<
"A and B must both be fill complete.");
659#ifdef HAVE_TPETRA_DEBUG
661 if (
A.isFillComplete () &&
B.isFillComplete ()) {
664 !
A.getDomainMap()->locallySameAs (*
B.getDomainMap ())) ||
666 !
A.getDomainMap()->isSameAs (*
B.getRangeMap ())) ||
668 !
A.getRangeMap ()->isSameAs (*
B.getDomainMap ()));
670 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
674 !
A.getRangeMap ()->isSameAs (*
B.getRangeMap ())) ||
676 !
A.getRangeMap ()->isSameAs (*
B.getDomainMap())) ||
678 !
A.getDomainMap()->isSameAs (*
B.getRangeMap ()));
680 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
684 using Teuchos::ParameterList;
692#ifdef HAVE_TPETRA_DEBUG
694 (
Aprime.is_null (), std::logic_error,
696 "Please report this bug to the Tpetra developers.");
703 std::ostringstream
os;
704 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
705 <<
"Form explicit xpose of B" << std::endl;
706 std::cerr <<
os.str ();
711#ifdef HAVE_TPETRA_DEBUG
713 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
715 !
Aprime->isFillComplete () || !
Bprime->isFillComplete (), std::invalid_argument,
716 prefix_mmm <<
"Aprime and Bprime must both be fill complete. "
717 "Please report this bug to the Tpetra developers.");
731 typedef typename AddKern::values_array values_array;
732 typedef typename AddKern::row_ptrs_array row_ptrs_array;
733 typedef typename AddKern::col_inds_array col_inds_array;
739#ifdef HAVE_TPETRA_MMM_TIMINGS
743 if(!(
Aprime->getRowMap()->isSameAs(*(
Bprime->getRowMap()))))
746 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
758 if(Teuchos::nonnull(
params) &&
params->isParameter(
"Call fillComplete"))
772 rowptrs = row_ptrs_array(
"C rowptrs", 0);
778 using global_col_inds_array =
typename AddKern::global_col_inds_array;
779#ifdef HAVE_TPETRA_MMM_TIMINGS
788 std::ostringstream
os;
789 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
790 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791 std::cerr <<
os.str ();
793 AddKern::convertToGlobalAndAdd(
797 std::ostringstream
os;
798 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
799 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800 std::cerr <<
os.str ();
802#ifdef HAVE_TPETRA_MMM_TIMINGS
807 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
811 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0,
globalColinds.extent(0)),
813 col_inds_array, global_col_inds_array,
814 typename map_type::local_map_type>
816 KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(
rowptrs,
localColinds,
vals);
827 auto Arowptrs =
Alocal.graph.row_map;
828 auto Browptrs =
Blocal.graph.row_map;
829 auto Acolinds =
Alocal.graph.entries;
830 auto Bcolinds =
Blocal.graph.entries;
834#ifdef HAVE_TPETRA_MMM_TIMINGS
839 std::ostringstream
os;
840 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
841 <<
"Call AddKern::addSorted(...)" << std::endl;
842 std::cerr <<
os.str ();
844 AddKern::addSorted(
Avals, Arowptrs, Acolinds,
alpha,
Bvals, Browptrs, Bcolinds,
beta,
vals,
rowptrs,
colinds);
849#ifdef HAVE_TPETRA_MMM_TIMINGS
854 std::ostringstream
os;
855 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
856 <<
"Call AddKern::addUnsorted(...)" << std::endl;
857 std::cerr <<
os.str ();
859 AddKern::addUnsorted(
Avals, Arowptrs, Acolinds,
alpha,
Bvals, Browptrs, Bcolinds,
beta,
Aprime->getGlobalNumCols(),
vals,
rowptrs,
colinds);
867 #ifdef HAVE_TPETRA_MMM_TIMINGS
874 std::ostringstream
os;
875 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
876 <<
"Create Cimport" << std::endl;
877 std::cerr <<
os.str ();
884 std::ostringstream
os;
885 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
886 <<
"Create Cexport" << std::endl;
887 std::cerr <<
os.str ();
893 std::ostringstream
os;
894 os <<
"Proc " <<
A.getMap ()->getComm ()->getRank () <<
": "
895 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
896 std::cerr <<
os.str ();
916 using Teuchos::Array;
917 using Teuchos::ArrayRCP;
918 using Teuchos::ArrayView;
921 using Teuchos::rcp_dynamic_cast;
922 using Teuchos::rcpFromRef;
923 using Teuchos::tuple;
926 typedef Teuchos::ScalarTraits<Scalar> STS;
934 std::string
prefix =
"TpetraExt::MatrixMatrix::Add(): ";
937 prefix <<
"The case C == null does not actually work. Fixing this will require an interface change.");
940 !
A.isFillComplete () || !
B.isFillComplete (), std::invalid_argument,
941 prefix <<
"Both input matrices must be fill complete before calling this function.");
943#ifdef HAVE_TPETRA_DEBUG
950 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
957 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
961 using Teuchos::ParameterList;
975#ifdef HAVE_TPETRA_DEBUG
977 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
990#ifdef HAVE_TPETRA_DEBUG
992 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
996 if (!
C.is_null ()) {
997 C->setAllToScalar (STS::zero ());
1002 C =
rcp (
new crs_matrix_type (
Aprime->getRowMap (), 0));
1005#ifdef HAVE_TPETRA_DEBUG
1007 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1009 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1011 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1019 for (
int k = 0;
k < 2; ++
k) {
1020 typename crs_matrix_type::nonconst_global_inds_host_view_type
Indices;
1021 typename crs_matrix_type::nonconst_values_host_view_type
Values;
1029#ifdef HAVE_TPETRA_DEBUG
1031 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1034#ifdef HAVE_TPETRA_DEBUG
1036 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1042 size_t numEntries =
Mat[
k]->getNumEntriesInGlobalRow (
globalRow);
1043 if (numEntries > 0) {
1044 Kokkos::resize(
Indices,numEntries);
1045 Kokkos::resize(
Values,numEntries);
1048 if (
scalar[
k] != STS::one ()) {
1049 for (
size_t j = 0;
j < numEntries; ++
j) {
1054 if (
C->isFillComplete ()) {
1055 C->sumIntoGlobalValues (
globalRow, numEntries,
1058 C->insertGlobalValues (
globalRow, numEntries,
1072template <
class TransferType>
1073void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer,
const std::string &label) {
1074 if (Transfer.is_null())
1077 const Distributor & Distor = Transfer->getDistributor();
1078 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1080 size_t rows_send = Transfer->getNumExportIDs();
1081 size_t rows_recv = Transfer->getNumRemoteIDs();
1083 size_t round1_send = Transfer->getNumExportIDs() *
sizeof(size_t);
1084 size_t round1_recv = Transfer->getNumRemoteIDs() *
sizeof(size_t);
1085 size_t num_send_neighbors = Distor.getNumSends();
1086 size_t num_recv_neighbors = Distor.getNumReceives();
1087 size_t round2_send, round2_recv;
1088 Distor.getLastDoStatistics(round2_send,round2_recv);
1090 int myPID = Comm->getRank();
1091 int NumProcs = Comm->getSize();
1098 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1099 size_t gstats_min[8], gstats_max[8];
1101 double lstats_avg[8], gstats_avg[8];
1102 for(
int i=0; i<8; i++)
1103 lstats_avg[i] = ((
double)lstats[i])/NumProcs;
1105 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1106 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1107 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1110 printf(
"%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1111 (
int)gstats_min[0],gstats_avg[0],(
int)gstats_max[0], (
int)gstats_min[2],gstats_avg[2],(
int)gstats_max[2],
1112 (
int)gstats_min[4],gstats_avg[4],(
int)gstats_max[4], (
int)gstats_min[6],gstats_avg[6],(
int)gstats_max[6]);
1113 printf(
"%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1114 (
int)gstats_min[1],gstats_avg[1],(
int)gstats_max[1], (
int)gstats_min[3],gstats_avg[3],(
int)gstats_max[3],
1115 (
int)gstats_min[5],gstats_avg[5],(
int)gstats_max[5], (
int)gstats_min[7],gstats_avg[7],(
int)gstats_max[7]);
1120template<
class Scalar,
1122 class GlobalOrdinal,
1124void mult_AT_B_newmatrix(
1125 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1126 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1127 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1128 const std::string & label,
1129 const Teuchos::RCP<Teuchos::ParameterList>& params)
1134 typedef LocalOrdinal LO;
1135 typedef GlobalOrdinal GO;
1137 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1138 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1140#ifdef HAVE_TPETRA_MMM_TIMINGS
1141 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1142 using Teuchos::TimeMonitor;
1143 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1144 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1150 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1152 using Teuchos::ParameterList;
1153 RCP<ParameterList> transposeParams (
new ParameterList);
1154 transposeParams->set (
"sort",
false);
1155 if(! params.is_null ()) {
1156 transposeParams->set (
"compute global constants",
1157 params->get (
"compute global constants: temporaries",
1160 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1161 transposer.createTransposeLocal (transposeParams);
1166#ifdef HAVE_TPETRA_MMM_TIMINGS
1168 MM = rcp (
new TimeMonitor
1169 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1173 crs_matrix_struct_type Aview;
1174 crs_matrix_struct_type Bview;
1175 RCP<const Import<LO, GO, NO> > dummyImporter;
1178 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1179 if (! params.is_null ()) {
1180 importParams1->set (
"compute global constants",
1181 params->get (
"compute global constants: temporaries",
1183 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1184 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1185 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1186 int mm_optimization_core_count =
1188 mm_optimization_core_count =
1189 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190 int mm_optimization_core_count2 =
1191 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1192 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1193 mm_optimization_core_count = mm_optimization_core_count2;
1195 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1196 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1197 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1198 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1201 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1202 Aview, dummyImporter,
true,
1203 label, importParams1);
1205 RCP<ParameterList> importParams2 (
new ParameterList);
1206 if (! params.is_null ()) {
1207 importParams2->set (
"compute global constants",
1208 params->get (
"compute global constants: temporaries",
1210 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1211 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1212 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1213 int mm_optimization_core_count =
1215 mm_optimization_core_count =
1216 slist.get (
"MM_TAFC_OptimizationCoreCount",
1217 mm_optimization_core_count);
1218 int mm_optimization_core_count2 =
1219 params->get (
"MM_TAFC_OptimizationCoreCount",
1220 mm_optimization_core_count);
1221 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1222 mm_optimization_core_count = mm_optimization_core_count2;
1224 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1225 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1226 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1227 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1230 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1231 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1234 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1237#ifdef HAVE_TPETRA_MMM_TIMINGS
1239 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1242 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1245 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1246 if (needs_final_export) {
1250 Ctemp = rcp (&C,
false);
1253 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1258#ifdef HAVE_TPETRA_MMM_TIMINGS
1260 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1263 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1265 if (needs_final_export) {
1266 ParameterList labelList;
1267 labelList.set(
"Timer Label", label);
1268 if(!params.is_null()) {
1269 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1270 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1272 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1273 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1274 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1275 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1276 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1277 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1279 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1280 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1281 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1282 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1284 Ctemp->exportAndFillComplete (Crcp,
1285 *Ctemp->getGraph ()->getExporter (),
1288 rcp (&labelList,
false));
1290#ifdef HAVE_TPETRA_MMM_STATISTICS
1291 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1297template<
class Scalar,
1299 class GlobalOrdinal,
1302 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1303 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1304 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1305 const std::string& ,
1306 const Teuchos::RCP<Teuchos::ParameterList>& )
1308 using Teuchos::Array;
1309 using Teuchos::ArrayRCP;
1310 using Teuchos::ArrayView;
1311 using Teuchos::OrdinalTraits;
1312 using Teuchos::null;
1314 typedef Teuchos::ScalarTraits<Scalar> STS;
1316 LocalOrdinal C_firstCol = Bview.
colMap->getMinLocalIndex();
1317 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1319 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1320 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1322 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1323 ArrayView<const GlobalOrdinal> bcols_import = null;
1324 if (Bview.importColMap != null) {
1325 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1326 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1328 bcols_import = Bview.importColMap->getNodeElementList();
1331 size_t C_numCols = C_lastCol - C_firstCol +
1332 OrdinalTraits<LocalOrdinal>::one();
1333 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1334 OrdinalTraits<LocalOrdinal>::one();
1336 if (C_numCols_import > C_numCols)
1337 C_numCols = C_numCols_import;
1339 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1340 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1341 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1343 Array<Scalar> C_row_i = dwork;
1344 Array<GlobalOrdinal> C_cols = iwork;
1345 Array<size_t> c_index = iwork2;
1346 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1347 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1349 size_t C_row_i_length, j, k, last_index;
1352 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1353 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1354 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1355 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1357 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1358 Aview.colMap->getMaxLocalIndex(); i++)
1363 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1364 Aview.colMap->getMaxLocalIndex(); i++) {
1365 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1366 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1367 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1368 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1378 ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1379 ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1380 ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1381 ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1382 ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1383 ArrayView<const Scalar> Avals, Bvals, Ivals;
1384 Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1385 Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1386 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1387 Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1388 if(!Bview.importMatrix.is_null()) {
1389 Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1390 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1393 bool C_filled = C.isFillComplete();
1395 for (
size_t i = 0; i < C_numCols; i++)
1396 c_index[i] = OrdinalTraits<size_t>::invalid();
1399 size_t Arows = Aview.rowMap->getNodeNumElements();
1400 for(
size_t i=0; i<Arows; ++i) {
1404 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1410 C_row_i_length = OrdinalTraits<size_t>::zero();
1412 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1413 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1414 const Scalar Aval = Avals[k];
1415 if (Aval == STS::zero())
1418 if (Ak == LO_INVALID)
1421 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1422 LocalOrdinal col = Bcolind[j];
1425 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1428 C_row_i[C_row_i_length] = Aval*Bvals[j];
1429 C_cols[C_row_i_length] = col;
1430 c_index[col] = C_row_i_length;
1434 C_row_i[c_index[col]] += Aval*Bvals[j];
1439 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1440 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1441 C_cols[ii] = bcols[C_cols[ii]];
1442 combined_index[ii] = C_cols[ii];
1443 combined_values[ii] = C_row_i[ii];
1445 last_index = C_row_i_length;
1451 C_row_i_length = OrdinalTraits<size_t>::zero();
1453 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1454 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1455 const Scalar Aval = Avals[k];
1456 if (Aval == STS::zero())
1459 if (Ak!=LO_INVALID)
continue;
1461 Ak = Acol2Irow[Acolind[k]];
1462 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1463 LocalOrdinal col = Icolind[j];
1466 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1469 C_row_i[C_row_i_length] = Aval*Ivals[j];
1470 C_cols[C_row_i_length] = col;
1471 c_index[col] = C_row_i_length;
1476 C_row_i[c_index[col]] += Aval*Ivals[j];
1481 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1482 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1483 C_cols[ii] = bcols_import[C_cols[ii]];
1484 combined_index[last_index] = C_cols[ii];
1485 combined_values[last_index] = C_row_i[ii];
1492 C.sumIntoGlobalValues(
1494 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1495 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1497 C.insertGlobalValues(
1499 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1500 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1506template<
class Scalar,
1508 class GlobalOrdinal,
1510void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1511 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1512 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1514 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1515 Mview.maxNumRowEntries = Mview.indices[0].size();
1517 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1518 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1519 Mview.maxNumRowEntries = Mview.indices[i].size();
1524template<
class CrsMatrixType>
1525size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1527 size_t Aest = 100, Best=100;
1528 if (A.getNodeNumEntries() >= A.getNodeNumRows())
1529 Aest = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1530 if (B.getNodeNumEntries() >= B.getNodeNumRows())
1531 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1533 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1534 nnzperrow *= nnzperrow;
1536 return (
size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1546template<
class Scalar,
1548 class GlobalOrdinal,
1550void mult_A_B_newmatrix(
1551 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1552 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1553 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1554 const std::string& label,
1555 const Teuchos::RCP<Teuchos::ParameterList>& params)
1557 using Teuchos::Array;
1558 using Teuchos::ArrayRCP;
1559 using Teuchos::ArrayView;
1564 typedef LocalOrdinal LO;
1565 typedef GlobalOrdinal GO;
1567 typedef Import<LO,GO,NO> import_type;
1568 typedef Map<LO,GO,NO> map_type;
1571 typedef typename map_type::local_map_type local_map_type;
1573 typedef typename KCRS::StaticCrsGraphType graph_t;
1574 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1575 typedef typename NO::execution_space execution_space;
1576 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1577 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1579#ifdef HAVE_TPETRA_MMM_TIMINGS
1580 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1581 using Teuchos::TimeMonitor;
1582 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1584 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1587 RCP<const import_type> Cimport;
1588 RCP<const map_type> Ccolmap;
1589 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1590 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1591 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1592 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1593 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1594 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1595 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1596 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1603 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1605 if (Bview.importMatrix.is_null()) {
1608 Ccolmap = Bview.colMap;
1609 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getNodeNumElements());
1611 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1612 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1613 KOKKOS_LAMBDA(
const LO i) {
1626 if (!Bimport.is_null() && !Iimport.is_null()) {
1627 Cimport = Bimport->setUnion(*Iimport);
1629 else if (!Bimport.is_null() && Iimport.is_null()) {
1630 Cimport = Bimport->setUnion();
1632 else if (Bimport.is_null() && !Iimport.is_null()) {
1633 Cimport = Iimport->setUnion();
1636 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1638 Ccolmap = Cimport->getTargetMap();
1643 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1644 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1651 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1652 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1653 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1654 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1656 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1657 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1666 C.replaceColMap(Ccolmap);
1684 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1685 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1687 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
1688 GO aidx = Acolmap_local.getGlobalElement(i);
1689 LO B_LID = Browmap_local.getLocalElement(aidx);
1690 if (B_LID != LO_INVALID) {
1691 targetMapToOrigRow(i) = B_LID;
1692 targetMapToImportRow(i) = LO_INVALID;
1694 LO I_LID = Irowmap_local.getLocalElement(aidx);
1695 targetMapToOrigRow(i) = LO_INVALID;
1696 targetMapToImportRow(i) = I_LID;
1703 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1710template<
class Scalar,
1712 class GlobalOrdinal,
1714 class LocalOrdinalViewType>
1715void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1716 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1717 const LocalOrdinalViewType & targetMapToOrigRow,
1718 const LocalOrdinalViewType & targetMapToImportRow,
1719 const LocalOrdinalViewType & Bcol2Ccol,
1720 const LocalOrdinalViewType & Icol2Ccol,
1721 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1722 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1723 const std::string& label,
1724 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1725#ifdef HAVE_TPETRA_MMM_TIMINGS
1726 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1727 using Teuchos::TimeMonitor;
1728 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
1731 using Teuchos::Array;
1732 using Teuchos::ArrayRCP;
1733 using Teuchos::ArrayView;
1738 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1739 typedef typename KCRS::StaticCrsGraphType graph_t;
1740 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1741 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1742 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1743 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1746 typedef LocalOrdinal LO;
1747 typedef GlobalOrdinal GO;
1749 typedef Map<LO,GO,NO> map_type;
1750 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1751 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1752 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1755 RCP<const map_type> Ccolmap = C.getColMap();
1756 size_t m = Aview.origMatrix->getNodeNumRows();
1757 size_t n = Ccolmap->getNodeNumElements();
1758 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1761 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1762 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
1764 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1765 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1766 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1768 c_lno_view_t Irowptr;
1769 lno_nnz_view_t Icolind;
1770 scalar_view_t Ivals;
1771 if(!Bview.importMatrix.is_null()) {
1772 auto lclB = Bview.importMatrix->getLocalMatrixHost();
1773 Irowptr = lclB.graph.row_map;
1774 Icolind = lclB.graph.entries;
1775 Ivals = lclB.values;
1776 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1779#ifdef HAVE_TPETRA_MMM_TIMINGS
1780 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
1790 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1791 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
1792 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
1793 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
1803 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1804 std::vector<size_t> c_status(n, ST_INVALID);
1814 size_t CSR_ip = 0, OLD_ip = 0;
1815 for (
size_t i = 0; i < m; i++) {
1818 Crowptr[i] = CSR_ip;
1821 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1822 LO Aik = Acolind[k];
1823 const SC Aval = Avals[k];
1824 if (Aval == SC_ZERO)
1827 if (targetMapToOrigRow[Aik] != LO_INVALID) {
1834 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
1837 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1838 LO Bkj = Bcolind[j];
1839 LO Cij = Bcol2Ccol[Bkj];
1841 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1843 c_status[Cij] = CSR_ip;
1844 Ccolind[CSR_ip] = Cij;
1845 Cvals[CSR_ip] = Aval*Bvals[j];
1849 Cvals[c_status[Cij]] += Aval*Bvals[j];
1860 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
1861 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1862 LO Ikj = Icolind[j];
1863 LO Cij = Icol2Ccol[Ikj];
1865 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1867 c_status[Cij] = CSR_ip;
1868 Ccolind[CSR_ip] = Cij;
1869 Cvals[CSR_ip] = Aval*Ivals[j];
1872 Cvals[c_status[Cij]] += Aval*Ivals[j];
1879 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1881 Kokkos::resize(Ccolind,CSR_alloc);
1882 Kokkos::resize(Cvals,CSR_alloc);
1887 Crowptr[m] = CSR_ip;
1890 Kokkos::resize(Ccolind,CSR_ip);
1891 Kokkos::resize(Cvals,CSR_ip);
1893#ifdef HAVE_TPETRA_MMM_TIMINGS
1895 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
1899 if (params.is_null() || params->get(
"sort entries",
true))
1900 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1901 C.setAllValues(Crowptr,Ccolind, Cvals);
1904#ifdef HAVE_TPETRA_MMM_TIMINGS
1906 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
1918 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1919 labelList->set(
"Timer Label",label);
1920 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1921 RCP<const Export<LO,GO,NO> > dummyExport;
1922 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1923#ifdef HAVE_TPETRA_MMM_TIMINGS
1925 MM2 = Teuchos::null;
1935template<
class Scalar,
1937 class GlobalOrdinal,
1940 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1941 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1942 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1943 const std::string& label,
1944 const Teuchos::RCP<Teuchos::ParameterList>& params)
1946 using Teuchos::Array;
1947 using Teuchos::ArrayRCP;
1948 using Teuchos::ArrayView;
1953 typedef LocalOrdinal LO;
1954 typedef GlobalOrdinal GO;
1956 typedef Import<LO,GO,NO> import_type;
1957 typedef Map<LO,GO,NO> map_type;
1960 typedef typename map_type::local_map_type local_map_type;
1962 typedef typename KCRS::StaticCrsGraphType graph_t;
1963 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1964 typedef typename NO::execution_space execution_space;
1965 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1966 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1968#ifdef HAVE_TPETRA_MMM_TIMINGS
1969 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1970 using Teuchos::TimeMonitor;
1971 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
1973 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1976 RCP<const import_type> Cimport = C.getGraph()->getImporter();
1977 RCP<const map_type> Ccolmap = C.getColMap();
1978 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1979 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1980 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1981 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1982 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1983 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1986 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1990 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1991 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1994 if (!Bview.importMatrix.is_null()) {
1995 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1996 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1998 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1999 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2000 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2006 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2007 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2008 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2009 GO aidx = Acolmap_local.getGlobalElement(i);
2010 LO B_LID = Browmap_local.getLocalElement(aidx);
2011 if (B_LID != LO_INVALID) {
2012 targetMapToOrigRow(i) = B_LID;
2013 targetMapToImportRow(i) = LO_INVALID;
2015 LO I_LID = Irowmap_local.getLocalElement(aidx);
2016 targetMapToOrigRow(i) = LO_INVALID;
2017 targetMapToImportRow(i) = I_LID;
2024 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2028template<
class Scalar,
2030 class GlobalOrdinal,
2032 class LocalOrdinalViewType>
2033void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2034 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2035 const LocalOrdinalViewType & targetMapToOrigRow,
2036 const LocalOrdinalViewType & targetMapToImportRow,
2037 const LocalOrdinalViewType & Bcol2Ccol,
2038 const LocalOrdinalViewType & Icol2Ccol,
2039 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2040 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2041 const std::string& label,
2042 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2043#ifdef HAVE_TPETRA_MMM_TIMINGS
2044 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2045 using Teuchos::TimeMonitor;
2046 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2047 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2056 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2057 typedef typename KCRS::StaticCrsGraphType graph_t;
2058 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2059 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2060 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2063 typedef LocalOrdinal LO;
2064 typedef GlobalOrdinal GO;
2066 typedef Map<LO,GO,NO> map_type;
2067 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2068 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2069 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2072 RCP<const map_type> Ccolmap = C.getColMap();
2073 size_t m = Aview.origMatrix->getNodeNumRows();
2074 size_t n = Ccolmap->getNodeNumElements();
2077 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2078 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2079 const KCRS & Cmat = C.getLocalMatrixHost();
2081 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2082 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2083 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2084 scalar_view_t Cvals = Cmat.values;
2086 c_lno_view_t Irowptr;
2087 lno_nnz_view_t Icolind;
2088 scalar_view_t Ivals;
2089 if(!Bview.importMatrix.is_null()) {
2090 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2091 Irowptr = lclB.graph.row_map;
2092 Icolind = lclB.graph.entries;
2093 Ivals = lclB.values;
2096#ifdef HAVE_TPETRA_MMM_TIMINGS
2097 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2109 std::vector<size_t> c_status(n, ST_INVALID);
2112 size_t CSR_ip = 0, OLD_ip = 0;
2113 for (
size_t i = 0; i < m; i++) {
2116 OLD_ip = Crowptr[i];
2117 CSR_ip = Crowptr[i+1];
2118 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2119 c_status[Ccolind[k]] = k;
2125 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2126 LO Aik = Acolind[k];
2127 const SC Aval = Avals[k];
2128 if (Aval == SC_ZERO)
2131 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2133 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2135 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2136 LO Bkj = Bcolind[j];
2137 LO Cij = Bcol2Ccol[Bkj];
2139 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2140 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2141 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2143 Cvals[c_status[Cij]] += Aval * Bvals[j];
2148 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2149 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2150 LO Ikj = Icolind[j];
2151 LO Cij = Icol2Ccol[Ikj];
2153 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2154 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2155 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2157 Cvals[c_status[Cij]] += Aval * Ivals[j];
2163#ifdef HAVE_TPETRA_MMM_TIMINGS
2164 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2167 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2173template<
class Scalar,
2175 class GlobalOrdinal,
2177void jacobi_A_B_newmatrix(
2179 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2180 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2181 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2182 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2183 const std::string& label,
2184 const Teuchos::RCP<Teuchos::ParameterList>& params)
2186 using Teuchos::Array;
2187 using Teuchos::ArrayRCP;
2188 using Teuchos::ArrayView;
2192 typedef LocalOrdinal LO;
2193 typedef GlobalOrdinal GO;
2196 typedef Import<LO,GO,NO> import_type;
2197 typedef Map<LO,GO,NO> map_type;
2198 typedef typename map_type::local_map_type local_map_type;
2202 typedef typename KCRS::StaticCrsGraphType graph_t;
2203 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2204 typedef typename NO::execution_space execution_space;
2205 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2206 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2209#ifdef HAVE_TPETRA_MMM_TIMINGS
2210 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2211 using Teuchos::TimeMonitor;
2212 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2214 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2217 RCP<const import_type> Cimport;
2218 RCP<const map_type> Ccolmap;
2219 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2220 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2221 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2222 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2223 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2224 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2225 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2226 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2233 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2235 if (Bview.importMatrix.is_null()) {
2238 Ccolmap = Bview.colMap;
2242 Kokkos::RangePolicy<execution_space, LO> range (0,
static_cast<LO
> (Bview.colMap->getNodeNumElements ()));
2243 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2244 Bcol2Ccol(i) =
static_cast<LO
> (i);
2255 if (!Bimport.is_null() && !Iimport.is_null()){
2256 Cimport = Bimport->setUnion(*Iimport);
2257 Ccolmap = Cimport->getTargetMap();
2259 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2260 Cimport = Bimport->setUnion();
2262 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2263 Cimport = Iimport->setUnion();
2266 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2268 Ccolmap = Cimport->getTargetMap();
2270 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2271 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2278 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2279 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2280 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2281 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2283 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2284 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2293 C.replaceColMap(Ccolmap);
2311 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2312 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2313 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2314 GO aidx = Acolmap_local.getGlobalElement(i);
2315 LO B_LID = Browmap_local.getLocalElement(aidx);
2316 if (B_LID != LO_INVALID) {
2317 targetMapToOrigRow(i) = B_LID;
2318 targetMapToImportRow(i) = LO_INVALID;
2320 LO I_LID = Irowmap_local.getLocalElement(aidx);
2321 targetMapToOrigRow(i) = LO_INVALID;
2322 targetMapToImportRow(i) = I_LID;
2329 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2338template<
class Scalar,
2340 class GlobalOrdinal,
2342 class LocalOrdinalViewType>
2343void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2344 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2345 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2346 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2347 const LocalOrdinalViewType & targetMapToOrigRow,
2348 const LocalOrdinalViewType & targetMapToImportRow,
2349 const LocalOrdinalViewType & Bcol2Ccol,
2350 const LocalOrdinalViewType & Icol2Ccol,
2351 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2352 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2353 const std::string& label,
2354 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2356#ifdef HAVE_TPETRA_MMM_TIMINGS
2357 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2358 using Teuchos::TimeMonitor;
2359 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2362 using Teuchos::Array;
2363 using Teuchos::ArrayRCP;
2364 using Teuchos::ArrayView;
2369 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2370 typedef typename KCRS::StaticCrsGraphType graph_t;
2371 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2372 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2373 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2374 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2377 typedef typename scalar_view_t::memory_space scalar_memory_space;
2380 typedef LocalOrdinal LO;
2381 typedef GlobalOrdinal GO;
2384 typedef Map<LO,GO,NO> map_type;
2385 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2386 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2389 RCP<const map_type> Ccolmap = C.getColMap();
2390 size_t m = Aview.origMatrix->getNodeNumRows();
2391 size_t n = Ccolmap->getNodeNumElements();
2392 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2395 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2396 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2398 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2399 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2400 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2402 c_lno_view_t Irowptr;
2403 lno_nnz_view_t Icolind;
2404 scalar_view_t Ivals;
2405 if(!Bview.importMatrix.is_null()) {
2406 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2407 Irowptr = lclB.graph.row_map;
2408 Icolind = lclB.graph.entries;
2409 Ivals = lclB.values;
2410 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2415 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2423 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2424 Array<size_t> c_status(n, ST_INVALID);
2433 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2434 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2435 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2436 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2437 size_t CSR_ip = 0, OLD_ip = 0;
2439 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2453 for (
size_t i = 0; i < m; i++) {
2456 Crowptr[i] = CSR_ip;
2457 SC minusOmegaDval = -omega*Dvals(i,0);
2460 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2461 Scalar Bval = Bvals[j];
2462 if (Bval == SC_ZERO)
2464 LO Bij = Bcolind[j];
2465 LO Cij = Bcol2Ccol[Bij];
2468 c_status[Cij] = CSR_ip;
2469 Ccolind[CSR_ip] = Cij;
2470 Cvals[CSR_ip] = Bvals[j];
2475 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2476 LO Aik = Acolind[k];
2477 const SC Aval = Avals[k];
2478 if (Aval == SC_ZERO)
2481 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2483 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2485 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2486 LO Bkj = Bcolind[j];
2487 LO Cij = Bcol2Ccol[Bkj];
2489 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2491 c_status[Cij] = CSR_ip;
2492 Ccolind[CSR_ip] = Cij;
2493 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2497 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2503 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2504 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2505 LO Ikj = Icolind[j];
2506 LO Cij = Icol2Ccol[Ikj];
2508 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2510 c_status[Cij] = CSR_ip;
2511 Ccolind[CSR_ip] = Cij;
2512 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2515 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2522 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2524 Kokkos::resize(Ccolind,CSR_alloc);
2525 Kokkos::resize(Cvals,CSR_alloc);
2529 Crowptr[m] = CSR_ip;
2532 Kokkos::resize(Ccolind,CSR_ip);
2533 Kokkos::resize(Cvals,CSR_ip);
2536#ifdef HAVE_TPETRA_MMM_TIMINGS
2537 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2544 C.replaceColMap(Ccolmap);
2551 if (params.is_null() || params->get(
"sort entries",
true))
2552 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2553 C.setAllValues(Crowptr,Ccolind, Cvals);
2556#ifdef HAVE_TPETRA_MMM_TIMINGS
2557 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2568 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2569 labelList->set(
"Timer Label",label);
2570 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2571 RCP<const Export<LO,GO,NO> > dummyExport;
2572 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2580template<
class Scalar,
2582 class GlobalOrdinal,
2584void jacobi_A_B_reuse(
2586 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2587 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2588 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2589 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2590 const std::string& label,
2591 const Teuchos::RCP<Teuchos::ParameterList>& params)
2593 using Teuchos::Array;
2594 using Teuchos::ArrayRCP;
2595 using Teuchos::ArrayView;
2599 typedef LocalOrdinal LO;
2600 typedef GlobalOrdinal GO;
2603 typedef Import<LO,GO,NO> import_type;
2604 typedef Map<LO,GO,NO> map_type;
2607 typedef typename map_type::local_map_type local_map_type;
2609 typedef typename KCRS::StaticCrsGraphType graph_t;
2610 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2611 typedef typename NO::execution_space execution_space;
2612 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2613 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2615#ifdef HAVE_TPETRA_MMM_TIMINGS
2616 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2617 using Teuchos::TimeMonitor;
2618 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2620 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2623 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2624 RCP<const map_type> Ccolmap = C.getColMap();
2625 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2626 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2627 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2628 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2629 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2630 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2633 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2637 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2638 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2641 if (!Bview.importMatrix.is_null()) {
2642 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2643 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2645 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2646 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2647 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2653 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2654 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2655 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2656 GO aidx = Acolmap_local.getGlobalElement(i);
2657 LO B_LID = Browmap_local.getLocalElement(aidx);
2658 if (B_LID != LO_INVALID) {
2659 targetMapToOrigRow(i) = B_LID;
2660 targetMapToImportRow(i) = LO_INVALID;
2662 LO I_LID = Irowmap_local.getLocalElement(aidx);
2663 targetMapToOrigRow(i) = LO_INVALID;
2664 targetMapToImportRow(i) = I_LID;
2669#ifdef HAVE_TPETRA_MMM_TIMINGS
2675 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2681template<
class Scalar,
2683 class GlobalOrdinal,
2685 class LocalOrdinalViewType>
2686void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2687 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2688 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2689 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2690 const LocalOrdinalViewType & targetMapToOrigRow,
2691 const LocalOrdinalViewType & targetMapToImportRow,
2692 const LocalOrdinalViewType & Bcol2Ccol,
2693 const LocalOrdinalViewType & Icol2Ccol,
2694 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2695 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2696 const std::string& label,
2697 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2698#ifdef HAVE_TPETRA_MMM_TIMINGS
2699 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2700 using Teuchos::TimeMonitor;
2701 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
2702 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2710 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2711 typedef typename KCRS::StaticCrsGraphType graph_t;
2712 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2713 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2714 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2715 typedef typename scalar_view_t::memory_space scalar_memory_space;
2718 typedef LocalOrdinal LO;
2719 typedef GlobalOrdinal GO;
2721 typedef Map<LO,GO,NO> map_type;
2722 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2723 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2724 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2727 RCP<const map_type> Ccolmap = C.getColMap();
2728 size_t m = Aview.origMatrix->getNodeNumRows();
2729 size_t n = Ccolmap->getNodeNumElements();
2732 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2733 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2734 const KCRS & Cmat = C.getLocalMatrixHost();
2736 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2737 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2738 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2739 scalar_view_t Cvals = Cmat.values;
2741 c_lno_view_t Irowptr;
2742 lno_nnz_view_t Icolind;
2743 scalar_view_t Ivals;
2744 if(!Bview.importMatrix.is_null()) {
2745 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2746 Irowptr = lclB.graph.row_map;
2747 Icolind = lclB.graph.entries;
2748 Ivals = lclB.values;
2753 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2755#ifdef HAVE_TPETRA_MMM_TIMINGS
2756 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
2764 std::vector<size_t> c_status(n, ST_INVALID);
2767 size_t CSR_ip = 0, OLD_ip = 0;
2768 for (
size_t i = 0; i < m; i++) {
2772 OLD_ip = Crowptr[i];
2773 CSR_ip = Crowptr[i+1];
2774 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2775 c_status[Ccolind[k]] = k;
2781 SC minusOmegaDval = -omega*Dvals(i,0);
2784 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2785 Scalar Bval = Bvals[j];
2786 if (Bval == SC_ZERO)
2788 LO Bij = Bcolind[j];
2789 LO Cij = Bcol2Ccol[Bij];
2791 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2792 std::runtime_error,
"Trying to insert a new entry into a static graph");
2794 Cvals[c_status[Cij]] = Bvals[j];
2798 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2799 LO Aik = Acolind[k];
2800 const SC Aval = Avals[k];
2801 if (Aval == SC_ZERO)
2804 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2806 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2808 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2809 LO Bkj = Bcolind[j];
2810 LO Cij = Bcol2Ccol[Bkj];
2812 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2813 std::runtime_error,
"Trying to insert a new entry into a static graph");
2815 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2820 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2821 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2822 LO Ikj = Icolind[j];
2823 LO Cij = Icol2Ccol[Ikj];
2825 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2826 std::runtime_error,
"Trying to insert a new entry into a static graph");
2828 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2834#ifdef HAVE_TPETRA_MMM_TIMINGS
2835 MM2 = Teuchos::null;
2836 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
2839 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2840#ifdef HAVE_TPETRA_MMM_TIMINGS
2841 MM2 = Teuchos::null;
2850template<
class Scalar,
2852 class GlobalOrdinal,
2854void import_and_extract_views(
2855 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2856 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2857 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2858 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2859 bool userAssertsThereAreNoRemotes,
2860 const std::string& label,
2861 const Teuchos::RCP<Teuchos::ParameterList>& params)
2863 using Teuchos::Array;
2864 using Teuchos::ArrayView;
2867 using Teuchos::null;
2870 typedef LocalOrdinal LO;
2871 typedef GlobalOrdinal GO;
2874 typedef Map<LO,GO,NO> map_type;
2875 typedef Import<LO,GO,NO> import_type;
2876 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2878#ifdef HAVE_TPETRA_MMM_TIMINGS
2879 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2880 using Teuchos::TimeMonitor;
2881 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
2889 Aview.deleteContents();
2891 Aview.origMatrix = rcp(&A,
false);
2892 Aview.origRowMap = A.getRowMap();
2893 Aview.rowMap = targetMap;
2894 Aview.colMap = A.getColMap();
2895 Aview.domainMap = A.getDomainMap();
2896 Aview.importColMap = null;
2897 RCP<const map_type> rowMap = A.getRowMap();
2898 const int numProcs = rowMap->getComm()->getSize();
2901 if (userAssertsThereAreNoRemotes || numProcs < 2)
2904 RCP<const import_type> importer;
2905 if (params != null && params->isParameter(
"importer")) {
2906 importer = params->get<RCP<const import_type> >(
"importer");
2909#ifdef HAVE_TPETRA_MMM_TIMINGS
2911 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
2916 RCP<const map_type> remoteRowMap;
2917 size_t numRemote = 0;
2919 if (!prototypeImporter.is_null() &&
2920 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2921 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2923#ifdef HAVE_TPETRA_MMM_TIMINGS
2924 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode1"));
2926 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2927 numRemote = prototypeImporter->getNumRemoteIDs();
2929 Array<GO> remoteRows(numRemote);
2930 for (
size_t i = 0; i < numRemote; i++)
2931 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2933 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2934 rowMap->getIndexBase(), rowMap->getComm()));
2937 }
else if (prototypeImporter.is_null()) {
2939#ifdef HAVE_TPETRA_MMM_TIMINGS
2940 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode2"));
2942 ArrayView<const GO> rows = targetMap->getNodeElementList();
2943 size_t numRows = targetMap->getNodeNumElements();
2945 Array<GO> remoteRows(numRows);
2946 for(
size_t i = 0; i < numRows; ++i) {
2947 const LO mlid = rowMap->getLocalElement(rows[i]);
2949 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2950 remoteRows[numRemote++] = rows[i];
2952 remoteRows.resize(numRemote);
2953 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2954 rowMap->getIndexBase(), rowMap->getComm()));
2963 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2964 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2973#ifdef HAVE_TPETRA_MMM_TIMINGS
2975 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
2979 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2981 if (globalMaxNumRemote > 0) {
2982#ifdef HAVE_TPETRA_MMM_TIMINGS
2984 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
2988 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2990 importer = rcp(
new import_type(rowMap, remoteRowMap));
2992 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
2996 params->set(
"importer", importer);
2999 if (importer != null) {
3000#ifdef HAVE_TPETRA_MMM_TIMINGS
3002 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
3006 Teuchos::ParameterList labelList;
3007 labelList.set(
"Timer Label", label);
3008 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3011 bool overrideAllreduce =
false;
3014 Teuchos::ParameterList params_sublist;
3015 if(!params.is_null()) {
3016 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3017 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3018 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3019 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3020 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3021 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3022 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3024 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3025 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3026 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3029 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3035 sprintf(str,
"import_matrix.%d.dat",count);
3040#ifdef HAVE_TPETRA_MMM_STATISTICS
3041 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3045#ifdef HAVE_TPETRA_MMM_TIMINGS
3047 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3051 Aview.importColMap = Aview.importMatrix->getColMap();
3052#ifdef HAVE_TPETRA_MMM_TIMINGS
3064template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3066merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3067 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3068 const LocalOrdinalViewType & Acol2Brow,
3069 const LocalOrdinalViewType & Acol2Irow,
3070 const LocalOrdinalViewType & Bcol2Ccol,
3071 const LocalOrdinalViewType & Icol2Ccol,
3072 const size_t mergedNodeNumCols) {
3076 typedef typename KCRS::StaticCrsGraphType graph_t;
3077 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3078 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3079 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3081 const KCRS & Ak = Aview.
origMatrix->getLocalMatrixDevice();
3082 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3085 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3089 RCP<const KCRS> Ik_;
3090 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3091 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3093 if(Ik!=0) Iks = *Ik;
3094 size_t merge_numrows = Ak.numCols();
3096 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3098 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3101 typedef typename Node::execution_space execution_space;
3102 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3103 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3104 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3105 if(
final) Mrowptr(i) = update;
3108 if(Acol2Brow(i)!=LO_INVALID)
3109 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3111 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3114 if(
final && i+1==merge_numrows)
3115 Mrowptr(i+1)=update;
3119 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3120 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3121 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3124 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3125 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3126 if(Acol2Brow(i)!=LO_INVALID) {
3127 size_t row = Acol2Brow(i);
3128 size_t start = Bk.graph.row_map(row);
3129 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3131 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3135 size_t row = Acol2Irow(i);
3136 size_t start = Iks.graph.row_map(row);
3137 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3138 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3139 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3144 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3153template<
typename SC,
typename LO,
typename GO,
typename NO>
3154void AddKernels<SC, LO, GO, NO>::
3156 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3157 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3158 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3159 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3160 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3161 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3162 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3163 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3164 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3165 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3166 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3168 using Teuchos::TimeMonitor;
3169 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3170 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3171 auto nrows = Arowptrs.extent(0) - 1;
3172 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3173 typename AddKern::KKH handle;
3174 handle.create_spadd_handle(
true);
3175 auto addHandle = handle.get_spadd_handle();
3176#ifdef HAVE_TPETRA_MMM_TIMINGS
3177 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3179 KokkosSparse::Experimental::spadd_symbolic
3181 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3182 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3183 row_ptrs_array, col_inds_array>
3184 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3186 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3187 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3188#ifdef HAVE_TPETRA_MMM_TIMINGS
3190 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
3192 KokkosSparse::Experimental::spadd_numeric(&handle,
3193 Arowptrs, Acolinds, Avals, scalarA,
3194 Browptrs, Bcolinds, Bvals, scalarB,
3195 Crowptrs, Ccolinds, Cvals);
3198template<
typename SC,
typename LO,
typename GO,
typename NO>
3199void AddKernels<SC, LO, GO, NO>::
3201 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3202 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3203 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3204 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3205 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3206 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3207 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3208 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3210 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3211 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3212 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3214 using Teuchos::TimeMonitor;
3215 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3216 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3217 auto nrows = Arowptrs.extent(0) - 1;
3218 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3219 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3220 typename AddKern::KKH handle;
3221 handle.create_spadd_handle(
false);
3222 auto addHandle = handle.get_spadd_handle();
3223#ifdef HAVE_TPETRA_MMM_TIMINGS
3224 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3226 KokkosSparse::Experimental::spadd_symbolic
3228 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3229 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3230 row_ptrs_array, col_inds_array>
3231 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3233 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3234 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3235#ifdef HAVE_TPETRA_MMM_TIMINGS
3237 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3239 KokkosSparse::Experimental::spadd_numeric(&handle,
3240 Arowptrs, Acolinds, Avals, scalarA,
3241 Browptrs, Bcolinds, Bvals, scalarB,
3242 Crowptrs, Ccolinds, Cvals);
3245template<
typename GO,
3246 typename LocalIndicesType,
3247 typename GlobalIndicesType,
3248 typename ColMapType>
3249struct ConvertLocalToGlobalFunctor
3251 ConvertLocalToGlobalFunctor(
3252 const LocalIndicesType& colindsOrig_,
3253 const GlobalIndicesType& colindsConverted_,
3254 const ColMapType& colmap_) :
3255 colindsOrig (colindsOrig_),
3256 colindsConverted (colindsConverted_),
3259 KOKKOS_INLINE_FUNCTION
void
3260 operator() (
const GO i)
const
3262 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3264 LocalIndicesType colindsOrig;
3265 GlobalIndicesType colindsConverted;
3269template<
typename SC,
typename LO,
typename GO,
typename NO>
3270void MMdetails::AddKernels<SC, LO, GO, NO>::
3271convertToGlobalAndAdd(
3272 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3273 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3274 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3275 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3276 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3277 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3278 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3279 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3280 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3282 using Teuchos::TimeMonitor;
3285 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3286 typename NO::execution_space,
typename NO::memory_space,
typename NO::memory_space>;
3288 const values_array Avals = A.values;
3289 const values_array Bvals = B.values;
3290 const col_inds_array Acolinds = A.graph.entries;
3291 const col_inds_array Bcolinds = B.graph.entries;
3292 auto Arowptrs = A.graph.row_map;
3293 auto Browptrs = B.graph.row_map;
3294 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"A colinds (converted)"), Acolinds.extent(0));
3295 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"B colinds (converted)"), Bcolinds.extent(0));
3296#ifdef HAVE_TPETRA_MMM_TIMINGS
3297 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
3299 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3300 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3301 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3302 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3304 handle.create_spadd_handle(
false);
3305 auto addHandle = handle.get_spadd_handle();
3306#ifdef HAVE_TPETRA_MMM_TIMINGS
3308 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3310 auto nrows = Arowptrs.extent(0) - 1;
3311 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3312 KokkosSparse::Experimental::spadd_symbolic
3313 <KKH_GO,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3314 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3315 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3316 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3317#ifdef HAVE_TPETRA_MMM_TIMINGS
3319 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3321 KokkosSparse::Experimental::spadd_numeric(&handle,
3322 Arowptrs, AcolindsConverted, Avals, scalarA,
3323 Browptrs, BcolindsConverted, Bvals, scalarB,
3324 Crowptrs, Ccolinds, Cvals);
3340#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3342 void MatrixMatrix::Multiply( \
3343 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3345 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3347 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3348 bool call_FillComplete_on_result, \
3349 const std::string & label, \
3350 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3353 void MatrixMatrix::Jacobi( \
3355 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3356 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3357 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3358 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3359 bool call_FillComplete_on_result, \
3360 const std::string & label, \
3361 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3364 void MatrixMatrix::Add( \
3365 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3368 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3371 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3374 void MatrixMatrix::Add( \
3375 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3378 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3382 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3383 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3384 (const SCALAR & alpha, \
3385 const bool transposeA, \
3386 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3387 const SCALAR & beta, \
3388 const bool transposeB, \
3389 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3390 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3391 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3392 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3396 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3397 (const SCALAR & alpha, \
3398 const bool transposeA, \
3399 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3400 const SCALAR& beta, \
3401 const bool transposeB, \
3402 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3403 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3404 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3405 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3406 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3408 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.