Tpetra parallel linear algebra — Version of the Day
TpetraExt_TripleMatrixMultiply_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
43 
45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
48 #include "Tpetra_Util.hpp"
49 #include "Tpetra_ConfigDefs.hpp"
50 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_ConfigDefs.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
56 #include "Tpetra_Import_Util.hpp"
57 #include "Tpetra_Import_Util2.hpp"
58 #include <algorithm>
59 #include <cmath>
60 #include "Teuchos_FancyOStream.hpp"
61 // #include "KokkosSparse_spgemm.hpp"
62 
63 
71 /*********************************************************************************************************/
72 // Include the architecture-specific kernel partial specializations here
73 // NOTE: This needs to be outside all namespaces
74 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
75 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
76 #include "TpetraExt_MatrixMatrix_HIP.hpp"
77 
78 namespace Tpetra {
79 
80  namespace TripleMatrixMultiply{
81 
82  //
83  // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
84  // op(A) == A if transposeA is false,
85  // op(A) == A^T if transposeA is true,
86  // and similarly for op(R) and op(P).
87  //
88  template <class Scalar,
89  class LocalOrdinal,
90  class GlobalOrdinal,
91  class Node>
94  bool transposeR,
96  bool transposeA,
98  bool transposeP,
100  bool call_FillComplete_on_result,
101  const std::string& label,
102  const Teuchos::RCP<Teuchos::ParameterList>& params)
103  {
104  using Teuchos::null;
105  using Teuchos::RCP;
106  typedef Scalar SC;
107  typedef LocalOrdinal LO;
108  typedef GlobalOrdinal GO;
109  typedef Node NO;
110  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
111  typedef Import<LO,GO,NO> import_type;
112  typedef Export<LO,GO,NO> export_type;
113  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
114  typedef Map<LO,GO,NO> map_type;
115  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
116 
117 #ifdef HAVE_TPETRA_MMM_TIMINGS
118  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
119  using Teuchos::TimeMonitor;
120  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
121 #endif
122 
123  const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
124 
125  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
126 
127  // The input matrices R, A and P must both be fillComplete.
128  TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
129  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
130  TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
131 
132  // If transposeA is true, then Rprime will be the transpose of R
133  // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
134  // will just be a pointer to R.
135  RCP<const crs_matrix_type> Rprime = null;
136  // If transposeA is true, then Aprime will be the transpose of A
137  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
138  // will just be a pointer to A.
139  RCP<const crs_matrix_type> Aprime = null;
140  // If transposeB is true, then Pprime will be the transpose of P
141  // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
142  // will just be a pointer to P.
143  RCP<const crs_matrix_type> Pprime = null;
144 
145  // Is this a "clean" matrix?
146  //
147  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
148  // locally nor globally indexed, then it was empty. I don't like
149  // this, because the most straightforward implementation presumes
150  // lazy allocation of indices. However, historical precedent
151  // demands that we keep around this predicate as a way to test
152  // whether the matrix is empty.
153  const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
154 
155  using Teuchos::ParameterList;
156  RCP<ParameterList> transposeParams (new ParameterList);
157  transposeParams->set ("sort", false);
158 
159  if (transposeR && &R != &P) {
160  transposer_type transposer(rcpFromRef (R));
161  Rprime = transposer.createTranspose (transposeParams);
162  } else {
163  Rprime = rcpFromRef(R);
164  }
165 
166  if (transposeA) {
167  transposer_type transposer(rcpFromRef (A));
168  Aprime = transposer.createTranspose (transposeParams);
169  } else {
170  Aprime = rcpFromRef(A);
171  }
172 
173  if (transposeP) {
174  transposer_type transposer(rcpFromRef (P));
175  Pprime = transposer.createTranspose (transposeParams);
176  } else {
177  Pprime = rcpFromRef(P);
178  }
179 
180  // Check size compatibility
181  global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
182  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
183  global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
184  global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
185  global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
186  global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
187  global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
188  global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
189  global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
190  TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
191  prefix << "ERROR, inner dimensions of op(R) and op(A) "
192  "must match for matrix-matrix product. op(R) is "
193  << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);
194 
195  TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
196  prefix << "ERROR, inner dimensions of op(A) and op(P) "
197  "must match for matrix-matrix product. op(A) is "
198  << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
199 
200  // The result matrix Ac must at least have a row-map that reflects the correct
201  // row-size. Don't check the number of columns because rectangular matrices
202  // which were constructed with only one map can still end up having the
203  // correct capacity and dimensions when filled.
204  TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
205  prefix << "ERROR, dimensions of result Ac must "
206  "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
207  << " rows, should have at least " << Rleft << std::endl);
208 
209  // It doesn't matter whether Ac is already Filled or not. If it is already
210  // Filled, it must have space allocated for the positions that will be
211  // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
212  // we'll error out later when trying to store result values.
213 
214  // CGB: However, matrix must be in active-fill
215  TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );
216 
217  // We're going to need to import remotely-owned sections of P if
218  // more than one processor is performing this run, depending on the scenario.
219  int numProcs = P.getComm()->getSize();
220 
221  // Declare a couple of structs that will be used to hold views of the data
222  // of R, A and P, to be used for fast access during the matrix-multiplication.
223  crs_matrix_struct_type Rview;
224  crs_matrix_struct_type Aview;
225  crs_matrix_struct_type Pview;
226 
227  RCP<const map_type> targetMap_R = Rprime->getRowMap();
228  RCP<const map_type> targetMap_A = Aprime->getRowMap();
229  RCP<const map_type> targetMap_P = Pprime->getRowMap();
230 
231 #ifdef HAVE_TPETRA_MMM_TIMINGS
232  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
233 #endif
234 
235  // Now import any needed remote rows and populate the Aview struct
236  // NOTE: We assert that an import isn't needed --- since we do the transpose
237  // above to handle that.
238  RCP<const import_type> dummyImporter;
239 
240  if (!(transposeR && &R == &P))
241  MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
242 
243  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
244 
245  // We will also need local access to all rows of P that correspond to the
246  // column-map of op(A).
247  if (numProcs > 1)
248  targetMap_P = Aprime->getColMap();
249 
250  // Import any needed remote rows and populate the Pview struct.
251  MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
252 
253 
254  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
255 
256  bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
257  if (needs_final_export)
258  Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
259  else
260  Actemp = rcp(&Ac,false);// don't allow deallocation
261 
262 #ifdef HAVE_TPETRA_MMM_TIMINGS
263  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
264 #endif
265 
266  // Call the appropriate method to perform the actual multiplication.
267  if (call_FillComplete_on_result && newFlag) {
268  if (transposeR && &R == &P)
269  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
270  else
271  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
272  } else if (call_FillComplete_on_result) {
273  if (transposeR && &R == &P)
274  MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
275  else
276  MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
277  } else {
278  // mfh 27 Sep 2016: Is this the "slow" case? This
279  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
280  // thread-parallel inserts, but that may take some effort.
281  // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
282 
283  // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
284 
285  // #ifdef HAVE_TPETRA_MMM_TIMINGS
286  // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
287  // #endif
288  // if (call_FillComplete_on_result) {
289  // // We'll call FillComplete on the C matrix before we exit, and give it a
290  // // domain-map and a range-map.
291  // // The domain-map will be the domain-map of B, unless
292  // // op(B)==transpose(B), in which case the range-map of B will be used.
293  // // The range-map will be the range-map of A, unless op(A)==transpose(A),
294  // // in which case the domain-map of A will be used.
295  // if (!C.isFillComplete())
296  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
297  // }
298  // Not implemented
299  if (transposeR && &R == &P)
300  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
301  else
302  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
303  }
304 
305  if (needs_final_export) {
306 #ifdef HAVE_TPETRA_MMM_TIMINGS
307  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
308 #endif
309  Teuchos::ParameterList labelList;
310  labelList.set("Timer Label", label);
311  Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
312 
313  RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
314  bool isMM = true;
315  bool overrideAllreduce = false;
316  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
317  if(!params.is_null()) {
318  Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
319  mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
320  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
321  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
322  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
323  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
324  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
325 
326  labelList.set("compute global constants",params->get("compute global constants",true));
327  }
328  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
329 
330  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
331  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
332  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
333 
334  export_type exporter = export_type(*Pprime->getGraph()->getImporter());
335  Actemp->exportAndFillComplete(Acprime,
336  exporter,
337  Acprime->getDomainMap(),
338  Acprime->getRangeMap(),
339  rcp(&labelList,false));
340 
341  }
342 #ifdef HAVE_TPETRA_MMM_STATISTICS
343  printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
344 #endif
345 
346  }
347 
348 
349  } //End namespace TripleMatrixMultiply
350 
351  namespace MMdetails{
352 
 353  // Kernel method for computing the local portion of Ac = R*A*P, where Ac
 353  // has not been fillComplete'd before: builds Ac's column map / importer
 353  // and the local-index translation tables, then dispatches to the
 353  // architecture-specific KernelWrappers3 implementation.
 354  template<class Scalar,
 355  class LocalOrdinal,
 356  class GlobalOrdinal,
 357  class Node>
 358  void mult_R_A_P_newmatrix(
 359  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
 360  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 361  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 362  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 363  const std::string& label,
 364  const Teuchos::RCP<Teuchos::ParameterList>& params)
 365  {
 366  using Teuchos::Array;
 367  using Teuchos::ArrayRCP;
 368  using Teuchos::ArrayView;
 369  using Teuchos::RCP;
 370  using Teuchos::rcp;
 371 
 372  //typedef Scalar SC; // unused
 373  typedef LocalOrdinal LO;
 374  typedef GlobalOrdinal GO;
 375  typedef Node NO;
 376 
 377  typedef Import<LO,GO,NO> import_type;
 378  typedef Map<LO,GO,NO> map_type;
 379 
 380  // Kokkos typedefs
 381  typedef typename map_type::local_map_type local_map_type;
 381  // NOTE(review): the extract this was taken from drops original line 382,
 381  // which declares the KCRS typedef used on the next line -- confirm
 381  // against the full source before editing.
 383  typedef typename KCRS::StaticCrsGraphType graph_t;
 384  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 385  typedef typename NO::execution_space execution_space;
 386  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 387  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
 388 
 389 #ifdef HAVE_TPETRA_MMM_TIMINGS
 390  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
 391  using Teuchos::TimeMonitor;
 392  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
 393 #endif
 394  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 395 
 396  // Build the final importer / column map, hash table lookups for Ac
 397  RCP<const import_type> Cimport;
 398  RCP<const map_type> Ccolmap;
 399  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
 400  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
 401  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
 402  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
 403  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
 404  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
 405  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
 406 
 407 
 408  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
 409  // indices of P, to local column indices of Ac. (P and Ac have the
 410  // same number of columns.) The kernel uses this, instead of
 411  // copying the entire input matrix P and converting its column
 412  // indices to those of Ac.
 413  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
 414 
 415  if (Pview.importMatrix.is_null()) {
 416  // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
 417  Cimport = Pimport;
 418  Ccolmap = Pview.colMap;
 419  const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
 420  // Pcol2Ccol is trivial
 421  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
 422  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
 423  KOKKOS_LAMBDA(const LO i) {
 424  Pcol2Ccol(i) = i;
 425  });
 426  }
 427  else {
 428  // mfh 27 Sep 2016: P has "remotes," so we need to build the
 429  // column Map of C, as well as C's Import object (from its domain
 430  // Map to its column Map). C's column Map is the union of the
 431  // column Maps of (the local part of) P, and the "remote" part of
 432  // P. Ditto for the Import. We have optimized this "setUnion"
 433  // operation on Import objects and Maps.
 434 
 435  // Choose the right variant of setUnion
 436  if (!Pimport.is_null() && !Iimport.is_null()) {
 437  Cimport = Pimport->setUnion(*Iimport);
 438  }
 439  else if (!Pimport.is_null() && Iimport.is_null()) {
 440  Cimport = Pimport->setUnion();
 441  }
 442  else if (Pimport.is_null() && !Iimport.is_null()) {
 443  Cimport = Iimport->setUnion();
 444  }
 445  else {
 446  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
 447  }
 448  Ccolmap = Cimport->getTargetMap();
 449 
 450  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
 451  // in general. We should get rid of it in order to reduce
 452  // communication costs of sparse matrix-matrix multiply.
 453  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
 454  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
 455 
 456  // NOTE: This is not efficient and should be folded into setUnion
 457  //
 458  // mfh 27 Sep 2016: What the above comment means, is that the
 459  // setUnion operation on Import objects could also compute these
 460  // local index - to - local index look-up tables.
 461  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
 462  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
 463  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
 464  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
 465  });
 466  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
 467  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
 468  });
 469 
 470  }
 471 
 472  // Replace the column map
 473  //
 474  // mfh 27 Sep 2016: We do this because C was originally created
 475  // without a column Map. Now we have its column Map.
 476  Ac.replaceColMap(Ccolmap);
 477 
 478  // mfh 27 Sep 2016: Construct tables that map from local column
 479  // indices of A, to local row indices of either P_local (the locally
 480  // owned part of P), or P_remote (the "imported" remote part of P).
 481  //
 482  // For column index Aik in row i of A, if the corresponding row of P
 483  // exists in the local part of P ("orig") (which I'll call P_local),
 484  // then targetMapToOrigRow[Aik] is the local index of that row of P.
 485  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
 486  //
 487  // For column index Aik in row i of A, if the corresponding row of P
 488  // exists in the remote part of P ("Import") (which I'll call
 489  // P_remote), then targetMapToImportRow[Aik] is the local index of
 490  // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
 491  // (a flag value).
 492 
 493  // Run through all the hash table lookups once and for all
 494  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
 495  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
 496  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
 497  GO aidx = Acolmap_local.getGlobalElement(i);
 498  LO P_LID = Prowmap_local.getLocalElement(aidx);
 499  if (P_LID != LO_INVALID) {
 500  targetMapToOrigRow(i) = P_LID;
 501  targetMapToImportRow(i) = LO_INVALID;
 502  } else {
 503  LO I_LID = Irowmap_local.getLocalElement(aidx);
 504  targetMapToOrigRow(i) = LO_INVALID;
 505  targetMapToImportRow(i) = I_LID;
 506  }
 507  });
 508 
 509  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
 510  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
 511  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
 512  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
 513  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
 514  Ac, Cimport, label, params);
 515  }
516 
517 
 518  // Kernel method for computing the local portion of Ac = R*A*P (reuse mode):
 518  // Ac is already fillComplete'd, so its column map and importer are reused
 518  // and only the local-index translation tables are rebuilt before
 518  // dispatching to the architecture-specific KernelWrappers3 implementation.
 519  template<class Scalar,
 520  class LocalOrdinal,
 521  class GlobalOrdinal,
 522  class Node>
 523  void mult_R_A_P_reuse(
 524  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
 525  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
 526  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
 527  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
 528  const std::string& label,
 529  const Teuchos::RCP<Teuchos::ParameterList>& params)
 530  {
 531  using Teuchos::Array;
 532  using Teuchos::ArrayRCP;
 533  using Teuchos::ArrayView;
 534  using Teuchos::RCP;
 535  using Teuchos::rcp;
 536 
 537  //typedef Scalar SC; // unused
 538  typedef LocalOrdinal LO;
 539  typedef GlobalOrdinal GO;
 540  typedef Node NO;
 541 
 542  typedef Import<LO,GO,NO> import_type;
 543  typedef Map<LO,GO,NO> map_type;
 544 
 545  // Kokkos typedefs
 546  typedef typename map_type::local_map_type local_map_type;
 546  // NOTE(review): the extract this was taken from drops original line 547,
 546  // which declares the KCRS typedef used on the next line -- confirm
 546  // against the full source before editing.
 548  typedef typename KCRS::StaticCrsGraphType graph_t;
 549  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 550  typedef typename NO::execution_space execution_space;
 551  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
 552  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
 553 
 554 #ifdef HAVE_TPETRA_MMM_TIMINGS
 555  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
 556  using Teuchos::TimeMonitor;
 557  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
 558 #endif
 559  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 560 
 561  // Reuse Ac's existing importer / column map; build hash table lookups for Ac
 562  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
 563  RCP<const map_type> Ccolmap = Ac.getColMap();
 564  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
 565  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
 566  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
 567  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
 568  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
 569  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
 570  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
 571  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
 572 
 573  // Build the final importer / column map, hash table lookups for C
 574  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
 575  {
 576  // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
 577  // So, column map of C may be a strict subset of the column map of P
 578  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
 579  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
 580  });
 581 
 582  if (!Pview.importMatrix.is_null()) {
 583  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
 584  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
 585 
 586  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
 587  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
 588  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
 589  });
 590  }
 591  }
 592 
 593  // Run through all the hash table lookups once and for all: for each local
 593  // column of A, record the matching local row of either the owned part of P
 593  // (targetMapToOrigRow) or the imported part (targetMapToImportRow); the
 593  // other entry is set to LO_INVALID.
 594  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
 595  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
 596  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
 597  GO aidx = Acolmap_local.getGlobalElement(i);
 598  LO B_LID = Prowmap_local.getLocalElement(aidx);
 599  if (B_LID != LO_INVALID) {
 600  targetMapToOrigRow(i) = B_LID;
 601  targetMapToImportRow(i) = LO_INVALID;
 602  } else {
 603  LO I_LID = Irowmap_local.getLocalElement(aidx);
 604  targetMapToOrigRow(i) = LO_INVALID;
 605  targetMapToImportRow(i) = I_LID;
 606 
 607  }
 608  });
 609 
 610  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
 611  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
 612  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
 613  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
 614  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
 615  Ac, Cimport, label, params);
 616  }
617 
618 
619  // Kernel method for computing the local portion of Ac = R*A*P
620  template<class Scalar,
621  class LocalOrdinal,
622  class GlobalOrdinal,
623  class Node>
624  void mult_PT_A_P_newmatrix(
625  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
626  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
627  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
628  const std::string& label,
629  const Teuchos::RCP<Teuchos::ParameterList>& params)
630  {
631  using Teuchos::Array;
632  using Teuchos::ArrayRCP;
633  using Teuchos::ArrayView;
634  using Teuchos::RCP;
635  using Teuchos::rcp;
636 
637  //typedef Scalar SC; // unused
638  typedef LocalOrdinal LO;
639  typedef GlobalOrdinal GO;
640  typedef Node NO;
641 
642  typedef Import<LO,GO,NO> import_type;
643  typedef Map<LO,GO,NO> map_type;
644 
645  // Kokkos typedefs
646  typedef typename map_type::local_map_type local_map_type;
648  typedef typename KCRS::StaticCrsGraphType graph_t;
649  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
650  typedef typename NO::execution_space execution_space;
651  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
652  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
653 
654 #ifdef HAVE_TPETRA_MMM_TIMINGS
655  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
656  using Teuchos::TimeMonitor;
657  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
658 #endif
659  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
660 
661  // Build the final importer / column map, hash table lookups for Ac
662  RCP<const import_type> Cimport;
663  RCP<const map_type> Ccolmap;
664  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
665  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
666  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
667  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
668  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
669  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
670  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
671 
672 
673  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
674  // indices of B, to local column indices of Ac. (B and Ac have the
675  // same number of columns.) The kernel uses this, instead of
676  // copying the entire input matrix B and converting its column
677  // indices to those of C.
678  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
679 
680  if (Pview.importMatrix.is_null()) {
681  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
682  Cimport = Pimport;
683  Ccolmap = Pview.colMap;
684  const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
685  // Pcol2Ccol is trivial
686  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
687  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
688  KOKKOS_LAMBDA(const LO i) {
689  Pcol2Ccol(i) = i;
690  });
691  }
692  else {
693  // mfh 27 Sep 2016: P has "remotes," so we need to build the
694  // column Map of C, as well as C's Import object (from its domain
695  // Map to its column Map). C's column Map is the union of the
696  // column Maps of (the local part of) P, and the "remote" part of
697  // P. Ditto for the Import. We have optimized this "setUnion"
698  // operation on Import objects and Maps.
699 
700  // Choose the right variant of setUnion
701  if (!Pimport.is_null() && !Iimport.is_null()) {
702  Cimport = Pimport->setUnion(*Iimport);
703  }
704  else if (!Pimport.is_null() && Iimport.is_null()) {
705  Cimport = Pimport->setUnion();
706  }
707  else if (Pimport.is_null() && !Iimport.is_null()) {
708  Cimport = Iimport->setUnion();
709  }
710  else {
711  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
712  }
713  Ccolmap = Cimport->getTargetMap();
714 
715  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
716  // in general. We should get rid of it in order to reduce
717  // communication costs of sparse matrix-matrix multiply.
718  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
719  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
720 
721  // NOTE: This is not efficient and should be folded into setUnion
722  //
723  // mfh 27 Sep 2016: What the above comment means, is that the
724  // setUnion operation on Import objects could also compute these
725  // local index - to - local index look-up tables.
726  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
727  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
728  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
729  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
730  });
731  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
732  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
733  });
734 
735  }
736 
737  // Replace the column map
738  //
739  // mfh 27 Sep 2016: We do this because C was originally created
740  // without a column Map. Now we have its column Map.
741  Ac.replaceColMap(Ccolmap);
742 
743  // mfh 27 Sep 2016: Construct tables that map from local column
744  // indices of A, to local row indices of either B_local (the locally
745  // owned part of B), or B_remote (the "imported" remote part of B).
746  //
747  // For column index Aik in row i of A, if the corresponding row of B
748  // exists in the local part of B ("orig") (which I'll call B_local),
749  // then targetMapToOrigRow[Aik] is the local index of that row of B.
750  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
751  //
752  // For column index Aik in row i of A, if the corresponding row of B
753  // exists in the remote part of B ("Import") (which I'll call
754  // B_remote), then targetMapToImportRow[Aik] is the local index of
755  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
756  // (a flag value).
757 
758  // Run through all the hash table lookups once and for all
759  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
760  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
761 
762  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
763  GO aidx = Acolmap_local.getGlobalElement(i);
764  LO P_LID = Prowmap_local.getLocalElement(aidx);
765  if (P_LID != LO_INVALID) {
766  targetMapToOrigRow(i) = P_LID;
767  targetMapToImportRow(i) = LO_INVALID;
768  } else {
769  LO I_LID = Irowmap_local.getLocalElement(aidx);
770  targetMapToOrigRow(i) = LO_INVALID;
771  targetMapToImportRow(i) = I_LID;
772  }
773  });
774 
775  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
776  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
777  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
778  mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
779  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
780  Ac, Cimport, label, params);
781  }
782 
// Kernel method for computing the local portion of Ac = P^T * A * P (reuse mode).
// "Reuse" means Ac already has its graph (column Map, importer, sparsity
// pattern) from a prior newmatrix call, so this routine only rebuilds the
// local-index lookup tables the numeric kernel needs, then dispatches to
// the (possibly node-specialized) kernel wrapper.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_PT_A_P_reuse(
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  //typedef Scalar SC; // unused
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Import<LO,GO,NO> import_type;
  typedef Map<LO,GO,NO> map_type;

  // Kokkos typedefs
  typedef typename map_type::local_map_type local_map_type;
  // NOTE(review): the typedef declaring KCRS (the local Kokkos CrsMatrix
  // type) is not visible in this view of the file -- presumably declared
  // on the line just above; confirm against the full source.
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename NO::execution_space execution_space;
  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Build the final importer / column map, hash table lookups for Ac.
  // In reuse mode, Ac already carries its importer and column Map, so we
  // read them off rather than reconstructing them with setUnion.
  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
  RCP<const map_type> Ccolmap = Ac.getColMap();
  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
  local_map_type Ccolmap_local = Ccolmap->getLocalMap();

  // Build the final importer / column map, hash table lookups for C
  // (the "B" in the name below refers to the right-hand matrix, here P).
  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
  {
    // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
    // So, column map of C may be a strict subset of the column map of B
    Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
        Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
      });

    if (!Pview.importMatrix.is_null()) {
      // Sanity check: the importer built in the newmatrix pass must not have
      // altered the domain Map, or the lookup tables below would be wrong.
      TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

      Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
      Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
          Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
        });
    }
  }

  // Run through all the hash table lookups once and for all.
  // For each column i of A, exactly one of the two tables is valid:
  //   targetMapToOrigRow(i)   = local row of P_local holding that row, else LO_INVALID
  //   targetMapToImportRow(i) = local row of the imported remote part, else LO_INVALID
  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
      GO aidx = Acolmap_local.getGlobalElement(i);
      LO B_LID = Prowmap_local.getLocalElement(aidx);
      if (B_LID != LO_INVALID) {
        targetMapToOrigRow(i) = B_LID;
        targetMapToImportRow(i) = LO_INVALID;
      } else {
        LO I_LID = Irowmap_local.getLocalElement(aidx);
        targetMapToOrigRow(i) = LO_INVALID;
        targetMapToImportRow(i) = I_LID;

      }
    });

  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
    mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
                                     targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                     Ac, Cimport, label, params);
}
881 
882 
/*********************************************************************************************************/
// RAP NewMatrix Kernel wrappers (Default non-threaded version)
// Computes R * A * P -> Ac using classic Gustavson approach.
// Runs entirely on the host: builds Ac's CSR arrays from scratch (graph and
// values), sorts them, hands them to Ac, and expert-fill-completes Ac with
// the previously constructed importer.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                                                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                                                                       const LocalOrdinalViewType & Acol2Prow_dev,
                                                                                                                       const LocalOrdinalViewType & Acol2PIrow_dev,
                                                                                                                       const LocalOrdinalViewType & Pcol2Accol_dev,
                                                                                                                       const LocalOrdinalViewType & PIcol2Accol_dev,
                                                                                                                       CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                                                                       Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                                                                       const std::string& label,
                                                                                                                       const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  // NOTE(review): the typedef declaring KCRS (the local Kokkos CrsMatrix
  // type) is not visible in this view of the file -- presumably declared
  // on the line just above; confirm against the full source.
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // The LO "invalid" flag widened to size_t; used as the fill value for
  // ac_status below (matches the LO_INVALID entries in the lookup tables).
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes: m = local rows of R (= local rows of Ac), n = local columns of Ac
  RCP<const map_type> Accolmap = Ac.getColMap();
  size_t m = Rview.origMatrix->getNodeNumRows();
  size_t n = Accolmap->getNodeNumElements();
  size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();

  // Routine runs on host; have to put arguments on host, too
  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                       Acol2Prow_dev);
  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Acol2PIrow_dev);
  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Pcol2Accol_dev);
  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                         PIcol2Accol_dev);

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const auto Amat = Aview.origMatrix->getLocalMatrixHost();
  const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
  const auto Rmat = Rview.origMatrix->getLocalMatrixHost();

  auto Arowptr = Amat.graph.row_map;
  auto Prowptr = Pmat.graph.row_map;
  auto Rrowptr = Rmat.graph.row_map;
  const auto Acolind = Amat.graph.entries;
  const auto Pcolind = Pmat.graph.entries;
  const auto Rcolind = Rmat.graph.entries;
  const auto Avals = Amat.values;
  const auto Pvals = Pmat.values;
  const auto Rvals = Rmat.values;

  // CSR arrays of the imported ("remote") part of P, if any.
  typename c_lno_view_t::HostMirror::const_type Irowptr;
  typename lno_nnz_view_t::HostMirror Icolind;
  typename scalar_view_t::HostMirror Ivals;
  if(!Pview.importMatrix.is_null()) {
    auto lclP = Pview.importMatrix->getLocalMatrixHost();
    Irowptr = lclP.graph.row_map;
    Icolind = lclP.graph.entries;
    Ivals = lclP.values;
    p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
#endif

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
  // The method loops over rows of R, and may resize after processing
  // each row. Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
  typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
  typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
  typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);

  // mfh 27 Sep 2016: The ac_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  // ac_status[i] < nnz - not in the row yet
  // ac_status[i] >= nnz - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  Array<size_t> ac_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine. The routine computes Ac := R * A * (P_local + P_remote).
  //
  // For column index Aik in row i of A, Acol2Prow[Aik] tells
  // you whether the corresponding row of P belongs to P_local
  // ("orig") or P_remote ("Import").

  // For each row of R
  size_t nnz = 0, nnz_old = 0;
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix R
    // on the calling process.
    Crowptr[i] = nnz;

    // mfh 27 Sep 2016: For each entry of R in the current row of R
    for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
      LO k = Rcolind[kk]; // local column index of current entry of R
      const SC Rik = Rvals[kk]; // value of current entry of R
      if (Rik == SC_ZERO)
        continue; // skip explicitly stored zero values in R
      // For each entry of A in the current row of A
      for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
        LO l = Acolind[ll]; // local column index of current entry of A
        const SC Akl = Avals[ll]; // value of current entry of A
        if (Akl == SC_ZERO)
          continue; // skip explicitly stored zero values in A


        if (Acol2Prow[l] != LO_INVALID) {
          // mfh 27 Sep 2016: If the entry of Acol2Prow
          // corresponding to the current entry of A is populated, then
          // the corresponding row of P is in P_local (i.e., it lives on
          // the calling process).

          // Local matrix
          size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

          // mfh 27 Sep 2016: Go through all entries in that row of P_local.
          for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
            LO j = Pcolind[jj];
            LO Acj = Pcol2Accol[j];
            SC Plj = Pvals[jj];

            // "Not seen in this row yet" means the status is still INVALID
            // or points at an entry from a previous row (< nnz_old).
            if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
              // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
              TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                         std::runtime_error,
                                         label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
              // New entry
              ac_status[Acj] = nnz;
              Ccolind[nnz] = Acj;
              Cvals[nnz] = Rik*Akl*Plj;
              nnz++;
            } else {
              // Existing entry in this row: accumulate.
              Cvals[ac_status[Acj]] += Rik*Akl*Plj;
            }
          }
        } else {
          // mfh 27 Sep 2016: If the entry of Acol2PRow
          // corresponding to the current entry of A is NOT populated (has
          // a flag "invalid" value), then the corresponding row of P is
          // in P_remote (i.e., it does not live on the calling process).

          // Remote matrix
          size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
          for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
            LO j = Icolind[jj];
            LO Acj = PIcol2Accol[j];
            SC Plj = Ivals[jj];

            if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
              // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
              TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                         std::runtime_error,
                                         label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
              // New entry
              ac_status[Acj] = nnz;
              Ccolind[nnz] = Acj;
              Cvals[nnz] = Rik*Akl*Plj;
              nnz++;
            } else {
              Cvals[ac_status[Acj]] += Rik*Akl*Plj;
            }
          }
        }
      }
    }
    // Resize for next pass if needed: the next row can add at most n
    // entries, so guarantee that much headroom by doubling.
    if (nnz + n > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind,CSR_alloc);
      Kokkos::resize(Cvals,CSR_alloc);
    }
    nnz_old = nnz;
  }

  Crowptr[m] = nnz;

  // Downward resize
  Kokkos::resize(Ccolind,nnz);
  Kokkos::resize(Cvals,nnz);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif
  // Copy the finished CSR arrays back to the device for Ac.
  auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
      typename KCRS::device_type(), Crowptr);
  auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
      typename KCRS::device_type(), Ccolind);
  auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
      typename KCRS::device_type(), Cvals);

  // Final sort & set of CRS arrays
  if (params.is_null() || params->get("sort entries",true))
    Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
  Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

  // Final FillComplete
  //
  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
  // Import (from domain Map to column Map) construction (which costs
  // lots of communication) by taking the previously constructed
  // Import object. We should be able to do this without interfering
  // with the implementation of the local part of sparse matrix-matrix
  // multply above.
  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
  labelList->set("Timer Label",label);
  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
  RCP<const Export<LO,GO,NO> > dummyExport;
  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                              Rview.origMatrix->getRangeMap(),
                              Acimport,
                              dummyExport,
                              labelList);

}
1144 
/*********************************************************************************************************/
// RAP Reuse Kernel wrappers (Default non-threaded version)
// Computes R * A * P -> Ac using reuse Gustavson.
// "Reuse" means Ac's sparsity pattern is fixed: the kernel only resets and
// re-accumulates the values of Ac's existing entries; attempting to create
// an entry outside the static graph throws.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
                                                                                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                                                                   const LocalOrdinalViewType & Acol2Prow_dev,
                                                                                                                   const LocalOrdinalViewType & Acol2PIrow_dev,
                                                                                                                   const LocalOrdinalViewType & Pcol2Accol_dev,
                                                                                                                   const LocalOrdinalViewType & PIcol2Accol_dev,
                                                                                                                   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                                                                   Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                                                                   const std::string& label,
                                                                                                                   const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // The LO "invalid" flag widened to size_t; used as the fill value for
  // ac_status below (matches the LO_INVALID entries in the lookup tables).
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes: m = local rows of R (= local rows of Ac), n = local columns of Ac
  RCP<const map_type> Accolmap = Ac.getColMap();
  size_t m = Rview.origMatrix->getNodeNumRows();
  size_t n = Accolmap->getNodeNumElements();
  size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();

  // Routine runs on host; have to put arguments on host, too
  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                       Acol2Prow_dev);
  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Acol2PIrow_dev);
  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                        Pcol2Accol_dev);
  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
                                                         PIcol2Accol_dev);

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = Ac.getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
  // Cvals is mutable: this kernel overwrites Ac's values in place.
  scalar_view_t Cvals = Cmat.values;

  // CSR arrays of the imported ("remote") part of P, if any.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Pview.importMatrix.is_null()) {
    auto lclP = Pview.importMatrix->getLocalMatrixHost();
    Irowptr = lclP.graph.row_map;
    Icolind = lclP.graph.entries;
    Ivals = lclP.values;
    p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
#endif

  // mfh 27 Sep 2016: The ac_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  // ac_status[i] < nnz - not in the row yet
  // ac_status[i] >= nnz - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  Array<size_t> ac_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine. The routine computes Ac := R * A * (P_local + P_remote).
  //
  // For column index Aik in row i of A, Acol2Prow[Aik] tells
  // you whether the corresponding row of P belongs to P_local
  // ("orig") or P_remote ("Import").

  // Necessary until following UVM host accesses are changed - for example Crowptr
  // Also probably needed in mult_R_A_P_newmatrix_kernel_wrapper - did not demonstrate this in test failure yet
  Kokkos::fence();

  // For each row of R
  size_t OLD_ip = 0, CSR_ip = 0;
  for (size_t i = 0; i < m; i++) {
    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      ac_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    // mfh 27 Sep 2016: For each entry of R in the current row of R
    for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
      LO k = Rcolind[kk]; // local column index of current entry of R
      const SC Rik = Rvals[kk]; // value of current entry of R
      if (Rik == SC_ZERO)
        continue; // skip explicitly stored zero values in R
      // For each entry of A in the current row of A
      for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
        LO l = Acolind[ll]; // local column index of current entry of A
        const SC Akl = Avals[ll]; // value of current entry of A
        if (Akl == SC_ZERO)
          continue; // skip explicitly stored zero values in A


        if (Acol2Prow[l] != LO_INVALID) {
          // mfh 27 Sep 2016: If the entry of Acol2Prow
          // corresponding to the current entry of A is populated, then
          // the corresponding row of P is in P_local (i.e., it lives on
          // the calling process).

          // Local matrix
          size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

          // mfh 27 Sep 2016: Go through all entries in that row of P_local.
          for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
            LO j = Pcolind[jj];
            LO Cij = Pcol2Accol[j];
            SC Plj = Pvals[jj];

            // Reuse mode may only touch entries already in Ac's graph;
            // a status outside [OLD_ip, CSR_ip) means a new entry.
            TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
                                       std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
                                       "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

            Cvals[ac_status[Cij]] += Rik*Akl*Plj;
          }
        } else {
          // mfh 27 Sep 2016: If the entry of Acol2PRow
          // corresponding to the current entry of A is NOT populated (has
          // a flag "invalid" value), then the corresponding row of P is
          // in P_remote (i.e., it does not live on the calling process).

          // Remote matrix
          size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
          for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
            LO j = Icolind[jj];
            LO Cij = PIcol2Accol[j];
            SC Plj = Ivals[jj];

            TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
                                       std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
                                       "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

            Cvals[ac_status[Cij]] += Rik*Akl*Plj;
          }
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
#endif

  Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());

}
1335 
1336 
1337  /*********************************************************************************************************/
1338  // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1339  // Computes P.T * A * P -> Ac
1340  template<class Scalar,
1341  class LocalOrdinal,
1342  class GlobalOrdinal,
1343  class Node,
1344  class LocalOrdinalViewType>
1345  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1346  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1347  const LocalOrdinalViewType & Acol2Prow,
1348  const LocalOrdinalViewType & Acol2PIrow,
1349  const LocalOrdinalViewType & Pcol2Accol,
1350  const LocalOrdinalViewType & PIcol2Accol,
1351  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1352  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1353  const std::string& label,
1354  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1355 #ifdef HAVE_TPETRA_MMM_TIMINGS
1356  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1357  using Teuchos::TimeMonitor;
1358  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1359 #endif
1360 
1361  // We don't need a kernel-level PTAP, we just transpose here
1362  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1363  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1364 
1365  using Teuchos::ParameterList;
1366  using Teuchos::RCP;
1367  RCP<ParameterList> transposeParams (new ParameterList);
1368  transposeParams->set ("sort", false);
1369 
1370  if (! params.is_null ()) {
1371  transposeParams->set ("compute global constants",
1372  params->get ("compute global constants: temporaries",
1373  false));
1374  }
1375  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1376  transposer.createTransposeLocal (transposeParams);
1377  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1378  Rview.origMatrix = Ptrans;
1379 
1380  mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1381  }
1382 
1383  /*********************************************************************************************************/
1384  // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1385  // Computes P.T * A * P -> Ac
1386  template<class Scalar,
1387  class LocalOrdinal,
1388  class GlobalOrdinal,
1389  class Node,
1390  class LocalOrdinalViewType>
1391  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1392  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1393  const LocalOrdinalViewType & Acol2Prow,
1394  const LocalOrdinalViewType & Acol2PIrow,
1395  const LocalOrdinalViewType & Pcol2Accol,
1396  const LocalOrdinalViewType & PIcol2Accol,
1397  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1398  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1399  const std::string& label,
1400  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1401 #ifdef HAVE_TPETRA_MMM_TIMINGS
1402  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1403  using Teuchos::TimeMonitor;
1404  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1405 #endif
1406 
1407  // We don't need a kernel-level PTAP, we just transpose here
1408  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1409  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1410 
1411  using Teuchos::ParameterList;
1412  using Teuchos::RCP;
1413  RCP<ParameterList> transposeParams (new ParameterList);
1414  transposeParams->set ("sort", false);
1415 
1416  if (! params.is_null ()) {
1417  transposeParams->set ("compute global constants",
1418  params->get ("compute global constants: temporaries",
1419  false));
1420  }
1421  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1422  transposer.createTransposeLocal (transposeParams);
1423  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1424  Rview.origMatrix = Ptrans;
1425 
1426  mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1427  }
1428 
1429  /*********************************************************************************************************/
1430  // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1431  // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1432  // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1433  // Currently, this implementation never gets called.
1434  template<class Scalar,
1435  class LocalOrdinal,
1436  class GlobalOrdinal,
1437  class Node>
1438  void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1439  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1440  const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1441  const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1442  const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1443  const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1444  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1445  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1446  const std::string& label,
1447  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1448 #ifdef HAVE_TPETRA_MMM_TIMINGS
1449  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1450  using Teuchos::TimeMonitor;
1451  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1452 #endif
1453 
1454  using Teuchos::Array;
1455  using Teuchos::ArrayRCP;
1456  using Teuchos::ArrayView;
1457  using Teuchos::RCP;
1458  using Teuchos::rcp;
1459 
1460  typedef Scalar SC;
1461  typedef LocalOrdinal LO;
1462  typedef GlobalOrdinal GO;
1463  typedef Node NO;
1464  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1465  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1466  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1467 
1468  // number of rows on the process of the fine matrix
1469  // size_t m = Pview.origMatrix->getNodeNumRows();
1470  // number of rows on the process of the coarse matrix
1471  size_t n = Ac.getRowMap()->getNodeNumElements();
1472  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1473 
1474  // Get Data Pointers
1475  ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1476  ArrayRCP<size_t> Acrowptr_RCP;
1477  ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1478  ArrayRCP<LO> Accolind_RCP;
1479  ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1480  ArrayRCP<SC> Acvals_RCP;
1481 
1482  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1483  // out of the CrsMatrix. This code computes R * A * (P_local +
1484  // P_remote), where P_local contains the locally owned rows of P,
1485  // and P_remote the (previously Import'ed) remote rows of P.
1486 
1487  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1488  Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1489 
1490  if (!Pview.importMatrix.is_null())
1491  Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1492 
1493  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1494  // where Teuchos::ArrayRCP::operator[] may be slower than
1495  // Teuchos::ArrayView::operator[].
1496 
1497  // For efficiency
1498  ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1499  ArrayView<const LO> Acolind, Pcolind, Icolind;
1500  ArrayView<const SC> Avals, Pvals, Ivals;
1501  ArrayView<size_t> Acrowptr;
1502  ArrayView<LO> Accolind;
1503  ArrayView<SC> Acvals;
1504  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1505  Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1506  if (!Pview.importMatrix.is_null()) {
1507  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1508  }
1509 
1511  // In a first pass, determine the graph of Ac.
1513 
1515  // Get the graph of Ac. This gets the local transpose of P,
1516  // then loops over R, A, P to get the graph of Ac.
1518 
1519 #ifdef HAVE_TPETRA_MMM_TIMINGS
1520  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1521 #endif
1522 
1524  // Get the local transpose of the graph of P by locally transposing
1525  // all of P
1526 
1527  ArrayRCP<const size_t> Rrowptr_RCP;
1528  ArrayRCP<const LO> Rcolind_RCP;
1529  ArrayRCP<const Scalar> Rvals_RCP;
1530  ArrayView<const size_t> Rrowptr;
1531  ArrayView<const LO> Rcolind;
1532  ArrayView<const SC> Rvals;
1533 
1534  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1535 
1536  using Teuchos::ParameterList;
1537  RCP<ParameterList> transposeParams (new ParameterList);
1538  transposeParams->set ("sort", false);
1539  if (! params.is_null ()) {
1540  transposeParams->set ("compute global constants",
1541  params->get ("compute global constants: temporaries",
1542  false));
1543  }
1544  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1545  transposer.createTransposeLocal (transposeParams);
1546 
1547  Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1548  Rrowptr = Rrowptr_RCP();
1549  Rcolind = Rcolind_RCP();
1550  Rvals = Rvals_RCP();
1551 
1553  // Construct graph
1554 
1555  #ifdef HAVE_TPETRA_MMM_TIMINGS
1556  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1557  #endif
1558 
1559  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1560  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1561 
1562  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1563  size_t nnzPerRowA = 100;
1564  if (Aview.origMatrix->getNodeNumEntries() > 0)
1565  nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1566  Acrowptr_RCP.resize(n+1);
1567  Acrowptr = Acrowptr_RCP();
1568  Accolind_RCP.resize(nnz_alloc);
1569  Accolind = Accolind_RCP();
1570 
1571  size_t nnz = 0, nnz_old = 0;
1572  for (size_t i = 0; i < n; i++) {
1573  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1574  // on the calling process.
1575  Acrowptr[i] = nnz;
1576 
1577  // mfh 27 Sep 2016: For each entry of R in the current row of R
1578  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1579  LO k = Rcolind[kk]; // local column index of current entry of R
1580  // For each entry of A in the current row of A
1581  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1582  LO l = Acolind[ll]; // local column index of current entry of A
1583 
1584  if (Acol2PRow[l] != LO_INVALID) {
1585  // mfh 27 Sep 2016: If the entry of Acol2PRow
1586  // corresponding to the current entry of A is populated, then
1587  // the corresponding row of P is in P_local (i.e., it lives on
1588  // the calling process).
1589 
1590  // Local matrix
1591  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1592 
1593  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1594  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1595  LO j = Pcolind[jj];
1596  LO Acj = Pcol2Accol[j];
1597 
1598  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1599  // New entry
1600  ac_status[Acj] = nnz;
1601  Accolind[nnz] = Acj;
1602  nnz++;
1603  }
1604  }
1605  } else {
1606  // mfh 27 Sep 2016: If the entry of Acol2PRow
1607  // corresponding to the current entry of A is NOT populated (has
1608  // a flag "invalid" value), then the corresponding row of P is
1609  // in P_remote (i.e., it does not live on the calling process).
1610 
1611  // Remote matrix
1612  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1613  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1614  LO j = Icolind[jj];
1615  LO Acj = PIcol2Accol[j];
1616 
1617  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1618  // New entry
1619  ac_status[Acj] = nnz;
1620  Accolind[nnz] = Acj;
1621  nnz++;
1622  }
1623  }
1624  }
1625  }
1626  }
1627  // Resize for next pass if needed
1628  // cag: Maybe we can do something more subtle here, and not double
1629  // the size right away.
1630  if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1631  nnz_alloc *= 2;
1632  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1633  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1634  Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1635  }
1636  nnz_old = nnz;
1637  }
1638  Acrowptr[n] = nnz;
1639 
1640  // Downward resize
1641  Accolind_RCP.resize(nnz);
1642  Accolind = Accolind_RCP();
1643 
1644  // Allocate Acvals
1645  Acvals_RCP.resize(nnz, SC_ZERO);
1646  Acvals = Acvals_RCP();
1647 
1648 
1650  // In a second pass, enter the values into Acvals.
1652 
1653  #ifdef HAVE_TPETRA_MMM_TIMINGS
1654  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1655  #endif
1656 
1657 
1658  for (size_t k = 0; k < n; k++) {
1659  for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1660  LO i = Pcolind[ii];
1661  const SC Pki = Pvals[ii];
1662  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1663  LO l = Acolind[ll];
1664  const SC Akl = Avals[ll];
1665  if (Akl == 0.)
1666  continue;
1667  if (Acol2PRow[l] != LO_INVALID) {
1668  // mfh 27 Sep 2016: If the entry of Acol2PRow
1669  // corresponding to the current entry of A is populated, then
1670  // the corresponding row of P is in P_local (i.e., it lives on
1671  // the calling process).
1672 
1673  // Local matrix
1674  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1675  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1676  LO j = Pcolind[jj];
1677  LO Acj = Pcol2Accol[j];
1678  size_t pp;
1679  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1680  if (Accolind[pp] == Acj)
1681  break;
1682  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1683  // std::runtime_error, "problem with Ac column indices");
1684  Acvals[pp] += Pki*Akl*Pvals[jj];
1685  }
1686  } else {
1687  // mfh 27 Sep 2016: If the entry of Acol2PRow
1688  // corresponding to the current entry of A NOT populated (has
1689  // a flag "invalid" value), then the corresponding row of P is
1690  // in P_remote (i.e., it does not live on the calling process).
1691 
1692  // Remote matrix
1693  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1694  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1695  LO j = Icolind[jj];
1696  LO Acj = PIcol2Accol[j];
1697  size_t pp;
1698  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1699  if (Accolind[pp] == Acj)
1700  break;
1701  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1702  // std::runtime_error, "problem with Ac column indices");
1703  Acvals[pp] += Pki*Akl*Ivals[jj];
1704  }
1705  }
1706  }
1707  }
1708  }
1709 
1710 
1711 #ifdef HAVE_TPETRA_MMM_TIMINGS
1712  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1713 #endif
1714 
1715  // Final sort & set of CRS arrays
1716  //
1717  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1718  // matrix-matrix multiply routine sort the entries for us?
1719  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1720 
1721  // mfh 27 Sep 2016: This just sets pointers.
1722  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1723 
1724 #ifdef HAVE_TPETRA_MMM_TIMINGS
1725  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1726 #endif
1727 
1728  // Final FillComplete
1729  //
1730  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1731  // Import (from domain Map to column Map) construction (which costs
1732  // lots of communication) by taking the previously constructed
1733  // Import object. We should be able to do this without interfering
1734  // with the implementation of the local part of sparse matrix-matrix
1735  // multply above.
1736  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1737  labelList->set("Timer Label",label);
1738  // labelList->set("Sort column Map ghost GIDs")
1739  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1740  RCP<const Export<LO,GO,NO> > dummyExport;
1741  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1742  Pview.origMatrix->getDomainMap(),
1743  Acimport,
1744  dummyExport, labelList);
1745  }
1746 
1747 
1748 
 } //End namespace MMdetails
1750 
1751 } //End namespace Tpetra
1752 //
1753 // Explicit instantiation macro
1754 //
1755 // Must be expanded from within the Tpetra namespace!
1756 //
1757 
/* Explicitly instantiate TripleMatrixMultiply::MultiplyRAP for the given
   (SCALAR, LO, GO, NODE) combination.  As noted above, this macro must be
   expanded from within the Tpetra namespace. */
#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
  \
  template \
  void TripleMatrixMultiply::MultiplyRAP( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
    bool transposeR, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
    bool transposeP, \
    CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
1773 
1774 
1775 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool isFillComplete() const override
Whether the matrix is fill complete.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.