// TpetraExt_MatrixMatrix_def.hpp
// Definitions (implementations) of the TpetraExt sparse matrix-matrix
// multiply, Jacobi, and add routines declared in TpetraExt_MatrixMatrix_decl.hpp.
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER
#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
#define TPETRA_MATRIXMATRIX_DEF_HPP
#include "Tpetra_ConfigDefs.hpp"
#include "Teuchos_VerboseObject.hpp"
#include "Teuchos_Array.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"
#include "Tpetra_Details_makeColMap.hpp"
#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Import_Util.hpp"
#include "Tpetra_Import_Util2.hpp"
#include <algorithm>
#include <type_traits>
#include "Teuchos_FancyOStream.hpp"

#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"

#include "KokkosSparse_spgemm.hpp"
#include "KokkosSparse_spadd.hpp"
#include "Kokkos_Bitset.hpp"

#include <MatrixMarket_Tpetra.hpp>

/*********************************************************************************************************/
// Include the architecture-specific kernel partial specializations here
// NOTE: This needs to be outside all namespaces
#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
#include "TpetraExt_MatrixMatrix_Cuda.hpp"
#include "TpetraExt_MatrixMatrix_HIP.hpp"
namespace Tpetra {

namespace MatrixMatrix{

//
// This method forms the matrix-matrix product C = op(A) * op(B), where
// op(A) == A   if transposeA is false,
// op(A) == A^T if transposeA is true,
// and similarly for op(B).
//
96 template <class Scalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class Node>
100 void Multiply(
102  bool transposeA,
104  bool transposeB,
106  bool call_FillComplete_on_result,
107  const std::string& label,
108  const Teuchos::RCP<Teuchos::ParameterList>& params)
109 {
110  using Teuchos::null;
111  using Teuchos::RCP;
112  using Teuchos::rcp;
113  typedef Scalar SC;
114  typedef LocalOrdinal LO;
115  typedef GlobalOrdinal GO;
116  typedef Node NO;
117  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
118  typedef Import<LO,GO,NO> import_type;
119  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
120  typedef Map<LO,GO,NO> map_type;
121  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
122 
123 #ifdef HAVE_TPETRA_MMM_TIMINGS
124  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
125  using Teuchos::TimeMonitor;
126  //MM is used to time setup, and then multiply.
127 
128  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
129 #endif
130 
131  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
132 
133  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
134 
135  // The input matrices A and B must both be fillComplete.
136  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
137  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
138 
139  // If transposeA is true, then Aprime will be the transpose of A
140  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
141  // will just be a pointer to A.
142  RCP<const crs_matrix_type> Aprime = null;
143  // If transposeB is true, then Bprime will be the transpose of B
144  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
145  // will just be a pointer to B.
146  RCP<const crs_matrix_type> Bprime = null;
147 
148  // Is this a "clean" matrix?
149  //
150  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
151  // locally nor globally indexed, then it was empty. I don't like
152  // this, because the most straightforward implementation presumes
153  // lazy allocation of indices. However, historical precedent
154  // demands that we keep around this predicate as a way to test
155  // whether the matrix is empty.
156  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
157 
158  bool use_optimized_ATB = false;
159  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160  use_optimized_ATB = true;
161 
162 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163  use_optimized_ATB = false;
164 #endif
165 
166  using Teuchos::ParameterList;
167  RCP<ParameterList> transposeParams (new ParameterList);
168  transposeParams->set ("sort", false);
169 
170  if (!use_optimized_ATB && transposeA) {
171  transposer_type transposer (rcpFromRef (A));
172  Aprime = transposer.createTranspose (transposeParams);
173  }
174  else {
175  Aprime = rcpFromRef(A);
176  }
177 
178  if (transposeB) {
179  transposer_type transposer (rcpFromRef (B));
180  Bprime = transposer.createTranspose (transposeParams);
181  }
182  else {
183  Bprime = rcpFromRef(B);
184  }
185 
186  // Check size compatibility
187  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
188  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
189  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
190  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
191  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
192  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
193  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194  prefix << "ERROR, inner dimensions of op(A) and op(B) "
195  "must match for matrix-matrix product. op(A) is "
196  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
197 
198  // The result matrix C must at least have a row-map that reflects the correct
199  // row-size. Don't check the number of columns because rectangular matrices
200  // which were constructed with only one map can still end up having the
201  // correct capacity and dimensions when filled.
202  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
203  prefix << "ERROR, dimensions of result C must "
204  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
205  << " rows, should have at least " << Aouter << std::endl);
206 
207  // It doesn't matter whether C is already Filled or not. If it is already
208  // Filled, it must have space allocated for the positions that will be
209  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
210  // we'll error out later when trying to store result values.
211 
212  // CGB: However, matrix must be in active-fill
213  if (!C.isFillActive()) C.resumeFill();
214 
215  // We're going to need to import remotely-owned sections of A and/or B if
216  // more than one processor is performing this run, depending on the scenario.
217  int numProcs = A.getComm()->getSize();
218 
219  // Declare a couple of structs that will be used to hold views of the data
220  // of A and B, to be used for fast access during the matrix-multiplication.
221  crs_matrix_struct_type Aview;
222  crs_matrix_struct_type Bview;
223 
224  RCP<const map_type> targetMap_A = Aprime->getRowMap();
225  RCP<const map_type> targetMap_B = Bprime->getRowMap();
226 
227 #ifdef HAVE_TPETRA_MMM_TIMINGS
228  {
229  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
230 #endif
231 
232  // Now import any needed remote rows and populate the Aview struct
233  // NOTE: We assert that an import isn't needed --- since we do the transpose
234  // above to handle that.
235  if (!use_optimized_ATB) {
236  RCP<const import_type> dummyImporter;
237  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
238  }
239 
240  // We will also need local access to all rows of B that correspond to the
241  // column-map of op(A).
242  if (numProcs > 1)
243  targetMap_B = Aprime->getColMap();
244 
245  // Import any needed remote rows and populate the Bview struct.
246  if (!use_optimized_ATB)
247  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
248 
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
250  } //stop MM_importExtract here
251  //stop the setup timer, and start the multiply timer
252  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
253 #endif
254 
255  // Call the appropriate method to perform the actual multiplication.
256  if (use_optimized_ATB) {
257  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258 
259  } else if (call_FillComplete_on_result && newFlag) {
260  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261 
262  } else if (call_FillComplete_on_result) {
263  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
264 
265  } else {
266  // mfh 27 Sep 2016: Is this the "slow" case? This
267  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
268  // thread-parallel inserts, but that may take some effort.
269  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
270 
271  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
272  }
273 
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
276 #endif
277  if (call_FillComplete_on_result && !C.isFillComplete()) {
278  // We'll call FillComplete on the C matrix before we exit, and give it a
279  // domain-map and a range-map.
280  // The domain-map will be the domain-map of B, unless
281  // op(B)==transpose(B), in which case the range-map of B will be used.
282  // The range-map will be the range-map of A, unless op(A)==transpose(A),
283  // in which case the domain-map of A will be used.
284  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
285  }
286 }
287 
288 
289 template <class Scalar,
290  class LocalOrdinal,
291  class GlobalOrdinal,
292  class Node>
293 void Jacobi(Scalar omega,
298  bool call_FillComplete_on_result,
299  const std::string& label,
300  const Teuchos::RCP<Teuchos::ParameterList>& params)
301 {
302  using Teuchos::RCP;
303  typedef Scalar SC;
304  typedef LocalOrdinal LO;
305  typedef GlobalOrdinal GO;
306  typedef Node NO;
307  typedef Import<LO,GO,NO> import_type;
308  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
309  typedef Map<LO,GO,NO> map_type;
310  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
311 
312 #ifdef HAVE_TPETRA_MMM_TIMINGS
313  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
314  using Teuchos::TimeMonitor;
315  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
316 #endif
317 
318  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
319 
320  // A and B should already be Filled.
321  // Should we go ahead and call FillComplete() on them if necessary or error
322  // out? For now, we choose to error out.
323  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
324  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
325 
326  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
328 
329  // Now check size compatibility
330  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
331  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
332  global_size_t Aouter = A.getGlobalNumRows();
333  global_size_t Bouter = numBCols;
334  global_size_t Ainner = numACols;
335  global_size_t Binner = B.getGlobalNumRows();
336  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337  prefix << "ERROR, inner dimensions of op(A) and op(B) "
338  "must match for matrix-matrix product. op(A) is "
339  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
340 
341  // The result matrix C must at least have a row-map that reflects the correct
342  // row-size. Don't check the number of columns because rectangular matrices
343  // which were constructed with only one map can still end up having the
344  // correct capacity and dimensions when filled.
345  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
346  prefix << "ERROR, dimensions of result C must "
347  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
348  << " rows, should have at least "<< Aouter << std::endl);
349 
350  // It doesn't matter whether C is already Filled or not. If it is already
351  // Filled, it must have space allocated for the positions that will be
352  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
353  // we'll error out later when trying to store result values.
354 
355  // CGB: However, matrix must be in active-fill
356  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
357 
358  // We're going to need to import remotely-owned sections of A and/or B if
359  // more than one processor is performing this run, depending on the scenario.
360  int numProcs = A.getComm()->getSize();
361 
362  // Declare a couple of structs that will be used to hold views of the data of
363  // A and B, to be used for fast access during the matrix-multiplication.
364  crs_matrix_struct_type Aview;
365  crs_matrix_struct_type Bview;
366 
367  RCP<const map_type> targetMap_A = Aprime->getRowMap();
368  RCP<const map_type> targetMap_B = Bprime->getRowMap();
369 
370 #ifdef HAVE_TPETRA_MMM_TIMINGS
371  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
372  {
373 #endif
374 
375  // Enable globalConstants by default
376  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
377  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
378  if(!params.is_null()) {
379  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
380  int mm_optimization_core_count=0;
381  auto slist = params->sublist("matrixmatrix: kernel params",false);
382  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
383  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
386  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
387  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
388  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
390  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
391  }
392 
393  //Now import any needed remote rows and populate the Aview struct.
394  RCP<const import_type> dummyImporter;
395  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
396 
397  // We will also need local access to all rows of B that correspond to the
398  // column-map of op(A).
399  if (numProcs > 1)
400  targetMap_B = Aprime->getColMap();
401 
402  // Now import any needed remote rows and populate the Bview struct.
403  // Enable globalConstants by default
404  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
405  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
406  if(!params.is_null()) {
407  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
408 
409  auto slist = params->sublist("matrixmatrix: kernel params",false);
410  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
411  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
412  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
413  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
414  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
418  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
419  }
420 
421  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
422 
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
424  }
425  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
426 #endif
427 
428  // Now call the appropriate method to perform the actual multiplication.
429  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
430 
431  // Is this a "clean" matrix
432  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
433 
434  if (call_FillComplete_on_result && newFlag) {
435  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436 
437  } else if (call_FillComplete_on_result) {
438  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
439 
440  } else {
441  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
442  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
443  // commenting it out.
444 // #ifdef HAVE_TPETRA_MMM_TIMINGS
445 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
446 // #endif
447  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
448  // commenting it out.
449  // if (call_FillComplete_on_result) {
450  // //We'll call FillComplete on the C matrix before we exit, and give
451  // //it a domain-map and a range-map.
452  // //The domain-map will be the domain-map of B, unless
453  // //op(B)==transpose(B), in which case the range-map of B will be used.
454  // //The range-map will be the range-map of A, unless
455  // //op(A)==transpose(A), in which case the domain-map of A will be used.
456  // if (!C.isFillComplete()) {
457  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
458  // }
459  // }
460  }
461 }
462 
463 
464 template <class Scalar,
465  class LocalOrdinal,
466  class GlobalOrdinal,
467  class Node>
468 void Add(
470  bool transposeA,
471  Scalar scalarA,
473  Scalar scalarB )
474 {
475  using Teuchos::Array;
476  using Teuchos::RCP;
477  using Teuchos::null;
478  typedef Scalar SC;
479  typedef LocalOrdinal LO;
480  typedef GlobalOrdinal GO;
481  typedef Node NO;
482  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
483  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
484 
485  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
486 
487  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
488  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489  "(Result matrix B is not required to be isFillComplete()).");
490  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
491  prefix << "ERROR, input matrix B must not be fill complete!");
492  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
493  prefix << "ERROR, input matrix B must not have static graph!");
494  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
495  prefix << "ERROR, input matrix B must not be locally indexed!");
496 
497  using Teuchos::ParameterList;
498  RCP<ParameterList> transposeParams (new ParameterList);
499  transposeParams->set ("sort", false);
500 
501  RCP<const crs_matrix_type> Aprime = null;
502  if (transposeA) {
503  transposer_type transposer (rcpFromRef (A));
504  Aprime = transposer.createTranspose (transposeParams);
505  }
506  else {
507  Aprime = rcpFromRef(A);
508  }
509 
510  size_t a_numEntries;
511  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getNodeMaxNumRowEntries());
512  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getNodeMaxNumRowEntries());
513  GO row;
514 
515  if (scalarB != Teuchos::ScalarTraits<SC>::one())
516  B.scale(scalarB);
517 
518  bool bFilled = B.isFillComplete();
519  size_t numMyRows = B.getNodeNumRows();
520  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521  for (LO i = 0; (size_t)i < numMyRows; ++i) {
522  row = B.getRowMap()->getGlobalElement(i);
523  Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
524 
525  if (scalarA != Teuchos::ScalarTraits<SC>::one())
526  for (size_t j = 0; j < a_numEntries; ++j)
527  a_vals[j] *= scalarA;
528 
529  if (bFilled)
530  B.sumIntoGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
531  else
532  B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
533  }
534  }
535 }
536 
537 template <class Scalar,
538  class LocalOrdinal,
539  class GlobalOrdinal,
540  class Node>
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
542 add (const Scalar& alpha,
543  const bool transposeA,
545  const Scalar& beta,
546  const bool transposeB,
548  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
549  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
550  const Teuchos::RCP<Teuchos::ParameterList>& params)
551 {
552  using Teuchos::RCP;
553  using Teuchos::rcpFromRef;
554  using Teuchos::rcp;
555  using Teuchos::ParameterList;
557  if(!params.is_null())
558  {
559  TEUCHOS_TEST_FOR_EXCEPTION(
560  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
561  std::invalid_argument,
562  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564  params->set("Call fillComplete", true);
565  }
566  //If transposeB, must compute B's explicit transpose to
567  //get the correct row map for C.
568  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
569  if(transposeB)
570  {
572  Brcp = transposer.createTranspose();
573  }
574  //Check that A,B are fillComplete before getting B's column map
575  TEUCHOS_TEST_FOR_EXCEPTION
576  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
579  //this version of add() always fill completes the result, no matter what is in params on input
580  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
581  return C;
582 }

//This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
//but since the spadd() output is always packed there is no need for a separate
//numRowEntries here.
//
588 template<class LO, class GO, class LOView, class GOView, class LocalMap>
589 struct ConvertGlobalToLocalFunctor
590 {
591  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
592  : lids(lids_), gids(gids_), localColMap(localColMap_)
593  {}
594 
595  KOKKOS_FUNCTION void operator() (const GO i) const
596  {
597  lids(i) = localColMap.getLocalElement(gids(i));
598  }
599 
600  LOView lids;
601  const GOView gids;
602  const LocalMap localColMap;
603 };
604 
605 template <class Scalar,
606  class LocalOrdinal,
607  class GlobalOrdinal,
608  class Node>
609 void
610 add (const Scalar& alpha,
611  const bool transposeA,
613  const Scalar& beta,
614  const bool transposeB,
617  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
618  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
619  const Teuchos::RCP<Teuchos::ParameterList>& params)
620 {
621  using Teuchos::RCP;
622  using Teuchos::rcp;
623  using Teuchos::rcpFromRef;
624  using Teuchos::rcp_implicit_cast;
625  using Teuchos::rcp_dynamic_cast;
626  using Teuchos::TimeMonitor;
627  using SC = Scalar;
628  using LO = LocalOrdinal;
629  using GO = GlobalOrdinal;
630  using NO = Node;
631  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
632  using crs_graph_type = CrsGraph<LO,GO,NO>;
633  using map_type = Map<LO,GO,NO>;
634  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
635  using import_type = Import<LO,GO,NO>;
636  using export_type = Export<LO,GO,NO>;
637  using exec_space = typename crs_graph_type::execution_space;
638  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
640  constexpr bool debug = false;
641 
642 #ifdef HAVE_TPETRA_MMM_TIMINGS
643  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
644 #endif
645 
646  if (debug) {
647  std::ostringstream os;
648  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
649  << "TpetraExt::MatrixMatrix::add" << std::endl;
650  std::cerr << os.str ();
651  }
652 
653  TEUCHOS_TEST_FOR_EXCEPTION
654  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
655  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
656  TEUCHOS_TEST_FOR_EXCEPTION
657  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
658  prefix_mmm << "A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG
660  // The matrices don't have domain or range Maps unless they are fill complete.
661  if (A.isFillComplete () && B.isFillComplete ()) {
662  const bool domainMapsSame =
663  (! transposeA && ! transposeB &&
664  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
665  (! transposeA && transposeB &&
666  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
667  ( transposeA && ! transposeB &&
668  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
669  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
671 
672  const bool rangeMapsSame =
673  (! transposeA && ! transposeB &&
674  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
675  (! transposeA && transposeB &&
676  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
677  ( transposeA && ! transposeB &&
678  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
679  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
681  }
682 #endif // HAVE_TPETRA_DEBUG
683 
684  using Teuchos::ParameterList;
685  // Form the explicit transpose of A if necessary.
686  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
687  if (transposeA) {
688  transposer_type transposer (Aprime);
689  Aprime = transposer.createTranspose ();
690  }
691 
692 #ifdef HAVE_TPETRA_DEBUG
693  TEUCHOS_TEST_FOR_EXCEPTION
694  (Aprime.is_null (), std::logic_error,
695  prefix_mmm << "Failed to compute Op(A). "
696  "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG
698 
699  // Form the explicit transpose of B if necessary.
700  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
701  if (transposeB) {
702  if (debug) {
703  std::ostringstream os;
704  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
705  << "Form explicit xpose of B" << std::endl;
706  std::cerr << os.str ();
707  }
708  transposer_type transposer (Bprime);
709  Bprime = transposer.createTranspose ();
710  }
711 #ifdef HAVE_TPETRA_DEBUG
712  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714  TEUCHOS_TEST_FOR_EXCEPTION(
715  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716  prefix_mmm << "Aprime and Bprime must both be fill complete. "
717  "Please report this bug to the Tpetra developers.");
718 #endif // HAVE_TPETRA_DEBUG
719  RCP<const map_type> CDomainMap = domainMap;
720  RCP<const map_type> CRangeMap = rangeMap;
721  if(CDomainMap.is_null())
722  {
723  CDomainMap = Bprime->getDomainMap();
724  }
725  if(CRangeMap.is_null())
726  {
727  CRangeMap = Bprime->getRangeMap();
728  }
729  assert(!(CDomainMap.is_null()));
730  assert(!(CRangeMap.is_null()));
731  typedef typename AddKern::values_array values_array;
732  typedef typename AddKern::row_ptrs_array row_ptrs_array;
733  typedef typename AddKern::col_inds_array col_inds_array;
734  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
736  values_array vals;
737  row_ptrs_array rowptrs;
738  col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS
740  MM = Teuchos::null;
741  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
742 #endif
743  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
744  {
745  //import Aprime into Bprime's row map so the local matrices have same # of rows
746  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
747  // cbl do not set
748  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
749  // this import _may_ take the form of a transfer. In practice it would be unlikely,
750  // but the general case is not so forgiving.
751  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
752  }
753  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754  bool sorted = AGraphSorted && BGraphSorted;
755  RCP<const import_type> Cimport = Teuchos::null;
756  RCP<export_type> Cexport = Teuchos::null;
757  bool doFillComplete = true;
758  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
759  {
760  doFillComplete = params->get<bool>("Call fillComplete");
761  }
762  auto Alocal = Aprime->getLocalMatrixDevice();
763  auto Blocal = Bprime->getLocalMatrixDevice();
764  LO numLocalRows = Alocal.numRows();
765  if(numLocalRows == 0)
766  {
767  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
768  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
769  //Handle this case now
770  //(without interfering with collective operations, since it's possible for
771  //some ranks to have 0 local rows and others not).
772  rowptrs = row_ptrs_array("C rowptrs", 0);
773  }
774  auto Acolmap = Aprime->getColMap();
775  auto Bcolmap = Bprime->getColMap();
776  if(!matchingColMaps)
777  {
778  using global_col_inds_array = typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
780  MM = Teuchos::null;
781  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
782 #endif
783  //use kernel that converts col indices in both A and B to common domain map before adding
784  auto AlocalColmap = Acolmap->getLocalMap();
785  auto BlocalColmap = Bcolmap->getLocalMap();
786  global_col_inds_array globalColinds;
787  if (debug) {
788  std::ostringstream os;
789  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
790  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791  std::cerr << os.str ();
792  }
793  AddKern::convertToGlobalAndAdd(
794  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795  vals, rowptrs, globalColinds);
796  if (debug) {
797  std::ostringstream os;
798  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
799  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800  std::cerr << os.str ();
801  }
802 #ifdef HAVE_TPETRA_MMM_TIMINGS
803  MM = Teuchos::null;
804  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
805 #endif
806  RCP<const map_type> CcolMap;
807  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808  (CcolMap, CDomainMap, globalColinds);
809  C.replaceColMap(CcolMap);
810  col_inds_array localColinds("C colinds", globalColinds.extent(0));
811  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813  col_inds_array, global_col_inds_array,
814  typename map_type::local_map_type>
815  (localColinds, globalColinds, CcolMap->getLocalMap()));
816  KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
817  C.setAllValues(rowptrs, localColinds, vals);
818  C.fillComplete(CDomainMap, CRangeMap, params);
819  if(!doFillComplete)
820  C.resumeFill();
821  }
822  else
823  {
824  //Aprime, Bprime and C all have the same column maps
825  auto Avals = Alocal.values;
826  auto Bvals = Blocal.values;
827  auto Arowptrs = Alocal.graph.row_map;
828  auto Browptrs = Blocal.graph.row_map;
829  auto Acolinds = Alocal.graph.entries;
830  auto Bcolinds = Blocal.graph.entries;
831  if(sorted)
832  {
833  //use sorted kernel
834 #ifdef HAVE_TPETRA_MMM_TIMINGS
835  MM = Teuchos::null;
836  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
837 #endif
838  if (debug) {
839  std::ostringstream os;
840  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
841  << "Call AddKern::addSorted(...)" << std::endl;
842  std::cerr << os.str ();
843  }
844  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
845  }
846  else
847  {
848  //use unsorted kernel
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
850  MM = Teuchos::null;
851  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
852 #endif
853  if (debug) {
854  std::ostringstream os;
855  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
856  << "Call AddKern::addUnsorted(...)" << std::endl;
857  std::cerr << os.str ();
858  }
859  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
860  }
861  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
862  RCP<const map_type> Ccolmap = Bcolmap;
863  C.replaceColMap(Ccolmap);
864  C.setAllValues(rowptrs, colinds, vals);
865  if(doFillComplete)
866  {
867  #ifdef HAVE_TPETRA_MMM_TIMINGS
868  MM = Teuchos::null;
869  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
870  #endif
871  if(!CDomainMap->isSameAs(*Ccolmap))
872  {
873  if (debug) {
874  std::ostringstream os;
875  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
876  << "Create Cimport" << std::endl;
877  std::cerr << os.str ();
878  }
879  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
880  }
881  if(!C.getRowMap()->isSameAs(*CRangeMap))
882  {
883  if (debug) {
884  std::ostringstream os;
885  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
886  << "Create Cexport" << std::endl;
887  std::cerr << os.str ();
888  }
889  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
890  }
891 
892  if (debug) {
893  std::ostringstream os;
894  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895  << "Call C->expertStaticFillComplete(...)" << std::endl;
896  std::cerr << os.str ();
897  }
898  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
899  }
900  }
901 }
902 
903 template <class Scalar,
904  class LocalOrdinal,
905  class GlobalOrdinal,
906  class Node>
907 void Add(
909  bool transposeA,
910  Scalar scalarA,
912  bool transposeB,
913  Scalar scalarB,
915 {
916  using Teuchos::Array;
917  using Teuchos::ArrayRCP;
918  using Teuchos::ArrayView;
919  using Teuchos::RCP;
920  using Teuchos::rcp;
921  using Teuchos::rcp_dynamic_cast;
922  using Teuchos::rcpFromRef;
923  using Teuchos::tuple;
924  using std::endl;
925  // typedef typename ArrayView<const Scalar>::size_type size_type;
926  typedef Teuchos::ScalarTraits<Scalar> STS;
928  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
929  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
930  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
933 
934  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
935 
936  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
938 
939  TEUCHOS_TEST_FOR_EXCEPTION(
940  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
941  prefix << "Both input matrices must be fill complete before calling this function.");
942 
943 #ifdef HAVE_TPETRA_DEBUG
944  {
945  const bool domainMapsSame =
946  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
947  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
948  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
949  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
951 
952  const bool rangeMapsSame =
953  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
954  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
955  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
956  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
958  }
959 #endif // HAVE_TPETRA_DEBUG
960 
961  using Teuchos::ParameterList;
962  RCP<ParameterList> transposeParams (new ParameterList);
963  transposeParams->set ("sort", false);
964 
965  // Form the explicit transpose of A if necessary.
966  RCP<const crs_matrix_type> Aprime;
967  if (transposeA) {
968  transposer_type theTransposer (rcpFromRef (A));
969  Aprime = theTransposer.createTranspose (transposeParams);
970  }
971  else {
972  Aprime = rcpFromRef (A);
973  }
974 
975 #ifdef HAVE_TPETRA_DEBUG
976  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978 #endif // HAVE_TPETRA_DEBUG
979 
980  // Form the explicit transpose of B if necessary.
981  RCP<const crs_matrix_type> Bprime;
982  if (transposeB) {
983  transposer_type theTransposer (rcpFromRef (B));
984  Bprime = theTransposer.createTranspose (transposeParams);
985  }
986  else {
987  Bprime = rcpFromRef (B);
988  }
989 
990 #ifdef HAVE_TPETRA_DEBUG
991  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993 #endif // HAVE_TPETRA_DEBUG
994 
995  // Allocate or zero the entries of the result matrix.
996  if (! C.is_null ()) {
997  C->setAllToScalar (STS::zero ());
998  } else {
999  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1000  // noticed that C was being given the row Map of Aprime (the
1001  // possibly transposed version of A). Is this what we want?
1002  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1003  }
1004 
1005 #ifdef HAVE_TPETRA_DEBUG
1006  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1012 #endif // HAVE_TPETRA_DEBUG
1013 
1014  Array<RCP<const crs_matrix_type> > Mat =
1015  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1017 
1018  // do a loop over each matrix to add: A reordering might be more efficient
1019  for (int k = 0; k < 2; ++k) {
1020  typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1021  typename crs_matrix_type::nonconst_values_host_view_type Values;
1022 
1023  // Loop over each locally owned row of the current matrix (either
1024  // Aprime or Bprime), and sum its entries into the corresponding
1025  // row of C. This works regardless of whether Aprime or Bprime
1026  // has the same row Map as C, because both sumIntoGlobalValues and
1027  // insertGlobalValues allow summing resp. inserting into nonowned
1028  // rows of C.
1029 #ifdef HAVE_TPETRA_DEBUG
1030  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032 #endif // HAVE_TPETRA_DEBUG
1033  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG
1035  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG
1038 
1039  const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040  for (size_t i = 0; i < localNumRows; ++i) {
1041  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043  if (numEntries > 0) {
1044  Kokkos::resize(Indices,numEntries);
1045  Kokkos::resize(Values,numEntries);
1046  Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1047 
1048  if (scalar[k] != STS::one ()) {
1049  for (size_t j = 0; j < numEntries; ++j) {
1050  Values[j] *= scalar[k];
1051  }
1052  }
1053 
1054  if (C->isFillComplete ()) {
1055  C->sumIntoGlobalValues (globalRow, numEntries,
1056  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1057  } else {
1058  C->insertGlobalValues (globalRow, numEntries,
1059  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1060  }
1061  }
1062  }
1063  }
1064 }
1065 
1066 } //End namespace MatrixMatrix
1067 
1068 namespace MMdetails{
1069 
1070 /*********************************************************************************************************/
1071 // Prints MMM-style statistics on communication done with an Import or Export object
1072 template <class TransferType>
1073 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1074  if (Transfer.is_null())
1075  return;
1076 
1077  const Distributor & Distor = Transfer->getDistributor();
1078  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1079 
1080  size_t rows_send = Transfer->getNumExportIDs();
1081  size_t rows_recv = Transfer->getNumRemoteIDs();
1082 
1083  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1084  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1085  size_t num_send_neighbors = Distor.getNumSends();
1086  size_t num_recv_neighbors = Distor.getNumReceives();
1087  size_t round2_send, round2_recv;
1088  Distor.getLastDoStatistics(round2_send,round2_recv);
1089 
1090  int myPID = Comm->getRank();
1091  int NumProcs = Comm->getSize();
1092 
1093  // Processor by processor statistics
1094  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1095  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1096 
1097  // Global statistics
1098  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1099  size_t gstats_min[8], gstats_max[8];
1100 
1101  double lstats_avg[8], gstats_avg[8];
1102  for(int i=0; i<8; i++)
1103  lstats_avg[i] = ((double)lstats[i])/NumProcs;
1104 
1105  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1106  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1107  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1108 
1109  if(!myPID) {
1110  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1111  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1112  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1113  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1114  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1115  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1116  }
1117 }
1118 
1119 // Kernel method for computing the local portion of C = A^T * B
1120 template<class Scalar,
1121  class LocalOrdinal,
1122  class GlobalOrdinal,
1123  class Node>
1124 void mult_AT_B_newmatrix(
1125  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1126  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1127  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1128  const std::string & label,
1129  const Teuchos::RCP<Teuchos::ParameterList>& params)
1130 {
1131  using Teuchos::RCP;
1132  using Teuchos::rcp;
1133  typedef Scalar SC;
1134  typedef LocalOrdinal LO;
1135  typedef GlobalOrdinal GO;
1136  typedef Node NO;
1137  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1138  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1139 
1140 #ifdef HAVE_TPETRA_MMM_TIMINGS
1141  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1142  using Teuchos::TimeMonitor;
1143  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1144  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1145 #endif
1146 
1147  /*************************************************************/
1148  /* 1) Local Transpose of A */
1149  /*************************************************************/
1150  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1151 
1152  using Teuchos::ParameterList;
1153  RCP<ParameterList> transposeParams (new ParameterList);
1154  transposeParams->set ("sort", false);
1155  if(! params.is_null ()) {
1156  transposeParams->set ("compute global constants",
1157  params->get ("compute global constants: temporaries",
1158  false));
1159  }
1160  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1161  transposer.createTransposeLocal (transposeParams);
1162 
1163  /*************************************************************/
1164  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1165  /*************************************************************/
1166 #ifdef HAVE_TPETRA_MMM_TIMINGS
1167  MM = Teuchos::null;
1168  MM = rcp (new TimeMonitor
1169  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1170 #endif
1171 
1172  // Get views, asserting that no import is required to speed up computation
1173  crs_matrix_struct_type Aview;
1174  crs_matrix_struct_type Bview;
1175  RCP<const Import<LO, GO, NO> > dummyImporter;
1176 
1177  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1178  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1179  if (! params.is_null ()) {
1180  importParams1->set ("compute global constants",
1181  params->get ("compute global constants: temporaries",
1182  false));
1183  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1184  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1185  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1186  int mm_optimization_core_count =
1188  mm_optimization_core_count =
1189  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190  int mm_optimization_core_count2 =
1191  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1192  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1193  mm_optimization_core_count = mm_optimization_core_count2;
1194  }
1195  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1196  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1197  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1198  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1199  }
1200 
1201  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1202  Aview, dummyImporter, true,
1203  label, importParams1);
1204 
1205  RCP<ParameterList> importParams2 (new ParameterList);
1206  if (! params.is_null ()) {
1207  importParams2->set ("compute global constants",
1208  params->get ("compute global constants: temporaries",
1209  false));
1210  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1211  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1212  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1213  int mm_optimization_core_count =
1215  mm_optimization_core_count =
1216  slist.get ("MM_TAFC_OptimizationCoreCount",
1217  mm_optimization_core_count);
1218  int mm_optimization_core_count2 =
1219  params->get ("MM_TAFC_OptimizationCoreCount",
1220  mm_optimization_core_count);
1221  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1222  mm_optimization_core_count = mm_optimization_core_count2;
1223  }
1224  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1225  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1226  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1227  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1228  }
1229 
1230  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1231  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1232  }
1233  else {
1234  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1235  }
1236 
1237 #ifdef HAVE_TPETRA_MMM_TIMINGS
1238  MM = Teuchos::null;
1239  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1240 #endif
1241 
1242  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1243 
1244  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1245  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1246  if (needs_final_export) {
1247  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1248  }
1249  else {
1250  Ctemp = rcp (&C, false);
1251  }
1252 
1253  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1254 
1255  /*************************************************************/
1256  /* 4) exportAndFillComplete matrix */
1257  /*************************************************************/
1258 #ifdef HAVE_TPETRA_MMM_TIMINGS
1259  MM = Teuchos::null;
1260  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1261 #endif
1262 
1263  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1264 
1265  if (needs_final_export) {
1266  ParameterList labelList;
1267  labelList.set("Timer Label", label);
1268  if(!params.is_null()) {
1269  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1270  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1271  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1272  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1273  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1274  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1275  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1276  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1277  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1278 
1279  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1280  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1281  labelList.set("compute global constants",params->get("compute global constants",true));
1282  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1283  }
1284  Ctemp->exportAndFillComplete (Crcp,
1285  *Ctemp->getGraph ()->getExporter (),
1286  B.getDomainMap (),
1287  A.getDomainMap (),
1288  rcp (&labelList, false));
1289  }
1290 #ifdef HAVE_TPETRA_MMM_STATISTICS
1291  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1292 #endif
1293 }
1294 
1295 /*********************************************************************************************************/
1296 // Kernel method for computing the local portion of C = A*B
1297 template<class Scalar,
1298  class LocalOrdinal,
1299  class GlobalOrdinal,
1300  class Node>
1301 void mult_A_B(
1302  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1303  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1304  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1305  const std::string& /* label */,
1306  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1307 {
1308  using Teuchos::Array;
1309  using Teuchos::ArrayRCP;
1310  using Teuchos::ArrayView;
1311  using Teuchos::OrdinalTraits;
1312  using Teuchos::null;
1313 
1314  typedef Teuchos::ScalarTraits<Scalar> STS;
1315  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1316  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1317  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1318 
1319  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1320  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1321 
1322  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1323  ArrayView<const GlobalOrdinal> bcols_import = null;
1324  if (Bview.importColMap != null) {
1325  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1326  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1327 
1328  bcols_import = Bview.importColMap->getNodeElementList();
1329  }
1330 
1331  size_t C_numCols = C_lastCol - C_firstCol +
1332  OrdinalTraits<LocalOrdinal>::one();
1333  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1334  OrdinalTraits<LocalOrdinal>::one();
1335 
1336  if (C_numCols_import > C_numCols)
1337  C_numCols = C_numCols_import;
1338 
1339  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1340  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1341  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1342 
1343  Array<Scalar> C_row_i = dwork;
1344  Array<GlobalOrdinal> C_cols = iwork;
1345  Array<size_t> c_index = iwork2;
1346  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1347  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1348 
1349  size_t C_row_i_length, j, k, last_index;
1350 
1351  // Run through all the hash table lookups once and for all
1352  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1353  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1354  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1355  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1356  // Maps are the same: Use local IDs as the hash
1357  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1358  Aview.colMap->getMaxLocalIndex(); i++)
1359  Acol2Brow[i]=i;
1360  }
1361  else {
1362  // Maps are not the same: Use the map's hash
1363  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1364  Aview.colMap->getMaxLocalIndex(); i++) {
1365  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1366  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1367  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1368  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1369  }
1370  }
1371 
1372  // To form C = A*B we're going to execute this expression:
1373  //
1374  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1375  //
1376  // Our goal, of course, is to navigate the data in A and B once, without
1377  // performing searches for column-indices, etc.
1378  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1379  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1380  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1381  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1382  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1383  ArrayView<const Scalar> Avals, Bvals, Ivals;
1384  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1385  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1386  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1387  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1388  if(!Bview.importMatrix.is_null()) {
1389  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1390  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1391  }
1392 
1393  bool C_filled = C.isFillComplete();
1394 
1395  for (size_t i = 0; i < C_numCols; i++)
1396  c_index[i] = OrdinalTraits<size_t>::invalid();
1397 
1398  // Loop over the rows of A.
1399  size_t Arows = Aview.rowMap->getNodeNumElements();
1400  for(size_t i=0; i<Arows; ++i) {
1401 
1402  // Only navigate the local portion of Aview... which is, thankfully, all of
1403  // A since this routine doesn't do transpose modes
1404  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1405 
1406  // Loop across the i-th row of A and for each corresponding row in B, loop
1407  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1408  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1409  // calculating updates for row i of the result matrix C.
1410  C_row_i_length = OrdinalTraits<size_t>::zero();
1411 
1412  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1413  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1414  const Scalar Aval = Avals[k];
1415  if (Aval == STS::zero())
1416  continue;
1417 
1418  if (Ak == LO_INVALID)
1419  continue;
1420 
1421  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1422  LocalOrdinal col = Bcolind[j];
1423  //assert(col >= 0 && col < C_numCols);
1424 
1425  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1426  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1427  // This has to be a += so insertGlobalValue goes out
1428  C_row_i[C_row_i_length] = Aval*Bvals[j];
1429  C_cols[C_row_i_length] = col;
1430  c_index[col] = C_row_i_length;
1431  C_row_i_length++;
1432 
1433  } else {
1434  C_row_i[c_index[col]] += Aval*Bvals[j];
1435  }
1436  }
1437  }
1438 
1439  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1440  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1441  C_cols[ii] = bcols[C_cols[ii]];
1442  combined_index[ii] = C_cols[ii];
1443  combined_values[ii] = C_row_i[ii];
1444  }
1445  last_index = C_row_i_length;
1446 
1447  //
1448  //Now put the C_row_i values into C.
1449  //
1450  // We might have to revamp this later.
1451  C_row_i_length = OrdinalTraits<size_t>::zero();
1452 
1453  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1454  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1455  const Scalar Aval = Avals[k];
1456  if (Aval == STS::zero())
1457  continue;
1458 
1459  if (Ak!=LO_INVALID) continue;
1460 
1461  Ak = Acol2Irow[Acolind[k]];
1462  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1463  LocalOrdinal col = Icolind[j];
1464  //assert(col >= 0 && col < C_numCols);
1465 
1466  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1467  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1468  // This has to be a += so insertGlobalValue goes out
1469  C_row_i[C_row_i_length] = Aval*Ivals[j];
1470  C_cols[C_row_i_length] = col;
1471  c_index[col] = C_row_i_length;
1472  C_row_i_length++;
1473 
1474  } else {
1475  // This has to be a += so insertGlobalValue goes out
1476  C_row_i[c_index[col]] += Aval*Ivals[j];
1477  }
1478  }
1479  }
1480 
1481  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1482  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1483  C_cols[ii] = bcols_import[C_cols[ii]];
1484  combined_index[last_index] = C_cols[ii];
1485  combined_values[last_index] = C_row_i[ii];
1486  last_index++;
1487  }
1488 
1489  // Now put the C_row_i values into C.
1490  // We might have to revamp this later.
1491  C_filled ?
1492  C.sumIntoGlobalValues(
1493  global_row,
1494  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1495  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1496  :
1497  C.insertGlobalValues(
1498  global_row,
1499  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1500  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1501 
1502  }
1503 }
1504 
1505 /*********************************************************************************************************/
1506 template<class Scalar,
1507  class LocalOrdinal,
1508  class GlobalOrdinal,
1509  class Node>
1510 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1511  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1512  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1513 
1514  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1515  Mview.maxNumRowEntries = Mview.indices[0].size();
1516 
1517  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1518  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1519  Mview.maxNumRowEntries = Mview.indices[i].size();
1520  }
1521 }
1522 
/*********************************************************************************************************/
// Heuristic estimate of the nonzero count of C = A*B.
// Follows the NZ estimate in ML's ml_matmatmult.c: estimate the average
// entries-per-row of A and B (default 100 when a matrix is degenerate),
// combine them as (sqrt(Aest) + sqrt(Best) - 1)^2 entries per row of C,
// and scale by 0.75 * (local rows of A) plus a constant cushion.
template<class CrsMatrixType>
size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
  const size_t numRowsA = A.getNodeNumRows();
  const size_t numRowsB = B.getNodeNumRows();

  size_t Aest = 100;
  if (A.getNodeNumEntries() >= numRowsA)
    Aest = (numRowsA > 0) ? A.getNodeNumEntries()/numRowsA : 100;

  size_t Best = 100;
  if (B.getNodeNumEntries() >= numRowsB)
    Best = (numRowsB > 0) ? B.getNodeNumEntries()/numRowsB : 100;

  size_t nnzperrow = (size_t)(std::sqrt((double)Aest) + std::sqrt((double)Best) - 1);
  nnzperrow *= nnzperrow;

  return (size_t)(numRowsA*nnzperrow*0.75 + 100);
}
1538 
1539 
1540 /*********************************************************************************************************/
1541 // Kernel method for computing the local portion of C = A*B
1542 //
1543 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1544 // function, so this is probably the function we want to
1545 // thread-parallelize.
1546 template<class Scalar,
1547  class LocalOrdinal,
1548  class GlobalOrdinal,
1549  class Node>
1550 void mult_A_B_newmatrix(
1551  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1552  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1553  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1554  const std::string& label,
1555  const Teuchos::RCP<Teuchos::ParameterList>& params)
1556 {
1557  using Teuchos::Array;
1558  using Teuchos::ArrayRCP;
1559  using Teuchos::ArrayView;
1560  using Teuchos::RCP;
1561  using Teuchos::rcp;
1562 
1563  // Tpetra typedefs
1564  typedef LocalOrdinal LO;
1565  typedef GlobalOrdinal GO;
1566  typedef Node NO;
1567  typedef Import<LO,GO,NO> import_type;
1568  typedef Map<LO,GO,NO> map_type;
1569 
1570  // Kokkos typedefs
1571  typedef typename map_type::local_map_type local_map_type;
1573  typedef typename KCRS::StaticCrsGraphType graph_t;
1574  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1575  typedef typename NO::execution_space execution_space;
1576  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1577  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1578 
1579 #ifdef HAVE_TPETRA_MMM_TIMINGS
1580  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1581  using Teuchos::TimeMonitor;
1582  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1583 #endif
1584  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1585 
1586  // Build the final importer / column map, hash table lookups for C
1587  RCP<const import_type> Cimport;
1588  RCP<const map_type> Ccolmap;
1589  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1590  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1591  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1592  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1593  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1594  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1595  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1596  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1597 
1598  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1599  // indices of B, to local column indices of C. (B and C have the
1600  // same number of columns.) The kernel uses this, instead of
1601  // copying the entire input matrix B and converting its column
1602  // indices to those of C.
1603  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1604 
1605  if (Bview.importMatrix.is_null()) {
1606  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1607  Cimport = Bimport;
1608  Ccolmap = Bview.colMap;
1609  const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1610  // Bcol2Ccol is trivial
1611  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1612  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1613  KOKKOS_LAMBDA(const LO i) {
1614  Bcol2Ccol(i) = i;
1615  });
1616  }
1617  else {
1618  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1619  // column Map of C, as well as C's Import object (from its domain
1620  // Map to its column Map). C's column Map is the union of the
1621  // column Maps of (the local part of) B, and the "remote" part of
1622  // B. Ditto for the Import. We have optimized this "setUnion"
1623  // operation on Import objects and Maps.
1624 
1625  // Choose the right variant of setUnion
1626  if (!Bimport.is_null() && !Iimport.is_null()) {
1627  Cimport = Bimport->setUnion(*Iimport);
1628  }
1629  else if (!Bimport.is_null() && Iimport.is_null()) {
1630  Cimport = Bimport->setUnion();
1631  }
1632  else if (Bimport.is_null() && !Iimport.is_null()) {
1633  Cimport = Iimport->setUnion();
1634  }
1635  else {
1636  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1637  }
1638  Ccolmap = Cimport->getTargetMap();
1639 
1640  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1641  // in general. We should get rid of it in order to reduce
1642  // communication costs of sparse matrix-matrix multiply.
1643  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1644  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1645 
1646  // NOTE: This is not efficient and should be folded into setUnion
1647  //
1648  // mfh 27 Sep 2016: What the above comment means, is that the
1649  // setUnion operation on Import objects could also compute these
1650  // local index - to - local index look-up tables.
1651  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1652  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1653  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1654  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1655  });
1656  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1657  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1658  });
1659 
1660  }
1661 
1662  // Replace the column map
1663  //
1664  // mfh 27 Sep 2016: We do this because C was originally created
1665  // without a column Map. Now we have its column Map.
1666  C.replaceColMap(Ccolmap);
1667 
1668  // mfh 27 Sep 2016: Construct tables that map from local column
1669  // indices of A, to local row indices of either B_local (the locally
1670  // owned part of B), or B_remote (the "imported" remote part of B).
1671  //
1672  // For column index Aik in row i of A, if the corresponding row of B
1673  // exists in the local part of B ("orig") (which I'll call B_local),
1674  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1675  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1676  //
1677  // For column index Aik in row i of A, if the corresponding row of B
1678  // exists in the remote part of B ("Import") (which I'll call
1679  // B_remote), then targetMapToImportRow[Aik] is the local index of
1680  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1681  // (a flag value).
1682 
1683  // Run through all the hash table lookups once and for all
1684  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1685  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1686 
1687  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1688  GO aidx = Acolmap_local.getGlobalElement(i);
1689  LO B_LID = Browmap_local.getLocalElement(aidx);
1690  if (B_LID != LO_INVALID) {
1691  targetMapToOrigRow(i) = B_LID;
1692  targetMapToImportRow(i) = LO_INVALID;
1693  } else {
1694  LO I_LID = Irowmap_local.getLocalElement(aidx);
1695  targetMapToOrigRow(i) = LO_INVALID;
1696  targetMapToImportRow(i) = I_LID;
1697 
1698  }
1699  });
1700 
1701  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1702  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1703  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1704 
1705 }
1706 
1707 
/*********************************************************************************************************/
// AB NewMatrix Kernel wrappers (Default non-threaded version)
//
// Serial core kernel for C := A * (B_local + B_remote).  Assembles C's
// local CSR arrays from scratch (growing them as needed), optionally
// sorts the entries, and finishes C with expertStaticFillComplete using
// the precomputed Cimport.  The four LocalOrdinalViewType tables map
// A's column indices to rows of B_local/B_remote, and B/I column
// indices to C column indices; they are built by mult_A_B_newmatrix.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                                                    const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                                                    const LocalOrdinalViewType & targetMapToImportRow,
                                                                                                                    const LocalOrdinalViewType & Bcol2Ccol,
                                                                                                                    const LocalOrdinalViewType & Icol2Ccol,
                                                                                                                    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                                                    Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
                                                                                                                    const std::string& label,
                                                                                                                    const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // NOTE(review): LO's invalid() (typically -1 for signed LO) sign-extends to
  // SIZE_MAX here, matching the size_t INVALID sentinel used below — TODO
  // confirm this assumption holds if LO is ever an unsigned type.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();
  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;

  // B_remote ("I" = imported) arrays; left empty when B has no remote rows.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
    // The per-row allocation heuristic must cover the widest row of either part of B.
    b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
#endif

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
  // The method loops over rows of A, and may resize after processing
  // each row.  Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);

  // mfh 27 Sep 2016: The c_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  std::vector<size_t> c_status(n, ST_INVALID);

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine.  The routine computes C := A * (B_local + B_remote).
  //
  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
  // you whether the corresponding row of B belongs to B_local
  // ("orig") or B_remote ("Import").

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix A
    // on the calling process.
    Crowptr[i] = CSR_ip;

    // mfh 27 Sep 2016: For each entry of A in the current row of A
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik  = Acolind[k]; // local column index of current entry of A
      const SC Aval = Avals[k];   // value of current entry of A
      if (Aval == SC_ZERO)
        continue; // skip explicitly stored zero values in A

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
        // corresponding to the current entry of A is populated, then
        // the corresponding row of B is in B_local (i.e., it lives on
        // the calling process).

        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        // mfh 27 Sep 2016: Go through all entries in that row of B_local.
        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          // c_status < OLD_ip means the slot is stale (belongs to a prior row of C).
          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = Aval*Bvals[j];
            CSR_ip++;

          } else {
            Cvals[c_status[Cij]] += Aval*Bvals[j];
          }
        }

      } else {
        // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
        // corresponding to the current entry of A NOT populated (has
        // a flag "invalid" value), then the corresponding row of B is
        // in B_local (i.e., it lives on the calling process).

        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = Aval*Ivals[j];
            CSR_ip++;
          } else {
            Cvals[c_status[Cij]] += Aval*Ivals[j];
          }
        }
      }
    }

    // Resize for next pass if needed: the worst case for row i+1 is one C
    // entry per (A entry in row i+1) x (widest row of B), capped at n.
    if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind,CSR_alloc);
      Kokkos::resize(Cvals,CSR_alloc);
    }
    OLD_ip = CSR_ip;
  }

  Crowptr[m] = CSR_ip;

  // Downward resize
  Kokkos::resize(Ccolind,CSR_ip);
  Kokkos::resize(Cvals,CSR_ip);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  {
  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
#endif

  // Final sort & set of CRS arrays
  if (params.is_null() || params->get("sort entries",true))
    Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
  C.setAllValues(Crowptr,Ccolind, Cvals);


#ifdef HAVE_TPETRA_MMM_TIMINGS
  }
  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
  {
#endif

  // Final FillComplete
  //
  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
  // Import (from domain Map to column Map) construction (which costs
  // lots of communication) by taking the previously constructed
  // Import object.  We should be able to do this without interfering
  // with the implementation of the local part of sparse matrix-matrix
  // multply above.
  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
  labelList->set("Timer Label",label);
  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
  RCP<const Export<LO,GO,NO> > dummyExport;
  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
#ifdef HAVE_TPETRA_MMM_TIMINGS
  }
  MM2 = Teuchos::null;
#endif

}
1929 
1930 
1931 
1932 
1933 /*********************************************************************************************************/
1934 // Kernel method for computing the local portion of C = A*B (reuse)
1935 template<class Scalar,
1936  class LocalOrdinal,
1937  class GlobalOrdinal,
1938  class Node>
1939 void mult_A_B_reuse(
1940  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1941  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1942  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1943  const std::string& label,
1944  const Teuchos::RCP<Teuchos::ParameterList>& params)
1945 {
1946  using Teuchos::Array;
1947  using Teuchos::ArrayRCP;
1948  using Teuchos::ArrayView;
1949  using Teuchos::RCP;
1950  using Teuchos::rcp;
1951 
1952  // Tpetra typedefs
1953  typedef LocalOrdinal LO;
1954  typedef GlobalOrdinal GO;
1955  typedef Node NO;
1956  typedef Import<LO,GO,NO> import_type;
1957  typedef Map<LO,GO,NO> map_type;
1958 
1959  // Kokkos typedefs
1960  typedef typename map_type::local_map_type local_map_type;
1962  typedef typename KCRS::StaticCrsGraphType graph_t;
1963  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1964  typedef typename NO::execution_space execution_space;
1965  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1966  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1967 
1968 #ifdef HAVE_TPETRA_MMM_TIMINGS
1969  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1970  using Teuchos::TimeMonitor;
1971  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1972 #endif
1973  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1974 
1975  // Grab all the maps
1976  RCP<const import_type> Cimport = C.getGraph()->getImporter();
1977  RCP<const map_type> Ccolmap = C.getColMap();
1978  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1979  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1980  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1981  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1982  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1983  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1984 
1985  // Build the final importer / column map, hash table lookups for C
1986  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1987  {
1988  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1989  // So, column map of C may be a strict subset of the column map of B
1990  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1991  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1992  });
1993 
1994  if (!Bview.importMatrix.is_null()) {
1995  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1996  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1997 
1998  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1999  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2000  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2001  });
2002  }
2003  }
2004 
2005  // Run through all the hash table lookups once and for all
2006  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2007  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2008  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2009  GO aidx = Acolmap_local.getGlobalElement(i);
2010  LO B_LID = Browmap_local.getLocalElement(aidx);
2011  if (B_LID != LO_INVALID) {
2012  targetMapToOrigRow(i) = B_LID;
2013  targetMapToImportRow(i) = LO_INVALID;
2014  } else {
2015  LO I_LID = Irowmap_local.getLocalElement(aidx);
2016  targetMapToOrigRow(i) = LO_INVALID;
2017  targetMapToImportRow(i) = I_LID;
2018 
2019  }
2020  });
2021 
2022  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2023  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2024  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2025 }
2026 
/*********************************************************************************************************/
// Serial core kernel for the reuse case of C := A * (B_local + B_remote).
// C's graph is static (built by a previous newmatrix multiply); this
// routine zeroes each row of C's values and accumulates the new products
// into the existing slots, throwing if the product would need an entry
// that is not already in C's graph.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                                                                                const LocalOrdinalViewType & targetMapToOrigRow,
                                                                                                                const LocalOrdinalViewType & targetMapToImportRow,
                                                                                                                const LocalOrdinalViewType & Bcol2Ccol,
                                                                                                                const LocalOrdinalViewType & Icol2Ccol,
                                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                                                                                Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
                                                                                                                const std::string& label,
                                                                                                                const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#else
  (void)label;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;


  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // NOTE(review): LO's invalid() (typically -1 for signed LO) sign-extends to
  // SIZE_MAX when stored into size_t — TODO confirm if LO is ever unsigned.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = C.getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  // B_remote ("I" = imported) arrays; left empty when B has no remote rows.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals = lclB.values;
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
#endif

  // Classic csr assembly (low memory edition)
  // mfh 27 Sep 2016: The c_status array is an implementation detail
  // of the local sparse matrix-matrix multiply routine.

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {
    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik  = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue; // skip explicitly stored zeros in A

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          // A slot outside [OLD_ip, CSR_ip) means this entry is not in C's
          // static graph: the sparsity pattern changed since newmatrix.
          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
            "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

          Cvals[c_status[Cij]] += Aval * Bvals[j];
        }

      } else {
        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
            "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

          Cvals[c_status[Cij]] += Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
#endif

  C.fillComplete(C.getDomainMap(), C.getRangeMap());
}
2169 
2170 
2171 /*********************************************************************************************************/
2172 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2173 template<class Scalar,
2174  class LocalOrdinal,
2175  class GlobalOrdinal,
2176  class Node>
2177 void jacobi_A_B_newmatrix(
2178  Scalar omega,
2179  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2180  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2181  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2182  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2183  const std::string& label,
2184  const Teuchos::RCP<Teuchos::ParameterList>& params)
2185 {
2186  using Teuchos::Array;
2187  using Teuchos::ArrayRCP;
2188  using Teuchos::ArrayView;
2189  using Teuchos::RCP;
2190  using Teuchos::rcp;
2191  // typedef Scalar SC;
2192  typedef LocalOrdinal LO;
2193  typedef GlobalOrdinal GO;
2194  typedef Node NO;
2195 
2196  typedef Import<LO,GO,NO> import_type;
2197  typedef Map<LO,GO,NO> map_type;
2198  typedef typename map_type::local_map_type local_map_type;
2199 
2200  // All of the Kokkos typedefs
2202  typedef typename KCRS::StaticCrsGraphType graph_t;
2203  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2204  typedef typename NO::execution_space execution_space;
2205  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2206  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2207 
2208 
2209 #ifdef HAVE_TPETRA_MMM_TIMINGS
2210  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2211  using Teuchos::TimeMonitor;
2212  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2213 #endif
2214  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2215 
2216  // Build the final importer / column map, hash table lookups for C
2217  RCP<const import_type> Cimport;
2218  RCP<const map_type> Ccolmap;
2219  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2220  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2221  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2222  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2223  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2224  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2225  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2226  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2227 
2228  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2229  // indices of B, to local column indices of C. (B and C have the
2230  // same number of columns.) The kernel uses this, instead of
2231  // copying the entire input matrix B and converting its column
2232  // indices to those of C.
2233  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2234 
2235  if (Bview.importMatrix.is_null()) {
2236  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2237  Cimport = Bimport;
2238  Ccolmap = Bview.colMap;
2239  // Bcol2Ccol is trivial
2240  // Bcol2Ccol is trivial
2241 
2242  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2243  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2244  Bcol2Ccol(i) = static_cast<LO> (i);
2245  });
2246  } else {
2247  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2248  // column Map of C, as well as C's Import object (from its domain
2249  // Map to its column Map). C's column Map is the union of the
2250  // column Maps of (the local part of) B, and the "remote" part of
2251  // B. Ditto for the Import. We have optimized this "setUnion"
2252  // operation on Import objects and Maps.
2253 
2254  // Choose the right variant of setUnion
2255  if (!Bimport.is_null() && !Iimport.is_null()){
2256  Cimport = Bimport->setUnion(*Iimport);
2257  Ccolmap = Cimport->getTargetMap();
2258 
2259  } else if (!Bimport.is_null() && Iimport.is_null()) {
2260  Cimport = Bimport->setUnion();
2261 
2262  } else if(Bimport.is_null() && !Iimport.is_null()) {
2263  Cimport = Iimport->setUnion();
2264 
2265  } else
2266  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2267 
2268  Ccolmap = Cimport->getTargetMap();
2269 
2270  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2271  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2272 
2273  // NOTE: This is not efficient and should be folded into setUnion
2274  //
2275  // mfh 27 Sep 2016: What the above comment means, is that the
2276  // setUnion operation on Import objects could also compute these
2277  // local index - to - local index look-up tables.
2278  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2279  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2280  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2281  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2282  });
2283  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2284  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2285  });
2286 
2287  }
2288 
2289  // Replace the column map
2290  //
2291  // mfh 27 Sep 2016: We do this because C was originally created
2292  // without a column Map. Now we have its column Map.
2293  C.replaceColMap(Ccolmap);
2294 
2295  // mfh 27 Sep 2016: Construct tables that map from local column
2296  // indices of A, to local row indices of either B_local (the locally
2297  // owned part of B), or B_remote (the "imported" remote part of B).
2298  //
2299  // For column index Aik in row i of A, if the corresponding row of B
2300  // exists in the local part of B ("orig") (which I'll call B_local),
2301  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2302  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2303  //
2304  // For column index Aik in row i of A, if the corresponding row of B
2305  // exists in the remote part of B ("Import") (which I'll call
2306  // B_remote), then targetMapToImportRow[Aik] is the local index of
2307  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2308  // (a flag value).
2309 
2310  // Run through all the hash table lookups once and for all
2311  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2312  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2313  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2314  GO aidx = Acolmap_local.getGlobalElement(i);
2315  LO B_LID = Browmap_local.getLocalElement(aidx);
2316  if (B_LID != LO_INVALID) {
2317  targetMapToOrigRow(i) = B_LID;
2318  targetMapToImportRow(i) = LO_INVALID;
2319  } else {
2320  LO I_LID = Irowmap_local.getLocalElement(aidx);
2321  targetMapToOrigRow(i) = LO_INVALID;
2322  targetMapToImportRow(i) = I_LID;
2323 
2324  }
2325  });
2326 
2327  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2328  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2329  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2330 
2331 }
2332 
2333 
/*********************************************************************************************************/
// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
//
// Serial CSR assembly: for each row i, the entries of row i of B are copied
// into C first, then -omega * Dinv(i) * A(i,:) * B is accumulated on top of
// them.  Rows of B are drawn either from the locally owned matrix or from the
// imported ("remote") part, as selected by the caller-built lookup tables
// targetMapToOrigRow / targetMapToImportRow.

template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
    const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
    const LocalOrdinalViewType & targetMapToOrigRow,
    const LocalOrdinalViewType & targetMapToImportRow,
    const LocalOrdinalViewType & Bcol2Ccol,
    const LocalOrdinalViewType & Icol2Ccol,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
    Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& params) {

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  // NOTE(review): 'auto' deduces Teuchos::Time here, not TimeMonitor, so this
  // copy may not actually time anything; "Nemwmatrix" also looks like a typo
  // in the timer label.  Both are user-visible timer artifacts -- confirm
  // before changing.
  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
#endif

  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;

  // Jacobi-specific
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;

  typedef Map<LO,GO,NO> map_type;
  // ST_INVALID is the LO flag value widened to size_t; it fills c_status below.
  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

  // Sizes: m rows of A/C, n columns of C, and B's max row length for the
  // reallocation heuristic in the main loop.
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();
  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;

  // Views of the imported ("remote") part of B, if any; left empty otherwise.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals   = lclB.values;
    // The remote part can have longer rows than the local part.
    b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
  }

  // Jacobi-specific inner stuff: read-only host view of the inverse diagonal.
  auto Dvals =
    Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);

  // Teuchos::ArrayView::operator[].
  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet.
  //   c_status[i] >= CSR_ip, this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
  Array<size_t> c_status(n, ST_INVALID);

  // Classic csr assembly (low memory edition)
  //
  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
  // The method loops over rows of A, and may resize after processing
  // each row.  Chris Siefert says that this reflects experience in
  // ML; for the non-threaded case, ML found it faster to spend less
  // effort on estimation and risk an occasional reallocation.
  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
  // CSR_ip: next free slot in Ccolind/Cvals; OLD_ip: start of the current row.
  size_t CSR_ip = 0, OLD_ip = 0;

  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
  // routine.  The routine computes
  //
  //   C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
  //
  // This corresponds to one sweep of (weighted) Jacobi.
  //
  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
  // you whether the corresponding row of B belongs to B_local
  // ("orig") or B_remote ("Import").

  // For each row of A/C
  for (size_t i = 0; i < m; i++) {
    // mfh 27 Sep 2016: m is the number of rows in the input matrix A
    // on the calling process.
    Crowptr[i] = CSR_ip;
    // Scale factor for this row's A*B contribution.
    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B: the identity part of (I - omega D^{-1} A) copies row i of
    // B straight into row i of C.
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // Assume no repeated entries in B
      c_status[Cij]   = CSR_ip;
      Ccolind[CSR_ip] = Cij;
      Cvals[CSR_ip]   = Bvals[j];
      CSR_ip++;
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          // c_status < OLD_ip means the slot belongs to an earlier row,
          // so column Cij has not been seen in THIS row yet.
          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = minusOmegaDval* Aval * Bvals[j];
            CSR_ip++;

          } else {
            // Already present in this row; accumulate in place.
            Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
          }
        }

      } else {
        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
            // New entry
            c_status[Cij]   = CSR_ip;
            Ccolind[CSR_ip] = Cij;
            Cvals[CSR_ip]   = minusOmegaDval* Aval * Ivals[j];
            CSR_ip++;
          } else {
            Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
          }
        }
      }
    }

    // Resize for next pass if needed: the worst case for row i+1 is
    // (nnz in A's row i+1, plus 1 for the B copy) * B's max row length,
    // capped at n (a row cannot exceed the number of columns of C).
    if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
      CSR_alloc *= 2;
      Kokkos::resize(Ccolind,CSR_alloc);
      Kokkos::resize(Cvals,CSR_alloc);
    }
    OLD_ip = CSR_ip;
  }
  Crowptr[m] = CSR_ip;

  // Downward resize: trim the over-allocation to the actual nnz count.
  Kokkos::resize(Ccolind,CSR_ip);
  Kokkos::resize(Cvals,CSR_ip);

  {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
#endif

    // Replace the column map
    //
    // mfh 27 Sep 2016: We do this because C was originally created
    // without a column Map.  Now we have its column Map.
    C.replaceColMap(Ccolmap);

    // Final sort & set of CRS arrays
    //
    // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
    // matrix-matrix multiply routine sort the entries for us?
    if (params.is_null() || params->get("sort entries",true))
      Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
    C.setAllValues(Crowptr,Ccolind, Cvals);
  }
  {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
#endif

    // Final FillComplete
    //
    // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
    // Import (from domain Map to column Map) construction (which costs
    // lots of communication) by taking the previously constructed
    // Import object.  We should be able to do this without interfering
    // with the implementation of the local part of sparse matrix-matrix
    // multply above
    RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
    labelList->set("Timer Label",label);
    if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
    RCP<const Export<LO,GO,NO> > dummyExport;
    C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);

  }
}
2576 
2577 
2578 /*********************************************************************************************************/
2579 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2580 template<class Scalar,
2581  class LocalOrdinal,
2582  class GlobalOrdinal,
2583  class Node>
2584 void jacobi_A_B_reuse(
2585  Scalar omega,
2586  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2587  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2588  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2589  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2590  const std::string& label,
2591  const Teuchos::RCP<Teuchos::ParameterList>& params)
2592 {
2593  using Teuchos::Array;
2594  using Teuchos::ArrayRCP;
2595  using Teuchos::ArrayView;
2596  using Teuchos::RCP;
2597  using Teuchos::rcp;
2598 
2599  typedef LocalOrdinal LO;
2600  typedef GlobalOrdinal GO;
2601  typedef Node NO;
2602 
2603  typedef Import<LO,GO,NO> import_type;
2604  typedef Map<LO,GO,NO> map_type;
2605 
2606  // Kokkos typedefs
2607  typedef typename map_type::local_map_type local_map_type;
2609  typedef typename KCRS::StaticCrsGraphType graph_t;
2610  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2611  typedef typename NO::execution_space execution_space;
2612  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2613  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2614 
2615 #ifdef HAVE_TPETRA_MMM_TIMINGS
2616  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2617  using Teuchos::TimeMonitor;
2618  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2619 #endif
2620  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2621 
2622  // Grab all the maps
2623  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2624  RCP<const map_type> Ccolmap = C.getColMap();
2625  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2626  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2627  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2628  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2629  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2630  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2631 
2632  // Build the final importer / column map, hash table lookups for C
2633  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2634  {
2635  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2636  // So, column map of C may be a strict subset of the column map of B
2637  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2638  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2639  });
2640 
2641  if (!Bview.importMatrix.is_null()) {
2642  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2643  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2644 
2645  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2646  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2647  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2648  });
2649  }
2650  }
2651 
2652  // Run through all the hash table lookups once and for all
2653  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2654  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2655  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2656  GO aidx = Acolmap_local.getGlobalElement(i);
2657  LO B_LID = Browmap_local.getLocalElement(aidx);
2658  if (B_LID != LO_INVALID) {
2659  targetMapToOrigRow(i) = B_LID;
2660  targetMapToImportRow(i) = LO_INVALID;
2661  } else {
2662  LO I_LID = Irowmap_local.getLocalElement(aidx);
2663  targetMapToOrigRow(i) = LO_INVALID;
2664  targetMapToImportRow(i) = I_LID;
2665 
2666  }
2667  });
2668 
2669 #ifdef HAVE_TPETRA_MMM_TIMINGS
2670  MM = Teuchos::null;
2671 #endif
2672 
2673  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2674  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2675  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2676 }
2677 
2678 
2679 
/*********************************************************************************************************/
// Jacobi AB Reuse Kernel wrapper (Default non-threaded version).
//
// Recomputes the VALUES of C = (I-omega D^{-1} A)*B in place, assuming C's
// sparsity pattern (graph) is static and was produced by a prior "newmatrix"
// call.  Any attempt to generate an entry outside C's existing pattern is an
// error and throws.

template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
    const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
    const LocalOrdinalViewType & targetMapToOrigRow,
    const LocalOrdinalViewType & targetMapToImportRow,
    const LocalOrdinalViewType & Bcol2Ccol,
    const LocalOrdinalViewType & Icol2Ccol,
    CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
    Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
    const std::string& label,
    const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#else
  (void)label;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar SC;
  typedef LocalOrdinal LO;
  typedef GlobalOrdinal GO;
  typedef Node NO;
  typedef Map<LO,GO,NO> map_type;
  // ST_INVALID is the LO flag value widened to size_t; it fills c_status below.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes: m rows of A/C, n columns of C.
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = C.getLocalMatrixHost();

  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  // C's values are overwritten in place; its graph stays fixed.
  scalar_view_t Cvals = Cmat.values;

  // Views of the imported ("remote") part of B, if any; left empty otherwise.
  c_lno_view_t Irowptr;
  lno_nnz_view_t Icolind;
  scalar_view_t Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals   = lclB.values;
  }

  // Jacobi-specific inner stuff: read-only host view of the inverse diagonal.
  auto Dvals =
    Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
#endif

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {

    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row.  [OLD_ip, CSR_ip) is row i's slot
    // range in C's static CSR arrays.
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    // Scale factor for this row's A*B contribution.
    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B: the identity part of (I - omega D^{-1} A) writes row i of
    // B into the matching slots of row i of C.
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // A stale c_status entry (outside [OLD_ip, CSR_ip)) means column Cij is
      // not in C's static pattern for this row -- that is a hard error.
      TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
        std::runtime_error, "Trying to insert a new entry into a static graph");

      Cvals[c_status[Cij]] = Bvals[j];
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
        }

      } else {
        // Remote matrix
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
            std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
#endif

  // Pattern is unchanged, so a plain fillComplete (no new Import) suffices.
  C.fillComplete(C.getDomainMap(), C.getRangeMap());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM = Teuchos::null;
#endif

}
2846 
2847 
2848 
2849 /*********************************************************************************************************/
2850 template<class Scalar,
2851  class LocalOrdinal,
2852  class GlobalOrdinal,
2853  class Node>
2854 void import_and_extract_views(
2855  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2856  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2857  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2858  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2859  bool userAssertsThereAreNoRemotes,
2860  const std::string& label,
2861  const Teuchos::RCP<Teuchos::ParameterList>& params)
2862 {
2863  using Teuchos::Array;
2864  using Teuchos::ArrayView;
2865  using Teuchos::RCP;
2866  using Teuchos::rcp;
2867  using Teuchos::null;
2868 
2869  typedef Scalar SC;
2870  typedef LocalOrdinal LO;
2871  typedef GlobalOrdinal GO;
2872  typedef Node NO;
2873 
2874  typedef Map<LO,GO,NO> map_type;
2875  typedef Import<LO,GO,NO> import_type;
2876  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2877 
2878 #ifdef HAVE_TPETRA_MMM_TIMINGS
2879  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2880  using Teuchos::TimeMonitor;
2881  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2882 #endif
2883  // The goal of this method is to populate the 'Aview' struct with views of the
2884  // rows of A, including all rows that correspond to elements in 'targetMap'.
2885  //
2886  // If targetMap includes local elements that correspond to remotely-owned rows
2887  // of A, then those remotely-owned rows will be imported into
2888  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2889  Aview.deleteContents();
2890 
2891  Aview.origMatrix = rcp(&A, false);
2892  Aview.origRowMap = A.getRowMap();
2893  Aview.rowMap = targetMap;
2894  Aview.colMap = A.getColMap();
2895  Aview.domainMap = A.getDomainMap();
2896  Aview.importColMap = null;
2897  RCP<const map_type> rowMap = A.getRowMap();
2898  const int numProcs = rowMap->getComm()->getSize();
2899 
2900  // Short circuit if the user swears there are no remotes (or if we're in serial)
2901  if (userAssertsThereAreNoRemotes || numProcs < 2)
2902  return;
2903 
2904  RCP<const import_type> importer;
2905  if (params != null && params->isParameter("importer")) {
2906  importer = params->get<RCP<const import_type> >("importer");
2907 
2908  } else {
2909 #ifdef HAVE_TPETRA_MMM_TIMINGS
2910  MM = Teuchos::null;
2911  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2912 #endif
2913 
2914  // Mark each row in targetMap as local or remote, and go ahead and get a view
2915  // for the local rows
2916  RCP<const map_type> remoteRowMap;
2917  size_t numRemote = 0;
2918  int mode = 0;
2919  if (!prototypeImporter.is_null() &&
2920  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2921  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2922  // We have a valid prototype importer --- ask it for the remotes
2923 #ifdef HAVE_TPETRA_MMM_TIMINGS
2924  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
2925 #endif
2926  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2927  numRemote = prototypeImporter->getNumRemoteIDs();
2928 
2929  Array<GO> remoteRows(numRemote);
2930  for (size_t i = 0; i < numRemote; i++)
2931  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2932 
2933  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2934  rowMap->getIndexBase(), rowMap->getComm()));
2935  mode = 1;
2936 
2937  } else if (prototypeImporter.is_null()) {
2938  // No prototype importer --- count the remotes the hard way
2939 #ifdef HAVE_TPETRA_MMM_TIMINGS
2940  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
2941 #endif
2942  ArrayView<const GO> rows = targetMap->getNodeElementList();
2943  size_t numRows = targetMap->getNodeNumElements();
2944 
2945  Array<GO> remoteRows(numRows);
2946  for(size_t i = 0; i < numRows; ++i) {
2947  const LO mlid = rowMap->getLocalElement(rows[i]);
2948 
2949  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2950  remoteRows[numRemote++] = rows[i];
2951  }
2952  remoteRows.resize(numRemote);
2953  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2954  rowMap->getIndexBase(), rowMap->getComm()));
2955  mode = 2;
2956 
2957  } else {
2958  // PrototypeImporter is bad. But if we're in serial that's OK.
2959  mode = 3;
2960  }
2961 
2962  if (numProcs < 2) {
2963  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2964  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2965  // If only one processor we don't need to import any remote rows, so return.
2966  return;
2967  }
2968 
2969  //
2970  // Now we will import the needed remote rows of A, if the global maximum
2971  // value of numRemote is greater than 0.
2972  //
2973 #ifdef HAVE_TPETRA_MMM_TIMINGS
2974  MM = Teuchos::null;
2975  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2976 #endif
2977 
2978  global_size_t globalMaxNumRemote = 0;
2979  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2980 
2981  if (globalMaxNumRemote > 0) {
2982 #ifdef HAVE_TPETRA_MMM_TIMINGS
2983  MM = Teuchos::null;
2984  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2985 #endif
2986  // Create an importer with target-map remoteRowMap and source-map rowMap.
2987  if (mode == 1)
2988  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2989  else if (mode == 2)
2990  importer = rcp(new import_type(rowMap, remoteRowMap));
2991  else
2992  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2993  }
2994 
2995  if (params != null)
2996  params->set("importer", importer);
2997  }
2998 
2999  if (importer != null) {
3000 #ifdef HAVE_TPETRA_MMM_TIMINGS
3001  MM = Teuchos::null;
3002  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3003 #endif
3004 
3005  // Now create a new matrix into which we can import the remote rows of A that we need.
3006  Teuchos::ParameterList labelList;
3007  labelList.set("Timer Label", label);
3008  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3009 
3010  bool isMM = true;
3011  bool overrideAllreduce = false;
3012  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3013  // Minor speedup tweak - avoid computing the global constants
3014  Teuchos::ParameterList params_sublist;
3015  if(!params.is_null()) {
3016  labelList.set("compute global constants", params->get("compute global constants",false));
3017  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3018  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3019  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3020  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3021  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3022  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3023  }
3024  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3025  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3026  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3027 
3028  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3029  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3030 
3031 #if 0
3032  // Disabled code for dumping input matrices
3033  static int count=0;
3034  char str[80];
3035  sprintf(str,"import_matrix.%d.dat",count);
3037  count++;
3038 #endif
3039 
3040 #ifdef HAVE_TPETRA_MMM_STATISTICS
3041  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3042 #endif
3043 
3044 
3045 #ifdef HAVE_TPETRA_MMM_TIMINGS
3046  MM = Teuchos::null;
3047  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3048 #endif
3049 
3050  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3051  Aview.importColMap = Aview.importMatrix->getColMap();
3052 #ifdef HAVE_TPETRA_MMM_TIMINGS
3053  MM = Teuchos::null;
3054 #endif
3055  }
3056 }
3057 
3058 
3059 
3060 
3061 
3062 /*********************************************************************************************************/
3063  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
// Merges the local pieces of B (Bview.origMatrix) and the imported remote rows
// (Bview.importMatrix) into one local Kokkos sparse matrix whose row space is
// A's column map, so the multiply kernel can look up B's rows directly by A's
// local column indices.  Precondition (see comment above): the two inputs have
// no overlapping rows, so each merged row comes from exactly one source and
// storage can be allocated exactly once.
//
//   Acol2Brow / Acol2Irow : for each column LID of A, the matching row LID in
//                           B (resp. the import matrix), or LO_INVALID if that
//                           source does not own the row.
//   Bcol2Ccol / Icol2Ccol : remap of B's (resp. the import matrix's) column
//                           LIDs into the merged output column space.
//   mergedNodeNumCols     : number of columns of the merged local matrix.
template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
               const LocalOrdinalViewType & Acol2Brow,
               const LocalOrdinalViewType & Acol2Irow,
               const LocalOrdinalViewType & Bcol2Ccol,
               const LocalOrdinalViewType & Icol2Ccol,
               const size_t mergedNodeNumCols) {

  using Teuchos::RCP;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  // Grab the Kokkos::SparseCrsMatrices
  const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
  const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();

  // We need to do this dance if either (a) we have a Bimport, or (b) there is
  // no Bimport but A's colMap is not the same object as B's rowMap.
  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
    // We do have a Bimport
    // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
    // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
    RCP<const KCRS> Ik_;
    // NOTE(review): rcpFromRef assumes the object returned by
    // getLocalMatrixDevice() outlives this scope; if that accessor returns by
    // value this binds a temporary -- confirm against the CrsMatrix API.
    if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
    const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
    // Iks is a (shallow) copy so the device lambdas below can capture it by value.
    KCRS Iks;
    if(Ik!=0) Iks = *Ik;
    // The merged matrix has one row per column of A.
    size_t merge_numrows = Ak.numCols();
    // The last entry of this at least, need to be initialized
    lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);

    const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();

    // Use a Kokkos::parallel_scan to build the rowptr
    typedef typename Node::execution_space execution_space;
    typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
    Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
      KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
        if(final) Mrowptr(i) = update;
        // Get the row count: each merged row is sourced either from B
        // (Acol2Brow valid) or from the import matrix (Acol2Irow).
        size_t ct=0;
        if(Acol2Brow(i)!=LO_INVALID)
          ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
        else
          ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
        update+=ct;

        // The scan only writes entry i; the final pass on the last iteration
        // also writes the terminating rowptr entry.
        if(final && i+1==merge_numrows)
          Mrowptr(i+1)=update;
      });

    // Allocate nnz (total count is the last rowptr entry, fetched to host)
    size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
    lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
    scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);

    // Use a Kokkos::parallel_for to fill the rowptr/colind arrays: copy each
    // source row's values and remap its column indices into the C column space.
    typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
    Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
      if(Acol2Brow(i)!=LO_INVALID) {
        size_t row = Acol2Brow(i);
        size_t start = Bk.graph.row_map(row);
        for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
          Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
          Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
        }
      }
      else {
        // Row comes from the import matrix instead of B proper.
        size_t row = Acol2Irow(i);
        size_t start = Iks.graph.row_map(row);
        for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
          Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
          Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
        }
      }
    });

    KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
    return newmat;
  }
  else {
    // We don't have a Bimport (the easy case): B's local matrix is already
    // indexed the way the multiply kernel needs, so hand it back unchanged.
    return Bk;
  }
}//end merge_matrices
3152 
3153 template<typename SC, typename LO, typename GO, typename NO>
3154 void AddKernels<SC, LO, GO, NO>::
3155 addSorted(
3156  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3157  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3158  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3159  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3160  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3161  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3162  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3163  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3164  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3165  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3166  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3167 {
3168  using Teuchos::TimeMonitor;
3169  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3170  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3171  auto nrows = Arowptrs.extent(0) - 1;
3172  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3173  typename AddKern::KKH handle;
3174  handle.create_spadd_handle(true);
3175  auto addHandle = handle.get_spadd_handle();
3176 #ifdef HAVE_TPETRA_MMM_TIMINGS
3177  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3178 #endif
3179  KokkosSparse::Experimental::spadd_symbolic
3180  <KKH,
3181  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3182  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3183  row_ptrs_array, col_inds_array>
3184  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3185  //KokkosKernels requires values to be zeroed
3186  Cvals = values_array("C values", addHandle->get_c_nnz());
3187  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3188 #ifdef HAVE_TPETRA_MMM_TIMINGS
3189  MM = Teuchos::null;
3190  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3191 #endif
3192  KokkosSparse::Experimental::spadd_numeric(&handle,
3193  Arowptrs, Acolinds, Avals, scalarA,
3194  Browptrs, Bcolinds, Bvals, scalarB,
3195  Crowptrs, Ccolinds, Cvals);
3196 }
3197 
3198 template<typename SC, typename LO, typename GO, typename NO>
3199 void AddKernels<SC, LO, GO, NO>::
3200 addUnsorted(
3201  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3202  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3203  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3204  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3205  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3206  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3207  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3208  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3209  GO /* numGlobalCols */,
3210  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3211  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3212  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3213 {
3214  using Teuchos::TimeMonitor;
3215  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3216  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3217  auto nrows = Arowptrs.extent(0) - 1;
3218  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3219  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3220  typename AddKern::KKH handle;
3221  handle.create_spadd_handle(false);
3222  auto addHandle = handle.get_spadd_handle();
3223 #ifdef HAVE_TPETRA_MMM_TIMINGS
3224  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3225 #endif
3226  KokkosSparse::Experimental::spadd_symbolic
3227  <KKH,
3228  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3229  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3230  row_ptrs_array, col_inds_array>
3231  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3232  //Cvals must be zeroed out
3233  Cvals = values_array("C values", addHandle->get_c_nnz());
3234  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3235 #ifdef HAVE_TPETRA_MMM_TIMINGS
3236  MM = Teuchos::null;
3237  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3238 #endif
3239  KokkosSparse::Experimental::spadd_numeric(&handle,
3240  Arowptrs, Acolinds, Avals, scalarA,
3241  Browptrs, Bcolinds, Bvals, scalarB,
3242  Crowptrs, Ccolinds, Cvals);
3243 }
3244 
3245 template<typename GO,
3246  typename LocalIndicesType,
3247  typename GlobalIndicesType,
3248  typename ColMapType>
3249 struct ConvertLocalToGlobalFunctor
3250 {
3251  ConvertLocalToGlobalFunctor(
3252  const LocalIndicesType& colindsOrig_,
3253  const GlobalIndicesType& colindsConverted_,
3254  const ColMapType& colmap_) :
3255  colindsOrig (colindsOrig_),
3256  colindsConverted (colindsConverted_),
3257  colmap (colmap_)
3258  {}
3259  KOKKOS_INLINE_FUNCTION void
3260  operator() (const GO i) const
3261  {
3262  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3263  }
3264  LocalIndicesType colindsOrig;
3265  GlobalIndicesType colindsConverted;
3266  ColMapType colmap;
3267 };
3268 
/// \brief Compute C = scalarA*A + scalarB*B when A and B have different column
///   maps: convert both matrices' local column indices to global indices
///   first, then run the (unsorted) KokkosKernels spadd on the global indices.
///
/// Outputs Cvals, Crowptrs, and Ccolinds are (re)allocated and written here;
/// note Ccolinds holds GLOBAL column indices (global_col_inds_array), which
/// the caller must handle accordingly.
template<typename SC, typename LO, typename GO, typename NO>
void MMdetails::AddKernels<SC, LO, GO, NO>::
convertToGlobalAndAdd(
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
{
  using Teuchos::TimeMonitor;
  //Need to use a different KokkosKernelsHandle type than other versions,
  //since the ordinals are now GO
  using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
    typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;

  const values_array Avals = A.values;
  const values_array Bvals = B.values;
  const col_inds_array Acolinds = A.graph.entries;
  const col_inds_array Bcolinds = B.graph.entries;
  auto Arowptrs = A.graph.row_map;
  auto Browptrs = B.graph.row_map;
  // Parallel arrays holding the global equivalents of the local column indices.
  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
#ifdef HAVE_TPETRA_MMM_TIMINGS
  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
#endif
  // Convert each matrix's column indices from local to global in parallel.
  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
  // 'false': global indices produced above are not sorted within rows.
  KKH_GO handle;
  handle.create_spadd_handle(false);
  auto addHandle = handle.get_spadd_handle();
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
#endif
  auto nrows = Arowptrs.extent(0) - 1;
  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
  // Symbolic phase on the converted (global) indices: sizes C.
  KokkosSparse::Experimental::spadd_symbolic
    <KKH_GO, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
    (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
  // Cvals uses the initializing allocation (zero-filled), as spadd requires.
  Cvals = values_array("C values", addHandle->get_c_nnz());
  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM = Teuchos::null;
  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
#endif
  // Numeric phase: fills Ccolinds (global) and Cvals.
  KokkosSparse::Experimental::spadd_numeric(&handle,
    Arowptrs, AcolindsConverted, Avals, scalarA,
    Browptrs, BcolindsConverted, Bvals, scalarB,
    Crowptrs, Ccolinds, Cvals);
}
3326 
3327 
} //End namespace MMdetails
3329 
3330 } //End namespace Tpetra
3331 
3332 /*********************************************************************************************************/
3333 //
3334 // Explicit instantiation macro
3335 //
3336 // Must be expanded from within the Tpetra namespace!
3337 //
namespace Tpetra {

// Explicitly instantiates, for one (SCALAR, LO, GO, NODE) combination, every
// public MatrixMatrix entry point defined in this file: Multiply, Jacobi, the
// two Add overloads, both add overloads, and the MMdetails::AddKernels struct.
// Must be expanded from within the Tpetra namespace (see note above).
// NOTE: comments cannot appear inside the macro body itself, because each line
// ends in a backslash continuation; the behavior is documented here instead.
#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
  template \
  void MatrixMatrix::Multiply( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    bool transposeB, \
    CrsMatrix< SCALAR , LO , GO , NODE >& C, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void MatrixMatrix::Jacobi( \
    SCALAR omega, \
    const Vector< SCALAR, LO, GO, NODE > & Dinv, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    CrsMatrix< SCALAR , LO , GO , NODE >& C, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void MatrixMatrix::Add( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    SCALAR scalarA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
    bool transposeB, \
    SCALAR scalarB, \
    Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
\
  template \
  void MatrixMatrix::Add( \
    const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
    bool transposeA, \
    SCALAR scalarA, \
    CrsMatrix<SCALAR, LO, GO, NODE>& B, \
    SCALAR scalarB ); \
\
  template \
  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
    (const SCALAR & alpha, \
    const bool transposeA, \
    const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
    const SCALAR & beta, \
    const bool transposeB, \
    const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
    const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
    const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void \
  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
    (const SCALAR & alpha, \
    const bool transposeA, \
    const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
    const SCALAR& beta, \
    const bool transposeB, \
    const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
    CrsMatrix< SCALAR , LO, GO , NODE >& C, \
    const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
    const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
    const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;

} //End namespace Tpetra
3411 
3412 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Struct that holds views of the contents of a CrsMatrix.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed dense vector.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.