// TpetraExt_MatrixMatrix_def.hpp
// Tpetra parallel linear algebra: definitions for the TpetraExt
// matrix-matrix multiply and add kernels.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 
#include "TpetraExt_MatrixMatrix_decl.hpp"

#include "Teuchos_Array.hpp"
#include "Teuchos_FancyOStream.hpp"
#include "Teuchos_VerboseObject.hpp"

#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Import_Util.hpp"
#include "Tpetra_Import_Util2.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_Vector.hpp"

#include <algorithm>
59 
60 
61 #ifdef HAVE_KOKKOSKERNELS_EXPERIMENTAL
62 #include "KokkosKernels_SPGEMM.hpp"
63 #endif
64 
65 
71 namespace Tpetra {
72 
73 namespace MatrixMatrix{
74 
75 //
76 // This method forms the matrix-matrix product C = op(A) * op(B), where
77 // op(A) == A if transposeA is false,
78 // op(A) == A^T if transposeA is true,
79 // and similarly for op(B).
80 //
81 template <class Scalar,
82  class LocalOrdinal,
83  class GlobalOrdinal,
84  class Node>
85 void Multiply(
87  bool transposeA,
89  bool transposeB,
91  bool call_FillComplete_on_result,
92  const std::string& label,
93  const Teuchos::RCP<Teuchos::ParameterList>& params)
94 {
95  using Teuchos::null;
96  using Teuchos::RCP;
97  typedef Scalar SC;
98  typedef LocalOrdinal LO;
99  typedef GlobalOrdinal GO;
100  typedef Node NO;
101  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
102  typedef Import<LO,GO,NO> import_type;
103  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
104  typedef Map<LO,GO,NO> map_type;
105  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
106 
107 #ifdef HAVE_TPETRA_MMM_TIMINGS
108  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
109  using Teuchos::TimeMonitor;
110  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
111 #endif
112 
113  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
114 
115  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
116 
117  // The input matrices A and B must both be fillComplete.
118  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
119  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
120 
121  // If transposeA is true, then Aprime will be the transpose of A
122  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
123  // will just be a pointer to A.
124  RCP<const crs_matrix_type> Aprime = null;
125  // If transposeB is true, then Bprime will be the transpose of B
126  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
127  // will just be a pointer to B.
128  RCP<const crs_matrix_type> Bprime = null;
129 
130  // Is this a "clean" matrix?
131  //
132  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
133  // locally nor globally indexed, then it was empty. I don't like
134  // this, because the most straightforward implementation presumes
135  // lazy allocation of indices. However, historical precedent
136  // demands that we keep around this predicate as a way to test
137  // whether the matrix is empty.
138  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
139 
140  bool use_optimized_ATB = false;
141  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
142  use_optimized_ATB = true;
143 
144 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
145  use_optimized_ATB = false;
146 #endif
147 
148  if (!use_optimized_ATB && transposeA) {
149  transposer_type transposer(rcpFromRef (A));
150  Aprime = transposer.createTranspose();
151 
152  } else {
153  Aprime = rcpFromRef(A);
154  }
155 
156  if (transposeB) {
157  transposer_type transposer(rcpFromRef (B));
158  Bprime = transposer.createTranspose();
159 
160  } else {
161  Bprime = rcpFromRef(B);
162  }
163 
164  // Check size compatibility
165  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
166  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
167  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
168  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
169  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
170  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
171  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
172  prefix << "ERROR, inner dimensions of op(A) and op(B) "
173  "must match for matrix-matrix product. op(A) is "
174  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
175 
176  // The result matrix C must at least have a row-map that reflects the correct
177  // row-size. Don't check the number of columns because rectangular matrices
178  // which were constructed with only one map can still end up having the
179  // correct capacity and dimensions when filled.
180  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
181  prefix << "ERROR, dimensions of result C must "
182  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
183  << " rows, should have at least " << Aouter << std::endl);
184 
185  // It doesn't matter whether C is already Filled or not. If it is already
186  // Filled, it must have space allocated for the positions that will be
187  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
188  // we'll error out later when trying to store result values.
189 
190  // CGB: However, matrix must be in active-fill
191  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
192 
193  // We're going to need to import remotely-owned sections of A and/or B if
194  // more than one processor is performing this run, depending on the scenario.
195  int numProcs = A.getComm()->getSize();
196 
197  // Declare a couple of structs that will be used to hold views of the data
198  // of A and B, to be used for fast access during the matrix-multiplication.
199  crs_matrix_struct_type Aview;
200  crs_matrix_struct_type Bview;
201 
202  RCP<const map_type> targetMap_A = Aprime->getRowMap();
203  RCP<const map_type> targetMap_B = Bprime->getRowMap();
204 
205 #ifdef HAVE_TPETRA_MMM_TIMINGS
206  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X"))));
207 #endif
208 
209  // Now import any needed remote rows and populate the Aview struct
210  // NOTE: We assert that an import isn't needed --- since we do the transpose
211  // above to handle that.
212  if (!use_optimized_ATB) {
213  RCP<const import_type> dummyImporter;
214  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
215  }
216 
217  // We will also need local access to all rows of B that correspond to the
218  // column-map of op(A).
219  if (numProcs > 1)
220  targetMap_B = Aprime->getColMap();
221 
222  // Import any needed remote rows and populate the Bview struct.
223  if (!use_optimized_ATB)
224  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
225 
226 #ifdef HAVE_TPETRA_MMM_TIMINGS
227  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
228 #endif
229 
230  // Call the appropriate method to perform the actual multiplication.
231  if (use_optimized_ATB) {
232  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
233 
234  } else if (call_FillComplete_on_result && newFlag) {
235  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
236 
237  } else if (call_FillComplete_on_result) {
238  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
239 
240  } else {
241  // mfh 27 Sep 2016: Is this the "slow" case? This
242  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
243  // thread-parallel inserts, but that may take some effort.
244  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
245 
246  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
247 
248 #ifdef HAVE_TPETRA_MMM_TIMINGS
249  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete"))));
250 #endif
251  if (call_FillComplete_on_result) {
252  // We'll call FillComplete on the C matrix before we exit, and give it a
253  // domain-map and a range-map.
254  // The domain-map will be the domain-map of B, unless
255  // op(B)==transpose(B), in which case the range-map of B will be used.
256  // The range-map will be the range-map of A, unless op(A)==transpose(A),
257  // in which case the domain-map of A will be used.
258  if (!C.isFillComplete())
259  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
260  }
261  }
262 }
263 
264 
265 template <class Scalar,
266  class LocalOrdinal,
267  class GlobalOrdinal,
268  class Node>
269 void Jacobi(Scalar omega,
274  bool call_FillComplete_on_result,
275  const std::string& label,
276  const Teuchos::RCP<Teuchos::ParameterList>& params)
277 {
278  using Teuchos::RCP;
279  typedef Scalar SC;
280  typedef LocalOrdinal LO;
281  typedef GlobalOrdinal GO;
282  typedef Node NO;
283  typedef Import<LO,GO,NO> import_type;
284  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
285  typedef Map<LO,GO,NO> map_type;
286  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
287 
288 #ifdef HAVE_TPETRA_MMM_TIMINGS
289  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
290  using Teuchos::TimeMonitor;
291  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup"))));
292 #endif
293 
294  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
295 
296  // A and B should already be Filled.
297  // Should we go ahead and call FillComplete() on them if necessary or error
298  // out? For now, we choose to error out.
299  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
300  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
301 
302  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
303  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
304 
305  // Now check size compatibility
306  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
307  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
308  global_size_t Aouter = A.getGlobalNumRows();
309  global_size_t Bouter = numBCols;
310  global_size_t Ainner = numACols;
311  global_size_t Binner = B.getGlobalNumRows();
312  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
313  prefix << "ERROR, inner dimensions of op(A) and op(B) "
314  "must match for matrix-matrix product. op(A) is "
315  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
316 
317  // The result matrix C must at least have a row-map that reflects the correct
318  // row-size. Don't check the number of columns because rectangular matrices
319  // which were constructed with only one map can still end up having the
320  // correct capacity and dimensions when filled.
321  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
322  prefix << "ERROR, dimensions of result C must "
323  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
324  << " rows, should have at least "<< Aouter << std::endl);
325 
326  // It doesn't matter whether C is already Filled or not. If it is already
327  // Filled, it must have space allocated for the positions that will be
328  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
329  // we'll error out later when trying to store result values.
330 
331  // CGB: However, matrix must be in active-fill
332  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
333 
334  // We're going to need to import remotely-owned sections of A and/or B if
335  // more than one processor is performing this run, depending on the scenario.
336  int numProcs = A.getComm()->getSize();
337 
338  // Declare a couple of structs that will be used to hold views of the data of
339  // A and B, to be used for fast access during the matrix-multiplication.
340  crs_matrix_struct_type Aview;
341  crs_matrix_struct_type Bview;
342 
343  RCP<const map_type> targetMap_A = Aprime->getRowMap();
344  RCP<const map_type> targetMap_B = Bprime->getRowMap();
345 
346 #ifdef HAVE_TPETRA_MMM_TIMINGS
347  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X"))));
348 #endif
349 
350  // Enable globalConstants by default
351  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
352  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
353  if(!params.is_null()) importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
354 
355  //Now import any needed remote rows and populate the Aview struct.
356  RCP<const import_type> dummyImporter;
357  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, false, label,importParams1);
358 
359  // We will also need local access to all rows of B that correspond to the
360  // column-map of op(A).
361  if (numProcs > 1)
362  targetMap_B = Aprime->getColMap();
363 
364  // Now import any needed remote rows and populate the Bview struct.
365  // Enable globalConstants by default
366  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
367  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
368  if(!params.is_null()) importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
369  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label,importParams2);
370 
371 #ifdef HAVE_TPETRA_MMM_TIMINGS
372  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply"))));
373 #endif
374 
375  // Now call the appropriate method to perform the actual multiplication.
376  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
377 
378  // Is this a "clean" matrix
379  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
380 
381  if (call_FillComplete_on_result && newFlag) {
382  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
383 
384  } else if (call_FillComplete_on_result) {
385  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
386 
387  } else {
388  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
389  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
390  // commenting it out.
391 // #ifdef HAVE_TPETRA_MMM_TIMINGS
392 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
393 // #endif
394  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
395  // commenting it out.
396  // if (call_FillComplete_on_result) {
397  // //We'll call FillComplete on the C matrix before we exit, and give
398  // //it a domain-map and a range-map.
399  // //The domain-map will be the domain-map of B, unless
400  // //op(B)==transpose(B), in which case the range-map of B will be used.
401  // //The range-map will be the range-map of A, unless
402  // //op(A)==transpose(A), in which case the domain-map of A will be used.
403  // if (!C.isFillComplete()) {
404  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
405  // }
406  // }
407  }
408 }
409 
410 
411 template <class Scalar,
412  class LocalOrdinal,
413  class GlobalOrdinal,
414  class Node>
415 void Add(
417  bool transposeA,
418  Scalar scalarA,
420  Scalar scalarB )
421 {
422  using Teuchos::Array;
423  using Teuchos::RCP;
424  using Teuchos::null;
425  typedef Scalar SC;
426  typedef LocalOrdinal LO;
427  typedef GlobalOrdinal GO;
428  typedef Node NO;
429  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
430  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
431 
432  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
433 
434  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
435  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
436  "(Result matrix B is not required to be isFillComplete()).");
437  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
438  prefix << "ERROR, input matrix B must not be fill complete!");
439  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
440  prefix << "ERROR, input matrix B must not have static graph!");
441  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
442  prefix << "ERROR, input matrix B must not be locally indexed!");
443  TEUCHOS_TEST_FOR_EXCEPTION(B.getProfileType()!=DynamicProfile, std::runtime_error,
444  prefix << "ERROR, input matrix B must have a dynamic profile!");
445 
446 
447  RCP<const crs_matrix_type> Aprime = null;
448  if (transposeA) {
449  transposer_type transposer(rcpFromRef (A));
450  Aprime = transposer.createTranspose();
451  } else {
452  Aprime = rcpFromRef(A);
453  }
454 
455  size_t a_numEntries;
456  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
457  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
458  GO row;
459 
460  if (scalarB != Teuchos::ScalarTraits<SC>::one())
461  B.scale(scalarB);
462 
463  bool bFilled = B.isFillComplete();
464  size_t numMyRows = B.getNodeNumRows();
465  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
466  for (LO i = 0; (size_t)i < numMyRows; ++i) {
467  row = B.getRowMap()->getGlobalElement(i);
468  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
469 
470  if (scalarA != Teuchos::ScalarTraits<SC>::one())
471  for (size_t j = 0; j < a_numEntries; ++j)
472  a_vals[j] *= scalarA;
473 
474  if (bFilled)
475  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
476  else
477  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
478  }
479  }
480 }
481 
482 
483 template <class Scalar,
484  class LocalOrdinal,
485  class GlobalOrdinal,
486  class Node>
487 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
488 add (const Scalar& alpha,
489  const bool transposeA,
491  const Scalar& beta,
492  const bool transposeB,
494  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
495  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
496  const Teuchos::RCP<Teuchos::ParameterList>& params)
497 {
498  using Teuchos::RCP;
499  using Teuchos::rcpFromRef;
500  using Teuchos::rcp_implicit_cast;
501  using Teuchos::rcp_dynamic_cast;
502 
503  typedef Scalar SC;
504  typedef LocalOrdinal LO;
505  typedef GlobalOrdinal GO;
506  typedef Node NO;
507  typedef RowMatrix<SC,LO,GO,NO> row_matrix_type;
508  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
509  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
510 
511  const std::string prefix = "TpetraExt::MatrixMatrix::add(): ";
512 
513  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete () || !B.isFillComplete (), std::invalid_argument,
514  prefix << "A and B must both be fill complete.");
515 
516 #ifdef HAVE_TPETRA_DEBUG
517  // The matrices don't have domain or range Maps unless they are fill complete.
518  if (A.isFillComplete () && B.isFillComplete ()) {
519  const bool domainMapsSame =
520  (!transposeA && !transposeB && !A.getDomainMap()->isSameAs (*B.getDomainMap ())) ||
521  (!transposeA && transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
522  ( transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
523  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
524  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
525 
526  const bool rangeMapsSame =
527  (!transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
528  (!transposeA && transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
529  ( transposeA && !transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ()));
530  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
531  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
532  }
533 #endif // HAVE_TPETRA_DEBUG
534 
535  // Form the explicit transpose of A if necessary.
536  RCP<const crs_matrix_type> Aprime;
537  if (transposeA) {
538  transposer_type transposer (rcpFromRef (A));
539  Aprime = transposer.createTranspose ();
540 
541  } else {
542  Aprime = rcpFromRef (A);
543  }
544 
545 #ifdef HAVE_TPETRA_DEBUG
546  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
547  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
548 #endif // HAVE_TPETRA_DEBUG
549 
550  // Form the explicit transpose of B if necessary.
551  RCP<const crs_matrix_type> Bprime;
552  if (transposeB) {
553  transposer_type transposer (rcpFromRef (B));
554  Bprime = transposer.createTranspose ();
555 
556  } else {
557  Bprime = rcpFromRef (B);
558  }
559 
560 #ifdef HAVE_TPETRA_DEBUG
561  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
562  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
563 
564  TEUCHOS_TEST_FOR_EXCEPTION(
565  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
566  prefix << "Aprime and Bprime must both be fill complete. "
567  "Please report this bug to the Tpetra developers.");
568 #endif // HAVE_TPETRA_DEBUG
569 
570  RCP<row_matrix_type> C =
571  Bprime->add (alpha, *rcp_implicit_cast<const row_matrix_type> (Aprime),
572  beta, domainMap, rangeMap, params);
573 
574  return rcp_dynamic_cast<crs_matrix_type> (C);
575 }
576 
577 
578 template <class Scalar,
579  class LocalOrdinal,
580  class GlobalOrdinal,
581  class Node>
582 void Add(
584  bool transposeA,
585  Scalar scalarA,
587  bool transposeB,
588  Scalar scalarB,
590 {
591  using Teuchos::as;
592  using Teuchos::Array;
593  using Teuchos::ArrayRCP;
594  using Teuchos::ArrayView;
595  using Teuchos::RCP;
596  using Teuchos::rcp;
597  using Teuchos::rcp_dynamic_cast;
598  using Teuchos::rcpFromRef;
599  using Teuchos::tuple;
600  using std::endl;
601  // typedef typename ArrayView<const Scalar>::size_type size_type;
602  typedef Teuchos::ScalarTraits<Scalar> STS;
604  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
605  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
606  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
609 
610  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
611 
612  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
613  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
614 
615  TEUCHOS_TEST_FOR_EXCEPTION(
616  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
617  prefix << "Both input matrices must be fill complete before calling this function.");
618 
619 #ifdef HAVE_TPETRA_DEBUG
620  {
621  const bool domainMapsSame =
622  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
623  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
624  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
625  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
626  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
627 
628  const bool rangeMapsSame =
629  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
630  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
631  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
632  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
633  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
634  }
635 #endif // HAVE_TPETRA_DEBUG
636 
637  // Form the explicit transpose of A if necessary.
638  RCP<const crs_matrix_type> Aprime;
639  if (transposeA) {
640  transposer_type theTransposer (rcpFromRef (A));
641  Aprime = theTransposer.createTranspose ();
642  } else {
643  Aprime = rcpFromRef (A);
644  }
645 
646 #ifdef HAVE_TPETRA_DEBUG
647  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
648  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
649 #endif // HAVE_TPETRA_DEBUG
650 
651  // Form the explicit transpose of B if necessary.
652  RCP<const crs_matrix_type> Bprime;
653  if (transposeB) {
654  transposer_type theTransposer (rcpFromRef (B));
655  Bprime = theTransposer.createTranspose ();
656  } else {
657  Bprime = rcpFromRef (B);
658  }
659 
660 #ifdef HAVE_TPETRA_DEBUG
661  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
662  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
663 #endif // HAVE_TPETRA_DEBUG
664 
665  // Allocate or zero the entries of the result matrix.
666  if (! C.is_null ()) {
667  C->setAllToScalar (STS::zero ());
668  } else {
669 #if 0
670  // If Aprime and Bprime have the same row Map, and if C is null,
671  // we can optimize construction and fillComplete of C. For now,
672  // we just check pointer equality, to avoid the all-reduce in
673  // isSameAs. It may be worth that all-reduce to check, however.
674  //if (Aprime->getRowMap ().getRawPtr () == Bprime->getRowMap ().getRawPtr ())
675  if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
676  RCP<const map_type> rowMap = Aprime->getRowMap ();
677 
678  RCP<const crs_graph_type> A_graph =
679  rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
680 #ifdef HAVE_TPETRA_DEBUG
681  TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
682  "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null. "
683  "Please report this bug to the Tpetra developers.");
684 #endif // HAVE_TPETRA_DEBUG
685  RCP<const crs_graph_type> B_graph =
686  rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
687 #ifdef HAVE_TPETRA_DEBUG
688  TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
689  "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null. "
690  "Please report this bug to the Tpetra developers.");
691 #endif // HAVE_TPETRA_DEBUG
692  RCP<const map_type> A_colMap = A_graph->getColMap ();
693 #ifdef HAVE_TPETRA_DEBUG
694  TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
695  "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null. "
696  "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG
698  RCP<const map_type> B_colMap = B_graph->getColMap ();
699 #ifdef HAVE_TPETRA_DEBUG
700  TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
701  "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null. "
702  "Please report this bug to the Tpetra developers.");
703  TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
704  std::logic_error,
705  "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null. "
706  "Please report this bug to the Tpetra developers.");
707  TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
708  std::logic_error,
709  "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null. "
710  "Please report this bug to the Tpetra developers.");
711 #endif // HAVE_TPETRA_DEBUG
712 
713  // Compute the (column Map and) Import of the matrix sum.
714  RCP<const import_type> sumImport =
715  A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
716  RCP<const map_type> C_colMap = sumImport->getTargetMap ();
717 
718  // First, count the number of entries in each row. Then, go
719  // back over the rows again, and compute the actual sum.
720  // Remember that C may have a different column Map than Aprime
721  // or Bprime, so its local indices may be different. That's why
722  // we have to convert from local to global indices.
723 
724  ArrayView<const LocalOrdinal> A_local_ind;
725  Array<GlobalOrdinal> A_global_ind;
726  ArrayView<const LocalOrdinal> B_local_ind;
727  Array<GlobalOrdinal> B_global_ind;
728 
729  const size_t localNumRows = rowMap->getNodeNumElements ();
730  ArrayRCP<size_t> numEntriesPerRow (localNumRows);
731  // Compute the max number of entries in any row of A+B on this
732  // process, so that we won't have to resize temporary arrays.
733  size_t maxNumEntriesPerRow = 0;
734  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
735  // Get view of current row of A_graph, in its local indices.
736  A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
737  const size_type A_numEnt = A_local_ind.size ();
738  if (A_numEnt > A_global_ind.size ()) {
739  A_global_ind.resize (A_numEnt);
740  }
741  // Convert A's local indices to global indices.
742  for (size_type k = 0; k < A_numEnt; ++k) {
743  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
744  }
745 
746  // Get view of current row of B_graph, in its local indices.
747  B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
748  const size_type B_numEnt = B_local_ind.size ();
749  if (B_numEnt > B_global_ind.size ()) {
750  B_global_ind.resize (B_numEnt);
751  }
752  // Convert B's local indices to global indices.
753  for (size_type k = 0; k < B_numEnt; ++k) {
754  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
755  }
756 
757  // Count the number of entries in the merged row of A + B.
758  const size_t curNumEntriesPerRow =
759  keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
760  B_global_ind.begin (), B_global_ind.end ());
761  numEntriesPerRow[localRow] = curNumEntriesPerRow;
762  maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
763  }
764 
765  // Create C, using the sum column Map and number of entries per
766  // row that we computed above. Having the exact number of
767  // entries per row lets us use static profile, making it valid
768  // to call expertStaticFillComplete.
769  C = rcp (new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));
770 
771  // Go back through the rows and actually compute the sum. We
772  // don't ever have to resize A_global_ind or B_global_ind below,
773  // since we've already done it above.
774  ArrayView<const Scalar> A_val;
775  ArrayView<const Scalar> B_val;
776 
777  Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
778  Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
779  Array<Scalar> AplusB_val (maxNumEntriesPerRow);
780 
781  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
782  // Get view of current row of A, in A's local indices.
783  Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
784  // Convert A's local indices to global indices.
785  for (size_type k = 0; k < A_local_ind.size (); ++k) {
786  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
787  }
788 
789  // Get view of current row of B, in B's local indices.
790  Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
791  // Convert B's local indices to global indices.
792  for (size_type k = 0; k < B_local_ind.size (); ++k) {
793  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
794  }
795 
796  const size_t curNumEntries = numEntriesPerRow[localRow];
797  ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
798  ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
799  ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);
800 
801  // Sum the entries in the current row of A plus B.
802  keyValueMerge (A_global_ind.begin (), A_global_ind.end (),
803  A_val.begin (), A_val.end (),
804  B_global_ind.begin (), B_global_ind.end (),
805  B_val.begin (), B_val.end (),
806  C_global_ind.begin (), C_val.begin (),
807  std::plus<Scalar> ());
808  // Convert the sum's global indices into C's local indices.
809  for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
810  C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
811  }
812  // Give the current row sum to C.
813  C->replaceLocalValues (localRow, C_local_ind, C_val);
814  }
815 
816  // Use "expert static fill complete" to bypass construction of
817  // the Import and Export (if applicable) object(s).
818  RCP<const map_type> domainMap = A_graph->getDomainMap ();
819  RCP<const map_type> rangeMap = A_graph->getRangeMap ();
820  C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());
821 
822  return; // Now we're done!
823  }
824  else {
825  // FIXME (mfh 08 May 2013) When I first looked at this method, I
826  // noticed that C was being given the row Map of Aprime (the
827  // possibly transposed version of A). Is this what we want?
828  C = rcp (new crs_matrix_type (Aprime->getRowMap (), null));
829  }
830 
831 #else
832  // FIXME (mfh 08 May 2013) When I first looked at this method, I
833  // noticed that C was being given the row Map of Aprime (the
834  // possibly transposed version of A). Is this what we want?
835  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
836 #endif // 0
837  }
838 
839 #ifdef HAVE_TPETRA_DEBUG
840  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
841  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
842  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
843  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
844  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
845  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
846 #endif // HAVE_TPETRA_DEBUG
847 
848  Array<RCP<const crs_matrix_type> > Mat =
849  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
850  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
851 
852  // do a loop over each matrix to add: A reordering might be more efficient
853  for (int k = 0; k < 2; ++k) {
854  Array<GlobalOrdinal> Indices;
855  Array<Scalar> Values;
856 
857  // Loop over each locally owned row of the current matrix (either
858  // Aprime or Bprime), and sum its entries into the corresponding
859  // row of C. This works regardless of whether Aprime or Bprime
860  // has the same row Map as C, because both sumIntoGlobalValues and
861  // insertGlobalValues allow summing resp. inserting into nonowned
862  // rows of C.
863 #ifdef HAVE_TPETRA_DEBUG
864  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
865  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
866 #endif // HAVE_TPETRA_DEBUG
867  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
868 #ifdef HAVE_TPETRA_DEBUG
869  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
870  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
871 #endif // HAVE_TPETRA_DEBUG
872 
873  const size_t localNumRows = Mat[k]->getNodeNumRows ();
874  for (size_t i = 0; i < localNumRows; ++i) {
875  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
876  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
877  if (numEntries > 0) {
878  Indices.resize (numEntries);
879  Values.resize (numEntries);
880  Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
881 
882  if (scalar[k] != STS::one ()) {
883  for (size_t j = 0; j < numEntries; ++j) {
884  Values[j] *= scalar[k];
885  }
886  }
887 
888  if (C->isFillComplete ()) {
889  C->sumIntoGlobalValues (globalRow, Indices, Values);
890  } else {
891  C->insertGlobalValues (globalRow, Indices, Values);
892  }
893  }
894  }
895  }
896 }
897 
898 
899 } //End namespace MatrixMatrix
900 
901 namespace MMdetails{
902 
903 // Prints MMM-style statistics on communication done with an Import or Export object
904 template <class TransferType>
905 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
906  if (Transfer.is_null())
907  return;
908 
909  const Distributor & Distor = Transfer->getDistributor();
910  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
911 
912  size_t rows_send = Transfer->getNumExportIDs();
913  size_t rows_recv = Transfer->getNumRemoteIDs();
914 
915  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
916  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
917  size_t num_send_neighbors = Distor.getNumSends();
918  size_t num_recv_neighbors = Distor.getNumReceives();
919  size_t round2_send, round2_recv;
920  Distor.getLastDoStatistics(round2_send,round2_recv);
921 
922  int myPID = Comm->getRank();
923  int NumProcs = Comm->getSize();
924 
925  // Processor by processor statistics
926  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
927  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
928 
929  // Global statistics
930  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
931  size_t gstats_min[8], gstats_max[8];
932 
933  double lstats_avg[8], gstats_avg[8];
934  for(int i=0; i<8; i++)
935  lstats_avg[i] = ((double)lstats[i])/NumProcs;
936 
937  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
938  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
939  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
940 
941  if(!myPID) {
942  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
943  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
944  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
945  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
946  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
947  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
948  }
949 }
950 
951 
952 // Kernel method for computing the local portion of C = A^T * B.
// NOTE(review): despite the generic "A*B" naming convention above this family
// of kernels, this routine handles the transpose-A case.  The matrix
// parameters (inputs A and B, output C) are elided in this listing; the
// visible trailing parameters are:
//   label:  prefix used for timer names / statistics output.
//   params: optional list; "compute global constants[: temporaries]" controls
//           whether global constants are computed on intermediate objects.
// Strategy: explicitly transpose A locally, reuse the A*B "newmatrix" kernel,
// then Export the result back onto C's row distribution if necessary.
953 template<class Scalar,
954  class LocalOrdinal,
955  class GlobalOrdinal,
956  class Node>
957 void mult_AT_B_newmatrix(
961  const std::string & label,
962  const Teuchos::RCP<Teuchos::ParameterList>& params)
963 {
964  // Using & Typedefs
965  using Teuchos::RCP;
966  using Teuchos::rcp;
967  typedef Scalar SC;
968  typedef LocalOrdinal LO;
969  typedef GlobalOrdinal GO;
970  typedef Node NO;
971  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
972  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
973 
974 #ifdef HAVE_TPETRA_MMM_TIMINGS
975  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
976  using Teuchos::TimeMonitor;
977  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T Transpose"))));
978 #endif
979 
980  /*************************************************************/
981  /* 1) Local Transpose of A */
982  /*************************************************************/
983  transposer_type transposer (rcpFromRef (A),label+std::string("XP: "));
  // Skip computing global constants on the temporary transpose unless the
  // caller explicitly asked for them via "compute global constants: temporaries".
984  RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
985  if(!params.is_null()) transposeParams->set("compute global constants",params->get("compute global constants: temporaries",false));
986  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Atrans = transposer.createTransposeLocal(transposeParams);
987 
988  /*************************************************************/
989  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
990  /*************************************************************/
991 #ifdef HAVE_TPETRA_MMM_TIMINGS
992  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T I&X"))));
993 #endif
994 
995  // Get views, asserting that no import is required to speed up computation
996  crs_matrix_struct_type Aview;
997  crs_matrix_struct_type Bview;
998  RCP<const Import<LocalOrdinal,GlobalOrdinal, Node> > dummyImporter;
999 
1000  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1001  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
1002  if(!params.is_null()) importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
1003  MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(), Aview, dummyImporter,true, label,importParams1);
1004 
1005  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
1006  if(!params.is_null()) importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
1007  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1008 
1009 #ifdef HAVE_TPETRA_MMM_TIMINGS
1010  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1011 #endif
1012 
1013  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >Ctemp;
1014 
1015  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1016  bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
1017  if (needs_final_export)
1018  Ctemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Atrans->getRowMap(),0));
1019  else
1020  Ctemp = rcp(&C,false);// don't allow deallocation
1021 
1022  // Multiply
1023  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1024 
1025  /*************************************************************/
1026  /* 4) exportAndFillComplete matrix */
1027  /*************************************************************/
1028 #ifdef HAVE_TPETRA_MMM_TIMINGS
1029  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1030 #endif
1031 
  // Non-owning RCP to the caller's C: exportAndFillComplete fills it in place.
1032  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Crcp(&C,false);
1033  if (needs_final_export) {
1034  Teuchos::ParameterList labelList;
1035  labelList.set("Timer Label", label);
1036  if(!params.is_null()) labelList.set("compute global constants",params->get("compute global constants",true));
1037 
1038  Ctemp->exportAndFillComplete(Crcp,*Ctemp->getGraph()->getExporter(),
1039  B.getDomainMap(),A.getDomainMap(),rcp(&labelList,false));
1040  }
1041 #ifdef HAVE_TPETRA_MMM_STATISTICS
1042  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1043 #endif
1044 }
1045 
1046 // Kernel method for computing the local portion of C = A*B
// Serial sparse matrix-matrix multiply kernel (no transpose modes).  The
// matrix-view parameters (Aview, Bview) are elided in this listing; the
// visible parameters are the output wrapper C, a timer label, and an
// optional parameter list.  If C is already fill-complete, contributions
// are summed into existing entries; otherwise they are inserted.
// Rows of B referenced by A but not locally owned are read from the
// previously-Imported Bview.importMatrix ("remote" part of B).
1047 template<class Scalar,
1048  class LocalOrdinal,
1049  class GlobalOrdinal,
1050  class Node>
1051 void mult_A_B(
1054  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1055  const std::string& label,
1056  const Teuchos::RCP<Teuchos::ParameterList>& params)
1057 {
1058  using Teuchos::Array;
1059  using Teuchos::ArrayRCP;
1060  using Teuchos::ArrayView;
1061  using Teuchos::OrdinalTraits;
1062  using Teuchos::null;
1063 
1064  typedef Teuchos::ScalarTraits<Scalar> STS;
1065  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1066  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1067  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1068 
1069  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1070  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1071 
1072  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1073  ArrayView<const GlobalOrdinal> bcols_import = null;
1074  if (Bview.importColMap != null) {
1075  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1076  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1077 
1078  bcols_import = Bview.importColMap->getNodeElementList();
1079  }
1080 
  // Workspace is sized by the wider of B's local and imported column ranges,
  // since a row of C can draw entries from either.
1081  size_t C_numCols = C_lastCol - C_firstCol +
1082  OrdinalTraits<LocalOrdinal>::one();
1083  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1084  OrdinalTraits<LocalOrdinal>::one();
1085 
1086  if (C_numCols_import > C_numCols)
1087  C_numCols = C_numCols_import;
1088 
1089  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1090  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1091  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1092 
  // C_row_i / C_cols hold the dense-packed values and column ids of the
  // current row; c_index[col] records where (if anywhere) column 'col'
  // already lives in that packed row ("invalid" means not yet present).
1093  Array<Scalar> C_row_i = dwork;
1094  Array<GlobalOrdinal> C_cols = iwork;
1095  Array<size_t> c_index = iwork2;
1096  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1097  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1098 
1099  size_t C_row_i_length, j, k, last_index;
1100 
1101  // Run through all the hash table lookups once and for all
  // Acol2Brow maps a local column of A to the local row of B_local holding
  // that row of B; Acol2Irow maps to a row of the imported part instead.
1102  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1103  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1104  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1105  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1106  // Maps are the same: Use local IDs as the hash
1107  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1108  Aview.colMap->getMaxLocalIndex(); i++)
1109  Acol2Brow[i]=i;
1110  }
1111  else {
1112  // Maps are not the same: Use the map's hash
1113  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1114  Aview.colMap->getMaxLocalIndex(); i++) {
1115  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1116  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1117  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1118  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1119  }
1120  }
1121 
1122  // To form C = A*B we're going to execute this expression:
1123  //
1124  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1125  //
1126  // Our goal, of course, is to navigate the data in A and B once, without
1127  // performing searches for column-indices, etc.
1128  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1129  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1130  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1131  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1132  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1133  ArrayView<const Scalar> Avals, Bvals, Ivals;
1134  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1135  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1136  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1137  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1138  if(!Bview.importMatrix.is_null()) {
1139  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1140  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1141  }
1142 
1143  bool C_filled = C.isFillComplete();
1144 
  // Mark every accumulator slot empty before the first row.
1145  for (size_t i = 0; i < C_numCols; i++)
1146  c_index[i] = OrdinalTraits<size_t>::invalid();
1147 
1148  // Loop over the rows of A.
1149  size_t Arows = Aview.rowMap->getNodeNumElements();
1150  for(size_t i=0; i<Arows; ++i) {
1151 
1152  // Only navigate the local portion of Aview... which is, thankfully, all of
1153  // A since this routine doesn't do transpose modes
1154  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1155 
1156  // Loop across the i-th row of A and for each corresponding row in B, loop
1157  // across colums and accumulate product A(i,k)*B(k,j) into our partial sum
1158  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1159  // calculating updates for row i of the result matrix C.
1160  C_row_i_length = OrdinalTraits<size_t>::zero();
1161 
  // Pass 1: contributions coming from the locally-owned part of B.
1162  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1163  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1164  const Scalar Aval = Avals[k];
1165  if (Aval == STS::zero())
1166  continue;
1167 
  // Ak == invalid means this row of B lives in the imported part; it is
  // handled in pass 2 below.
1168  if (Ak == LO_INVALID)
1169  continue;
1170 
1171  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1172  LocalOrdinal col = Bcolind[j];
1173  //assert(col >= 0 && col < C_numCols);
1174 
1175  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1176  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1177  // This has to be a += so insertGlobalValue goes out
1178  C_row_i[C_row_i_length] = Aval*Bvals[j];
1179  C_cols[C_row_i_length] = col;
1180  c_index[col] = C_row_i_length;
1181  C_row_i_length++;
1182 
1183  } else {
1184  C_row_i[c_index[col]] += Aval*Bvals[j];
1185  }
1186  }
1187  }
1188 
  // Flush pass-1 results into combined_* (converting local to global column
  // ids via bcols) and reset the accumulator markers for pass 2.
1189  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1190  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1191  C_cols[ii] = bcols[C_cols[ii]];
1192  combined_index[ii] = C_cols[ii];
1193  combined_values[ii] = C_row_i[ii];
1194  }
1195  last_index = C_row_i_length;
1196 
1197  //
1198  //Now put the C_row_i values into C.
1199  //
1200  // We might have to revamp this later.
1201  C_row_i_length = OrdinalTraits<size_t>::zero();
1202 
  // Pass 2: contributions coming from the imported ("remote") part of B.
1203  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1204  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1205  const Scalar Aval = Avals[k];
1206  if (Aval == STS::zero())
1207  continue;
1208 
  // Skip columns already handled via the local part of B in pass 1.
1209  if (Ak!=LO_INVALID) continue;
1210 
1211  Ak = Acol2Irow[Acolind[k]];
1212  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1213  LocalOrdinal col = Icolind[j];
1214  //assert(col >= 0 && col < C_numCols);
1215 
1216  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1217  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1218  // This has to be a += so insertGlobalValue goes out
1219  C_row_i[C_row_i_length] = Aval*Ivals[j];
1220  C_cols[C_row_i_length] = col;
1221  c_index[col] = C_row_i_length;
1222  C_row_i_length++;
1223 
1224  } else {
1225  // This has to be a += so insertGlobalValue goes out
1226  C_row_i[c_index[col]] += Aval*Ivals[j];
1227  }
1228  }
1229  }
1230 
  // Append pass-2 results (global ids via bcols_import) after pass-1's.
1231  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1232  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1233  C_cols[ii] = bcols_import[C_cols[ii]];
1234  combined_index[last_index] = C_cols[ii];
1235  combined_values[last_index] = C_row_i[ii];
1236  last_index++;
1237  }
1238 
1239  // Now put the C_row_i values into C.
1240  // We might have to revamp this later.
1241  C_filled ?
1242  C.sumIntoGlobalValues(
1243  global_row,
1244  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1245  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1246  :
1247  C.insertGlobalValues(
1248  global_row,
1249  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1250  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1251 
1252  }
1253 }
1254 
1255 template<class Scalar,
1256  class LocalOrdinal,
1257  class GlobalOrdinal,
1258  class Node>
1259 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1260  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1261  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1262 
1263  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1264  Mview.maxNumRowEntries = Mview.indices[0].size();
1265 
1266  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1267  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1268  Mview.maxNumRowEntries = Mview.indices[i].size();
1269  }
1270 }
1271 
1272 
// Heuristic estimate of the local nonzero count of C = A*B.
// Follows the NZ estimate in ML's ml_matmatmult.c: estimate the mean
// entries-per-row of A and B (defaulting to 100 when a matrix is empty),
// combine them as (sqrt(a)+sqrt(b)-1)^2, and scale by 0.75 of A's local
// row count, plus slack of 100.
template<class CrsMatrixType>
size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
  size_t estA = 100;
  size_t estB = 100;

  // Mean entries per row, guarded against empty matrices and zero rows.
  if (A.getNodeNumEntries() > 0 && A.getNodeNumRows() > 0)
    estA = A.getNodeNumEntries() / A.getNodeNumRows();
  if (B.getNodeNumEntries() > 0 && B.getNodeNumRows() > 0)
    estB = B.getNodeNumEntries() / B.getNodeNumRows();

  size_t nnzPerRow = (size_t)(sqrt((double)estA) + sqrt((double)estB) - 1);
  nnzPerRow *= nnzPerRow;

  return (size_t)(A.getNodeNumRows()*nnzPerRow*0.75 + 100);
}
1287 
1288 
1289 // Kernel method for computing the local portion of C = A*B
1290 //
1291 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1292 // function, so this is probably the function we want to
1293 // thread-parallelize.
// NOTE(review): the matrix-view parameters (Aview, Bview) and the output
// matrix C are elided in this listing; the visible parameters are the timer
// label and an optional parameter list.  This routine builds C's column Map
// and Import, precomputes local-index translation tables, and then hands
// off the numeric work to a KernelWrappers specialization.
1294 template<class Scalar,
1295  class LocalOrdinal,
1296  class GlobalOrdinal,
1297  class Node>
1298 void mult_A_B_newmatrix(
1302  const std::string& label,
1303  const Teuchos::RCP<Teuchos::ParameterList>& params)
1304 {
1305  using Teuchos::Array;
1306  using Teuchos::ArrayRCP;
1307  using Teuchos::ArrayView;
1308  using Teuchos::RCP;
1309  using Teuchos::rcp;
1310 
1311  //typedef Scalar SC; // unused
1312  typedef LocalOrdinal LO;
1313  typedef GlobalOrdinal GO;
1314  typedef Node NO;
1315 
1316  typedef Import<LO,GO,NO> import_type;
1317  typedef Map<LO,GO,NO> map_type;
1318 
1319 #ifdef HAVE_TPETRA_MMM_TIMINGS
1320  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1321  using Teuchos::TimeMonitor;
1322  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1323 #endif
1324  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1325 
1326  // Build the final importer / column map, hash table lookups for C
1327  RCP<const import_type> Cimport;
1328  RCP<const map_type> Ccolmap;
1329  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1330  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1331  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1332 
1333  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1334  // indices of B, to local column indices of C. (B and C have the
1335  // same number of columns.) The kernel uses this, instead of
1336  // copying the entire input matrix B and converting its column
1337  // indices to those of C.
1338  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
1339 
1340  if (Bview.importMatrix.is_null()) {
1341  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1342  Cimport = Bimport;
1343  Ccolmap = Bview.colMap;
1344  // Bcol2Ccol is trivial
  // Identity mapping: every local column of B is the same local column of C.
1345  for (size_t i = 0; i < Bview.colMap->getNodeNumElements(); i++)
1346  Bcol2Ccol[i] = Teuchos::as<LO>(i);
1347 
1348  } else {
1349  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1350  // column Map of C, as well as C's Import object (from its domain
1351  // Map to its column Map). C's column Map is the union of the
1352  // column Maps of (the local part of) B, and the "remote" part of
1353  // B. Ditto for the Import. We have optimized this "setUnion"
1354  // operation on Import objects and Maps.
1355 
1356  // Choose the right variant of setUnion
1357  if (!Bimport.is_null() && !Iimport.is_null())
1358  Cimport = Bimport->setUnion(*Iimport);
1359 
1360  else if (!Bimport.is_null() && Iimport.is_null())
1361  Cimport = Bimport->setUnion();
1362 
1363  else if (Bimport.is_null() && !Iimport.is_null())
1364  Cimport = Iimport->setUnion();
1365 
1366  else
1367  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1368 
1369  Ccolmap = Cimport->getTargetMap();
1370 
1371  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1372  // in general. We should get rid of it in order to reduce
1373  // communication costs of sparse matrix-matrix multiply.
1374  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1375  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1376 
1377  // NOTE: This is not efficient and should be folded into setUnion
1378  //
1379  // mfh 27 Sep 2016: What the above comment means, is that the
1380  // setUnion operation on Import objects could also compute these
1381  // local index - to - local index look-up tables.
1382  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
1383  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
1384  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
1385 
  // Translate each local column of B (and of B's imported part) to its
  // local column index in C via global IDs.
1386  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
1387  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
1388  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
1389  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
1390  }
1391 
1392  // Replace the column map
1393  //
1394  // mfh 27 Sep 2016: We do this because C was originally created
1395  // without a column Map. Now we have its column Map.
1396  C.replaceColMap(Ccolmap);
1397 
1398  // mfh 27 Sep 2016: Construct tables that map from local column
1399  // indices of A, to local row indices of either B_local (the locally
1400  // owned part of B), or B_remote (the "imported" remote part of B).
1401  //
1402  // For column index Aik in row i of A, if the corresponding row of B
1403  // exists in the local part of B ("orig") (which I'll call B_local),
1404  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1405  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1406  //
1407  // For column index Aik in row i of A, if the corresponding row of B
1408  // exists in the remote part of B ("Import") (which I'll call
1409  // B_remote), then targetMapToImportRow[Aik] is the local index of
1410  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1411  // (a flag value).
1412 
1413  // Run through all the hash table lookups once and for all
1414  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1415  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
1416 
1417  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
1418  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1419  if (B_LID != LO_INVALID) {
1420  targetMapToOrigRow[i] = B_LID;
1421  } else {
1422  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1423  targetMapToImportRow[i] = I_LID;
1424  }
1425  }
1426 
1427 
1428 
1429  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1430  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1431  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1432 
1433 }
1434 
1435 
1436 
1437 
1438 /*********************************************************************************************************/
1439 // AB NewMatrix Kernel wrappers (Default non-threaded version)
1440 template<class Scalar,
1441  class LocalOrdinal,
1442  class GlobalOrdinal,
1443  class Node>
1444 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1446  const Teuchos::Array<LocalOrdinal> & targetMapToOrigRow,
1447  const Teuchos::Array<LocalOrdinal> & targetMapToImportRow,
1448  const Teuchos::Array<LocalOrdinal> & Bcol2Ccol,
1449  const Teuchos::Array<LocalOrdinal> & Icol2Ccol,
1451  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1452  const std::string& label,
1453  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1454 #ifdef HAVE_TPETRA_MMM_TIMINGS
1455  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1456  using Teuchos::TimeMonitor;
1457  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1458 #endif
1459 
1460  using Teuchos::Array;
1461  using Teuchos::ArrayRCP;
1462  using Teuchos::ArrayView;
1463  using Teuchos::RCP;
1464  using Teuchos::rcp;
1465 
1466  typedef Scalar SC;
1467  typedef LocalOrdinal LO;
1468  typedef GlobalOrdinal GO;
1469  typedef Node NO;
1470  typedef Map<LO,GO,NO> map_type;
1471  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1472  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1473  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1474 
1475  // Sizes
1476  RCP<const map_type> Ccolmap = C.getColMap();
1477  size_t m = Aview.origMatrix->getNodeNumRows();
1478  size_t n = Ccolmap->getNodeNumElements();
1479 
1480  // Get Data Pointers
1481  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1482  ArrayRCP<size_t> Crowptr_RCP;
1483  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1484  ArrayRCP<LO> Ccolind_RCP;
1485  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1486  ArrayRCP<SC> Cvals_RCP;
1487 
1488  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1489  // out of the CrsMatrix. This code computes A * (B_local +
1490  // B_remote), where B_local contains the locally owned rows of B,
1491  // and B_remote the (previously Import'ed) remote rows of B.
1492 
1493  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1494  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
1495  if (!Bview.importMatrix.is_null())
1496  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1497 
1498  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1499  // where Teuchos::ArrayRCP::operator[] may be slower than
1500  // Teuchos::ArrayView::operator[].
1501 
1502  // For efficiency
1503  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1504  ArrayView<const LO> Acolind, Bcolind, Icolind;
1505  ArrayView<const SC> Avals, Bvals, Ivals;
1506  ArrayView<size_t> Crowptr;
1507  ArrayView<LO> Ccolind;
1508  ArrayView<SC> Cvals;
1509  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1510  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1511  if (!Bview.importMatrix.is_null()) {
1512  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1513  }
1514 
1515  // Classic csr assembly (low memory edition)
1516  //
1517  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1518  // The method loops over rows of A, and may resize after processing
1519  // each row. Chris Siefert says that this reflects experience in
1520  // ML; for the non-threaded case, ML found it faster to spend less
1521  // effort on estimation and risk an occasional reallocation.
1522  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1523  Crowptr_RCP.resize(m+1); Crowptr = Crowptr_RCP();
1524  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
1525  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
1526 
1527 
1528  // mfh 27 Sep 2016: The c_status array is an implementation detail
1529  // of the local sparse matrix-matrix multiply routine.
1530 
1531  // The status array will contain the index into colind where this entry was last deposited.
1532  // c_status[i] < CSR_ip - not in the row yet
1533  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1534  // We start with this filled with INVALID's indicating that there are no entries yet.
1535  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1536  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1537  Array<size_t> c_status(n, ST_INVALID);
1538 
1539  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1540  // routine. The routine computes C := A * (B_local + B_remote).
1541  //
1542  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1543  // you whether the corresponding row of B belongs to B_local
1544  // ("orig") or B_remote ("Import").
1545 
1546  // For each row of A/C
1547  size_t CSR_ip = 0, OLD_ip = 0;
1548  for (size_t i = 0; i < m; i++) {
1549  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1550  // on the calling process.
1551  Crowptr[i] = CSR_ip;
1552 
1553  // mfh 27 Sep 2016: For each entry of A in the current row of A
1554  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1555  LO Aik = Acolind[k]; // local column index of current entry of A
1556  const SC Aval = Avals[k]; // value of current entry of A
1557  if (Aval == SC_ZERO)
1558  continue; // skip explicitly stored zero values in A
1559 
1560  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1561  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1562  // corresponding to the current entry of A is populated, then
1563  // the corresponding row of B is in B_local (i.e., it lives on
1564  // the calling process).
1565 
1566  // Local matrix
1567  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
1568 
1569  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
1570  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1571  LO Bkj = Bcolind[j];
1572  LO Cij = Bcol2Ccol[Bkj];
1573 
1574  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1575  // New entry
1576  c_status[Cij] = CSR_ip;
1577  Ccolind[CSR_ip] = Cij;
1578  Cvals[CSR_ip] = Aval*Bvals[j];
1579  CSR_ip++;
1580 
1581  } else {
1582  Cvals[c_status[Cij]] += Aval*Bvals[j];
1583  }
1584  }
1585 
1586  } else {
1587  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1588  // corresponding to the current entry of A is NOT populated (has
1589  // a flag "invalid" value), then the corresponding row of B is
1590  // in B_remote (i.e., it does NOT live on the calling process).
1591 
1592  // Remote matrix
1593  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
1594  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1595  LO Ikj = Icolind[j];
1596  LO Cij = Icol2Ccol[Ikj];
1597 
1598  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1599  // New entry
1600  c_status[Cij] = CSR_ip;
1601  Ccolind[CSR_ip] = Cij;
1602  Cvals[CSR_ip] = Aval*Ivals[j];
1603  CSR_ip++;
1604 
1605  } else {
1606  Cvals[c_status[Cij]] += Aval*Ivals[j];
1607  }
1608  }
1609  }
1610  }
1611 
1612  // Resize for next pass if needed
1613  if (CSR_ip + n > CSR_alloc) {
1614  CSR_alloc *= 2;
1615  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
1616  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
1617  }
1618  OLD_ip = CSR_ip;
1619  }
1620 
1621  Crowptr[m] = CSR_ip;
1622 
1623  // Downward resize
1624  Cvals_RCP .resize(CSR_ip);
1625  Ccolind_RCP.resize(CSR_ip);
1626 
1627 #ifdef HAVE_TPETRA_MMM_TIMINGS
1628  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort"))));
1629 #endif
1630 
1631  // Replace the column map
1632  //
1633  // mfh 27 Sep 2016: We do this because C was originally created
1634  // without a column Map. Now we have its column Map.
1635  C.replaceColMap(Ccolmap);
1636 
1637  // Final sort & set of CRS arrays
1638  //
1639  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1640  // matrix-matrix multiply routine sort the entries for us?
1641  Import_Util::sortCrsEntries(Crowptr_RCP(), Ccolind_RCP(), Cvals_RCP());
1642  // mfh 27 Sep 2016: This just sets pointers.
1643  C.setAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
1644 
1645 #ifdef HAVE_TPETRA_MMM_TIMINGS
1646  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC"))));
1647 #endif
1648 
1649  // Final FillComplete
1650  //
1651  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1652  // Import (from domain Map to column Map) construction (which costs
1653  // lots of communication) by taking the previously constructed
1654  // Import object. We should be able to do this without interfering
1655  // with the implementation of the local part of sparse matrix-matrix
1656  // multiply above.
1657  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1658  labelList->set("Timer Label",label);
1659  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1660  RCP<const Export<LO,GO,NO> > dummyExport;
1661  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1662 
1663 }
1664 
1665 
1666 /*********************************************************************************************************/
1667 // AB NewMatrix Kernel wrappers (KokkosKernels/OpenMP Version)
1668 #if defined(HAVE_KOKKOSKERNELS_EXPERIMENTAL) && defined (HAVE_TPETRA_INST_OPENMP)
1669 template<class Scalar,
1670  class LocalOrdinal,
1671  class GlobalOrdinal>
// Partial specialization of KernelWrappers for the Kokkos OpenMP node:
// declares the A*B "new matrix" kernel, which is routed to the
// KokkosKernels SPGEMM-based implementation defined below instead of the
// generic serial CSR implementation.
//
// NOTE(review): this listing appears to omit some parameter lines of the
// declaration (e.g. the Bview / C / Cimport parameters around doc-lines
// 1674 and 1679-1680) -- confirm against the full source file.
1672 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> {
 // Acol2Brow / Acol2Irow: map a local column index of A to the local row
 // of B_local ("orig") or B_remote ("Import") it multiplies against.
 // Bcol2Ccol / Icol2Ccol: map local column indices of B / B_remote to
 // local column indices of C.
1673  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1675  const Teuchos::Array<LocalOrdinal> & Acol2Brow,
1676  const Teuchos::Array<LocalOrdinal> & Acol2Irow,
1677  const Teuchos::Array<LocalOrdinal> & Bcol2Ccol,
1678  const Teuchos::Array<LocalOrdinal> & Icol2Ccol,
1681  const std::string& label = std::string(),
1682  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
1683 
1684  };
1685 
1686 
1687 /*********************************************************************************************************/
1688 template<class Scalar,
1689  class LocalOrdinal,
1690  class GlobalOrdinal>
// OpenMP-node specialization of the A*B "new matrix" kernel.  Instead of the
// hand-rolled serial CSR loop, this merges B's local and imported rows into a
// single local Kokkos CRS matrix (reindexed into C's column map) and then
// calls the KokkosKernels SPGEMM symbolic/numeric routines to form C = A*B,
// followed by a sort of C's entries and an expert static fillComplete.
//
// NOTE(review): this listing appears to omit a few lines of the original
// definition (e.g. the Bview / C / Cimport parameters around doc-lines 1692
// and 1697-1698, and the KCRS typedef at doc-line 1715) -- confirm against
// the full source file.
1691 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1693  const Teuchos::Array<LocalOrdinal> & Acol2Brow,
1694  const Teuchos::Array<LocalOrdinal> & Acol2Irow,
1695  const Teuchos::Array<LocalOrdinal> & Bcol2Ccol,
1696  const Teuchos::Array<LocalOrdinal> & Icol2Ccol,
1699  const std::string& label,
1700  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1701 
1702 #ifdef HAVE_TPETRA_MMM_TIMINGS
1703  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1704  using Teuchos::TimeMonitor;
1705  Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPWrapper")))));
1706 #endif
1707  // printf("[%d] OpenMP kernel called\n",Aview.origMatrix->getRowMap()->getComm()->getRank());
1708 
1709  // Node-specific code
1710  typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1711  std::string nodename("OpenMP");
1712 
1713  // Lots and lots of typedefs
1714  using Teuchos::RCP;
 // NOTE(review): the typedef of KCRS (the local Kokkos sparse-matrix type)
 // is missing from this listing (doc-line 1715); the device/graph/view
 // typedefs below all derive from it.
1716  typedef typename KCRS::device_type device_t;
1717  typedef typename KCRS::StaticCrsGraphType graph_t;
1718  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1719  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1720  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1721  //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
1722 
1723  // Options
1724  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1725  std::string myalg("SPGEMM_KK_MEMORY");
1726  if(!params.is_null()) {
1727  if(params->isParameter("openmp: algorithm"))
1728  myalg = params->get("openmp: algorithm",myalg);
1729  if(params->isParameter("openmp: team work size"))
1730  team_work_size = params->get("openmp: team work size",team_work_size);
1731  }
1732 
1733  // KokkosKernelsHandle
1734  typedef KokkosKernels::Experimental::KokkosKernelsHandle<lno_view_t,lno_nnz_view_t, scalar_view_t, typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
1735 
1736  // Grab the Kokkos::SparseCrsMatrices
1737  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
1738  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
1739  RCP<const KCRS> Bmerged;
1740 
1741  // Get the algorithm mode
1742  std::string alg = nodename+std::string(" algorithm");
1743  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
1744  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
1745  KokkosKernels::Experimental::Graph::SPGEMMAlgorithm alg_enum = KokkosKernels::Experimental::Graph::StringToSPGEMMAlgorithm(myalg);
1746 
1747  // We need to do this dance if either (a) we have a Bimport, or (b) we do not, but A's colMap is not the same as B's rowMap
1748  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
1749  // We do have a Bimport
1750  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
1751  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
1752  RCP<const KCRS> Ik;
1753  if(!Bview.importMatrix.is_null()) Ik = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
1754 
 // The merged matrix has one row per column of A; each of its rows comes
 // either from B_local (via Acol2Brow) or B_remote (via Acol2Irow).
1755  size_t merge_numrows = Ak.numCols();
1756  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
1757 
1758  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
1759 
1760  // Grap the raw pointers out of the Teuchos::Array's to avoid the Teuchos::Array+Kokkos+DEBUG problems
1761  const LocalOrdinal * Acol2Brow_ptr = Acol2Brow.getRawPtr();
1762  const LocalOrdinal * Acol2Irow_ptr = Acol2Irow.getRawPtr();
1763  const LocalOrdinal * Bcol2Ccol_ptr = Bcol2Ccol.getRawPtr();
1764  const LocalOrdinal * Icol2Ccol_ptr = Icol2Ccol.getRawPtr();
1765 
1766  // Use a Kokkos::parallel_scan to build the rowptr
1767  typedef Node::execution_space execution_space;
1768  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1769  Kokkos::parallel_scan (range_type (0, merge_numrows),
1770  KOKKOS_LAMBDA (const size_t i, size_t& update, const bool final) {
1771  if(final) Mrowptr(i) = update;
1772 
1773  // Get the row count
1774  size_t ct=0;
1775  if(Acol2Brow_ptr[i]!=LO_INVALID)
1776  ct = Bk.graph.row_map(Acol2Brow_ptr[i]+1) - Bk.graph.row_map(Acol2Brow_ptr[i]);
1777  else
1778  ct = Ik->graph.row_map(Acol2Irow_ptr[i]+1) - Ik->graph.row_map(Acol2Irow_ptr[i]);
1779  update+=ct;
1780 
 // On the last iteration of the final pass, write the total into the
 // closing rowptr entry.
1781  if(final && i+1==merge_numrows)
1782  Mrowptr(i+1)=update;
1783  });
1784 
1785  // Allocate nnz
 // (Reading Mrowptr directly on the host is fine here: this is the OpenMP
 // node, so the view's memory space is host-accessible.)
1786  size_t merge_nnz = Mrowptr(merge_numrows);
1787  lno_nnz_view_t Mcolind("Mcolind",merge_nnz);
1788  scalar_view_t Mvalues("Mvals",merge_nnz);
1789 
1790  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
1791  typedef Node::execution_space execution_space;
1792  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1793  Kokkos::parallel_for (range_type (0, merge_numrows),
1794  KOKKOS_LAMBDA (const size_t i) {
1795  if(Acol2Brow_ptr[i]!=LO_INVALID) {
 // Row comes from B_local; copy values and remap columns into C's colmap.
1796  size_t row = Acol2Brow_ptr[i];
1797  size_t start = Bk.graph.row_map(row);
1798  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
1799  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
1800  Mcolind(j) = Bcol2Ccol_ptr[Bk.graph.entries(j-Mrowptr(i)+start)];
1801  }
1802  }
1803  else {
 // Row comes from B_remote (the imported matrix).
1804  size_t row = Acol2Irow_ptr[i];
1805  size_t start = Ik->graph.row_map(row);
1806  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
1807  Mvalues(j) = Ik->values(j-Mrowptr(i)+start);
1808  Mcolind(j) = Icol2Ccol_ptr[Ik->graph.entries(j-Mrowptr(i)+start)];
1809  }
1810  }
1811  });
1812 
1813  Bmerged = Teuchos::rcp(new KCRS("CrsMatrix",merge_numrows,C.getColMap()->getNodeNumElements(),merge_nnz,Mvalues,Mrowptr,Mcolind));
1814 
1815  }
1816  else {
1817  // We don't have a Bimport (the easy case)
1818  Bmerged = Teuchos::rcpFromRef(Bk);
1819  }
1820 
1821 #ifdef HAVE_TPETRA_MMM_TIMINGS
1822  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPCore"))));
1823 #endif
1824 
1825  // Do the multiply on whatever we've got
1826  typename KernelHandle::nnz_lno_t AnumRows = Ak.numRows();
1827  typename KernelHandle::nnz_lno_t BnumRows = Bmerged->numRows();
1828  typename KernelHandle::nnz_lno_t BnumCols = Bmerged->numCols();
1829 
1830  lno_view_t row_mapC ("non_const_lnow_row", AnumRows + 1);
1831  lno_nnz_view_t entriesC;
1832  scalar_view_t valuesC;
1833  KernelHandle kh;
1834  kh.create_spgemm_handle(alg_enum);
1835  kh.set_team_work_size(team_work_size);
1836 
 // Symbolic phase: computes C's row-pointer array (row_mapC) and nnz count.
1837  KokkosKernels::Experimental::Graph::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Ak.graph.row_map,Ak.graph.entries,false,Bmerged->graph.row_map,Bmerged->graph.entries,false,row_mapC);
1838 
1839  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
1840  if (c_nnz_size){
1841  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
1842  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
1843  }
 // Numeric phase: fills entriesC / valuesC.
1844  KokkosKernels::Experimental::Graph::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Ak.graph.row_map,Ak.graph.entries,Ak.values,false,Bmerged->graph.row_map,Bmerged->graph.entries,Bmerged->values,false,row_mapC,entriesC,valuesC);
1845  kh.destroy_spgemm_handle();
1846 
1847 #ifdef HAVE_TPETRA_MMM_TIMINGS
1848  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
1849 #endif
1850 
1851  // Sort & set values
1852  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
1853  C.setAllValues(row_mapC,entriesC,valuesC);
1854 
1855 #ifdef HAVE_TPETRA_MMM_TIMINGS
1856  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPESFC"))));
1857 #endif
1858 
1859  // Final Fillcomplete
1860  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1861  labelList->set("Timer Label",label);
1862  RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > dummyExport;
1863  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1864 
 // Dead debug code below, disabled via #if 0.
 // NOTE(review): inside this dead block, "C->getComm()" would not compile --
 // C is used as a reference elsewhere in this function (C.setAllValues), so
 // it should read "C.getComm()" if this block is ever re-enabled.
1865 #if 0
1866  {
1867  Teuchos::ArrayRCP< const size_t > Crowptr;
1868  Teuchos::ArrayRCP< const LocalOrdinal > Ccolind;
1869  Teuchos::ArrayRCP< const Scalar > Cvalues;
1870  C.getAllValues(Crowptr,Ccolind,Cvalues);
1871 
1872  // DEBUG
1873  int MyPID = C->getComm()->getRank();
1874  printf("[%d] Crowptr = ",MyPID);
1875  for(size_t i=0; i<(size_t) Crowptr.size(); i++) {
1876  printf("%3d ",(int)Crowptr.getConst()[i]);
1877  }
1878  printf("\n");
1879  printf("[%d] Ccolind = ",MyPID);
1880  for(size_t i=0; i<(size_t)Ccolind.size(); i++) {
1881  printf("%3d ",(int)Ccolind.getConst()[i]);
1882  }
1883  printf("\n");
1884  fflush(stdout);
1885  // END DEBUG
1886  }
1887 #endif
1888 
1889 
1890 
1891 }
1892 #endif
1893 
1894 
1895 
1896 /*********************************************************************************************************/
1897 // Kernel method for computing the local portion of C = A*B
1898 template<class Scalar,
1899  class LocalOrdinal,
1900  class GlobalOrdinal,
1901  class Node>
// Kernel method for computing the local portion of C = A*B when C's graph
// already exists ("reuse" mode): only C's stored values are overwritten; any
// product entry that would fall outside C's static graph raises an exception.
//
// NOTE(review): this listing appears to omit the Aview / Bview / C parameter
// lines of the signature (doc-lines 1903-1905) -- confirm against the full
// source file.
1902 void mult_A_B_reuse(
1906  const std::string& label,
1907  const Teuchos::RCP<Teuchos::ParameterList>& params)
1908 {
1909  using Teuchos::Array;
1910  using Teuchos::ArrayRCP;
1911  using Teuchos::ArrayView;
1912  using Teuchos::RCP;
1913  using Teuchos::rcp;
1914 
1915  typedef Scalar SC;
1916  typedef LocalOrdinal LO;
1917  typedef GlobalOrdinal GO;
1918  typedef Node NO;
1919 
1920  typedef Import<LO,GO,NO> import_type;
1921  typedef Map<LO,GO,NO> map_type;
1922 
1923 #ifdef HAVE_TPETRA_MMM_TIMINGS
1924  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1925  using Teuchos::TimeMonitor;
1926  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1927 #endif
 // NOTE(review): ST_INVALID is a size_t initialized from
 // OrdinalTraits<LO>::invalid(); this matches OrdinalTraits<size_t>::invalid()
 // only via sign-extension of -1 for signed LO -- confirm this is intentional
 // (the same pattern appears in the newmatrix kernels in this file).
1928  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1929  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1930 
1931  // Build the final importer / column map, hash table lookups for C
1932  RCP<const import_type> Cimport = C.getGraph()->getImporter();
1933  RCP<const map_type> Ccolmap = C.getColMap();
1934 
1935  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
1936  {
1937  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1938  // So, column map of C may be a strict subset of the column map of B
1939  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
1940  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
1941  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
1942 
1943  if (!Bview.importMatrix.is_null()) {
1944  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1945  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1946 
 // Build the analogous lookup for the imported ("remote") part of B.
1947  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
1948  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
1949  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
1950  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
1951  }
1952  }
1953 
1954 #ifdef HAVE_TPETRA_MMM_TIMINGS
1955  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
1956 #endif
1957 
1958  // Sizes
1959  size_t m = Aview.origMatrix->getNodeNumRows();
1960  size_t n = Ccolmap->getNodeNumElements();
1961 
1962  // Get Data Pointers
1963  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP, Crowptr_RCP;
1964  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP, Ccolind_RCP;
1965  ArrayRCP<const SC> Avals_RCP, Bvals_RCP, Ivals_RCP, Cvals_RCP;
1966 
1967  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1968  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
1969  if (!Bview.importMatrix.is_null())
1970  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1971  C.getAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
1972 
1973  // For efficiency
1974  ArrayView<const size_t> Arowptr, Browptr, Irowptr, Crowptr;
1975  ArrayView<const LO> Acolind, Bcolind, Icolind, Ccolind;
1976  ArrayView<const SC> Avals, Bvals, Ivals;
1977  ArrayView<SC> Cvals;
1978  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1979  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1980  if (!Bview.importMatrix.is_null()) {
1981  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1982  }
 // C's graph is static but its values are updated in place, so cast away
 // const on the values array only.
1983  Crowptr = Crowptr_RCP(); Ccolind = Ccolind_RCP(); Cvals = (Teuchos::arcp_const_cast<SC>(Cvals_RCP))();
1984 
1985  // Run through all the hash table lookups once and for all
1986  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
1987  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
1988 
 // NOTE(review): the else branch dereferences Bview.importMatrix without a
 // null check; presumably a column of A missing from B's row map can only
 // occur when an import matrix exists -- confirm callers guarantee this.
1989  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
1990  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1991  if (B_LID != LO_INVALID) {
1992  targetMapToOrigRow[i] = B_LID;
1993  } else {
1994  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
1995  targetMapToImportRow[i] = I_LID;
1996  }
1997  }
1998 
1999  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2000 
2001  // The status array will contain the index into colind where this entry was last deposited.
2002  // c_status[i] < CSR_ip - not in the row yet
2003  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2004  // We start with this filled with INVALID's indicating that there are no entries yet.
2005  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2006  Array<size_t> c_status(n, ST_INVALID);
2007 
2008  // For each row of A/C
2009  size_t CSR_ip = 0, OLD_ip = 0;
2010  for (size_t i = 0; i < m; i++) {
2011 
2012  // First fill the c_status array w/ locations where we're allowed to
2013  // generate nonzeros for this row
2014  OLD_ip = Crowptr[i];
2015  CSR_ip = Crowptr[i+1];
2016  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2017  c_status[Ccolind[k]] = k;
2018 
2019  // Reset values in the row of C
2020  Cvals[k] = SC_ZERO;
2021  }
2022 
 // Accumulate A(i,:) * B into row i of C, using only C's existing entries.
2023  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2024  LO Aik = Acolind[k];
2025  const SC Aval = Avals[k];
2026  if (Aval == SC_ZERO)
2027  continue;
2028 
2029  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2030  // Local matrix
2031  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
2032 
2033  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2034  LO Bkj = Bcolind[j];
2035  LO Cij = Bcol2Ccol[Bkj];
2036 
 // Reuse mode may not create new entries: the target position must lie
 // inside row i's existing [OLD_ip, CSR_ip) range of C's static graph.
2037  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2038  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2039  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2040 
2041  Cvals[c_status[Cij]] += Aval * Bvals[j];
2042  }
2043 
2044  } else {
2045  // Remote matrix
2046  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
2047  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2048  LO Ikj = Icolind[j];
2049  LO Cij = Icol2Ccol[Ikj];
2050 
2051  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2052  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2053  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2054 
2055  Cvals[c_status[Cij]] += Aval * Ivals[j];
2056  }
2057  }
2058  }
2059  }
2060 
2062 }
2063 
2064 
2065 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2066 template<class Scalar,
2067  class LocalOrdinal,
2068  class GlobalOrdinal,
2069  class Node>
2070 void jacobi_A_B_newmatrix(
2071  Scalar omega,
2076  const std::string& label,
2077  const Teuchos::RCP<Teuchos::ParameterList>& params)
2078 {
2079  using Teuchos::Array;
2080  using Teuchos::ArrayRCP;
2081  using Teuchos::ArrayView;
2082  using Teuchos::RCP;
2083  using Teuchos::rcp;
2084 
2085  typedef Scalar SC;
2086  typedef LocalOrdinal LO;
2087  typedef GlobalOrdinal GO;
2088  typedef Node NO;
2089 
2090  typedef Import<LO,GO,NO> import_type;
2091  typedef Map<LO,GO,NO> map_type;
2092 
2093 #ifdef HAVE_TPETRA_MMM_TIMINGS
2094  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2095  using Teuchos::TimeMonitor;
2096  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2097 #endif
2098  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2099  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2100 
2101 
2102  // Build the final importer / column map, hash table lookups for C
2103  RCP<const import_type> Cimport;
2104  RCP<const map_type> Ccolmap;
2105  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2106  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2107 
2108  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2109  // indices of B, to local column indices of C. (B and C have the
2110  // same number of columns.) The kernel uses this, instead of
2111  // copying the entire input matrix B and converting its column
2112  // indices to those of C.
2113  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
2114 
2115  if (Bview.importMatrix.is_null()) {
2116  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2117  Cimport = Bimport;
2118  Ccolmap = Bview.colMap;
2119  // Bcol2Ccol is trivial
2120  for (size_t i = 0; i < Bview.colMap->getNodeNumElements(); i++)
2121  Bcol2Ccol[i] = Teuchos::as<LO>(i);
2122 
2123  } else {
2124  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2125  // column Map of C, as well as C's Import object (from its domain
2126  // Map to its column Map). C's column Map is the union of the
2127  // column Maps of (the local part of) B, and the "remote" part of
2128  // B. Ditto for the Import. We have optimized this "setUnion"
2129  // operation on Import objects and Maps.
2130 
2131  // Choose the right variant of setUnion
2132  if (!Bimport.is_null() && !Iimport.is_null()){
2133  Cimport = Bimport->setUnion(*Iimport);
2134  Ccolmap = Cimport->getTargetMap();
2135 
2136  } else if (!Bimport.is_null() && Iimport.is_null()) {
2137  Cimport = Bimport->setUnion();
2138 
2139  } else if(Bimport.is_null() && !Iimport.is_null()) {
2140  Cimport = Iimport->setUnion();
2141 
2142  } else
2143  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2144 
2145  Ccolmap = Cimport->getTargetMap();
2146 
2147  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2148  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2149 
2150  // NOTE: This is not efficient and should be folded into setUnion
2151  //
2152  // mfh 27 Sep 2016: What the above comment means, is that the
2153  // setUnion operation on Import objects could also compute these
2154  // local index - to - local index look-up tables.
2155  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
2156  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
2157  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
2158 
2159  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
2160  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
2161  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
2162  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
2163  }
2164 
2165 #ifdef HAVE_TPETRA_MMM_TIMINGS
2166  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix SerialCore"))));
2167 #endif
2168 
2169  // Sizes
2170  size_t m = Aview.origMatrix->getNodeNumRows();
2171  size_t n = Ccolmap->getNodeNumElements();
2172 
2173  // Get Data Pointers
2174  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
2175  ArrayRCP<size_t> Crowptr_RCP;
2176  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
2177  ArrayRCP<LO> Ccolind_RCP;
2178  ArrayRCP<const SC> Avals_RCP, Bvals_RCP, Ivals_RCP;
2179  ArrayRCP<SC> Cvals_RCP;
2180  ArrayRCP<const SC> Dvals_RCP;
2181 
2182  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
2183  // out of the CrsMatrix. This code computes A * (B_local +
2184  // B_remote), where B_local contains the locally owned rows of B,
2185  // and B_remote the (previously Import'ed) remote rows of B.
2186 
2187  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
2188  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
2189  if (!Bview.importMatrix.is_null())
2190  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
2191 
2192  // mfh 27 Sep 2016: The "Jacobi" case scales by the inverse of a
2193  // diagonal matrix.
2194  Dvals_RCP = Dinv.getData();
2195 
2196  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
2197  // where Teuchos::ArrayRCP::operator[] may be slower than
2198  // Teuchos::ArrayView::operator[].
2199 
2200  // For efficiency
2201  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
2202  ArrayView<const LO> Acolind, Bcolind, Icolind;
2203  ArrayView<const SC> Avals, Bvals, Ivals;
2204  ArrayView<size_t> Crowptr;
2205  ArrayView<LO> Ccolind;
2206  ArrayView<SC> Cvals;
2207  ArrayView<const SC> Dvals;
2208  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
2209  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
2210  if (!Bview.importMatrix.is_null()) {
2211  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
2212  }
2213  Dvals = Dvals_RCP();
2214 
2215  // The status array will contain the index into colind where this entry was last deposited.
2216  // c_status[i] < CSR_ip - not in the row yet.
2217  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2218  // We start with this filled with INVALID's indicating that there are no entries yet.
2219  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2220  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2221  Array<size_t> c_status(n, ST_INVALID);
2222 
2223  // Classic csr assembly (low memory edition)
2224  //
2225  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2226  // The method loops over rows of A, and may resize after processing
2227  // each row. Chris Siefert says that this reflects experience in
2228  // ML; for the non-threaded case, ML found it faster to spend less
2229  // effort on estimation and risk an occasional reallocation.
2230  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2231  size_t CSR_ip = 0, OLD_ip = 0;
2232  Crowptr_RCP.resize(m+1); Crowptr = Crowptr_RCP();
2233  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
2234  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
2235 
2236  // mfh 27 Sep 2016: Construct tables that map from local column
2237  // indices of A, to local row indices of either B_local (the locally
2238  // owned part of B), or B_remote (the "imported" remote part of B).
2239  //
2240  // For column index Aik in row i of A, if the corresponding row of B
2241  // exists in the local part of B ("orig") (which I'll call B_local),
2242  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2243  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2244  //
2245  // For column index Aik in row i of A, if the corresponding row of B
2246  // exists in the remote part of B ("Import") (which I'll call
2247  // B_remote), then targetMapToImportRow[Aik] is the local index of
2248  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2249  // (a flag value).
2250 
2251  // Run through all the hash table lookups once and for all
2252  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
2253  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
2254 
2255  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
2256  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
2257  if (B_LID != LO_INVALID) {
2258  targetMapToOrigRow[i] = B_LID;
2259  } else {
2260  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
2261  targetMapToImportRow[i] = I_LID;
2262  }
2263  }
2264 
2265  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2266 
2267  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2268  // routine. The routine computes
2269  //
2270  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2271  //
2272  // This corresponds to one sweep of (weighted) Jacobi.
2273  //
2274  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2275  // you whether the corresponding row of B belongs to B_local
2276  // ("orig") or B_remote ("Import").
2277 
2278  // For each row of A/C
2279  for (size_t i = 0; i < m; i++) {
2280  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2281  // on the calling process.
2282  Crowptr[i] = CSR_ip;
2283  SC Dval = Dvals[i];
2284 
2285  // Entries of B
2286  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2287  Scalar Bval = Bvals[j];
2288  if (Bval == SC_ZERO)
2289  continue;
2290  LO Bij = Bcolind[j];
2291  LO Cij = Bcol2Ccol[Bij];
2292 
2293  // Assume no repeated entries in B
2294  c_status[Cij] = CSR_ip;
2295  Ccolind[CSR_ip] = Cij;
2296  Cvals[CSR_ip] = Bvals[j];
2297  CSR_ip++;
2298  }
2299 
2300  // Entries of -omega * Dinv * A * B
2301  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2302  LO Aik = Acolind[k];
2303  const SC Aval = Avals[k];
2304  if (Aval == SC_ZERO)
2305  continue;
2306 
2307  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2308  // Local matrix
2309  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
2310 
2311  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2312  LO Bkj = Bcolind[j];
2313  LO Cij = Bcol2Ccol[Bkj];
2314 
2315  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2316  // New entry
2317  c_status[Cij] = CSR_ip;
2318  Ccolind[CSR_ip] = Cij;
2319  Cvals[CSR_ip] = - omega * Dval* Aval * Bvals[j];
2320  CSR_ip++;
2321 
2322  } else {
2323  Cvals[c_status[Cij]] -= omega * Dval* Aval * Bvals[j];
2324  }
2325  }
2326 
2327  } else {
2328  // Remote matrix
2329  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
2330  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2331  LO Ikj = Icolind[j];
2332  LO Cij = Icol2Ccol[Ikj];
2333 
2334  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2335  // New entry
2336  c_status[Cij] = CSR_ip;
2337  Ccolind[CSR_ip] = Cij;
2338  Cvals[CSR_ip] = - omega * Dval* Aval * Ivals[j];
2339  CSR_ip++;
2340  } else {
2341  Cvals[c_status[Cij]] -= omega * Dval* Aval * Ivals[j];
2342  }
2343  }
2344  }
2345  }
2346 
2347  // Resize for next pass if needed
2348  if (CSR_ip + n > CSR_alloc) {
2349  CSR_alloc *= 2;
2350  Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
2351  Cvals_RCP.resize(CSR_alloc); Cvals = Cvals_RCP();
2352  }
2353  OLD_ip = CSR_ip;
2354  }
2355 
2356  Crowptr[m] = CSR_ip;
2357 
2358  // Downward resize
2359  Cvals_RCP .resize(CSR_ip);
2360  Ccolind_RCP.resize(CSR_ip);
2361 
2362 
2363 #ifdef HAVE_TPETRA_MMM_TIMINGS
2364  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort"))));
2365 #endif
2366 
2367  // Replace the column map
2368  //
2369  // mfh 27 Sep 2016: We do this because C was originally created
2370  // without a column Map. Now we have its column Map.
2371  C.replaceColMap(Ccolmap);
2372 
2373  // Final sort & set of CRS arrays
2374  //
2375  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2376  // matrix-matrix multiply routine sort the entries for us?
2377  Import_Util::sortCrsEntries(Crowptr_RCP(), Ccolind_RCP(), Cvals_RCP());
2378  // mfh 27 Sep 2016: This just sets pointers.
2379  C.setAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
2380 
2381 #ifdef HAVE_TPETRA_MMM_TIMINGS
2382  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC"))));
2383 #endif
2384 
2385  // Final FillComplete
2386  //
2387  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2388  // Import (from domain Map to column Map) construction (which costs
2389  // lots of communication) by taking the previously constructed
2390  // Import object. We should be able to do this without interfering
2391  // with the implementation of the local part of sparse matrix-matrix
2392  // multply above
2393  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2394  labelList->set("Timer Label",label);
2395  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2396  RCP<const Export<LO,GO,NO> > dummyExport;
2397  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2398 }
2399 
2400 
2401 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
// "Reuse" variant: C already has its graph and column Map from a previous
// call, so this routine only overwrites C's values in place -- it must never
// create a new entry (doing so throws, see the static-graph checks below).
//
// Parameters visible here:
//   omega  - Jacobi damping parameter; scales the -omega*Dinv*A*B term.
//   label  - prefix used for the optional MMM timers.
//   params - optional parameter list (not read in this routine's visible body).
// NOTE(review): the parameter lines declaring Dinv, Aview, Bview and C are not
// visible in this view of the file; their roles below are inferred from use --
// confirm against the declaration.
2402 template<class Scalar,
2403  class LocalOrdinal,
2404  class GlobalOrdinal,
2405  class Node>
2406 void jacobi_A_B_reuse(
2407  Scalar omega,
2412  const std::string& label,
2413  const Teuchos::RCP<Teuchos::ParameterList>& params)
2414 {
2415  using Teuchos::Array;
2416  using Teuchos::ArrayRCP;
2417  using Teuchos::ArrayView;
2418  using Teuchos::RCP;
2419  using Teuchos::rcp;
2420 
2421  typedef Scalar SC;
2422  typedef LocalOrdinal LO;
2423  typedef GlobalOrdinal GO;
2424  typedef Node NO;
2425 
2426  typedef Import<LO,GO,NO> import_type;
2427  typedef Map<LO,GO,NO> map_type;
2428 
2429 #ifdef HAVE_TPETRA_MMM_TIMINGS
2430  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2431  using Teuchos::TimeMonitor;
2432  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2433 #endif
// NOTE(review): the "new matrix" path builds its size_t sentinel from
// OrdinalTraits<size_t>::invalid(); here it is OrdinalTraits<LO>::invalid()
// widened to size_t.  Both yield a "huge" flag value, which is all the
// c_status window test below requires, but the inconsistency is worth
// confirming upstream.
2434  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2435  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2436 
2437  // Build the final importer / column map, hash table lookups for C
2438  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2439  RCP<const map_type> Ccolmap = C.getColMap();
2440 
// Bcol2Ccol / Icol2Ccol translate local column indices of B (resp. the
// imported remote part of B) into local column indices of C.
2441  Array<LO> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
2442 
2443  if (Bview.importMatrix.is_null()) {
2444  // Bcol2Ccol is trivial
2445  // This is possible. This situation is different from mult_A_B_reuse, as we
2446  // always add B
2447  for (size_t i = 0; i < Bview.colMap->getNodeNumElements(); i++)
2448  Bcol2Ccol[i] = Teuchos::as<LO>(i);
2449 
2450  } else {
2451  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2452  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2453 
2454  // NOTE: This is not efficient and should be folded into setUnion
// Translate through global IDs: B-local -> GID -> C-local.
2455  Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
2456  ArrayView<const GO> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
2457  ArrayView<const GO> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
2458 
2459  for (size_t i = 0; i < Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
2460  Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
2461  for (size_t i = 0; i < Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
2462  Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
2463  }
2464 
2465 #ifdef HAVE_TPETRA_MMM_TIMINGS
2466  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2467 #endif
2468 
2469  // Sizes
// m = local rows of A (and C); n = local columns of C.
2470  size_t m = Aview.origMatrix->getNodeNumRows();
2471  size_t n = Ccolmap->getNodeNumElements();
2472 
2473  // Get Data Pointers
2474  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP, Crowptr_RCP;
2475  ArrayRCP<const LO> Acolind_RCP, Bcolind_RCP, Icolind_RCP, Ccolind_RCP;
2476  ArrayRCP<const SC> Avals_RCP, Bvals_RCP, Ivals_RCP, Cvals_RCP, Dvals_RCP;
2477 
2478  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
2479  Bview.origMatrix->getAllValues(Browptr_RCP, Bcolind_RCP, Bvals_RCP);
2480  if (!Bview.importMatrix.is_null())
2481  Bview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
2482  C.getAllValues(Crowptr_RCP, Ccolind_RCP, Cvals_RCP);
2483  Dvals_RCP = Dinv.getData();
2484 
2485  // For efficiency
2486  ArrayView<const size_t> Arowptr, Browptr, Irowptr, Crowptr;
2487  ArrayView<const LO> Acolind, Bcolind, Icolind, Ccolind;
2488  ArrayView<const SC> Avals, Bvals, Ivals;
2489  ArrayView<SC> Cvals;
2490  ArrayView<const SC> Dvals;
2491  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
2492  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
2493  if (!Bview.importMatrix.is_null()) {
2494  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
2495  }
// Cast away const on C's values only: the graph stays static, but the value
// array is overwritten in place below.
2496  Crowptr = Crowptr_RCP(); Ccolind = Ccolind_RCP(); Cvals = (Teuchos::arcp_const_cast<SC>(Cvals_RCP))();
2497  Dvals = Dvals_RCP();
2498 
2499  // Run through all the hash table lookups once and for all
// For each local column Aik of A: targetMapToOrigRow[Aik] is the matching
// local row of B if locally owned, otherwise targetMapToImportRow[Aik] is
// the matching local row of the imported remote part of B.
2500  Array<LO> targetMapToOrigRow (Aview.colMap->getNodeNumElements(), LO_INVALID);
2501  Array<LO> targetMapToImportRow(Aview.colMap->getNodeNumElements(), LO_INVALID);
2502 
2503  for (LO i = Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
2504  LO B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
2505  if (B_LID != LO_INVALID) {
2506  targetMapToOrigRow[i] = B_LID;
2507  } else {
// NOTE(review): if Bview.importMatrix is null but some column of A does not
// map to a local row of B, this dereferences a null RCP.  Presumably the
// import_and_extract_views step guarantees that cannot happen -- confirm.
2508  LO I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
2509  targetMapToImportRow[i] = I_LID;
2510  }
2511  }
2512 
2513  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2514 
2515  // The status array will contain the index into colind where this entry was last deposited.
2516  // c_status[i] < CSR_ip - not in the row yet
2517  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2518  // We start with this filled with INVALID's indicating that there are no entries yet.
2519  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2520  Array<size_t> c_status(n, ST_INVALID);
2521 
2522  // For each row of A/C
2523  size_t CSR_ip = 0, OLD_ip = 0;
2524  for (size_t i = 0; i < m; i++) {
2525 
2526  // First fill the c_status array w/ locations where we're allowed to
2527  // generate nonzeros for this row
// [OLD_ip, CSR_ip) is row i's fixed slot range in C's CSR arrays; a column
// whose c_status entry falls outside that window has no slot in this row,
// and writing to it would mean inserting into the static graph.
2528  OLD_ip = Crowptr[i];
2529  CSR_ip = Crowptr[i+1];
2530  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2531  c_status[Ccolind[k]] = k;
2532 
2533  // Reset values in the row of C
2534  Cvals[k] = SC_ZERO;
2535  }
2536 
2537  SC Dval = Dvals[i];
2538 
2539  // Entries of B
// C(i,:) = B(i,:): the identity part of (I - omega*Dinv*A) applied to B.
2540  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2541  Scalar Bval = Bvals[j];
2542  if (Bval == SC_ZERO)
2543  continue;
2544  LO Bij = Bcolind[j];
2545  LO Cij = Bcol2Ccol[Bij];
2546 
2547  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2548  std::runtime_error, "Trying to insert a new entry into a static graph");
2549 
2550  Cvals[c_status[Cij]] = Bvals[j];
2551  }
2552 
2553  // Entries of -omega * Dinv * A * B
2554  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2555  LO Aik = Acolind[k];
2556  const SC Aval = Avals[k];
2557  if (Aval == SC_ZERO)
2558  continue;
2559 
2560  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2561  // Local matrix
2562  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
2563 
2564  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2565  LO Bkj = Bcolind[j];
2566  LO Cij = Bcol2Ccol[Bkj];
2567 
2568  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2569  std::runtime_error, "Trying to insert a new entry into a static graph");
2570 
2571  Cvals[c_status[Cij]] -= omega * Dval* Aval * Bvals[j];
2572  }
2573 
2574  } else {
2575  // Remote matrix
2576  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
2577  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2578  LO Ikj = Icolind[j];
2579  LO Cij = Icol2Ccol[Ikj];
2580 
2581  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2582  std::runtime_error, "Trying to insert a new entry into a static graph");
2583 
2584  Cvals[c_status[Cij]] -= omega * Dval* Aval * Ivals[j];
2585  }
2586  }
2587  }
2588  }
2589 
2591 }
2592 
2593 
2594 template<class Scalar,
2595  class LocalOrdinal,
2596  class GlobalOrdinal,
2597  class Node>
// Populate 'Aview' with views of the rows of A named by 'targetMap';
// remotely-owned rows are imported into Aview.importMatrix (see the block
// comment just inside the body).
//
// Parameters visible here:
//   targetMap - the rows we need views of.
//   prototypeImporter - if non-null and its source/target maps match A's row
//       map and targetMap, its remote-LID list is reused instead of being
//       recomputed (mode 1 below).
//   userAssertsThereAreNoRemotes - caller's promise that every target row is
//       locally owned; short-circuits all communication.
//   label  - prefix used for the optional MMM timers.
//   params - optional list; may cache the "importer" across calls and carry
//       "compute global constants".
// NOTE(review): the parameter lines declaring Aview and A are not visible in
// this view of the file; their roles are inferred from use -- confirm against
// the declaration.
2598 void import_and_extract_views(
2600  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2602  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2603  bool userAssertsThereAreNoRemotes,
2604  const std::string& label,
2605  const Teuchos::RCP<Teuchos::ParameterList>& params)
2606 {
2607  using Teuchos::Array;
2608  using Teuchos::ArrayView;
2609  using Teuchos::RCP;
2610  using Teuchos::rcp;
2611  using Teuchos::null;
2612 
2613  typedef Scalar SC;
2614  typedef LocalOrdinal LO;
2615  typedef GlobalOrdinal GO;
2616  typedef Node NO;
2617 
2618  typedef Map<LO,GO,NO> map_type;
2619  typedef Import<LO,GO,NO> import_type;
2620  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2621 
2622 #ifdef HAVE_TPETRA_MMM_TIMINGS
2623  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2624  using Teuchos::TimeMonitor;
2625  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2626 #endif
2627  // The goal of this method is to populate the 'Aview' struct with views of the
2628  // rows of A, including all rows that correspond to elements in 'targetMap'.
2629  //
2630  // If targetMap includes local elements that correspond to remotely-owned rows
2631  // of A, then those remotely-owned rows will be imported into
2632  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2633  Aview.deleteContents();
2634 
// Non-owning RCP: Aview must not outlive A.
2635  Aview.origMatrix = rcp(&A, false);
2636  Aview.origRowMap = A.getRowMap();
2637  Aview.rowMap = targetMap;
2638  Aview.colMap = A.getColMap();
2639  Aview.domainMap = A.getDomainMap();
2640  Aview.importColMap = null;
2641 
2642  // Short circuit if the user swears there are no remotes
2643  if (userAssertsThereAreNoRemotes)
2644  return;
2645 
2646  RCP<const import_type> importer;
2647  if (params != null && params->isParameter("importer")) {
// Reuse an importer cached in 'params' by a previous call (stored below).
2648  importer = params->get<RCP<const import_type> >("importer");
2649 
2650  } else {
2651 #ifdef HAVE_TPETRA_MMM_TIMINGS
2652  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2653 #endif
2654 
2655  // Mark each row in targetMap as local or remote, and go ahead and get a view
2656  // for the local rows
2657  RCP<const map_type> rowMap = A.getRowMap(), remoteRowMap;
2658  size_t numRemote = 0;
// mode: 1 = remotes taken from prototypeImporter; 2 = remotes counted by
// scanning targetMap; 3 = prototypeImporter supplied but its maps don't
// match (only tolerable if no rank actually needs to import anything).
2659  int mode = 0;
2660  if (!prototypeImporter.is_null() &&
2661  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2662  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2663  // We have a valid prototype importer --- ask it for the remotes
2664  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2665  numRemote = prototypeImporter->getNumRemoteIDs();
2666 
2667  Array<GO> remoteRows(numRemote);
2668  for (size_t i = 0; i < numRemote; i++)
2669  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2670 
2671  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2672  rowMap->getIndexBase(), rowMap->getComm(), rowMap->getNode()))2673  mode = 1;
2674 
2675  } else if (prototypeImporter.is_null()) {
2676  // No prototype importer --- count the remotes the hard way
2677  ArrayView<const GO> rows = targetMap->getNodeElementList();
2678  size_t numRows = targetMap->getNodeNumElements();
2679 
// Over-allocate to numRows, then shrink once the remotes are known.
2680  Array<GO> remoteRows(numRows);
2681  for(size_t i = 0; i < numRows; ++i) {
2682  const LO mlid = rowMap->getLocalElement(rows[i]);
2683 
2684  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2685  remoteRows[numRemote++] = rows[i];
2686  }
2687  remoteRows.resize(numRemote);
2688  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2689  rowMap->getIndexBase(), rowMap->getComm(), rowMap->getNode()));
2690  mode = 2;
2691 
2692  } else {
2693  // PrototypeImporter is bad. But if we're in serial that's OK.
2694  mode = 3;
2695  }
2696 
2697  const int numProcs = rowMap->getComm()->getSize();
2698  if (numProcs < 2) {
2699  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2700  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2701  // If only one processor we don't need to import any remote rows, so return.
2702  return;
2703  }
2704 
2705  //
2706  // Now we will import the needed remote rows of A, if the global maximum
2707  // value of numRemote is greater than 0.
2708  //
2709 #ifdef HAVE_TPETRA_MMM_TIMINGS
2710  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2711 #endif
2712 
// Collective call: every rank must reach this reduceAll (the serial-only
// early return above happens before any communication).
2713  global_size_t globalMaxNumRemote = 0;
2714  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2715 
2716  if (globalMaxNumRemote > 0) {
2717 #ifdef HAVE_TPETRA_MMM_TIMINGS
2718  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2719 #endif
2720  // Create an importer with target-map remoteRowMap and source-map rowMap.
2721  if (mode == 1)
2722  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2723  else if (mode == 2)
2724  importer = rcp(new import_type(rowMap, remoteRowMap));
2725  else
2726  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2727  }
2728 
// Cache the importer for subsequent calls that pass the same params list.
// NOTE(review): when globalMaxNumRemote == 0 this caches a null importer,
// which the isParameter check above still treats as "present" -- presumably
// intentional (null meaning "no remotes"); confirm.
2729  if (params != null)
2730  params->set("importer", importer);
2731  }
2732 
2733  if (importer != null) {
2734 #ifdef HAVE_TPETRA_MMM_TIMINGS
2735  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
2736 #endif
2737 
2738  // Now create a new matrix into which we can import the remote rows of A that we need.
2739  Teuchos::ParameterList labelList;
2740  labelList.set("Timer Label", label);
2741  // Minor speedup tweak - avoid computing the global constants
2742  if(!params.is_null())
2743  labelList.set("compute global constants", params->get("compute global constants",false));
2744  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
2745  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
2746 
2747 #ifdef HAVE_TPETRA_MMM_STATISTICS
2748  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
2749 #endif
2750 
2751 
2752 #ifdef HAVE_TPETRA_MMM_TIMINGS
2753  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
2754 #endif
2755 
2756  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
2757  Aview.importColMap = Aview.importMatrix->getColMap();
2758  }
2759 }
2760 
2761 
2762 
2763 } //End namespace MMdetails
2764 
2765 } //End namespace Tpetra
2766 //
2767 // Explicit instantiation macro
2768 //
2769 // Must be expanded from within the Tpetra namespace!
2770 //
2771 
// Explicit-instantiation macro: for one (SCALAR, LO, GO, NODE) tuple it
// instantiates MatrixMatrix::Multiply, MatrixMatrix::Jacobi, both overloads
// of MatrixMatrix::Add, and MatrixMatrix::add.  All documentation lives here,
// above the #define: a '//' comment inside the macro body would swallow the
// line-continuation backslash and truncate the macro.
2772 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
2773  \
2774  template \
2775  void MatrixMatrix::Multiply( \
2776  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2777  bool transposeA, \
2778  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2779  bool transposeB, \
2780  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
2781  bool call_FillComplete_on_result, \
2782  const std::string & label, \
2783  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2784 \
2785 template \
2786  void MatrixMatrix::Jacobi( \
2787  SCALAR omega, \
2788  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
2789  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2790  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2791  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
2792  bool call_FillComplete_on_result, \
2793  const std::string & label, \
2794  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2795 \
2796  template \
2797  void MatrixMatrix::Add( \
2798  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2799  bool transposeA, \
2800  SCALAR scalarA, \
2801  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2802  bool transposeB, \
2803  SCALAR scalarB, \
2804  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
2805  \
2806  template \
2807  void MatrixMatrix::Add( \
2808  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
2809  bool transposeA, \
2810  SCALAR scalarA, \
2811  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
2812  SCALAR scalarB ); \
2813  \
2814  template \
2815  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
2816  MatrixMatrix::add (const SCALAR & alpha, \
2817  const bool transposeA, \
2818  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
2819  const SCALAR & beta, \
2820  const bool transposeB, \
2821  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
2822  const Teuchos::RCP<const Map< LO , GO , NODE > >& domainMap, \
2823  const Teuchos::RCP<const Map< LO , GO , NODE > >& rangeMap, \
2824  const Teuchos::RCP<Teuchos::ParameterList>& params); \
2825 \
2826 
2827 
2828 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
ProfileType getProfileType() const
Returns true if the matrix was allocated with static data structures.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > importColMap
Colmap garnered as a result of the import.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t getNodeMaxNumRowEntries() const
Returns the maximum number of entries across all rows/columns on this node.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const
This matrix's graph, as a RowGraph.
bool isFillActive() const
Whether the matrix is not fill complete.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > domainMap
Domain map for original matrix.
Teuchos::RCP< const map_type > getRowMap() const
The Map that describes the row distribution in this matrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
size_t getNumReceives() const
The number of processes from which we will receive data.
size_t getNumSends() const
The number of processes to which we will send data.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
size_t global_size_t
Global size_t object.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
global_size_t getGlobalNumRows() const
Number of global elements in the row map of this matrix.
bool isFillComplete() const
Whether the matrix is fill complete.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
void keyValueMerge(KeyInputIterType keyBeg1, KeyInputIterType keyEnd1, ValueInputIterType valBeg1, ValueInputIterType valEnd1, KeyInputIterType keyBeg2, KeyInputIterType keyEnd2, ValueInputIterType valBeg2, ValueInputIterType valEnd2, KeyOutputIterType keyOut, ValueOutputIterType valOut, BinaryFunction f)
Merge two sorted (by keys) sequences of unique (key,value) pairs by combining pairs with equal keys...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > importMatrix
The imported matrix.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Sets up and executes a communication plan for a Tpetra DistObject.
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::ArrayRCP< const Scalar > getData() const
Const view of the local values of this vector.
Utility functions for packing and unpacking sparse matrix entries.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, LocalOrdinal, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
size_t getNodeNumRows() const
The number of matrix rows owned by the calling process.
A parallel distribution of indices over processes.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A read-only, row-oriented interface to a sparse matrix.
A distributed dense vector.
Stand-alone utility functions and macros.
Teuchos::RCP< const map_type > rowMap
Desired row map for "imported" version of the matrix.
void getLastDoStatistics(size_t &bytes_sent, size_t &bytes_recvd) const
Information on the last call to do/doReverse.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > getColMap() const
The Map that describes the column distribution in this matrix.
Teuchos::RCP< const map_type > getDomainMap() const
The domain Map of this matrix.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
Teuchos::RCP< const map_type > getRangeMap() const
The range Map of this matrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix is distributed.
Teuchos::RCP< const map_type > origRowMap
Original row map of matrix.
bool isLocallyIndexed() const
Whether the matrix is locally indexed on the calling process.