// TpetraExt_MatrixMatrix_def.hpp
// Tpetra parallel linear algebra — definitions for sparse matrix-matrix
// multiply, Jacobi, and add kernels.
// NOTE(review): this file was recovered from a generated (Doxygen) source
// listing. The leading per-line numbers below are artifacts of that
// extraction, and gaps in that numbering indicate lines the extraction
// dropped. Recover the canonical file from the Trilinos repository before
// attempting to compile.
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41#ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42#define TPETRA_MATRIXMATRIX_DEF_HPP
43#include "Tpetra_ConfigDefs.hpp"
45#include "Teuchos_VerboseObject.hpp"
46#include "Teuchos_Array.hpp"
47#include "Tpetra_Util.hpp"
48#include "Tpetra_CrsMatrix.hpp"
50#include "Tpetra_RowMatrixTransposer.hpp"
53#include "Tpetra_Details_makeColMap.hpp"
54#include "Tpetra_ConfigDefs.hpp"
55#include "Tpetra_Map.hpp"
56#include "Tpetra_Export.hpp"
59#include <algorithm>
60#include <type_traits>
61#include "Teuchos_FancyOStream.hpp"
62
63#include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65
66#include "KokkosSparse_spgemm.hpp"
67#include "KokkosSparse_spadd.hpp"
68#include "Kokkos_Bitset.hpp"
69
71
79/*********************************************************************************************************/
80// Include the architecture-specific kernel partial specializations here
81// NOTE: This needs to be outside all namespaces
82#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83#include "TpetraExt_MatrixMatrix_Cuda.hpp"
84#include "TpetraExt_MatrixMatrix_HIP.hpp"
85
86namespace Tpetra {
87
88namespace MatrixMatrix{
89
90//
91// This method forms the matrix-matrix product C = op(A) * op(B), where
92// op(A) == A if transposeA is false,
93// op(A) == A^T if transposeA is true,
94// and similarly for op(B).
95//
96template <class Scalar,
97 class LocalOrdinal,
98 class GlobalOrdinal,
99 class Node>
102 bool transposeA,
104 bool transposeB,
107 const std::string& label,
108 const Teuchos::RCP<Teuchos::ParameterList>& params)
109{
110 using Teuchos::null;
111 using Teuchos::RCP;
112 using Teuchos::rcp;
113 typedef Scalar SC;
114 typedef LocalOrdinal LO;
115 typedef GlobalOrdinal GO;
116 typedef Node NO;
117 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
118 typedef Import<LO,GO,NO> import_type;
120 typedef Map<LO,GO,NO> map_type;
122
123#ifdef HAVE_TPETRA_MMM_TIMINGS
124 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
125 using Teuchos::TimeMonitor;
126 //MM is used to time setup, and then multiply.
127
128 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
129#endif
130
131 const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
132
133 // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
134
135 // The input matrices A and B must both be fillComplete.
136 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
137 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
138
139 // If transposeA is true, then Aprime will be the transpose of A
140 // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
141 // will just be a pointer to A.
143 // If transposeB is true, then Bprime will be the transpose of B
144 // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
145 // will just be a pointer to B.
147
148 // Is this a "clean" matrix?
149 //
150 // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
151 // locally nor globally indexed, then it was empty. I don't like
152 // this, because the most straightforward implementation presumes
153 // lazy allocation of indices. However, historical precedent
154 // demands that we keep around this predicate as a way to test
155 // whether the matrix is empty.
156 const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
157
158 bool use_optimized_ATB = false;
160 use_optimized_ATB = true;
161
162#ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163 use_optimized_ATB = false;
164#endif
165
166 using Teuchos::ParameterList;
168 transposeParams->set ("sort", false);
169
171 transposer_type transposer (rcpFromRef (A));
172 Aprime = transposer.createTranspose (transposeParams);
173 }
174 else {
175 Aprime = rcpFromRef(A);
176 }
177
178 if (transposeB) {
179 transposer_type transposer (rcpFromRef (B));
180 Bprime = transposer.createTranspose (transposeParams);
181 }
182 else {
183 Bprime = rcpFromRef(B);
184 }
185
186 // Check size compatibility
187 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
188 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
189 global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
190 global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
191 global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
192 global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
193 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194 prefix << "ERROR, inner dimensions of op(A) and op(B) "
195 "must match for matrix-matrix product. op(A) is "
196 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
197
198 // The result matrix C must at least have a row-map that reflects the correct
199 // row-size. Don't check the number of columns because rectangular matrices
200 // which were constructed with only one map can still end up having the
201 // correct capacity and dimensions when filled.
202 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
203 prefix << "ERROR, dimensions of result C must "
204 "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
205 << " rows, should have at least " << Aouter << std::endl);
206
207 // It doesn't matter whether C is already Filled or not. If it is already
208 // Filled, it must have space allocated for the positions that will be
209 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
210 // we'll error out later when trying to store result values.
211
212 // CGB: However, matrix must be in active-fill
213 if (!C.isFillActive()) C.resumeFill();
214
215 // We're going to need to import remotely-owned sections of A and/or B if
216 // more than one processor is performing this run, depending on the scenario.
217 int numProcs = A.getComm()->getSize();
218
219 // Declare a couple of structs that will be used to hold views of the data
220 // of A and B, to be used for fast access during the matrix-multiplication.
223
226
227#ifdef HAVE_TPETRA_MMM_TIMINGS
228 {
229 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
230#endif
231
232 // Now import any needed remote rows and populate the Aview struct
233 // NOTE: We assert that an import isn't needed --- since we do the transpose
234 // above to handle that.
235 if (!use_optimized_ATB) {
237 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
238 }
239
240 // We will also need local access to all rows of B that correspond to the
241 // column-map of op(A).
242 if (numProcs > 1)
243 targetMap_B = Aprime->getColMap();
244
245 // Import any needed remote rows and populate the Bview struct.
247 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
248
249#ifdef HAVE_TPETRA_MMM_TIMINGS
250 } //stop MM_importExtract here
251 //stop the setup timer, and start the multiply timer
252 MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
253#endif
254
255 // Call the appropriate method to perform the actual multiplication.
256 if (use_optimized_ATB) {
257 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258
259 } else if (call_FillComplete_on_result && newFlag) {
260 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261
262 } else if (call_FillComplete_on_result) {
263 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
264
265 } else {
266 // mfh 27 Sep 2016: Is this the "slow" case? This
267 // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
268 // thread-parallel inserts, but that may take some effort.
270
271 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
272 }
273
274#ifdef HAVE_TPETRA_MMM_TIMINGS
275 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
276#endif
277 if (call_FillComplete_on_result && !C.isFillComplete()) {
278 // We'll call FillComplete on the C matrix before we exit, and give it a
279 // domain-map and a range-map.
280 // The domain-map will be the domain-map of B, unless
281 // op(B)==transpose(B), in which case the range-map of B will be used.
282 // The range-map will be the range-map of A, unless op(A)==transpose(A),
283 // in which case the domain-map of A will be used.
284 C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
285 }
286}
287
288
289template <class Scalar,
290 class LocalOrdinal,
291 class GlobalOrdinal,
292 class Node>
299 const std::string& label,
300 const Teuchos::RCP<Teuchos::ParameterList>& params)
301{
302 using Teuchos::RCP;
303 typedef Scalar SC;
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
306 typedef Node NO;
307 typedef Import<LO,GO,NO> import_type;
309 typedef Map<LO,GO,NO> map_type;
310 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
311
312#ifdef HAVE_TPETRA_MMM_TIMINGS
313 std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
314 using Teuchos::TimeMonitor;
315 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
316#endif
317
318 const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
319
320 // A and B should already be Filled.
321 // Should we go ahead and call FillComplete() on them if necessary or error
322 // out? For now, we choose to error out.
323 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
324 TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
325
326 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
328
329 // Now check size compatibility
330 global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
331 global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
332 global_size_t Aouter = A.getGlobalNumRows();
335 global_size_t Binner = B.getGlobalNumRows();
336 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337 prefix << "ERROR, inner dimensions of op(A) and op(B) "
338 "must match for matrix-matrix product. op(A) is "
339 << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
340
341 // The result matrix C must at least have a row-map that reflects the correct
342 // row-size. Don't check the number of columns because rectangular matrices
343 // which were constructed with only one map can still end up having the
344 // correct capacity and dimensions when filled.
345 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
346 prefix << "ERROR, dimensions of result C must "
347 "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
348 << " rows, should have at least "<< Aouter << std::endl);
349
350 // It doesn't matter whether C is already Filled or not. If it is already
351 // Filled, it must have space allocated for the positions that will be
352 // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
353 // we'll error out later when trying to store result values.
354
355 // CGB: However, matrix must be in active-fill
356 TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
357
358 // We're going to need to import remotely-owned sections of A and/or B if
359 // more than one processor is performing this run, depending on the scenario.
360 int numProcs = A.getComm()->getSize();
361
362 // Declare a couple of structs that will be used to hold views of the data of
363 // A and B, to be used for fast access during the matrix-multiplication.
366
369
370#ifdef HAVE_TPETRA_MMM_TIMINGS
371 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
372 {
373#endif
374
375 // Enable globalConstants by default
376 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
377 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
378 if(!params.is_null()) {
379 importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
381 auto slist = params->sublist("matrixmatrix: kernel params",false);
383 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
385 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
386 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
387 auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
388 ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389 ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
390 ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
391 }
392
393 //Now import any needed remote rows and populate the Aview struct.
395 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
396
397 // We will also need local access to all rows of B that correspond to the
398 // column-map of op(A).
399 if (numProcs > 1)
400 targetMap_B = Aprime->getColMap();
401
402 // Now import any needed remote rows and populate the Bview struct.
403 // Enable globalConstants by default
404 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
405 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
406 if(!params.is_null()) {
407 importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
408
409 auto slist = params->sublist("matrixmatrix: kernel params",false);
411 bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
412 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
413 auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
414 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
416 ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417 ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
418 ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
419 }
420
421 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
422
423#ifdef HAVE_TPETRA_MMM_TIMINGS
424 }
425 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
426#endif
427
428 // Now call the appropriate method to perform the actual multiplication.
430
431 // Is this a "clean" matrix
432 bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
433
435 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436
437 } else if (call_FillComplete_on_result) {
438 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
439
440 } else {
441 TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
442 // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
443 // commenting it out.
444// #ifdef HAVE_TPETRA_MMM_TIMINGS
445// MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
446// #endif
447 // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
448 // commenting it out.
449 // if (call_FillComplete_on_result) {
450 // //We'll call FillComplete on the C matrix before we exit, and give
451 // //it a domain-map and a range-map.
452 // //The domain-map will be the domain-map of B, unless
453 // //op(B)==transpose(B), in which case the range-map of B will be used.
454 // //The range-map will be the range-map of A, unless
455 // //op(A)==transpose(A), in which case the domain-map of A will be used.
456 // if (!C.isFillComplete()) {
457 // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
458 // }
459 // }
460 }
461}
462
463
464template <class Scalar,
465 class LocalOrdinal,
466 class GlobalOrdinal,
467 class Node>
468void Add(
470 bool transposeA,
474{
475 using Teuchos::Array;
476 using Teuchos::RCP;
477 using Teuchos::null;
478 typedef Scalar SC;
479 typedef LocalOrdinal LO;
480 typedef GlobalOrdinal GO;
481 typedef Node NO;
482 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
484
485 const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
486
487 TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
488 prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489 "(Result matrix B is not required to be isFillComplete()).");
490 TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
491 prefix << "ERROR, input matrix B must not be fill complete!");
492 TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
493 prefix << "ERROR, input matrix B must not have static graph!");
494 TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
495 prefix << "ERROR, input matrix B must not be locally indexed!");
496
497 using Teuchos::ParameterList;
499 transposeParams->set ("sort", false);
500
502 if (transposeA) {
503 transposer_type transposer (rcpFromRef (A));
504 Aprime = transposer.createTranspose (transposeParams);
505 }
506 else {
507 Aprime = rcpFromRef(A);
508 }
509
510 size_t a_numEntries;
511 typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getNodeMaxNumRowEntries());
512 typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getNodeMaxNumRowEntries());
513 GO row;
514
515 if (scalarB != Teuchos::ScalarTraits<SC>::one())
516 B.scale(scalarB);
517
518 bool bFilled = B.isFillComplete();
519 size_t numMyRows = B.getNodeNumRows();
520 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521 for (LO i = 0; (size_t)i < numMyRows; ++i) {
522 row = B.getRowMap()->getGlobalElement(i);
523 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
524
525 if (scalarA != Teuchos::ScalarTraits<SC>::one())
526 for (size_t j = 0; j < a_numEntries; ++j)
527 a_vals[j] *= scalarA;
528
529 if (bFilled)
530 B.sumIntoGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
531 else
532 B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
533 }
534 }
535}
536
537template <class Scalar,
538 class LocalOrdinal,
539 class GlobalOrdinal,
540 class Node>
541Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
543 const bool transposeA,
545 const Scalar& beta,
546 const bool transposeB,
548 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
549 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
550 const Teuchos::RCP<Teuchos::ParameterList>& params)
551{
552 using Teuchos::RCP;
553 using Teuchos::rcpFromRef;
554 using Teuchos::rcp;
555 using Teuchos::ParameterList;
557 if(!params.is_null())
558 {
560 params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
561 std::invalid_argument,
562 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564 params->set("Call fillComplete", true);
565 }
566 //If transposeB, must compute B's explicit transpose to
567 //get the correct row map for C.
568 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
569 if(transposeB)
570 {
572 Brcp = transposer.createTranspose();
573 }
574 //Check that A,B are fillComplete before getting B's column map
576 (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578 RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
579 //this version of add() always fill completes the result, no matter what is in params on input
580 add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
581 return C;
582}
583
584//This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
585//but since the spadd() output is always packed there is no need for a separate
586//numRowEntries here.
587//
588template<class LO, class GO, class LOView, class GOView, class LocalMap>
589struct ConvertGlobalToLocalFunctor
590{
591 ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
592 : lids(lids_), gids(gids_), localColMap(localColMap_)
593 {}
594
595 KOKKOS_FUNCTION void operator() (const GO i) const
596 {
597 lids(i) = localColMap.getLocalElement(gids(i));
598 }
599
600 LOView lids;
601 const GOView gids;
602 const LocalMap localColMap;
603};
604
605template <class Scalar,
606 class LocalOrdinal,
607 class GlobalOrdinal,
608 class Node>
609void
611 const bool transposeA,
613 const Scalar& beta,
614 const bool transposeB,
617 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
618 const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
619 const Teuchos::RCP<Teuchos::ParameterList>& params)
620{
621 using Teuchos::RCP;
622 using Teuchos::rcp;
623 using Teuchos::rcpFromRef;
624 using Teuchos::rcp_implicit_cast;
625 using Teuchos::rcp_dynamic_cast;
626 using Teuchos::TimeMonitor;
627 using SC = Scalar;
628 using LO = LocalOrdinal;
629 using GO = GlobalOrdinal;
630 using NO = Node;
631 using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
632 using crs_graph_type = CrsGraph<LO,GO,NO>;
633 using map_type = Map<LO,GO,NO>;
635 using import_type = Import<LO,GO,NO>;
636 using export_type = Export<LO,GO,NO>;
637 using exec_space = typename crs_graph_type::execution_space;
638 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639 const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
640 constexpr bool debug = false;
641
642#ifdef HAVE_TPETRA_MMM_TIMINGS
643 RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
644#endif
645
646 if (debug) {
647 std::ostringstream os;
648 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
649 << "TpetraExt::MatrixMatrix::add" << std::endl;
650 std::cerr << os.str ();
651 }
652
654 (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
655 prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
657 (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
658 prefix_mmm << "A and B must both be fill complete.");
659#ifdef HAVE_TPETRA_DEBUG
660 // The matrices don't have domain or range Maps unless they are fill complete.
661 if (A.isFillComplete () && B.isFillComplete ()) {
662 const bool domainMapsSame =
663 (! transposeA && ! transposeB &&
664 ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
665 (! transposeA && transposeB &&
666 ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
667 ( transposeA && ! transposeB &&
668 ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
669 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670 prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
671
672 const bool rangeMapsSame =
673 (! transposeA && ! transposeB &&
674 ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
675 (! transposeA && transposeB &&
676 ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
677 ( transposeA && ! transposeB &&
678 ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
679 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680 prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
681 }
682#endif // HAVE_TPETRA_DEBUG
683
684 using Teuchos::ParameterList;
685 // Form the explicit transpose of A if necessary.
686 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
687 if (transposeA) {
689 Aprime = transposer.createTranspose ();
690 }
691
692#ifdef HAVE_TPETRA_DEBUG
694 (Aprime.is_null (), std::logic_error,
695 prefix_mmm << "Failed to compute Op(A). "
696 "Please report this bug to the Tpetra developers.");
697#endif // HAVE_TPETRA_DEBUG
698
699 // Form the explicit transpose of B if necessary.
700 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
701 if (transposeB) {
702 if (debug) {
703 std::ostringstream os;
704 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
705 << "Form explicit xpose of B" << std::endl;
706 std::cerr << os.str ();
707 }
709 Bprime = transposer.createTranspose ();
710 }
711#ifdef HAVE_TPETRA_DEBUG
712 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713 prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
715 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716 prefix_mmm << "Aprime and Bprime must both be fill complete. "
717 "Please report this bug to the Tpetra developers.");
718#endif // HAVE_TPETRA_DEBUG
721 if(CDomainMap.is_null())
722 {
723 CDomainMap = Bprime->getDomainMap();
724 }
725 if(CRangeMap.is_null())
726 {
727 CRangeMap = Bprime->getRangeMap();
728 }
729 assert(!(CDomainMap.is_null()));
730 assert(!(CRangeMap.is_null()));
731 typedef typename AddKern::values_array values_array;
732 typedef typename AddKern::row_ptrs_array row_ptrs_array;
733 typedef typename AddKern::col_inds_array col_inds_array;
734 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
736 values_array vals;
737 row_ptrs_array rowptrs;
738 col_inds_array colinds;
739#ifdef HAVE_TPETRA_MMM_TIMINGS
740 MM = Teuchos::null;
741 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
742#endif
743 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
744 {
745 //import Aprime into Bprime's row map so the local matrices have same # of rows
746 auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
747 // cbl do not set
748 // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
749 // this import _may_ take the form of a transfer. In practice it would be unlikely,
750 // but the general case is not so forgiving.
751 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
752 }
753 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
755 RCP<const import_type> Cimport = Teuchos::null;
756 RCP<export_type> Cexport = Teuchos::null;
757 bool doFillComplete = true;
758 if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
759 {
760 doFillComplete = params->get<bool>("Call fillComplete");
761 }
762 auto Alocal = Aprime->getLocalMatrixDevice();
763 auto Blocal = Bprime->getLocalMatrixDevice();
764 LO numLocalRows = Alocal.numRows();
765 if(numLocalRows == 0)
766 {
767 //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
768 //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
769 //Handle this case now
770 //(without interfering with collective operations, since it's possible for
771 //some ranks to have 0 local rows and others not).
772 rowptrs = row_ptrs_array("C rowptrs", 0);
773 }
774 auto Acolmap = Aprime->getColMap();
775 auto Bcolmap = Bprime->getColMap();
776 if(!matchingColMaps)
777 {
778 using global_col_inds_array = typename AddKern::global_col_inds_array;
779#ifdef HAVE_TPETRA_MMM_TIMINGS
780 MM = Teuchos::null;
781 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
782#endif
783 //use kernel that converts col indices in both A and B to common domain map before adding
784 auto AlocalColmap = Acolmap->getLocalMap();
785 auto BlocalColmap = Bcolmap->getLocalMap();
786 global_col_inds_array globalColinds;
787 if (debug) {
788 std::ostringstream os;
789 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
790 << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791 std::cerr << os.str ();
792 }
793 AddKern::convertToGlobalAndAdd(
796 if (debug) {
797 std::ostringstream os;
798 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
799 << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800 std::cerr << os.str ();
801 }
802#ifdef HAVE_TPETRA_MMM_TIMINGS
803 MM = Teuchos::null;
804 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
805#endif
807 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
809 C.replaceColMap(CcolMap);
810 col_inds_array localColinds("C colinds", globalColinds.extent(0));
811 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813 col_inds_array, global_col_inds_array,
814 typename map_type::local_map_type>
815 (localColinds, globalColinds, CcolMap->getLocalMap()));
816 KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
817 C.setAllValues(rowptrs, localColinds, vals);
818 C.fillComplete(CDomainMap, CRangeMap, params);
819 if(!doFillComplete)
820 C.resumeFill();
821 }
822 else
823 {
824 //Aprime, Bprime and C all have the same column maps
825 auto Avals = Alocal.values;
826 auto Bvals = Blocal.values;
827 auto Arowptrs = Alocal.graph.row_map;
828 auto Browptrs = Blocal.graph.row_map;
829 auto Acolinds = Alocal.graph.entries;
830 auto Bcolinds = Blocal.graph.entries;
831 if(sorted)
832 {
833 //use sorted kernel
834#ifdef HAVE_TPETRA_MMM_TIMINGS
835 MM = Teuchos::null;
836 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
837#endif
838 if (debug) {
839 std::ostringstream os;
840 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
841 << "Call AddKern::addSorted(...)" << std::endl;
842 std::cerr << os.str ();
843 }
844 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
845 }
846 else
847 {
848 //use unsorted kernel
849#ifdef HAVE_TPETRA_MMM_TIMINGS
850 MM = Teuchos::null;
851 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
852#endif
853 if (debug) {
854 std::ostringstream os;
855 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
856 << "Call AddKern::addUnsorted(...)" << std::endl;
857 std::cerr << os.str ();
858 }
859 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
860 }
861 //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
863 C.replaceColMap(Ccolmap);
864 C.setAllValues(rowptrs, colinds, vals);
866 {
867 #ifdef HAVE_TPETRA_MMM_TIMINGS
868 MM = Teuchos::null;
869 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
870 #endif
871 if(!CDomainMap->isSameAs(*Ccolmap))
872 {
873 if (debug) {
874 std::ostringstream os;
875 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
876 << "Create Cimport" << std::endl;
877 std::cerr << os.str ();
878 }
879 Cimport = rcp(new import_type(CDomainMap, Ccolmap));
880 }
881 if(!C.getRowMap()->isSameAs(*CRangeMap))
882 {
883 if (debug) {
884 std::ostringstream os;
885 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
886 << "Create Cexport" << std::endl;
887 std::cerr << os.str ();
888 }
889 Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
890 }
891
892 if (debug) {
893 std::ostringstream os;
894 os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895 << "Call C->expertStaticFillComplete(...)" << std::endl;
896 std::cerr << os.str ();
897 }
898 C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
899 }
900 }
901}
902
903template <class Scalar,
904 class LocalOrdinal,
905 class GlobalOrdinal,
906 class Node>
907void Add(
909 bool transposeA,
912 bool transposeB,
915{
916 using Teuchos::Array;
917 using Teuchos::ArrayRCP;
918 using Teuchos::ArrayView;
919 using Teuchos::RCP;
920 using Teuchos::rcp;
921 using Teuchos::rcp_dynamic_cast;
922 using Teuchos::rcpFromRef;
923 using Teuchos::tuple;
924 using std::endl;
925 // typedef typename ArrayView<const Scalar>::size_type size_type;
926 typedef Teuchos::ScalarTraits<Scalar> STS;
928 // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
929 // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
930 // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
933
934 std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
935
936 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937 prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
938
940 ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
941 prefix << "Both input matrices must be fill complete before calling this function.");
942
943#ifdef HAVE_TPETRA_DEBUG
944 {
945 const bool domainMapsSame =
946 (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
947 (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
948 (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
949 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950 prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
951
952 const bool rangeMapsSame =
953 (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
954 (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
955 (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
956 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957 prefix << "The range Maps of Op(A) and Op(B) are not the same.");
958 }
959#endif // HAVE_TPETRA_DEBUG
960
961 using Teuchos::ParameterList;
963 transposeParams->set ("sort", false);
964
965 // Form the explicit transpose of A if necessary.
967 if (transposeA) {
968 transposer_type theTransposer (rcpFromRef (A));
969 Aprime = theTransposer.createTranspose (transposeParams);
970 }
971 else {
972 Aprime = rcpFromRef (A);
973 }
974
975#ifdef HAVE_TPETRA_DEBUG
976 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977 prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978#endif // HAVE_TPETRA_DEBUG
979
980 // Form the explicit transpose of B if necessary.
982 if (transposeB) {
983 transposer_type theTransposer (rcpFromRef (B));
984 Bprime = theTransposer.createTranspose (transposeParams);
985 }
986 else {
987 Bprime = rcpFromRef (B);
988 }
989
990#ifdef HAVE_TPETRA_DEBUG
991 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992 prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993#endif // HAVE_TPETRA_DEBUG
994
995 // Allocate or zero the entries of the result matrix.
996 if (! C.is_null ()) {
997 C->setAllToScalar (STS::zero ());
998 } else {
999 // FIXME (mfh 08 May 2013) When I first looked at this method, I
1000 // noticed that C was being given the row Map of Aprime (the
1001 // possibly transposed version of A). Is this what we want?
1002 C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1003 }
1004
1005#ifdef HAVE_TPETRA_DEBUG
1006 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007 prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009 prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011 prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1012#endif // HAVE_TPETRA_DEBUG
1013
1017
1018 // do a loop over each matrix to add: A reordering might be more efficient
1019 for (int k = 0; k < 2; ++k) {
1020 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1021 typename crs_matrix_type::nonconst_values_host_view_type Values;
1022
1023 // Loop over each locally owned row of the current matrix (either
1024 // Aprime or Bprime), and sum its entries into the corresponding
1025 // row of C. This works regardless of whether Aprime or Bprime
1026 // has the same row Map as C, because both sumIntoGlobalValues and
1027 // insertGlobalValues allow summing resp. inserting into nonowned
1028 // rows of C.
1029#ifdef HAVE_TPETRA_DEBUG
1030 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032#endif // HAVE_TPETRA_DEBUG
1033 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034#ifdef HAVE_TPETRA_DEBUG
1035 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036 prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037#endif // HAVE_TPETRA_DEBUG
1038
1039 const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040 for (size_t i = 0; i < localNumRows; ++i) {
1041 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043 if (numEntries > 0) {
1044 Kokkos::resize(Indices,numEntries);
1045 Kokkos::resize(Values,numEntries);
1046 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1047
1048 if (scalar[k] != STS::one ()) {
1049 for (size_t j = 0; j < numEntries; ++j) {
1050 Values[j] *= scalar[k];
1051 }
1052 }
1053
1054 if (C->isFillComplete ()) {
1055 C->sumIntoGlobalValues (globalRow, numEntries,
1056 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1057 } else {
1058 C->insertGlobalValues (globalRow, numEntries,
1059 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1060 }
1061 }
1062 }
1063 }
1064}
1065
1066} //End namespace MatrixMatrix
1067
1068namespace MMdetails{
1069
1070/*********************************************************************************************************/
1071// Prints MMM-style statistics on communication done with an Import or Export object
1072template <class TransferType>
1073void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1074 if (Transfer.is_null())
1075 return;
1076
1077 const Distributor & Distor = Transfer->getDistributor();
1078 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1079
1080 size_t rows_send = Transfer->getNumExportIDs();
1081 size_t rows_recv = Transfer->getNumRemoteIDs();
1082
1083 size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1084 size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1085 size_t num_send_neighbors = Distor.getNumSends();
1086 size_t num_recv_neighbors = Distor.getNumReceives();
1087 size_t round2_send, round2_recv;
1088 Distor.getLastDoStatistics(round2_send,round2_recv);
1089
1090 int myPID = Comm->getRank();
1091 int NumProcs = Comm->getSize();
1092
1093 // Processor by processor statistics
1094 // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1095 // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1096
1097 // Global statistics
1098 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1099 size_t gstats_min[8], gstats_max[8];
1100
1101 double lstats_avg[8], gstats_avg[8];
1102 for(int i=0; i<8; i++)
1103 lstats_avg[i] = ((double)lstats[i])/NumProcs;
1104
1105 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1106 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1107 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1108
1109 if(!myPID) {
1110 printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1111 (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1112 (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1113 printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1114 (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1115 (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1116 }
1117}
1118
// Kernel method for computing the local portion of C = A^T*B
1120template<class Scalar,
1121 class LocalOrdinal,
1122 class GlobalOrdinal,
1123 class Node>
1124void mult_AT_B_newmatrix(
1125 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1126 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1127 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1128 const std::string & label,
1129 const Teuchos::RCP<Teuchos::ParameterList>& params)
1130{
1131 using Teuchos::RCP;
1132 using Teuchos::rcp;
1133 typedef Scalar SC;
1134 typedef LocalOrdinal LO;
1135 typedef GlobalOrdinal GO;
1136 typedef Node NO;
1137 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1138 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1139
1140#ifdef HAVE_TPETRA_MMM_TIMINGS
1141 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1142 using Teuchos::TimeMonitor;
1143 RCP<TimeMonitor> MM = rcp (new TimeMonitor
1144 (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1145#endif
1146
1147 /*************************************************************/
1148 /* 1) Local Transpose of A */
1149 /*************************************************************/
1150 transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1151
1152 using Teuchos::ParameterList;
1153 RCP<ParameterList> transposeParams (new ParameterList);
1154 transposeParams->set ("sort", false);
1155 if(! params.is_null ()) {
1156 transposeParams->set ("compute global constants",
1157 params->get ("compute global constants: temporaries",
1158 false));
1159 }
1160 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1161 transposer.createTransposeLocal (transposeParams);
1162
1163 /*************************************************************/
1164 /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1165 /*************************************************************/
1166#ifdef HAVE_TPETRA_MMM_TIMINGS
1167 MM = Teuchos::null;
1168 MM = rcp (new TimeMonitor
1169 (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1170#endif
1171
1172 // Get views, asserting that no import is required to speed up computation
1173 crs_matrix_struct_type Aview;
1174 crs_matrix_struct_type Bview;
1175 RCP<const Import<LO, GO, NO> > dummyImporter;
1176
1177 // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1178 RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1179 if (! params.is_null ()) {
1180 importParams1->set ("compute global constants",
1181 params->get ("compute global constants: temporaries",
1182 false));
1183 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1184 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1185 bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1186 int mm_optimization_core_count =
1188 mm_optimization_core_count =
1189 slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190 int mm_optimization_core_count2 =
1191 params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1192 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1193 mm_optimization_core_count = mm_optimization_core_count2;
1194 }
1195 auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1196 sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1197 sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1198 sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1199 }
1200
1201 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1202 Aview, dummyImporter, true,
1203 label, importParams1);
1204
1205 RCP<ParameterList> importParams2 (new ParameterList);
1206 if (! params.is_null ()) {
1207 importParams2->set ("compute global constants",
1208 params->get ("compute global constants: temporaries",
1209 false));
1210 auto slist = params->sublist ("matrixmatrix: kernel params", false);
1211 bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1212 bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1213 int mm_optimization_core_count =
1215 mm_optimization_core_count =
1216 slist.get ("MM_TAFC_OptimizationCoreCount",
1217 mm_optimization_core_count);
1218 int mm_optimization_core_count2 =
1219 params->get ("MM_TAFC_OptimizationCoreCount",
1220 mm_optimization_core_count);
1221 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1222 mm_optimization_core_count = mm_optimization_core_count2;
1223 }
1224 auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1225 sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1226 sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1227 sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1228 }
1229
1230 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1231 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1232 }
1233 else {
1234 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1235 }
1236
1237#ifdef HAVE_TPETRA_MMM_TIMINGS
1238 MM = Teuchos::null;
1239 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1240#endif
1241
1242 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1243
1244 // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1245 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1246 if (needs_final_export) {
1247 Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1248 }
1249 else {
1250 Ctemp = rcp (&C, false);
1251 }
1252
1253 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1254
1255 /*************************************************************/
1256 /* 4) exportAndFillComplete matrix */
1257 /*************************************************************/
1258#ifdef HAVE_TPETRA_MMM_TIMINGS
1259 MM = Teuchos::null;
1260 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1261#endif
1262
1263 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1264
1265 if (needs_final_export) {
1266 ParameterList labelList;
1267 labelList.set("Timer Label", label);
1268 if(!params.is_null()) {
1269 ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1270 ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1271 int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1272 mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1273 int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1274 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1275 labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1276 bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1277 bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1278
1279 labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1280 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1281 labelList.set("compute global constants",params->get("compute global constants",true));
1282 labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1283 }
1284 Ctemp->exportAndFillComplete (Crcp,
1285 *Ctemp->getGraph ()->getExporter (),
1286 B.getDomainMap (),
1287 A.getDomainMap (),
1288 rcp (&labelList, false));
1289 }
1290#ifdef HAVE_TPETRA_MMM_STATISTICS
1291 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1292#endif
1293}
1294
1295/*********************************************************************************************************/
1296// Kernel method for computing the local portion of C = A*B
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void mult_A_B(
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
  const std::string& /* label */,
  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
{
  // Serial kernel for the local portion of C = A*B, operating on already
  // extracted views of A and B.  Rows of B may live in two places: the
  // locally owned matrix (Bview.origMatrix) and the imported remote rows
  // (Bview.importMatrix); each row of C is accumulated in two passes, one
  // per source, then pushed into C via sum-into or insert.
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::OrdinalTraits;
  using Teuchos::null;

  typedef Teuchos::ScalarTraits<Scalar> STS;
  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();

  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();

  // Global column IDs of B's local column map, and (if B has remote rows)
  // of the imported rows' column map.
  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
  ArrayView<const GlobalOrdinal> bcols_import = null;
  if (Bview.importColMap != null) {
    C_firstCol_import = Bview.importColMap->getMinLocalIndex();
    C_lastCol_import = Bview.importColMap->getMaxLocalIndex();

    bcols_import = Bview.importColMap->getNodeElementList();
  }

  // Scratch sizing: enough room for the wider of the two column ranges.
  size_t C_numCols = C_lastCol - C_firstCol +
                     OrdinalTraits<LocalOrdinal>::one();
  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
                            OrdinalTraits<LocalOrdinal>::one();

  if (C_numCols_import > C_numCols)
    C_numCols = C_numCols_import;

  // Workspaces reused for every row of C:
  //   C_row_i   - accumulated values of the current row
  //   C_cols    - (local, then global) column indices of those values
  //   c_index   - local column -> position in C_row_i, or invalid() if unset
  //   combined_* - the merged (local + imported) row handed to C
  Array<Scalar> dwork = Array<Scalar>(C_numCols);
  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
  Array<size_t> iwork2 = Array<size_t>(C_numCols);

  Array<Scalar> C_row_i = dwork;
  Array<GlobalOrdinal> C_cols = iwork;
  Array<size_t> c_index = iwork2;
  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);

  size_t C_row_i_length, j, k, last_index;

  // Run through all the hash table lookups once and for all:
  // Acol2Brow maps a local column of A to the local row of B that it
  // multiplies; Acol2Irow maps to a row of the imported matrix instead when
  // the row is not locally owned.
  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
    // Maps are the same: Use local IDs as the hash
    for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
          Aview.colMap->getMaxLocalIndex(); i++)
      Acol2Brow[i]=i;
  }
  else {
    // Maps are not the same: Use the map's hash
    for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
          Aview.colMap->getMaxLocalIndex(); i++) {
      GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
      LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
      if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
      else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
    }
  }

  // To form C = A*B we're going to execute this expression:
  //
  //  C(i,j) = sum_k( A(i,k)*B(k,j) )
  //
  // Our goal, of course, is to navigate the data in A and B once, without
  // performing searches for column-indices, etc.
  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
  ArrayView<const Scalar> Avals, Bvals, Ivals;
  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
  if(!Bview.importMatrix.is_null()) {
    Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
    Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
  }

  // fillComplete C gets sum-into; otherwise insert.
  bool C_filled = C.isFillComplete();

  // c_index must start all-invalid; the loops below restore this invariant
  // after every row so the workspace can be reused without clearing.
  for (size_t i = 0; i < C_numCols; i++)
    c_index[i] = OrdinalTraits<size_t>::invalid();

  // Loop over the rows of A.
  size_t Arows = Aview.rowMap->getNodeNumElements();
  for(size_t i=0; i<Arows; ++i) {

    // Only navigate the local portion of Aview... which is, thankfully, all of
    // A since this routine doesn't do transpose modes
    GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);

    // Loop across the i-th row of A and for each corresponding row in B, loop
    // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
    // quantities C_row_i. In other words, as we stride across B(k,:) we're
    // calculating updates for row i of the result matrix C.
    C_row_i_length = OrdinalTraits<size_t>::zero();

    // Pass 1: contributions from locally owned rows of B.
    for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
      LocalOrdinal Ak = Acol2Brow[Acolind[k]];
      const Scalar Aval = Avals[k];
      if (Aval == STS::zero())
        continue;

      // LO_INVALID here means this column's B row is imported; pass 2 handles it.
      if (Ak == LO_INVALID)
        continue;

      for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
        LocalOrdinal col = Bcolind[j];
        //assert(col >= 0 && col < C_numCols);

        if (c_index[col] == OrdinalTraits<size_t>::invalid()){
          //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
          // This has to be a += so insertGlobalValue goes out
          C_row_i[C_row_i_length] = Aval*Bvals[j];
          C_cols[C_row_i_length] = col;
          c_index[col] = C_row_i_length;
          C_row_i_length++;

        } else {
          C_row_i[c_index[col]] += Aval*Bvals[j];
        }
      }
    }

    // Convert pass-1 local columns to global IDs and stage them in the
    // combined arrays; reset c_index entries as we go.
    for (size_t ii = 0; ii < C_row_i_length; ii++) {
      c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
      C_cols[ii] = bcols[C_cols[ii]];
      combined_index[ii] = C_cols[ii];
      combined_values[ii] = C_row_i[ii];
    }
    last_index = C_row_i_length;

    //
    //Now put the C_row_i values into C.
    //
    // We might have to revamp this later.
    C_row_i_length = OrdinalTraits<size_t>::zero();

    // Pass 2: contributions from imported (remote) rows of B.
    for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
      LocalOrdinal Ak = Acol2Brow[Acolind[k]];
      const Scalar Aval = Avals[k];
      if (Aval == STS::zero())
        continue;

      // Skip columns already handled in pass 1 (those had a valid B row).
      if (Ak!=LO_INVALID) continue;

      Ak = Acol2Irow[Acolind[k]];
      for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
        LocalOrdinal col = Icolind[j];
        //assert(col >= 0 && col < C_numCols);

        if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
          //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
          // This has to be a += so insertGlobalValue goes out
          C_row_i[C_row_i_length] = Aval*Ivals[j];
          C_cols[C_row_i_length] = col;
          c_index[col] = C_row_i_length;
          C_row_i_length++;

        } else {
          // This has to be a += so insertGlobalValue goes out
          C_row_i[c_index[col]] += Aval*Ivals[j];
        }
      }
    }

    // Append pass-2 entries (mapped through the import column map's global
    // IDs) after the pass-1 entries; again restore the c_index invariant.
    for (size_t ii = 0; ii < C_row_i_length; ii++) {
      c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
      C_cols[ii] = bcols_import[C_cols[ii]];
      combined_index[last_index] = C_cols[ii];
      combined_values[last_index] = C_row_i[ii];
      last_index++;
    }

    // Now put the C_row_i values into C.
    // We might have to revamp this later.
    C_filled ?
      C.sumIntoGlobalValues(
                            global_row,
                            combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
                            combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
      :
      C.insertGlobalValues(
                           global_row,
                           combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
                           combined_values.view(OrdinalTraits<size_t>::zero(), last_index));

  }
}
1504
1505/*********************************************************************************************************/
1506template<class Scalar,
1507 class LocalOrdinal,
1508 class GlobalOrdinal,
1509 class Node>
1510void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1511 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1512 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1513
1514 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1515 Mview.maxNumRowEntries = Mview.indices[0].size();
1516
1517 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1518 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1519 Mview.maxNumRowEntries = Mview.indices[i].size();
1520 }
1521}
1522
1523/*********************************************************************************************************/
template<class CrsMatrixType>
size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
  // Heuristic estimate of nnz(C) for C = A*B; follows the nonzero estimate
  // in ML's ml_matmatmult.c.
  // Average entries per local row, falling back to 100 when the matrix has
  // no rows or fewer entries than rows.
  auto avgEntriesPerRow = [](CrsMatrixType& M) -> size_t {
    const size_t numRows = M.getNodeNumRows();
    const size_t numEntries = M.getNodeNumEntries();
    if (numEntries >= numRows && numRows > 0)
      return numEntries / numRows;
    return 100;
  };
  const size_t Aest = avgEntriesPerRow(A);
  const size_t Best = avgEntriesPerRow(B);

  // Estimated entries per row of C, then scaled by the local row count with
  // a 0.75 fudge factor plus slack.
  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
  nnzperrow *= nnzperrow;

  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
}
1538
1539
1540/*********************************************************************************************************/
1541// Kernel method for computing the local portion of C = A*B
1542//
1543// mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1544// function, so this is probably the function we want to
1545// thread-parallelize.
1546template<class Scalar,
1547 class LocalOrdinal,
1548 class GlobalOrdinal,
1549 class Node>
1550void mult_A_B_newmatrix(
1551 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1552 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1553 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1554 const std::string& label,
1555 const Teuchos::RCP<Teuchos::ParameterList>& params)
1556{
1557 using Teuchos::Array;
1558 using Teuchos::ArrayRCP;
1559 using Teuchos::ArrayView;
1560 using Teuchos::RCP;
1561 using Teuchos::rcp;
1562
1563 // Tpetra typedefs
1564 typedef LocalOrdinal LO;
1565 typedef GlobalOrdinal GO;
1566 typedef Node NO;
1567 typedef Import<LO,GO,NO> import_type;
1568 typedef Map<LO,GO,NO> map_type;
1569
1570 // Kokkos typedefs
1571 typedef typename map_type::local_map_type local_map_type;
1573 typedef typename KCRS::StaticCrsGraphType graph_t;
1574 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1575 typedef typename NO::execution_space execution_space;
1576 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1577 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1578
1579#ifdef HAVE_TPETRA_MMM_TIMINGS
1580 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1581 using Teuchos::TimeMonitor;
1582 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1583#endif
1584 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1585
1586 // Build the final importer / column map, hash table lookups for C
1587 RCP<const import_type> Cimport;
1588 RCP<const map_type> Ccolmap;
1589 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1590 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1591 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1592 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1593 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1594 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1595 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1596 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1597
1598 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1599 // indices of B, to local column indices of C. (B and C have the
1600 // same number of columns.) The kernel uses this, instead of
1601 // copying the entire input matrix B and converting its column
1602 // indices to those of C.
1603 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1604
1605 if (Bview.importMatrix.is_null()) {
1606 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1607 Cimport = Bimport;
1608 Ccolmap = Bview.colMap;
1609 const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1610 // Bcol2Ccol is trivial
1611 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1612 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1613 KOKKOS_LAMBDA(const LO i) {
1614 Bcol2Ccol(i) = i;
1615 });
1616 }
1617 else {
1618 // mfh 27 Sep 2016: B has "remotes," so we need to build the
1619 // column Map of C, as well as C's Import object (from its domain
1620 // Map to its column Map). C's column Map is the union of the
1621 // column Maps of (the local part of) B, and the "remote" part of
1622 // B. Ditto for the Import. We have optimized this "setUnion"
1623 // operation on Import objects and Maps.
1624
1625 // Choose the right variant of setUnion
1626 if (!Bimport.is_null() && !Iimport.is_null()) {
1627 Cimport = Bimport->setUnion(*Iimport);
1628 }
1629 else if (!Bimport.is_null() && Iimport.is_null()) {
1630 Cimport = Bimport->setUnion();
1631 }
1632 else if (Bimport.is_null() && !Iimport.is_null()) {
1633 Cimport = Iimport->setUnion();
1634 }
1635 else {
1636 throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1637 }
1638 Ccolmap = Cimport->getTargetMap();
1639
1640 // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1641 // in general. We should get rid of it in order to reduce
1642 // communication costs of sparse matrix-matrix multiply.
1643 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1644 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1645
1646 // NOTE: This is not efficient and should be folded into setUnion
1647 //
1648 // mfh 27 Sep 2016: What the above comment means, is that the
1649 // setUnion operation on Import objects could also compute these
1650 // local index - to - local index look-up tables.
1651 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1652 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1653 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1654 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1655 });
1656 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1657 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1658 });
1659
1660 }
1661
1662 // Replace the column map
1663 //
1664 // mfh 27 Sep 2016: We do this because C was originally created
1665 // without a column Map. Now we have its column Map.
1666 C.replaceColMap(Ccolmap);
1667
1668 // mfh 27 Sep 2016: Construct tables that map from local column
1669 // indices of A, to local row indices of either B_local (the locally
1670 // owned part of B), or B_remote (the "imported" remote part of B).
1671 //
1672 // For column index Aik in row i of A, if the corresponding row of B
1673 // exists in the local part of B ("orig") (which I'll call B_local),
1674 // then targetMapToOrigRow[Aik] is the local index of that row of B.
1675 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1676 //
1677 // For column index Aik in row i of A, if the corresponding row of B
1678 // exists in the remote part of B ("Import") (which I'll call
1679 // B_remote), then targetMapToImportRow[Aik] is the local index of
1680 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1681 // (a flag value).
1682
1683 // Run through all the hash table lookups once and for all
1684 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1685 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1686
1687 Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1688 GO aidx = Acolmap_local.getGlobalElement(i);
1689 LO B_LID = Browmap_local.getLocalElement(aidx);
1690 if (B_LID != LO_INVALID) {
1691 targetMapToOrigRow(i) = B_LID;
1692 targetMapToImportRow(i) = LO_INVALID;
1693 } else {
1694 LO I_LID = Irowmap_local.getLocalElement(aidx);
1695 targetMapToOrigRow(i) = LO_INVALID;
1696 targetMapToImportRow(i) = I_LID;
1697
1698 }
1699 });
1700
1701 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1702 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1703 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1704
1705}
1706
1707
1708/*********************************************************************************************************/
1709// AB NewMatrix Kernel wrappers (Default non-threaded version)
1710template<class Scalar,
1711 class LocalOrdinal,
1712 class GlobalOrdinal,
1713 class Node,
1714 class LocalOrdinalViewType>
1715void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1716 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1717 const LocalOrdinalViewType & targetMapToOrigRow,
1718 const LocalOrdinalViewType & targetMapToImportRow,
1719 const LocalOrdinalViewType & Bcol2Ccol,
1720 const LocalOrdinalViewType & Icol2Ccol,
1721 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1722 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1723 const std::string& label,
1724 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1725#ifdef HAVE_TPETRA_MMM_TIMINGS
1726 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1727 using Teuchos::TimeMonitor;
1728 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1729#endif
1730
1731 using Teuchos::Array;
1732 using Teuchos::ArrayRCP;
1733 using Teuchos::ArrayView;
1734 using Teuchos::RCP;
1735 using Teuchos::rcp;
1736
1737 // Lots and lots of typedefs
1738 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1739 typedef typename KCRS::StaticCrsGraphType graph_t;
1740 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1741 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1742 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1743 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1744
1745 typedef Scalar SC;
1746 typedef LocalOrdinal LO;
1747 typedef GlobalOrdinal GO;
1748 typedef Node NO;
1749 typedef Map<LO,GO,NO> map_type;
1750 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1751 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1752 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1753
1754 // Sizes
1755 RCP<const map_type> Ccolmap = C.getColMap();
1756 size_t m = Aview.origMatrix->getNodeNumRows();
1757 size_t n = Ccolmap->getNodeNumElements();
1758 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1759
1760 // Grab the Kokkos::SparseCrsMatrices & inner stuff
1761 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1762 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
1763
1764 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1765 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1766 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1767
1768 c_lno_view_t Irowptr;
1769 lno_nnz_view_t Icolind;
1770 scalar_view_t Ivals;
1771 if(!Bview.importMatrix.is_null()) {
1772 auto lclB = Bview.importMatrix->getLocalMatrixHost();
1773 Irowptr = lclB.graph.row_map;
1774 Icolind = lclB.graph.entries;
1775 Ivals = lclB.values;
1776 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1777 }
1778
1779#ifdef HAVE_TPETRA_MMM_TIMINGS
1780 RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
1781#endif
1782
1783 // Classic csr assembly (low memory edition)
1784 //
1785 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1786 // The method loops over rows of A, and may resize after processing
1787 // each row. Chris Siefert says that this reflects experience in
1788 // ML; for the non-threaded case, ML found it faster to spend less
1789 // effort on estimation and risk an occasional reallocation.
1790 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1791 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
1792 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
1793 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
1794
1795 // mfh 27 Sep 2016: The c_status array is an implementation detail
1796 // of the local sparse matrix-matrix multiply routine.
1797
1798 // The status array will contain the index into colind where this entry was last deposited.
1799 // c_status[i] < CSR_ip - not in the row yet
1800 // c_status[i] >= CSR_ip - this is the entry where you can find the data
1801 // We start with this filled with INVALID's indicating that there are no entries yet.
1802 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1803 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1804 std::vector<size_t> c_status(n, ST_INVALID);
1805
1806 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1807 // routine. The routine computes C := A * (B_local + B_remote).
1808 //
1809 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1810 // you whether the corresponding row of B belongs to B_local
1811 // ("orig") or B_remote ("Import").
1812
1813 // For each row of A/C
1814 size_t CSR_ip = 0, OLD_ip = 0;
1815 for (size_t i = 0; i < m; i++) {
1816 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1817 // on the calling process.
1818 Crowptr[i] = CSR_ip;
1819
1820 // mfh 27 Sep 2016: For each entry of A in the current row of A
1821 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1822 LO Aik = Acolind[k]; // local column index of current entry of A
1823 const SC Aval = Avals[k]; // value of current entry of A
1824 if (Aval == SC_ZERO)
1825 continue; // skip explicitly stored zero values in A
1826
1827 if (targetMapToOrigRow[Aik] != LO_INVALID) {
1828 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1829 // corresponding to the current entry of A is populated, then
1830 // the corresponding row of B is in B_local (i.e., it lives on
1831 // the calling process).
1832
1833 // Local matrix
1834 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
1835
1836 // mfh 27 Sep 2016: Go through all entries in that row of B_local.
1837 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1838 LO Bkj = Bcolind[j];
1839 LO Cij = Bcol2Ccol[Bkj];
1840
1841 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1842 // New entry
1843 c_status[Cij] = CSR_ip;
1844 Ccolind[CSR_ip] = Cij;
1845 Cvals[CSR_ip] = Aval*Bvals[j];
1846 CSR_ip++;
1847
1848 } else {
1849 Cvals[c_status[Cij]] += Aval*Bvals[j];
1850 }
1851 }
1852
1853 } else {
1854 // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1855 // corresponding to the current entry of A NOT populated (has
1856 // a flag "invalid" value), then the corresponding row of B is
1857 // in B_local (i.e., it lives on the calling process).
1858
1859 // Remote matrix
1860 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
1861 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1862 LO Ikj = Icolind[j];
1863 LO Cij = Icol2Ccol[Ikj];
1864
1865 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1866 // New entry
1867 c_status[Cij] = CSR_ip;
1868 Ccolind[CSR_ip] = Cij;
1869 Cvals[CSR_ip] = Aval*Ivals[j];
1870 CSR_ip++;
1871 } else {
1872 Cvals[c_status[Cij]] += Aval*Ivals[j];
1873 }
1874 }
1875 }
1876 }
1877
1878 // Resize for next pass if needed
1879 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1880 CSR_alloc *= 2;
1881 Kokkos::resize(Ccolind,CSR_alloc);
1882 Kokkos::resize(Cvals,CSR_alloc);
1883 }
1884 OLD_ip = CSR_ip;
1885 }
1886
1887 Crowptr[m] = CSR_ip;
1888
1889 // Downward resize
1890 Kokkos::resize(Ccolind,CSR_ip);
1891 Kokkos::resize(Cvals,CSR_ip);
1892
1893#ifdef HAVE_TPETRA_MMM_TIMINGS
1894 {
1895 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
1896#endif
1897
1898 // Final sort & set of CRS arrays
1899 if (params.is_null() || params->get("sort entries",true))
1900 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1901 C.setAllValues(Crowptr,Ccolind, Cvals);
1902
1903
1904#ifdef HAVE_TPETRA_MMM_TIMINGS
1905 }
1906 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
1907 {
1908#endif
1909
1910 // Final FillComplete
1911 //
1912 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1913 // Import (from domain Map to column Map) construction (which costs
1914 // lots of communication) by taking the previously constructed
1915 // Import object. We should be able to do this without interfering
1916 // with the implementation of the local part of sparse matrix-matrix
1917 // multply above.
1918 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1919 labelList->set("Timer Label",label);
1920 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1921 RCP<const Export<LO,GO,NO> > dummyExport;
1922 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1923#ifdef HAVE_TPETRA_MMM_TIMINGS
1924 }
1925 MM2 = Teuchos::null;
1926#endif
1927
1928}
1929
1930
1931
1932
1933/*********************************************************************************************************/
1934// Kernel method for computing the local portion of C = A*B (reuse)
1935template<class Scalar,
1936 class LocalOrdinal,
1937 class GlobalOrdinal,
1938 class Node>
1939void mult_A_B_reuse(
1940 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1941 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1942 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1943 const std::string& label,
1944 const Teuchos::RCP<Teuchos::ParameterList>& params)
1945{
1946 using Teuchos::Array;
1947 using Teuchos::ArrayRCP;
1948 using Teuchos::ArrayView;
1949 using Teuchos::RCP;
1950 using Teuchos::rcp;
1951
1952 // Tpetra typedefs
1953 typedef LocalOrdinal LO;
1954 typedef GlobalOrdinal GO;
1955 typedef Node NO;
1956 typedef Import<LO,GO,NO> import_type;
1957 typedef Map<LO,GO,NO> map_type;
1958
1959 // Kokkos typedefs
1960 typedef typename map_type::local_map_type local_map_type;
1962 typedef typename KCRS::StaticCrsGraphType graph_t;
1963 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1964 typedef typename NO::execution_space execution_space;
1965 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1966 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1967
1968#ifdef HAVE_TPETRA_MMM_TIMINGS
1969 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1970 using Teuchos::TimeMonitor;
1971 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1972#endif
1973 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1974
1975 // Grab all the maps
1976 RCP<const import_type> Cimport = C.getGraph()->getImporter();
1977 RCP<const map_type> Ccolmap = C.getColMap();
1978 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1979 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1980 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1981 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1982 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1983 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1984
1985 // Build the final importer / column map, hash table lookups for C
1986 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1987 {
1988 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1989 // So, column map of C may be a strict subset of the column map of B
1990 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1991 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1992 });
1993
1994 if (!Bview.importMatrix.is_null()) {
1995 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1996 std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1997
1998 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1999 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2000 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2001 });
2002 }
2003 }
2004
2005 // Run through all the hash table lookups once and for all
2006 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2007 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2008 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2009 GO aidx = Acolmap_local.getGlobalElement(i);
2010 LO B_LID = Browmap_local.getLocalElement(aidx);
2011 if (B_LID != LO_INVALID) {
2012 targetMapToOrigRow(i) = B_LID;
2013 targetMapToImportRow(i) = LO_INVALID;
2014 } else {
2015 LO I_LID = Irowmap_local.getLocalElement(aidx);
2016 targetMapToOrigRow(i) = LO_INVALID;
2017 targetMapToImportRow(i) = I_LID;
2018
2019 }
2020 });
2021
2022 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2023 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2024 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2025}
2026
2027/*********************************************************************************************************/
2028template<class Scalar,
2029 class LocalOrdinal,
2030 class GlobalOrdinal,
2031 class Node,
2032 class LocalOrdinalViewType>
2033void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2034 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2035 const LocalOrdinalViewType & targetMapToOrigRow,
2036 const LocalOrdinalViewType & targetMapToImportRow,
2037 const LocalOrdinalViewType & Bcol2Ccol,
2038 const LocalOrdinalViewType & Icol2Ccol,
2039 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2040 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2041 const std::string& label,
2042 const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2043#ifdef HAVE_TPETRA_MMM_TIMINGS
2044 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2045 using Teuchos::TimeMonitor;
2046 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2047 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2048#else
2049 (void)label;
2050#endif
2051 using Teuchos::RCP;
2052 using Teuchos::rcp;
2053
2054
2055 // Lots and lots of typedefs
2056 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2057 typedef typename KCRS::StaticCrsGraphType graph_t;
2058 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2059 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2060 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2061
2062 typedef Scalar SC;
2063 typedef LocalOrdinal LO;
2064 typedef GlobalOrdinal GO;
2065 typedef Node NO;
2066 typedef Map<LO,GO,NO> map_type;
2067 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2068 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2069 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2070
2071 // Sizes
2072 RCP<const map_type> Ccolmap = C.getColMap();
2073 size_t m = Aview.origMatrix->getNodeNumRows();
2074 size_t n = Ccolmap->getNodeNumElements();
2075
2076 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2077 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2078 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2079 const KCRS & Cmat = C.getLocalMatrixHost();
2080
2081 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2082 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2083 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2084 scalar_view_t Cvals = Cmat.values;
2085
2086 c_lno_view_t Irowptr;
2087 lno_nnz_view_t Icolind;
2088 scalar_view_t Ivals;
2089 if(!Bview.importMatrix.is_null()) {
2090 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2091 Irowptr = lclB.graph.row_map;
2092 Icolind = lclB.graph.entries;
2093 Ivals = lclB.values;
2094 }
2095
2096#ifdef HAVE_TPETRA_MMM_TIMINGS
2097 MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2098#endif
2099
2100 // Classic csr assembly (low memory edition)
2101 // mfh 27 Sep 2016: The c_status array is an implementation detail
2102 // of the local sparse matrix-matrix multiply routine.
2103
2104 // The status array will contain the index into colind where this entry was last deposited.
2105 // c_status[i] < CSR_ip - not in the row yet
2106 // c_status[i] >= CSR_ip - this is the entry where you can find the data
2107 // We start with this filled with INVALID's indicating that there are no entries yet.
2108 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2109 std::vector<size_t> c_status(n, ST_INVALID);
2110
2111 // For each row of A/C
2112 size_t CSR_ip = 0, OLD_ip = 0;
2113 for (size_t i = 0; i < m; i++) {
2114 // First fill the c_status array w/ locations where we're allowed to
2115 // generate nonzeros for this row
2116 OLD_ip = Crowptr[i];
2117 CSR_ip = Crowptr[i+1];
2118 for (size_t k = OLD_ip; k < CSR_ip; k++) {
2119 c_status[Ccolind[k]] = k;
2120
2121 // Reset values in the row of C
2122 Cvals[k] = SC_ZERO;
2123 }
2124
2125 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2126 LO Aik = Acolind[k];
2127 const SC Aval = Avals[k];
2128 if (Aval == SC_ZERO)
2129 continue;
2130
2131 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2132 // Local matrix
2133 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2134
2135 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2136 LO Bkj = Bcolind[j];
2137 LO Cij = Bcol2Ccol[Bkj];
2138
2139 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2140 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2141 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2142
2143 Cvals[c_status[Cij]] += Aval * Bvals[j];
2144 }
2145
2146 } else {
2147 // Remote matrix
2148 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2149 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2150 LO Ikj = Icolind[j];
2151 LO Cij = Icol2Ccol[Ikj];
2152
2153 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2154 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2155 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2156
2157 Cvals[c_status[Cij]] += Aval * Ivals[j];
2158 }
2159 }
2160 }
2161 }
2162
2163#ifdef HAVE_TPETRA_MMM_TIMINGS
2164 auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2165#endif
2166
2167 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2168}
2169
2170
2171/*********************************************************************************************************/
2172// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2173template<class Scalar,
2174 class LocalOrdinal,
2175 class GlobalOrdinal,
2176 class Node>
2177void jacobi_A_B_newmatrix(
2178 Scalar omega,
2179 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2180 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2181 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2182 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2183 const std::string& label,
2184 const Teuchos::RCP<Teuchos::ParameterList>& params)
2185{
2186 using Teuchos::Array;
2187 using Teuchos::ArrayRCP;
2188 using Teuchos::ArrayView;
2189 using Teuchos::RCP;
2190 using Teuchos::rcp;
2191 // typedef Scalar SC;
2192 typedef LocalOrdinal LO;
2193 typedef GlobalOrdinal GO;
2194 typedef Node NO;
2195
2196 typedef Import<LO,GO,NO> import_type;
2197 typedef Map<LO,GO,NO> map_type;
2198 typedef typename map_type::local_map_type local_map_type;
2199
2200 // All of the Kokkos typedefs
2202 typedef typename KCRS::StaticCrsGraphType graph_t;
2203 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2204 typedef typename NO::execution_space execution_space;
2205 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2206 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2207
2208
2209#ifdef HAVE_TPETRA_MMM_TIMINGS
2210 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2211 using Teuchos::TimeMonitor;
2212 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2213#endif
2214 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2215
2216 // Build the final importer / column map, hash table lookups for C
2217 RCP<const import_type> Cimport;
2218 RCP<const map_type> Ccolmap;
2219 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2220 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2221 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2222 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2223 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2224 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2225 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2226 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2227
2228 // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2229 // indices of B, to local column indices of C. (B and C have the
2230 // same number of columns.) The kernel uses this, instead of
2231 // copying the entire input matrix B and converting its column
2232 // indices to those of C.
2233 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2234
2235 if (Bview.importMatrix.is_null()) {
2236 // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2237 Cimport = Bimport;
2238 Ccolmap = Bview.colMap;
2239 // Bcol2Ccol is trivial
2240 // Bcol2Ccol is trivial
2241
2242 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2243 Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2244 Bcol2Ccol(i) = static_cast<LO> (i);
2245 });
2246 } else {
2247 // mfh 27 Sep 2016: B has "remotes," so we need to build the
2248 // column Map of C, as well as C's Import object (from its domain
2249 // Map to its column Map). C's column Map is the union of the
2250 // column Maps of (the local part of) B, and the "remote" part of
2251 // B. Ditto for the Import. We have optimized this "setUnion"
2252 // operation on Import objects and Maps.
2253
2254 // Choose the right variant of setUnion
2255 if (!Bimport.is_null() && !Iimport.is_null()){
2256 Cimport = Bimport->setUnion(*Iimport);
2257 Ccolmap = Cimport->getTargetMap();
2258
2259 } else if (!Bimport.is_null() && Iimport.is_null()) {
2260 Cimport = Bimport->setUnion();
2261
2262 } else if(Bimport.is_null() && !Iimport.is_null()) {
2263 Cimport = Iimport->setUnion();
2264
2265 } else
2266 throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2267
2268 Ccolmap = Cimport->getTargetMap();
2269
2270 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2271 std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2272
2273 // NOTE: This is not efficient and should be folded into setUnion
2274 //
2275 // mfh 27 Sep 2016: What the above comment means, is that the
2276 // setUnion operation on Import objects could also compute these
2277 // local index - to - local index look-up tables.
2278 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2279 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2280 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2281 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2282 });
2283 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2284 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2285 });
2286
2287 }
2288
2289 // Replace the column map
2290 //
2291 // mfh 27 Sep 2016: We do this because C was originally created
2292 // without a column Map. Now we have its column Map.
2293 C.replaceColMap(Ccolmap);
2294
2295 // mfh 27 Sep 2016: Construct tables that map from local column
2296 // indices of A, to local row indices of either B_local (the locally
2297 // owned part of B), or B_remote (the "imported" remote part of B).
2298 //
2299 // For column index Aik in row i of A, if the corresponding row of B
2300 // exists in the local part of B ("orig") (which I'll call B_local),
2301 // then targetMapToOrigRow[Aik] is the local index of that row of B.
2302 // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2303 //
2304 // For column index Aik in row i of A, if the corresponding row of B
2305 // exists in the remote part of B ("Import") (which I'll call
2306 // B_remote), then targetMapToImportRow[Aik] is the local index of
2307 // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2308 // (a flag value).
2309
2310 // Run through all the hash table lookups once and for all
2311 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2312 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2313 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2314 GO aidx = Acolmap_local.getGlobalElement(i);
2315 LO B_LID = Browmap_local.getLocalElement(aidx);
2316 if (B_LID != LO_INVALID) {
2317 targetMapToOrigRow(i) = B_LID;
2318 targetMapToImportRow(i) = LO_INVALID;
2319 } else {
2320 LO I_LID = Irowmap_local.getLocalElement(aidx);
2321 targetMapToOrigRow(i) = LO_INVALID;
2322 targetMapToImportRow(i) = I_LID;
2323
2324 }
2325 });
2326
2327 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2328 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2329 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2330
2331}
2332
2333
2334/*********************************************************************************************************/
2335// Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2336// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2337
2338template<class Scalar,
2339 class LocalOrdinal,
2340 class GlobalOrdinal,
2341 class Node,
2342 class LocalOrdinalViewType>
2343void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2344 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2345 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2346 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2347 const LocalOrdinalViewType & targetMapToOrigRow,
2348 const LocalOrdinalViewType & targetMapToImportRow,
2349 const LocalOrdinalViewType & Bcol2Ccol,
2350 const LocalOrdinalViewType & Icol2Ccol,
2351 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2352 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2353 const std::string& label,
2354 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2355
2356#ifdef HAVE_TPETRA_MMM_TIMINGS
2357 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2358 using Teuchos::TimeMonitor;
2359 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2360#endif
2361
2362 using Teuchos::Array;
2363 using Teuchos::ArrayRCP;
2364 using Teuchos::ArrayView;
2365 using Teuchos::RCP;
2366 using Teuchos::rcp;
2367
2368 // Lots and lots of typedefs
2369 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2370 typedef typename KCRS::StaticCrsGraphType graph_t;
2371 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2372 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2373 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2374 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2375
2376 // Jacobi-specific
2377 typedef typename scalar_view_t::memory_space scalar_memory_space;
2378
2379 typedef Scalar SC;
2380 typedef LocalOrdinal LO;
2381 typedef GlobalOrdinal GO;
2382 typedef Node NO;
2383
2384 typedef Map<LO,GO,NO> map_type;
2385 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2386 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2387
2388 // Sizes
2389 RCP<const map_type> Ccolmap = C.getColMap();
2390 size_t m = Aview.origMatrix->getNodeNumRows();
2391 size_t n = Ccolmap->getNodeNumElements();
2392 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2393
2394 // Grab the Kokkos::SparseCrsMatrices & inner stuff
2395 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2396 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2397
2398 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2399 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2400 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2401
2402 c_lno_view_t Irowptr;
2403 lno_nnz_view_t Icolind;
2404 scalar_view_t Ivals;
2405 if(!Bview.importMatrix.is_null()) {
2406 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2407 Irowptr = lclB.graph.row_map;
2408 Icolind = lclB.graph.entries;
2409 Ivals = lclB.values;
2410 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2411 }
2412
2413 // Jacobi-specific inner stuff
2414 auto Dvals =
2415 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2416
2417 // Teuchos::ArrayView::operator[].
2418 // The status array will contain the index into colind where this entry was last deposited.
2419 // c_status[i] < CSR_ip - not in the row yet.
2420 // c_status[i] >= CSR_ip, this is the entry where you can find the data
2421 // We start with this filled with INVALID's indicating that there are no entries yet.
2422 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2423 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2424 Array<size_t> c_status(n, ST_INVALID);
2425
2426 // Classic csr assembly (low memory edition)
2427 //
2428 // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2429 // The method loops over rows of A, and may resize after processing
2430 // each row. Chris Siefert says that this reflects experience in
2431 // ML; for the non-threaded case, ML found it faster to spend less
2432 // effort on estimation and risk an occasional reallocation.
2433 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2434 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2435 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2436 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2437 size_t CSR_ip = 0, OLD_ip = 0;
2438
2439 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2440
2441 // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2442 // routine. The routine computes
2443 //
2444 // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2445 //
2446 // This corresponds to one sweep of (weighted) Jacobi.
2447 //
2448 // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2449 // you whether the corresponding row of B belongs to B_local
2450 // ("orig") or B_remote ("Import").
2451
2452 // For each row of A/C
2453 for (size_t i = 0; i < m; i++) {
2454 // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2455 // on the calling process.
2456 Crowptr[i] = CSR_ip;
2457 SC minusOmegaDval = -omega*Dvals(i,0);
2458
2459 // Entries of B
2460 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2461 Scalar Bval = Bvals[j];
2462 if (Bval == SC_ZERO)
2463 continue;
2464 LO Bij = Bcolind[j];
2465 LO Cij = Bcol2Ccol[Bij];
2466
2467 // Assume no repeated entries in B
2468 c_status[Cij] = CSR_ip;
2469 Ccolind[CSR_ip] = Cij;
2470 Cvals[CSR_ip] = Bvals[j];
2471 CSR_ip++;
2472 }
2473
2474 // Entries of -omega * Dinv * A * B
2475 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2476 LO Aik = Acolind[k];
2477 const SC Aval = Avals[k];
2478 if (Aval == SC_ZERO)
2479 continue;
2480
2481 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2482 // Local matrix
2483 size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2484
2485 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2486 LO Bkj = Bcolind[j];
2487 LO Cij = Bcol2Ccol[Bkj];
2488
2489 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2490 // New entry
2491 c_status[Cij] = CSR_ip;
2492 Ccolind[CSR_ip] = Cij;
2493 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2494 CSR_ip++;
2495
2496 } else {
2497 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2498 }
2499 }
2500
2501 } else {
2502 // Remote matrix
2503 size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2504 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2505 LO Ikj = Icolind[j];
2506 LO Cij = Icol2Ccol[Ikj];
2507
2508 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2509 // New entry
2510 c_status[Cij] = CSR_ip;
2511 Ccolind[CSR_ip] = Cij;
2512 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2513 CSR_ip++;
2514 } else {
2515 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2516 }
2517 }
2518 }
2519 }
2520
2521 // Resize for next pass if needed
2522 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2523 CSR_alloc *= 2;
2524 Kokkos::resize(Ccolind,CSR_alloc);
2525 Kokkos::resize(Cvals,CSR_alloc);
2526 }
2527 OLD_ip = CSR_ip;
2528 }
2529 Crowptr[m] = CSR_ip;
2530
2531 // Downward resize
2532 Kokkos::resize(Ccolind,CSR_ip);
2533 Kokkos::resize(Cvals,CSR_ip);
2534
2535 {
2536#ifdef HAVE_TPETRA_MMM_TIMINGS
2537 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2538#endif
2539
2540 // Replace the column map
2541 //
2542 // mfh 27 Sep 2016: We do this because C was originally created
2543 // without a column Map. Now we have its column Map.
2544 C.replaceColMap(Ccolmap);
2545
2546 // Final sort & set of CRS arrays
2547 //
2548 // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2549 // matrix-matrix multiply routine sort the entries for us?
2550 // Final sort & set of CRS arrays
2551 if (params.is_null() || params->get("sort entries",true))
2552 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2553 C.setAllValues(Crowptr,Ccolind, Cvals);
2554 }
2555 {
2556#ifdef HAVE_TPETRA_MMM_TIMINGS
2557 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2558#endif
2559
2560 // Final FillComplete
2561 //
2562 // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2563 // Import (from domain Map to column Map) construction (which costs
2564 // lots of communication) by taking the previously constructed
2565 // Import object. We should be able to do this without interfering
2566 // with the implementation of the local part of sparse matrix-matrix
2567 // multply above
2568 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2569 labelList->set("Timer Label",label);
2570 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2571 RCP<const Export<LO,GO,NO> > dummyExport;
2572 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2573
2574 }
2575}
2576
2577
2578/*********************************************************************************************************/
2579// Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2580template<class Scalar,
2581 class LocalOrdinal,
2582 class GlobalOrdinal,
2583 class Node>
2584void jacobi_A_B_reuse(
2585 Scalar omega,
2586 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2587 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2588 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2589 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2590 const std::string& label,
2591 const Teuchos::RCP<Teuchos::ParameterList>& params)
2592{
2593 using Teuchos::Array;
2594 using Teuchos::ArrayRCP;
2595 using Teuchos::ArrayView;
2596 using Teuchos::RCP;
2597 using Teuchos::rcp;
2598
2599 typedef LocalOrdinal LO;
2600 typedef GlobalOrdinal GO;
2601 typedef Node NO;
2602
2603 typedef Import<LO,GO,NO> import_type;
2604 typedef Map<LO,GO,NO> map_type;
2605
2606 // Kokkos typedefs
2607 typedef typename map_type::local_map_type local_map_type;
2609 typedef typename KCRS::StaticCrsGraphType graph_t;
2610 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2611 typedef typename NO::execution_space execution_space;
2612 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2613 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2614
2615#ifdef HAVE_TPETRA_MMM_TIMINGS
2616 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2617 using Teuchos::TimeMonitor;
2618 RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2619#endif
2620 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2621
2622 // Grab all the maps
2623 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2624 RCP<const map_type> Ccolmap = C.getColMap();
2625 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2626 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2627 local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2628 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2629 local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2630 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2631
2632 // Build the final importer / column map, hash table lookups for C
2633 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2634 {
2635 // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2636 // So, column map of C may be a strict subset of the column map of B
2637 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2638 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2639 });
2640
2641 if (!Bview.importMatrix.is_null()) {
2642 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2643 std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2644
2645 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2646 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2647 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2648 });
2649 }
2650 }
2651
2652 // Run through all the hash table lookups once and for all
2653 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2654 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2655 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2656 GO aidx = Acolmap_local.getGlobalElement(i);
2657 LO B_LID = Browmap_local.getLocalElement(aidx);
2658 if (B_LID != LO_INVALID) {
2659 targetMapToOrigRow(i) = B_LID;
2660 targetMapToImportRow(i) = LO_INVALID;
2661 } else {
2662 LO I_LID = Irowmap_local.getLocalElement(aidx);
2663 targetMapToOrigRow(i) = LO_INVALID;
2664 targetMapToImportRow(i) = I_LID;
2665
2666 }
2667 });
2668
2669#ifdef HAVE_TPETRA_MMM_TIMINGS
2670 MM = Teuchos::null;
2671#endif
2672
2673 // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2674 // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2675 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2676}
2677
2678
2679
2680/*********************************************************************************************************/
// Serial (non-threaded) kernel for the "reuse" variant of one Jacobi sweep:
//
//   C := (I - omega * D^{-1} * A) * (B_local + B_remote)
//
// C's graph is fixed (static): this kernel only overwrites C's values and
// throws if the product would require an entry outside C's existing pattern.
// The lookup tables (targetMapToOrigRow / targetMapToImportRow / Bcol2Ccol /
// Icol2Ccol) are the same ones built by jacobi_A_B_reuse above.
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node,
         class LocalOrdinalViewType>
void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
                                                               const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
                                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
                                                               const LocalOrdinalViewType & targetMapToOrigRow,
                                                               const LocalOrdinalViewType & targetMapToImportRow,
                                                               const LocalOrdinalViewType & Bcol2Ccol,
                                                               const LocalOrdinalViewType & Icol2Ccol,
                                                               CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
                                                               Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
                                                               const std::string& label,
                                                               const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
#else
  // label is only used for timer names; silence the unused-parameter warning.
  (void)label;
#endif
  using Teuchos::RCP;
  using Teuchos::rcp;

  // Lots and lots of typedefs
  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
  typedef typename KCRS::StaticCrsGraphType graph_t;
  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
  typedef typename KCRS::values_type::non_const_type scalar_view_t;
  typedef typename scalar_view_t::memory_space scalar_memory_space;

  typedef Scalar        SC;
  typedef LocalOrdinal  LO;
  typedef GlobalOrdinal GO;
  typedef Node          NO;
  typedef Map<LO,GO,NO> map_type;
  // NOTE(review): ST_INVALID is OrdinalTraits<LO>::invalid() widened to
  // size_t; for a signed LO this sign-extends to SIZE_MAX — confirm if LO
  // is ever an unsigned type.
  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

  // Sizes: m = local rows of A (and C), n = local columns of C.
  RCP<const map_type> Ccolmap = C.getColMap();
  size_t m = Aview.origMatrix->getNodeNumRows();
  size_t n = Ccolmap->getNodeNumElements();

  // Grab the  Kokkos::SparseCrsMatrices & inner stuff
  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
  const KCRS & Cmat = C.getLocalMatrixHost();

  // C's row pointers and column indices are read-only here (static graph);
  // only Cvals is rewritten.
  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
  scalar_view_t Cvals = Cmat.values;

  // Local views of B_remote (the imported rows of B), if any.
  c_lno_view_t  Irowptr;
  lno_nnz_view_t  Icolind;
  scalar_view_t  Ivals;
  if(!Bview.importMatrix.is_null()) {
    auto lclB = Bview.importMatrix->getLocalMatrixHost();
    Irowptr = lclB.graph.row_map;
    Icolind = lclB.graph.entries;
    Ivals   = lclB.values;
  }

  // Jacobi-specific inner stuff: local view of the diagonal-inverse vector.
  auto Dvals =
       Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
#endif

  // The status array will contain the index into colind where this entry was last deposited.
  //   c_status[i] <  CSR_ip - not in the row yet
  //   c_status[i] >= CSR_ip - this is the entry where you can find the data
  // We start with this filled with INVALID's indicating that there are no entries yet.
  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
  std::vector<size_t> c_status(n, ST_INVALID);

  // For each row of A/C
  size_t CSR_ip = 0, OLD_ip = 0;
  for (size_t i = 0; i < m; i++) {

    // First fill the c_status array w/ locations where we're allowed to
    // generate nonzeros for this row: [OLD_ip, CSR_ip) is exactly row i of C.
    OLD_ip = Crowptr[i];
    CSR_ip = Crowptr[i+1];
    for (size_t k = OLD_ip; k < CSR_ip; k++) {
      c_status[Ccolind[k]] = k;

      // Reset values in the row of C
      Cvals[k] = SC_ZERO;
    }

    SC minusOmegaDval = -omega*Dvals(i,0);

    // Entries of B: these write the "I * B" term into row i of C.
    for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
      Scalar Bval = Bvals[j];
      if (Bval == SC_ZERO)
        continue;
      LO Bij = Bcolind[j];
      LO Cij = Bcol2Ccol[Bij];

      // c_status outside [OLD_ip, CSR_ip) means column Cij is not in C's
      // static pattern for this row: refuse rather than corrupt the graph.
      TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                 std::runtime_error, "Trying to insert a new entry into a static graph");

      Cvals[c_status[Cij]] = Bvals[j];
    }

    // Entries of -omega * Dinv * A * B
    for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
      LO Aik  = Acolind[k];
      const SC Aval = Avals[k];
      if (Aval == SC_ZERO)
        continue;

      if (targetMapToOrigRow[Aik] != LO_INVALID) {
        // Local matrix: row Aik of B lives in B_local.
        size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);

        for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
          LO Bkj = Bcolind[j];
          LO Cij = Bcol2Ccol[Bkj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                     std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
        }

      } else {
        // Remote matrix: row Aik of B was imported into B_remote.
        size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
        for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
          LO Ikj = Icolind[j];
          LO Cij = Icol2Ccol[Ikj];

          TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
                                     std::runtime_error, "Trying to insert a new entry into a static graph");

          Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
        }
      }
    }
  }

#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
#endif

  // Graph is unchanged; fillComplete with C's existing domain/range maps.
  C.fillComplete(C.getDomainMap(), C.getRangeMap());
#ifdef HAVE_TPETRA_MMM_TIMINGS
  MM2 = Teuchos::null;
  MM = Teuchos::null;
#endif

}
2846
2847
2848
2849/*********************************************************************************************************/
// Populate 'Aview' with views of the rows of A needed by a matrix-matrix
// multiply, importing any rows of A that 'targetMap' references but this
// process does not own.
//
// Parameters:
//   A                            - source matrix (borrowed; Aview keeps a
//                                  non-owning RCP to it)
//   targetMap                    - map of the rows the caller needs
//   Aview                        - output struct; gets views of A plus, if
//                                  needed, Aview.importMatrix with the
//                                  remotely-owned rows
//   prototypeImporter            - optional importer rowMap->targetMap used
//                                  to identify remote rows cheaply (mode 1)
//   userAssertsThereAreNoRemotes - caller's promise that targetMap has no
//                                  off-process rows; skips all communication
//   label / params               - timer label and options; if params has an
//                                  "importer" entry it is used directly, and
//                                  the importer built here is cached back
//                                  into params for later calls
template<class Scalar,
         class LocalOrdinal,
         class GlobalOrdinal,
         class Node>
void import_and_extract_views(
  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
  bool userAssertsThereAreNoRemotes,
  const std::string& label,
  const Teuchos::RCP<Teuchos::ParameterList>& params)
{
  using Teuchos::Array;
  using Teuchos::ArrayView;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::null;

  typedef Scalar            SC;
  typedef LocalOrdinal      LO;
  typedef GlobalOrdinal     GO;
  typedef Node              NO;

  typedef Map<LO,GO,NO>             map_type;
  typedef Import<LO,GO,NO>          import_type;
  typedef CrsMatrix<SC,LO,GO,NO>    crs_matrix_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
  using Teuchos::TimeMonitor;
  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
#endif
  // The goal of this method is to populate the 'Aview' struct with views of the
  // rows of A, including all rows that correspond to elements in 'targetMap'.
  //
  // If targetMap includes local elements that correspond to remotely-owned rows
  // of A, then those remotely-owned rows will be imported into
  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
  Aview.deleteContents();

  // Non-owning reference to A plus views of its maps.
  Aview.origMatrix   = rcp(&A, false);
  Aview.origRowMap   = A.getRowMap();
  Aview.rowMap       = targetMap;
  Aview.colMap       = A.getColMap();
  Aview.domainMap    = A.getDomainMap();
  Aview.importColMap = null;
  RCP<const map_type> rowMap = A.getRowMap();
  const int numProcs = rowMap->getComm()->getSize();

  // Short circuit if the user swears there are no remotes (or if we're in serial)
  if (userAssertsThereAreNoRemotes || numProcs < 2)
    return;

  RCP<const import_type> importer;
  if (params != null && params->isParameter("importer")) {
    // Caller supplied a precomputed importer: use it and skip the remote
    // detection below entirely.
    importer = params->get<RCP<const import_type> >("importer");

  } else {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = Teuchos::null;
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
#endif

    // Mark each row in targetMap as local or remote, and go ahead and get a view
    // for the local rows.  Three modes:
    //   mode 1 - a matching prototype importer tells us the remote LIDs
    //   mode 2 - no prototype importer; classify each target row by hand
    //   mode 3 - prototype importer exists but doesn't match (error unless
    //            no import ends up being needed)
    RCP<const map_type> remoteRowMap;
    size_t numRemote = 0;
    int mode = 0;
    if (!prototypeImporter.is_null() &&
        prototypeImporter->getSourceMap()->isSameAs(*rowMap)     &&
        prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
      // We have a valid prototype importer --- ask it for the remotes
#ifdef HAVE_TPETRA_MMM_TIMINGS
      TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
#endif
      ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
      numRemote = prototypeImporter->getNumRemoteIDs();

      Array<GO> remoteRows(numRemote);
      for (size_t i = 0; i < numRemote; i++)
        remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);

      remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
                                      rowMap->getIndexBase(), rowMap->getComm()));
      mode = 1;

    } else if (prototypeImporter.is_null()) {
      // No prototype importer --- count the remotes the hard way
#ifdef HAVE_TPETRA_MMM_TIMINGS
      TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
#endif
      ArrayView<const GO> rows    = targetMap->getNodeElementList();
      size_t numRows              = targetMap->getNodeNumElements();

      // A target row is remote iff it has no LID in A's row map.
      Array<GO> remoteRows(numRows);
      for(size_t i = 0; i < numRows; ++i) {
        const LO mlid = rowMap->getLocalElement(rows[i]);

        if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
          remoteRows[numRemote++] = rows[i];
      }
      remoteRows.resize(numRemote);
      remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
                                      rowMap->getIndexBase(), rowMap->getComm()));
      mode = 2;

    } else {
      // PrototypeImporter is bad.  But if we're in serial that's OK.
      mode = 3;
    }

    if (numProcs < 2) {
      TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
                                 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
      // If only one processor we don't need to import any remote rows, so return.
      return;
    }

    //
    // Now we will import the needed remote rows of A, if the global maximum
    // value of numRemote is greater than 0.
    //
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = Teuchos::null;
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
#endif

    // Collective: every rank must agree on whether any rank needs an import.
    global_size_t globalMaxNumRemote = 0;
    Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );

    if (globalMaxNumRemote > 0) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = Teuchos::null;
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
#endif
      // Create an importer with target-map remoteRowMap and source-map rowMap.
      if (mode == 1)
        importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
      else if (mode == 2)
        importer = rcp(new import_type(rowMap, remoteRowMap));
      else
        throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
    }

    // Cache the importer back into params so subsequent calls can reuse it.
    if (params != null)
      params->set("importer", importer);
  }

  if (importer != null) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = Teuchos::null;
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
#endif

    // Now create a new matrix into which we can import the remote rows of A that we need.
    Teuchos::ParameterList labelList;
    labelList.set("Timer Label", label);
    auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);

    // Tuning knobs forwarded to transferAndFillComplete; caller overrides
    // win over the behavior-level default core count.
    bool isMM = true;
    bool overrideAllreduce = false;
    int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
    // Minor speedup tweak - avoid computing the global constants
    Teuchos::ParameterList params_sublist;
    if(!params.is_null()) {
      labelList.set("compute global constants", params->get("compute global constants",false));
      params_sublist = params->sublist("matrixmatrix: kernel params",false);
      mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
      // Take the smaller of the sublist and top-level core-count overrides.
      int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
      if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
      isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
      overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
    }
    labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
    labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
    labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);

    // Import + fillComplete the remote rows in one fused operation.
    Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
                                    A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));

#if 0
    // Disabled code for dumping input matrices
    static int count=0;
    char str[80];
    sprintf(str,"import_matrix.%d.dat",count);
    count++;
#endif

#ifdef HAVE_TPETRA_MMM_STATISTICS
    printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
#endif


#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = Teuchos::null;
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
#endif

    // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
    Aview.importColMap = Aview.importMatrix->getColMap();
#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = Teuchos::null;
#endif
  }
}
3057
3058
3059
3060
3061
3062/*********************************************************************************************************/
3063 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3064template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3066merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3067 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3068 const LocalOrdinalViewType & Acol2Brow,
3069 const LocalOrdinalViewType & Acol2Irow,
3070 const LocalOrdinalViewType & Bcol2Ccol,
3071 const LocalOrdinalViewType & Icol2Ccol,
3072 const size_t mergedNodeNumCols) {
3073
3074 using Teuchos::RCP;
3076 typedef typename KCRS::StaticCrsGraphType graph_t;
3077 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3078 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3079 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3080 // Grab the Kokkos::SparseCrsMatrices
3081 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3082 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3083
3084 // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3085 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3086 // We do have a Bimport
3087 // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3088 // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3089 RCP<const KCRS> Ik_;
3090 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3091 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3092 KCRS Iks;
3093 if(Ik!=0) Iks = *Ik;
3094 size_t merge_numrows = Ak.numCols();
3095 // The last entry of this at least, need to be initialized
3096 lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3097
3098 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3099
3100 // Use a Kokkos::parallel_scan to build the rowptr
3101 typedef typename Node::execution_space execution_space;
3102 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3103 Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3104 KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3105 if(final) Mrowptr(i) = update;
3106 // Get the row count
3107 size_t ct=0;
3108 if(Acol2Brow(i)!=LO_INVALID)
3109 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3110 else
3111 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3112 update+=ct;
3113
3114 if(final && i+1==merge_numrows)
3115 Mrowptr(i+1)=update;
3116 });
3117
3118 // Allocate nnz
3119 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3120 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3121 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3122
3123 // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3124 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3125 Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3126 if(Acol2Brow(i)!=LO_INVALID) {
3127 size_t row = Acol2Brow(i);
3128 size_t start = Bk.graph.row_map(row);
3129 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3131 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3132 }
3133 }
3134 else {
3135 size_t row = Acol2Irow(i);
3136 size_t start = Iks.graph.row_map(row);
3137 for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3138 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3139 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3140 }
3141 }
3142 });
3143
3144 KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3145 return newmat;
3146 }
3147 else {
3148 // We don't have a Bimport (the easy case)
3149 return Bk;
3150 }
3151}//end merge_matrices
3152
3153template<typename SC, typename LO, typename GO, typename NO>
3154void AddKernels<SC, LO, GO, NO>::
3155addSorted(
3156 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3157 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3158 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3159 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3160 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3161 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3162 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3163 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3164 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3165 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3166 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3167{
3168 using Teuchos::TimeMonitor;
3169 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3170 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3171 auto nrows = Arowptrs.extent(0) - 1;
3172 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3173 typename AddKern::KKH handle;
3174 handle.create_spadd_handle(true);
3175 auto addHandle = handle.get_spadd_handle();
3176#ifdef HAVE_TPETRA_MMM_TIMINGS
3177 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3178#endif
3179 KokkosSparse::Experimental::spadd_symbolic
3180 <KKH,
3181 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3182 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3183 row_ptrs_array, col_inds_array>
3184 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3185 //KokkosKernels requires values to be zeroed
3186 Cvals = values_array("C values", addHandle->get_c_nnz());
3187 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3188#ifdef HAVE_TPETRA_MMM_TIMINGS
3189 MM = Teuchos::null;
3190 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3191#endif
3192 KokkosSparse::Experimental::spadd_numeric(&handle,
3193 Arowptrs, Acolinds, Avals, scalarA,
3194 Browptrs, Bcolinds, Bvals, scalarB,
3195 Crowptrs, Ccolinds, Cvals);
3196}
3197
3198template<typename SC, typename LO, typename GO, typename NO>
3199void AddKernels<SC, LO, GO, NO>::
3200addUnsorted(
3201 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3202 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3203 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3204 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3205 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3206 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3207 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3208 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3209 GO /* numGlobalCols */,
3210 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3211 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3212 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3213{
3214 using Teuchos::TimeMonitor;
3215 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3216 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3217 auto nrows = Arowptrs.extent(0) - 1;
3218 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3219 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3220 typename AddKern::KKH handle;
3221 handle.create_spadd_handle(false);
3222 auto addHandle = handle.get_spadd_handle();
3223#ifdef HAVE_TPETRA_MMM_TIMINGS
3224 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3225#endif
3226 KokkosSparse::Experimental::spadd_symbolic
3227 <KKH,
3228 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3229 typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3230 row_ptrs_array, col_inds_array>
3231 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3232 //Cvals must be zeroed out
3233 Cvals = values_array("C values", addHandle->get_c_nnz());
3234 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3235#ifdef HAVE_TPETRA_MMM_TIMINGS
3236 MM = Teuchos::null;
3237 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3238#endif
3239 KokkosSparse::Experimental::spadd_numeric(&handle,
3240 Arowptrs, Acolinds, Avals, scalarA,
3241 Browptrs, Bcolinds, Bvals, scalarB,
3242 Crowptrs, Ccolinds, Cvals);
3243}
3244
3245template<typename GO,
3246 typename LocalIndicesType,
3247 typename GlobalIndicesType,
3248 typename ColMapType>
3249struct ConvertLocalToGlobalFunctor
3250{
3251 ConvertLocalToGlobalFunctor(
3252 const LocalIndicesType& colindsOrig_,
3253 const GlobalIndicesType& colindsConverted_,
3254 const ColMapType& colmap_) :
3255 colindsOrig (colindsOrig_),
3256 colindsConverted (colindsConverted_),
3257 colmap (colmap_)
3258 {}
3259 KOKKOS_INLINE_FUNCTION void
3260 operator() (const GO i) const
3261 {
3262 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3263 }
3264 LocalIndicesType colindsOrig;
3265 GlobalIndicesType colindsConverted;
3266 ColMapType colmap;
3267};
3268
3269template<typename SC, typename LO, typename GO, typename NO>
3270void MMdetails::AddKernels<SC, LO, GO, NO>::
3271convertToGlobalAndAdd(
3272 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3273 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3274 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3275 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3276 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3277 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3278 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3279 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3280 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3281{
3282 using Teuchos::TimeMonitor;
3283 //Need to use a different KokkosKernelsHandle type than other versions,
3284 //since the ordinals are now GO
3285 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3286 typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3287
3288 const values_array Avals = A.values;
3289 const values_array Bvals = B.values;
3290 const col_inds_array Acolinds = A.graph.entries;
3291 const col_inds_array Bcolinds = B.graph.entries;
3292 auto Arowptrs = A.graph.row_map;
3293 auto Browptrs = B.graph.row_map;
3294 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3295 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3296#ifdef HAVE_TPETRA_MMM_TIMINGS
3297 auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3298#endif
3299 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3300 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3301 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3302 Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3303 KKH_GO handle;
3304 handle.create_spadd_handle(false);
3305 auto addHandle = handle.get_spadd_handle();
3306#ifdef HAVE_TPETRA_MMM_TIMINGS
3307 MM = Teuchos::null;
3308 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3309#endif
3310 auto nrows = Arowptrs.extent(0) - 1;
3311 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3312 KokkosSparse::Experimental::spadd_symbolic
3313 <KKH_GO, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3314 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3315 Cvals = values_array("C values", addHandle->get_c_nnz());
3316 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3317#ifdef HAVE_TPETRA_MMM_TIMINGS
3318 MM = Teuchos::null;
3319 MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3320#endif
3321 KokkosSparse::Experimental::spadd_numeric(&handle,
3322 Arowptrs, AcolindsConverted, Avals, scalarA,
3323 Browptrs, BcolindsConverted, Bvals, scalarB,
3324 Crowptrs, Ccolinds, Cvals);
3325}
3326
3327
} //End namespace MMdetails
3329
3330} //End namespace Tpetra
3331
3332/*********************************************************************************************************/
3333//
3334// Explicit instantiation macro
3335//
3336// Must be expanded from within the Tpetra namespace!
3337//
namespace Tpetra {

// TPETRA_MATRIXMATRIX_INSTANT(SCALAR, LO, GO, NODE) explicitly instantiates, for one
// (Scalar, LocalOrdinal, GlobalOrdinal, Node) combination:
//   - MatrixMatrix::Multiply and MatrixMatrix::Jacobi,
//   - both overloads of MatrixMatrix::Add,
//   - both overloads of MatrixMatrix::add (the RCP-returning version and the in-place-C version),
//   - the MMdetails::AddKernels helper struct.
// NOTE: comments cannot be placed inside the macro body; a '//' comment before a line-splicing
// backslash would absorb the continuation line into the comment.
#define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
  template \
  void MatrixMatrix::Multiply( \
  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
  bool transposeA, \
  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
  bool transposeB, \
  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
  bool call_FillComplete_on_result, \
  const std::string & label, \
  const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
template \
  void MatrixMatrix::Jacobi( \
  SCALAR omega, \
  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
  bool call_FillComplete_on_result, \
  const std::string & label, \
  const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void MatrixMatrix::Add( \
  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
  bool transposeA, \
  SCALAR scalarA, \
  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
  bool transposeB, \
  SCALAR scalarB, \
  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
\
  template \
  void MatrixMatrix::Add( \
  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
  bool transposeA, \
  SCALAR scalarA, \
  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
  SCALAR scalarB ); \
\
  template \
  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
  (const SCALAR & alpha, \
  const bool transposeA, \
  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
  const SCALAR & beta, \
  const bool transposeB, \
  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
  const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template \
  void \
  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
  (const SCALAR & alpha, \
  const bool transposeA, \
  const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
  const SCALAR& beta, \
  const bool transposeB, \
  const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
  CrsMatrix< SCALAR , LO, GO , NODE >& C, \
  const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
  const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
  const Teuchos::RCP<Teuchos::ParameterList>& params); \
\
  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;

} //End namespace Tpetra
3411
3412#endif // TPETRA_MATRIXMATRIX_DEF_HPP
Matrix Market file readers and writers for Tpetra objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declaration and definition of Tpetra::Details::getEntryOnHost.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Teuchos::RCP< const map_type > colMap
Col map for the original version of the matrix.
Teuchos::RCP< const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > origMatrix
The original matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t global_size_t
Global size_t object.