Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_residual.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 //
38 // ************************************************************************
39 // @HEADER
40 
41 #ifndef TPETRA_DETAILS_RESIDUAL_HPP
42 #define TPETRA_DETAILS_RESIDUAL_HPP
43 
44 #include "TpetraCore_config.h"
45 #include "Tpetra_CrsMatrix.hpp"
46 #include "Tpetra_LocalCrsMatrixOperator.hpp"
47 #include "Tpetra_MultiVector.hpp"
48 #include "Teuchos_RCP.hpp"
49 #include "Teuchos_ScalarTraits.hpp"
52 #include "KokkosSparse_spmv_impl.hpp"
53 
59 
60 namespace Tpetra {
61  namespace Details {
62 
63 
70 template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
72 
73  using execution_space = typename AMatrix::execution_space;
74  using LO = typename AMatrix::non_const_ordinal_type;
75  using value_type = typename AMatrix::non_const_value_type;
76  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
77  using team_member = typename team_policy::member_type;
78  using ATV = Kokkos::ArithTraits<value_type>;
79 
80  AMatrix A_lcl;
81  ConstMV X_colmap_lcl;
82  ConstMV B_lcl;
83  MV R_lcl;
84  int rows_per_team;
85  Offsets offsets;
86  ConstMV X_domainmap_lcl;
87 
88 
89  LocalResidualFunctor (const AMatrix& A_lcl_,
90  const ConstMV& X_colmap_lcl_,
91  const ConstMV& B_lcl_,
92  const MV& R_lcl_,
93  const int rows_per_team_,
94  const Offsets& offsets_,
95  const ConstMV& X_domainmap_lcl_) :
96  A_lcl(A_lcl_),
97  X_colmap_lcl(X_colmap_lcl_),
98  B_lcl(B_lcl_),
99  R_lcl(R_lcl_),
100  rows_per_team(rows_per_team_),
101  offsets(offsets_),
102  X_domainmap_lcl(X_domainmap_lcl_)
103  { }
104 
105  KOKKOS_INLINE_FUNCTION
106  void operator() (const team_member& dev) const
107  {
108 
109  Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
110  const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
111 
112  if (lclRow >= A_lcl.numRows ()) {
113  return;
114  }
115 
116  if (!is_MV) { // MultiVectors only have a single column
117 
118  value_type A_x = ATV::zero ();
119 
120  if (!restrictedMode) {
121  const auto A_row = A_lcl.rowConst(lclRow);
122  const LO row_length = static_cast<LO> (A_row.length);
123 
124  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
125  const auto A_val = A_row.value(iEntry);
126  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
127  }, A_x);
128 
129  }
130  else {
131 
132  const LO offRankOffset = offsets(lclRow);
133  const size_t start = A_lcl.graph.row_map(lclRow);
134  const size_t end = A_lcl.graph.row_map(lclRow+1);
135 
136  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
137  const auto A_val = A_lcl.values(iEntry);
138  const auto lclCol = A_lcl.graph.entries(iEntry);
139  if (iEntry < offRankOffset)
140  lsum += A_val * X_domainmap_lcl(lclCol,0);
141  else if (!skipOffRank)
142  lsum += A_val * X_colmap_lcl(lclCol,0);
143  }, A_x);
144  }
145 
146  Kokkos::single(Kokkos::PerThread(dev),[&] () {
147  R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
148  });
149  }
150  else { // MultiVectors have more than one column
151 
152  const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
153 
154  for(LO v=0; v<numVectors; v++) {
155 
156  value_type A_x = ATV::zero ();
157 
158  if (!restrictedMode) {
159 
160  const auto A_row = A_lcl.rowConst(lclRow);
161  const LO row_length = static_cast<LO> (A_row.length);
162 
163  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
164  const auto A_val = A_row.value(iEntry);
165  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
166  }, A_x);
167  }
168  else {
169  const LO offRankOffset = offsets(lclRow);
170  const size_t start = A_lcl.graph.row_map(lclRow);
171  const size_t end = A_lcl.graph.row_map(lclRow+1);
172 
173  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
174  const auto A_val = A_lcl.values(iEntry);
175  const auto lclCol = A_lcl.graph.entries(iEntry);
176  if (iEntry < offRankOffset)
177  lsum += A_val * X_domainmap_lcl(lclCol,v);
178  else if (!skipOffRank)
179  lsum += A_val * X_colmap_lcl(lclCol,v);
180  }, A_x);
181  }
182 
183  Kokkos::single(Kokkos::PerThread(dev),[&] () {
184  R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
185  });
186 
187  }//end for numVectors
188  }
189  });//end parallel_for TeamThreadRange
190  }
191 };
192 
193 
195 template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
197 
198  using execution_space = typename AMatrix::execution_space;
199  using LO = typename AMatrix::non_const_ordinal_type;
200  using value_type = typename AMatrix::non_const_value_type;
201  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
202  using team_member = typename team_policy::member_type;
203  using ATV = Kokkos::ArithTraits<value_type>;
204 
205  AMatrix A_lcl;
206  ConstMV X_colmap_lcl;
207  MV R_lcl;
208  int rows_per_team;
209  Offsets offsets;
210 
211 
212  OffRankUpdateFunctor (const AMatrix& A_lcl_,
213  const ConstMV& X_colmap_lcl_,
214  const MV& R_lcl_,
215  const int rows_per_team_,
216  const Offsets& offsets_) :
217  A_lcl(A_lcl_),
218  X_colmap_lcl(X_colmap_lcl_),
219  R_lcl(R_lcl_),
220  rows_per_team(rows_per_team_),
221  offsets(offsets_)
222  { }
223 
224  KOKKOS_INLINE_FUNCTION
225  void operator() (const team_member& dev) const
226  {
227 
228  Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
229  const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
230 
231  if (lclRow >= A_lcl.numRows ()) {
232  return;
233  }
234 
235  const LO offRankOffset = offsets(lclRow);
236  const size_t end = A_lcl.graph.row_map(lclRow+1);
237 
238  if (!is_MV) { // MultiVectors only have a single column
239 
240  value_type A_x = ATV::zero ();
241 
242  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
243  const auto A_val = A_lcl.values(iEntry);
244  const auto lclCol = A_lcl.graph.entries(iEntry);
245  lsum += A_val * X_colmap_lcl(lclCol,0);
246  }, A_x);
247 
248  Kokkos::single(Kokkos::PerThread(dev),[&] () {
249  R_lcl(lclRow,0) -= A_x;
250  });
251  }
252  else { // MultiVectors have more than one column
253 
254  const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
255 
256  for(LO v=0; v<numVectors; v++) {
257 
258  value_type A_x = ATV::zero ();
259 
260  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
261  const auto A_val = A_lcl.values(iEntry);
262  const auto lclCol = A_lcl.graph.entries(iEntry);
263  lsum += A_val * X_colmap_lcl(lclCol,v);
264  }, A_x);
265 
266  Kokkos::single(Kokkos::PerThread(dev),[&] () {
267  R_lcl(lclRow,v) -= A_x;
268  });
269 
270  }//end for numVectors
271  }
272  });
273  }
274 };
275 
276 template<class SC, class LO, class GO, class NO>
277 void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
278  const MultiVector<SC,LO,GO,NO> & X_colmap,
279  const MultiVector<SC,LO,GO,NO> & B,
281  const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
282  const MultiVector<SC,LO,GO,NO> * X_domainmap=nullptr) {
284  using Teuchos::NO_TRANS;
285  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");
286 
287  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
288  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
289  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
290  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
291 
292  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
293  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
294  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
295  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
296  const_local_view_device_type X_domainmap_lcl;
297 
298  const bool debug = ::Tpetra::Details::Behavior::debug ();
299  if (debug) {
300  TEUCHOS_TEST_FOR_EXCEPTION
301  (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
302  "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
303  "R.getNumVectors() = " << R.getNumVectors () << ".");
304  TEUCHOS_TEST_FOR_EXCEPTION
305  (X_colmap.getLocalLength () !=
306  A.getColMap ()->getNodeNumElements (), std::runtime_error,
307  "X has the wrong number of local rows. "
308  "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
309  "A.getColMap()->getNodeNumElements() = " <<
310  A.getColMap ()->getNodeNumElements () << ".");
311  TEUCHOS_TEST_FOR_EXCEPTION
312  (R.getLocalLength () !=
313  A.getRowMap ()->getNodeNumElements (), std::runtime_error,
314  "R has the wrong number of local rows. "
315  "R.getLocalLength() = " << R.getLocalLength () << " != "
316  "A.getRowMap()->getNodeNumElements() = " <<
317  A.getRowMap ()->getNodeNumElements () << ".");
318  TEUCHOS_TEST_FOR_EXCEPTION
319  (B.getLocalLength () !=
320  A.getRowMap ()->getNodeNumElements (), std::runtime_error,
321  "B has the wrong number of local rows. "
322  "B.getLocalLength() = " << B.getLocalLength () << " != "
323  "A.getRowMap()->getNodeNumElements() = " <<
324  A.getRowMap ()->getNodeNumElements () << ".");
325 
326  TEUCHOS_TEST_FOR_EXCEPTION
327  (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
328  "fill complete. You must call fillComplete() (possibly with "
329  "domain and range Map arguments) without an intervening "
330  "resumeFill() call before you may call this method.");
331  TEUCHOS_TEST_FOR_EXCEPTION
332  (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
333  std::runtime_error, "X, Y and B must be constant stride.");
334  // If the two pointers are NULL, then they don't alias one
335  // another, even though they are equal.
336  TEUCHOS_TEST_FOR_EXCEPTION
337  ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
338  (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
339  std::runtime_error, "X, Y and R may not alias one another.");
340  }
341 
342  SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
343 #ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL
344  // This is currently a "reference implementation" waiting until Kokkos Kernels provides
345  // a residual kernel.
346  A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
347  R.update(one,B,negone);
348 #else
349 
350  if (A_lcl.numRows() == 0) {
351  return;
352  }
353 
354  int64_t numLocalRows = A_lcl.numRows ();
355  int64_t myNnz = A_lcl.nnz();
356 
357  //Check for imbalanced rows. If the logic for SPMV to use merge path is triggered,
358  //use it and follow the reference residual code
359  LO maxRowImbalance = 0;
360  if(numLocalRows != 0)
361  maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows);
362  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
363  {
364  //note: lclOp will be wrapped in shared_ptr
365  auto lclOp = A.getLocalMultiplyOperator();
366  //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
367  lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
368  R.update(one,B,negone);
369  return;
370  }
371 
372  int team_size = -1;
373  int vector_length = -1;
374  int64_t rows_per_thread = -1;
375 
376  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
377  using policy_type = typename Kokkos::TeamPolicy<execution_space>;
378 
379  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
380  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
381 
382  policy_type policy (1, 1);
383  if (team_size < 0) {
384  policy = policy_type (worksets, Kokkos::AUTO, vector_length);
385  }
386  else {
387  policy = policy_type (worksets, team_size, vector_length);
388  }
389 
390  bool is_vector = (X_colmap_lcl.extent(1) == 1);
391 
392  if(is_vector) {
393 
394  if (X_domainmap == nullptr) {
395 
396  using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
397  functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
398  Kokkos::parallel_for("residual-vector",policy,func);
399 
400  }
401  else {
402 
403  X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
404  using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
405  functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
406  Kokkos::parallel_for("residual-vector",policy,func);
407 
408  }
409  }
410  else {
411 
412  if (X_domainmap == nullptr) {
413 
414  using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
415  functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
416  Kokkos::parallel_for("residual-multivector",policy,func);
417 
418  }
419  else {
420 
421  X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
422  using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
423  functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
424  Kokkos::parallel_for("residual-multivector",policy,func);
425 
426  }
427  }
428 #endif
429 }
430 
431 
432 template<class SC, class LO, class GO, class NO>
433 void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
434  MultiVector<SC,LO,GO,NO> & X_colmap,
435  const MultiVector<SC,LO,GO,NO> & B,
436  MultiVector<SC,LO,GO,NO> & R,
437  const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
438  const MultiVector<SC,LO,GO,NO> & X_domainmap) {
440  using Teuchos::NO_TRANS;
441  using Teuchos::RCP;
442  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
443 
444  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
445 
446  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
447  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
448  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
449  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
450 
451  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
452  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
453  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
454  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
455  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);;
456 
457  const bool debug = ::Tpetra::Details::Behavior::debug ();
458  if (debug) {
459  TEUCHOS_TEST_FOR_EXCEPTION
460  (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
461  "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
462  "R.getNumVectors() = " << R.getNumVectors () << ".");
463  TEUCHOS_TEST_FOR_EXCEPTION
464  (X_colmap.getLocalLength () !=
465  A.getColMap ()->getNodeNumElements (), std::runtime_error,
466  "X has the wrong number of local rows. "
467  "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
468  "A.getColMap()->getNodeNumElements() = " <<
469  A.getColMap ()->getNodeNumElements () << ".");
470  TEUCHOS_TEST_FOR_EXCEPTION
471  (R.getLocalLength () !=
472  A.getRowMap ()->getNodeNumElements (), std::runtime_error,
473  "R has the wrong number of local rows. "
474  "R.getLocalLength() = " << R.getLocalLength () << " != "
475  "A.getRowMap()->getNodeNumElements() = " <<
476  A.getRowMap ()->getNodeNumElements () << ".");
477  TEUCHOS_TEST_FOR_EXCEPTION
478  (B.getLocalLength () !=
479  A.getRowMap ()->getNodeNumElements (), std::runtime_error,
480  "B has the wrong number of local rows. "
481  "B.getLocalLength() = " << B.getLocalLength () << " != "
482  "A.getRowMap()->getNodeNumElements() = " <<
483  A.getRowMap ()->getNodeNumElements () << ".");
484 
485  TEUCHOS_TEST_FOR_EXCEPTION
486  (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
487  "fill complete. You must call fillComplete() (possibly with "
488  "domain and range Map arguments) without an intervening "
489  "resumeFill() call before you may call this method.");
490  TEUCHOS_TEST_FOR_EXCEPTION
491  (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
492  std::runtime_error, "X, Y and B must be constant stride.");
493  // If the two pointers are NULL, then they don't alias one
494  // another, even though they are equal.
495  TEUCHOS_TEST_FOR_EXCEPTION
496  ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
497  (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
498  std::runtime_error, "X, Y and R may not alias one another.");
499  }
500 
501  if (A_lcl.numRows() == 0) {
502  return;
503  }
504 
505  int64_t numLocalRows = A_lcl.numRows ();
506  int64_t myNnz = A_lcl.nnz();
507 
508  // //Check for imbalanced rows. If the logic for SPMV to use merge path is triggered,
509  // //use it and follow the reference residual code
510  // LO maxRowImbalance = 0;
511  // if(numLocalRows != 0)
512  // maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows);
513  // if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
514  // {
515  // //note: lclOp will be wrapped in shared_ptr
516  // auto lclOp = A.getLocalMultiplyOperator();
517  // //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
518  // lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
519  // R.update(one,B,negone);
520  // return;
521  // }
522 
523  int team_size = -1;
524  int vector_length = -1;
525  int64_t rows_per_thread = -1;
526 
527  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
528  using policy_type = typename Kokkos::TeamPolicy<execution_space>;
529 
530  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
531  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
532 
533  policy_type policy (1, 1);
534  if (team_size < 0) {
535  policy = policy_type (worksets, Kokkos::AUTO, vector_length);
536  }
537  else {
538  policy = policy_type (worksets, team_size, vector_length);
539  }
540 
541  bool is_vector = (X_colmap_lcl.extent(1) == 1);
542 
543  if(is_vector) {
544 
545  using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
546  functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
547  Kokkos::parallel_for("residual-vector",policy,func);
548 
549  RCP<const import_type> importer = A.getGraph ()->getImporter ();
550  X_colmap.endImport (X_domainmap, *importer, INSERT, true);
551 
552  Kokkos::fence();
553 
554  using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
555  functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
556  Kokkos::parallel_for("residual-vector-offrank",policy,func2);
557 
558  }
559  else {
560 
561  using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
562  functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
563  Kokkos::parallel_for("residual-multivector",policy,func);
564 
565  RCP<const import_type> importer = A.getGraph ()->getImporter ();
566  X_colmap.endImport (X_domainmap, *importer, INSERT, true);
567 
568  Kokkos::fence();
569 
570  using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
571  functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
572  Kokkos::parallel_for("residual-vector-offrank",policy,func2);
573 
574  }
575 }
576 
577 
579 template<class SC, class LO, class GO, class NO>
581  const MultiVector<SC,LO,GO,NO> & X_in,
582  const MultiVector<SC,LO,GO,NO> & B_in,
583  MultiVector<SC,LO,GO,NO> & R_in) {
585  using Teuchos::RCP;
586  using Teuchos::rcp;
587  using Teuchos::rcp_const_cast;
588  using Teuchos::rcpFromRef;
589 
590  const bool debug = ::Tpetra::Details::Behavior::debug ();
591  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
592  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
593  if (overlapCommunicationAndComputation)
594  TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);
595 
596  // Whether we are using restrictedMode in the import from domain to
597  // column map. Restricted mode skips the copy and permutation of the
598  // local part of X. We are using restrictedMode only when domain and
599  // column map are locally fitted, i.e. when the local indices of
600  // domain and column map match.
601  bool restrictedMode = false;
602 
603  const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
604  if(!Apt) {
605  // If we're not a CrsMatrix, we can't do fusion, so just do apply+update
606  SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
607  Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
608  R_in.update(one,B_in,negone);
609  return;
610  }
611  const CrsMatrix<SC,LO,GO,NO> & A = *Apt;
612 
613  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
614  using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
615  using MV = MultiVector<SC,LO,GO,NO>;
616  using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
617  using offset_type = typename graph_type::offset_device_view_type;
618 
619  // We treat the case of a replicated MV output specially.
620  const bool R_is_replicated =
621  (! R_in.isDistributed () && A.getComm ()->getSize () != 1);
622 
623  // It's possible that R is a view of X or B.
624  // We don't try to to detect the more subtle cases (e.g., one is a
625  // subview of the other, but their initial pointers differ). We
626  // only need to do this if this matrix's Import is trivial;
627  // otherwise, we don't actually apply the operator from X into Y.
628 
629  RCP<const import_type> importer = A.getGraph ()->getImporter ();
630  RCP<const export_type> exporter = A.getGraph ()->getExporter ();
631 
632  // Temporary MV for Import operation. After the block of code
633  // below, this will be an (Imported if necessary) column Map MV
634  // ready to give to localApply(...).
635  RCP<MV> X_colMap;
636  if (importer.is_null ()) {
637  if (! X_in.isConstantStride ()) {
638  // Not all sparse mat-vec kernels can handle an input MV with
639  // nonconstant stride correctly, so we have to copy it in that
640  // case into a constant stride MV. To make a constant stride
641  // copy of X_in, we force creation of the column (== domain)
642  // Map MV (if it hasn't already been created, else fetch the
643  // cached copy). This avoids creating a new MV each time.
644  X_colMap = A.getColumnMapMultiVector (X_in, true);
645  Tpetra::deep_copy (*X_colMap, X_in);
646  // X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
647  }
648  else {
649  // The domain and column Maps are the same, so do the local
650  // multiply using the domain Map input MV X_in.
651  X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in) );
652  }
653  }
654  else { // need to Import source (multi)vector
655  ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
656  // We're doing an Import anyway, which will copy the relevant
657  // elements of the domain Map MV X_in into a separate column Map
658  // MV. Thus, we don't have to worry whether X_in is constant
659  // stride.
660  X_colMap = A.getColumnMapMultiVector (X_in);
661 
662  // Do we want to use restrictedMode?
663  restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();
664 
665  if (debug && restrictedMode) {
666  TEUCHOS_TEST_FOR_EXCEPTION
667  (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
668  "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
669  }
670 
671  // Import from the domain Map MV to the column Map MV.
672  X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
673  }
674 
675  offset_type offsets;
676  if (restrictedMode)
677  A.getCrsGraph()->getLocalOffRankOffsets(offsets);
678 
679  // Get a vector for the R_rowMap output residual, handling the
680  // non-constant stride and exporter cases. Since R gets clobbered
681  // we don't need to worry about the data in it
682  RCP<MV> R_rowMap;
683  if(exporter.is_null()) {
684  if (! R_in.isConstantStride ()) {
685  R_rowMap = A.getRowMapMultiVector(R_in);
686  }
687  else {
688  R_rowMap = rcpFromRef (R_in);
689  }
690  }
691  else {
692  R_rowMap = A.getRowMapMultiVector (R_in);
693  }
694 
695  // Get a vector for the B_rowMap output residual, handling the
696  // non-constant stride and exporter cases
697  RCP<const MV> B_rowMap;
698  if(exporter.is_null()) {
699  if (! B_in.isConstantStride ()) {
700  // Do an allocation here. If we need to optimize this later, we can have the matrix
701  // cache this.
702  RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
703  Tpetra::deep_copy (*B_rowMapNonConst, B_in);
704  B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
705  }
706  else {
707  B_rowMap = rcpFromRef (B_in);
708  }
709  }
710  else {
711  // Do an allocation here. If we need to optimize this later, we can have the matrix
712  // cache this.
713  ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
714  RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
715  B_rowMapNonConst->doImport(B_in, *exporter, ADD);
716  B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
717  }
718 
719  // If we have a nontrivial Export object, we must perform an
720  // Export. In that case, the local multiply result will go into
721  // the row Map multivector. We don't have to make a
722  // constant-stride version of R_in in this case, because we had to
723  // make a constant stride R_rowMap MV and do an Export anyway.
724  if (! exporter.is_null ()) {
725  if ( ! importer.is_null ())
726  X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
727  if (restrictedMode && !importer.is_null ())
728  localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
729  else
730  localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
731 
732  {
733  ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");
734 
735  // Do the Export operation.
736  R_in.doExport (*R_rowMap, *exporter, ADD);
737  }
738  }
739  else { // Don't do an Export: row Map and range Map are the same.
740 
741  if (restrictedMode)
742  if (overlapCommunicationAndComputation) {
743  localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
744  } else {
745  X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
746  localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
747  }
748  else {
749  if ( ! importer.is_null ())
750  X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
751  localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
752  }
753 
754  //
755  // If R_in does not have constant stride,
756  // then we can't let the kernel write directly to R_in.
757  // Instead, we have to use the cached row (== range)
758  // Map MV as temporary storage.
759  //
760  if (! R_in.isConstantStride () ) {
761  // We need to be sure to do a copy out in this case.
762  Tpetra::deep_copy (R_in, *R_rowMap);
763  }
764  }
765 
766  // If the range Map is a locally replicated Map, sum up
767  // contributions from each process.
768  if (R_is_replicated) {
769  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce Y");
770  R_in.reduce ();
771  }
772 }
773 
774 
775 
776 
777 
778  } // namespace Details
779 } // namespace Tpetra
780 
781 #endif // TPETRA_DETAILS_RESIDUAL_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
typename device_type::execution_space execution_space
The Kokkos execution space.
std::shared_ptr< local_multiply_op_type > getLocalMultiplyOperator() const
The local sparse matrix operator (a wrapper of getLocalMatrixDevice() that supports local matrix-vect...
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
Import< LocalOrdinal, GlobalOrdinal, Node > import_type
The Import specialization suitable for this CrsMatrix specialization.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
static bool debug()
Whether Tpetra is in debug mode.
static bool overlapCommunicationAndComputation()
Overlap communication and computation.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row....
static bool skipCopyAndPermuteIfPossible()
Skip copyAndPermute if possible.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
bool isDistributed() const
Whether this is a globally distributed object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
One or more distributed dense vectors.
void reduce()
Sum values of a locally replicated multivector across all processes.
size_t getLocalLength() const
Local number of rows on the calling process.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract interface for operators (e.g., matrices and preconditioners).
virtual void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const =0
Computes the operator-multivector application.
Implementation details of Tpetra.
void residual(const Operator< SC, LO, GO, NO > &A, const MultiVector< SC, LO, GO, NO > &X, const MultiVector< SC, LO, GO, NO > &B, MultiVector< SC, LO, GO, NO > &R)
Computes R = B - A * X.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
@ ADD
Sum new values.
@ INSERT
Insert new values that don't currently exist.
Functor for computing the residual.
Functor for computing R -= A_offRank*X_colmap.