Tpetra parallel linear algebra  Version of the Day
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
50 #include "Teuchos_TimeMonitor.hpp"
51 
52 //
53 // mfh 25 May 2016: Temporary fix for #393.
54 //
55 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
56 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
57 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
58 // use them in that case, either.
59 //
60 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
61 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
62 // thing for any GCC version.
63 //
64 #if defined(__CUDACC__)
65  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
66 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
67 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
68 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
69 
70 #elif defined(__GNUC__)
71 
72 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
73 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
74 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
75 
76 #else // some other compiler
77 
78  // Optimistically assume that other compilers aren't broken.
79 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
81 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
82 #endif // defined(__CUDACC__), defined(__GNUC__)
83 
84 
85 namespace Tpetra {
86 namespace Experimental {
87 
88 namespace Impl {
89 
// NOTE(review): Everything from this "#if 0" down to the matching
// "#endif // 0" is compiled out.  It is a disabled team-parallel
// (hierarchical-parallelism) variant of the single-vector BCRS
// mat-vec, kept for reference; the live implementation is the
// range-policy functor BcrsApplyNoTrans1VecFunctor below.
90 #if 0
91 template<class AlphaCoeffType,
92  class GraphType,
93  class MatrixValuesType,
94  class InVecType,
95  class BetaCoeffType,
96  class OutVecType>
97 class BcrsApplyNoTrans1VecTeamFunctor {
98 private:
// Compile-time sanity checks on the template arguments: the matrix
// values and the in/out vectors must be rank-1 Kokkos::Views with the
// expected const-ness (const input, nonconst output).
99  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
100  "MatrixValuesType must be a Kokkos::View.");
101  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
102  "OutVecType must be a Kokkos::View.");
103  static_assert (Kokkos::Impl::is_view<InVecType>::value,
104  "InVecType must be a Kokkos::View.");
105  static_assert (std::is_same<MatrixValuesType,
106  typename MatrixValuesType::const_type>::value,
107  "MatrixValuesType must be a const Kokkos::View.");
108  static_assert (std::is_same<OutVecType,
109  typename OutVecType::non_const_type>::value,
110  "OutVecType must be a nonconst Kokkos::View.");
111  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
112  "InVecType must be a const Kokkos::View.");
113  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
114  "MatrixValuesType must be a rank-1 Kokkos::View.");
115  static_assert (static_cast<int> (InVecType::rank) == 1,
116  "InVecType must be a rank-1 Kokkos::View.");
117  static_assert (static_cast<int> (OutVecType::rank) == 1,
118  "OutVecType must be a rank-1 Kokkos::View.");
119  typedef typename MatrixValuesType::non_const_value_type scalar_type;
120  typedef typename GraphType::device_type device_type;
121  typedef typename device_type::execution_space execution_space;
122  typedef typename execution_space::scratch_memory_space shmem_space;
123 
124 public:
// NOTE(review): The extraction dropped one or more original lines
// here; the typedef below presumably ends with the name
// "local_ordinal_type;" (compare the identical typedef in
// BcrsApplyNoTrans1VecFunctor) -- confirm against the original file.
126  typedef typename std::remove_const<typename GraphType::data_type>::type
129  typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,
130  execution_space,
131  local_ordinal_type> policy_type;
// Replace the input (multi)vector column; lets callers reuse one
// functor instance for each column of a multivector.
138  void setX (const InVecType& X) { X_ = X; }
139 
// Replace the output (multi)vector column (see setX above).
146  void setY (const OutVecType& Y) { Y_ = Y; }
147 
// Bytes of team-level scratch memory requested per team.
151  KOKKOS_INLINE_FUNCTION local_ordinal_type
152  getScratchSizePerTeam () const
153  {
154  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
155  // that have run-time sizes, like some of those in Stokhos.
156  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
157  return blockSize_ * sizeof (y_value_type);
158  }
159 
// Bytes of thread-level scratch memory requested per thread.
163  KOKKOS_INLINE_FUNCTION local_ordinal_type
164  getScratchSizePerThread () const
165  {
166  // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
167  // that have run-time sizes, like some of those in Stokhos.
168  typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
169  return blockSize_ * sizeof (y_value_type);
170  }
171 
172 private:
// Number of local mesh (block) rows = (row-map length) - 1, or 0 for
// an empty row map.
173  KOKKOS_INLINE_FUNCTION local_ordinal_type
174  getNumLclMeshRows () const
175  {
176  return ptr_.dimension_0 () == 0 ?
177  static_cast<local_ordinal_type> (0) :
178  static_cast<local_ordinal_type> (ptr_.dimension_0 () - 1);
179  }
180 
181  static constexpr local_ordinal_type defaultRowsPerTeam = 20;
182 
183 public:
// Number of teams for the TeamPolicy.  Hard-wired to 1 here; the
// commented-out code shows the intended rowsPerTeam-based formula.
185  local_ordinal_type getNumTeams () const {
186  return 1;
187  // const local_ordinal_type numLclMeshRows = getNumLclMeshRows ();
188  // return (numLclMeshRows + rowsPerTeam_ - 1) / rowsPerTeam_;
189  }
190 
// Constructor: captures alpha, the graph's row offsets and column
// indices, the block values, the block size, X, beta, Y, and the
// number of block rows each team should handle.
192  BcrsApplyNoTrans1VecTeamFunctor (const typename std::decay<AlphaCoeffType>::type& alpha,
193  const GraphType& graph,
194  const MatrixValuesType& val,
195  const local_ordinal_type blockSize,
196  const InVecType& X,
197  const typename std::decay<BetaCoeffType>::type& beta,
198  const OutVecType& Y,
199  const local_ordinal_type rowsPerTeam = defaultRowsPerTeam) :
200  alpha_ (alpha),
201  ptr_ (graph.row_map),
202  ind_ (graph.entries),
203  val_ (val),
204  blockSize_ (blockSize),
205  X_ (X),
206  beta_ (beta),
207  Y_ (Y),
208  rowsPerTeam_ (rowsPerTeam)
209  {
210  // std::ostringstream os;
211  // os << "Functor ctor: numLclMeshRows = " << getNumLclMeshRows () << std::endl;
212  // std::cerr << os.str ();
213  }
214 
// Per-team kernel: each team handles rowsPerTeam_ block rows; each
// thread computes Y(row) = beta*Y(row) + alpha * (sum of block GEMVs)
// for one block row, using scratch memory as temporary storage.
215  KOKKOS_INLINE_FUNCTION void
216  operator () (const typename policy_type::member_type& member) const
217  {
222  using Kokkos::Details::ArithTraits;
223  // I'm not writing 'using Kokkos::make_pair;' here, because that
224  // may break builds for users who make the mistake of putting
225  // 'using namespace std;' in the global namespace. Please don't
226  // ever do that! But just in case you do, I'll take this
227  // precaution.
228  using Kokkos::parallel_for;
229  using Kokkos::subview;
230  typedef local_ordinal_type LO;
231  typedef typename decltype (ptr_)::non_const_value_type offset_type;
232  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
233  shmem_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
234  shared_array_type;
235  typedef Kokkos::View<typename OutVecType::non_const_value_type*,
236  Kokkos::LayoutRight,
237  device_type,
238  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
239  out_little_vec_type;
240  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
241  Kokkos::LayoutRight,
242  device_type,
243  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
244  little_block_type;
245 
246  const LO leagueRank = member.league_rank();
247 
248  // This looks wrong! All the threads are writing into the same local storage.
249  // shared_array_type threadLocalMem =
250  // shared_array_type (member.thread_scratch (1), blockSize_);
251  shared_array_type threadLocalMem =
252  shared_array_type (member.thread_scratch (1), blockSize_ * rowsPerTeam_);
253 
254  // This looks wrong! All the threads are writing into the same local storage.
255  //out_little_vec_type Y_tlm (threadLocalMem.ptr_on_device (), blockSize_, 1);
256 
257  const LO numLclMeshRows = getNumLclMeshRows ();
258  const LO rowBeg = leagueRank * rowsPerTeam_;
259  const LO rowTmp = rowBeg + rowsPerTeam_;
260  const LO rowEnd = rowTmp < numLclMeshRows ? rowTmp : numLclMeshRows;
261 
262  // {
263  // std::ostringstream os;
264  // os << leagueRank << "," << member.team_rank () << ": "
265  // << rowBeg << "," << rowEnd << std::endl;
266  // std::cerr << os.str ();
267  // }
268 
269  // Each team takes rowsPerTeam_ (block) rows.
270  // Each thread in the team takes a single (block) row.
271  parallel_for (Kokkos::TeamThreadRange (member, rowBeg, rowEnd),
272  [&] (const LO& lclRow) {
273  // Each thread in the team gets its own temp storage.
274  out_little_vec_type Y_tlm (threadLocalMem.ptr_on_device () + blockSize_ * member.team_rank (), blockSize_);
275 
276  const offset_type Y_ptBeg = lclRow * blockSize_;
277  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
278  auto Y_cur =
279  subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
// Apply beta to the temporary: 0 -> fill with zero (skips reading Y,
// so NaN/Inf in Y are not propagated); 1 -> plain copy; else scale.
280  if (beta_ == ArithTraits<BetaCoeffType>::zero ()) {
281  FILL (Y_tlm, ArithTraits<BetaCoeffType>::zero ());
282  }
283  else if (beta_ == ArithTraits<BetaCoeffType>::one ()) {
284  COPY (Y_cur, Y_tlm);
285  }
286  else { // beta != 0 && beta != 1
287  COPY (Y_cur, Y_tlm);
288  SCAL (beta_, Y_tlm);
289  }
290 
291  if (alpha_ != ArithTraits<AlphaCoeffType>::zero ()) {
292  const offset_type blkBeg = ptr_[lclRow];
293  const offset_type blkEnd = ptr_[lclRow+1];
294  // Precompute to save integer math in the inner loop.
295  const offset_type bs2 = blockSize_ * blockSize_;
296  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
297  ++absBlkOff) {
298  little_block_type A_cur (val_.ptr_on_device () + absBlkOff * bs2,
299  blockSize_, blockSize_);
300  const offset_type X_blkCol = ind_[absBlkOff];
301  const offset_type X_ptBeg = X_blkCol * blockSize_;
302  const offset_type X_ptEnd = X_ptBeg + blockSize_;
303  auto X_cur =
304  subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
305  // Y_tlm += alpha*A_cur*X_cur
306  GEMV (alpha_, A_cur, X_cur, Y_tlm);
307  } // for each entry in current local block row of matrix
308  COPY (Y_tlm, Y_cur);
309  }
310  });
311  }
312 
313 private:
314  typename std::decay<AlphaCoeffType>::type alpha_;
315  typename GraphType::row_map_type::const_type ptr_;
316  typename GraphType::entries_type::const_type ind_;
317  MatrixValuesType val_;
318  local_ordinal_type blockSize_;
319  InVecType X_;
320  typename std::decay<BetaCoeffType>::type beta_;
321  OutVecType Y_;
322  local_ordinal_type rowsPerTeam_;
323 };
324 #endif // 0
325 
// Functor implementing the no-transpose block-CRS sparse mat-vec for
// ONE vector (one column of a multivector):
//   Y(row block) = beta * Y(row block) + alpha * sum_k A(row,k) * X(k block),
// parallelized with a RangePolicy over local block rows (one thread
// per block row, so each thread owns its slice of Y -- no scratch
// memory or atomics needed).
326 template<class AlphaCoeffType,
327  class GraphType,
328  class MatrixValuesType,
329  class InVecType,
330  class BetaCoeffType,
331  class OutVecType>
332 class BcrsApplyNoTrans1VecFunctor {
333 private:
// Compile-time sanity checks: values and vectors must be rank-1
// Kokkos::Views, with const input and nonconst output.
334  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
335  "MatrixValuesType must be a Kokkos::View.");
336  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
337  "OutVecType must be a Kokkos::View.");
338  static_assert (Kokkos::Impl::is_view<InVecType>::value,
339  "InVecType must be a Kokkos::View.");
340  static_assert (std::is_same<MatrixValuesType,
341  typename MatrixValuesType::const_type>::value,
342  "MatrixValuesType must be a const Kokkos::View.");
343  static_assert (std::is_same<OutVecType,
344  typename OutVecType::non_const_type>::value,
345  "OutVecType must be a nonconst Kokkos::View.");
346  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
347  "InVecType must be a const Kokkos::View.");
348  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
349  "MatrixValuesType must be a rank-1 Kokkos::View.");
350  static_assert (static_cast<int> (InVecType::rank) == 1,
351  "InVecType must be a rank-1 Kokkos::View.");
352  static_assert (static_cast<int> (OutVecType::rank) == 1,
353  "OutVecType must be a rank-1 Kokkos::View.");
354  typedef typename MatrixValuesType::non_const_value_type scalar_type;
355 
356 public:
357  typedef typename GraphType::device_type device_type;
358 
// Local ordinal type, taken (with const stripped) from the graph's
// column-index type.
360  typedef typename std::remove_const<typename GraphType::data_type>::type
361  local_ordinal_type;
// 1-D range over local block rows, with dynamic scheduling.
363  typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,
364  typename device_type::execution_space,
365  local_ordinal_type> policy_type;
// Swap in a new input vector (column); lets the caller reuse one
// functor instance per column of a multivector.
372  void setX (const InVecType& X) { X_ = X; }
373 
// Swap in a new output vector (column); see setX.
380  void setY (const OutVecType& Y) { Y_ = Y; }
381 
382  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
383  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
384 
// Constructor: captures alpha, the graph's row offsets (row_map) and
// column indices (entries), the packed block values, the block size,
// and the X / beta / Y operands.
386  BcrsApplyNoTrans1VecFunctor (const alpha_coeff_type& alpha,
387  const GraphType& graph,
388  const MatrixValuesType& val,
389  const local_ordinal_type blockSize,
390  const InVecType& X,
391  const beta_coeff_type& beta,
392  const OutVecType& Y) :
393  alpha_ (alpha),
394  ptr_ (graph.row_map),
395  ind_ (graph.entries),
396  val_ (val),
397  blockSize_ (blockSize),
398  X_ (X),
399  beta_ (beta),
400  Y_ (Y)
401  {}
402 
// Per-row kernel: computes one block row of Y in place.
403  KOKKOS_INLINE_FUNCTION void
404  operator () (const local_ordinal_type& lclRow) const
405  {
410  using Kokkos::Details::ArithTraits;
411  // I'm not writing 'using Kokkos::make_pair;' here, because that
412  // may break builds for users who make the mistake of putting
413  // 'using namespace std;' in the global namespace. Please don't
414  // ever do that! But just in case you do, I'll take this
415  // precaution.
416  using Kokkos::parallel_for;
417  using Kokkos::subview;
418  typedef typename decltype (ptr_)::non_const_value_type offset_type;
// Unmanaged rank-2 view used to reinterpret a bs x bs slice of the
// packed (rank-1) values array as a dense little block.
419  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
420  Kokkos::LayoutRight,
421  device_type,
422  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
423  little_block_type;
424 
425  const offset_type Y_ptBeg = lclRow * blockSize_;
426  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
427  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
428 
429  // This version of the code does not use temporary storage.
430  // Each thread writes to its own block of the target vector.
// beta == 0 fills with zero (never reads Y, so NaN/Inf in Y do not
// propagate); beta == 1 leaves Y as-is; otherwise scale in place.
431  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
432  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
433  }
434  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
435  SCAL (beta_, Y_cur);
436  }
437 
438  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
439  const offset_type blkBeg = ptr_[lclRow];
440  const offset_type blkEnd = ptr_[lclRow+1];
441  // Precompute to save integer math in the inner loop.
442  const offset_type bs2 = blockSize_ * blockSize_;
443  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
444  ++absBlkOff) {
445  little_block_type A_cur (val_.ptr_on_device () + absBlkOff * bs2,
446  blockSize_, blockSize_);
447  const offset_type X_blkCol = ind_[absBlkOff];
448  const offset_type X_ptBeg = X_blkCol * blockSize_;
449  const offset_type X_ptEnd = X_ptBeg + blockSize_;
450  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
451 
452  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
453  } // for each entry in current local block row of matrix
454  }
455  }
456 
457 private:
458  alpha_coeff_type alpha_;
459  typename GraphType::row_map_type::const_type ptr_;
460  typename GraphType::entries_type::const_type ind_;
461  MatrixValuesType val_;
462  local_ordinal_type blockSize_;
463  InVecType X_;
464  beta_coeff_type beta_;
465  OutVecType Y_;
466 };
467 
468 template<class AlphaCoeffType,
469  class GraphType,
470  class MatrixValuesType,
471  class InMultiVecType,
472  class BetaCoeffType,
473  class OutMultiVecType>
// Local (on-process) no-transpose block-CRS mat-vec over whole
// multivectors: Y = beta*Y + alpha*A*X, column by column.  Builds one
// single-vector functor, runs it on column 0, then reuses it via
// setX/setY for the remaining columns.  The "#if 0" branches select
// the disabled team-parallel variant; the live path uses the
// RangePolicy functor.
474 void
475 bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
476  const GraphType& graph,
477  const MatrixValuesType& val,
478  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
479  const InMultiVecType& X,
480  const BetaCoeffType& beta,
481  const OutMultiVecType& Y
482 #if 0
483  , const typename std::remove_const<typename GraphType::data_type>::type rowsPerTeam = 20
484 #endif // 0
485  )
486 {
// Compile-time checks: rank-1 packed values, rank-2 multivectors.
487  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
488  "MatrixValuesType must be a Kokkos::View.");
489  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
490  "OutMultiVecType must be a Kokkos::View.");
491  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
492  "InMultiVecType must be a Kokkos::View.");
493  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
494  "MatrixValuesType must be a rank-1 Kokkos::View.");
495  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
496  "OutMultiVecType must be a rank-2 Kokkos::View.");
497  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
498  "InMultiVecType must be a rank-2 Kokkos::View.");
499 
500  typedef typename MatrixValuesType::const_type matrix_values_type;
501  typedef typename OutMultiVecType::non_const_type out_multivec_type;
502  typedef typename InMultiVecType::const_type in_multivec_type;
503  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
504  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
505  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
506 
// Number of local block rows = (row-map length) - 1, or 0 if empty.
507  const LO numLocalMeshRows = graph.row_map.dimension_0 () == 0 ?
508  static_cast<LO> (0) :
509  static_cast<LO> (graph.row_map.dimension_0 () - 1);
510  const LO numVecs = Y.dimension_1 ();
511  if (numLocalMeshRows == 0 || numVecs == 0) {
512  return; // code below doesn't handle numVecs==0 correctly
513  }
514 
515  // These assignments avoid instantiating the functor extra times
516  // unnecessarily, e.g., for X const vs. nonconst. We only need the
517  // X const case, so only instantiate for that case.
518  in_multivec_type X_in = X;
519  out_multivec_type Y_out = Y;
520 
521  // The functor only knows how to handle one vector at a time, and it
522  // expects 1-D Views. Thus, we need to know the type of each column
523  // of X and Y.
524  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
525  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
526 #if 0
527  typedef BcrsApplyNoTrans1VecTeamFunctor<alpha_type, GraphType,
528  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
529 #else
530  typedef BcrsApplyNoTrans1VecFunctor<alpha_type, GraphType,
531  matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
532 #endif // 0
533  typedef typename functor_type::policy_type policy_type;
534 
535  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
536  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
537 #if 0
// Disabled team-parallel setup: one TeamPolicy league with per-team /
// per-thread scratch sized by the functor.
538  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0, rowsPerTeam);
539  const LO numTeams = functor.getNumTeams ();
540  policy_type policy (numTeams, Kokkos::AUTO ());
541  {
542  // KJ : hierarchy level of memory allocated e.g., cache (1),
543  // HBM (2), DDR (3), not used for now
544  const LO level = 1;
545  // KJ : for now provide two options for parallelizing (for vs. reduce)
546  const LO scratchSizePerTeam = functor.getScratchSizePerTeam (); // used for team parallel_red
547  const LO scratchSizePerThread = functor.getScratchSizePerThread (); // used for team parallel_for
548  policy =
549  policy.set_scratch_size (level,
550  Kokkos::PerTeam (scratchSizePerTeam),
551  Kokkos::PerThread (scratchSizePerThread));
552  }
553 #else
554  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
555  policy_type policy (0, numLocalMeshRows);
556 #endif // 0
557 
558  // Compute the first column of Y.
559  Kokkos::parallel_for (policy, functor);
560 
561  // Compute the remaining columns of Y.
562  for (LO j = 1; j < numVecs; ++j) {
563  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
564  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
565  functor.setX (X_j);
566  functor.setY (Y_j);
567  Kokkos::parallel_for (policy, functor);
568  }
569 }
570 
571 } // namespace Impl
572 
573 namespace { // (anonymous)
574 
575 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
576 // version that takes two Kokkos::View arguments).
577 template<class Scalar, class LO, class GO, class Node>
578 class GetLocalDiagCopy {
579 public:
580  typedef typename Node::device_type device_type;
581  typedef size_t diag_offset_type;
582  typedef Kokkos::View<const size_t*, device_type,
583  Kokkos::MemoryUnmanaged> diag_offsets_type;
584  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
585  typedef typename global_graph_type::local_graph_type local_graph_type;
586  typedef typename local_graph_type::row_map_type row_offsets_type;
587  typedef typename ::Tpetra::Experimental::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
588  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
589  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
590 
591  // Constructor
// diag:        output, rank-3 (row, i, j) storage for the diagonal blocks
// val:         packed block values of the matrix
// diagOffsets: per-row offset (relative to the row start) of the
//              diagonal block, precomputed by the caller
// ptr:         row offsets of the local graph
// blockSize:   dimension of each (square) block
592  GetLocalDiagCopy (const diag_type& diag,
593  const values_type& val,
594  const diag_offsets_type& diagOffsets,
595  const row_offsets_type& ptr,
596  const LO blockSize) :
597  diag_ (diag),
598  diagOffsets_ (diagOffsets),
599  ptr_ (ptr),
600  blockSize_ (blockSize),
601  offsetPerBlock_ (blockSize_*blockSize_),
602  val_(val)
603  {}
604 
// Per-row kernel: copy the diagonal block of local row lclRowInd out
// of the packed values array into diag_(lclRowInd, :, :).
605  KOKKOS_INLINE_FUNCTION void
606  operator() (const LO& lclRowInd) const
607  {
608  using Kokkos::ALL;
609 
610  // Get row offset
611  const size_t absOffset = ptr_[lclRowInd];
612 
613  // Get offset relative to start of row
614  const size_t relOffset = diagOffsets_[lclRowInd];
615 
616  // Get the total offset
// Block index -> scalar index: each block occupies
// offsetPerBlock_ = blockSize^2 contiguous scalars.
617  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
618 
619  // Get a view of the block. BCRS currently uses LayoutRight
620  // regardless of the device.
621  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
622  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
623  const_little_block_type;
624  const_little_block_type D_in (val_.ptr_on_device () + pointOffset,
625  blockSize_, blockSize_);
626  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
627  COPY (D_in, D_out);
628  }
629 
630  private:
631  diag_type diag_;
632  diag_offsets_type diagOffsets_;
633  row_offsets_type ptr_;
634  LO blockSize_;
635  LO offsetPerBlock_;
636  values_type val_;
637  };
638 } // namespace (anonymous)
639 
640  template<class Scalar, class LO, class GO, class Node>
641  std::ostream&
642  BlockCrsMatrix<Scalar, LO, GO, Node>::
643  markLocalErrorAndGetStream ()
644  {
645  * (this->localError_) = true;
646  if ((*errs_).is_null ()) {
647  *errs_ = Teuchos::rcp (new std::ostringstream ());
648  }
649  return **errs_;
650  }
651 
 // Default constructor: builds an empty, unusable matrix.  All RCP
 // members are allocated (pointing at null or empty objects) so that
 // other methods can dereference them safely.
 // NOTE(review): the extraction dropped the declaration lines here
 // (original lines 653-654, presumably
 // "BlockCrsMatrix<Scalar, LO, GO, Node>::BlockCrsMatrix () :")
 // -- confirm against the original file.
652  template<class Scalar, class LO, class GO, class Node>
655  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
656  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
657  blockSize_ (static_cast<LO> (0)),
658  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
659  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
660  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
661  offsetPerBlock_ (0),
662  localError_ (new bool (false)),
663  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
664  {
665  }
666 
 // Constructor taking a (sorted) mesh CrsGraph and the block size.
 // Derives the domain/range point maps from the graph's domain/range
 // mesh maps, mirrors the graph's row offsets and column indices to
 // host, and allocates (zero-initialized) storage for the block values.
 // NOTE(review): the extraction dropped the declaration lines here
 // (original lines 668-669, presumably
 // "BlockCrsMatrix<Scalar, LO, GO, Node>::"
 // "BlockCrsMatrix (const crs_graph_type& graph,")
 // -- confirm against the original file.
667  template<class Scalar, class LO, class GO, class Node>
670  const LO blockSize) :
671  dist_object_type (graph.getMap ()),
672  graph_ (graph),
673  rowMeshMap_ (* (graph.getRowMap ())),
674  blockSize_ (blockSize),
675  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
676  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
677  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
678  offsetPerBlock_ (blockSize * blockSize),
679  localError_ (new bool (false)),
680  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
681  {
682  TEUCHOS_TEST_FOR_EXCEPTION(
683  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
684  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
685  "rows (isSorted() is false). This class assumes sorted rows.");
686 
687  graphRCP_ = Teuchos::rcpFromRef(graph_);
688 
689  // Trick to test whether LO is nonpositive, without a compiler
690  // warning in case LO is unsigned (which is generally a bad idea
691  // anyway). I don't promise that the trick works, but it
692  // generally does with gcc at least, in my experience.
693  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
694  TEUCHOS_TEST_FOR_EXCEPTION(
695  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
696  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
697  " <= 0. The block size must be positive.");
698 
 // Expand the mesh-level domain/range maps into point maps (one point
 // per degree of freedom in each block).
699  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
700  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
701 
 // Deep-copy the graph's row offsets to a host mirror for host-side
 // access (ptrHost_).
702  {
703  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
704  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
705 
706  row_map_type ptr_d = graph.getLocalGraph ().row_map;
707  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
708  Kokkos::deep_copy (ptr_h_nc, ptr_d);
709  ptrHost_ = ptr_h_nc;
710  }
 // Likewise for the column indices (indHost_).
711  {
712  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
713  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
714 
715  entries_type ind_d = graph.getLocalGraph ().entries;
716  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
717  Kokkos::deep_copy (ind_h_nc, ind_d);
718  indHost_ = ind_h_nc;
719  }
720 
 // One blockSize x blockSize block of scalars per graph entry.
721  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
722  val_ = decltype (val_) ("val", numValEnt);
723  }
724 
 // Constructor taking a (sorted) mesh CrsGraph, explicit domain and
 // range POINT maps, and the block size.  Same as the two-argument
 // constructor except that the point maps are supplied by the caller
 // instead of being derived from the graph.
 // NOTE(review): the extraction dropped the declaration lines here
 // (original lines 726-727, presumably
 // "BlockCrsMatrix<Scalar, LO, GO, Node>::"
 // "BlockCrsMatrix (const crs_graph_type& graph,")
 // -- confirm against the original file.
725  template<class Scalar, class LO, class GO, class Node>
728  const map_type& domainPointMap,
729  const map_type& rangePointMap,
730  const LO blockSize) :
731  dist_object_type (graph.getMap ()),
732  graph_ (graph),
733  rowMeshMap_ (* (graph.getRowMap ())),
734  domainPointMap_ (domainPointMap),
735  rangePointMap_ (rangePointMap),
736  blockSize_ (blockSize),
737  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
738  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
739  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
740  offsetPerBlock_ (blockSize * blockSize),
741  localError_ (new bool (false)),
742  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
743  {
744  TEUCHOS_TEST_FOR_EXCEPTION(
745  ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
746  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
747  "rows (isSorted() is false). This class assumes sorted rows.");
748 
749  graphRCP_ = Teuchos::rcpFromRef(graph_);
750 
751  // Trick to test whether LO is nonpositive, without a compiler
752  // warning in case LO is unsigned (which is generally a bad idea
753  // anyway). I don't promise that the trick works, but it
754  // generally does with gcc at least, in my experience.
755  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
756  TEUCHOS_TEST_FOR_EXCEPTION(
757  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
758  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
759  " <= 0. The block size must be positive.");
760 
 // Deep-copy the graph's row offsets to a host mirror (ptrHost_).
761  {
762  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
763  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
764 
765  row_map_type ptr_d = graph.getLocalGraph ().row_map;
766  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
767  Kokkos::deep_copy (ptr_h_nc, ptr_d);
768  ptrHost_ = ptr_h_nc;
769  }
 // Likewise for the column indices (indHost_).
770  {
771  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
772  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
773 
774  entries_type ind_d = graph.getLocalGraph ().entries;
775  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
776  Kokkos::deep_copy (ind_h_nc, ind_d);
777  indHost_ = ind_h_nc;
778  }
779 
 // One blockSize x blockSize block of scalars per graph entry.
780  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
781  val_ = decltype (val_) ("val", numValEnt);
782  }
783 
 // Return the domain point map, wrapped in a new RCP.
 // NOTE(review): the extraction dropped the scope/name lines here
 // (original lines 786-787, presumably
 // "BlockCrsMatrix<Scalar, LO, GO, Node>::" / "getDomainMap () const")
 // -- confirm against the original file.
784  template<class Scalar, class LO, class GO, class Node>
785  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
788  { // Copy constructor of map_type does a shallow copy.
789  // We're only returning by RCP for backwards compatibility.
790  return Teuchos::rcp (new map_type (domainPointMap_));
791  }
792 
 // Return the range point map, wrapped in a new RCP.
 // NOTE(review): one class-scope line (original line 795) was elided
 // by the extraction -- confirm against the original file.
793  template<class Scalar, class LO, class GO, class Node>
794  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
796  getRangeMap () const
797  { // Copy constructor of map_type does a shallow copy.
798  // We're only returning by RCP for backwards compatibility.
799  return Teuchos::rcp (new map_type (rangePointMap_));
800  }
801 
 // Return the (mesh) row map, delegated to the graph.
 // NOTE(review): one class-scope line (original line 804) was elided
 // by the extraction -- confirm against the original file.
802  template<class Scalar, class LO, class GO, class Node>
803  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
805  getRowMap () const
806  {
807  return graph_.getRowMap();
808  }
809 
 // Return the (mesh) column map, delegated to the graph.
 // NOTE(review): one class-scope line (original line 812) was elided
 // by the extraction -- confirm against the original file.
810  template<class Scalar, class LO, class GO, class Node>
811  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
813  getColMap () const
814  {
815  return graph_.getColMap();
816  }
817 
 // Global number of (mesh) rows, delegated to the graph.
 // NOTE(review): the return-type/scope/name lines (original lines
 // 819-821, presumably "global_size_t" /
 // "BlockCrsMatrix<...>::" / "getGlobalNumRows () const") were elided
 // by the extraction -- confirm against the original file.
818  template<class Scalar, class LO, class GO, class Node>
822  {
823  return graph_.getGlobalNumRows();
824  }
825 
 // Local (on-process) number of (mesh) rows, delegated to the graph.
 // NOTE(review): the scope/name lines (original lines 828-829,
 // presumably "BlockCrsMatrix<...>::" / "getNodeNumRows () const")
 // were elided by the extraction -- confirm against the original file.
826  template<class Scalar, class LO, class GO, class Node>
827  size_t
830  {
831  return graph_.getNodeNumRows();
832  }
833 
 // Maximum number of entries in any local (mesh) row, delegated to
 // the graph.
 // NOTE(review): the scope/name lines (original lines 836-837,
 // presumably "BlockCrsMatrix<...>::" /
 // "getNodeMaxNumRowEntries () const") were elided by the extraction
 // -- confirm against the original file.
834  template<class Scalar, class LO, class GO, class Node>
835  size_t
838  {
839  return graph_.getNodeMaxNumRowEntries();
840  }
841 
842  template<class Scalar, class LO, class GO, class Node>
843  void
845  apply (const mv_type& X,
846  mv_type& Y,
847  Teuchos::ETransp mode,
848  Scalar alpha,
849  Scalar beta) const
850  {
852  TEUCHOS_TEST_FOR_EXCEPTION(
853  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
854  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
855  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
856  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
857 
858  BMV X_view;
859  BMV Y_view;
860  const LO blockSize = getBlockSize ();
861  try {
862  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
863  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
864  }
865  catch (std::invalid_argument& e) {
866  TEUCHOS_TEST_FOR_EXCEPTION(
867  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
868  "apply: Either the input MultiVector X or the output MultiVector Y "
869  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
870  "graph. BlockMultiVector's constructor threw the following exception: "
871  << e.what ());
872  }
873 
874  try {
875  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
876  // either that or mark fields of this class as 'mutable'. The
877  // problem is that applyBlock wants to do lazy initialization of
878  // temporary block multivectors.
879  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
880  } catch (std::invalid_argument& e) {
881  TEUCHOS_TEST_FOR_EXCEPTION(
882  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
883  "apply: The implementation method applyBlock complained about having "
884  "an invalid argument. It reported the following: " << e.what ());
885  } catch (std::logic_error& e) {
886  TEUCHOS_TEST_FOR_EXCEPTION(
887  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
888  "apply: The implementation method applyBlock complained about a "
889  "possible bug in its implementation. It reported the following: "
890  << e.what ());
891  } catch (std::exception& e) {
892  TEUCHOS_TEST_FOR_EXCEPTION(
893  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
894  "apply: The implementation method applyBlock threw an exception which "
895  "is neither std::invalid_argument nor std::logic_error, but is a "
896  "subclass of std::exception. It reported the following: "
897  << e.what ());
898  } catch (...) {
899  TEUCHOS_TEST_FOR_EXCEPTION(
900  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
901  "apply: The implementation method applyBlock threw an exception which "
902  "is not an instance of a subclass of std::exception. This probably "
903  "indicates a bug. Please report this to the Tpetra developers.");
904  }
905  }
906 
907  template<class Scalar, class LO, class GO, class Node>
908  void
912  Teuchos::ETransp mode,
913  const Scalar alpha,
914  const Scalar beta)
915  {
916  TEUCHOS_TEST_FOR_EXCEPTION(
917  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
918  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
919  "X and Y have different block sizes. X.getBlockSize() = "
920  << X.getBlockSize () << " != Y.getBlockSize() = "
921  << Y.getBlockSize () << ".");
922 
923  if (mode == Teuchos::NO_TRANS) {
924  applyBlockNoTrans (X, Y, alpha, beta);
925  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
926  applyBlockTrans (X, Y, mode, alpha, beta);
927  } else {
928  TEUCHOS_TEST_FOR_EXCEPTION(
929  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
930  "applyBlock: Invalid 'mode' argument. Valid values are "
931  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
932  }
933  }
934 
935  template<class Scalar, class LO, class GO, class Node>
936  void
938  setAllToScalar (const Scalar& alpha)
939  {
940 #ifdef HAVE_TPETRA_DEBUG
941  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::setAllToScalar: ";
942 #endif // HAVE_TPETRA_DEBUG
943 
944  if (this->template need_sync<device_type> ()) {
945  // If we need to sync to device, then the data were last
946  // modified on host. In that case, we should again modify them
947  // on host.
948 #ifdef HAVE_TPETRA_DEBUG
949  TEUCHOS_TEST_FOR_EXCEPTION
950  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
951  prefix << "The matrix's values need sync on both device and host.");
952 #endif // HAVE_TPETRA_DEBUG
953  this->template modify<Kokkos::HostSpace> ();
954  Kokkos::deep_copy (this->template getValues<Kokkos::HostSpace> (), alpha);
955  }
956  else if (this->template need_sync<Kokkos::HostSpace> ()) {
957  // If we need to sync to host, then the data were last modified
958  // on device. In that case, we should again modify them on
959  // device.
960 #ifdef HAVE_TPETRA_DEBUG
961  TEUCHOS_TEST_FOR_EXCEPTION
962  (this->template need_sync<device_type> (), std::runtime_error,
963  prefix << "The matrix's values need sync on both host and device.");
964 #endif // HAVE_TPETRA_DEBUG
965  this->template modify<device_type> ();
966  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
967  }
968  else { // neither host nor device marked as modified, so modify on device
969  this->template modify<device_type> ();
970  Kokkos::deep_copy (this->template getValues<device_type> (), alpha);
971  }
972  }
973 
974  template<class Scalar, class LO, class GO, class Node>
975  LO
977  replaceLocalValues (const LO localRowInd,
978  const LO colInds[],
979  const Scalar vals[],
980  const LO numColInds) const
981  {
982 #ifdef HAVE_TPETRA_DEBUG
983  const char prefix[] =
984  "Tpetra::Experimental::BlockCrsMatrix::replaceLocalValues: ";
985 #endif // HAVE_TPETRA_DEBUG
986 
987  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
988  // We modified no values, because the input local row index is
989  // invalid on the calling process. That may not be an error, if
990  // numColInds is zero anyway; it doesn't matter. This is the
991  // advantage of returning the number of valid indices.
992  return static_cast<LO> (0);
993  }
994  const impl_scalar_type* const vIn =
995  reinterpret_cast<const impl_scalar_type*> (vals);
996  const size_t absRowBlockOffset = ptrHost_[localRowInd];
997  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
998  const LO perBlockSize = this->offsetPerBlock ();
999  LO hint = 0; // Guess for the relative offset into the current row
1000  LO pointOffset = 0; // Current offset into input values
1001  LO validCount = 0; // number of valid column indices in colInds
1002 
1003 #ifdef HAVE_TPETRA_DEBUG
1004  TEUCHOS_TEST_FOR_EXCEPTION
1005  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1006  prefix << "The matrix's data were last modified on device, but have "
1007  "not been sync'd to host. Please sync to host (by calling "
1008  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1009  "method.");
1010 #endif // HAVE_TPETRA_DEBUG
1011 
1012  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1013  // version of the data always exists (no lazy allocation for host
1014  // data).
1016  auto vals_host_out =
1017  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1018  impl_scalar_type* vals_host_out_raw = vals_host_out.ptr_on_device ();
1019 
1020  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1021  const LO relBlockOffset =
1022  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1023  if (relBlockOffset != LINV) {
1024  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1025  // are stored contiguously, with no padding. "Contiguously"
1026  // means that all memory between the first and last entries
1027  // belongs to the block (no striding). "No padding" means
1028  // that getBlockSize() * getBlockSize() is exactly the number
1029  // of entries that the block uses. For another place where
1030  // this assumption is encoded, see sumIntoLocalValues.
1031 
1032  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1033  // little_block_type A_old =
1034  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1035  impl_scalar_type* const A_old =
1036  vals_host_out_raw + absBlockOffset * perBlockSize;
1037  // const_little_block_type A_new =
1038  // getConstLocalBlockFromInput (vIn, pointOffset);
1039  const impl_scalar_type* const A_new = vIn + pointOffset;
1040  // COPY (A_new, A_old);
1041  for (LO i = 0; i < perBlockSize; ++i) {
1042  A_old[i] = A_new[i];
1043  }
1044  hint = relBlockOffset + 1;
1045  ++validCount;
1046  }
1047  }
1048  return validCount;
1049  }
1050 
1051  template <class Scalar, class LO, class GO, class Node>
1052  void
1054  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1055  Kokkos::MemoryUnmanaged>& offsets) const
1056  {
1057  graph_.getLocalDiagOffsets (offsets);
1058  }
1059 
1060  template <class Scalar, class LO, class GO, class Node>
1061  void TPETRA_DEPRECATED
1063  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1064  {
1065  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1066  // this method in CrsGraph too, so don't call it (otherwise build
1067  // warnings will show up and annoy users). Instead, copy results
1068  // in and out, if the memory space requires it.
1069 
1070  const size_t lclNumRows = graph_.getNodeNumRows ();
1071  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1072  offsets.resize (lclNumRows);
1073  }
1074 
1075  // The input ArrayRCP must always be a host pointer. Thus, if
1076  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1077  // to write to that allocation directly as a Kokkos::View.
1078  typedef typename device_type::memory_space memory_space;
1079  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1080  // It is always syntactically correct to assign a raw host
1081  // pointer to a device View, so this code will compile correctly
1082  // even if this branch never runs.
1083  typedef Kokkos::View<size_t*, device_type,
1084  Kokkos::MemoryUnmanaged> output_type;
1085  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1086  graph_.getLocalDiagOffsets (offsetsOut);
1087  }
1088  else {
1089  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1090  graph_.getLocalDiagOffsets (offsetsTmp);
1091  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1092  Kokkos::MemoryUnmanaged> output_type;
1093  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1094  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1095  }
1096  }
1097 
1098  template <class Scalar, class LO, class GO, class Node>
1099  void
1103  const Kokkos::View<impl_scalar_type***, device_type,
1104  Kokkos::MemoryUnmanaged>& D_inv,
1105  const Scalar& omega,
1106  const ESweepDirection direction) const
1107  {
1108  using Kokkos::ALL;
1109  const impl_scalar_type zero =
1110  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1111  const impl_scalar_type one =
1112  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1113  const LO numLocalMeshRows =
1114  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1115  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1116 
1117  // If using (new) Kokkos, replace localMem with thread-local
1118  // memory. Note that for larger block sizes, this will affect the
1119  // two-level parallelization. Look to Stokhos for best practice
1120  // on making this fast for GPUs.
1121  const LO blockSize = getBlockSize ();
1122  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1123  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1124  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1125 
1126  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1127  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1128  if (direction == Forward) {
1129  rowBegin = 1;
1130  rowEnd = numLocalMeshRows+1;
1131  rowStride = 1;
1132  }
1133  else if (direction == Backward) {
1134  rowBegin = numLocalMeshRows;
1135  rowEnd = 0;
1136  rowStride = -1;
1137  }
1138  else if (direction == Symmetric) {
1139  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1140  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1141  return;
1142  }
1143 
1144  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1145  const Scalar minus_omega = -omega;
1146 
1147  if (numVecs == 1) {
1148  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1149  const LO actlRow = lclRow - 1;
1150 
1151  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1152  COPY (B_cur, X_lcl);
1153  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1154 
1155  const size_t meshBeg = ptrHost_[actlRow];
1156  const size_t meshEnd = ptrHost_[actlRow+1];
1157  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1158  const LO meshCol = indHost_[absBlkOff];
1159  const_little_block_type A_cur =
1160  getConstLocalBlockFromAbsOffset (absBlkOff);
1161  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1162 
1163  // X_lcl += alpha*A_cur*X_cur
1164  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1165  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1166  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1167  } // for each entry in the current local row of the matrix
1168 
1169  // NOTE (mfh 20 Jan 2016) The two input Views here are
1170  // unmanaged already, so we don't have to take unmanaged
1171  // subviews first.
1172  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1173  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1174  FILL (X_update, zero);
1175  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1176  } // for each local row of the matrix
1177  }
1178  else {
1179  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1180  for (LO j = 0; j < numVecs; ++j) {
1181  LO actlRow = lclRow-1;
1182 
1183  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1184  COPY (B_cur, X_lcl);
1185  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1186 
1187  const size_t meshBeg = ptrHost_[actlRow];
1188  const size_t meshEnd = ptrHost_[actlRow+1];
1189  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1190  const LO meshCol = indHost_[absBlkOff];
1191  const_little_block_type A_cur =
1192  getConstLocalBlockFromAbsOffset (absBlkOff);
1193  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1194 
1195  // X_lcl += alpha*A_cur*X_cur
1196  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1197  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1198  } // for each entry in the current local row of the matrx
1199 
1200  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1201  auto X_update = X.getLocalBlock (actlRow, j);
1202  FILL (X_update, zero);
1203  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1204  } // for each entry in the current local row of the matrix
1205  } // for each local row of the matrix
1206  }
1207  }
1208 
1209  template <class Scalar, class LO, class GO, class Node>
1210  void
1215  const Scalar& dampingFactor,
1216  const ESweepDirection direction,
1217  const int numSweeps,
1218  const bool zeroInitialGuess) const
1219  {
1220  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1221  // interface for block Gauss-Seidel.
1222  TEUCHOS_TEST_FOR_EXCEPTION(
1223  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1224  "gaussSeidelCopy: Not implemented.");
1225  }
1226 
1227  template <class Scalar, class LO, class GO, class Node>
1228  void
1233  const Teuchos::ArrayView<LO>& rowIndices,
1234  const Scalar& dampingFactor,
1235  const ESweepDirection direction,
1236  const int numSweeps,
1237  const bool zeroInitialGuess) const
1238  {
1239  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1240  // interface for block Gauss-Seidel.
1241  TEUCHOS_TEST_FOR_EXCEPTION(
1242  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1243  "reorderedGaussSeidelCopy: Not implemented.");
1244  }
1245 
1246  template <class Scalar, class LO, class GO, class Node>
1247  void TPETRA_DEPRECATED
1250  const Teuchos::ArrayView<const size_t>& offsets) const
1251  {
1252  using Teuchos::ArrayRCP;
1253  using Teuchos::ArrayView;
1254  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1255 
1256  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1257  const LO* columnIndices;
1258  Scalar* vals;
1259  LO numColumns;
1260  Teuchos::Array<LO> cols(1);
1261 
1262  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1263  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1264  for (size_t i = 0; i < myNumRows; ++i) {
1265  cols[0] = i;
1266  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1267  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1268  }
1269  else {
1270  getLocalRowView (i, columnIndices, vals, numColumns);
1271  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1272  }
1273  }
1274  }
1275 
1276 
1277  template <class Scalar, class LO, class GO, class Node>
1278  void
1280  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1281  Kokkos::MemoryUnmanaged>& diag,
1282  const Kokkos::View<const size_t*, device_type,
1283  Kokkos::MemoryUnmanaged>& offsets) const
1284  {
1285  using Kokkos::parallel_for;
1286  typedef typename device_type::execution_space execution_space;
1287  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1288 
1289  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1290  const LO blockSize = this->getBlockSize ();
1291  TEUCHOS_TEST_FOR_EXCEPTION
1292  (static_cast<LO> (diag.dimension_0 ()) < lclNumMeshRows ||
1293  static_cast<LO> (diag.dimension_1 ()) < blockSize ||
1294  static_cast<LO> (diag.dimension_2 ()) < blockSize,
1295  std::invalid_argument, prefix <<
1296  "The input Kokkos::View is not big enough to hold all the data.");
1297  TEUCHOS_TEST_FOR_EXCEPTION
1298  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1299  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1300  "diagonal blocks " << lclNumMeshRows << ".");
1301 
1302 #ifdef HAVE_TPETRA_DEBUG
1303  TEUCHOS_TEST_FOR_EXCEPTION
1304  (this->template need_sync<device_type> (), std::runtime_error,
1305  prefix << "The matrix's data were last modified on host, but have "
1306  "not been sync'd to device. Please sync to device (by calling "
1307  "sync<device_type>() on this matrix) before calling this method.");
1308 #endif // HAVE_TPETRA_DEBUG
1309 
1310  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1311  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1312 
1313  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1314  // we reserve the right to do lazy allocation of device data. (We
1315  // don't plan to do lazy allocation for host data; the host
1316  // version of the data always exists.)
1318  auto vals_dev =
1319  const_cast<this_type*> (this)->template getValues<device_type> ();
1320 
1321  parallel_for (policy_type (0, lclNumMeshRows),
1322  functor_type (diag, vals_dev, offsets,
1323  graph_.getLocalGraph ().row_map, blockSize_));
1324  }
1325 
1326 
1327  template <class Scalar, class LO, class GO, class Node>
1328  void
1330  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1331  Kokkos::MemoryUnmanaged>& diag,
1332  const Teuchos::ArrayView<const size_t>& offsets) const
1333  {
1334  using Kokkos::ALL;
1335  using Kokkos::parallel_for;
1336  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1337  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1338 
1339  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1340  const LO blockSize = this->getBlockSize ();
1341  TEUCHOS_TEST_FOR_EXCEPTION
1342  (static_cast<LO> (diag.dimension_0 ()) < lclNumMeshRows ||
1343  static_cast<LO> (diag.dimension_1 ()) < blockSize ||
1344  static_cast<LO> (diag.dimension_2 ()) < blockSize,
1345  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1346  "The input Kokkos::View is not big enough to hold all the data.");
1347  TEUCHOS_TEST_FOR_EXCEPTION
1348  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1349  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1350  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1351  "blocks " << lclNumMeshRows << ".");
1352 
1353  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1354  // quite made everything work with CUDA yet.
1355  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1356  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1357  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1358  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1359  COPY (D_in, D_out);
1360  });
1361  }
1362 
1363 
1364  template<class Scalar, class LO, class GO, class Node>
1365  LO
1367  absMaxLocalValues (const LO localRowInd,
1368  const LO colInds[],
1369  const Scalar vals[],
1370  const LO numColInds) const
1371  {
1372  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1373  // We modified no values, because the input local row index is
1374  // invalid on the calling process. That may not be an error, if
1375  // numColInds is zero anyway; it doesn't matter. This is the
1376  // advantage of returning the number of valid indices.
1377  return static_cast<LO> (0);
1378  }
1379  const impl_scalar_type* const vIn =
1380  reinterpret_cast<const impl_scalar_type*> (vals);
1381  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1382  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1383  const LO perBlockSize = this->offsetPerBlock ();
1384  LO hint = 0; // Guess for the relative offset into the current row
1385  LO pointOffset = 0; // Current offset into input values
1386  LO validCount = 0; // number of valid column indices in colInds
1387 
1388  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1389  const LO relBlockOffset =
1390  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1391  if (relBlockOffset != LINV) {
1392  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1393  little_block_type A_old =
1394  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1395  const_little_block_type A_new =
1396  getConstLocalBlockFromInput (vIn, pointOffset);
1397 
1398  Impl::absMax (A_old, A_new);
1399  hint = relBlockOffset + 1;
1400  ++validCount;
1401  }
1402  }
1403  return validCount;
1404  }
1405 
1406 
1407  template<class Scalar, class LO, class GO, class Node>
1408  LO
1410  sumIntoLocalValues (const LO localRowInd,
1411  const LO colInds[],
1412  const Scalar vals[],
1413  const LO numColInds) const
1414  {
1415 #ifdef HAVE_TPETRA_DEBUG
1416  const char prefix[] =
1417  "Tpetra::Experimental::BlockCrsMatrix::sumIntoLocalValues: ";
1418 #endif // HAVE_TPETRA_DEBUG
1419 
1420  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1421  // We modified no values, because the input local row index is
1422  // invalid on the calling process. That may not be an error, if
1423  // numColInds is zero anyway; it doesn't matter. This is the
1424  // advantage of returning the number of valid indices.
1425  return static_cast<LO> (0);
1426  }
1427  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1428  const impl_scalar_type* const vIn =
1429  reinterpret_cast<const impl_scalar_type*> (vals);
1430  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1431  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1432  const LO perBlockSize = this->offsetPerBlock ();
1433  LO hint = 0; // Guess for the relative offset into the current row
1434  LO pointOffset = 0; // Current offset into input values
1435  LO validCount = 0; // number of valid column indices in colInds
1436 
1437 #ifdef HAVE_TPETRA_DEBUG
1438  TEUCHOS_TEST_FOR_EXCEPTION
1439  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1440  prefix << "The matrix's data were last modified on device, but have not "
1441  "been sync'd to host. Please sync to host (by calling "
1442  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1443 #endif // HAVE_TPETRA_DEBUG
1444 
1445  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1446  // version of the data always exists (no lazy allocation for host
1447  // data).
1449  auto vals_host_out =
1450  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1451  impl_scalar_type* vals_host_out_raw = vals_host_out.ptr_on_device ();
1452 
1453  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1454  const LO relBlockOffset =
1455  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1456  if (relBlockOffset != LINV) {
1457  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1458  // are stored contiguously, with no padding. "Contiguously"
1459  // means that all memory between the first and last entries
1460  // belongs to the block (no striding). "No padding" means
1461  // that getBlockSize() * getBlockSize() is exactly the number
1462  // of entries that the block uses. For another place where
1463  // this assumption is encoded, see replaceLocalValues.
1464 
1465  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1466  // little_block_type A_old =
1467  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1468  impl_scalar_type* const A_old =
1469  vals_host_out_raw + absBlockOffset * perBlockSize;
1470  // const_little_block_type A_new =
1471  // getConstLocalBlockFromInput (vIn, pointOffset);
1472  const impl_scalar_type* const A_new = vIn + pointOffset;
1473  // AXPY (ONE, A_new, A_old);
1474  for (LO i = 0; i < perBlockSize; ++i) {
1475  A_old[i] += A_new[i];
1476  }
1477  hint = relBlockOffset + 1;
1478  ++validCount;
1479  }
1480  }
1481  return validCount;
1482  }
1483 
1484  template<class Scalar, class LO, class GO, class Node>
1485  LO
1487  getLocalRowView (const LO localRowInd,
1488  const LO*& colInds,
1489  Scalar*& vals,
1490  LO& numInds) const
1491  {
1492 #ifdef HAVE_TPETRA_DEBUG
1493  const char prefix[] =
1494  "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: ";
1495 #endif // HAVE_TPETRA_DEBUG
1496 
1497  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1498  colInds = NULL;
1499  vals = NULL;
1500  numInds = 0;
1501  return Teuchos::OrdinalTraits<LO>::invalid ();
1502  }
1503  else {
1504  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1505  colInds = indHost_.ptr_on_device () + absBlockOffsetStart;
1506 
1507 #ifdef HAVE_TPETRA_DEBUG
1508  TEUCHOS_TEST_FOR_EXCEPTION
1509  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
1510  prefix << "The matrix's data were last modified on device, but have "
1511  "not been sync'd to host. Please sync to host (by calling "
1512  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1513  "method.");
1514 #endif // HAVE_TPETRA_DEBUG
1515 
1516  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
1517  // version of the data always exists (no lazy allocation for host
1518  // data).
1520  auto vals_host_out =
1521  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
1522  impl_scalar_type* vals_host_out_raw = vals_host_out.ptr_on_device ();
1523  impl_scalar_type* const vOut = vals_host_out_raw +
1524  absBlockOffsetStart * offsetPerBlock ();
1525  vals = reinterpret_cast<Scalar*> (vOut);
1526 
1527  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1528  return 0; // indicates no error
1529  }
1530  }
1531 
1532  template<class Scalar, class LO, class GO, class Node>
1533  void
1535  getLocalRowCopy (LO LocalRow,
1536  const Teuchos::ArrayView<LO>& Indices,
1537  const Teuchos::ArrayView<Scalar>& Values,
1538  size_t &NumEntries) const
1539  {
1540  const LO *colInds;
1541  Scalar *vals;
1542  LO numInds;
1543  getLocalRowView(LocalRow,colInds,vals,numInds);
1544  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1545  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1546  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1547  << numInds << " row entries");
1548  }
1549  for (LO i=0; i<numInds; ++i) {
1550  Indices[i] = colInds[i];
1551  }
1552  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1553  Values[i] = vals[i];
1554  }
1555  NumEntries = numInds;
1556  }
1557 
1558  template<class Scalar, class LO, class GO, class Node>
1559  LO
1561  getLocalRowOffsets (const LO localRowInd,
1562  ptrdiff_t offsets[],
1563  const LO colInds[],
1564  const LO numColInds) const
1565  {
1566  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1567  // We got no offsets, because the input local row index is
1568  // invalid on the calling process. That may not be an error, if
1569  // numColInds is zero anyway; it doesn't matter. This is the
1570  // advantage of returning the number of valid indices.
1571  return static_cast<LO> (0);
1572  }
1573 
1574  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1575  LO hint = 0; // Guess for the relative offset into the current row
1576  LO validCount = 0; // number of valid column indices in colInds
1577 
1578  for (LO k = 0; k < numColInds; ++k) {
1579  const LO relBlockOffset =
1580  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1581  if (relBlockOffset != LINV) {
1582  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1583  hint = relBlockOffset + 1;
1584  ++validCount;
1585  }
1586  }
1587  return validCount;
1588  }
1589 
1590 
1591  template<class Scalar, class LO, class GO, class Node>
1592  LO
1594  replaceLocalValuesByOffsets (const LO localRowInd,
1595  const ptrdiff_t offsets[],
1596  const Scalar vals[],
1597  const LO numOffsets) const
1598  {
1599  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1600  // We modified no values, because the input local row index is
1601  // invalid on the calling process. That may not be an error, if
1602  // numColInds is zero anyway; it doesn't matter. This is the
1603  // advantage of returning the number of valid indices.
1604  return static_cast<LO> (0);
1605  }
1606  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1607 
1608  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1609  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1610  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1611  size_t pointOffset = 0; // Current offset into input values
1612  LO validCount = 0; // number of valid offsets
1613 
1614  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1615  const size_t relBlockOffset = offsets[k];
1616  if (relBlockOffset != STINV) {
1617  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1618  little_block_type A_old =
1619  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1620  const_little_block_type A_new =
1621  getConstLocalBlockFromInput (vIn, pointOffset);
1622  COPY (A_new, A_old);
1623  ++validCount;
1624  }
1625  }
1626  return validCount;
1627  }
1628 
1629 
1630  template<class Scalar, class LO, class GO, class Node>
1631  LO
1633  absMaxLocalValuesByOffsets (const LO localRowInd,
1634  const ptrdiff_t offsets[],
1635  const Scalar vals[],
1636  const LO numOffsets) const
1637  {
1638  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1639  // We modified no values, because the input local row index is
1640  // invalid on the calling process. That may not be an error, if
1641  // numColInds is zero anyway; it doesn't matter. This is the
1642  // advantage of returning the number of valid indices.
1643  return static_cast<LO> (0);
1644  }
1645  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1646 
1647  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1648  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1649  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1650  size_t pointOffset = 0; // Current offset into input values
1651  LO validCount = 0; // number of valid offsets
1652 
1653  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1654  const size_t relBlockOffset = offsets[k];
1655  if (relBlockOffset != STINV) {
1656  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1657  little_block_type A_old =
1658  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1659  const_little_block_type A_new =
1660  getConstLocalBlockFromInput (vIn, pointOffset);
1661  Impl::absMax (A_old, A_new);
1662  ++validCount;
1663  }
1664  }
1665  return validCount;
1666  }
1667 
1668 
1669  template<class Scalar, class LO, class GO, class Node>
1670  LO
1672  sumIntoLocalValuesByOffsets (const LO localRowInd,
1673  const ptrdiff_t offsets[],
1674  const Scalar vals[],
1675  const LO numOffsets) const
1676  {
1677  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1678  // We modified no values, because the input local row index is
1679  // invalid on the calling process. That may not be an error, if
1680  // numColInds is zero anyway; it doesn't matter. This is the
1681  // advantage of returning the number of valid indices.
1682  return static_cast<LO> (0);
1683  }
1684  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1685  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1686 
1687  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1688  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1689  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1690  size_t pointOffset = 0; // Current offset into input values
1691  LO validCount = 0; // number of valid offsets
1692 
1693  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1694  const size_t relBlockOffset = offsets[k];
1695  if (relBlockOffset != STINV) {
1696  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1697  little_block_type A_old =
1698  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1699  const_little_block_type A_new =
1700  getConstLocalBlockFromInput (vIn, pointOffset);
1701  //A_old.update (ONE, A_new);
1702  AXPY (ONE, A_new, A_old);
1703  ++validCount;
1704  }
1705  }
1706  return validCount;
1707  }
1708 
1709 
1710  template<class Scalar, class LO, class GO, class Node>
1711  size_t
1713  getNumEntriesInLocalRow (const LO localRowInd) const
1714  {
1715  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1716  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1717  return static_cast<LO> (0); // the calling process doesn't have that row
1718  } else {
1719  return static_cast<LO> (numEntInGraph);
1720  }
1721  }
1722 
// NOTE(review): the doxygen extraction dropped original lines
// 1725-1727 here (the qualified method name and the X/Y multivector
// parameters).  Judging from the surviving signature tail and the
// exception text, this is the (conjugate-)transpose block-apply path.
1723  template<class Scalar, class LO, class GO, class Node>
1724  void
1728  const Teuchos::ETransp mode,
1729  const Scalar alpha,
1730  const Scalar beta)
1731  {
// Every argument is deliberately ignored; the casts silence
// unused-parameter warnings until this mode is implemented.
1732  (void) X;
1733  (void) Y;
1734  (void) mode;
1735  (void) alpha;
1736  (void) beta;
1737 
// Unconditionally throw: transpose and conjugate-transpose apply are
// not implemented for BlockCrsMatrix.
1738  TEUCHOS_TEST_FOR_EXCEPTION(
1739  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
1740  "transpose and conjugate transpose modes are not yet implemented.");
1741  }
1742 
1743  template<class Scalar, class LO, class GO, class Node>
1744  void
1748  const Scalar alpha,
1749  const Scalar beta)
1750  {
1751  using Teuchos::RCP;
1752  using Teuchos::rcp;
1753  typedef Tpetra::Import<LO, GO, Node> import_type;
1754  typedef Tpetra::Export<LO, GO, Node> export_type;
1755  const Scalar zero = STS::zero ();
1756  const Scalar one = STS::one ();
1757  RCP<const import_type> import = graph_.getImporter ();
1758  // "export" is a reserved C++ keyword, so we can't use it.
1759  RCP<const export_type> theExport = graph_.getExporter ();
1760 
1761  // FIXME (mfh 20 May 2014) X.mv_ and Y.mv_ requires a friend
1762  // declaration, which is useful only for debugging.
1763  TEUCHOS_TEST_FOR_EXCEPTION(
1764  X.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1765  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1766  "The input BlockMultiVector X has deep copy semantics, "
1767  "not view semantics (as it should).");
1768  TEUCHOS_TEST_FOR_EXCEPTION(
1769  Y.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
1770  "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
1771  "The output BlockMultiVector Y has deep copy semantics, "
1772  "not view semantics (as it should).");
1773 
1774  if (alpha == zero) {
1775  if (beta == zero) {
1776  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1777  } else if (beta != one) {
1778  Y.scale (beta);
1779  }
1780  } else { // alpha != 0
1781  const BMV* X_colMap = NULL;
1782  if (import.is_null ()) {
1783  try {
1784  X_colMap = &X;
1785  } catch (std::exception& e) {
1786  TEUCHOS_TEST_FOR_EXCEPTION(
1787  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1788  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1789  "operator= threw an exception: " << std::endl << e.what ());
1790  }
1791  } else {
1792  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1793  // Y_rowMap_ below. This lets us do lazy initialization
1794  // correctly with view semantics of BlockCrsMatrix. All views
1795  // of this BlockCrsMatrix have the same outer pointer. That
1796  // way, we can set the inner pointer in one view, and all
1797  // other views will see it.
1798  if ((*X_colMap_).is_null () ||
1799  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1800  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1801  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1802  static_cast<LO> (X.getNumVectors ())));
1803  }
1804 #ifdef HAVE_TPETRA_BCRS_DO_POINT_IMPORT
1805  if (pointImporter_->is_null ()) {
1806  // The Import ctor needs RCPs. Map's copy ctor does a shallow copy, so
1807  // these are small operations.
1808  const auto domainPointMap = rcp (new typename BMV::map_type (domainPointMap_));
1809  const auto colPointMap = rcp (new typename BMV::map_type (
1810  BMV::makePointMap (*graph_.getColMap(),
1811  blockSize_)));
1812  *pointImporter_ = rcp (new typename crs_graph_type::import_type (
1813  domainPointMap, colPointMap));
1814  }
1815  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1816  **pointImporter_,
1817  Tpetra::REPLACE);
1818 #else
1819  (**X_colMap_).doImport (X, *import, Tpetra::REPLACE);
1820 #endif
1821  try {
1822  X_colMap = &(**X_colMap_);
1823  } catch (std::exception& e) {
1824  TEUCHOS_TEST_FOR_EXCEPTION(
1825  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1826  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1827  "operator= threw an exception: " << std::endl << e.what ());
1828  }
1829  }
1830 
1831  BMV* Y_rowMap = NULL;
1832  if (theExport.is_null ()) {
1833  Y_rowMap = &Y;
1834  } else if ((*Y_rowMap_).is_null () ||
1835  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1836  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1837  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1838  static_cast<LO> (X.getNumVectors ())));
1839  try {
1840  Y_rowMap = &(**Y_rowMap_);
1841  } catch (std::exception& e) {
1842  TEUCHOS_TEST_FOR_EXCEPTION(
1843  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1844  "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
1845  "operator= threw an exception: " << std::endl << e.what ());
1846  }
1847  }
1848 
1849  try {
1850  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1851  }
1852  catch (std::exception& e) {
1853  TEUCHOS_TEST_FOR_EXCEPTION
1854  (true, std::runtime_error, "Tpetra::Experimental::BlockCrsMatrix::"
1855  "applyBlockNoTrans: localApplyBlockNoTrans threw an exception: "
1856  << e.what ());
1857  }
1858  catch (...) {
1859  TEUCHOS_TEST_FOR_EXCEPTION
1860  (true, std::runtime_error, "Tpetra::Experimental::BlockCrsMatrix::"
1861  "applyBlockNoTrans: localApplyBlockNoTrans threw some exception "
1862  "that is not a subclass of std::exception.");
1863  }
1864 
1865  if (! theExport.is_null ()) {
1866  Y.doExport (*Y_rowMap, *theExport, Tpetra::REPLACE);
1867  }
1868  }
1869  }
1870 
// Local (no-communication) part of the block sparse matrix-vector
// multiply: Y := beta*Y + alpha*A*X on this process's rows, delegated
// to the bcrsLocalApplyNoTrans kernel.
// NOTE(review): the extraction dropped original lines 1873-1875 here
// (the qualified method name and the X/Y BlockMultiVector parameters).
1871  template<class Scalar, class LO, class GO, class Node>
1872  void
1876  const Scalar alpha,
1877  const Scalar beta)
1878  {
1879  using Tpetra::Experimental::Impl::bcrsLocalApplyNoTrans;
1880 
// Convert Scalar coefficients to the internal (device) scalar type.
1881  const impl_scalar_type alpha_impl = alpha;
1882  const auto graph = this->graph_.getLocalGraph ();
1883  const impl_scalar_type beta_impl = beta;
1884  const LO blockSize = this->getBlockSize ();
1885 
// Work on the underlying (point) MultiVector views; mark Y modified
// on device so later syncs see the kernel's output.
1886  auto X_mv = X.getMultiVectorView ();
1887  auto Y_mv = Y.getMultiVectorView ();
1888  Y_mv.template modify<device_type> ();
1889 
1890  auto X_lcl = X_mv.template getLocalView<device_type> ();
1891  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1892  auto val = this->val_.template view<device_type> ();
1893 
// Invoke the local kernel on the device views.
1894  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1895  beta_impl, Y_lcl);
1896  }
1897 
// Find the offset of colIndexToFind within the given local row,
// relative to the start of that row, or OrdinalTraits<LO>::invalid()
// if the column is not present.  "hint" is a guess for the offset
// (e.g., from a previous search), checked in O(1) before searching.
// NOTE(review): the extraction dropped original line 1900 here (the
// BlockCrsMatrix<...>:: qualifier of the method name).
1898  template<class Scalar, class LO, class GO, class Node>
1899  LO
1901  findRelOffsetOfColumnIndex (const LO localRowIndex,
1902  const LO colIndexToFind,
1903  const LO hint) const
1904  {
// Bounds of the row's entries in the packed index array; the caller
// must pass a locally owned row (ptrHost_ is indexed unchecked).
1905  const size_t absStartOffset = ptrHost_[localRowIndex];
1906  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1907  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1908  // Amortize pointer arithmetic over the search loop.
1909  const LO* const curInd = indHost_.ptr_on_device () + absStartOffset;
1910 
1911  // If the hint was correct, then the hint is the offset to return.
1912  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1913  // Always return the offset relative to the current row.
1914  return hint;
1915  }
1916 
1917  // The hint was wrong, so we must search for the given column
1918  // index in the column indices for the given row.
1919  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1920 
1921  // We require that the graph have sorted rows. However, binary
1922  // search only pays if the current row is longer than a certain
1923  // amount. We set this to 32, but you might want to tune this.
1924  const LO maxNumEntriesForLinearSearch = 32;
1925  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1926  // Use binary search. It would probably be better for us to
1927  // roll this loop by hand. If we wrote it right, a smart
1928  // compiler could perhaps use conditional loads and avoid
1929  // branches (according to Jed Brown on May 2014).
1930  const LO* beg = curInd;
1931  const LO* end = curInd + numEntriesInRow;
1932  std::pair<const LO*, const LO*> p =
1933  std::equal_range (beg, end, colIndexToFind);
// equal_range returns an empty range iff the value is absent.
1934  if (p.first != p.second) {
1935  // offset is relative to the current row
1936  relOffset = static_cast<LO> (p.first - beg);
1937  }
1938  }
1939  else { // use linear search
1940  for (LO k = 0; k < numEntriesInRow; ++k) {
1941  if (colIndexToFind == curInd[k]) {
1942  relOffset = k; // offset is relative to the current row
1943  break;
1944  }
1945  }
1946  }
1947 
1948  return relOffset;
1949  }
1950 
// Return the cached number of scalar entries that each block occupies
// in the packed values array.
// NOTE(review): the extraction dropped original line 1953 here (the
// BlockCrsMatrix<...>:: qualifier of the method name).
1951  template<class Scalar, class LO, class GO, class Node>
1952  LO
1954  offsetPerBlock () const
1955  {
1956  return offsetPerBlock_;
1957  }
1958 
// Build a read-only blockSize_ x blockSize_ row-major view of the
// block starting at val + pointOffset.  No bounds checking is done.
// NOTE(review): the extraction dropped original lines 1960-1962 here
// (the return type, the qualified method name, and the raw-pointer
// "val" parameter declaration).
1959  template<class Scalar, class LO, class GO, class Node>
1963  const size_t pointOffset) const
1964  {
1965  // Row major blocks
1966  const LO rowStride = blockSize_;
1967  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1968  }
1969 
// Build a modifiable blockSize_ x blockSize_ row-major view of the
// block starting at val + pointOffset.  No bounds checking is done.
// NOTE(review): the extraction dropped original lines 1971-1973 here
// (the return type, the qualified method name, and the raw-pointer
// "val" parameter declaration).
1970  template<class Scalar, class LO, class GO, class Node>
1974  const size_t pointOffset) const
1975  {
1976  // Row major blocks
1977  const LO rowStride = blockSize_;
1978  return little_block_type (val + pointOffset, blockSize_, rowStride);
1979  }
1980 
// Return a read-only view of the block at the given absolute block
// offset into the matrix's packed values, or an empty view if the
// offset is out of range.
// NOTE(review): the extraction dropped original lines 1982-1983 here
// (the return type and the qualified method name).
1981  template<class Scalar, class LO, class GO, class Node>
1984  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1985  {
1986 #ifdef HAVE_TPETRA_DEBUG
1987  const char prefix[] =
1988  "Tpetra::Experimental::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
1989 #endif // HAVE_TPETRA_DEBUG
1990 
// ptrHost_[numLocalMeshRows] is the one-past-the-end block offset.
1991  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1992  // An empty block signifies an error. We don't expect to see
1993  // this error in correct code, but it's helpful for avoiding
1994  // memory corruption in case there is a bug.
1995  return const_little_block_type ();
1996  }
1997  else {
1998 #ifdef HAVE_TPETRA_DEBUG
1999  TEUCHOS_TEST_FOR_EXCEPTION
2000  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
2001  prefix << "The matrix's data were last modified on device, but have "
2002  "not been sync'd to host. Please sync to host (by calling "
2003  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2004  "method.");
2005 #endif // HAVE_TPETRA_DEBUG
// Convert the block offset into a scalar ("point") offset.
2006  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2007 
2008  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
2009  // version of the data always exists (no lazy allocation for host
2010  // data).
2012  auto vals_host =
2013  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
2014  const impl_scalar_type* vals_host_raw = vals_host.ptr_on_device ();
2015 
2016  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2017  }
2018  }
2019 
// Return a read-only view of the block at offset relMeshOffset within
// local mesh row lclMeshRow, or an empty view if the row view could
// not be obtained.
// NOTE(review): the extraction dropped original lines 2021-2022 here
// (the return type and the qualified method name).
2020  template<class Scalar, class LO, class GO, class Node>
2023  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
2024  const size_t relMeshOffset) const
2025  {
2026  typedef impl_scalar_type IST;
2027 
2028  const LO* lclColInds = NULL;
2029  Scalar* lclVals = NULL;
2030  LO numEnt = 0;
2031 
// Nonzero err means the row is not owned by the calling process.
2032  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
2033  if (err != 0) {
2034  // An empty block signifies an error. We don't expect to see
2035  // this error in correct code, but it's helpful for avoiding
2036  // memory corruption in case there is a bug.
2037  return const_little_block_type ();
2038  }
2039  else {
// Convert the block offset (relative to the row) to a scalar offset.
2040  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
2041  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
2042  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
2043  relPointOffset);
2044  }
2045  }
2046 
// Return a modifiable view of the block at the given absolute block
// offset into the matrix's packed values, or an empty view if the
// offset is out of range.
// NOTE(review): the extraction dropped original lines 2048-2049 here
// (the return type and the qualified method name).
2047  template<class Scalar, class LO, class GO, class Node>
2050  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2051  {
2052 #ifdef HAVE_TPETRA_DEBUG
2053  const char prefix[] =
2054  "Tpetra::Experimental::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
2055 #endif // HAVE_TPETRA_DEBUG
2056 
// ptrHost_[numLocalMeshRows] is the one-past-the-end block offset.
2057  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2058  // An empty block signifies an error. We don't expect to see
2059  // this error in correct code, but it's helpful for avoiding
2060  // memory corruption in case there is a bug.
2061  return little_block_type ();
2062  }
2063  else {
2064  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2065 #ifdef HAVE_TPETRA_DEBUG
2066  TEUCHOS_TEST_FOR_EXCEPTION
2067  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
2068  prefix << "The matrix's data were last modified on device, but have "
2069  "not been sync'd to host. Please sync to host (by calling "
2070  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2071  "method.");
2072 #endif // HAVE_TPETRA_DEBUG
2073  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
2074  // version of the data always exists (no lazy allocation for host
2075  // data).
2077  auto vals_host =
2078  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
2079  impl_scalar_type* vals_host_raw = vals_host.ptr_on_device ();
2080  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2081  }
2082  }
2083 
2084  template<class Scalar, class LO, class GO, class Node>
2087  getLocalBlock (const LO localRowInd, const LO localColInd) const
2088  {
2089  const size_t absRowBlockOffset = ptrHost_[localRowInd];
2090  const LO relBlockOffset =
2091  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
2092 
2093  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
2094  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
2095  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
2096  }
2097  else {
2098  return little_block_type ();
2099  }
2100  }
2101 
2102  // template<class Scalar, class LO, class GO, class Node>
2103  // void
2104  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2105  // clearLocalErrorStateAndStream ()
2106  // {
2107  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2108  // * (const_cast<this_type*> (this)->localError_) = false;
2109  // *errs_ = Teuchos::null;
2110  // }
2111 
// DistObject hook: verify that the source of an Import/Export is a
// compatible BlockCrsMatrix (same template parameters, same block
// size, fill complete, with column Maps).  Records all problems in
// the local error stream and returns true iff no local error.
// NOTE(review): the extraction dropped original lines 2114-2115 and
// 2118 here (the qualified method name with its SrcDistObject
// parameter, and the this_type typedef).
2112  template<class Scalar, class LO, class GO, class Node>
2113  bool
2116  {
2117  using std::endl;
2119  const this_type* src = dynamic_cast<const this_type* > (&source);
2120 
// dynamic_cast yields NULL if the source is not a BlockCrsMatrix
// with exactly our template parameters.
2121  if (src == NULL) {
2122  std::ostream& err = markLocalErrorAndGetStream ();
2123  err << "checkSizes: The source object of the Import or Export "
2124  "must be a BlockCrsMatrix with the same template parameters as the "
2125  "target object." << endl;
2126  }
2127  else {
2128  // Use a string of ifs, not if-elseifs, because we want to know
2129  // all the errors.
2130  if (src->getBlockSize () != this->getBlockSize ()) {
2131  std::ostream& err = markLocalErrorAndGetStream ();
2132  err << "checkSizes: The source and target objects of the Import or "
2133  << "Export must have the same block sizes. The source's block "
2134  << "size = " << src->getBlockSize () << " != the target's block "
2135  << "size = " << this->getBlockSize () << "." << endl;
2136  }
2137  if (! src->graph_.isFillComplete ()) {
2138  std::ostream& err = markLocalErrorAndGetStream ();
2139  err << "checkSizes: The source object of the Import or Export is "
2140  "not fill complete. Both source and target objects must be fill "
2141  "complete." << endl;
2142  }
2143  if (! this->graph_.isFillComplete ()) {
2144  std::ostream& err = markLocalErrorAndGetStream ();
2145  err << "checkSizes: The target object of the Import or Export is "
2146  "not fill complete. Both source and target objects must be fill "
2147  "complete." << endl;
2148  }
2149  if (src->graph_.getColMap ().is_null ()) {
2150  std::ostream& err = markLocalErrorAndGetStream ();
2151  err << "checkSizes: The source object of the Import or Export does "
2152  "not have a column Map. Both source and target objects must have "
2153  "column Maps." << endl;
2154  }
2155  if (this->graph_.getColMap ().is_null ()) {
2156  std::ostream& err = markLocalErrorAndGetStream ();
2157  err << "checkSizes: The target object of the Import or Export does "
2158  "not have a column Map. Both source and target objects must have "
2159  "column Maps." << endl;
2160  }
2161  }
2162 
// localError_ was set by markLocalErrorAndGetStream() above, if at all.
2163  return ! (* (this->localError_));
2164  }
2165 
2166  template<class Scalar, class LO, class GO, class Node>
2167  void
2170  size_t numSameIDs,
2171  const Teuchos::ArrayView<const LO>& permuteToLIDs,
2172  const Teuchos::ArrayView<const LO>& permuteFromLIDs)
2173  {
2174  using std::endl;
2176  const bool debug = false;
2177 
2178  if (debug) {
2179  std::ostringstream os;
2180  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2181  os << "Proc " << myRank << ": copyAndPermute: "
2182  << "numSameIDs: " << numSameIDs
2183  << ", permuteToLIDs.size(): " << permuteToLIDs.size ()
2184  << ", permuteFromLIDs.size(): " << permuteFromLIDs.size ()
2185  << endl;
2186  std::cerr << os.str ();
2187  }
2188 
2189  // There's no communication in this method, so it's safe just to
2190  // return on error.
2191 
2192  if (* (this->localError_)) {
2193  std::ostream& err = this->markLocalErrorAndGetStream ();
2194  err << "copyAndPermute: The target object of the Import or Export is "
2195  "already in an error state." << endl;
2196  return;
2197  }
2198  if (permuteToLIDs.size () != permuteFromLIDs.size ()) {
2199  std::ostream& err = this->markLocalErrorAndGetStream ();
2200  err << "copyAndPermute: permuteToLIDs.size() = " << permuteToLIDs.size ()
2201  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."
2202  << endl;
2203  return;
2204  }
2205 
2206  const this_type* src = dynamic_cast<const this_type* > (&source);
2207  if (src == NULL) {
2208  std::ostream& err = this->markLocalErrorAndGetStream ();
2209  err << "copyAndPermute: The source object of the Import or Export is "
2210  "either not a BlockCrsMatrix, or does not have the right template "
2211  "parameters. checkSizes() should have caught this. "
2212  "Please report this bug to the Tpetra developers." << endl;
2213  return;
2214  }
2215  if (* (src->localError_)) {
2216  std::ostream& err = this->markLocalErrorAndGetStream ();
2217  err << "copyAndPermute: The source object of the Import or Export is "
2218  "already in an error state." << endl;
2219  return;
2220  }
2221 
2222  bool lclErr = false;
2223 #ifdef HAVE_TPETRA_DEBUG
2224  std::set<LO> invalidSrcCopyRows;
2225  std::set<LO> invalidDstCopyRows;
2226  std::set<LO> invalidDstCopyCols;
2227  std::set<LO> invalidDstPermuteCols;
2228  std::set<LO> invalidPermuteFromRows;
2229 #endif // HAVE_TPETRA_DEBUG
2230 
2231  // Copy the initial sequence of rows that are the same.
2232  //
2233  // The two graphs might have different column Maps, so we need to
2234  // do this using global column indices. This is purely local, so
2235  // we only need to check for local sameness of the two column
2236  // Maps.
2237 
2238 #ifdef HAVE_TPETRA_DEBUG
2239  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2240 #endif // HAVE_TPETRA_DEBUG
2241  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2242  const map_type& srcColMap = * (src->graph_.getColMap ());
2243  const map_type& dstColMap = * (this->graph_.getColMap ());
2244  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2245  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.size ());
2246 
2247  if (debug) {
2248  std::ostringstream os;
2249  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2250  os << "Proc " << myRank << ": copyAndPermute: "
2251  << "canUseLocalColumnIndices: "
2252  << (canUseLocalColumnIndices ? "true" : "false")
2253  << ", numPermute: " << numPermute
2254  << endl;
2255  std::cerr << os.str ();
2256  }
2257 
2258  if (canUseLocalColumnIndices) {
2259  // Copy local rows that are the "same" in both source and target.
2260  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2261 #ifdef HAVE_TPETRA_DEBUG
2262  if (! srcRowMap.isNodeLocalElement (localRow)) {
2263  lclErr = true;
2264  invalidSrcCopyRows.insert (localRow);
2265  continue; // skip invalid rows
2266  }
2267 #endif // HAVE_TPETRA_DEBUG
2268 
2269  const LO* lclSrcCols;
2270  Scalar* vals;
2271  LO numEntries;
2272  // If this call fails, that means the mesh row local index is
2273  // invalid. That means the Import or Export is invalid somehow.
2274  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2275  if (err != 0) {
2276  lclErr = true;
2277 #ifdef HAVE_TPETRA_DEBUG
2278  (void) invalidSrcCopyRows.insert (localRow);
2279 #endif // HAVE_TPETRA_DEBUG
2280  }
2281  else {
2282  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2283  if (err != numEntries) {
2284  lclErr = true;
2285  if (! dstRowMap.isNodeLocalElement (localRow)) {
2286 #ifdef HAVE_TPETRA_DEBUG
2287  invalidDstCopyRows.insert (localRow);
2288 #endif // HAVE_TPETRA_DEBUG
2289  }
2290  else {
2291  // Once there's an error, there's no sense in saving
2292  // time, so we check whether the column indices were
2293  // invalid. However, only remember which ones were
2294  // invalid in a debug build, because that might take a
2295  // lot of space.
2296  for (LO k = 0; k < numEntries; ++k) {
2297  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2298  lclErr = true;
2299 #ifdef HAVE_TPETRA_DEBUG
2300  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2301 #endif // HAVE_TPETRA_DEBUG
2302  }
2303  }
2304  }
2305  }
2306  }
2307  } // for each "same" local row
2308 
2309  // Copy the "permute" local rows.
2310  for (size_t k = 0; k < numPermute; ++k) {
2311  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
2312  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
2313 
2314  const LO* lclSrcCols;
2315  Scalar* vals;
2316  LO numEntries;
2317  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2318  if (err != 0) {
2319  lclErr = true;
2320 #ifdef HAVE_TPETRA_DEBUG
2321  invalidPermuteFromRows.insert (srcLclRow);
2322 #endif // HAVE_TPETRA_DEBUG
2323  }
2324  else {
2325  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2326  if (err != numEntries) {
2327  lclErr = true;
2328 #ifdef HAVE_TPETRA_DEBUG
2329  for (LO c = 0; c < numEntries; ++c) {
2330  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2331  invalidDstPermuteCols.insert (lclSrcCols[c]);
2332  }
2333  }
2334 #endif // HAVE_TPETRA_DEBUG
2335  }
2336  }
2337  }
2338  }
2339  else { // must convert column indices to global
2340  // Reserve space to store the destination matrix's local column indices.
2341  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2342  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2343 
2344  // Copy local rows that are the "same" in both source and target.
2345  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2346  const LO* lclSrcCols;
2347  Scalar* vals;
2348  LO numEntries;
2349  // If this call fails, that means the mesh row local index is
2350  // invalid. That means the Import or Export is invalid somehow.
2351  LO err = 0;
2352  try {
2353  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2354  } catch (std::exception& e) {
2355  if (debug) {
2356  std::ostringstream os;
2357  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2358  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2359  << localRow << ", src->getLocalRowView() threw an exception: "
2360  << e.what ();
2361  std::cerr << os.str ();
2362  }
2363  throw e;
2364  }
2365 
2366  if (err != 0) {
2367  lclErr = true;
2368 #ifdef HAVE_TPETRA_DEBUG
2369  invalidSrcCopyRows.insert (localRow);
2370 #endif // HAVE_TPETRA_DEBUG
2371  }
2372  else {
2373  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2374  lclErr = true;
2375  if (debug) {
2376  std::ostringstream os;
2377  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2378  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2379  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2380  << maxNumEnt << endl;
2381  std::cerr << os.str ();
2382  }
2383  }
2384  else {
2385  // Convert the source matrix's local column indices to the
2386  // destination matrix's local column indices.
2387  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2388  for (LO j = 0; j < numEntries; ++j) {
2389  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2390  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2391  lclErr = true;
2392 #ifdef HAVE_TPETRA_DEBUG
2393  invalidDstCopyCols.insert (lclDstColsView[j]);
2394 #endif // HAVE_TPETRA_DEBUG
2395  }
2396  }
2397  try {
2398  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2399  } catch (std::exception& e) {
2400  if (debug) {
2401  std::ostringstream os;
2402  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2403  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2404  << localRow << ", this->replaceLocalValues() threw an exception: "
2405  << e.what ();
2406  std::cerr << os.str ();
2407  }
2408  throw e;
2409  }
2410  if (err != numEntries) {
2411  lclErr = true;
2412  if (debug) {
2413  std::ostringstream os;
2414  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2415  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2416  "localRow " << localRow << ", this->replaceLocalValues "
2417  "returned " << err << " instead of numEntries = "
2418  << numEntries << endl;
2419  std::cerr << os.str ();
2420  }
2421  }
2422  }
2423  }
2424  }
2425 
2426  // Copy the "permute" local rows.
2427  for (size_t k = 0; k < numPermute; ++k) {
2428  const LO srcLclRow = static_cast<LO> (permuteFromLIDs[k]);
2429  const LO dstLclRow = static_cast<LO> (permuteToLIDs[k]);
2430 
2431  const LO* lclSrcCols;
2432  Scalar* vals;
2433  LO numEntries;
2434  LO err = 0;
2435  try {
2436  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2437  } catch (std::exception& e) {
2438  if (debug) {
2439  std::ostringstream os;
2440  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2441  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2442  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2443  << ", src->getLocalRowView() threw an exception: " << e.what ();
2444  std::cerr << os.str ();
2445  }
2446  throw e;
2447  }
2448 
2449  if (err != 0) {
2450  lclErr = true;
2451 #ifdef HAVE_TPETRA_DEBUG
2452  invalidPermuteFromRows.insert (srcLclRow);
2453 #endif // HAVE_TPETRA_DEBUG
2454  }
2455  else {
2456  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2457  lclErr = true;
2458  }
2459  else {
2460  // Convert the source matrix's local column indices to the
2461  // destination matrix's local column indices.
2462  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2463  for (LO j = 0; j < numEntries; ++j) {
2464  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2465  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2466  lclErr = true;
2467 #ifdef HAVE_TPETRA_DEBUG
2468  invalidDstPermuteCols.insert (lclDstColsView[j]);
2469 #endif // HAVE_TPETRA_DEBUG
2470  }
2471  }
2472  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2473  if (err != numEntries) {
2474  lclErr = true;
2475  }
2476  }
2477  }
2478  }
2479  }
2480 
2481  if (lclErr) {
2482  std::ostream& err = this->markLocalErrorAndGetStream ();
2483 #ifdef HAVE_TPETRA_DEBUG
2484  err << "copyAndPermute: The graph structure of the source of the "
2485  "Import or Export must be a subset of the graph structure of the "
2486  "target. ";
2487  err << "invalidSrcCopyRows = [";
2488  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2489  it != invalidSrcCopyRows.end (); ++it) {
2490  err << *it;
2491  typename std::set<LO>::const_iterator itp1 = it;
2492  itp1++;
2493  if (itp1 != invalidSrcCopyRows.end ()) {
2494  err << ",";
2495  }
2496  }
2497  err << "], invalidDstCopyRows = [";
2498  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2499  it != invalidDstCopyRows.end (); ++it) {
2500  err << *it;
2501  typename std::set<LO>::const_iterator itp1 = it;
2502  itp1++;
2503  if (itp1 != invalidDstCopyRows.end ()) {
2504  err << ",";
2505  }
2506  }
2507  err << "], invalidDstCopyCols = [";
2508  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2509  it != invalidDstCopyCols.end (); ++it) {
2510  err << *it;
2511  typename std::set<LO>::const_iterator itp1 = it;
2512  itp1++;
2513  if (itp1 != invalidDstCopyCols.end ()) {
2514  err << ",";
2515  }
2516  }
2517  err << "], invalidDstPermuteCols = [";
2518  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2519  it != invalidDstPermuteCols.end (); ++it) {
2520  err << *it;
2521  typename std::set<LO>::const_iterator itp1 = it;
2522  itp1++;
2523  if (itp1 != invalidDstPermuteCols.end ()) {
2524  err << ",";
2525  }
2526  }
2527  err << "], invalidPermuteFromRows = [";
2528  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2529  it != invalidPermuteFromRows.end (); ++it) {
2530  err << *it;
2531  typename std::set<LO>::const_iterator itp1 = it;
2532  itp1++;
2533  if (itp1 != invalidPermuteFromRows.end ()) {
2534  err << ",";
2535  }
2536  }
2537  err << "]" << std::endl;
2538 #else
2539  err << "copyAndPermute: The graph structure of the source of the "
2540  "Import or Export must be a subset of the graph structure of the "
2541  "target." << std::endl;
2542 #endif // HAVE_TPETRA_DEBUG
2543  }
2544 
2545  if (debug) {
2546  std::ostringstream os;
2547  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2548  const bool lclSuccess = ! (* (this->localError_));
2549  os << "*** Proc " << myRank << ": copyAndPermute "
2550  << (lclSuccess ? "succeeded" : "FAILED");
2551  if (lclSuccess) {
2552  os << endl;
2553  } else {
2554  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2555  }
2556  std::cerr << os.str ();
2557  }
2558  }
2559 
2560  namespace { // (anonymous)
2561 
2580  template<class LO, class GO, class D>
2581  size_t
2582  packRowCount (const size_t numEnt,
2583  const size_t numBytesPerValue,
2584  const size_t blkSize)
2585  {
2587 
2588  if (numEnt == 0) {
2589  // Empty rows always take zero bytes, to ensure sparsity.
2590  return 0;
2591  }
2592  else {
2593  // We store the number of entries as a local index (LO).
2594  LO numEntLO = 0; // packValueCount wants this.
2595  GO gid;
2596  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2597  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2598  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2599  return numEntLen + gidsLen + valsLen;
2600  }
2601  }
2602 
2613  template<class ST, class LO, class GO, class D>
2614  size_t
2615  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2616  const size_t offset,
2617  const size_t numBytes,
2618  const size_t numBytesPerValue)
2619  {
2620  using Kokkos::subview;
2622  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2623  typedef typename input_buffer_type::size_type size_type;
2624 
2625  if (numBytes == 0) {
2626  // Empty rows always take zero bytes, to ensure sparsity.
2627  return static_cast<size_t> (0);
2628  }
2629  else {
2630  LO numEntLO = 0;
2631  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2632 #ifdef HAVE_TPETRA_DEBUG
2633  TEUCHOS_TEST_FOR_EXCEPTION(
2634  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2635  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2636  << ".");
2637 #endif // HAVE_TPETRA_DEBUG
2638  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
2639  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
2640  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2641 #ifdef HAVE_TPETRA_DEBUG
2642  TEUCHOS_TEST_FOR_EXCEPTION(
2643  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2644  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2645  << ".");
2646 #else
2647  (void) actualNumBytes;
2648 #endif // HAVE_TPETRA_DEBUG
2649  return static_cast<size_t> (numEntLO);
2650  }
2651  }
2652 
// Pack one block row (entry count, then global column indices, then
// block values) into 'exports' starting at byte 'offset'.  Returns the
// number of bytes written, which must equal the layout's expected size.
// NOTE(review): the input-array parameters (gidsIn, valsIn, at original
// lines 2665-2666) are missing from this listing — confirm against the
// original file before relying on the exact signature.
2660  template<class ST, class LO, class GO, class D>
2661  size_t
2662  packRowForBlockCrs (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
2663  const size_t offset,
2664  const size_t numEnt,
2667  const size_t numBytesPerValue,
2668  const size_t blockSize)
2669  {
2670  using Kokkos::subview;
2672  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
2673  // the same, no matter what type we're packing. It's a reasonable
2674  // assumption, given that we go through the trouble of PackTraits
2675  // just so that we can pack data of different types in the same
2676  // buffer.
2677  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
2678  typedef typename output_buffer_type::size_type size_type;
2679  typedef typename std::pair<size_type, size_type> pair_type;
2680 
2681  if (numEnt == 0) {
2682  // Empty rows always take zero bytes, to ensure sparsity.
2683  return 0;
2684  }
2685  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2686 
2687  const GO gid = 0; // packValueCount wants this
// FIX: was static_cast<size_t>, a narrowing round trip when assigning
// to LO; cast directly to LO.
2688  const LO numEntLO = static_cast<LO> (numEnt);
2689 
// Byte layout of the packed row: [entry count][GIDs][values].
2690  const size_t numEntBeg = offset;
2691  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2692  const size_t gidsBeg = numEntBeg + numEntLen;
2693  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2694  const size_t valsBeg = gidsBeg + gidsLen;
2695  const size_t valsLen = numScalarEnt * numBytesPerValue;
2696 
2697  output_buffer_type numEntOut =
2698  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
2699  output_buffer_type gidsOut =
2700  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
2701  output_buffer_type valsOut =
2702  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
2703 
2704  size_t numBytesOut = 0;
2705  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2706  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
2707  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numScalarEnt);
2708 
// Sanity check: bytes actually written must match the layout above.
2709  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2710  TEUCHOS_TEST_FOR_EXCEPTION(
// FIX: error message said "unpackRow:" in this *pack* function.
2711  numBytesOut != expectedNumBytes, std::logic_error, "packRowForBlockCrs: "
2712  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2713  << expectedNumBytes << ".");
2714  return numBytesOut;
2715  }
2716 
// Unpack one block row from 'imports' starting at byte 'offset':
// reads the entry count (an LO), then numEnt global column indices
// into gidsOut, then numEnt*blockSize*blockSize scalar values.
// The layout mirrors packRowForBlockCrs above.
// NOTE(review): the valsOut and imports parameters (original lines
// 2721-2722) are missing from this listing — confirm against the
// original file before relying on the exact signature.
2717  // Return the number of bytes actually read / used.
2718  template<class ST, class LO, class GO, class D>
2719  size_t
2720  unpackRowForBlockCrs (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2723  const size_t offset,
2724  const size_t numBytes,
2725  const size_t numEnt,
2726  const size_t numBytesPerValue,
2727  const size_t blockSize)
2728  {
2729  using Kokkos::subview;
2731  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
2732  // the same, no matter what type we're unpacking. It's a
2733  // reasonable assumption, given that we go through the trouble of
2734  // PackTraits just so that we can pack data of different types in
2735  // the same buffer.
2736  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
2737  typedef typename input_buffer_type::size_type size_type;
2738  typedef typename std::pair<size_type, size_type> pair_type;
2739 
2740  if (numBytes == 0) {
2741  // Rows with zero bytes always have zero entries.
2742  return 0;
2743  }
2744  const size_t numScalarEnt = numEnt * blockSize * blockSize;
// Bounds checks: the packed row must lie entirely inside 'imports'.
2745  TEUCHOS_TEST_FOR_EXCEPTION(
2746  static_cast<size_t> (imports.dimension_0 ()) <= offset,
2747  std::logic_error, "unpackRow: imports.dimension_0() = "
2748  << imports.dimension_0 () << " <= offset = " << offset << ".");
2749  TEUCHOS_TEST_FOR_EXCEPTION(
2750  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
2751  std::logic_error, "unpackRow: imports.dimension_0() = "
2752  << imports.dimension_0 () << " < offset + numBytes = "
2753  << (offset + numBytes) << ".");
2754 
2755  const GO gid = 0; // packValueCount wants this
2756  const LO lid = 0; // packValueCount wants this
2757 
// Byte layout of the packed row: [entry count][GIDs][values],
// identical to the layout written by packRowForBlockCrs.
2758  const size_t numEntBeg = offset;
2759  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2760  const size_t gidsBeg = numEntBeg + numEntLen;
2761  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2762  const size_t valsBeg = gidsBeg + gidsLen;
2763  const size_t valsLen = numScalarEnt * numBytesPerValue;
2764 
2765  input_buffer_type numEntIn =
2766  subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
2767  input_buffer_type gidsIn =
2768  subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
2769  input_buffer_type valsIn =
2770  subview (imports, pair_type (valsBeg, valsBeg + valsLen));
2771 
// Unpack the entry count first, and verify it against the caller's
// expected count (which came from unpackRowCount).
2772  size_t numBytesOut = 0;
2773  LO numEntOut;
2774  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2775  TEUCHOS_TEST_FOR_EXCEPTION(
2776  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2777  "unpackRow: Expected number of entries " << numEnt
2778  << " != actual number of entries " << numEntOut << ".");
2779 
2780  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
2781  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numScalarEnt);
2782 
// Sanity checks: bytes actually read must match both the caller's
// byte count and the layout computed above.
2783  TEUCHOS_TEST_FOR_EXCEPTION(
2784  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
2785  << numBytesOut << " != numBytes = " << numBytes << ".");
2786  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2787  TEUCHOS_TEST_FOR_EXCEPTION(
2788  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
2789  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
2790  << expectedNumBytes << ".");
2791  return numBytesOut;
2792  }
2793  } // namespace (anonymous)
2794 
2795  template<class Scalar, class LO, class GO, class Node>
2796  void
2798  packAndPrepare (const Tpetra::SrcDistObject& source,
2799  const Teuchos::ArrayView<const LO>& exportLIDs,
2800  Teuchos::Array<packet_type>& exports,
2801  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
2802  size_t& constantNumPackets,
2803  Tpetra::Distributor& /* distor */)
2804  {
2805  using std::endl;
2807  using Kokkos::MemoryUnmanaged;
2808  using Kokkos::subview;
2809  using Kokkos::View;
2811  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
2813  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
2814  const bool debug = false;
2815 
2816  if (debug) {
2817  std::ostringstream os;
2818  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2819  os << "Proc " << myRank << ": packAndPrepare: exportLIDs.size() = "
2820  << exportLIDs.size () << ", numPacketsPerLID.size() = "
2821  << numPacketsPerLID.size () << endl;
2822  std::cerr << os.str ();
2823  }
2824 
2825  if (* (this->localError_)) {
2826  std::ostream& err = this->markLocalErrorAndGetStream ();
2827  err << "packAndPrepare: The target object of the Import or Export is "
2828  "already in an error state." << endl;
2829  return;
2830  }
2831 
2832  const this_type* src = dynamic_cast<const this_type* > (&source);
2833  // Should have checked for these cases in checkSizes().
2834  if (src == NULL) {
2835  std::ostream& err = this->markLocalErrorAndGetStream ();
2836  err << "packAndPrepare: The source (input) object of the Import or "
2837  "Export is either not a BlockCrsMatrix, or does not have the right "
2838  "template parameters. checkSizes() should have caught this. "
2839  "Please report this bug to the Tpetra developers." << endl;
2840  return;
2841  }
2842 
2843  const crs_graph_type& srcGraph = src->graph_;
2844  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2845  const size_type numExportLIDs = exportLIDs.size ();
2846 
2847  if (numExportLIDs != numPacketsPerLID.size ()) {
2848  std::ostream& err = this->markLocalErrorAndGetStream ();
2849  err << "packAndPrepare: exportLIDs.size() = " << numExportLIDs
2850  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
2851  << "." << endl;
2852  return;
2853  }
2854 
2855  // Graphs and matrices are allowed to have a variable number of
2856  // entries per row. We could test whether all rows have the same
2857  // number of entries, but DistObject can only use this
2858  // optimization if all rows on _all_ processes have the same
2859  // number of entries. Rather than do the all-reduce necessary to
2860  // test for this unlikely case, we tell DistObject (by setting
2861  // constantNumPackets to zero) to assume that different rows may
2862  // have different numbers of entries.
2863  constantNumPackets = 0;
2864 
2865  // Compute the number of bytes ("packets") per row to pack. While
2866  // we're at it, compute the total # of block entries to send, and
2867  // the max # of block entries in any of the rows we're sending.
2868  size_t totalNumBytes = 0;
2869  size_t totalNumEntries = 0;
2870  size_t maxRowLength = 0;
2871  for (size_type i = 0; i < numExportLIDs; ++i) {
2872  const LO lclRow = exportLIDs[i];
2873  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2874  // If any given LIDs are invalid, the above might return either
2875  // zero or the invalid size_t value. If the former, we have no
2876  // way to tell, but that's OK; it just means the calling process
2877  // won't pack anything (it has nothing to pack anyway). If the
2878  // latter, we replace it with zero (that row is not owned by the
2879  // calling process, so it has no entries to pack).
2880  if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2881  numEnt = 0;
2882  }
2883  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2884 
2885  // The 'if' branch implicitly assumes that packRowCount() returns
2886  // zero if numEnt == 0.
2887  size_t numBytesPerValue = 0;
2888  if (numEnt > 0) {
2889  // Get a locally indexed view of the current row's data. If
2890  // the current row has > 0 entries, we need an entry in order
2891  // to figure out the byte count of the packed row. (We really
2892  // only need it if ST's size is determined at run time.)
2893  Scalar* valsRaw = NULL;
2894  const LO* lidsRaw = NULL;
2895  LO actualNumEnt = 0;
2896  const LO errCode =
2897  src->getLocalRowView (lclRow, lidsRaw, valsRaw, actualNumEnt);
2898 
2899  if (numEnt != static_cast<size_t> (actualNumEnt)) {
2900  std::ostream& err = this->markLocalErrorAndGetStream ();
2901  err << "packAndPrepare: Local row " << i << " claims to have " <<
2902  numEnt << "entry/ies, but the View returned by getLocalRowView() "
2903  "has " << actualNumEnt << " entry/ies. This should never happen. "
2904  "Please report this bug to the Tpetra developers." << endl;
2905  return;
2906  }
2907  if (errCode == Teuchos::OrdinalTraits<LO>::invalid ()) {
2908  std::ostream& err = this->markLocalErrorAndGetStream ();
2909  err << "packAndPrepare: Local row " << i << " is not in the row Map "
2910  "of the source object on the calling process." << endl;
2911  return;
2912  }
2913 
2914  const ST* valsRawST =
2915  const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2916  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2917 
2918  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2919  // space here for now, this doesn't assume UVM. That may change
2920  // in the future, if we ever start packing on the device.
2921  numBytesPerValue = PackTraits<ST, HES>::packValueCount (vals(0));
2922  }
2923 
2924  const size_t numBytes =
2925  packRowCount<LO, GO, HES> (numEnt, numBytesPerValue, blockSize);
2926  numPacketsPerLID[i] = numBytes;
2927  totalNumBytes += numBytes;
2928  totalNumEntries += numEnt;
2929  maxRowLength = std::max (maxRowLength, numEnt);
2930  }
2931 
2932  if (debug) {
2933  const int myRank = graph_.getComm ()->getRank ();
2934  std::ostringstream os;
2935  os << "Proc " << myRank << ": packAndPrepare: totalNumBytes = "
2936  << totalNumBytes << endl;
2937  std::cerr << os.str ();
2938  }
2939 
2940  // We use a "struct of arrays" approach to packing each row's
2941  // entries. All the column indices (as global indices) go first,
2942  // then all their owning process ranks, and then the values.
2943  exports.resize (totalNumBytes);
2944  if (totalNumEntries > 0) {
2945  View<char*, HES, MemoryUnmanaged> exportsK (exports.getRawPtr (),
2946  totalNumBytes);
2947 
2948  // Current position (in bytes) in the 'exports' output array.
2949  size_t offset = 0;
2950 
2951  // For each block row of the matrix owned by the calling
2952  // process, pack that block row's column indices and values into
2953  // the exports array.
2954 
2955  // Source matrix's column Map. We verified in checkSizes() that
2956  // the column Map exists (is not null).
2957  const map_type& srcColMap = * (srcGraph.getColMap ());
2958 
2959  // Temporary buffer for global column indices.
2960  View<GO*, HES> gblColInds;
2961  {
2962  GO gid = 0;
2963  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowLength, "gids");
2964  }
2965 
2966  // Pack the data for each row to send, into the 'exports' buffer.
2967  for (size_type i = 0; i < numExportLIDs; ++i) {
2968  const LO lclRowInd = exportLIDs[i];
2969  const LO* lclColIndsRaw;
2970  Scalar* valsRaw;
2971  LO numEntLO;
2972  // It's OK to ignore the return value, since if the calling
2973  // process doesn't own that local row, then the number of
2974  // entries in that row on the calling process is zero.
2975  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2976  const size_t numEnt = static_cast<size_t> (numEntLO);
2977  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2978  View<const LO*, HES, MemoryUnmanaged> lclColInds (lclColIndsRaw, numEnt);
2979  const ST* valsRawST = const_cast<const ST*> (reinterpret_cast<ST*> (valsRaw));
2980  View<const ST*, HES, MemoryUnmanaged> vals (valsRawST, numScalarEnt);
2981 
2982  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
2983  // space here for now, this doesn't assume UVM. That may
2984  // change in the future, if we ever start packing on device.
2985  const size_t numBytesPerValue = numEnt == 0 ?
2986  static_cast<size_t> (0) :
2987  PackTraits<ST, HES>::packValueCount (vals(0));
2988 
2989  // Convert column indices from local to global.
2990  for (size_t j = 0; j < numEnt; ++j) {
2991  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2992  }
2993 
2994  // Copy the row's data into the current spot in the exports array.
2995  const size_t numBytes =
2996  packRowForBlockCrs<ST, LO, GO, HES> (exportsK, offset, numEnt, gblColInds,
2997  vals, numBytesPerValue, blockSize);
2998  // Keep track of how many bytes we packed.
2999  offset += numBytes;
3000  } // for each LID (of a row) to send
3001 
3002  if (offset != totalNumBytes) {
3003  std::ostream& err = this->markLocalErrorAndGetStream ();
3004  err << "packAndPreapre: At end of method, the final offset (in bytes) "
3005  << offset << " does not equal the total number of bytes packed "
3006  << totalNumBytes << ". "
3007  << "Please report this bug to the Tpetra developers." << endl;
3008  return;
3009  }
3010  } // if totalNumEntries > 0
3011 
3012  if (debug) {
3013  std::ostringstream os;
3014  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3015  const bool lclSuccess = ! (* (this->localError_));
3016  os << "*** Proc " << myRank << ": packAndPrepare "
3017  << (lclSuccess ? "succeeded" : "FAILED")
3018  << " (totalNumEntries = " << totalNumEntries << ") ***" << endl;
3019  std::cerr << os.str ();
3020  }
3021  }
3022 
3023 
// Unpack rows received in 'imports' (one packed row per entry of
// importLIDs, numPacketsPerLID[i] bytes each) and combine them into
// this matrix according to the CombineMode: sumInto for ADD, replace
// for INSERT/REPLACE, absMax for ABSMAX; ZERO is a no-op.  Errors mark
// the local error state instead of throwing (no communication here, so
// early return cannot deadlock).
// NOTE(review): some lines of the original file (3026 class qualifier,
// 3032 — presumably the CombineMode CM parameter — 3035, 3039) are
// missing from this listing; CM is used below, so confirm the full
// signature against the original file.
3024  template<class Scalar, class LO, class GO, class Node>
3025  void
3027  unpackAndCombine (const Teuchos::ArrayView<const LO>& importLIDs,
3028  const Teuchos::ArrayView<const packet_type>& imports,
3029  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
3030  size_t /* constantNumPackets */, // not worthwhile to use this
3031  Tpetra::Distributor& /* distor */,
3033  {
3034  using std::endl;
3036  using Kokkos::MemoryUnmanaged;
3037  using Kokkos::subview;
3038  using Kokkos::View;
3040  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
3041  typedef typename View<int*, device_type>::HostMirror::execution_space HES;
3042  typedef std::pair<typename View<int*, HES>::size_type,
3043  typename View<int*, HES>::size_type> pair_type;
3044  typedef View<GO*, HES, MemoryUnmanaged> gids_out_type;
3045  typedef View<LO*, HES, MemoryUnmanaged> lids_out_type;
3046  typedef View<ST*, HES, MemoryUnmanaged> vals_out_type;
3047  typedef typename PackTraits<GO, HES>::input_buffer_type input_buffer_type;
3048  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::unpackAndCombine: ";
3049  const bool debug = false;
3050 
3051  if (debug) {
3052  std::ostringstream os;
3053  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3054  os << "Proc " << myRank << ": unpackAndCombine" << endl;
3055  std::cerr << os.str ();
3056  }
3057 
3058  // It should not cause deadlock to return on error in this method,
3059  // since this method does not communicate.
3060 
// Precondition checks: prior error state, matching array sizes, and a
// CombineMode this method knows how to apply.
3061  if (* (this->localError_)) {
3062  std::ostream& err = this->markLocalErrorAndGetStream ();
3063  err << prefix << "The target object of the Import or Export is "
3064  "already in an error state." << endl;
3065  return;
3066  }
3067  if (importLIDs.size () != numPacketsPerLID.size ()) {
3068  std::ostream& err = this->markLocalErrorAndGetStream ();
3069  err << prefix << "importLIDs.size() = " << importLIDs.size () << " != "
3070  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << "." << endl;
3071  return;
3072  }
3073  if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
3074  std::ostream& err = this->markLocalErrorAndGetStream ();
3075  err << prefix << "Invalid CombineMode value " << CM << ". Valid "
3076  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3077  << endl;
3078  return;
3079  }
3080 
3081  // Target matrix's column Map. Use to convert the global column
3082  // indices in the receive buffer to local indices. We verified in
3083  // checkSizes() that the column Map exists (is not null).
3084  const map_type& tgtColMap = * (this->graph_.getColMap ());
3085 
3086  const size_type numImportLIDs = importLIDs.size ();
3087  if (CM == ZERO || numImportLIDs == 0) {
3088  if (debug) {
3089  std::ostringstream os;
3090  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3091  os << "Proc " << myRank << ": unpackAndCombine: Nothing to do" << endl;
3092  std::cerr << os.str ();
3093  }
3094  return; // nothing to do; no need to combine entries
3095  }
3096 
3097  if (debug) {
3098  std::ostringstream os;
3099  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3100  os << "Proc " << myRank << ": unpackAndCombine: Getting ready" << endl;
3101  std::cerr << os.str ();
3102  }
3103 
// Wrap the receive buffer in an unmanaged Kokkos View, and size the
// scratch arrays by the worst-case (longest) local row.
3104  input_buffer_type importsK (imports.getRawPtr (), imports.size ());
3105  const size_t blockSize = this->getBlockSize ();
3106  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3107  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3108 
3109  size_t numBytesPerValue;
3110  {
3111  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
3112  // run-time size? We already assume that all entries in both
3113  // the source and target matrices have the same size. If the
3114  // calling process owns at least one entry in either matrix, we
3115  // can use that entry to set the size. However, it is possible
3116  // that the calling process owns no entries. In that case,
3117  // we're in trouble. One way to fix this would be for each
3118  // row's data to contain the run-time size. This is only
3119  // necessary if the size is not a compile-time constant.
3120  Scalar val;
3121  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
3122  }
3123 
3124  // Temporary space to cache incoming global column indices and
3125  // values. Column indices come in as global indices, in case the
3126  // source object's column Map differs from the target object's
3127  // (this's) column Map.
3128  View<GO*, HES> gblColInds;
3129  View<LO*, HES> lclColInds;
3130  View<ST*, HES> vals;
3131  {
3132  GO gid = 0;
3133  LO lid = 0;
3134  // FIXME (mfh 17 Feb 2015) What do I do about Scalar types with
3135  // run-time size? We already assume that all entries in both
3136  // the source and target matrices have the same size. If the
3137  // calling process owns at least one entry in either matrix, we
3138  // can use that entry to set the size. However, it is possible
3139  // that the calling process owns no entries. In that case,
3140  // we're in trouble. One way to fix this would be for each
3141  // row's data to contain the run-time size. This is only
3142  // necessary if the size is not a compile-time constant.
3143  Scalar val;
3144  gblColInds = PackTraits<GO, HES>::allocateArray (gid, maxRowNumEnt, "gids");
3145  lclColInds = PackTraits<LO, HES>::allocateArray (lid, maxRowNumEnt, "lids");
3146  vals = PackTraits<ST, HES>::allocateArray (val, maxRowNumScalarEnt, "vals");
3147  }
3148 
// Main unpack loop: one packed row per import LID.  On a per-row
// failure, record the error and skip to the next row (offset is only
// advanced on success; see the comment at the bottom of the loop).
3149  size_t offset = 0;
3150  bool errorDuringUnpack = false;
3151  for (size_type i = 0; i < numImportLIDs; ++i) {
3152  const size_t numBytes = numPacketsPerLID[i];
3153  if (numBytes == 0) {
3154  continue; // empty buffer for that row means that the row is empty
3155  }
3156  const size_t numEnt =
3157  unpackRowCount<ST, LO, GO, HES> (importsK, offset, numBytes,
3158  numBytesPerValue);
3159  if (numEnt > maxRowNumEnt) {
3160  errorDuringUnpack = true;
3161 #ifdef HAVE_TPETRA_DEBUG
3162  std::ostream& err = this->markLocalErrorAndGetStream ();
3163  err << prefix << "At i = " << i << ", numEnt = " << numEnt
3164  << " > maxRowNumEnt = " << maxRowNumEnt << endl;
3165 #endif // HAVE_TPETRA_DEBUG
3166  continue;
3167  }
3168 
3169  const size_t numScalarEnt = numEnt * blockSize * blockSize;
3170  const LO lclRow = importLIDs[i];
3171 
3172  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
3173  vals_out_type valsOut = subview (vals, pair_type (0, numScalarEnt));
3174 
3175  const size_t numBytesOut =
3176  unpackRowForBlockCrs<ST, LO, GO, HES> (gidsOut, valsOut, importsK,
3177  offset, numBytes, numEnt,
3178  numBytesPerValue, blockSize);
3179  if (numBytes != numBytesOut) {
3180  errorDuringUnpack = true;
3181 #ifdef HAVE_TPETRA_DEBUG
3182  std::ostream& err = this->markLocalErrorAndGetStream ();
3183  err << prefix << "At i = " << i << ", numBytes = " << numBytes
3184  << " != numBytesOut = " << numBytesOut << ".";
3185 #endif // HAVE_TPETRA_DEBUG
3186  continue;
3187  }
3188 
3189  // Convert incoming global indices to local indices.
// NOTE(review): this 'continue' only skips the inner k loop iteration;
// the row is still combined below even if some GIDs were invalid
// (errorDuringUnpack is set, so the error is still reported).
3190  lids_out_type lidsOut = subview (lclColInds, pair_type (0, numEnt));
3191  for (size_t k = 0; k < numEnt; ++k) {
3192  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3193  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3194  errorDuringUnpack = true;
3195 #ifdef HAVE_TPETRA_DEBUG
3196  std::ostream& err = this->markLocalErrorAndGetStream ();
3197  err << prefix << "At i = " << i << ", GID " << gidsOut(k)
3198  << " is not owned by the calling process.";
3199 #endif // HAVE_TPETRA_DEBUG
3200  continue;
3201  }
3202  }
3203 
3204  // Combine the incoming data with the matrix's current data.
3205  LO numCombd = 0;
3206  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.ptr_on_device ());
3207  const Scalar* const valsRaw =
3208  reinterpret_cast<const Scalar*> (const_cast<const ST*> (valsOut.ptr_on_device ()));
3209  if (CM == ADD) {
3210  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3211  } else if (CM == INSERT || CM == REPLACE) {
3212  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3213  } else if (CM == ABSMAX) {
3214  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3215  }
3216 
3217  if (static_cast<LO> (numEnt) != numCombd) {
3218  errorDuringUnpack = true;
3219 #ifdef HAVE_TPETRA_DEBUG
3220  std::ostream& err = this->markLocalErrorAndGetStream ();
3221  err << prefix << "At i = " << i << ", numEnt = " << numEnt
3222  << " != numCombd = " << numCombd << ".";
3223 #endif // HAVE_TPETRA_DEBUG
3224  continue;
3225  }
3226 
3227  // Don't update offset until current LID has succeeded.
3228  offset += numBytes;
3229  } // for each import LID i
3230 
3231  if (errorDuringUnpack) {
3232  std::ostream& err = this->markLocalErrorAndGetStream ();
3233  err << prefix << "Unpacking failed.";
3234 #ifndef HAVE_TPETRA_DEBUG
3235  err << " Please run again with a debug build to get more verbose "
3236  "diagnostic output.";
3237 #endif // ! HAVE_TPETRA_DEBUG
3238  err << endl;
3239  }
3240 
3241  if (debug) {
3242  std::ostringstream os;
3243  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
3244  const bool lclSuccess = ! (* (this->localError_));
3245  os << "*** Proc " << myRank << ": unpackAndCombine "
3246  << (lclSuccess ? "succeeded" : "FAILED")
3247  << " ***" << endl;
3248  std::cerr << os.str ();
3249  }
3250  }
3251 
3252 
// One-line human-readable description of this matrix: template
// parameters, label, global dimensions, and block size.
// NOTE(review): the class-qualified method name (original line 3255)
// is missing from this listing.
3253  template<class Scalar, class LO, class GO, class Node>
3254  std::string
3256  {
3257  using Teuchos::TypeNameTraits;
3258  std::ostringstream os;
3259  os << "\"Tpetra::BlockCrsMatrix\": { "
3260  << "Template parameters: { "
// FIX: add ", " separators; the entries used to run together, e.g.
// "Scalar: doubleLO: int", unlike the rest of this comma-separated
// description string.
3261  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3262  << ", LO: " << TypeNameTraits<LO>::name ()
3263  << ", GO: " << TypeNameTraits<GO>::name ()
3264  << ", Node: " << TypeNameTraits<Node>::name ()
3265  << " }"
3266  << ", Label: \"" << this->getObjectLabel () << "\""
3267  << ", Global dimensions: ["
3268  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3269  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3270  << ", Block size: " << getBlockSize ()
3271  << " }";
3272  return os.str ();
3273  }
3274 
3275 
3276  template<class Scalar, class LO, class GO, class Node>
3277  void
3279  describe (Teuchos::FancyOStream& out,
3280  const Teuchos::EVerbosityLevel verbLevel) const
3281  {
3282  using Teuchos::ArrayRCP;
3283  using Teuchos::CommRequest;
3284  using Teuchos::FancyOStream;
3285  using Teuchos::getFancyOStream;
3286  using Teuchos::ireceive;
3287  using Teuchos::isend;
3288  using Teuchos::outArg;
3289  using Teuchos::TypeNameTraits;
3290  using Teuchos::VERB_DEFAULT;
3291  using Teuchos::VERB_NONE;
3292  using Teuchos::VERB_LOW;
3293  using Teuchos::VERB_MEDIUM;
3294  // using Teuchos::VERB_HIGH;
3295  using Teuchos::VERB_EXTREME;
3296  using Teuchos::RCP;
3297  using Teuchos::wait;
3298  using std::endl;
3299 #ifdef HAVE_TPETRA_DEBUG
3300  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::describe: ";
3301 #endif // HAVE_TPETRA_DEBUG
3302 
3303  // Set default verbosity if applicable.
3304  const Teuchos::EVerbosityLevel vl =
3305  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3306 
3307  if (vl == VERB_NONE) {
3308  return; // print nothing
3309  }
3310 
3311  // describe() always starts with a tab before it prints anything.
3312  Teuchos::OSTab tab0 (out);
3313 
3314  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3315  Teuchos::OSTab tab1 (out);
3316 
3317  out << "Template parameters:" << endl;
3318  {
3319  Teuchos::OSTab tab2 (out);
3320  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3321  << "LO: " << TypeNameTraits<LO>::name () << endl
3322  << "GO: " << TypeNameTraits<GO>::name () << endl
3323  << "Node: " << TypeNameTraits<Node>::name () << endl;
3324  }
3325  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3326  << "Global dimensions: ["
3327  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3328  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3329 
3330  const LO blockSize = getBlockSize ();
3331  out << "Block size: " << blockSize << endl;
3332 
3333  // constituent objects
3334  if (vl >= VERB_MEDIUM) {
3335  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3336  const int myRank = comm.getRank ();
3337  if (myRank == 0) {
3338  out << "Row Map:" << endl;
3339  }
3340  getRowMap()->describe (out, vl);
3341 
3342  if (! getColMap ().is_null ()) {
3343  if (getColMap() == getRowMap()) {
3344  if (myRank == 0) {
3345  out << "Column Map: same as row Map" << endl;
3346  }
3347  }
3348  else {
3349  if (myRank == 0) {
3350  out << "Column Map:" << endl;
3351  }
3352  getColMap ()->describe (out, vl);
3353  }
3354  }
3355  if (! getDomainMap ().is_null ()) {
3356  if (getDomainMap () == getRowMap ()) {
3357  if (myRank == 0) {
3358  out << "Domain Map: same as row Map" << endl;
3359  }
3360  }
3361  else if (getDomainMap () == getColMap ()) {
3362  if (myRank == 0) {
3363  out << "Domain Map: same as column Map" << endl;
3364  }
3365  }
3366  else {
3367  if (myRank == 0) {
3368  out << "Domain Map:" << endl;
3369  }
3370  getDomainMap ()->describe (out, vl);
3371  }
3372  }
3373  if (! getRangeMap ().is_null ()) {
3374  if (getRangeMap () == getDomainMap ()) {
3375  if (myRank == 0) {
3376  out << "Range Map: same as domain Map" << endl;
3377  }
3378  }
3379  else if (getRangeMap () == getRowMap ()) {
3380  if (myRank == 0) {
3381  out << "Range Map: same as row Map" << endl;
3382  }
3383  }
3384  else {
3385  if (myRank == 0) {
3386  out << "Range Map: " << endl;
3387  }
3388  getRangeMap ()->describe (out, vl);
3389  }
3390  }
3391  }
3392 
3393  if (vl >= VERB_EXTREME) {
3394  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3395  // to host, since it's supposed to be const. However, that's
3396  // the easiest and least memory-intensive way to implement this
3397  // method.
3399  const_cast<this_type*> (this)->template sync<Kokkos::HostSpace> ();
3400 
3401 #ifdef HAVE_TPETRA_DEBUG
3402  TEUCHOS_TEST_FOR_EXCEPTION
3403  (this->template need_sync<Kokkos::HostSpace> (), std::logic_error,
3404  prefix << "Right after sync to host, the matrix claims that it needs "
3405  "sync to host. Please report this bug to the Tpetra developers.");
3406 #endif // HAVE_TPETRA_DEBUG
3407 
3408  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3409  const int myRank = comm.getRank ();
3410  const int numProcs = comm.getSize ();
3411 
3412  // Print the calling process' data to the given output stream.
3413  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3414  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3415  FancyOStream& os = *osPtr;
3416  os << "Process " << myRank << ":" << endl;
3417  Teuchos::OSTab tab2 (os);
3418 
3419  const map_type& meshRowMap = * (graph_.getRowMap ());
3420  const map_type& meshColMap = * (graph_.getColMap ());
3421  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3422  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3423  ++meshLclRow) {
3424  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3425  os << "Row " << meshGblRow << ": {";
3426 
3427  const LO* lclColInds = NULL;
3428  Scalar* vals = NULL;
3429  LO numInds = 0;
3430  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3431 
3432  for (LO k = 0; k < numInds; ++k) {
3433  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3434 
3435  os << "Col " << gblCol << ": [";
3436  for (LO i = 0; i < blockSize; ++i) {
3437  for (LO j = 0; j < blockSize; ++j) {
3438  os << vals[blockSize*blockSize*k + i*blockSize + j];
3439  if (j + 1 < blockSize) {
3440  os << ", ";
3441  }
3442  }
3443  if (i + 1 < blockSize) {
3444  os << "; ";
3445  }
3446  }
3447  os << "]";
3448  if (k + 1 < numInds) {
3449  os << ", ";
3450  }
3451  }
3452  os << "}" << endl;
3453  }
3454 
3455  // Print data on Process 0. This will automatically respect the
3456  // current indentation level.
3457  if (myRank == 0) {
3458  out << lclOutStrPtr->str ();
3459  lclOutStrPtr = Teuchos::null; // clear it to save space
3460  }
3461 
3462  const int sizeTag = 1337;
3463  const int dataTag = 1338;
3464 
3465  ArrayRCP<char> recvDataBuf; // only used on Process 0
3466 
3467  // Send string sizes and data from each process in turn to
3468  // Process 0, and print on that process.
3469  for (int p = 1; p < numProcs; ++p) {
3470  if (myRank == 0) {
3471  // Receive the incoming string's length.
3472  ArrayRCP<size_t> recvSize (1);
3473  recvSize[0] = 0;
3474  RCP<CommRequest<int> > recvSizeReq =
3475  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3476  wait<int> (comm, outArg (recvSizeReq));
3477  const size_t numCharsToRecv = recvSize[0];
3478 
3479  // Allocate space for the string to receive. Reuse receive
3480  // buffer space if possible. We can do this because in the
3481  // current implementation, we only have one receive in
3482  // flight at a time. Leave space for the '\0' at the end,
3483  // in case the sender doesn't send it.
3484  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3485  recvDataBuf.resize (numCharsToRecv + 1);
3486  }
3487  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3488  // Post the receive of the actual string data.
3489  RCP<CommRequest<int> > recvDataReq =
3490  ireceive<int, char> (recvData, p, dataTag, comm);
3491  wait<int> (comm, outArg (recvDataReq));
3492 
3493  // Print the received data. This will respect the current
3494  // indentation level. Make sure that the string is
3495  // null-terminated.
3496  recvDataBuf[numCharsToRecv] = '\0';
3497  out << recvDataBuf.getRawPtr ();
3498  }
3499  else if (myRank == p) { // if I am not Process 0, and my rank is p
3500  // This deep-copies the string at most twice, depending on
3501  // whether std::string reference counts internally (it
3502  // generally does, so this won't deep-copy at all).
3503  const std::string stringToSend = lclOutStrPtr->str ();
3504  lclOutStrPtr = Teuchos::null; // clear original to save space
3505 
3506  // Send the string's length to Process 0.
3507  const size_t numCharsToSend = stringToSend.size ();
3508  ArrayRCP<size_t> sendSize (1);
3509  sendSize[0] = numCharsToSend;
3510  RCP<CommRequest<int> > sendSizeReq =
3511  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3512  wait<int> (comm, outArg (sendSizeReq));
3513 
3514  // Send the actual string to Process 0. We know that the
3515  // string has length > 0, so it's save to take the address
3516  // of the first entry. Make a nonowning ArrayRCP to hold
3517  // the string. Process 0 will add a null termination
3518  // character at the end of the string, after it receives the
3519  // message.
3520  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3521  RCP<CommRequest<int> > sendDataReq =
3522  isend<int, char> (sendData, 0, dataTag, comm);
3523  wait<int> (comm, outArg (sendDataReq));
3524  }
3525  } // for each process rank p other than 0
3526  } // extreme verbosity level (print the whole matrix)
3527  }
3528 
3529  template<class Scalar, class LO, class GO, class Node>
3530  Teuchos::RCP<const Teuchos::Comm<int> >
3532  getComm() const
3533  {
3534  return graph_.getComm();
3535  }
3536 
3537  template<class Scalar, class LO, class GO, class Node>
3538  Teuchos::RCP<Node>
3540  getNode() const
3541  {
3542  return graph_.getNode();
3543 
3544  }
3545 
3546  template<class Scalar, class LO, class GO, class Node>
3550  {
3551  return graph_.getGlobalNumCols();
3552  }
3553 
3554  template<class Scalar, class LO, class GO, class Node>
3555  size_t
3558  {
3559  return graph_.getNodeNumCols();
3560  }
3561 
3562  template<class Scalar, class LO, class GO, class Node>
3563  GO
3566  {
3567  return graph_.getIndexBase();
3568  }
3569 
3570  template<class Scalar, class LO, class GO, class Node>
3574  {
3575  return graph_.getGlobalNumEntries();
3576  }
3577 
3578  template<class Scalar, class LO, class GO, class Node>
3579  size_t
3582  {
3583  return graph_.getNodeNumEntries();
3584  }
3585 
3586  template<class Scalar, class LO, class GO, class Node>
3587  size_t
3589  getNumEntriesInGlobalRow (GO globalRow) const
3590  {
3591  return graph_.getNumEntriesInGlobalRow(globalRow);
3592  }
3593 
3594  template<class Scalar, class LO, class GO, class Node>
3598  {
3599  return graph_.getGlobalNumDiags();
3600  }
3601 
3602  template<class Scalar, class LO, class GO, class Node>
3603  size_t
3606  {
3607  return graph_.getNodeNumDiags();
3608  }
3609 
3610  template<class Scalar, class LO, class GO, class Node>
3611  size_t
3614  {
3615  return graph_.getGlobalMaxNumRowEntries();
3616  }
3617 
3618  template<class Scalar, class LO, class GO, class Node>
3619  bool
3621  hasColMap() const
3622  {
3623  return graph_.hasColMap();
3624  }
3625 
3626  template<class Scalar, class LO, class GO, class Node>
3627  bool
3630  {
3631  return graph_.isLowerTriangular();
3632  }
3633 
3634  template<class Scalar, class LO, class GO, class Node>
3635  bool
3638  {
3639  return graph_.isUpperTriangular();
3640  }
3641 
3642  template<class Scalar, class LO, class GO, class Node>
3643  bool
3646  {
3647  return graph_.isLocallyIndexed();
3648  }
3649 
3650  template<class Scalar, class LO, class GO, class Node>
3651  bool
3654  {
3655  return graph_.isGloballyIndexed();
3656  }
3657 
3658  template<class Scalar, class LO, class GO, class Node>
3659  bool
3662  {
3663  return graph_.isFillComplete ();
3664  }
3665 
3666  template<class Scalar, class LO, class GO, class Node>
3667  bool
3670  {
3671  return false;
3672  }
3673 
3674 
3675  template<class Scalar, class LO, class GO, class Node>
3676  void
3678  getGlobalRowCopy (GO GlobalRow,
3679  const Teuchos::ArrayView<GO> &Indices,
3680  const Teuchos::ArrayView<Scalar> &Values,
3681  size_t &NumEntries) const
3682  {
3683  TEUCHOS_TEST_FOR_EXCEPTION(
3684  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
3685  "This class doesn't support global matrix indexing.");
3686 
3687  }
3688 
3689  template<class Scalar, class LO, class GO, class Node>
3690  void
3692  getGlobalRowView (GO GlobalRow,
3693  Teuchos::ArrayView<const GO> &indices,
3694  Teuchos::ArrayView<const Scalar> &values) const
3695  {
3696  TEUCHOS_TEST_FOR_EXCEPTION(
3697  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3698  "This class doesn't support global matrix indexing.");
3699 
3700  }
3701 
3702  template<class Scalar, class LO, class GO, class Node>
3703  void
3705  getLocalRowView (LO LocalRow,
3706  Teuchos::ArrayView<const LO> &indices,
3707  Teuchos::ArrayView<const Scalar> &values) const
3708  {
3709  TEUCHOS_TEST_FOR_EXCEPTION(
3710  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: "
3711  "This class doesn't support local matrix indexing.");
3712 
3713  }
3714 
3715 
3716  template<class Scalar, class LO, class GO, class Node>
3717  void
3720  {
3721 #ifdef HAVE_TPETRA_DEBUG
3722  const char prefix[] =
3723  "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: ";
3724 #endif // HAVE_TPETRA_DEBUG
3725 
3726  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3727 
3728  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3729  graph_.getLocalDiagOffsets (diagOffsets);
3730 
3731  // The code below works on host, so use a host View.
3732  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3733  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3734  // We're filling diag on host for now.
3735  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3736 
3737 #ifdef HAVE_TPETRA_DEBUG
3738  TEUCHOS_TEST_FOR_EXCEPTION
3739  (this->template need_sync<Kokkos::HostSpace> (), std::runtime_error,
3740  prefix << "The matrix's data were last modified on device, but have "
3741  "not been sync'd to host. Please sync to host (by calling "
3742  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3743  "method.");
3744 #endif // HAVE_TPETRA_DEBUG
3745 
3746  // NOTE (mfh 26 May 2016) OK to const_cast here, since the host
3747  // version of the data always exists (no lazy allocation for host
3748  // data).
3750  auto vals_host_out =
3751  const_cast<this_type*> (this)->template getValues<Kokkos::HostSpace> ();
3752  Scalar* vals_host_out_raw =
3753  reinterpret_cast<Scalar*> (vals_host_out.ptr_on_device ());
3754 
3755  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3756  size_t rowOffset = 0;
3757  size_t offset = 0;
3758  LO bs = getBlockSize();
3759  for(size_t r=0; r<getNodeNumRows(); r++)
3760  {
3761  // move pointer to start of diagonal block
3762  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3763  for(int b=0; b<bs; b++)
3764  {
3765  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3766  }
3767  // move pointer to start of next block row
3768  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3769  }
3770 
3771  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3772  }
3773 
3774  template<class Scalar, class LO, class GO, class Node>
3775  void
3778  {
3779  TEUCHOS_TEST_FOR_EXCEPTION(
3780  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
3781  "not implemented.");
3782 
3783  }
3784 
3785  template<class Scalar, class LO, class GO, class Node>
3786  void
3789  {
3790  TEUCHOS_TEST_FOR_EXCEPTION(
3791  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
3792  "not implemented.");
3793 
3794  }
3795 
3796  template<class Scalar, class LO, class GO, class Node>
3797  Teuchos::RCP<const Tpetra::RowGraph<LO, GO, Node> >
3799  getGraph() const
3800  {
3801  return graphRCP_;
3802  }
3803 
3804  template<class Scalar, class LO, class GO, class Node>
3808  {
3809  TEUCHOS_TEST_FOR_EXCEPTION(
3810  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
3811  "not implemented.");
3812  }
3813 
3814 } // namespace Experimental
3815 } // namespace Tpetra
3816 
3817 //
3818 // Explicit instantiation macro
3819 //
3820 // Must be expanded from within the Tpetra::Experimental namespace!
3821 //
3822 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3823  template class Experimental::BlockCrsMatrix< S, LO, GO, NODE >;
3824 
3825 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
virtual Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distribution.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual void copyAndPermute(const Tpetra::SrcDistObject &source, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Perform copies and permutations that are local to this process.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
size_t getNodeNumEntries() const
Returns the local number of entries in the graph.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
mv_type mv_
The Tpetra::MultiVector used to represent the data.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
GlobalOrdinal getIndexBase() const
Returns the index base for global indices for this graph.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
global_size_t getGlobalNumRows() const
get the global number of block rows
global_size_t getGlobalNumEntries() const
Returns the global number of entries in the graph.
void putScalar(const Scalar &val)
Fill all entries with the given value val.
Teuchos::RCP< const map_type > getRowMap() const
Returns the Map that describes the row distribution in this graph.
Kokkos::View< impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
std::string errorMessages() const
The current stream of error messages.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
Teuchos::RCP< const map_type > getRangeMap() const
Returns the Map associated with the range of this graph.
One or more distributed dense vectors.
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
MultiVector for multiple degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
global_size_t getGlobalNumRows() const
Returns the number of global rows in the graph.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
virtual Teuchos::RCP< const Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row's entries.
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
size_t getNodeNumDiags() const
Returns the number of local diagonal entries, based on global row/column index comparisons.
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
std::string description() const
One-line description of this object.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const
Returns the current number of entries on this node in the specified global row.
virtual size_t getNodeNumDiags() const
The number of local diagonal entries, based on global row/column index comparisons.
bool isFillComplete() const
Returns true if fillComplete() has been called and the graph is in compute mode.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Returns the communicator.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the graph.
bool locallySameAs(const Map< LocalOrdinal, GlobalOrdinal, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
LocalOrdinal getLocalElement(GlobalOrdinal globalIndex) const
The local index corresponding to the given global index.
int local_ordinal_type
Default value of LocalOrdinal template parameter.
virtual bool isLowerTriangular() const
Whether this matrix is lower triangular.
virtual global_size_t getGlobalNumDiags() const
The number of global diagonal entries, based on global row/column index comparisons.
void scale(const Scalar &val)
Multiply all entries in place by the given value val.
virtual bool checkSizes(const Tpetra::SrcDistObject &source)
Compare the source and target (this) objects for compatibility.
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
size_t getNodeNumCols() const
Returns the number of columns connected to the locally owned rows of this graph.
Teuchos::RCP< node_type > getNode() const
Returns the underlying node.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const map_type > getDomainMap() const
Returns the Map associated with the domain of this graph.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
size_t global_size_t
Global size_t object.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Insert new values that don't currently exist.
Teuchos::RCP< const import_type > getImporter() const
Returns the importer associated with this graph.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
void reorderedGaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const MultiVector< Scalar, LO, GO, Node > &B, const MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distribution.
Teuchos::RCP< const export_type > getExporter() const
Returns the exporter associated with this graph.
virtual Teuchos::RCP< Node > getNode() const
The Kokkos Node instance.
Sets up and executes a communication plan for a Tpetra DistObject.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const MultiVector< Scalar, LO, GO, Node > &B, const MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
size_t getNodeMaxNumRowEntries() const
The maximum number of entries across all rows/columns on this node.
Tpetra::MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node >::mag_type mag_type
Type of a norm result.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
virtual void leftScale(const Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
Replace old value with maximum of magnitudes of old and new values.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
Abstract base class for objects that can be the source of an Import or Export operation.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries across all rows/columns on all nodes.
double scalar_type
Default value of Scalar template parameter.
size_t getNodeNumRows() const
Returns the number of graph rows owned on the calling node.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
Replace existing values with new values.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
Kokkos::View< const impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
bool isLocallyIndexed() const
If graph indices are in the local range, this function returns true. Otherwise, this function returns false.
Replace old values with zero.
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in all rows owned by the calling process.
mv_type getMultiVectorView() const
Get a Tpetra::MultiVector that views this BlockMultiVector's data.
bool isUpperTriangular() const
Whether the graph is locally upper triangular.
virtual bool hasColMap() const
Whether this matrix has a well-defined column map.
size_t getGlobalMaxNumRowEntries() const
Maximum number of entries in all rows over all processes.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
bool isLowerTriangular() const
Whether the graph is locally lower triangular.
virtual bool isUpperTriangular() const
Whether this matrix is upper triangular.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
bool isGloballyIndexed() const
If graph indices are in the global range, this function returns true. Otherwise, this function returns false.
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
virtual void rightScale(const Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
Teuchos::RCP< const map_type > getColMap() const
Returns the Map that describes the column distribution in this graph.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Get the number of entries in the given row (local index).
virtual GO getIndexBase() const
The index base for global indices in this matrix.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
A distributed dense vector.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
device_type::memory_space memory_space
The Kokkos memory space that this class uses.
void doExport(const SrcDistObject &source, const Export< LO, GO, Node > &exporter, CombineMode CM)
Export data into this object using an Export object ("forward mode").
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
virtual bool isFillComplete() const
Whether fillComplete() has been called.
Teuchos::DataAccess getCopyOrView() const
Get whether this has copy (copyOrView = Teuchos::Copy) or view (copyOrView = Teuchos::View) semantics...
BlockMultiVector< Scalar, LO, GO, Node >::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
LO getBlockSize() const
The number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
Declaration of Tpetra::Experimental::BlockCrsMatrix.
bool isNodeLocalElement(LocalOrdinal localIndex) const
Whether the given local index is valid for this Map on the calling process.
global_size_t getGlobalNumCols() const
Returns the number of global columns in the graph.
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
bool hasColMap() const
Whether the graph has a column Map.
global_size_t getGlobalNumDiags() const
Returns the number of global diagonal entries, based on global row/column index comparisons.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
LocalOrdinal getMinLocalIndex() const
The minimum local index.
size_t getNodeNumRows() const
get the local number of block rows
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
local_graph_type getLocalGraph() const
Get the local graph.
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.