Tpetra parallel linear algebra  Version of the Day
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
42 
45 
49 #include "Tpetra_BlockMultiVector.hpp"
50 #include "Tpetra_BlockView.hpp"
51 
52 #include "Teuchos_TimeMonitor.hpp"
53 #ifdef HAVE_TPETRA_DEBUG
54 # include <set>
55 #endif // HAVE_TPETRA_DEBUG
56 
57 //
58 // mfh 25 May 2016: Temporary fix for #393.
59 //
60 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
61 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
62 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
63 // use them in that case, either.
64 //
65 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
66 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
67 // thing for any GCC version.
68 //
69 #if defined(__CUDACC__)
70  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
71 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
73 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74 
75 #elif defined(__GNUC__)
76 
77 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
79 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 
81 #else // some other compiler
82 
83  // Optimistically assume that other compilers aren't broken.
84 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
86 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87 #endif // defined(__CUDACC__), defined(__GNUC__)
88 
89 
90 namespace Tpetra {
91 
92 namespace Impl {
93 
94  template<typename T>
95  struct BlockCrsRowStruct {
96  T totalNumEntries, totalNumBytes, maxRowLength;
97  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
98  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
99  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
100  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
101  };
102 
103  template<typename T>
104  static
105  KOKKOS_INLINE_FUNCTION
106  void operator+=(volatile BlockCrsRowStruct<T> &a,
107  volatile const BlockCrsRowStruct<T> &b) {
108  a.totalNumEntries += b.totalNumEntries;
109  a.totalNumBytes += b.totalNumBytes;
110  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
111  }
112 
113  template<typename T>
114  static
115  KOKKOS_INLINE_FUNCTION
116  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
117  a.totalNumEntries += b.totalNumEntries;
118  a.totalNumBytes += b.totalNumBytes;
119  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
120  }
121 
122  template<typename T, typename ExecSpace>
123  struct BlockCrsReducer {
124  typedef BlockCrsReducer reducer;
125  typedef T value_type;
126  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
127  value_type *value;
128 
129  KOKKOS_INLINE_FUNCTION
130  BlockCrsReducer(value_type &val) : value(&val) {}
131 
132  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
133  KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const { dst += src; }
134  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
135  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
136  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
137  };
138 
139  template<class AlphaCoeffType,
140  class GraphType,
141  class MatrixValuesType,
142  class InVecType,
143  class BetaCoeffType,
144  class OutVecType,
145  bool IsBuiltInType>
146  class BcrsApplyNoTransFunctor {
147  private:
148  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
149  "MatrixValuesType must be a Kokkos::View.");
150  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
151  "OutVecType must be a Kokkos::View.");
152  static_assert (Kokkos::Impl::is_view<InVecType>::value,
153  "InVecType must be a Kokkos::View.");
154  static_assert (std::is_same<MatrixValuesType,
155  typename MatrixValuesType::const_type>::value,
156  "MatrixValuesType must be a const Kokkos::View.");
157  static_assert (std::is_same<OutVecType,
158  typename OutVecType::non_const_type>::value,
159  "OutVecType must be a nonconst Kokkos::View.");
160  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
161  "InVecType must be a const Kokkos::View.");
162  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
163  "MatrixValuesType must be a rank-1 Kokkos::View.");
164  static_assert (static_cast<int> (InVecType::rank) == 1,
165  "InVecType must be a rank-1 Kokkos::View.");
166  static_assert (static_cast<int> (OutVecType::rank) == 1,
167  "OutVecType must be a rank-1 Kokkos::View.");
168  typedef typename MatrixValuesType::non_const_value_type scalar_type;
169 
170  public:
171  typedef typename GraphType::device_type device_type;
172 
174  typedef typename std::remove_const<typename GraphType::data_type>::type
176 
183  void setX (const InVecType& X) { X_ = X; }
184 
191  void setY (const OutVecType& Y) { Y_ = Y; }
192 
193  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
194  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
195 
197  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
198  const GraphType& graph,
199  const MatrixValuesType& val,
200  const local_ordinal_type blockSize,
201  const InVecType& X,
202  const beta_coeff_type& beta,
203  const OutVecType& Y) :
204  alpha_ (alpha),
205  ptr_ (graph.row_map),
206  ind_ (graph.entries),
207  val_ (val),
208  blockSize_ (blockSize),
209  X_ (X),
210  beta_ (beta),
211  Y_ (Y)
212  {}
213 
214  // Dummy team version
215  KOKKOS_INLINE_FUNCTION void
216  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
217  {
218  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
219  }
220 
221  // Range Policy for non built-in types
222  KOKKOS_INLINE_FUNCTION void
223  operator () (const local_ordinal_type& lclRow) const
224  {
229  using Kokkos::Details::ArithTraits;
230  // I'm not writing 'using Kokkos::make_pair;' here, because that
231  // may break builds for users who make the mistake of putting
232  // 'using namespace std;' in the global namespace. Please don't
233  // ever do that! But just in case you do, I'll take this
234  // precaution.
235  using Kokkos::parallel_for;
236  using Kokkos::subview;
237  typedef typename decltype (ptr_)::non_const_value_type offset_type;
238  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
239  Kokkos::LayoutRight,
240  device_type,
241  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
242  little_block_type;
243 
244  const offset_type Y_ptBeg = lclRow * blockSize_;
245  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
246  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
247 
248  // This version of the code does not use temporary storage.
249  // Each thread writes to its own block of the target vector.
250  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
251  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
252  }
253  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
254  SCAL (beta_, Y_cur);
255  }
256 
257  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
258  const offset_type blkBeg = ptr_[lclRow];
259  const offset_type blkEnd = ptr_[lclRow+1];
260  // Precompute to save integer math in the inner loop.
261  const offset_type bs2 = blockSize_ * blockSize_;
262  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
263  ++absBlkOff) {
264  little_block_type A_cur (val_.data () + absBlkOff * bs2,
265  blockSize_, blockSize_);
266  const offset_type X_blkCol = ind_[absBlkOff];
267  const offset_type X_ptBeg = X_blkCol * blockSize_;
268  const offset_type X_ptEnd = X_ptBeg + blockSize_;
269  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
270 
271  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
272  } // for each entry in current local block row of matrix
273  }
274  }
275 
276  private:
277  alpha_coeff_type alpha_;
278  typename GraphType::row_map_type::const_type ptr_;
279  typename GraphType::entries_type::const_type ind_;
280  MatrixValuesType val_;
281  local_ordinal_type blockSize_;
282  InVecType X_;
283  beta_coeff_type beta_;
284  OutVecType Y_;
285  };
286 
287  template<class AlphaCoeffType,
288  class GraphType,
289  class MatrixValuesType,
290  class InVecType,
291  class BetaCoeffType,
292  class OutVecType>
293  class BcrsApplyNoTransFunctor<AlphaCoeffType,
294  GraphType,
295  MatrixValuesType,
296  InVecType,
297  BetaCoeffType,
298  OutVecType,
299  true> {
300  private:
301  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
302  "MatrixValuesType must be a Kokkos::View.");
303  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
304  "OutVecType must be a Kokkos::View.");
305  static_assert (Kokkos::Impl::is_view<InVecType>::value,
306  "InVecType must be a Kokkos::View.");
307  static_assert (std::is_same<MatrixValuesType,
308  typename MatrixValuesType::const_type>::value,
309  "MatrixValuesType must be a const Kokkos::View.");
310  static_assert (std::is_same<OutVecType,
311  typename OutVecType::non_const_type>::value,
312  "OutVecType must be a nonconst Kokkos::View.");
313  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
314  "InVecType must be a const Kokkos::View.");
315  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
316  "MatrixValuesType must be a rank-1 Kokkos::View.");
317  static_assert (static_cast<int> (InVecType::rank) == 1,
318  "InVecType must be a rank-1 Kokkos::View.");
319  static_assert (static_cast<int> (OutVecType::rank) == 1,
320  "OutVecType must be a rank-1 Kokkos::View.");
321  typedef typename MatrixValuesType::non_const_value_type scalar_type;
322 
323  public:
324  typedef typename GraphType::device_type device_type;
325 
327  typedef typename std::remove_const<typename GraphType::data_type>::type
329 
336  void setX (const InVecType& X) { X_ = X; }
337 
344  void setY (const OutVecType& Y) { Y_ = Y; }
345 
346  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
347  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
348 
350  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
351  const GraphType& graph,
352  const MatrixValuesType& val,
353  const local_ordinal_type blockSize,
354  const InVecType& X,
355  const beta_coeff_type& beta,
356  const OutVecType& Y) :
357  alpha_ (alpha),
358  ptr_ (graph.row_map),
359  ind_ (graph.entries),
360  val_ (val),
361  blockSize_ (blockSize),
362  X_ (X),
363  beta_ (beta),
364  Y_ (Y)
365  {}
366 
367  // dummy Range version
368  KOKKOS_INLINE_FUNCTION void
369  operator () (const local_ordinal_type& lclRow) const
370  {
371  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
372  }
373 
374  // Team policy for built-in types
375  KOKKOS_INLINE_FUNCTION void
376  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
377  {
378  const local_ordinal_type lclRow = member.league_rank();
379 
380  using Kokkos::Details::ArithTraits;
381  // I'm not writing 'using Kokkos::make_pair;' here, because that
382  // may break builds for users who make the mistake of putting
383  // 'using namespace std;' in the global namespace. Please don't
384  // ever do that! But just in case you do, I'll take this
385  // precaution.
386  using Kokkos::parallel_for;
387  using Kokkos::subview;
388  typedef typename decltype (ptr_)::non_const_value_type offset_type;
389  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
390  Kokkos::LayoutRight,
391  device_type,
392  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
393  little_block_type;
394 
395  const offset_type Y_ptBeg = lclRow * blockSize_;
396  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
397  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
398 
399  // This version of the code does not use temporary storage.
400  // Each thread writes to its own block of the target vector.
401  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
402  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
403  [&](const local_ordinal_type &i) {
404  Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
405  });
406  }
407  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
408  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
409  [&](const local_ordinal_type &i) {
410  Y_cur(i) *= beta_;
411  });
412  }
413  member.team_barrier();
414 
415  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
416  const offset_type blkBeg = ptr_[lclRow];
417  const offset_type blkEnd = ptr_[lclRow+1];
418  // Precompute to save integer math in the inner loop.
419  const offset_type bs2 = blockSize_ * blockSize_;
420  little_block_type A_cur (val_.data (), blockSize_, blockSize_);
421  auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
422  Kokkos::parallel_for
423  (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
424  [&](const local_ordinal_type &absBlkOff) {
425  A_cur.assign_data(val_.data () + absBlkOff * bs2);
426  const offset_type X_blkCol = ind_[absBlkOff];
427  const offset_type X_ptBeg = X_blkCol * blockSize_;
428  X_cur.assign_data(&X_(X_ptBeg));
429 
430  Kokkos::parallel_for
431  (Kokkos::ThreadVectorRange(member, blockSize_),
432  [&](const local_ordinal_type &k0) {
433  scalar_type val(0);
434  for (local_ordinal_type k1=0;k1<blockSize_;++k1)
435  val += A_cur(k0,k1)*X_cur(k1);
436 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
437  // host space team size is always 1
438  Y_cur(k0) += alpha_*val;
439 #else
440  // cuda space team size can be larger than 1
441  // atomic is not allowed for sacado type;
442  // thus this needs to be specialized or
443  // sacado atomic should be supported.
444  Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
445 #endif
446  });
447  }); // for each entry in current local block row of matrix
448  }
449  }
450 
451  private:
452  alpha_coeff_type alpha_;
453  typename GraphType::row_map_type::const_type ptr_;
454  typename GraphType::entries_type::const_type ind_;
455  MatrixValuesType val_;
456  local_ordinal_type blockSize_;
457  InVecType X_;
458  beta_coeff_type beta_;
459  OutVecType Y_;
460  };
461 
462 
463  template<class AlphaCoeffType,
464  class GraphType,
465  class MatrixValuesType,
466  class InMultiVecType,
467  class BetaCoeffType,
468  class OutMultiVecType>
469  void
470  bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
471  const GraphType& graph,
472  const MatrixValuesType& val,
473  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
474  const InMultiVecType& X,
475  const BetaCoeffType& beta,
476  const OutMultiVecType& Y
477  )
478  {
479  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
480  "MatrixValuesType must be a Kokkos::View.");
481  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
482  "OutMultiVecType must be a Kokkos::View.");
483  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
484  "InMultiVecType must be a Kokkos::View.");
485  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
486  "MatrixValuesType must be a rank-1 Kokkos::View.");
487  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
488  "OutMultiVecType must be a rank-2 Kokkos::View.");
489  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
490  "InMultiVecType must be a rank-2 Kokkos::View.");
491 
492  typedef typename GraphType::device_type::execution_space execution_space;
493  typedef typename MatrixValuesType::const_type matrix_values_type;
494  typedef typename OutMultiVecType::non_const_type out_multivec_type;
495  typedef typename InMultiVecType::const_type in_multivec_type;
496  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
497  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
498  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
499 
500  constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
501 
502  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
503  static_cast<LO> (0) :
504  static_cast<LO> (graph.row_map.extent (0) - 1);
505  const LO numVecs = Y.extent (1);
506  if (numLocalMeshRows == 0 || numVecs == 0) {
507  return; // code below doesn't handle numVecs==0 correctly
508  }
509 
510  // These assignments avoid instantiating the functor extra times
511  // unnecessarily, e.g., for X const vs. nonconst. We only need the
512  // X const case, so only instantiate for that case.
513  in_multivec_type X_in = X;
514  out_multivec_type Y_out = Y;
515 
516  // The functor only knows how to handle one vector at a time, and it
517  // expects 1-D Views. Thus, we need to know the type of each column
518  // of X and Y.
519  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
520  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
521  typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
522  matrix_values_type, in_vec_type, beta_type, out_vec_type,
523  is_builtin_type_enabled> functor_type;
524 
525  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
526  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
527 
528  // Compute the first column of Y.
529  if (is_builtin_type_enabled) {
530  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
531  // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
532  typedef Kokkos::TeamPolicy<execution_space> policy_type;
533  policy_type policy(1,1);
534 #if defined(KOKKOS_ENABLE_CUDA)
535  constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
536 #else
537  constexpr bool is_cuda = false;
538 #endif // defined(KOKKOS_ENABLE_CUDA)
539  if (is_cuda) {
540  LO team_size = 8, vector_size = 1;
541  if (blockSize <= 5) vector_size = 4;
542  else if (blockSize <= 9) vector_size = 8;
543  else if (blockSize <= 12) vector_size = 12;
544  else if (blockSize <= 20) vector_size = 20;
545  else vector_size = 20;
546  policy = policy_type(numLocalMeshRows, team_size, vector_size);
547  } else {
548  policy = policy_type(numLocalMeshRows, 1, 1);
549  }
550  Kokkos::parallel_for (policy, functor);
551 
552  // Compute the remaining columns of Y.
553  for (LO j = 1; j < numVecs; ++j) {
554  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
555  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
556  functor.setX (X_j);
557  functor.setY (Y_j);
558  Kokkos::parallel_for (policy, functor);
559  }
560  } else {
561  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
562  Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
563  Kokkos::parallel_for (policy, functor);
564  for (LO j = 1; j < numVecs; ++j) {
565  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
566  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
567  functor.setX (X_j);
568  functor.setY (Y_j);
569  Kokkos::parallel_for (policy, functor);
570  }
571  }
572  }
573 } // namespace Impl
574 
575 namespace { // (anonymous)
576 
577 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
578 // version that takes two Kokkos::View arguments).
579 template<class Scalar, class LO, class GO, class Node>
580 class GetLocalDiagCopy {
581 public:
582  typedef typename Node::device_type device_type;
583  typedef size_t diag_offset_type;
584  typedef Kokkos::View<const size_t*, device_type,
585  Kokkos::MemoryUnmanaged> diag_offsets_type;
586  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
587  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
588  typedef typename local_graph_device_type::row_map_type row_offsets_type;
589  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
590  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
591  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
592 
593  // Constructor
594  GetLocalDiagCopy (const diag_type& diag,
595  const values_type& val,
596  const diag_offsets_type& diagOffsets,
597  const row_offsets_type& ptr,
598  const LO blockSize) :
599  diag_ (diag),
600  diagOffsets_ (diagOffsets),
601  ptr_ (ptr),
602  blockSize_ (blockSize),
603  offsetPerBlock_ (blockSize_*blockSize_),
604  val_(val)
605  {}
606 
607  KOKKOS_INLINE_FUNCTION void
608  operator() (const LO& lclRowInd) const
609  {
610  using Kokkos::ALL;
611 
612  // Get row offset
613  const size_t absOffset = ptr_[lclRowInd];
614 
615  // Get offset relative to start of row
616  const size_t relOffset = diagOffsets_[lclRowInd];
617 
618  // Get the total offset
619  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
620 
621  // Get a view of the block. BCRS currently uses LayoutRight
622  // regardless of the device.
623  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
624  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
625  const_little_block_type;
626  const_little_block_type D_in (val_.data () + pointOffset,
627  blockSize_, blockSize_);
628  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
629  COPY (D_in, D_out);
630  }
631 
632  private:
633  diag_type diag_;
634  diag_offsets_type diagOffsets_;
635  row_offsets_type ptr_;
636  LO blockSize_;
637  LO offsetPerBlock_;
638  values_type val_;
639  };
640 } // namespace (anonymous)
641 
642  template<class Scalar, class LO, class GO, class Node>
643  std::ostream&
644  BlockCrsMatrix<Scalar, LO, GO, Node>::
645  markLocalErrorAndGetStream ()
646  {
647  * (this->localError_) = true;
648  if ((*errs_).is_null ()) {
649  *errs_ = Teuchos::rcp (new std::ostringstream ());
650  }
651  return **errs_;
652  }
653 
654  template<class Scalar, class LO, class GO, class Node>
656  BlockCrsMatrix () :
657  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
658  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
659  blockSize_ (static_cast<LO> (0)),
660  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
661  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
662  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
663  offsetPerBlock_ (0),
664  localError_ (new bool (false)),
665  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
666  {
667  }
668 
669  template<class Scalar, class LO, class GO, class Node>
671  BlockCrsMatrix (const crs_graph_type& graph,
672  const LO blockSize) :
673  dist_object_type (graph.getMap ()),
674  graph_ (graph),
675  rowMeshMap_ (* (graph.getRowMap ())),
676  blockSize_ (blockSize),
677  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
678  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
679  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
680  offsetPerBlock_ (blockSize * blockSize),
681  localError_ (new bool (false)),
682  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
683  {
685  TEUCHOS_TEST_FOR_EXCEPTION(
686  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
687  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
688  "rows (isSorted() is false). This class assumes sorted rows.");
689 
690  graphRCP_ = Teuchos::rcpFromRef(graph_);
691 
692  // Trick to test whether LO is nonpositive, without a compiler
693  // warning in case LO is unsigned (which is generally a bad idea
694  // anyway). I don't promise that the trick works, but it
695  // generally does with gcc at least, in my experience.
696  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
697  TEUCHOS_TEST_FOR_EXCEPTION(
698  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
699  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
700  " <= 0. The block size must be positive.");
701 
702  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
703  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
704 
705  {
706  // These are rcp
707  const auto domainPointMap = getDomainMap();
708  const auto colPointMap = Teuchos::rcp
709  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
710  *pointImporter_ = Teuchos::rcp
711  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
712  }
713  {
714  auto local_graph_h = graph.getLocalGraphHost ();
715  auto ptr_h = local_graph_h.row_map;
716  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
717  Kokkos::deep_copy(ptrHost_, ptr_h);
718 
719  auto ind_h = local_graph_h.entries;
720  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
721  Kokkos::deep_copy (indHost_, ind_h);
722 
723  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
724  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
725  }
726  }
727 
728  template<class Scalar, class LO, class GO, class Node>
730  BlockCrsMatrix (const crs_graph_type& graph,
731  const map_type& domainPointMap,
732  const map_type& rangePointMap,
733  const LO blockSize) :
734  dist_object_type (graph.getMap ()),
735  graph_ (graph),
736  rowMeshMap_ (* (graph.getRowMap ())),
737  domainPointMap_ (domainPointMap),
738  rangePointMap_ (rangePointMap),
739  blockSize_ (blockSize),
740  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
741  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
742  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
743  offsetPerBlock_ (blockSize * blockSize),
744  localError_ (new bool (false)),
745  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
746  {
747  TEUCHOS_TEST_FOR_EXCEPTION(
748  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
749  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
750  "rows (isSorted() is false). This class assumes sorted rows.");
751 
752  graphRCP_ = Teuchos::rcpFromRef(graph_);
753 
754  // Trick to test whether LO is nonpositive, without a compiler
755  // warning in case LO is unsigned (which is generally a bad idea
756  // anyway). I don't promise that the trick works, but it
757  // generally does with gcc at least, in my experience.
758  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
759  TEUCHOS_TEST_FOR_EXCEPTION(
760  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
761  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
762  " <= 0. The block size must be positive.");
763  {
764  // These are rcp
765  const auto rcpDomainPointMap = getDomainMap();
766  const auto colPointMap = Teuchos::rcp
767  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
768  *pointImporter_ = Teuchos::rcp
769  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
770  }
771  {
772  auto local_graph_h = graph.getLocalGraphHost ();
773  auto ptr_h = local_graph_h.row_map;
774  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
775  Kokkos::deep_copy(ptrHost_, ptr_h);
776 
777  auto ind_h = local_graph_h.entries;
778  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
779  Kokkos::deep_copy (indHost_, ind_h);
780 
781  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
782  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
783  }
784  }
785 
786  template<class Scalar, class LO, class GO, class Node>
787  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
789  getDomainMap () const
790  { // Copy constructor of map_type does a shallow copy.
791  // We're only returning by RCP for backwards compatibility.
792  return Teuchos::rcp (new map_type (domainPointMap_));
793  }
794 
795  template<class Scalar, class LO, class GO, class Node>
796  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
798  getRangeMap () const
799  { // Copy constructor of map_type does a shallow copy.
800  // We're only returning by RCP for backwards compatibility.
801  return Teuchos::rcp (new map_type (rangePointMap_));
802  }
803 
804  template<class Scalar, class LO, class GO, class Node>
805  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
807  getRowMap () const
808  {
809  return graph_.getRowMap();
810  }
811 
812  template<class Scalar, class LO, class GO, class Node>
813  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
815  getColMap () const
816  {
817  return graph_.getColMap();
818  }
819 
820  template<class Scalar, class LO, class GO, class Node>
823  getGlobalNumRows() const
824  {
825  return graph_.getGlobalNumRows();
826  }
827 
828  template<class Scalar, class LO, class GO, class Node>
829  size_t
831  getNodeNumRows() const
832  {
833  return graph_.getNodeNumRows();
834  }
835 
836  template<class Scalar, class LO, class GO, class Node>
837  size_t
840  {
841  return graph_.getNodeMaxNumRowEntries();
842  }
843 
844  template<class Scalar, class LO, class GO, class Node>
845  void
847  apply (const mv_type& X,
848  mv_type& Y,
849  Teuchos::ETransp mode,
850  Scalar alpha,
851  Scalar beta) const
852  {
854  TEUCHOS_TEST_FOR_EXCEPTION(
855  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
856  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
857  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
858  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
859 
860  BMV X_view;
861  BMV Y_view;
862  const LO blockSize = getBlockSize ();
863  try {
864  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
865  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
866  }
867  catch (std::invalid_argument& e) {
868  TEUCHOS_TEST_FOR_EXCEPTION(
869  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
870  "apply: Either the input MultiVector X or the output MultiVector Y "
871  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
872  "graph. BlockMultiVector's constructor threw the following exception: "
873  << e.what ());
874  }
875 
876  try {
877  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
878  // either that or mark fields of this class as 'mutable'. The
879  // problem is that applyBlock wants to do lazy initialization of
880  // temporary block multivectors.
881  const_cast<this_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
882  } catch (std::invalid_argument& e) {
883  TEUCHOS_TEST_FOR_EXCEPTION(
884  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
885  "apply: The implementation method applyBlock complained about having "
886  "an invalid argument. It reported the following: " << e.what ());
887  } catch (std::logic_error& e) {
888  TEUCHOS_TEST_FOR_EXCEPTION(
889  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
890  "apply: The implementation method applyBlock complained about a "
891  "possible bug in its implementation. It reported the following: "
892  << e.what ());
893  } catch (std::exception& e) {
894  TEUCHOS_TEST_FOR_EXCEPTION(
895  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
896  "apply: The implementation method applyBlock threw an exception which "
897  "is neither std::invalid_argument nor std::logic_error, but is a "
898  "subclass of std::exception. It reported the following: "
899  << e.what ());
900  } catch (...) {
901  TEUCHOS_TEST_FOR_EXCEPTION(
902  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
903  "apply: The implementation method applyBlock threw an exception which "
904  "is not an instance of a subclass of std::exception. This probably "
905  "indicates a bug. Please report this to the Tpetra developers.");
906  }
907  }
908 
909  template<class Scalar, class LO, class GO, class Node>
910  void
914  Teuchos::ETransp mode,
915  const Scalar alpha,
916  const Scalar beta)
917  {
918  TEUCHOS_TEST_FOR_EXCEPTION(
919  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
920  "Tpetra::BlockCrsMatrix::applyBlock: "
921  "X and Y have different block sizes. X.getBlockSize() = "
922  << X.getBlockSize () << " != Y.getBlockSize() = "
923  << Y.getBlockSize () << ".");
924 
925  if (mode == Teuchos::NO_TRANS) {
926  applyBlockNoTrans (X, Y, alpha, beta);
927  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
928  applyBlockTrans (X, Y, mode, alpha, beta);
929  } else {
930  TEUCHOS_TEST_FOR_EXCEPTION(
931  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
932  "applyBlock: Invalid 'mode' argument. Valid values are "
933  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
934  }
935  }
936 
937  template<class Scalar, class LO, class GO, class Node>
938  void
940  setAllToScalar (const Scalar& alpha)
941  {
942  auto val_d = val_.getDeviceView(Access::OverwriteAll);
943  Kokkos::deep_copy(val_d, alpha);
944  }
945 
946  template<class Scalar, class LO, class GO, class Node>
947  LO
949  replaceLocalValues (const LO localRowInd,
950  const LO colInds[],
951  const Scalar vals[],
952  const LO numColInds) const
953  {
954  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
955  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
956  ptrdiff_t * offsets = offsets_host_view.data();
957  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
958  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
959  return validCount;
960  }
961 
962  template <class Scalar, class LO, class GO, class Node>
963  void
965  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
966  Kokkos::MemoryUnmanaged>& offsets) const
967  {
968  graph_.getLocalDiagOffsets (offsets);
969  }
970 
971  template <class Scalar, class LO, class GO, class Node>
972  void
974  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
975  Kokkos::MemoryUnmanaged>& diag,
976  const Kokkos::View<const size_t*, device_type,
977  Kokkos::MemoryUnmanaged>& offsets) const
978  {
979  using Kokkos::parallel_for;
980  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
981 
982  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
983  const LO blockSize = this->getBlockSize ();
984  TEUCHOS_TEST_FOR_EXCEPTION
985  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
986  static_cast<LO> (diag.extent (1)) < blockSize ||
987  static_cast<LO> (diag.extent (2)) < blockSize,
988  std::invalid_argument, prefix <<
989  "The input Kokkos::View is not big enough to hold all the data.");
990  TEUCHOS_TEST_FOR_EXCEPTION
991  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
992  prefix << "offsets.size() = " << offsets.size () << " < local number of "
993  "diagonal blocks " << lclNumMeshRows << ".");
994 
995  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
996  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
997 
998  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
999  // we reserve the right to do lazy allocation of device data. (We
1000  // don't plan to do lazy allocation for host data; the host
1001  // version of the data always exists.)
1002  auto val_d = val_.getDeviceView(Access::ReadOnly);
1003  parallel_for (policy_type (0, lclNumMeshRows),
1004  functor_type (diag, val_d, offsets,
1005  graph_.getLocalGraphDevice ().row_map, blockSize_));
1006  }
1007 
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
template <class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
                    Kokkos::MemoryUnmanaged>& diag,
                  const Teuchos::ArrayView<const size_t>& offsets) const
{
  // Wrap the caller's (host) offsets in an unmanaged View, mirror it
  // into device memory, and call the device-offsets overload.
  //
  // The offsets are read-only input, so there is nothing to copy back
  // afterwards.  (The previous version did a deep_copy from the
  // device mirror back into the caller's array, which both wasted a
  // transfer and wrote through the const_cast into a const input.)
  auto offsets_view_host = Kokkos::View<size_t*, Kokkos::HostSpace>
    (const_cast<size_t*> (offsets.getRawPtr ()), offsets.size ());
  auto offsets_view_device = Kokkos::create_mirror_view_and_copy
    (typename device_type::memory_space (), offsets_view_host);
  getLocalDiagCopy (diag, offsets_view_device);
}
#endif
1022 
1023  template<class Scalar, class LO, class GO, class Node>
1024  LO
1026  absMaxLocalValues (const LO localRowInd,
1027  const LO colInds[],
1028  const Scalar vals[],
1029  const LO numColInds) const
1030  {
1031  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1032  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1033  ptrdiff_t * offsets = offsets_host_view.data();
1034  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1035  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1036  return validCount;
1037  }
1038 
1039 
1040  template<class Scalar, class LO, class GO, class Node>
1041  LO
1043  sumIntoLocalValues (const LO localRowInd,
1044  const LO colInds[],
1045  const Scalar vals[],
1046  const LO numColInds) const
1047  {
1048  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
1049  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
1050  ptrdiff_t * offsets = offsets_host_view.data();
1051  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
1052  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
1053  return validCount;
1054  }
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
template<class Scalar, class LO, class GO, class Node>
LO
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalRowView (const LO localRowInd,
                 const LO*& colInds,
                 Scalar*& vals,
                 LO& numInds) const
{
  // If this process does not own the row, return null views and the
  // "invalid" flag value.
  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
    colInds = NULL;
    vals = NULL;
    numInds = 0;
    return Teuchos::OrdinalTraits<LO>::invalid ();
  }

  // The row's column indices start at the row's block offset.
  const size_t rowBegin = ptrHost_[localRowInd];
  colInds = indHost_.data () + rowBegin;
  numInds = ptrHost_[localRowInd + 1] - rowBegin;

  // Expose the row's block values as a raw nonconst Scalar pointer.
  auto rowVals = getValuesHost (localRowInd);
  impl_scalar_type* const rawVals =
    const_cast<impl_scalar_type*> (rowVals.data ());
  vals = reinterpret_cast<Scalar*> (rawVals);
  return 0; // indicates no error
}
#endif
1083 
1084  template<class Scalar, class LO, class GO, class Node>
1085  void
1087  getLocalRowCopy (LO LocalRow,
1088  nonconst_local_inds_host_view_type &Indices,
1089  nonconst_values_host_view_type &Values,
1090  size_t& NumEntries) const
1091  {
1092  auto vals = getValuesHost(LocalRow);
1093  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
1094  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
1095  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1096  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1097  << numInds << " row entries");
1098  }
1099  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
1100  for (LO i=0; i<numInds; ++i) {
1101  Indices[i] = colInds[i];
1102  }
1103  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1104  Values[i] = vals[i];
1105  }
1106  NumEntries = numInds;
1107  }
1108 
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
template<class Scalar, class LO, class GO, class Node>
void
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalRowCopy (LO LocalRow,
                 const Teuchos::ArrayView<LO>& Indices,
                 const Teuchos::ArrayView<Scalar>& Values,
                 size_t &NumEntries) const
{
  // Deprecated Teuchos::ArrayView overload; same logic as the
  // Kokkos::View overload.
  auto rowVals = getValuesHost (LocalRow);
  const LO numBlocks = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
  const LO numScalars = numBlocks * blockSize_ * blockSize_;

  if (numBlocks > (LO) Indices.size () || numScalars > (LO) Values.size ()) {
    TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
      "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
      << numBlocks << " row entries");
  }

  const LO* const blockCols = indHost_.data () + ptrHost_(LocalRow);
  for (LO i = 0; i < numBlocks; ++i) {
    Indices[i] = blockCols[i];
  }
  for (LO i = 0; i < numScalars; ++i) {
    Values[i] = rowVals[i];
  }
  NumEntries = numBlocks;
}
#endif
1135 
1136  template<class Scalar, class LO, class GO, class Node>
1137  LO
1139  getLocalRowOffsets (const LO localRowInd,
1140  ptrdiff_t offsets[],
1141  const LO colInds[],
1142  const LO numColInds) const
1143  {
1144  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1145  // We got no offsets, because the input local row index is
1146  // invalid on the calling process. That may not be an error, if
1147  // numColInds is zero anyway; it doesn't matter. This is the
1148  // advantage of returning the number of valid indices.
1149  return static_cast<LO> (0);
1150  }
1151 
1152  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1153  LO hint = 0; // Guess for the relative offset into the current row
1154  LO validCount = 0; // number of valid column indices in colInds
1155 
1156  for (LO k = 0; k < numColInds; ++k) {
1157  const LO relBlockOffset =
1158  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1159  if (relBlockOffset != LINV) {
1160  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1161  hint = relBlockOffset + 1;
1162  ++validCount;
1163  }
1164  }
1165  return validCount;
1166  }
1167 
1168 
1169  template<class Scalar, class LO, class GO, class Node>
1170  LO
1172  replaceLocalValuesByOffsets (const LO localRowInd,
1173  const ptrdiff_t offsets[],
1174  const Scalar vals[],
1175  const LO numOffsets) const
1176  {
1177  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1178  // We modified no values, because the input local row index is
1179  // invalid on the calling process. That may not be an error, if
1180  // numColInds is zero anyway; it doesn't matter. This is the
1181  // advantage of returning the number of valid indices.
1182  return static_cast<LO> (0);
1183  }
1184  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1186  auto val_out = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
1187  impl_scalar_type* vOut = val_out.data();
1188 
1189  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1190  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
1191  size_t pointOffset = 0; // Current offset into input values
1192  LO validCount = 0; // number of valid offsets
1193 
1194  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1195  const size_t blockOffset = offsets[k]*perBlockSize;
1196  if (offsets[k] != STINV) {
1197  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1198  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1199  COPY (A_new, A_old);
1200  ++validCount;
1201  }
1202  }
1203  return validCount;
1204  }
1205 
1206 
1207  template<class Scalar, class LO, class GO, class Node>
1208  LO
1210  absMaxLocalValuesByOffsets (const LO localRowInd,
1211  const ptrdiff_t offsets[],
1212  const Scalar vals[],
1213  const LO numOffsets) const
1214  {
1215  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1216  // We modified no values, because the input local row index is
1217  // invalid on the calling process. That may not be an error, if
1218  // numColInds is zero anyway; it doesn't matter. This is the
1219  // advantage of returning the number of valid indices.
1220  return static_cast<LO> (0);
1221  }
1222  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1223  auto val_out = getValuesHost(localRowInd);
1224  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
1225 
1226  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1227  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1228  size_t pointOffset = 0; // Current offset into input values
1229  LO validCount = 0; // number of valid offsets
1230 
1231  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1232  const size_t blockOffset = offsets[k]*perBlockSize;
1233  if (blockOffset != STINV) {
1234  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1235  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1236  ::Tpetra::Impl::absMax (A_old, A_new);
1237  ++validCount;
1238  }
1239  }
1240  return validCount;
1241  }
1242 
1243 
1244  template<class Scalar, class LO, class GO, class Node>
1245  LO
1247  sumIntoLocalValuesByOffsets (const LO localRowInd,
1248  const ptrdiff_t offsets[],
1249  const Scalar vals[],
1250  const LO numOffsets) const
1251  {
1252  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1253  // We modified no values, because the input local row index is
1254  // invalid on the calling process. That may not be an error, if
1255  // numColInds is zero anyway; it doesn't matter. This is the
1256  // advantage of returning the number of valid indices.
1257  return static_cast<LO> (0);
1258  }
1259  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1260  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1262  auto val_out = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
1263  impl_scalar_type* vOut = val_out.data();
1264 
1265  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1266  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1267  size_t pointOffset = 0; // Current offset into input values
1268  LO validCount = 0; // number of valid offsets
1269 
1270  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1271  const size_t blockOffset = offsets[k]*perBlockSize;
1272  if (blockOffset != STINV) {
1273  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
1274  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
1275  AXPY (ONE, A_new, A_old);
1276  ++validCount;
1277  }
1278  }
1279  return validCount;
1280  }
1281 
1282  template<class Scalar, class LO, class GO, class Node>
1284  impl_scalar_type_dualview::t_host::const_type
1286  getValuesHost () const
1287  {
1288  return val_.getHostView(Access::ReadOnly);
1289  }
1290 
1291  template<class Scalar, class LO, class GO, class Node>
1292  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1293  impl_scalar_type_dualview::t_dev::const_type
1294  BlockCrsMatrix<Scalar, LO, GO, Node>::
1295  getValuesDevice () const
1296  {
1297  return val_.getDeviceView(Access::ReadOnly);
1298  }
1299 
1300  template<class Scalar, class LO, class GO, class Node>
1301  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1302  impl_scalar_type_dualview::t_host
1304  getValuesHostNonConst () const
1305  {
1306  return val_.getHostView(Access::ReadWrite);
1307  }
1308 
1309  template<class Scalar, class LO, class GO, class Node>
1311  impl_scalar_type_dualview::t_dev
1313  getValuesDeviceNonConst () const
1314  {
1315  return val_.getDeviceView(Access::ReadWrite);
1316  }
1317 
1318  template<class Scalar, class LO, class GO, class Node>
1319  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
1320  impl_scalar_type_dualview::t_host::const_type
1322  getValuesHost (const LO& lclRow) const
1323  {
1324  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1325  auto val = val_.getHostView(Access::ReadOnly);
1326  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1327  return r_val;
1328  }
1329 
1330  template<class Scalar, class LO, class GO, class Node>
1332  impl_scalar_type_dualview::t_dev::const_type
1334  getValuesDevice (const LO& lclRow) const
1335  {
1336  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1337  auto val = val_.getDeviceView(Access::ReadOnly);
1338  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1339  return r_val;
1340  }
1341 
1342  template<class Scalar, class LO, class GO, class Node>
1345  getValuesHostNonConst (const LO& lclRow)
1346  {
1347  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1348  auto val = val_.getHostView(Access::ReadWrite);
1349  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1350  return r_val;
1351  }
1352 
1353  template<class Scalar, class LO, class GO, class Node>
1356  getValuesDeviceNonConst (const LO& lclRow)
1357  {
1358  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1359  auto val = val_.getDeviceView(Access::ReadWrite);
1360  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
1361  return r_val;
1362  }
1363 
1364  template<class Scalar, class LO, class GO, class Node>
1365  size_t
1367  getNumEntriesInLocalRow (const LO localRowInd) const
1368  {
1369  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1370  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1371  return static_cast<LO> (0); // the calling process doesn't have that row
1372  } else {
1373  return static_cast<LO> (numEntInGraph);
1374  }
1375  }
1376 
1377  template<class Scalar, class LO, class GO, class Node>
1378  void
1382  const Teuchos::ETransp mode,
1383  const Scalar alpha,
1384  const Scalar beta)
1385  {
1386  (void) X;
1387  (void) Y;
1388  (void) mode;
1389  (void) alpha;
1390  (void) beta;
1391 
1392  TEUCHOS_TEST_FOR_EXCEPTION(
1393  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
1394  "transpose and conjugate transpose modes are not yet implemented.");
1395  }
1396 
1397  template<class Scalar, class LO, class GO, class Node>
1398  void
1399  BlockCrsMatrix<Scalar, LO, GO, Node>::
1400  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1401  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1402  const Scalar alpha,
1403  const Scalar beta)
1404  {
1405  using Teuchos::RCP;
1406  using Teuchos::rcp;
1407  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1408  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1409  const Scalar zero = STS::zero ();
1410  const Scalar one = STS::one ();
1411  RCP<const import_type> import = graph_.getImporter ();
1412  // "export" is a reserved C++ keyword, so we can't use it.
1413  RCP<const export_type> theExport = graph_.getExporter ();
1414  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1415 
1416  if (alpha == zero) {
1417  if (beta == zero) {
1418  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1419  }
1420  else if (beta != one) {
1421  Y.scale (beta);
1422  }
1423  }
1424  else { // alpha != 0
1425  const BMV* X_colMap = NULL;
1426  if (import.is_null ()) {
1427  try {
1428  X_colMap = &X;
1429  }
1430  catch (std::exception& e) {
1431  TEUCHOS_TEST_FOR_EXCEPTION
1432  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1433  "operator= threw an exception: " << std::endl << e.what ());
1434  }
1435  }
1436  else {
1437  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1438  // Y_rowMap_ below. This lets us do lazy initialization
1439  // correctly with view semantics of BlockCrsMatrix. All views
1440  // of this BlockCrsMatrix have the same outer pointer. That
1441  // way, we can set the inner pointer in one view, and all
1442  // other views will see it.
1443  if ((*X_colMap_).is_null () ||
1444  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1445  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1446  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1447  static_cast<LO> (X.getNumVectors ())));
1448  }
1449  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1450  **pointImporter_,
1451  ::Tpetra::REPLACE);
1452  try {
1453  X_colMap = &(**X_colMap_);
1454  }
1455  catch (std::exception& e) {
1456  TEUCHOS_TEST_FOR_EXCEPTION
1457  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1458  "operator= threw an exception: " << std::endl << e.what ());
1459  }
1460  }
1461 
1462  BMV* Y_rowMap = NULL;
1463  if (theExport.is_null ()) {
1464  Y_rowMap = &Y;
1465  }
1466  else if ((*Y_rowMap_).is_null () ||
1467  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1468  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1469  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1470  static_cast<LO> (X.getNumVectors ())));
1471  try {
1472  Y_rowMap = &(**Y_rowMap_);
1473  }
1474  catch (std::exception& e) {
1475  TEUCHOS_TEST_FOR_EXCEPTION(
1476  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1477  "operator= threw an exception: " << std::endl << e.what ());
1478  }
1479  }
1480 
1481  try {
1482  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1483  }
1484  catch (std::exception& e) {
1485  TEUCHOS_TEST_FOR_EXCEPTION
1486  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1487  "an exception: " << e.what ());
1488  }
1489  catch (...) {
1490  TEUCHOS_TEST_FOR_EXCEPTION
1491  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1492  "an exception not a subclass of std::exception.");
1493  }
1494 
1495  if (! theExport.is_null ()) {
1496  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1497  }
1498  }
1499  }
1500 
1501  template<class Scalar, class LO, class GO, class Node>
1502  void
1503  BlockCrsMatrix<Scalar, LO, GO, Node>::
1504  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1505  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1506  const Scalar alpha,
1507  const Scalar beta)
1508  {
1509  using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1510 
1511  const impl_scalar_type alpha_impl = alpha;
1512  const auto graph = this->graph_.getLocalGraphDevice ();
1513  const impl_scalar_type beta_impl = beta;
1514  const LO blockSize = this->getBlockSize ();
1515 
1516  auto X_mv = X.getMultiVectorView ();
1517  auto Y_mv = Y.getMultiVectorView ();
1518 
1519  //auto X_lcl = X_mv.template getLocalView<device_type> (Access::ReadOnly);
1520  //auto Y_lcl = Y_mv.template getLocalView<device_type> (Access::ReadWrite);
1521  auto X_lcl = X_mv.getLocalViewDevice (Access::ReadOnly);
1522  auto Y_lcl = Y_mv.getLocalViewDevice (Access::ReadWrite);
1523  auto val = val_.getDeviceView(Access::ReadWrite);
1524 
1525  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1526  beta_impl, Y_lcl);
1527  }
1528 
1529  template<class Scalar, class LO, class GO, class Node>
1530  LO
1531  BlockCrsMatrix<Scalar, LO, GO, Node>::
1532  findRelOffsetOfColumnIndex (const LO localRowIndex,
1533  const LO colIndexToFind,
1534  const LO hint) const
1535  {
1536  const size_t absStartOffset = ptrHost_[localRowIndex];
1537  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1538  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1539  // Amortize pointer arithmetic over the search loop.
1540  const LO* const curInd = indHost_.data () + absStartOffset;
1541 
1542  // If the hint was correct, then the hint is the offset to return.
1543  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1544  // Always return the offset relative to the current row.
1545  return hint;
1546  }
1547 
1548  // The hint was wrong, so we must search for the given column
1549  // index in the column indices for the given row.
1550  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1551 
1552  // We require that the graph have sorted rows. However, binary
1553  // search only pays if the current row is longer than a certain
1554  // amount. We set this to 32, but you might want to tune this.
1555  const LO maxNumEntriesForLinearSearch = 32;
1556  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1557  // Use binary search. It would probably be better for us to
1558  // roll this loop by hand. If we wrote it right, a smart
1559  // compiler could perhaps use conditional loads and avoid
1560  // branches (according to Jed Brown on May 2014).
1561  const LO* beg = curInd;
1562  const LO* end = curInd + numEntriesInRow;
1563  std::pair<const LO*, const LO*> p =
1564  std::equal_range (beg, end, colIndexToFind);
1565  if (p.first != p.second) {
1566  // offset is relative to the current row
1567  relOffset = static_cast<LO> (p.first - beg);
1568  }
1569  }
1570  else { // use linear search
1571  for (LO k = 0; k < numEntriesInRow; ++k) {
1572  if (colIndexToFind == curInd[k]) {
1573  relOffset = k; // offset is relative to the current row
1574  break;
1575  }
1576  }
1577  }
1578 
1579  return relOffset;
1580  }
1581 
1582  template<class Scalar, class LO, class GO, class Node>
1583  LO
1584  BlockCrsMatrix<Scalar, LO, GO, Node>::
1585  offsetPerBlock () const
1586  {
1587  return offsetPerBlock_;
1588  }
1589 
1590  template<class Scalar, class LO, class GO, class Node>
1592  BlockCrsMatrix<Scalar, LO, GO, Node>::
1593  getConstLocalBlockFromInput (const impl_scalar_type* val,
1594  const size_t pointOffset) const
1595  {
1596  // Row major blocks
1597  const LO rowStride = blockSize_;
1598  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1599  }
1600 
1601  template<class Scalar, class LO, class GO, class Node>
1603  BlockCrsMatrix<Scalar, LO, GO, Node>::
1604  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1605  const size_t pointOffset) const
1606  {
1607  // Row major blocks
1608  const LO rowStride = blockSize_;
1609  return little_block_type (val + pointOffset, blockSize_, rowStride);
1610  }
1611 
1612  template<class Scalar, class LO, class GO, class Node>
1613  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1614  BlockCrsMatrix<Scalar, LO, GO, Node>::
1615  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1616  const size_t pointOffset) const
1617  {
1618  // Row major blocks
1619  const LO rowStride = blockSize_;
1620  const size_t bs2 = blockSize_ * blockSize_;
1621  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1622  }
1623 
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
template<class Scalar, class LO, class GO, class Node>
typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
BlockCrsMatrix<Scalar, LO, GO, Node>::
getLocalBlock (const LO localRowInd, const LO localColInd) const
{
  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;

  // Locate the block within the row; return an empty block view if
  // (localRowInd, localColInd) is not stored on this process.
  const size_t absRowBlockOffset = ptrHost_[localRowInd];
  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);

  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
    const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
    auto vals = const_cast<this_type&>(*this).getValuesDeviceNonConst ();
    // getNonConstLocalBlockFromInput expects a SCALAR offset (see its
    // other callers, e.g. replaceLocalValuesByOffsets, which scale by
    // offsetPerBlock() first).  The old code passed the raw BLOCK
    // offset, so for blockSize > 1 the returned view aliased the
    // wrong (overlapping) entries.
    return getNonConstLocalBlockFromInput (vals.data (),
                                           absBlockOffset * offsetPerBlock ());
  }
  else {
    return little_block_type ();
  }
}
#endif
1645 
1646  template<class Scalar, class LO, class GO, class Node>
1648  BlockCrsMatrix<Scalar, LO, GO, Node>::
1649  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1650  {
1651  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1652 
1653  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1654  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1655  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1656  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1657  auto vals = const_cast<this_type&>(*this).getValuesDeviceNonConst();
1658  auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1659  return r_val;
1660  }
1661  else {
1662  return little_block_type ();
1663  }
1664  }
1665 
1666  template<class Scalar, class LO, class GO, class Node>
1667  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1668  BlockCrsMatrix<Scalar, LO, GO, Node>::
1669  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1670  {
1671  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1672 
1673  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1674  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1675  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1676  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1677  auto vals = const_cast<this_type&>(*this).getValuesHostNonConst();
1678  auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1679  return r_val;
1680  }
1681  else {
1682  return little_block_host_type ();
1683  }
1684  }
1685 
1686 
  /// \brief Compare the source and target (\c *this) objects for
  ///   compatibility before an Import or Export.
  ///
  /// Records every problem found (wrong type, mismatched block sizes,
  /// not fill complete, missing column Map) on the local error stream,
  /// and returns false if any problem was found on this process.
  template<class Scalar, class LO, class GO, class Node>
  bool
  BlockCrsMatrix<Scalar, LO, GO, Node>::
  checkSizes (const ::Tpetra::SrcDistObject& source)
  {
    using std::endl;
    typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
    // The source must be a BlockCrsMatrix with identical template parameters.
    const this_type* src = dynamic_cast<const this_type* > (&source);

    if (src == NULL) {
      std::ostream& err = markLocalErrorAndGetStream ();
      err << "checkSizes: The source object of the Import or Export "
      "must be a BlockCrsMatrix with the same template parameters as the "
      "target object." << endl;
    }
    else {
      // Use a string of ifs, not if-elseifs, because we want to know
      // all the errors.
      if (src->getBlockSize () != this->getBlockSize ()) {
        std::ostream& err = markLocalErrorAndGetStream ();
        err << "checkSizes: The source and target objects of the Import or "
        << "Export must have the same block sizes. The source's block "
        << "size = " << src->getBlockSize () << " != the target's block "
        << "size = " << this->getBlockSize () << "." << endl;
      }
      // Both graphs must be fill complete before communicating.
      if (! src->graph_.isFillComplete ()) {
        std::ostream& err = markLocalErrorAndGetStream ();
        err << "checkSizes: The source object of the Import or Export is "
        "not fill complete. Both source and target objects must be fill "
        "complete." << endl;
      }
      if (! this->graph_.isFillComplete ()) {
        std::ostream& err = markLocalErrorAndGetStream ();
        err << "checkSizes: The target object of the Import or Export is "
        "not fill complete. Both source and target objects must be fill "
        "complete." << endl;
      }
      // Both graphs must have a column Map; copyAndPermute and
      // packAndPrepare dereference them without further checks.
      if (src->graph_.getColMap ().is_null ()) {
        std::ostream& err = markLocalErrorAndGetStream ();
        err << "checkSizes: The source object of the Import or Export does "
        "not have a column Map. Both source and target objects must have "
        "column Maps." << endl;
      }
      if (this->graph_.getColMap ().is_null ()) {
        std::ostream& err = markLocalErrorAndGetStream ();
        err << "checkSizes: The target object of the Import or Export does "
        "not have a column Map. Both source and target objects must have "
        "column Maps." << endl;
      }
    }

    // True only if no error was recorded above (or previously).
    return ! (* (this->localError_));
  }
1740 
  /// \brief Copy the "same" rows and permute the remaining rows from the
  ///   source BlockCrsMatrix into \c *this, as part of an Import/Export.
  ///
  /// Errors are recorded on the local error stream (markLocalErrorAndGetStream)
  /// rather than thrown, except for exceptions raised by row access itself.
  ///
  /// NOTE(review): the qualified-name line(s)
  /// ("BlockCrsMatrix<Scalar, LO, GO, Node>::copyAndPermute", original
  /// lines 1743-1744) were lost in extraction, as was the using-declaration
  /// for dualViewStatusToString (used below); restore before compiling.
  template<class Scalar, class LO, class GO, class Node>
  void
  (const ::Tpetra::SrcDistObject& source,
   const size_t numSameIDs,
   const Kokkos::DualView<const local_ordinal_type*,
     buffer_device_type>& permuteToLIDs,
   const Kokkos::DualView<const local_ordinal_type*,
     buffer_device_type>& permuteFromLIDs,
   const CombineMode /*CM*/)
  {
    using ::Tpetra::Details::Behavior;
    using ::Tpetra::Details::ProfilingRegion;
    using std::endl;

    ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
    const bool debug = Behavior::debug();
    const bool verbose = Behavior::verbose();

    // Define this function prefix
    std::string prefix;
    {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
      prefix = os.str();
    }

    // check if this already includes a local error
    if (* (this->localError_)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "The target object of the Import or Export is already in an error state."
          << endl;
      return;
    }

    //
    // Verbose input dual view status
    //
    if (verbose) {
      std::ostringstream os;
      os << prefix << endl
         << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
         << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
      std::cerr << os.str ();
    }

    // Sanity checks: the two LID lists must pair up one-to-one, and both
    // must be sync'd to host since all work below runs on host.
    if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
          << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
          << "." << endl;
      return;
    }
    if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
          << endl;
      return;
    }

    const this_type* src = dynamic_cast<const this_type* > (&source);
    if (src == NULL) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "The source (input) object of the Import or "
      "Export is either not a BlockCrsMatrix, or does not have the right "
      "template parameters. checkSizes() should have caught this. "
      "Please report this bug to the Tpetra developers." << endl;
      return;
    }

    bool lclErr = false;
#ifdef HAVE_TPETRA_DEBUG
    // In debug builds, remember exactly which rows / columns were invalid,
    // so the error report at the bottom can list them.
    std::set<LO> invalidSrcCopyRows;
    std::set<LO> invalidDstCopyRows;
    std::set<LO> invalidDstCopyCols;
    std::set<LO> invalidDstPermuteCols;
    std::set<LO> invalidPermuteFromRows;
#endif // HAVE_TPETRA_DEBUG

    // Copy the initial sequence of rows that are the same.
    //
    // The two graphs might have different column Maps, so we need to
    // do this using global column indices. This is purely local, so
    // we only need to check for local sameness of the two column
    // Maps.

#ifdef HAVE_TPETRA_DEBUG
    const map_type& srcRowMap = * (src->graph_.getRowMap ());
#endif // HAVE_TPETRA_DEBUG
    const map_type& dstRowMap = * (this->graph_.getRowMap ());
    const map_type& srcColMap = * (src->graph_.getColMap ());
    const map_type& dstColMap = * (this->graph_.getColMap ());
    // If the column Maps agree locally, local column indices can be copied
    // verbatim; otherwise each index round-trips through its global index.
    const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);

    const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
    if (verbose) {
      std::ostringstream os;
      os << prefix
         << "canUseLocalColumnIndices: "
         << (canUseLocalColumnIndices ? "true" : "false")
         << ", numPermute: " << numPermute
         << endl;
      std::cerr << os.str ();
    }

    const auto permuteToLIDsHost = permuteToLIDs.view_host();
    const auto permuteFromLIDsHost = permuteFromLIDs.view_host();

    if (canUseLocalColumnIndices) {
      // Copy local rows that are the "same" in both source and target.
      for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
#ifdef HAVE_TPETRA_DEBUG
        if (! srcRowMap.isNodeLocalElement (localRow)) {
          lclErr = true;
          invalidSrcCopyRows.insert (localRow);
          continue; // skip invalid rows
        }
#endif // HAVE_TPETRA_DEBUG

        local_inds_host_view_type lclSrcCols;
        values_host_view_type vals;
        LO numEntries;
        // If this call fails, that means the mesh row local index is
        // invalid. That means the Import or Export is invalid somehow.
        src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
        if (numEntries > 0) {
          LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
          if (err != numEntries) {
            lclErr = true;
            if (! dstRowMap.isNodeLocalElement (localRow)) {
#ifdef HAVE_TPETRA_DEBUG
              invalidDstCopyRows.insert (localRow);
#endif // HAVE_TPETRA_DEBUG
            }
            else {
              // Once there's an error, there's no sense in saving
              // time, so we check whether the column indices were
              // invalid. However, only remember which ones were
              // invalid in a debug build, because that might take a
              // lot of space.
              for (LO k = 0; k < numEntries; ++k) {
                if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
                  lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
                  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
#endif // HAVE_TPETRA_DEBUG
                }
              }
            }
          }
        }
      } // for each "same" local row

      // Copy the "permute" local rows.
      for (size_t k = 0; k < numPermute; ++k) {
        const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
        const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));

        local_inds_host_view_type lclSrcCols;
        values_host_view_type vals;
        LO numEntries;
        src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
        if (numEntries > 0) {
          LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
          if (err != numEntries) {
            lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
            for (LO c = 0; c < numEntries; ++c) {
              if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
                invalidDstPermuteCols.insert (lclSrcCols[c]);
              }
            }
#endif // HAVE_TPETRA_DEBUG
          }
        }
      }
    }
    else { // must convert column indices to global
      // Reserve space to store the destination matrix's local column indices.
      const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
      Teuchos::Array<LO> lclDstCols (maxNumEnt);

      // Copy local rows that are the "same" in both source and target.
      for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
        local_inds_host_view_type lclSrcCols;
        values_host_view_type vals;
        LO numEntries;

        // If this call fails, that means the mesh row local index is
        // invalid. That means the Import or Export is invalid somehow.
        try {
          src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
        } catch (std::exception& e) {
          if (debug) {
            std::ostringstream os;
            const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
            os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
               << localRow << ", src->getLocalRowView() threw an exception: "
               << e.what ();
            std::cerr << os.str ();
          }
          throw e;
        }

        if (numEntries > 0) {
          // Row too long for the scratch buffer means the graph invariant
          // (maxNumEnt bounds every row) was violated.
          if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
            lclErr = true;
            if (debug) {
              std::ostringstream os;
              const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
              os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
                 << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
                 << maxNumEnt << endl;
              std::cerr << os.str ();
            }
          }
          else {
            // Convert the source matrix's local column indices to the
            // destination matrix's local column indices.
            Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
            for (LO j = 0; j < numEntries; ++j) {
              lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
              if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
                lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
                invalidDstCopyCols.insert (lclDstColsView[j]);
#endif // HAVE_TPETRA_DEBUG
              }
            }
            LO err(0);
            try {
              err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
            } catch (std::exception& e) {
              if (debug) {
                std::ostringstream os;
                const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
                os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
                   << localRow << ", this->replaceLocalValues() threw an exception: "
                   << e.what ();
                std::cerr << os.str ();
              }
              throw e;
            }
            if (err != numEntries) {
              lclErr = true;
              if (debug) {
                std::ostringstream os;
                const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
                os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
                "localRow " << localRow << ", this->replaceLocalValues "
                "returned " << err << " instead of numEntries = "
                   << numEntries << endl;
                std::cerr << os.str ();
              }
            }
          }
        }
      }

      // Copy the "permute" local rows.
      for (size_t k = 0; k < numPermute; ++k) {
        const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
        const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));

        local_inds_host_view_type lclSrcCols;
        values_host_view_type vals;
        LO numEntries;

        try {
          src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
        } catch (std::exception& e) {
          if (debug) {
            std::ostringstream os;
            const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
            os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
            "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
               << ", src->getLocalRowView() threw an exception: " << e.what ();
            std::cerr << os.str ();
          }
          throw e;
        }

        if (numEntries > 0) {
          if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
            lclErr = true;
          }
          else {
            // Convert the source matrix's local column indices to the
            // destination matrix's local column indices.
            Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
            for (LO j = 0; j < numEntries; ++j) {
              lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
              if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
                lclErr = true;
#ifdef HAVE_TPETRA_DEBUG
                invalidDstPermuteCols.insert (lclDstColsView[j]);
#endif // HAVE_TPETRA_DEBUG
              }
            }
            LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
            if (err != numEntries) {
              lclErr = true;
            }
          }
        }
      }
    }

    // Report every invalid row / column index collected above. In debug
    // builds, list the offenders; otherwise emit only a summary message.
    if (lclErr) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
#ifdef HAVE_TPETRA_DEBUG
      err << "copyAndPermute: The graph structure of the source of the "
      "Import or Export must be a subset of the graph structure of the "
      "target. ";
      err << "invalidSrcCopyRows = [";
      for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
           it != invalidSrcCopyRows.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidSrcCopyRows.end ()) {
          err << ",";
        }
      }
      err << "], invalidDstCopyRows = [";
      for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
           it != invalidDstCopyRows.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidDstCopyRows.end ()) {
          err << ",";
        }
      }
      err << "], invalidDstCopyCols = [";
      for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
           it != invalidDstCopyCols.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidDstCopyCols.end ()) {
          err << ",";
        }
      }
      err << "], invalidDstPermuteCols = [";
      for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
           it != invalidDstPermuteCols.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidDstPermuteCols.end ()) {
          err << ",";
        }
      }
      err << "], invalidPermuteFromRows = [";
      for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
           it != invalidPermuteFromRows.end (); ++it) {
        err << *it;
        typename std::set<LO>::const_iterator itp1 = it;
        itp1++;
        if (itp1 != invalidPermuteFromRows.end ()) {
          err << ",";
        }
      }
      err << "]" << endl;
#else
      err << "copyAndPermute: The graph structure of the source of the "
      "Import or Export must be a subset of the graph structure of the "
      "target." << endl;
#endif // HAVE_TPETRA_DEBUG
    }

    if (debug) {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      const bool lclSuccess = ! (* (this->localError_));
      os << "*** Proc " << myRank << ": copyAndPermute "
         << (lclSuccess ? "succeeded" : "FAILED");
      if (lclSuccess) {
        os << endl;
      } else {
        os << ": error messages: " << this->errorMessages (); // comes w/ endl
      }
      std::cerr << os.str ();
    }
  }
2138 
2139  namespace { // (anonymous)
2140 
2159  template<class LO, class GO>
2160  size_t
2161  packRowCount (const size_t numEnt,
2162  const size_t numBytesPerValue,
2163  const size_t blkSize)
2164  {
2165  using ::Tpetra::Details::PackTraits;
2166 
2167  if (numEnt == 0) {
2168  // Empty rows always take zero bytes, to ensure sparsity.
2169  return 0;
2170  }
2171  else {
2172  // We store the number of entries as a local index (LO).
2173  LO numEntLO = 0; // packValueCount wants this.
2174  GO gid {};
2175  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2176  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2177  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2178  return numEntLen + gidsLen + valsLen;
2179  }
2180  }
2181 
2192  template<class ST, class LO, class GO>
2193  size_t
2194  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2195  const size_t offset,
2196  const size_t numBytes,
2197  const size_t /* numBytesPerValue */)
2198  {
2199  using Kokkos::subview;
2200  using ::Tpetra::Details::PackTraits;
2201 
2202  if (numBytes == 0) {
2203  // Empty rows always take zero bytes, to ensure sparsity.
2204  return static_cast<size_t> (0);
2205  }
2206  else {
2207  LO numEntLO = 0;
2208  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
2209  TEUCHOS_TEST_FOR_EXCEPTION
2210  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2211  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2212  << ".");
2213  const char* const inBuf = imports.data () + offset;
2214  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2215  TEUCHOS_TEST_FOR_EXCEPTION
2216  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2217  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2218  << ".");
2219  return static_cast<size_t> (numEntLO);
2220  }
2221  }
2222 
2226  template<class ST, class LO, class GO>
2227  size_t
2228  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2229  const size_t offset,
2230  const size_t numEnt,
2231  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2232  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2233  const size_t numBytesPerValue,
2234  const size_t blockSize)
2235  {
2236  using Kokkos::subview;
2237  using ::Tpetra::Details::PackTraits;
2238 
2239  if (numEnt == 0) {
2240  // Empty rows always take zero bytes, to ensure sparsity.
2241  return 0;
2242  }
2243  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2244 
2245  const GO gid = 0; // packValueCount wants this
2246  const LO numEntLO = static_cast<size_t> (numEnt);
2247 
2248  const size_t numEntBeg = offset;
2249  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2250  const size_t gidsBeg = numEntBeg + numEntLen;
2251  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2252  const size_t valsBeg = gidsBeg + gidsLen;
2253  const size_t valsLen = numScalarEnt * numBytesPerValue;
2254 
2255  char* const numEntOut = exports.data () + numEntBeg;
2256  char* const gidsOut = exports.data () + gidsBeg;
2257  char* const valsOut = exports.data () + valsBeg;
2258 
2259  size_t numBytesOut = 0;
2260  int errorCode = 0;
2261  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2262 
2263  {
2264  Kokkos::pair<int, size_t> p;
2265  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2266  errorCode += p.first;
2267  numBytesOut += p.second;
2268 
2269  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2270  errorCode += p.first;
2271  numBytesOut += p.second;
2272  }
2273 
2274  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2275  TEUCHOS_TEST_FOR_EXCEPTION
2276  (numBytesOut != expectedNumBytes, std::logic_error,
2277  "packRowForBlockCrs: numBytesOut = " << numBytesOut
2278  << " != expectedNumBytes = " << expectedNumBytes << ".");
2279 
2280  TEUCHOS_TEST_FOR_EXCEPTION
2281  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2282  "PackTraits::packArray returned a nonzero error code");
2283 
2284  return numBytesOut;
2285  }
2286 
2287  // Return the number of bytes actually read / used.
2288  template<class ST, class LO, class GO>
2289  size_t
2290  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
2291  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
2292  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
2293  const size_t offset,
2294  const size_t numBytes,
2295  const size_t numEnt,
2296  const size_t numBytesPerValue,
2297  const size_t blockSize)
2298  {
2299  using ::Tpetra::Details::PackTraits;
2300 
2301  if (numBytes == 0) {
2302  // Rows with zero bytes always have zero entries.
2303  return 0;
2304  }
2305  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2306  TEUCHOS_TEST_FOR_EXCEPTION
2307  (static_cast<size_t> (imports.extent (0)) <= offset,
2308  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2309  << imports.extent (0) << " <= offset = " << offset << ".");
2310  TEUCHOS_TEST_FOR_EXCEPTION
2311  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2312  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2313  << imports.extent (0) << " < offset + numBytes = "
2314  << (offset + numBytes) << ".");
2315 
2316  const GO gid = 0; // packValueCount wants this
2317  const LO lid = 0; // packValueCount wants this
2318 
2319  const size_t numEntBeg = offset;
2320  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
2321  const size_t gidsBeg = numEntBeg + numEntLen;
2322  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2323  const size_t valsBeg = gidsBeg + gidsLen;
2324  const size_t valsLen = numScalarEnt * numBytesPerValue;
2325 
2326  const char* const numEntIn = imports.data () + numEntBeg;
2327  const char* const gidsIn = imports.data () + gidsBeg;
2328  const char* const valsIn = imports.data () + valsBeg;
2329 
2330  size_t numBytesOut = 0;
2331  int errorCode = 0;
2332  LO numEntOut;
2333  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
2334  TEUCHOS_TEST_FOR_EXCEPTION
2335  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2336  "unpackRowForBlockCrs: Expected number of entries " << numEnt
2337  << " != actual number of entries " << numEntOut << ".");
2338 
2339  {
2340  Kokkos::pair<int, size_t> p;
2341  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2342  errorCode += p.first;
2343  numBytesOut += p.second;
2344 
2345  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2346  errorCode += p.first;
2347  numBytesOut += p.second;
2348  }
2349 
2350  TEUCHOS_TEST_FOR_EXCEPTION
2351  (numBytesOut != numBytes, std::logic_error,
2352  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2353  << " != numBytes = " << numBytes << ".");
2354 
2355  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2356  TEUCHOS_TEST_FOR_EXCEPTION
2357  (numBytesOut != expectedNumBytes, std::logic_error,
2358  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2359  << " != expectedNumBytes = " << expectedNumBytes << ".");
2360 
2361  TEUCHOS_TEST_FOR_EXCEPTION
2362  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
2363  "PackTraits::unpackArray returned a nonzero error code");
2364 
2365  return numBytesOut;
2366  }
2367  } // namespace (anonymous)
2368 
2369  template<class Scalar, class LO, class GO, class Node>
2370  void
2373  (const ::Tpetra::SrcDistObject& source,
2374  const Kokkos::DualView<const local_ordinal_type*,
2375  buffer_device_type>& exportLIDs,
2376  Kokkos::DualView<packet_type*,
2377  buffer_device_type>& exports, // output
2378  Kokkos::DualView<size_t*,
2379  buffer_device_type> numPacketsPerLID, // output
2380  size_t& constantNumPackets)
2381  {
2382  using ::Tpetra::Details::Behavior;
2384  using ::Tpetra::Details::ProfilingRegion;
2385  using ::Tpetra::Details::PackTraits;
2386 
2387  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2388 
2390 
2391  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2392 
2393  const bool debug = Behavior::debug();
2394  const bool verbose = Behavior::verbose();
2395 
2396  // Define this function prefix
2397  std::string prefix;
2398  {
2399  std::ostringstream os;
2400  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2401  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2402  prefix = os.str();
2403  }
2404 
2405  // check if this already includes a local error
2406  if (* (this->localError_)) {
2407  std::ostream& err = this->markLocalErrorAndGetStream ();
2408  err << prefix
2409  << "The target object of the Import or Export is already in an error state."
2410  << std::endl;
2411  return;
2412  }
2413 
2414  //
2415  // Verbose input dual view status
2416  //
2417  if (verbose) {
2418  std::ostringstream os;
2419  os << prefix << std::endl
2420  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2421  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2422  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2423  std::cerr << os.str ();
2424  }
2425 
2429  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2430  std::ostream& err = this->markLocalErrorAndGetStream ();
2431  err << prefix
2432  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2433  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2434  << "." << std::endl;
2435  return;
2436  }
2437  if (exportLIDs.need_sync_host ()) {
2438  std::ostream& err = this->markLocalErrorAndGetStream ();
2439  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2440  return;
2441  }
2442 
2443  const this_type* src = dynamic_cast<const this_type* > (&source);
2444  if (src == NULL) {
2445  std::ostream& err = this->markLocalErrorAndGetStream ();
2446  err << prefix
2447  << "The source (input) object of the Import or "
2448  "Export is either not a BlockCrsMatrix, or does not have the right "
2449  "template parameters. checkSizes() should have caught this. "
2450  "Please report this bug to the Tpetra developers." << std::endl;
2451  return;
2452  }
2453 
2454  // Graphs and matrices are allowed to have a variable number of
2455  // entries per row. We could test whether all rows have the same
2456  // number of entries, but DistObject can only use this
2457  // optimization if all rows on _all_ processes have the same
2458  // number of entries. Rather than do the all-reduce necessary to
2459  // test for this unlikely case, we tell DistObject (by setting
2460  // constantNumPackets to zero) to assume that different rows may
2461  // have different numbers of entries.
2462  constantNumPackets = 0;
2463 
2464  // const values
2465  const crs_graph_type& srcGraph = src->graph_;
2466  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2467  const size_t numExportLIDs = exportLIDs.extent (0);
2468  size_t numBytesPerValue(0);
2469  {
2470  auto val_host = val_.getHostView(Access::ReadOnly);
2471  numBytesPerValue =
2472  PackTraits<impl_scalar_type>
2473  ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2474  }
2475 
2476  // Compute the number of bytes ("packets") per row to pack. While
2477  // we're at it, compute the total # of block entries to send, and
2478  // the max # of block entries in any of the rows we're sending.
2479 
2480  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2481 
2482  // Graph information is on host; let's do this on host parallel reduce
2483  auto exportLIDsHost = exportLIDs.view_host();
2484  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2485  numPacketsPerLID.modify_host ();
2486  {
2487  using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2488  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2489  Kokkos::parallel_reduce
2490  (policy,
2491  [=](const size_t &i, typename reducer_type::value_type &update) {
2492  const LO lclRow = exportLIDsHost(i);
2493  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2494  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2495 
2496  const size_t numBytes =
2497  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2498  numPacketsPerLIDHost(i) = numBytes;
2499  update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2500  }, rowReducerStruct);
2501  }
2502 
2503  // Compute the number of bytes ("packets") per row to pack. While
2504  // we're at it, compute the total # of block entries to send, and
2505  // the max # of block entries in any of the rows we're sending.
2506  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2507  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2508  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2509 
2510  if (verbose) {
2511  std::ostringstream os;
2512  os << prefix
2513  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2514  << std::endl;
2515  std::cerr << os.str ();
2516  }
2517 
2518  // We use a "struct of arrays" approach to packing each row's
2519  // entries. All the column indices (as global indices) go first,
2520  // then all their owning process ranks, and then the values.
2521  if(exports.extent(0) != totalNumBytes)
2522  {
2523  const std::string oldLabel = exports.d_view.label ();
2524  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2525  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2526  }
2527  if (totalNumEntries > 0) {
2528  // Current position (in bytes) in the 'exports' output array.
2529  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2530  {
2531  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2532  Kokkos::parallel_scan
2533  (policy,
2534  [=](const size_t &i, size_t &update, const bool &final) {
2535  if (final) offset(i) = update;
2536  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2537  });
2538  }
2539  if (offset(numExportLIDs) != totalNumBytes) {
2540  std::ostream& err = this->markLocalErrorAndGetStream ();
2541  err << prefix
2542  << "At end of method, the final offset (in bytes) "
2543  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2544  << totalNumBytes << ". "
2545  << "Please report this bug to the Tpetra developers." << std::endl;
2546  return;
2547  }
2548 
2549  // For each block row of the matrix owned by the calling
2550  // process, pack that block row's column indices and values into
2551  // the exports array.
2552 
2553  // Source matrix's column Map. We verified in checkSizes() that
2554  // the column Map exists (is not null).
2555  const map_type& srcColMap = * (srcGraph.getColMap ());
2556 
2557  // Pack the data for each row to send, into the 'exports' buffer.
2558  // exports will be modified on host.
2559  exports.modify_host();
2560  {
2561  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2562  const auto policy =
2563  policy_type(numExportLIDs, 1, 1)
2564  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2565  Kokkos::parallel_for
2566  (policy,
2567  [=](const typename policy_type::member_type &member) {
2568  const size_t i = member.league_rank();
2569  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2570  gblColInds(member.team_scratch(0), maxRowLength);
2571 
2572  const LO lclRowInd = exportLIDsHost(i);
2573  local_inds_host_view_type lclColInds;
2574  values_host_view_type vals;
2575 
2576  // It's OK to ignore the return value, since if the calling
2577  // process doesn't own that local row, then the number of
2578  // entries in that row on the calling process is zero.
2579  src->getLocalRowView (lclRowInd, lclColInds, vals);
2580  const size_t numEnt = lclColInds.extent(0);
2581 
2582  // Convert column indices from local to global.
2583  for (size_t j = 0; j < numEnt; ++j)
2584  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2585 
2586  // Kyungjoo: additional wrapping scratch view is necessary
2587  // the following function interface need the same execution space
2588  // host scratch space somehow is not considered same as the host_exec
2589  // Copy the row's data into the current spot in the exports array.
2590  const size_t numBytes =
2591  packRowForBlockCrs<impl_scalar_type, LO, GO>
2592  (exports.view_host(),
2593  offset(i),
2594  numEnt,
2595  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2596  vals,
2597  numBytesPerValue,
2598  blockSize);
2599 
2600  // numBytes should be same as the difference between offsets
2601  if (debug) {
2602  const size_t offsetDiff = offset(i+1) - offset(i);
2603  if (numBytes != offsetDiff) {
2604  std::ostringstream os;
2605  os << prefix
2606  << "numBytes computed from packRowForBlockCrs is different from "
2607  << "precomputed offset values, LID = " << i << std::endl;
2608  std::cerr << os.str ();
2609  }
2610  }
2611  }); // for each LID (of a row) to send
2612  }
2613  } // if totalNumEntries > 0
2614 
2615  if (debug) {
2616  std::ostringstream os;
2617  const bool lclSuccess = ! (* (this->localError_));
2618  os << prefix
2619  << (lclSuccess ? "succeeded" : "FAILED")
2620  << std::endl;
2621  std::cerr << os.str ();
2622  }
2623  }
2624 
 // unpackAndCombine: Receive side of an Import/Export.  Unpacks the
 // byte buffer 'imports' (one variable-length packet run per LID in
 // importLIDs, lengths in numPacketsPerLID) and combines each received
 // block row into this matrix according to combineMode.  All unpacking
 // happens on host; errors are recorded via markLocalErrorAndGetStream
 // rather than thrown, matching the rest of this file's error protocol.
2625  template<class Scalar, class LO, class GO, class Node>
2626  void
 // NOTE(review): the qualified-name line
 // (BlockCrsMatrix<Scalar, LO, GO, Node>::unpackAndCombine) appears to
 // have been lost in extraction here -- confirm against the original.
2629  (const Kokkos::DualView<const local_ordinal_type*,
2630  buffer_device_type>& importLIDs,
2631  Kokkos::DualView<packet_type*,
2632  buffer_device_type> imports,
2633  Kokkos::DualView<size_t*,
2634  buffer_device_type> numPacketsPerLID,
2635  const size_t /* constantNumPackets */,
2636  const CombineMode combineMode)
2637  {
2638  using ::Tpetra::Details::Behavior;
2640  using ::Tpetra::Details::ProfilingRegion;
2641  using ::Tpetra::Details::PackTraits;
2642  using std::endl;
2643  using host_exec =
2644  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2645 
2646  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2647  const bool verbose = Behavior::verbose ();
2648 
2649  // Define this function prefix
2650  std::string prefix;
2651  {
2652  std::ostringstream os;
2653  auto map = this->graph_.getRowMap ();
2654  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2655  const int myRank = comm.is_null () ? -1 : comm->getRank ();
2656  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2657  prefix = os.str ();
2658  if (verbose) {
2659  os << "Start" << endl;
2660  std::cerr << os.str ();
2661  }
2662  }
2663 
2664  // check if this already includes a local error
2665  if (* (this->localError_)) {
2666  std::ostream& err = this->markLocalErrorAndGetStream ();
2667  std::ostringstream os;
2668  os << prefix << "{Im/Ex}port target is already in error." << endl;
2669  if (verbose) {
2670  std::cerr << os.str ();
2671  }
2672  err << os.str ();
2673  return;
2674  }
2675 
 // Sanity check: one packet count per import LID.
2679  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2680  std::ostream& err = this->markLocalErrorAndGetStream ();
2681  std::ostringstream os;
2682  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2683  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2684  << "." << endl;
2685  if (verbose) {
2686  std::cerr << os.str ();
2687  }
2688  err << os.str ();
2689  return;
2690  }
2691 
2692  if (combineMode != ADD && combineMode != INSERT &&
2693  combineMode != REPLACE && combineMode != ABSMAX &&
2694  combineMode != ZERO) {
2695  std::ostream& err = this->markLocalErrorAndGetStream ();
2696  std::ostringstream os;
2697  os << prefix
2698  << "Invalid CombineMode value " << combineMode << ". Valid "
2699  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2700  << std::endl;
2701  if (verbose) {
2702  std::cerr << os.str ();
2703  }
2704  err << os.str ();
2705  return;
2706  }
2707 
2708  if (this->graph_.getColMap ().is_null ()) {
2709  std::ostream& err = this->markLocalErrorAndGetStream ();
2710  std::ostringstream os;
2711  os << prefix << "Target matrix's column Map is null." << endl;
2712  if (verbose) {
2713  std::cerr << os.str ();
2714  }
2715  err << os.str ();
2716  return;
2717  }
2718 
2719  // Target matrix's column Map. Use to convert the global column
2720  // indices in the receive buffer to local indices. We verified in
2721  // checkSizes() that the column Map exists (is not null).
2722  const map_type& tgtColMap = * (this->graph_.getColMap ());
2723 
2724  // Const values
2725  const size_t blockSize = this->getBlockSize ();
2726  const size_t numImportLIDs = importLIDs.extent(0);
2727  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2728  // default-constructed instance could have a different size than
2729  // other instances. (We assume all nominally constructed
2730  // instances have the same size; that's not the issue here.) This
2731  // could be bad if the calling process has no entries, but other
2732  // processes have entries that they want to send to us.
2733  size_t numBytesPerValue(0);
2734  {
2735  auto val_host = val_.getHostView(Access::ReadOnly);
2736  numBytesPerValue =
2737  PackTraits<impl_scalar_type>::packValueCount
2738  (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2739  }
2740  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
2741  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2742 
2743  if (verbose) {
2744  std::ostringstream os;
2745  os << prefix << "combineMode: "
2746  << ::Tpetra::combineModeToString (combineMode)
2747  << ", blockSize: " << blockSize
2748  << ", numImportLIDs: " << numImportLIDs
2749  << ", numBytesPerValue: " << numBytesPerValue
2750  << ", maxRowNumEnt: " << maxRowNumEnt
2751  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2752  std::cerr << os.str ();
2753  }
2754 
2755  if (combineMode == ZERO || numImportLIDs == 0) {
2756  if (verbose) {
2757  std::ostringstream os;
2758  os << prefix << "Nothing to unpack. Done!" << std::endl;
2759  std::cerr << os.str ();
2760  }
2761  return;
2762  }
2763 
2764  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2765  // we can remove this sync.
2766  if (imports.need_sync_host ()) {
2767  imports.sync_host ();
2768  }
2769 
2770  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2771  // sync'd numPacketsPerLID to host, since it needs to do that in
2772  // order to post MPI_Irecv messages with the correct lengths. A
2773  // hypothetical device-specific boundary exchange implementation
2774  // could possibly receive data without sync'ing lengths to host,
2775  // but we don't need to design for that nonexistent thing yet.
2776 
2777  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2778  importLIDs.need_sync_host ()) {
2779  std::ostream& err = this->markLocalErrorAndGetStream ();
2780  std::ostringstream os;
2781  os << prefix << "All input DualViews must be sync'd to host by now. "
2782  << "imports_nc.need_sync_host()="
2783  << (imports.need_sync_host () ? "true" : "false")
2784  << ", numPacketsPerLID.need_sync_host()="
2785  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2786  << ", importLIDs.need_sync_host()="
2787  << (importLIDs.need_sync_host () ? "true" : "false")
2788  << "." << endl;
2789  if (verbose) {
2790  std::cerr << os.str ();
2791  }
2792  err << os.str ();
2793  return;
2794  }
2795 
2796  const auto importLIDsHost = importLIDs.view_host ();
2797  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2798 
2799  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2800  // loop, by only unpacking on final==true (when we know the
2801  // current offset's value).
2802 
 // Exclusive prefix sum of the per-LID byte counts: offset(i) is the
 // byte position in 'imports' where LID i's packet run begins.
2803  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2804  {
2805  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2806  Kokkos::parallel_scan
2807  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2808  [=] (const size_t &i, size_t &update, const bool &final) {
2809  if (final) offset(i) = update;
2810  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2811  });
2812  }
2813 
2814  // this variable does not matter with a race condition (just error flag)
2815  //
2816  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2817  // or atomics on bool, so we use int instead. (I know we're not
2818  // launching a CUDA loop here, but we could in the future, and I'd
2819  // like to avoid potential trouble.)
2820  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2821  errorDuringUnpack ("errorDuringUnpack");
2822  errorDuringUnpack () = 0;
2823  {
2824  using policy_type = Kokkos::TeamPolicy<host_exec>;
 // One team per import LID; per-team scratch holds the row's global
 // indices, local indices, and unpacked scalar values.
2825  const auto policy = policy_type (numImportLIDs, 1, 1)
2826  .set_scratch_size (0, Kokkos::PerTeam (sizeof (GO) * maxRowNumEnt +
2827  sizeof (LO) * maxRowNumEnt +
2828  numBytesPerValue * maxRowNumScalarEnt));
2829  using host_scratch_space = typename host_exec::scratch_memory_space;
2830  using pair_type = Kokkos::pair<size_t, size_t>;
2831  Kokkos::parallel_for
2832  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2833  [=] (const typename policy_type::member_type& member) {
2834  const size_t i = member.league_rank();
2835 
2836  Kokkos::View<GO*, host_scratch_space> gblColInds
2837  (member.team_scratch (0), maxRowNumEnt);
2838  Kokkos::View<LO*, host_scratch_space> lclColInds
2839  (member.team_scratch (0), maxRowNumEnt);
2840  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2841  (member.team_scratch (0), maxRowNumScalarEnt);
2842 
2843  const size_t offval = offset(i);
2844  const LO lclRow = importLIDsHost(i);
2845  const size_t numBytes = numPacketsPerLIDHost(i);
2846  const size_t numEnt =
2847  unpackRowCount<impl_scalar_type, LO, GO>
2848  (imports.view_host (), offval, numBytes, numBytesPerValue);
2849 
2850  if (numBytes > 0) {
2851  if (numEnt > maxRowNumEnt) {
2852  errorDuringUnpack() = 1;
2853  if (verbose) {
2854  std::ostringstream os;
2855  os << prefix
2856  << "At i = " << i << ", numEnt = " << numEnt
2857  << " > maxRowNumEnt = " << maxRowNumEnt
2858  << std::endl;
2859  std::cerr << os.str ();
2860  }
2861  return;
2862  }
2863  }
2864  const size_t numScalarEnt = numEnt*blockSize*blockSize;
2865  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2866  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2867  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2868 
2869  // Kyungjoo: additional wrapping scratch view is necessary
2870  // the following function interface need the same execution space
2871  // host scratch space somehow is not considered same as the host_exec
2872  size_t numBytesOut = 0;
2873  try {
2874  numBytesOut =
2875  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2876  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2877  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2878  imports.view_host(),
2879  offval, numBytes, numEnt,
2880  numBytesPerValue, blockSize);
2881  }
2882  catch (std::exception& e) {
2883  errorDuringUnpack() = 1;
2884  if (verbose) {
2885  std::ostringstream os;
2886  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2887  << e.what () << endl;
2888  std::cerr << os.str ();
2889  }
2890  return;
2891  }
2892 
 // Consistency check: the bytes consumed must match the packet count
 // the sender declared for this LID.
2893  if (numBytes != numBytesOut) {
2894  errorDuringUnpack() = 1;
2895  if (verbose) {
2896  std::ostringstream os;
2897  os << prefix
2898  << "At i = " << i << ", numBytes = " << numBytes
2899  << " != numBytesOut = " << numBytesOut << "."
2900  << std::endl;
2901  std::cerr << os.str();
2902  }
2903  return;
2904  }
2905 
2906  // Convert incoming global indices to local indices.
 // NOTE(review): unlike the other failure branches in this lambda,
 // the "GID not owned" case below returns WITHOUT setting
 // errorDuringUnpack, so the error goes unreported unless verbose --
 // confirm whether that is intentional.
2907  for (size_t k = 0; k < numEnt; ++k) {
2908  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2909  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2910  if (verbose) {
2911  std::ostringstream os;
2912  os << prefix
2913  << "At i = " << i << ", GID " << gidsOut(k)
2914  << " is not owned by the calling process."
2915  << std::endl;
2916  std::cerr << os.str();
2917  }
2918  return;
2919  }
2920  }
2921 
2922  // Combine the incoming data with the matrix's current data.
2923  LO numCombd = 0;
2924  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2925  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2926  (const_cast<const impl_scalar_type*> (valsOut.data ()));
2927  if (combineMode == ADD) {
2928  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2929  } else if (combineMode == INSERT || combineMode == REPLACE) {
2930  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2931  } else if (combineMode == ABSMAX) {
2932  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2933  }
2934 
 // Every received entry should have been combined into the matrix.
2935  if (static_cast<LO> (numEnt) != numCombd) {
2936  errorDuringUnpack() = 1;
2937  if (verbose) {
2938  std::ostringstream os;
2939  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2940  << " != numCombd = " << numCombd << "."
2941  << endl;
2942  std::cerr << os.str ();
2943  }
2944  return;
2945  }
2946  }); // for each import LID i
2947  }
2948 
2949  if (errorDuringUnpack () != 0) {
2950  std::ostream& err = this->markLocalErrorAndGetStream ();
2951  err << prefix << "Unpacking failed.";
2952  if (! verbose) {
2953  err << " Please run again with the environment variable setting "
2954  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2955  }
2956  err << endl;
2957  }
2958 
2959  if (verbose) {
2960  std::ostringstream os;
2961  const bool lclSuccess = ! (* (this->localError_));
2962  os << prefix
2963  << (lclSuccess ? "succeeded" : "FAILED")
2964  << std::endl;
2965  std::cerr << os.str ();
2966  }
2967  }
2968 
2969  template<class Scalar, class LO, class GO, class Node>
2970  std::string
2972  {
2973  using Teuchos::TypeNameTraits;
2974  std::ostringstream os;
2975  os << "\"Tpetra::BlockCrsMatrix\": { "
2976  << "Template parameters: { "
2977  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2978  << "LO: " << TypeNameTraits<LO>::name ()
2979  << "GO: " << TypeNameTraits<GO>::name ()
2980  << "Node: " << TypeNameTraits<Node>::name ()
2981  << " }"
2982  << ", Label: \"" << this->getObjectLabel () << "\""
2983  << ", Global dimensions: ["
2984  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2985  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2986  << ", Block size: " << getBlockSize ()
2987  << " }";
2988  return os.str ();
2989  }
2990 
2991 
 // describe: Multi-line, rank-aware pretty printer.  VERB_LOW prints a
 // summary; VERB_MEDIUM adds the constituent Maps; VERB_EXTREME prints
 // every process's rows, funneled to Process 0 via point-to-point
 // Teuchos comm calls so output is not interleaved.
2992  template<class Scalar, class LO, class GO, class Node>
2993  void
 // NOTE(review): the qualified-name line (BlockCrsMatrix<...>::) was
 // apparently dropped by extraction here -- confirm against original.
2995  describe (Teuchos::FancyOStream& out,
2996  const Teuchos::EVerbosityLevel verbLevel) const
2997  {
2998  using Teuchos::ArrayRCP;
2999  using Teuchos::CommRequest;
3000  using Teuchos::FancyOStream;
3001  using Teuchos::getFancyOStream;
3002  using Teuchos::ireceive;
3003  using Teuchos::isend;
3004  using Teuchos::outArg;
3005  using Teuchos::TypeNameTraits;
3006  using Teuchos::VERB_DEFAULT;
3007  using Teuchos::VERB_NONE;
3008  using Teuchos::VERB_LOW;
3009  using Teuchos::VERB_MEDIUM;
3010  // using Teuchos::VERB_HIGH;
3011  using Teuchos::VERB_EXTREME;
3012  using Teuchos::RCP;
3013  using Teuchos::wait;
3014  using std::endl;
3015 
3016  // Set default verbosity if applicable.
3017  const Teuchos::EVerbosityLevel vl =
3018  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3019 
3020  if (vl == VERB_NONE) {
3021  return; // print nothing
3022  }
3023 
3024  // describe() always starts with a tab before it prints anything.
3025  Teuchos::OSTab tab0 (out);
3026 
3027  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3028  Teuchos::OSTab tab1 (out);
3029 
3030  out << "Template parameters:" << endl;
3031  {
3032  Teuchos::OSTab tab2 (out);
3033  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3034  << "LO: " << TypeNameTraits<LO>::name () << endl
3035  << "GO: " << TypeNameTraits<GO>::name () << endl
3036  << "Node: " << TypeNameTraits<Node>::name () << endl;
3037  }
3038  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3039  << "Global dimensions: ["
3040  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3041  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3042 
3043  const LO blockSize = getBlockSize ();
3044  out << "Block size: " << blockSize << endl;
3045 
3046  // constituent objects
3047  if (vl >= VERB_MEDIUM) {
3048  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3049  const int myRank = comm.getRank ();
3050  if (myRank == 0) {
3051  out << "Row Map:" << endl;
3052  }
3053  getRowMap()->describe (out, vl);
3054 
 // For each remaining Map, avoid re-printing a Map that is literally
 // the same object (RCP identity) as one already printed above.
3055  if (! getColMap ().is_null ()) {
3056  if (getColMap() == getRowMap()) {
3057  if (myRank == 0) {
3058  out << "Column Map: same as row Map" << endl;
3059  }
3060  }
3061  else {
3062  if (myRank == 0) {
3063  out << "Column Map:" << endl;
3064  }
3065  getColMap ()->describe (out, vl);
3066  }
3067  }
3068  if (! getDomainMap ().is_null ()) {
3069  if (getDomainMap () == getRowMap ()) {
3070  if (myRank == 0) {
3071  out << "Domain Map: same as row Map" << endl;
3072  }
3073  }
3074  else if (getDomainMap () == getColMap ()) {
3075  if (myRank == 0) {
3076  out << "Domain Map: same as column Map" << endl;
3077  }
3078  }
3079  else {
3080  if (myRank == 0) {
3081  out << "Domain Map:" << endl;
3082  }
3083  getDomainMap ()->describe (out, vl);
3084  }
3085  }
3086  if (! getRangeMap ().is_null ()) {
3087  if (getRangeMap () == getDomainMap ()) {
3088  if (myRank == 0) {
3089  out << "Range Map: same as domain Map" << endl;
3090  }
3091  }
3092  else if (getRangeMap () == getRowMap ()) {
3093  if (myRank == 0) {
3094  out << "Range Map: same as row Map" << endl;
3095  }
3096  }
3097  else {
3098  if (myRank == 0) {
3099  out << "Range Map: " << endl;
3100  }
3101  getRangeMap ()->describe (out, vl);
3102  }
3103  }
3104  }
3105 
3106  if (vl >= VERB_EXTREME) {
3107  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3108  const int myRank = comm.getRank ();
3109  const int numProcs = comm.getSize ();
3110 
3111  // Print the calling process' data to the given output stream.
 // Each process formats its rows into a local string first, so the
 // text can be shipped to Process 0 and printed without interleaving.
3112  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3113  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3114  FancyOStream& os = *osPtr;
3115  os << "Process " << myRank << ":" << endl;
3116  Teuchos::OSTab tab2 (os);
3117 
3118  const map_type& meshRowMap = * (graph_.getRowMap ());
3119  const map_type& meshColMap = * (graph_.getColMap ());
3120  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3121  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3122  ++meshLclRow) {
3123  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3124  os << "Row " << meshGblRow << ": {";
3125 
3126  local_inds_host_view_type lclColInds;
3127  values_host_view_type vals;
3128  LO numInds = 0;
3129  this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
3130 
3131  for (LO k = 0; k < numInds; ++k) {
3132  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3133 
 // Blocks are stored row-major: entry (i,j) of block k lives at
 // vals[k*bs*bs + i*bs + j].  Rows within a block are separated by
 // "; ", entries within a row by ", ".
3134  os << "Col " << gblCol << ": [";
3135  for (LO i = 0; i < blockSize; ++i) {
3136  for (LO j = 0; j < blockSize; ++j) {
3137  os << vals[blockSize*blockSize*k + i*blockSize + j];
3138  if (j + 1 < blockSize) {
3139  os << ", ";
3140  }
3141  }
3142  if (i + 1 < blockSize) {
3143  os << "; ";
3144  }
3145  }
3146  os << "]";
3147  if (k + 1 < numInds) {
3148  os << ", ";
3149  }
3150  }
3151  os << "}" << endl;
3152  }
3153 
3154  // Print data on Process 0. This will automatically respect the
3155  // current indentation level.
3156  if (myRank == 0) {
3157  out << lclOutStrPtr->str ();
3158  lclOutStrPtr = Teuchos::null; // clear it to save space
3159  }
3160 
3161  const int sizeTag = 1337;
3162  const int dataTag = 1338;
3163 
3164  ArrayRCP<char> recvDataBuf; // only used on Process 0
3165 
3166  // Send string sizes and data from each process in turn to
3167  // Process 0, and print on that process.
3168  for (int p = 1; p < numProcs; ++p) {
3169  if (myRank == 0) {
3170  // Receive the incoming string's length.
3171  ArrayRCP<size_t> recvSize (1);
3172  recvSize[0] = 0;
3173  RCP<CommRequest<int> > recvSizeReq =
3174  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3175  wait<int> (comm, outArg (recvSizeReq));
3176  const size_t numCharsToRecv = recvSize[0];
3177 
3178  // Allocate space for the string to receive. Reuse receive
3179  // buffer space if possible. We can do this because in the
3180  // current implementation, we only have one receive in
3181  // flight at a time. Leave space for the '\0' at the end,
3182  // in case the sender doesn't send it.
3183  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3184  recvDataBuf.resize (numCharsToRecv + 1);
3185  }
3186  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3187  // Post the receive of the actual string data.
3188  RCP<CommRequest<int> > recvDataReq =
3189  ireceive<int, char> (recvData, p, dataTag, comm);
3190  wait<int> (comm, outArg (recvDataReq));
3191 
3192  // Print the received data. This will respect the current
3193  // indentation level. Make sure that the string is
3194  // null-terminated.
3195  recvDataBuf[numCharsToRecv] = '\0';
3196  out << recvDataBuf.getRawPtr ();
3197  }
3198  else if (myRank == p) { // if I am not Process 0, and my rank is p
3199  // This deep-copies the string at most twice, depending on
3200  // whether std::string reference counts internally (it
3201  // generally does, so this won't deep-copy at all).
3202  const std::string stringToSend = lclOutStrPtr->str ();
3203  lclOutStrPtr = Teuchos::null; // clear original to save space
3204 
3205  // Send the string's length to Process 0.
3206  const size_t numCharsToSend = stringToSend.size ();
3207  ArrayRCP<size_t> sendSize (1);
3208  sendSize[0] = numCharsToSend;
3209  RCP<CommRequest<int> > sendSizeReq =
3210  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3211  wait<int> (comm, outArg (sendSizeReq));
3212 
3213  // Send the actual string to Process 0. We know that the
3214  // string has length > 0, so it's save to take the address
3215  // of the first entry. Make a nonowning ArrayRCP to hold
3216  // the string. Process 0 will add a null termination
3217  // character at the end of the string, after it receives the
3218  // message.
3219  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3220  RCP<CommRequest<int> > sendDataReq =
3221  isend<int, char> (sendData, 0, dataTag, comm);
3222  wait<int> (comm, outArg (sendDataReq));
3223  }
3224  } // for each process rank p other than 0
3225  } // extreme verbosity level (print the whole matrix)
3226  }
3227 
 // Communicator over which this matrix is distributed; forwarded from
 // the mesh graph, which owns it.
3228  template<class Scalar, class LO, class GO, class Node>
3229  Teuchos::RCP<const Teuchos::Comm<int> >
3231  getComm() const
3232  {
3233  return graph_.getComm();
3234  }
3235 
3236 
 // Global number of (mesh, i.e., block) columns; delegated to the graph.
3237  template<class Scalar, class LO, class GO, class Node>
3240  getGlobalNumCols() const
3241  {
3242  return graph_.getGlobalNumCols();
3243  }
3244 
 // Number of (mesh) columns owned by the calling process; delegated to
 // the graph.
3245  template<class Scalar, class LO, class GO, class Node>
3246  size_t
3248  getNodeNumCols() const
3249  {
3250  return graph_.getNodeNumCols();
3251  }
3252 
 // Index base of the matrix's Maps; delegated to the graph.
3253  template<class Scalar, class LO, class GO, class Node>
3254  GO
3256  getIndexBase() const
3257  {
3258  return graph_.getIndexBase();
3259  }
3260 
 // Global number of stored block entries; delegated to the graph.
3261  template<class Scalar, class LO, class GO, class Node>
3264  getGlobalNumEntries() const
3265  {
3266  return graph_.getGlobalNumEntries();
3267  }
3268 
 // Number of stored block entries owned by the calling process;
 // delegated to the graph.
3269  template<class Scalar, class LO, class GO, class Node>
3270  size_t
3272  getNodeNumEntries() const
3273  {
3274  return graph_.getNodeNumEntries();
3275  }
3276 
 // Number of block entries in the given global (mesh) row; delegated
 // to the graph.
3277  template<class Scalar, class LO, class GO, class Node>
3278  size_t
3280  getNumEntriesInGlobalRow (GO globalRow) const
3281  {
3282  return graph_.getNumEntriesInGlobalRow(globalRow);
3283  }
3284 
3285 
 // Maximum number of block entries in any row, over all processes;
 // delegated to the graph.
3286  template<class Scalar, class LO, class GO, class Node>
3287  size_t
3290  {
3291  return graph_.getGlobalMaxNumRowEntries();
3292  }
3293 
 // Whether the graph (and therefore this matrix) has a column Map.
3294  template<class Scalar, class LO, class GO, class Node>
3295  bool
3297  hasColMap() const
3298  {
3299  return graph_.hasColMap();
3300  }
3301 
3302 
 // Whether the underlying graph stores local column indices.
3303  template<class Scalar, class LO, class GO, class Node>
3304  bool
3306  isLocallyIndexed() const
3307  {
3308  return graph_.isLocallyIndexed();
3309  }
3310 
 // Whether the underlying graph stores global column indices.
3311  template<class Scalar, class LO, class GO, class Node>
3312  bool
3314  isGloballyIndexed() const
3315  {
3316  return graph_.isGloballyIndexed();
3317  }
3318 
 // Whether fillComplete() has been called on the underlying graph.
3319  template<class Scalar, class LO, class GO, class Node>
3320  bool
3322  isFillComplete() const
3323  {
3324  return graph_.isFillComplete ();
3325  }
3326 
 // RowMatrix interface: this class does not support the (point-entry)
 // row-view access pattern, so always report false.
3327  template<class Scalar, class LO, class GO, class Node>
3328  bool
3330  supportsRowViews() const
3331  {
3332  return false;
3333  }
3334 
 // Not supported: BlockCrsMatrix has no global-index row access.
 // Always throws std::logic_error.
3335  template<class Scalar, class LO, class GO, class Node>
3336  void
3338  getGlobalRowCopy (GO /*GlobalRow*/,
3339  nonconst_global_inds_host_view_type &/*Indices*/,
3340  nonconst_values_host_view_type &/*Values*/,
3341  size_t& /*NumEntries*/) const
3342  {
3343  TEUCHOS_TEST_FOR_EXCEPTION(
3344  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3345  "This class doesn't support global matrix indexing.");
3346 
3347  }
3348 
3349 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
 // Deprecated Teuchos::ArrayView overload; same contract as above:
 // unconditionally throws std::logic_error.
3350  template<class Scalar, class LO, class GO, class Node>
3351  void
3353  getGlobalRowCopy (GO /* GlobalRow */,
3354  const Teuchos::ArrayView<GO> &/* Indices */,
3355  const Teuchos::ArrayView<Scalar> &/* Values */,
3356  size_t &/* NumEntries */) const
3357  {
3358  TEUCHOS_TEST_FOR_EXCEPTION(
3359  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3360  "This class doesn't support global matrix indexing.");
3361 
3362  }
3363 #endif
3364 
3365 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
 // Deprecated Teuchos::ArrayView overload; global indexing is not
 // supported, so this unconditionally throws std::logic_error.
3366  template<class Scalar, class LO, class GO, class Node>
3367  void
3369  getGlobalRowView (GO /* GlobalRow */,
3370  Teuchos::ArrayView<const GO> &/* indices */,
3371  Teuchos::ArrayView<const Scalar> &/* values */) const
3372  {
3373  TEUCHOS_TEST_FOR_EXCEPTION(
3374  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3375  "This class doesn't support global matrix indexing.");
3376 
3377  }
3378 
 // Deprecated Teuchos::ArrayView overload of getLocalRowView; the
 // point-entry ArrayView access pattern is not supported, so this
 // unconditionally throws std::logic_error (use the host-view overload
 // below instead).
3379  template<class Scalar, class LO, class GO, class Node>
3380  void
3382  getLocalRowView (LO /* LocalRow */,
3383  Teuchos::ArrayView<const LO>& /* indices */,
3384  Teuchos::ArrayView<const Scalar>& /* values */) const
3385  {
3386  TEUCHOS_TEST_FOR_EXCEPTION(
3387  true, std::logic_error, "Tpetra::BlockCrsMatrix::getLocalRowView: "
3388  "This class doesn't support local matrix indexing.");
3389  }
3390 #endif
3391 
 // Not supported: BlockCrsMatrix has no global-index row access.
 // Always throws std::logic_error.
3392  template<class Scalar, class LO, class GO, class Node>
3393  void
3395  getGlobalRowView (GO /* GlobalRow */,
3396  global_inds_host_view_type &/* indices */,
3397  values_host_view_type &/* values */) const
3398  {
3399  TEUCHOS_TEST_FOR_EXCEPTION(
3400  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3401  "This class doesn't support global matrix indexing.");
3402 
3403  }
3404 
 // Return host views of the given local (mesh) row's column indices
 // and values.  If the calling process does not own the row, both
 // output views are empty (this is not an error).
3405  template<class Scalar, class LO, class GO, class Node>
3406  void
3408  getLocalRowView (LO localRowInd,
3409  local_inds_host_view_type &colInds,
3410  values_host_view_type &vals) const
3411  {
3412  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3413  colInds = local_inds_host_view_type();
3414  vals = values_host_view_type();
3415  }
3416  else {
 // ptrHost_ holds row offsets (in block entries) into the packed
 // column-index array indHost_; the row's entries are contiguous.
3417  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3418  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3419  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3420 
3421  vals = getValuesHost (localRowInd);
3422  }
3423  }
3424 
3425  template<class Scalar, class LO, class GO, class Node>
3426  void
3428  getLocalRowViewNonConst (LO localRowInd,
3429  local_inds_host_view_type &colInds,
3430  nonconst_values_host_view_type &vals) const
3431  {
3432  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
3433  colInds = nonconst_local_inds_host_view_type();
3434  vals = nonconst_values_host_view_type();
3435  }
3436  else {
3437  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3438  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3439  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3440 
3442  vals = const_cast<this_type&>(*this).getValuesHostNonConst(localRowInd);
3443  }
3444  }
3445 
 // getLocalDiagCopy: copies the point-entry diagonal of the matrix
 // (entry (b,b) of each row's diagonal block) into the output vector
 // via replaceLocalValue.  Runs entirely on host.
 // NOTE(review): the signature lines (return type qualifier and the
 // parameter list declaring 'diag') appear to have been dropped by
 // extraction between "void" and "{" -- confirm against the original.
3446  template<class Scalar, class LO, class GO, class Node>
3447  void
3450  {
3451  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3452 
3453  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3454  graph_.getLocalDiagOffsets (diagOffsets);
3455 
3456  // The code below works on host, so use a host View.
3457  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3458  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3459 
3460  auto vals_host_out = val_.getHostView(Access::ReadOnly);
3461  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
3462 
3463  // TODO amk: This is a temporary measure to make the code run with Ifpack2
 // The walk below assumes the value storage is one contiguous array of
 // bs*bs blocks laid out row by row; rowOffset tracks the start of the
 // current block row in scalars, diagOffsetsHost(r) the diagonal
 // block's position (in block entries) within that row.
3464  size_t rowOffset = 0;
3465  size_t offset = 0;
3466  LO bs = getBlockSize();
3467  for(size_t r=0; r<getNodeNumRows(); r++)
3468  {
3469  // move pointer to start of diagonal block
3470  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3471  for(int b=0; b<bs; b++)
3472  {
 // Within a row-major bs-by-bs block, diagonal entry b is at stride
 // bs+1 from the block's start.
3473  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3474  }
3475  // move pointer to start of next block row
3476  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3477  }
3478 
3479  //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3480  }
3481 
 // Not implemented for BlockCrsMatrix; always throws std::logic_error.
3482  template<class Scalar, class LO, class GO, class Node>
3483  void
3485  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3486  {
3487  TEUCHOS_TEST_FOR_EXCEPTION(
3488  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3489  "not implemented.");
3490 
3491  }
3492 
 // Not implemented for BlockCrsMatrix; always throws std::logic_error.
3493  template<class Scalar, class LO, class GO, class Node>
3494  void
3496  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3497  {
3498  TEUCHOS_TEST_FOR_EXCEPTION(
3499  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3500  "not implemented.");
3501 
3502  }
3503 
 // RowGraph view of this matrix's structure; returns the RCP member
 // graphRCP_ rather than wrapping graph_ on the fly.
3504  template<class Scalar, class LO, class GO, class Node>
3505  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3507  getGraph() const
3508  {
3509  return graphRCP_;
3510  }
3511 
 // Not implemented for BlockCrsMatrix; always throws std::logic_error
 // (hence no return statement despite the non-void return type).
3512  template<class Scalar, class LO, class GO, class Node>
3513  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3515  getFrobeniusNorm () const
3516  {
3517  TEUCHOS_TEST_FOR_EXCEPTION(
3518  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3519  "not implemented.");
3520  }
3521 
3522 } // namespace Tpetra
3523 
3524 //
3525 // Explicit instantiation macro
3526 //
3527 // Must be expanded from within the Tpetra namespace!
3528 //
3529 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3530  template class BlockCrsMatrix< S, LO, GO, NODE >;
3531 
3532 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
Linear algebra kernels for small dense matrices and vectors.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix's communicator.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
virtual size_t getNodeNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but applies the ABSMAX combine mode: each entry is replaced by the maximum of the magnitudes of the old and new values.
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
LO local_ordinal_type
The type of local indices.
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row's entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix's values (val_).
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual typename ::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
std::string description() const override
One-line description of this object.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Kokkos::View< const impl_scalar_type **, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > const_little_block_type
The type used to access const matrix blocks.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos::View.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh,...
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row.
size_t getNodeNumRows() const override
get the local number of block rows
Node::device_type device_type
The Kokkos::Device specialization that this class uses.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
char packet_type
Implementation detail: the type (char, i.e., raw bytes) of each entry in the Import/Export communication buffers.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
global_size_t getGlobalNumRows() const override
get the global number of block rows
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
virtual size_t getNodeNumCols() const override
The number of columns needed to apply the forward operator on this node.
MultiVector for multiple degrees of freedom per mesh point.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distribution.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
One or more distributed dense vectors.
A distributed dense vector.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
int local_ordinal_type
Default value of Scalar template parameter.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra's ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
size_t global_size_t
Global size_t object.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dimensions.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.