Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_packCrsMatrix.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_PACKCRSMATRIX_HPP
43 #define TPETRA_DETAILS_PACKCRSMATRIX_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
51 #include "Kokkos_Core.hpp"
52 #include <memory>
53 #include <string>
54 
63 
64 namespace Tpetra {
65 
66 #ifndef DOXYGEN_SHOULD_SKIP_THIS
67 // Forward declaration of Distributor
68 class Distributor;
69 #endif // DOXYGEN_SHOULD_SKIP_THIS
70 
71 //
72 // Users must never rely on anything in the Details namespace.
73 //
74 namespace Details {
75 
76 template<class OutputOffsetsViewType,
77  class CountsViewType,
78  class InputOffsetsViewType,
79  class InputLocalRowIndicesViewType,
80  const bool debug =
81 #ifdef HAVE_TPETRA_DEBUG
82  true
83 #else
84  false
85 #endif // HAVE_TPETRA_DEBUG
86  >
87 class NumPacketsAndOffsetsFunctor {
88 public:
89  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
90  typedef typename CountsViewType::non_const_value_type count_type;
91  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
92  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
93  // output Views drive where execution happens.
94  typedef typename OutputOffsetsViewType::device_type device_type;
95  static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
96  typename device_type::execution_space>::value,
97  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
98  static_assert (Kokkos::Impl::is_view<OutputOffsetsViewType>::value,
99  "OutputOffsetsViewType must be a Kokkos::View.");
100  static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
101  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
102  static_assert (std::is_integral<output_offset_type>::value,
103  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
104  static_assert (Kokkos::Impl::is_view<CountsViewType>::value,
105  "CountsViewType must be a Kokkos::View.");
106  static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
107  "CountsViewType must be a nonconst Kokkos::View.");
108  static_assert (std::is_integral<count_type>::value,
109  "The type of each entry of CountsViewType must be a built-in integer type.");
110  static_assert (Kokkos::Impl::is_view<InputOffsetsViewType>::value,
111  "InputOffsetsViewType must be a Kokkos::View.");
112  static_assert (std::is_integral<input_offset_type>::value,
113  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
114  static_assert (Kokkos::Impl::is_view<InputLocalRowIndicesViewType>::value,
115  "InputLocalRowIndicesViewType must be a Kokkos::View.");
116  static_assert (std::is_integral<local_row_index_type>::value,
117  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
118 
119  NumPacketsAndOffsetsFunctor (const OutputOffsetsViewType& outputOffsets,
120  const CountsViewType& counts,
121  const InputOffsetsViewType& rowOffsets,
122  const InputLocalRowIndicesViewType& lclRowInds,
123  const count_type sizeOfLclCount,
124  const count_type sizeOfValue,
125  const count_type sizeOfGblColInd) :
126  outputOffsets_ (outputOffsets),
127  counts_ (counts),
128  rowOffsets_ (rowOffsets),
129  lclRowInds_ (lclRowInds),
130  sizeOfLclCount_ (sizeOfLclCount),
131  sizeOfValue_ (sizeOfValue),
132  sizeOfGblColInd_ (sizeOfGblColInd),
133  error_ ("error") // don't forget this, or you'll get segfaults!
134  {
135  if (debug) {
136  const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.dimension_0 ());
137 
138  if (numRowsToPack != static_cast<size_t> (counts_.dimension_0 ())) {
139  std::ostringstream os;
140  os << "lclRowInds.dimension_0() = " << numRowsToPack
141  << " != counts.dimension_0() = " << counts_.dimension_0 ()
142  << ".";
143  throw std::invalid_argument (os.str ());
144  }
145  if (static_cast<size_t> (numRowsToPack + 1) != static_cast<size_t> (outputOffsets_.dimension_0 ())) {
146  std::ostringstream os;
147  os << "lclRowInds.dimension_0() + 1 = " << (numRowsToPack + 1)
148  << " != outputOffsets.dimension_0() = " << outputOffsets_.dimension_0 ()
149  << ".";
150  throw std::invalid_argument (os.str ());
151  }
152  }
153  }
154 
155  KOKKOS_INLINE_FUNCTION void
156  operator() (const local_row_index_type& curInd,
157  output_offset_type& update,
158  const bool final) const
159  {
160  if (debug) {
161  if (curInd < static_cast<local_row_index_type> (0)) {
162  error_ () = 1;
163  return;
164  }
165  }
166 
167  if (final) {
168  if (debug) {
169  if (curInd >= static_cast<local_row_index_type> (outputOffsets_.dimension_0 ())) {
170  error_ () = 2;
171  return;
172  }
173  }
174  outputOffsets_(curInd) = update;
175  }
176 
177  if (curInd < static_cast<local_row_index_type> (counts_.dimension_0 ())) {
178  const auto lclRow = lclRowInds_(curInd);
179  if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.dimension_0 ()) ||
180  static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
181  error_ () = 3;
182  return;
183  }
184  // count_type could differ from the type of each row offset.
185  // For example, row offsets might each be 64 bits, but if their
186  // difference always fits in 32 bits, we may then safely use a
187  // 32-bit count_type.
188  const count_type count =
189  static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
190 
191  // We pack first the number of entries in the row, then that many
192  // global column indices, then that many values. However, if the
193  // number of entries in the row is zero, we pack nothing.
194  const count_type numBytes = (count == 0) ?
195  static_cast<count_type> (0) :
196  sizeOfLclCount_ + count * (sizeOfGblColInd_ + sizeOfValue_);
197 
198  if (final) {
199  counts_(curInd) = numBytes;
200  }
201  update += numBytes;
202  }
203  }
204 
205  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
206  // have join both with and without volatile! Otherwise intrawarp
207  // joins are really slow on GPUs.
208 
210  int getError () const {
211  auto error_h = Kokkos::create_mirror_view (error_);
212  Kokkos::deep_copy (error_h, error_);
213  return error_h ();
214  }
215 
216 private:
217  OutputOffsetsViewType outputOffsets_;
218  CountsViewType counts_;
219  typename InputOffsetsViewType::const_type rowOffsets_;
220  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
221  count_type sizeOfLclCount_;
222  count_type sizeOfValue_;
223  count_type sizeOfGblColInd_;
224  Kokkos::View<int, device_type> error_;
225 };
226 
227 template<class OutputOffsetsViewType,
228  class CountsViewType,
229  class InputOffsetsViewType,
230  class InputLocalRowIndicesViewType>
231 std::pair<typename CountsViewType::non_const_value_type, bool>
232 computeNumPacketsAndOffsets (std::unique_ptr<std::ostringstream>& errStr,
233  const OutputOffsetsViewType& outputOffsets,
234  const CountsViewType& counts,
235  const InputOffsetsViewType& rowOffsets,
236  const InputLocalRowIndicesViewType& lclRowInds,
237  const typename CountsViewType::non_const_value_type sizeOfLclCount,
238  const typename CountsViewType::non_const_value_type sizeOfValue,
239  const typename CountsViewType::non_const_value_type sizeOfGblColInd)
240 {
241  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
242  CountsViewType, typename InputOffsetsViewType::const_type,
243  typename InputLocalRowIndicesViewType::const_type> functor_type;
244  typedef typename CountsViewType::non_const_value_type count_type;
245  typedef typename OutputOffsetsViewType::size_type size_type;
246  typedef typename OutputOffsetsViewType::execution_space execution_space;
247  typedef typename functor_type::local_row_index_type LO;
248  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
249  const char prefix[] = "computeNumPacketsAndOffsets: ";
250 
251  const count_type numRowsToPack = lclRowInds.dimension_0 ();
252  if (numRowsToPack == 0) {
253  return {static_cast<count_type> (0), true}; // nothing to pack, but no error
254  }
255  else {
256  if (rowOffsets.dimension_0 () <= static_cast<size_type> (1)) {
257  if (errStr.get () == NULL) {
258  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
259  }
260  std::ostringstream& os = *errStr;
261  os << prefix
262  << "There is at least one row to pack, but the matrix has no rows. "
263  "lclRowInds.dimension_0() = " << numRowsToPack << ", but "
264  "rowOffsets.dimension_0() = " << rowOffsets.dimension_0 () << " <= 1."
265  << std::endl;
266  return {static_cast<count_type> (0), false};
267  }
268  if (outputOffsets.dimension_0 () != static_cast<size_type> (numRowsToPack + 1)) {
269  if (errStr.get () == NULL) {
270  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
271  }
272  std::ostringstream& os = *errStr;
273  os << prefix
274  << "Output dimension does not match number of rows to pack. "
275  << "outputOffsets.dimension_0() = " << outputOffsets.dimension_0 ()
276  << " != lclRowInds.dimension_0() + 1 = "
277  << static_cast<size_type> (numRowsToPack + 1) << "." << std::endl;
278  return {static_cast<count_type> (0), false};
279  }
280 
281  functor_type f (outputOffsets, counts, rowOffsets,
282  lclRowInds, sizeOfLclCount,
283  sizeOfValue, sizeOfGblColInd);
284 #ifdef HAVE_TPETRA_DEBUG
285  TEUCHOS_TEST_FOR_EXCEPT(static_cast<size_t> (outputOffsets.dimension_0 ()) !=
286  static_cast<size_t> (numRowsToPack + 1));
287  TEUCHOS_TEST_FOR_EXCEPT(static_cast<size_t> (counts.dimension_0 ()) !=
288  static_cast<size_t> (numRowsToPack));
289  TEUCHOS_TEST_FOR_EXCEPT(static_cast<size_t> (outputOffsets.dimension_0 ()) !=
290  static_cast<size_t> (numRowsToPack + 1));
291 #endif // HAVE_TPETRA_DEBUG
292 
293  Kokkos::parallel_scan (range_type (0, numRowsToPack+1), f);
294 
295  // At least in debug mode, this functor checks for errors.
296  const int errCode = f.getError ();
297  if (errCode != 0) {
298  if (errStr.get () == NULL) {
299  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
300  }
301  std::ostringstream& os = *errStr;
302  os << prefix
303  << "NumPacketsAndOffsetsFunctor reported error code " << errCode
304  << " != 0." << std::endl;
305  return {0, false};
306  }
307 
308 #if 0
309  size_t total = 0;
310  for (LO k = 0; k < numRowsToPack; ++k) {
311  total += counts[k];
312  }
313  if (outputOffsets(numRowsToPack) != total) {
314  if (errStr.get () == NULL) {
315  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
316  }
317  std::ostringstream& os = *errStr;
318  os << prefix
319  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
320  << outputOffsets(numRowsToPack) << " != sum of counts = "
321  << total << "." << std::endl;
322  if (numRowsToPack != 0) {
323  // Only print the array if it's not too long.
324  if (numRowsToPack < static_cast<LO> (10)) {
325  os << "outputOffsets: [";
326  for (LO i = 0; i <= numRowsToPack; ++i) {
327  os << outputOffsets(i);
328  if (static_cast<LO> (i + 1) <= numRowsToPack) {
329  os << ",";
330  }
331  }
332  os << "]" << std::endl;
333  os << "counts: [";
334  for (LO i = 0; i < numRowsToPack; ++i) {
335  os << counts(i);
336  if (static_cast<LO> (i + 1) < numRowsToPack) {
337  os << ",";
338  }
339  }
340  os << "]" << std::endl;
341  }
342  else {
343  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
344  << outputOffsets(numRowsToPack-1) << "." << std::endl;
345  }
346  }
347  return {outputOffsets(numRowsToPack), false};
348  }
349 #endif // HAVE_TPETRA_DEBUG
350 
351  // Get last entry of outputOffsets, which is the sum of the entries
352  // of counts. Don't assume UVM.
353  auto outputOffsets_last = Kokkos::subview (outputOffsets, numRowsToPack);
354  auto outputOffsets_last_h = Kokkos::create_mirror_view (outputOffsets_last);
355  return {static_cast<count_type> (outputOffsets_last_h ()), true};
356  }
357 }
358 
363 template<class LO>
365 
366  LO bad_index; // only valid if outOfBounds == true.
367  size_t bad_offset; // only valid if outOfBounds == true.
368  size_t bad_num_bytes; // only valid if outOfBounds == true.
369  bool out_of_bounds_error;
370  bool packing_error;
371 
372  KOKKOS_INLINE_FUNCTION PackCrsMatrixError () :
373  bad_index (Tpetra::Details::OrdinalTraits<LO>::invalid ()),
374  bad_offset (0),
375  bad_num_bytes (0),
376  out_of_bounds_error (false),
377  packing_error (false)
378  {}
379 
380  bool success() const
381  {
382  // Any possible error would result in bad_index being changed from
383  // `invalid` to the index associated with the error.
384  return bad_index == Tpetra::Details::OrdinalTraits<LO>::invalid();
385  }
386 
387  std::string summary() const
388  {
389  std::ostringstream os;
390  os << "First bad index: " << bad_index
391  << ", first bad offset: " << bad_offset
392  << ", first bad number of bytes: " << bad_num_bytes
393  << ", out of bounds error?: " << (out_of_bounds_error ? "true" : "false");
394  return os.str();
395  }
396 };
397 
398 template<class NumPacketsPerLidViewType,
399  class OffsetsViewType,
400  class ExportsViewType,
401  class ExportLidsViewType,
402  class LocalMatrixType,
403  class LocalMapType>
404 struct PackCrsMatrixFunctor {
405  typedef NumPacketsPerLidViewType num_packets_per_lid_view_type;
406  typedef OffsetsViewType offsets_view_type;
407  typedef ExportsViewType exports_view_type;
408  typedef ExportLidsViewType export_lids_view_type;
409 
410  typedef typename num_packets_per_lid_view_type::non_const_value_type count_type;
411  typedef typename offsets_view_type::non_const_value_type offset_type;
412  typedef typename LocalMatrixType::value_type IST;
413  typedef typename LocalMatrixType::ordinal_type LO;
414  typedef typename LocalMapType::global_ordinal_type GO;
415  typedef PackCrsMatrixError<LO> value_type;
416 
417  static_assert (std::is_same<LO, typename LocalMatrixType::ordinal_type>::value,
418  "LocalMapType::local_ordinal_type and "
419  "LocalMatrixType::ordinal_type must be the same.");
420 
421  num_packets_per_lid_view_type num_packets_per_lid_;
422  offsets_view_type offsets_;
423  exports_view_type exports_;
424  export_lids_view_type export_lids_;
425  LocalMatrixType local_matrix_;
426  LocalMapType local_col_map_;
427 
428  PackCrsMatrixFunctor (const num_packets_per_lid_view_type& num_packets_per_lid,
429  const offsets_view_type& offsets,
430  const exports_view_type& exports,
431  const export_lids_view_type& export_lids,
432  const LocalMatrixType& local_matrix,
433  const LocalMapType& local_col_map) :
434  num_packets_per_lid_ (num_packets_per_lid),
435  offsets_ (offsets),
436  exports_ (exports),
437  export_lids_ (export_lids),
438  local_matrix_ (local_matrix),
439  local_col_map_ (local_col_map)
440  {}
441 
442  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
443  {
444  dst.bad_index = Tpetra::Details::OrdinalTraits<LO>::invalid();
445  dst.bad_offset = 0;
446  dst.bad_num_bytes = 0;
447  dst.out_of_bounds_error = false;
448  dst.packing_error = false;
449  }
450 
451  KOKKOS_INLINE_FUNCTION void
452  join (volatile value_type& dst, const volatile value_type& src) const
453  {
454  // The dst object should reflect the first (least) bad index and
455  // all other associated error codes and data. Thus, we need only
456  // check if the src object shows an error and if its associated
457  // `bad_index` is less than the dst.bad_index (if dst shows
458  // errors).
459  LO invalid = Tpetra::Details::OrdinalTraits<LO>::invalid();
460  if (src.bad_index != invalid) {
461  // An error in the src; check if
462  // 1. The dst shows errors
463  // 2. If dst does show errors, if src bad_index is less than
464  // its bad index
465  if (dst.bad_index == invalid || src.bad_index < dst.bad_index) {
466  dst.bad_index = src.bad_index;
467  dst.bad_offset = src.bad_offset;
468  dst.bad_num_bytes = src.bad_num_bytes;
469  dst.out_of_bounds_error = src.out_of_bounds_error;
470  dst.packing_error = src.packing_error;
471  }
472  }
473  }
474 
475  KOKKOS_INLINE_FUNCTION
476  void operator() (const LO i, value_type& dst) const
477  {
478  const LO export_lid = export_lids_[i];
479  const size_t buf_size = exports_.size();
480  const size_t num_ent =
481  static_cast<size_t> (local_matrix_.graph.row_map[export_lid+1]
482  - local_matrix_.graph.row_map[export_lid]);
483 
484  // Only pack this row's data if it has a nonzero number of
485  // entries. We can do this because receiving processes get the
486  // number of packets, and will know that zero packets means zero
487  // entries.
488  if (num_ent != 0) {
489  char* const num_ent_beg = exports_.ptr_on_device() + offsets_(i);
490  char* const num_ent_end = num_ent_beg + sizeof (LO);
491  char* const val_beg = num_ent_end;
492  char* const val_end = val_beg + num_ent * sizeof (IST);
493  char* const ind_beg = val_end;
494  const size_t num_bytes = num_packets_per_lid_(i);
495 
496  if ((offsets_(i) > buf_size || offsets_(i) + num_bytes > buf_size)) {
497  dst.out_of_bounds_error = true;
498  }
499  else {
500  dst.packing_error = ! packCrsMatrixRow(local_matrix_, local_col_map_,
501  num_ent_beg, val_beg, ind_beg, num_ent, export_lid);
502  }
503  if (dst.out_of_bounds_error || dst.packing_error) {
504  dst.bad_index = i;
505  dst.bad_offset = offsets_(i);
506  dst.bad_num_bytes = num_bytes;
507  }
508  }
509  }
510 };
511 
524 template<class LocalMatrixType, class LocalMapType>
525 KOKKOS_FUNCTION bool
526 packCrsMatrixRow (const LocalMatrixType& lclMatrix,
527  const LocalMapType& lclColMap,
528  char* const numEntOut,
529  char* const valOut,
530  char* const indOut,
531  const size_t numEnt, // number of entries in row
532  const typename LocalMatrixType::ordinal_type lclRow)
533 {
534  using Kokkos::subview;
535  typedef LocalMatrixType local_matrix_type;
536  typedef LocalMapType local_map_type;
537  typedef typename local_matrix_type::value_type IST;
538  typedef typename local_matrix_type::ordinal_type LO;
539  typedef typename local_matrix_type::size_type offset_type;
540  typedef typename local_map_type::global_ordinal_type GO;
541  typedef Kokkos::pair<offset_type, offset_type> pair_type;
542 
543  const LO numEntLO = static_cast<LO> (numEnt);
544  // As of CUDA 6, it's totally fine to use memcpy in a CUDA device
545  // function. It does what one would expect.
546  memcpy (numEntOut, &numEntLO, sizeof (LO));
547 
548  if (numEnt == 0) {
549  return true; // nothing more to pack
550  }
551 
552 #ifdef HAVE_TPETRA_DEBUG
553  if (lclRow >= lclMatrix.numRows () ||
554  (static_cast<size_t> (lclRow + 1) >=
555  static_cast<size_t> (lclMatrix.graph.row_map.dimension_0 ()))) {
556 #else // NOT HAVE_TPETRA_DEBUG
557  if (lclRow >= lclMatrix.numRows ()) {
558 #endif // HAVE_TPETRA_DEBUG
559  // It's bad if this is not a valid local row index. One thing
560  // we can do is just pack the flag invalid value for the column
561  // indices. That makes sure that the receiving process knows
562  // something went wrong.
563  const GO flagInd = Tpetra::Details::OrdinalTraits<GO>::invalid ();
564  for (size_t k = 0; k < numEnt; ++k) {
565  // As of CUDA 6, it's totally fine to use memcpy in a CUDA
566  // device function. It does what one would expect.
567  memcpy (indOut + k * sizeof (GO), &flagInd, sizeof (GO));
568  }
569  // The values don't actually matter, but we might as well pack
570  // something here.
571  const IST zero = Kokkos::ArithTraits<IST>::zero ();
572  for (size_t k = 0; k < numEnt; ++k) {
573  // As of CUDA 6, it's totally fine to use memcpy in a CUDA
574  // device function. It does what one would expect.
575  memcpy (valOut + k * sizeof (IST), &zero, sizeof (IST));
576  }
577  return false;
578  }
579 
580  // Since the matrix is locally indexed on the calling process, we
581  // have to use its column Map (which it _must_ have in this case)
582  // to convert to global indices.
583  const offset_type rowBeg = lclMatrix.graph.row_map[lclRow];
584  const offset_type rowEnd = lclMatrix.graph.row_map[lclRow + 1];
585 
586  auto indIn = subview (lclMatrix.graph.entries, pair_type (rowBeg, rowEnd));
587  auto valIn = subview (lclMatrix.values, pair_type (rowBeg, rowEnd));
588 
589  // Copy column indices one at a time, so that we don't need
590  // temporary storage.
591  for (size_t k = 0; k < numEnt; ++k) {
592  const GO gblIndIn = lclColMap.getGlobalElement (indIn[k]);
593  // As of CUDA 6, it's totally fine to use memcpy in a CUDA
594  // device function. It does what one would expect.
595  memcpy (indOut + k * sizeof (GO), &gblIndIn, sizeof (GO));
596  }
597  // As of CUDA 6, it's totally fine to use memcpy in a CUDA device
598  // function. It does what one would expect.
599  memcpy (valOut, valIn.ptr_on_device (), numEnt * sizeof (IST));
600  return true;
601 }
602 
603 
604 
605 
630 template<class LocalMatrixType, class LocalMapType>
631 bool
632 packCrsMatrix (const LocalMatrixType& lclMatrix,
633  const LocalMapType& lclColMap,
634  std::unique_ptr<std::string>& errStr,
635  Teuchos::Array<char>& exports,
636  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
637  size_t& constantNumPackets,
638  const Teuchos::ArrayView<const typename LocalMatrixType::ordinal_type>& exportLIDs,
639  const int myRank,
640  Distributor& /* dist */)
641 {
642  using Kokkos::View;
643  using Kokkos::HostSpace;
644  using Kokkos::MemoryUnmanaged;
646  typedef typename LocalMapType::local_ordinal_type LO;
647  typedef typename LocalMapType::global_ordinal_type GO;
648  typedef typename LocalMatrixType::value_type IST;
649  typedef typename LocalMatrixType::device_type device_type;
650  typedef typename device_type::execution_space execution_space;
651  typedef typename Kokkos::RangePolicy<execution_space, LO> range_type;
652  const char prefix[] = "Tpetra::Details::packCrsMatrix: ";
653 
654  static_assert (std::is_same<LO, typename LocalMatrixType::ordinal_type>::value,
655  "LocalMapType::local_ordinal_type and "
656  "LocalMatrixType::ordinal_type must be the same.");
657  // Setting this to zero tells the caller to expect a possibly
658  // different ("nonconstant") number of packets per local index
659  // (i.e., a possibly different number of entries per row).
660  constantNumPackets = 0;
661 
662  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.size ());
663  if (numExportLIDs != static_cast<size_t> (numPacketsPerLID.size ())) {
664  std::ostringstream os;
665  os << prefix << "exportLIDs.size() = " << numExportLIDs
666  << " != numPacketsPerLID.size() = " << numPacketsPerLID.size ()
667  << "." << std::endl;
668  if (errStr.get () == NULL) {
669  errStr = std::unique_ptr<std::string> (new std::string (os.str ()));
670  }
671  else {
672  *errStr = *errStr + os.str ();
673  }
674  return false;
675  }
676  if (numExportLIDs != 0 && numPacketsPerLID.getRawPtr () == NULL) {
677  std::ostringstream os;
678  os << prefix << "numExportLIDs = " << numExportLIDs << " != 0, but "
679  << "numPacketsPerLID.getRawPtr() = " << numPacketsPerLID.getRawPtr ()
680  << " != NULL." << std::endl;
681  if (errStr.get () == NULL) {
682  errStr = std::unique_ptr<std::string> (new std::string (os.str ()));
683  }
684  else {
685  *errStr = *errStr + os.str ();
686  }
687  return false;
688  }
689 
690  if (numExportLIDs == 0) {
691  exports.resize (0);
692  return true; // nothing to pack
693  }
694 
695  typename LocalMatrixType::device_type outputDevice;
697  // This is an output array, so we don't have to copy to device here.
698  // However, we'll have to remember to copy back to host when done.
699  auto numPktPerLid_d =
701  numPacketsPerLID.getRawPtr (),
702  numPacketsPerLID.size (),
703  false,
704  "numPktPerLid");
705  // This is an input array, so we have to copy to device here.
706  // However, we never need to copy it back to host.
707  auto packLids_d =
709  exportLIDs.getRawPtr (),
710  exportLIDs.size (),
711  true,
712  "packLids");
713  // Array of offsets into the pack buffer.
714  Kokkos::View<size_t*, device_type> packOffsets_d ("packOffsets",
715  numExportLIDs + 1);
716  // Compute number of packets per LID (row to send), as well as
717  // corresponding offsets (the prefix sum of the packet counts).
718  std::unique_ptr<std::ostringstream> errStrm;
719  std::pair<size_t, bool> countResult =
720  computeNumPacketsAndOffsets (errStrm,
721  packOffsets_d,
722  numPktPerLid_d,
723  lclMatrix.graph.row_map,
724  packLids_d,
725  sizeof (LO),
726  sizeof (IST),
727  sizeof (GO));
728 
729  if (! countResult.second) {
730  if (errStr.get () == NULL) {
731  errStr = std::unique_ptr<std::string> (new std::string (errStrm->str ()));
732  }
733  else {
734  *errStr = *errStr + errStrm->str ();
735  }
736  return false;
737  }
738 
739  // The counts were an output of computeNumPacketsAndOffsets, so we
740  // have to copy them back to host.
741  typename decltype (numPktPerLid_d)::HostMirror numPktPerLid_h (numPacketsPerLID.getRawPtr (),
742  numPacketsPerLID.size ());
743  Kokkos::deep_copy (numPktPerLid_h, numPktPerLid_d);
744 
745  // Resize the output pack buffer if needed.
746  if (countResult.first > static_cast<size_t> (exports.size ())) {
747  exports.resize (countResult.first);
748  }
749  // Make device version of output pack buffer. This is an output
750  // array, so we don't have to copy to device here.
751  auto packBuf_d =
753  exports.getRawPtr (),
754  countResult.first,
755  false,
756  "packBuf");
757  // Now do the actual pack!
758  typedef PackCrsMatrixFunctor<
759  decltype (numPktPerLid_d),
760  decltype (packOffsets_d),
761  decltype (packBuf_d),
762  decltype (packLids_d),
763  LocalMatrixType,
764  LocalMapType> pack_functor_type;
765  pack_functor_type packer (numPktPerLid_d, packOffsets_d, packBuf_d,
766  packLids_d, lclMatrix, lclColMap);
767  typename pack_functor_type::value_type result;
768  Kokkos::parallel_reduce (range_type (0, numExportLIDs), packer, result);
769 
770  if (! result.success ()) {
771  std::ostringstream os;
772  os << "Proc " << myRank << ": packCrsMatrix failed. "
773  << result.summary()
774  << std::endl;
775  if (errStr.get () != NULL) {
776  errStr = std::unique_ptr<std::string> (new std::string ());
777  *errStr = os.str ();
778  }
779  return false;
780  }
781 
782  // Copy pack result back to host, if needed.
783  typename decltype (packBuf_d)::HostMirror packBuf_h (exports.getRawPtr (),
784  countResult.first);
785  Kokkos::deep_copy (packBuf_h, packBuf_d);
786  return true; // if we got this far, we succeeded
787 }
788 
789 } // namespace Details
790 } // namespace Tpetra
791 
792 #endif // TPETRA_DETAILS_PACKCRSMATRIX_HPP
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
bool packCrsMatrix(const LocalMatrixType &lclMatrix, const LocalMapType &lclColMap, std::unique_ptr< std::string > &errStr, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t &constantNumPackets, const Teuchos::ArrayView< const typename LocalMatrixType::ordinal_type > &exportLIDs, const int myRank, Distributor &)
Pack specified entries of the given local sparse matrix for communication.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Reduction result for PackCrsMatrixFunctor below.
Implementation details of Tpetra.
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
Sets up and executes a communication plan for a Tpetra DistObject.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
KOKKOS_FUNCTION bool packCrsMatrixRow(const LocalMatrixType &lclMatrix, const LocalMapType &lclColMap, char *const numEntOut, char *const valOut, char *const indOut, const size_t numEnt, const typename LocalMatrixType::ordinal_type lclRow)
Packs a single row of the CrsMatrix.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...