Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
46 #include "Teuchos_OrdinalTraits.hpp"
54 #include "Kokkos_Core.hpp"
55 #include <memory>
56 #include <string>
57 
78 
79 namespace Tpetra {
80 
81 //
82 // Users must never rely on anything in the Details namespace.
83 //
84 namespace Details {
85 
86 namespace UnpackAndCombineCrsMatrixImpl {
87 
97 template<class ST, class LO, class GO>
98 KOKKOS_FUNCTION int
99 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
100  const typename PackTraits<int>::output_array_type& pids_out,
101  const typename PackTraits<ST>::output_array_type& vals_out,
102  const char imports[],
103  const size_t offset,
104  const size_t /* num_bytes */,
105  const size_t num_ent,
106  const size_t bytes_per_value)
107 {
108  if (num_ent == 0) {
109  // Empty rows always take zero bytes, to ensure sparsity.
110  return 0;
111  }
112  bool unpack_pids = pids_out.size() > 0;
113 
114  const size_t num_ent_beg = offset;
115  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
116 
117  const size_t gids_beg = num_ent_beg + num_ent_len;
118  const size_t gids_len =
119  num_ent * PackTraits<GO>::packValueCount (GO (0));
120 
121  const size_t pids_beg = gids_beg + gids_len;
122  const size_t pids_len = unpack_pids ?
123  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
124  size_t (0);
125 
126  const size_t vals_beg = gids_beg + gids_len + pids_len;
127  const size_t vals_len = num_ent * bytes_per_value;
128 
129  const char* const num_ent_in = imports + num_ent_beg;
130  const char* const gids_in = imports + gids_beg;
131  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
132  const char* const vals_in = imports + vals_beg;
133 
134  size_t num_bytes_out = 0;
135  LO num_ent_out;
136  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
137  if (static_cast<size_t> (num_ent_out) != num_ent) {
138  return 20; // error code
139  }
140 
141  {
142  Kokkos::pair<int, size_t> p;
143  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
144  if (p.first != 0) {
145  return 21; // error code
146  }
147  num_bytes_out += p.second;
148 
149  if (unpack_pids) {
150  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
151  if (p.first != 0) {
152  return 22; // error code
153  }
154  num_bytes_out += p.second;
155  }
156 
157  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
158  if (p.first != 0) {
159  return 23; // error code
160  }
161  num_bytes_out += p.second;
162  }
163 
164  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165  if (num_bytes_out != expected_num_bytes) {
166  return 24; // error code
167  }
168  return 0; // no errors
169 }
170 
181 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
183  typedef LocalMatrix local_matrix_type;
184  typedef LocalMap local_map_type;
185 
186  typedef typename local_matrix_type::value_type ST;
187  typedef typename local_map_type::local_ordinal_type LO;
188  typedef typename local_map_type::global_ordinal_type GO;
189  typedef typename local_map_type::device_type DT;
190  typedef typename DT::execution_space XS;
191 
192  typedef Kokkos::View<const size_t*, BufferDeviceType>
193  num_packets_per_lid_type;
194  typedef Kokkos::View<const size_t*, DT> offsets_type;
195  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
197 
198  typedef Kokkos::View<int, DT> error_type;
199  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
200 
201  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202  "LocalMap::local_ordinal_type and "
203  "LocalMatrix::ordinal_type must be the same.");
204 
205  local_matrix_type local_matrix;
206  local_map_type local_col_map;
207  input_buffer_type imports;
208  num_packets_per_lid_type num_packets_per_lid;
209  import_lids_type import_lids;
210  Kokkos::View<const LO*[2], DT> batch_info;
211  offsets_type offsets;
212  Tpetra::CombineMode combine_mode;
213  size_t batch_size;
214  size_t bytes_per_value;
215  bool atomic;
216  error_type error_code;
217 
219  const local_matrix_type& local_matrix_in,
220  const local_map_type& local_col_map_in,
221  const input_buffer_type& imports_in,
222  const num_packets_per_lid_type& num_packets_per_lid_in,
223  const import_lids_type& import_lids_in,
224  const Kokkos::View<const LO*[2], DT>& batch_info_in,
225  const offsets_type& offsets_in,
226  const Tpetra::CombineMode combine_mode_in,
227  const size_t batch_size_in,
228  const size_t bytes_per_value_in,
229  const bool atomic_in) :
230  local_matrix (local_matrix_in),
231  local_col_map (local_col_map_in),
232  imports (imports_in),
233  num_packets_per_lid (num_packets_per_lid_in),
234  import_lids (import_lids_in),
235  batch_info (batch_info_in),
236  offsets (offsets_in),
237  combine_mode (combine_mode_in),
238  batch_size (batch_size_in),
239  bytes_per_value (bytes_per_value_in),
240  atomic (atomic_in),
241  error_code("error")
242  {}
243 
244  KOKKOS_INLINE_FUNCTION
245  void operator()(member_type team_member) const
246  {
247  using Kokkos::View;
248  using Kokkos::subview;
249  using Kokkos::MemoryUnmanaged;
250 
251  const LO batch = team_member.league_rank();
252  const LO lid_no = batch_info(batch, 0);
253  const LO batch_no = batch_info(batch, 1);
254 
255  const size_t num_bytes = num_packets_per_lid(lid_no);
256 
257  // Only unpack data if there is a nonzero number of bytes.
258  if (num_bytes == 0)
259  return;
260 
261  // there is actually something in the row
262  const LO import_lid = import_lids(lid_no);
263  const size_t buf_size = imports.size();
264  const size_t offset = offsets(lid_no);
265 
266  // Get the number of entries to expect in the received data for this row.
267  LO num_ent_LO = 0;
268  const char* const in_buf = imports.data() + offset;
269  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
270  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
271 
272  // Count the number of bytes expected to unpack
273  size_t expected_num_bytes = 0;
274  {
275  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
276  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
277  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
278  }
279 
280  if (expected_num_bytes > num_bytes)
281  {
282 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
283 #ifndef KOKKOS_ENABLE_SYCL
284  printf(
285  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
288  );
289 #endif
290  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
291  return;
292  }
293 
294  if (offset > buf_size || offset + num_bytes > buf_size)
295  {
296 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
297 #ifndef KOKKOS_ENABLE_SYCL
298  printf(
299  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300  "At row %d, the offset (%d) > buffer size (%d)\n",
301  (int) lid_no, (int) offset, (int) buf_size
302  );
303 #endif
304  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
305  return;
306  }
307 
308  // Determine the number of entries to unpack in this batch
309  size_t num_entries_in_batch = 0;
310  if (num_entries_in_row <= batch_size)
311  num_entries_in_batch = num_entries_in_row;
312  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313  num_entries_in_batch = batch_size;
314  else
315  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
316 
317  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
318  const size_t num_ent_start = offset;
319  const size_t num_ent_end = num_ent_start + bytes_per_lid;
320 
321  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
322  const size_t gids_start = num_ent_end;
323  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
324 
325  const size_t vals_start = gids_end;
326 
327  const size_t shift = batch_no * batch_size;
328  const char* const num_ent_in = imports.data() + num_ent_start;
329  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
331 
332  LO num_ent_out;
333  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
334  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
335  {
336 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
337 #ifndef KOKKOS_ENABLE_SYCL
338  printf(
339  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
342  );
343 #endif
344  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
345  }
346 
347  constexpr bool matrix_has_sorted_rows = true; // see #6282
348  Kokkos::parallel_for(
349  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
350  KOKKOS_LAMBDA(const LO& j)
351  {
352  size_t distance = 0;
353 
354  GO gid_out;
355  distance = j * bytes_per_gid;
356  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
357  auto lid_out = local_col_map.getLocalElement(gid_out);
358 
359  // Column indices come in as global indices, in case the
360  // source object's column Map differs from the target object's
361  // (this's) column Map, and must be converted local index values
362 
363  // assume that ST is default constructible
364  ST val_out;
365  distance = j * bytes_per_value;
366  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
367 
368  if (combine_mode == ADD) {
369  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
370  // different threads don't touch the same row (i.e., no
371  // duplicates in incoming LIDs list).
372  const bool use_atomic_updates = atomic;
373  (void)local_matrix.sumIntoValues(
374  import_lid,
375  &lid_out,
376  1,
377  &val_out,
378  matrix_has_sorted_rows,
379  use_atomic_updates
380  );
381  } else if (combine_mode == REPLACE) {
382  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
383  // combine mode with multiple incoming rows that touch the same
384  // target matrix entries, so we never need atomic updates.
385  const bool use_atomic_updates = false;
386  (void)local_matrix.replaceValues(
387  import_lid,
388  &lid_out,
389  1,
390  &val_out,
391  matrix_has_sorted_rows,
392  use_atomic_updates
393  );
394  } else {
395  // should never get here
396 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
397 #ifndef KOKKOS_ENABLE_SYCL
398  printf(
399  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
400  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
401  );
402 #endif
403  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
404  }
405  }
406  );
407 
408  team_member.team_barrier();
409 
410  }
411 
413  int error() const {
414  auto error_code_h = Kokkos::create_mirror_view_and_copy(
415  Kokkos::HostSpace(), error_code
416  );
417  return error_code_h();
418  }
419 
420 };
421 
422 struct MaxNumEntTag {};
423 struct TotNumEntTag {};
424 
433 template<class LO, class DT, class BDT>
435 public:
436  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
437  typedef Kokkos::View<const size_t*, DT> offsets_type;
438  typedef Kokkos::View<const char*, BDT> input_buffer_type;
439  // This needs to be public, since it appears in the argument list of
440  // public methods (see below). Otherwise, build errors may happen.
441  typedef size_t value_type;
442 
443 private:
444  num_packets_per_lid_type num_packets_per_lid;
445  offsets_type offsets;
446  input_buffer_type imports;
447 
448 public:
449  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
450  const offsets_type& offsets_in,
451  const input_buffer_type& imports_in) :
452  num_packets_per_lid (num_packets_per_lid_in),
453  offsets (offsets_in),
454  imports (imports_in)
455  {}
456 
457  KOKKOS_INLINE_FUNCTION void
458  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
459  // Get how many entries to expect in the received data for this row.
460  const size_t num_bytes = num_packets_per_lid(i);
461  if (num_bytes > 0) {
462  LO num_ent_LO = 0; // output argument of unpackValue
463  const char* const in_buf = imports.data () + offsets(i);
464  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
465  const size_t num_ent = static_cast<size_t> (num_ent_LO);
466 
467  update = (update < num_ent) ? num_ent : update;
468  }
469  }
470 
471  KOKKOS_INLINE_FUNCTION void
472  join (const MaxNumEntTag,
473  volatile value_type& dst,
474  const volatile value_type& src) const
475  {
476  if (dst < src) dst = src;
477  }
478 
479  KOKKOS_INLINE_FUNCTION void
480  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
481  // Get how many entries to expect in the received data for this row.
482  const size_t num_bytes = num_packets_per_lid(i);
483  if (num_bytes > 0) {
484  LO num_ent_LO = 0; // output argument of unpackValue
485  const char* const in_buf = imports.data () + offsets(i);
486  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
487  tot_num_ent += static_cast<size_t> (num_ent_LO);
488  }
489  }
490 };
491 
499 template<class LO, class DT, class BDT>
500 size_t
502  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
503  const Kokkos::View<const size_t*, DT>& offsets,
504  const Kokkos::View<const char*, BDT>& imports)
505 {
506  typedef typename DT::execution_space XS;
507  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
508  MaxNumEntTag> range_policy;
509 
510  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
511  imports);
512  const LO numRowsToUnpack =
513  static_cast<LO> (num_packets_per_lid.extent (0));
514  size_t max_num_ent = 0;
515  Kokkos::parallel_reduce ("Max num entries in CRS",
516  range_policy (0, numRowsToUnpack),
517  functor, max_num_ent);
518  return max_num_ent;
519 }
520 
528 template<class LO, class DT, class BDT>
529 size_t
531  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
532  const Kokkos::View<const size_t*, DT>& offsets,
533  const Kokkos::View<const char*, BDT>& imports)
534 {
535  typedef typename DT::execution_space XS;
536  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
537  size_t tot_num_ent = 0;
538  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
539  imports);
540  const LO numRowsToUnpack =
541  static_cast<LO> (num_packets_per_lid.extent (0));
542  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
543  range_policy (0, numRowsToUnpack),
544  functor, tot_num_ent);
545  return tot_num_ent;
546 }
547 
548 template<class LO>
549 KOKKOS_INLINE_FUNCTION
550 size_t
551 unpackRowCount(const char imports[],
552  const size_t offset,
553  const size_t num_bytes)
554 {
555  using PT = PackTraits<LO>;
556 
557  LO num_ent_LO = 0;
558  if (num_bytes > 0) {
559  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
560  if (p_num_bytes > num_bytes) {
561  return OrdinalTraits<size_t>::invalid();
562  }
563  const char* const in_buf = imports + offset;
564  (void) PT::unpackValue(num_ent_LO, in_buf);
565  }
566  return static_cast<size_t>(num_ent_LO);
567 }
568 
573 template<class View1, class View2>
574 inline
575 bool
577  const View1& batches_per_lid,
578  View2& batch_info
579 )
580 {
581  using LO = typename View2::value_type;
582  size_t batch = 0;
583  for (size_t i=0; i<batches_per_lid.extent(0); i++)
584  {
585  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
586  {
587  batch_info(batch, 0) = static_cast<LO>(i);
588  batch_info(batch, 1) = batch_no;
589  batch++;
590  }
591  }
592  return batch == batch_info.extent(0);
593 }
594 
602 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
603 void
605  const LocalMatrix& local_matrix,
606  const LocalMap& local_map,
607  const Kokkos::View<const char*, BufferDeviceType>& imports,
608  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
610  const Tpetra::CombineMode combine_mode)
611 {
612  using ST = typename LocalMatrix::value_type;
613  using LO = typename LocalMap::local_ordinal_type;
614  using DT = typename LocalMap::device_type;
615  using XS = typename DT::execution_space;
616  const char prefix[] =
617  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
618  "unpackAndCombineIntoCrsMatrix: ";
619 
620  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
621  if (num_import_lids == 0) {
622  // Nothing to unpack
623  return;
624  }
625 
626  {
627  // Check for correct input
628  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
629  std::invalid_argument,
630  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
631  "static graph (i.e., was constructed with the CrsMatrix constructor "
632  "that takes a const CrsGraph pointer).");
633 
634  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
635  std::invalid_argument,
636  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
637  "(i.e., was constructed with the CrsMatrix constructor that takes a "
638  "const CrsGraph pointer).");
639 
640  // Unknown combine mode!
641  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
642  std::invalid_argument,
643  prefix << "Invalid combine mode; should never get "
644  "here! Please report this bug to the Tpetra developers.");
645 
646  // Check that sizes of input objects are consistent.
647  bool bad_num_import_lids =
648  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
649  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
650  std::invalid_argument,
651  prefix << "importLIDs.size() (" << num_import_lids << ") != "
652  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
653  } // end QA error checking
654 
655  // Get the offsets
656  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
657  computeOffsetsFromCounts(offsets, num_packets_per_lid);
658 
659  // Determine the sizes of the unpack batches
660  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
661  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
662  const size_t batch_size = std::min(default_batch_size, max_num_ent);
663 
664  // To achieve some balance amongst threads, unpack each row in equal size batches
665  size_t num_batches = 0;
666  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
667  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
668  // Compute meta data that allows batch unpacking
669  Kokkos::parallel_reduce(
670  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
671  KOKKOS_LAMBDA(const size_t i, size_t& batches)
672  {
673  const size_t num_entries_in_row = unpackRowCount<LO>(
674  imports.data(), offsets(i), num_packets_per_lid(i)
675  );
676  batches_per_lid(i) =
677  (num_entries_in_row <= batch_size) ?
678  1 :
679  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
680  batches += batches_per_lid(i);
681  },
682  num_batches
683  );
684  Kokkos::resize(batch_info, num_batches);
685 
686  Kokkos::HostSpace host_space;
687  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
688  Kokkos::deep_copy(batches_per_lid_h, batches_per_lid);
689 
690  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
691 
692  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
693  Kokkos::deep_copy(batch_info, batch_info_h);
694 
695  // FIXME (TJF SEP 2017)
696  // The scalar type is not necessarily default constructible
697  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
698 
699  // Now do the actual unpack!
700  const bool atomic = XS::concurrency() != 1;
702  functor f(
703  local_matrix,
704  local_map,
705  imports,
706  num_packets_per_lid,
707  import_lids,
708  batch_info,
709  offsets,
710  combine_mode,
711  batch_size,
712  bytes_per_value,
713  atomic
714  );
715 
716  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
718 #if defined(KOKKOS_ENABLE_CUDA)
719  constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
720 #else
721  constexpr bool is_cuda = false;
722 #endif
723  if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
724  {
725  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
726  }
727  else
728  {
729  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
730  }
731 
732  auto error_code = f.error();
733  TEUCHOS_TEST_FOR_EXCEPTION(
734  error_code != 0,
735  std::runtime_error,
736  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
737  );
738 }
739 
740 template<class LocalMatrix, class BufferDeviceType>
741 size_t
743  const LocalMatrix& local_matrix,
745  const Kokkos::View<const char*, BufferDeviceType>& imports,
746  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
747  const size_t num_same_ids)
748 {
749  using Kokkos::parallel_reduce;
750  typedef typename LocalMatrix::ordinal_type LO;
751  typedef typename LocalMatrix::device_type device_type;
752  typedef typename device_type::execution_space XS;
753  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
754  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
755  typedef BufferDeviceType BDT;
756 
757  size_t count = 0;
758  LO num_items;
759 
760  // Number of matrix entries to unpack (returned by this function).
761  num_items = static_cast<LO>(num_same_ids);
762  if (num_items) {
763  size_t kcnt = 0;
764  parallel_reduce(range_policy(0, num_items),
765  KOKKOS_LAMBDA(const LO lid, size_t& update) {
766  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
767  -local_matrix.graph.row_map[lid]);
768  }, kcnt);
769  count += kcnt;
770  }
771 
772  // Count entries copied directly from the source matrix with permuting.
773  num_items = static_cast<LO>(permute_from_lids.extent(0));
774  if (num_items) {
775  size_t kcnt = 0;
776  parallel_reduce(range_policy(0, num_items),
777  KOKKOS_LAMBDA(const LO i, size_t& update) {
778  const LO lid = permute_from_lids(i);
779  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
780  - local_matrix.graph.row_map[lid]);
781  }, kcnt);
782  count += kcnt;
783  }
784 
785  {
786  // Count entries received from other MPI processes.
787  const size_type np = num_packets_per_lid.extent(0);
788  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
789  computeOffsetsFromCounts(offsets, num_packets_per_lid);
790  count +=
791  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
792  offsets, imports);
793  }
794 
795  return count;
796 }
797 
799 template<class LO, class DT, class BDT>
800 int
801 setupRowPointersForRemotes(
802  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
803  const typename PackTraits<LO>::input_array_type& import_lids,
804  const Kokkos::View<const char*, BDT>& imports,
805  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
806  const typename PackTraits<size_t>::input_array_type& offsets)
807 {
808  using Kokkos::parallel_reduce;
809  typedef typename DT::execution_space XS;
810  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
811  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
812 
813  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
814  const size_type N = num_packets_per_lid.extent(0);
815 
816  int errors = 0;
817  parallel_reduce ("Setup row pointers for remotes",
818  range_policy (0, N),
819  KOKKOS_LAMBDA (const size_t i, int& k_error) {
820  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
821  const size_t num_bytes = num_packets_per_lid(i);
822  const size_t offset = offsets(i);
823  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
824  if (num_ent == InvalidNum) {
825  k_error += 1;
826  }
827  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
828  }, errors);
829  return errors;
830 }
831 
832 // Convert array of row lengths to a CRS pointer array
833 template<class DT>
834 void
835 makeCrsRowPtrFromLengths(
836  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
837  const Kokkos::View<size_t*,DT>& new_start_row)
838 {
839  using Kokkos::parallel_scan;
840  typedef typename DT::execution_space XS;
841  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
842  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
843  const size_type N = new_start_row.extent(0);
844  parallel_scan(range_policy(0, N),
845  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
846  auto cur_val = tgt_rowptr(i);
847  if (final) {
848  tgt_rowptr(i) = update;
849  new_start_row(i) = tgt_rowptr(i);
850  }
851  update += cur_val;
852  }
853  );
854 }
855 
856 template<class LocalMatrix, class LocalMap>
857 void
858 copyDataFromSameIDs(
860  const typename PackTraits<int>::output_array_type& tgt_pids,
862  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
863  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
864  const typename PackTraits<int>::input_array_type& src_pids,
865  const LocalMatrix& local_matrix,
866  const LocalMap& local_col_map,
867  const size_t num_same_ids,
868  const int my_pid)
869 {
870  using Kokkos::parallel_for;
871  typedef typename LocalMap::device_type DT;
872  typedef typename LocalMap::local_ordinal_type LO;
873  typedef typename DT::execution_space XS;
874  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
875 
876  parallel_for(range_policy(0, num_same_ids),
877  KOKKOS_LAMBDA(const size_t i) {
878  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
879 
880  const LO src_lid = static_cast<LO>(i);
881  size_t src_row = local_matrix.graph.row_map(src_lid);
882 
883  const LO tgt_lid = static_cast<LO>(i);
884  const size_t tgt_row = tgt_rowptr(tgt_lid);
885 
886  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
887  - local_matrix.graph.row_map(src_lid);
888  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
889 
890  for (size_t j=local_matrix.graph.row_map(src_lid);
891  j<local_matrix.graph.row_map(src_lid+1); ++j) {
892  LO src_col = local_matrix.graph.entries(j);
893  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
894  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
895  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
896  }
897  }
898  );
899 }
900 
901 template<class LocalMatrix, class LocalMap>
902 void
903 copyDataFromPermuteIDs(
905  const typename PackTraits<int>::output_array_type& tgt_pids,
907  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
908  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
909  const typename PackTraits<int>::input_array_type& src_pids,
912  const LocalMatrix& local_matrix,
913  const LocalMap& local_col_map,
914  const int my_pid)
915 {
916  using Kokkos::parallel_for;
917  typedef typename LocalMap::device_type DT;
918  typedef typename LocalMap::local_ordinal_type LO;
919  typedef typename DT::execution_space XS;
920  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
921  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
922 
923  const size_type num_permute_to_lids = permute_to_lids.extent(0);
924 
925  parallel_for(range_policy(0, num_permute_to_lids),
926  KOKKOS_LAMBDA(const size_t i) {
927  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
928 
929  const LO src_lid = permute_from_lids(i);
930  const size_t src_row = local_matrix.graph.row_map(src_lid);
931 
932  const LO tgt_lid = permute_to_lids(i);
933  const size_t tgt_row = tgt_rowptr(tgt_lid);
934 
935  size_t nsr = local_matrix.graph.row_map(src_lid+1)
936  - local_matrix.graph.row_map(src_lid);
937  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
938 
939  for (size_t j=local_matrix.graph.row_map(src_lid);
940  j<local_matrix.graph.row_map(src_lid+1); ++j) {
941  LO src_col = local_matrix.graph.entries(j);
942  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
943  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
944  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
945  }
946  }
947  );
948 }
949 
950 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
951 int
952 unpackAndCombineIntoCrsArrays2(
954  const typename PackTraits<int>::output_array_type& tgt_pids,
956  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
957  const typename PackTraits<size_t>::input_array_type& offsets,
959  const Kokkos::View<const char*, BufferDeviceType>& imports,
960  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
961  const LocalMatrix& /* local_matrix */,
962  const LocalMap /*& local_col_map*/,
963  const int my_pid,
964  const size_t bytes_per_value)
965 {
966  using Kokkos::View;
967  using Kokkos::subview;
968  using Kokkos::MemoryUnmanaged;
969  using Kokkos::parallel_reduce;
970  using Kokkos::atomic_fetch_add;
972  typedef typename LocalMap::device_type DT;
973  typedef typename LocalMap::local_ordinal_type LO;
974  typedef typename LocalMap::global_ordinal_type GO;
975  typedef typename LocalMatrix::value_type ST;
976  typedef typename DT::execution_space XS;
977  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
978  typedef typename Kokkos::pair<size_type, size_type> slice;
979  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
980 
981  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
982  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
983  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
984 
985  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
986 
987  int errors = 0;
988  const size_type num_import_lids = import_lids.size();
989 
990  // RemoteIDs: Loop structure following UnpackAndCombine
991  parallel_reduce ("Unpack and combine into CRS",
992  range_policy (0, num_import_lids),
993  KOKKOS_LAMBDA (const size_t i, int& k_error) {
994  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
995  const size_t num_bytes = num_packets_per_lid(i);
996  const size_t offset = offsets(i);
997  if (num_bytes == 0) {
998  // Empty buffer means that the row is empty.
999  return;
1000  }
1001  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1002  if (num_ent == InvalidNum) {
1003  k_error += 1;
1004  return;
1005  }
1006  const LO lcl_row = import_lids(i);
1007  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1008  const size_t end_row = start_row + num_ent;
1009 
1010  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1011  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1012  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1013 
1014  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1015  imports.data(), offset, num_bytes,
1016  num_ent, bytes_per_value);
1017 
1018  // Correct target PIDs.
1019  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1020  const int pid = pids_out(j);
1021  pids_out(j) = (pid != my_pid) ? pid : -1;
1022  }
1023  }, errors);
1024 
1025  return errors;
1026 }
1027 
1028 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1029 void
1031  const LocalMatrix & local_matrix,
1032  const LocalMap & local_col_map,
1034  const Kokkos::View<const char*, BufferDeviceType>& imports,
1035  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1038  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1041  const typename PackTraits<int>::input_array_type& src_pids,
1042  const typename PackTraits<int>::output_array_type& tgt_pids,
1043  const size_t num_same_ids,
1044  const size_t tgt_num_rows,
1045  const size_t tgt_num_nonzeros,
1046  const int my_tgt_pid,
1047  const size_t bytes_per_value)
1048 {
1049  using Kokkos::View;
1050  using Kokkos::subview;
1051  using Kokkos::parallel_for;
1052  using Kokkos::MemoryUnmanaged;
1054  typedef typename LocalMap::device_type DT;
1055  typedef typename LocalMap::local_ordinal_type LO;
1056  typedef typename DT::execution_space XS;
1057  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1058  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1059  typedef BufferDeviceType BDT;
1060 
1061  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1062 
1063  const size_t N = tgt_num_rows;
1064 
1065  // In the case of reduced communicators, the sourceMatrix won't have
1066  // the right "my_pid", so thus we have to supply it.
1067  const int my_pid = my_tgt_pid;
1068 
1069  // Zero the rowptr
1070  parallel_for(range_policy(0, N+1),
1071  KOKKOS_LAMBDA(const size_t i) {
1072  tgt_rowptr(i) = 0;
1073  }
1074  );
1075 
1076  // same IDs: Always first, always in the same place
1077  parallel_for(range_policy(0, num_same_ids),
1078  KOKKOS_LAMBDA(const size_t i) {
1079  const LO tgt_lid = static_cast<LO>(i);
1080  const LO src_lid = static_cast<LO>(i);
1081  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1082  - local_matrix.graph.row_map(src_lid);
1083  }
1084  );
1085 
1086  // Permute IDs: Still local, but reordered
1087  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1088  parallel_for(range_policy(0, num_permute_to_lids),
1089  KOKKOS_LAMBDA(const size_t i) {
1090  const LO tgt_lid = permute_to_lids(i);
1091  const LO src_lid = permute_from_lids(i);
1092  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1093  - local_matrix.graph.row_map(src_lid);
1094  }
1095  );
1096 
1097  // Get the offsets from the number of packets per LID
1098  const size_type num_import_lids = import_lids.extent(0);
1099  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1100  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1101 
1102 #ifdef HAVE_TPETRA_DEBUG
1103  {
1104  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1105  const bool condition =
1106  nth_offset_h != static_cast<size_t>(imports.extent (0));
1107  TEUCHOS_TEST_FOR_EXCEPTION
1108  (condition, std::logic_error, prefix
1109  << "The final offset in bytes " << nth_offset_h
1110  << " != imports.size() = " << imports.extent(0)
1111  << ". Please report this bug to the Tpetra developers.");
1112  }
1113 #endif // HAVE_TPETRA_DEBUG
1114 
1115  // Setup row pointers for remotes
1116  int k_error =
1117  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1118  import_lids, imports, num_packets_per_lid, offsets);
1119  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1120  << " Error transferring data to target row pointers. "
1121  "Please report this bug to the Tpetra developers.");
1122 
1123  // If multiple processes contribute to the same row, we may need to
1124  // update row offsets. This tracks that.
1125  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1126 
1127  // Turn row length into a real CRS row pointer
1128  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1129 
1130  // SameIDs: Copy the data over
1131  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1132  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1133 
1134  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1135  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1136  local_matrix, local_col_map, my_pid);
1137 
1138  if (imports.extent(0) <= 0) {
1139  return;
1140  }
1141 
1142  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1143  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1144  local_matrix, local_col_map, my_pid, bytes_per_value);
1145  TEUCHOS_TEST_FOR_EXCEPTION(
1146  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1147  "should never happen. Please report this bug to the Tpetra developers.");
1148 
1149  return;
1150 }
1151 
1152 } // namespace UnpackAndCombineCrsMatrixImpl
1153 
1193 template<typename ST, typename LO, typename GO, typename Node>
1194 void
1196  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1197  const Teuchos::ArrayView<const char>& imports,
1198  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1199  const Teuchos::ArrayView<const LO>& importLIDs,
1200  size_t /* constantNumPackets */,
1201  CombineMode combineMode)
1202 {
1203  using Kokkos::View;
1204  typedef typename Node::device_type device_type;
1205  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1206  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1207  "Node::device_type and LocalMatrix::device_type must be the same.");
1208 
1209  // Convert all Teuchos::Array to Kokkos::View.
1210  device_type outputDevice;
1211 
1212  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1213  // them to device. Since unpacking is done directly in to the local matrix
1214  // (lclMatrix), no copying needs to be performed after unpacking.
1215  auto num_packets_per_lid_d =
1216  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1217  numPacketsPerLID.size(), true, "num_packets_per_lid");
1218 
1219  auto import_lids_d =
1220  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1221  importLIDs.size(), true, "import_lids");
1222 
1223  auto imports_d =
1224  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1225  imports.size(), true, "imports");
1226 
1227  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1228  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1229 
1230 //KDDKDD This loop doesn't appear to do anything; what is it?
1231 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1232 //KDDKDD {
1233 //KDDKDD auto lclRow = importLIDs[i];
1234 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1235 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1236 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1237 //KDDKDD }
1238  // Now do the actual unpack!
1239  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1240  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1241  import_lids_d, combineMode);
1242 
1243 }
1244 
1245 template<typename ST, typename LO, typename GO, typename NT>
1246 void
1247 unpackCrsMatrixAndCombineNew(
1248  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1249  Kokkos::DualView<char*,
1251  Kokkos::DualView<size_t*,
1252  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1253  const Kokkos::DualView<const LO*,
1255  const size_t /* constantNumPackets */,
1256  const CombineMode combineMode)
1257 {
1258  using Kokkos::View;
1259  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1260  using dist_object_type = DistObject<char, LO, GO, NT>;
1261  using device_type = typename crs_matrix_type::device_type;
1262  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1263  using buffer_device_type = typename dist_object_type::buffer_device_type;
1264 
1265  static_assert
1266  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1267  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1268  "must be the same.");
1269 
1270  if (numPacketsPerLID.need_sync_device()) {
1271  numPacketsPerLID.sync_device ();
1272  }
1273  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1274 
1275  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1276  auto import_lids_d = importLIDs.view_device ();
1277 
1278  if (imports.need_sync_device()) {
1279  imports.sync_device ();
1280  }
1281  auto imports_d = imports.view_device ();
1282 
1283  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1284  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1285  typedef decltype (local_col_map) local_map_type;
1286 
1287  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1288  local_matrix_device_type,
1289  local_map_type,
1290  buffer_device_type
1291  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1292  import_lids_d, combineMode);
1293 }
1294 
1341 //
1350 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1351 size_t
1354  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1355  const Teuchos::ArrayView<const char> &imports,
1356  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1357  size_t /* constantNumPackets */,
1358  CombineMode /* combineMode */,
1359  size_t numSameIDs,
1360  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1361  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1362 {
1363  using Kokkos::MemoryUnmanaged;
1364  using Kokkos::View;
1365  typedef typename Node::device_type DT;
1367  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1368 
1369  TEUCHOS_TEST_FOR_EXCEPTION
1370  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1371  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1372  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1373  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1374  // process, then the matrix is neither locally nor globally indexed.
1375  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1376  TEUCHOS_TEST_FOR_EXCEPTION
1377  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1378  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1379  TEUCHOS_TEST_FOR_EXCEPTION
1380  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1381  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1382  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1383 
1384  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1385  auto permute_from_lids_d =
1387  permuteFromLIDs.getRawPtr (),
1388  permuteFromLIDs.size (), true,
1389  "permute_from_lids");
1390  auto imports_d =
1392  imports.getRawPtr (),
1393  imports.size (), true,
1394  "imports");
1395  auto num_packets_per_lid_d =
1397  numPacketsPerLID.getRawPtr (),
1398  numPacketsPerLID.size (), true,
1399  "num_packets_per_lid");
1400 
1401  return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1402  local_matrix, permute_from_lids_d, imports_d,
1403  num_packets_per_lid_d, numSameIDs);
1404 }
1405 
1420 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1421 void
1424  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1425  const Teuchos::ArrayView<const char>& imports,
1426  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1427  const size_t /* constantNumPackets */,
1428  const CombineMode /* combineMode */,
1429  const size_t numSameIDs,
1430  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1431  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1432  size_t TargetNumRows,
1433  size_t TargetNumNonzeros,
1434  const int MyTargetPID,
1435  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1436  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1437  const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1438  const Teuchos::ArrayView<const int>& SourcePids,
1439  Teuchos::Array<int>& TargetPids)
1440 {
1442 
1443  using Kokkos::View;
1444  using Kokkos::deep_copy;
1445 
1446  using Teuchos::ArrayView;
1447  using Teuchos::outArg;
1448  using Teuchos::REDUCE_MAX;
1449  using Teuchos::reduceAll;
1450 
1451  typedef LocalOrdinal LO;
1452 
1453  typedef typename Node::device_type DT;
1454 
1456  typedef typename matrix_type::impl_scalar_type ST;
1457  typedef typename ArrayView<const LO>::size_type size_type;
1458 
1459  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1460 
1461  TEUCHOS_TEST_FOR_EXCEPTION(
1462  TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1463  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1464  CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1465 
1466  TEUCHOS_TEST_FOR_EXCEPTION(
1467  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1468  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1469  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1470  const size_type numImportLIDs = importLIDs.size ();
1471 
1472  TEUCHOS_TEST_FOR_EXCEPTION(
1473  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1474  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1475  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1476 
1477  // Preseed TargetPids with -1 for local
1478  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1479  TargetPids.resize (TargetNumNonzeros);
1480  }
1481  TargetPids.assign (TargetNumNonzeros, -1);
1482 
1483  // Grab pointers for sourceMatrix
1484  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1485  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1486 
1487  // Convert input arrays to Kokkos::View
1488  DT outputDevice;
1489  auto import_lids_d =
1490  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1491  importLIDs.size(), true, "import_lids");
1492 
1493  auto imports_d =
1494  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1495  imports.size(), true, "imports");
1496 
1497  auto num_packets_per_lid_d =
1498  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1499  numPacketsPerLID.size(), true, "num_packets_per_lid");
1500 
1501  auto permute_from_lids_d =
1502  create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1503  permuteFromLIDs.size(), true, "permute_from_lids");
1504 
1505  auto permute_to_lids_d =
1506  create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1507  permuteToLIDs.size(), true, "permute_to_lids");
1508 
1509  auto crs_rowptr_d =
1510  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1511  CRS_rowptr.size(), true, "crs_rowptr");
1512 
1513  auto crs_colind_d =
1514  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1515  CRS_colind.size(), true, "crs_colidx");
1516 
1517 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1518  static_assert (! std::is_same<
1519  typename std::remove_const<
1520  typename std::decay<
1521  decltype (CRS_vals)
1522  >::type::value_type
1523  >::type,
1524  std::complex<double> >::value,
1525  "CRS_vals::value_type is std::complex<double>; this should never happen"
1526  ", since std::complex does not work in Kokkos::View objects.");
1527 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1528 
1529  auto crs_vals_d =
1530  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1531  CRS_vals.size(), true, "crs_vals");
1532 
1533 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1534  static_assert (! std::is_same<
1535  typename decltype (crs_vals_d)::non_const_value_type,
1536  std::complex<double> >::value,
1537  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1538  "never happen, since std::complex does not work in Kokkos::View objects.");
1539 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1540 
1541  auto src_pids_d =
1542  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1543  SourcePids.size(), true, "src_pids");
1544 
1545  auto tgt_pids_d =
1546  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1547  TargetPids.size(), true, "tgt_pids");
1548 
1549  size_t bytes_per_value = 0;
1551  // assume that ST is default constructible
1552  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1553  }
1554  else {
1555  // Since the packed data come from the source matrix, we can use the source
1556  // matrix to get the number of bytes per Scalar value stored in the matrix.
1557  // This assumes that all Scalar values in the source matrix require the same
1558  // number of bytes. If the source matrix has no entries on the calling
1559  // process, then we hope that some process does have some idea how big
1560  // a Scalar value is. Of course, if no processes have any entries, then no
1561  // values should be packed (though this does assume that in our packing
1562  // scheme, rows with zero entries take zero bytes).
1563  size_t bytes_per_value_l = 0;
1564  if (local_matrix.values.extent(0) > 0) {
1565  const ST& val = local_matrix.values(0);
1566  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1567  } else {
1568  const ST& val = crs_vals_d(0);
1569  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1570  }
1571  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1572  Teuchos::REDUCE_MAX,
1573  bytes_per_value_l,
1574  outArg(bytes_per_value));
1575  }
1576 
1577 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1578  static_assert (! std::is_same<
1579  typename decltype (crs_vals_d)::non_const_value_type,
1580  std::complex<double> >::value,
1581  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1582  "never happen, since std::complex does not work in Kokkos::View objects.");
1583 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1584 
1585  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1586  local_matrix, local_col_map, import_lids_d, imports_d,
1587  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1589  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1590  bytes_per_value);
1591 
1592  // Copy outputs back to host
1593  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1594  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1595  deep_copy(crs_rowptr_h, crs_rowptr_d);
1596 
1597  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1598  CRS_colind.getRawPtr(), CRS_colind.size());
1599  deep_copy(crs_colind_h, crs_colind_d);
1600 
1601  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1602  CRS_vals.getRawPtr(), CRS_vals.size());
1603  deep_copy(crs_vals_h, crs_vals_d);
1604 
1605  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1606  TargetPids.getRawPtr(), TargetPids.size());
1607  deep_copy(tgt_pids_h, tgt_pids_d);
1608 
1609 }
1610 
1611 } // namespace Details
1612 } // namespace Tpetra
1613 
1614 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1615  template void \
1616  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1617  const CrsMatrix<ST, LO, GO, NT>&, \
1618  const Teuchos::ArrayView<const char>&, \
1619  const Teuchos::ArrayView<const size_t>&, \
1620  const Teuchos::ArrayView<const LO>&, \
1621  size_t, \
1622  CombineMode); \
1623  template void \
1624  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1625  const CrsMatrix<ST, LO, GO, NT>&, \
1626  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1627  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1628  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1629  const size_t, \
1630  const CombineMode); \
1631  template void \
1632  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1633  const CrsMatrix<ST, LO, GO, NT> &, \
1634  const Teuchos::ArrayView<const LO>&, \
1635  const Teuchos::ArrayView<const char>&, \
1636  const Teuchos::ArrayView<const size_t>&, \
1637  const size_t, \
1638  const CombineMode, \
1639  const size_t, \
1640  const Teuchos::ArrayView<const LO>&, \
1641  const Teuchos::ArrayView<const LO>&, \
1642  size_t, \
1643  size_t, \
1644  const int, \
1645  const Teuchos::ArrayView<size_t>&, \
1646  const Teuchos::ArrayView<GO>&, \
1647  const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1648  const Teuchos::ArrayView<const int>&, \
1649  Teuchos::Array<int>&); \
1650  template size_t \
1651  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1652  const CrsMatrix<ST, LO, GO, NT> &, \
1653  const Teuchos::ArrayView<const LO> &, \
1654  const Teuchos::ArrayView<const char> &, \
1655  const Teuchos::ArrayView<const size_t>&, \
1656  size_t, \
1657  CombineMode, \
1658  size_t, \
1659  const Teuchos::ArrayView<const LO>&, \
1660  const Teuchos::ArrayView<const LO>&);
1661 
1662 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Base class for distributed Tpetra objects that support data redistribution.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.