Tpetra parallel linear algebra  Version of the Day
Tpetra_Details_packCrsGraph_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
41 #define TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
52 #include "Tpetra_CrsGraph_decl.hpp"
53 #include <memory>
54 #include <string>
55 
77 
78 namespace Tpetra {
79 
80 //
81 // Users must never rely on anything in the Details namespace.
82 //
83 namespace Details {
84 
85 namespace PackCrsGraphImpl {
93 template<class OutputOffsetsViewType,
94  class CountsViewType,
95  class InputOffsetsViewType,
96  class InputLocalRowIndicesViewType,
97  class InputLocalRowPidsViewType,
98  const bool debug =
99 #ifdef HAVE_TPETRA_DEBUG
100  true
101 #else
102  false
103 #endif // HAVE_TPETRA_DEBUG
104  >
106 public:
107  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
108  typedef typename CountsViewType::non_const_value_type count_type;
109  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
110  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
111  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
112  // output Views drive where execution happens.
113  typedef typename OutputOffsetsViewType::device_type device_type;
114  static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
115  typename device_type::execution_space>::value,
116  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
117  static_assert (Kokkos::Impl::is_view<OutputOffsetsViewType>::value,
118  "OutputOffsetsViewType must be a Kokkos::View.");
119  static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
120  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
121  static_assert (std::is_integral<output_offset_type>::value,
122  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
123  static_assert (Kokkos::Impl::is_view<CountsViewType>::value,
124  "CountsViewType must be a Kokkos::View.");
125  static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
126  "CountsViewType must be a nonconst Kokkos::View.");
127  static_assert (std::is_integral<count_type>::value,
128  "The type of each entry of CountsViewType must be a built-in integer type.");
129  static_assert (Kokkos::Impl::is_view<InputOffsetsViewType>::value,
130  "InputOffsetsViewType must be a Kokkos::View.");
131  static_assert (std::is_integral<input_offset_type>::value,
132  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
133  static_assert (Kokkos::Impl::is_view<InputLocalRowIndicesViewType>::value,
134  "InputLocalRowIndicesViewType must be a Kokkos::View.");
135  static_assert (std::is_integral<local_row_index_type>::value,
136  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
137 
138  NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets,
139  const CountsViewType& counts,
140  const InputOffsetsViewType& rowOffsets,
141  const InputLocalRowIndicesViewType& lclRowInds,
142  const InputLocalRowPidsViewType& lclRowPids) :
143  outputOffsets_ (outputOffsets),
144  counts_ (counts),
145  rowOffsets_ (rowOffsets),
146  lclRowInds_ (lclRowInds),
147  lclRowPids_ (lclRowPids),
148  error_ ("error") // don't forget this, or you'll get segfaults!
149  {
150  if (debug) {
151  const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.extent (0));
152 
153  if (numRowsToPack != static_cast<size_t> (counts_.extent (0))) {
154  std::ostringstream os;
155  os << "lclRowInds.extent(0) = " << numRowsToPack
156  << " != counts.extent(0) = " << counts_.extent (0)
157  << ".";
158  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
159  }
160  if (static_cast<size_t> (numRowsToPack + 1) !=
161  static_cast<size_t> (outputOffsets_.extent (0))) {
162  std::ostringstream os;
163  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
164  << " != outputOffsets.extent(0) = " << outputOffsets_.extent (0)
165  << ".";
166  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
167  }
168  }
169  }
170 
171  KOKKOS_INLINE_FUNCTION void
172  operator() (const local_row_index_type& curInd,
173  output_offset_type& update,
174  const bool final) const
175  {
176  if (debug) {
177  if (curInd < static_cast<local_row_index_type> (0)) {
178  error_ () = 1;
179  return;
180  }
181  }
182 
183  if (final) {
184  if (debug) {
185  if (curInd >= static_cast<local_row_index_type> (outputOffsets_.extent (0))) {
186  error_ () = 2;
187  return;
188  }
189  }
190  outputOffsets_(curInd) = update;
191  }
192 
193  if (curInd < static_cast<local_row_index_type> (counts_.extent (0))) {
194  const auto lclRow = lclRowInds_(curInd);
195  if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.extent (0)) ||
196  static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
197  error_ () = 3;
198  return;
199  }
200  // count_type could differ from the type of each row offset.
201  // For example, row offsets might each be 64 bits, but if their
202  // difference always fits in 32 bits, we may then safely use a
203  // 32-bit count_type.
204  const count_type count =
205  static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
206 
207  // We pack first the global column indices and then pids (if any),
208  // However, if the number of entries in the row is zero, we pack nothing.
209  const count_type numEntToPack = (count == 0)
210  ? static_cast<count_type>(0)
211  : count * (1 + (lclRowPids_.size() > 0 ? 1 : 0));
212 
213  if (final) {
214  counts_(curInd) = numEntToPack;
215  }
216  update += numEntToPack;
217  }
218  }
219 
220  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
221  // have join both with and without volatile! Otherwise intrawarp
222  // joins are really slow on GPUs.
223 
225  int getError () const {
226  auto error_h = Kokkos::create_mirror_view (error_);
227  Kokkos::deep_copy (error_h, error_);
228  return error_h ();
229  }
230 
231 private:
232  OutputOffsetsViewType outputOffsets_;
233  CountsViewType counts_;
234  typename InputOffsetsViewType::const_type rowOffsets_;
235  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
236  typename InputLocalRowPidsViewType::const_type lclRowPids_;
237  Kokkos::View<int, device_type> error_;
238 };
239 
249 template<class OutputOffsetsViewType,
250  class CountsViewType,
251  class InputOffsetsViewType,
252  class InputLocalRowIndicesViewType,
253  class InputLocalRowPidsViewType>
254 typename CountsViewType::non_const_value_type
255 computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
256  const CountsViewType& counts,
257  const InputOffsetsViewType& rowOffsets,
258  const InputLocalRowIndicesViewType& lclRowInds,
259  const InputLocalRowPidsViewType& lclRowPids)
260 {
261  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
262  CountsViewType, typename InputOffsetsViewType::const_type,
263  typename InputLocalRowIndicesViewType::const_type,
264  typename InputLocalRowPidsViewType::const_type> functor_type;
265  typedef typename CountsViewType::non_const_value_type count_type;
266  typedef typename OutputOffsetsViewType::size_type size_type;
267  typedef typename OutputOffsetsViewType::execution_space execution_space;
268  typedef typename functor_type::local_row_index_type LO;
269  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
270  const char prefix[] = "computeNumPacketsAndOffsets: ";
271 
272  count_type count = 0;
273  const count_type numRowsToPack = lclRowInds.extent (0);
274 
275  if (numRowsToPack == 0) {
276  return count;
277  }
278  else {
279  TEUCHOS_TEST_FOR_EXCEPTION
280  (rowOffsets.extent (0) <= static_cast<size_type> (1),
281  std::invalid_argument, prefix << "There is at least one row to pack, "
282  "but the graph has no rows. lclRowInds.extent(0) = " <<
283  numRowsToPack << ", but rowOffsets.extent(0) = " <<
284  rowOffsets.extent (0) << " <= 1.");
285  TEUCHOS_TEST_FOR_EXCEPTION
286  (outputOffsets.extent (0) !=
287  static_cast<size_type> (numRowsToPack + 1), std::invalid_argument,
288  prefix << "Output dimension does not match number of rows to pack. "
289  << "outputOffsets.extent(0) = " << outputOffsets.extent (0)
290  << " != lclRowInds.extent(0) + 1 = "
291  << static_cast<size_type> (numRowsToPack + 1) << ".");
292  TEUCHOS_TEST_FOR_EXCEPTION
293  (counts.extent (0) != numRowsToPack, std::invalid_argument,
294  prefix << "counts.extent(0) = " << counts.extent (0)
295  << " != numRowsToPack = " << numRowsToPack << ".");
296 
297  functor_type f (outputOffsets, counts, rowOffsets, lclRowInds, lclRowPids);
298  Kokkos::parallel_scan (range_type (0, numRowsToPack + 1), f);
299 
300  // At least in debug mode, this functor checks for errors.
301  const int errCode = f.getError ();
302  TEUCHOS_TEST_FOR_EXCEPTION
303  (errCode != 0, std::runtime_error, prefix << "parallel_scan error code "
304  << errCode << " != 0.");
305 
306 #if 0
307  size_t total = 0;
308  for (LO k = 0; k < numRowsToPack; ++k) {
309  total += counts[k];
310  }
311  if (outputOffsets(numRowsToPack) != total) {
312  if (errStr.get () == NULL) {
313  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
314  }
315  std::ostringstream& os = *errStr;
316  os << prefix
317  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
318  << outputOffsets(numRowsToPack) << " != sum of counts = "
319  << total << "." << std::endl;
320  if (numRowsToPack != 0) {
321  // Only print the array if it's not too long.
322  if (numRowsToPack < static_cast<LO> (10)) {
323  os << "outputOffsets: [";
324  for (LO i = 0; i <= numRowsToPack; ++i) {
325  os << outputOffsets(i);
326  if (static_cast<LO> (i + 1) <= numRowsToPack) {
327  os << ",";
328  }
329  }
330  os << "]" << std::endl;
331  os << "counts: [";
332  for (LO i = 0; i < numRowsToPack; ++i) {
333  os << counts(i);
334  if (static_cast<LO> (i + 1) < numRowsToPack) {
335  os << ",";
336  }
337  }
338  os << "]" << std::endl;
339  }
340  else {
341  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
342  << outputOffsets(numRowsToPack-1) << "." << std::endl;
343  }
344  }
345  count = outputOffsets(numRowsToPack);
346  return {false, errStr};
347  }
348 #endif // HAVE_TPETRA_DEBUG
349 
350  // Get last entry of outputOffsets, which is the sum of the entries
351  // of counts. Don't assume UVM.
352  using Tpetra::Details::getEntryOnHost;
353  return static_cast<count_type> (getEntryOnHost (outputOffsets,
354  numRowsToPack));
355  }
356 }
357 
368 template<class Packet,
369  class LocalMapType,
370  class BufferDeviceType,
371  class InputLidsType,
372  class InputPidsType>
373 KOKKOS_FUNCTION
374 size_t
375 packRow(const LocalMapType& col_map,
376  const Kokkos::View<Packet*, BufferDeviceType>& exports,
377  const InputLidsType& lids_in,
378  const InputPidsType& pids_in,
379  const size_t offset,
380  const size_t num_ent,
381  const bool pack_pids)
382 {
383  using LO = typename LocalMapType::local_ordinal_type;
384  using GO = typename LocalMapType::global_ordinal_type;
385 
386  if (num_ent == 0) {
387  // Empty rows always take zero bytes, to ensure sparsity.
388  return static_cast<size_t>(0);
389  }
390 
391  size_t num_ent_packed = num_ent;
392  if (pack_pids) {
393  num_ent_packed += num_ent;
394  }
395 
396  // Copy column indices one at a time, so that we don't need
397  // temporary storage.
398  for (size_t k = 0; k < num_ent; ++k) {
399  const LO lid = lids_in[k];
400  const GO gid = col_map.getGlobalElement (lid);
401  exports(offset+k) = gid;
402  }
403  // Copy PIDs one at a time, so that we don't need temporary storage.
404  if (pack_pids) {
405  for (size_t k = 0; k < num_ent; ++k) {
406  const LO lid = lids_in[k];
407  const int pid = pids_in[lid];
408  exports(offset+num_ent+k) = static_cast<GO>(pid);
409  }
410  }
411 
412  return num_ent_packed;
413 }
414 
415 template<class Packet,
416  class LocalGraph,
417  class LocalMap,
418  class BufferDeviceType>
419 struct PackCrsGraphFunctor {
420  using local_graph_type = LocalGraph;
421  using local_map_type = LocalMap;
422  using LO = typename local_map_type::local_ordinal_type;
423  using GO = typename local_map_type::global_ordinal_type;
424 
425  using num_packets_per_lid_view_type =
426  Kokkos::View<const size_t*, BufferDeviceType>;
427  using offsets_view_type = Kokkos::View<const size_t*, BufferDeviceType>;
428  using exports_view_type = Kokkos::View<Packet*, BufferDeviceType>;
429  using export_lids_view_type =
431  using source_pids_view_type =
433 
434  using count_type =
435  typename num_packets_per_lid_view_type::non_const_value_type;
436  using offset_type = typename offsets_view_type::non_const_value_type;
437  using value_type = Kokkos::pair<int, LO>;
438 
439  static_assert (std::is_same<LO, typename local_graph_type::data_type>::value,
440  "local_map_type::local_ordinal_type and "
441  "local_graph_type::data_type must be the same.");
442 
443  local_graph_type local_graph;
444  local_map_type local_col_map;
445  exports_view_type exports;
446  num_packets_per_lid_view_type num_packets_per_lid;
447  export_lids_view_type export_lids;
448  source_pids_view_type source_pids;
449  offsets_view_type offsets;
450  bool pack_pids;
451 
452  PackCrsGraphFunctor(const local_graph_type& local_graph_in,
453  const local_map_type& local_col_map_in,
454  const exports_view_type& exports_in,
455  const num_packets_per_lid_view_type& num_packets_per_lid_in,
456  const export_lids_view_type& export_lids_in,
457  const source_pids_view_type& source_pids_in,
458  const offsets_view_type& offsets_in,
459  const bool pack_pids_in) :
460  local_graph (local_graph_in),
461  local_col_map (local_col_map_in),
462  exports (exports_in),
463  num_packets_per_lid (num_packets_per_lid_in),
464  export_lids (export_lids_in),
465  source_pids (source_pids_in),
466  offsets (offsets_in),
467  pack_pids (pack_pids_in)
468  {
469  const LO numRows = local_graph_in.numRows ();
470  const LO rowMapDim =
471  static_cast<LO> (local_graph.row_map.extent (0));
472  TEUCHOS_TEST_FOR_EXCEPTION
473  (numRows != 0 && rowMapDim != numRows + static_cast<LO> (1),
474  std::logic_error, "local_graph.row_map.extent(0) = "
475  << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
476  }
477 
478  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const
479  {
480  using ::Tpetra::Details::OrdinalTraits;
481  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
482  }
483 
484  KOKKOS_INLINE_FUNCTION void
485  join (volatile value_type& dst, const volatile value_type& src) const
486  {
487  // `dst` should reflect the first (least) bad index and all other
488  // associated error codes and data, so prefer keeping it.
489  if (src.first != 0 && dst.first == 0) {
490  dst = src;
491  }
492  }
493 
494  KOKKOS_INLINE_FUNCTION
495  void operator() (const LO i, value_type& dst) const
496  {
497  const size_t offset = offsets[i];
498  const LO export_lid = export_lids[i];
499  const size_t buf_size = exports.size();
500  const size_t num_packets_this_lid = num_packets_per_lid(i);
501  const size_t num_ent =
502  static_cast<size_t> (local_graph.row_map[export_lid+1]
503  - local_graph.row_map[export_lid]);
504 
505  // Only pack this row's data if it has a nonzero number of
506  // entries. We can do this because receiving processes get the
507  // number of packets, and will know that zero packets means zero
508  // entries.
509  if (num_ent == 0) {
510  return;
511  }
512 
513  if (export_lid >= static_cast<LO>(local_graph.numRows())) {
514  if (dst.first != 0) { // keep only the first error
515  dst = Kokkos::make_pair (1, i); // invalid row
516  }
517  return;
518  }
519  else if ((offset > buf_size || offset + num_packets_this_lid > buf_size)) {
520  if (dst.first != 0) { // keep only the first error
521  dst = Kokkos::make_pair (2, i); // out of bounds
522  }
523  return;
524  }
525 
526  // We can now pack this row
527 
528  // Since the graph is locally indexed on the calling process, we
529  // have to use its column Map (which it _must_ have in this case)
530  // to convert to global indices.
531  const auto row_beg = local_graph.row_map[export_lid];
532  const auto row_end = local_graph.row_map[export_lid + 1];
533  auto lids_in = Kokkos::subview (local_graph.entries,
534  Kokkos::make_pair (row_beg, row_end));
535  size_t num_ent_packed_this_row =
536  packRow (local_col_map, exports, lids_in,
537  source_pids, offset, num_ent, pack_pids);
538  if (num_ent_packed_this_row != num_packets_this_lid) {
539  if (dst.first != 0) { // keep only the first error
540  dst = Kokkos::make_pair (3, i);
541  }
542  }
543  }
544 };
545 
553 template<class Packet,
554  class LocalGraph,
555  class LocalMap,
556  class BufferDeviceType>
557 void
558 do_pack(const LocalGraph& local_graph,
559  const LocalMap& local_map,
560  const Kokkos::View<Packet*, BufferDeviceType>& exports,
561  const typename PackTraits<
562  size_t
563  >::input_array_type& num_packets_per_lid,
564  const typename PackTraits<
566  >::input_array_type& export_lids,
567  const typename PackTraits<
568  int
569  >::input_array_type& source_pids,
570  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
571  const bool pack_pids)
572 {
573  using LO = typename LocalMap::local_ordinal_type;
574  using execution_space = typename LocalGraph::device_type::execution_space;
575  using range_type = Kokkos::RangePolicy<execution_space, LO>;
576  const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::do_pack: ";
577 
578  if (export_lids.extent (0) != 0) {
579  TEUCHOS_TEST_FOR_EXCEPTION
580  (static_cast<size_t> (offsets.extent (0)) !=
581  static_cast<size_t> (export_lids.extent (0) + 1),
582  std::invalid_argument, prefix << "offsets.extent(0) = "
583  << offsets.extent (0) << " != export_lids.extent(0) (= "
584  << export_lids.extent (0) << ") + 1.");
585  TEUCHOS_TEST_FOR_EXCEPTION
586  (export_lids.extent (0) != num_packets_per_lid.extent (0),
587  std::invalid_argument, prefix << "export_lids.extent(0) = " <<
588  export_lids.extent (0) << " != num_packets_per_lid.extent(0) = "
589  << num_packets_per_lid.extent (0) << ".");
590  // If exports has nonzero length at this point, then the graph
591  // has at least one entry to pack. Thus, if packing process
592  // ranks, we had better have at least one process rank to pack.
593  TEUCHOS_TEST_FOR_EXCEPTION
594  (pack_pids && exports.extent (0) != 0 &&
595  source_pids.extent (0) == 0, std::invalid_argument, prefix <<
596  "pack_pids is true, and exports.extent(0) = " <<
597  exports.extent (0) << " != 0, meaning that we need to pack at "
598  "least one graph entry, but source_pids.extent(0) = 0.");
599  }
600 
601  using pack_functor_type =
602  PackCrsGraphFunctor<Packet, LocalGraph, LocalMap,
603  BufferDeviceType>;
604  pack_functor_type f (local_graph, local_map, exports,
605  num_packets_per_lid, export_lids,
606  source_pids, offsets, pack_pids);
607 
608  typename pack_functor_type::value_type result;
609  range_type range (0, num_packets_per_lid.extent (0));
610  Kokkos::parallel_reduce (range, f, result);
611 
612  if (result.first != 0) {
613  // We can't deep_copy from AnonymousSpace Views, so we can't
614  // print out any information from them in case of error.
615  std::ostringstream os;
616  if (result.first == 1) { // invalid local row index
617  os << "invalid local row index";
618  }
619  else if (result.first == 2) { // invalid offset
620  os << "invalid offset";
621  }
622  TEUCHOS_TEST_FOR_EXCEPTION
623  (true, std::runtime_error, prefix << "PackCrsGraphFunctor "
624  "reported error code " << result.first << " (" << os.str ()
625  << ") for the first bad row " << result.second << ".");
626  }
627 }
628 
655 template<typename LO, typename GO, typename NT>
656 void
658 (const CrsGraph<LO,GO,NT>& sourceGraph,
659  Kokkos::DualView<
662  >& exports,
663  const Kokkos::View<
664  size_t*,
666  >& num_packets_per_lid,
667  const Kokkos::View<
668  const LO*,
670  >& export_lids,
671  const Kokkos::View<
672  const int*,
674  >& export_pids,
675  size_t& constant_num_packets,
676  const bool pack_pids)
677 {
678  using Kokkos::View;
679  using crs_graph_type = CrsGraph<LO, GO, NT>;
680  using packet_type = typename crs_graph_type::packet_type;
681  using buffer_device_type = typename crs_graph_type::buffer_device_type;
682  using exports_view_type = Kokkos::DualView<packet_type*, buffer_device_type>;
683  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
684  using local_map_type = typename Tpetra::Map<LO, GO, NT>::local_map_type;
685  const char prefix[] = "Tpetra::Details::packCrsGraph: ";
686  constexpr bool debug = false;
687 
688  local_graph_device_type local_graph = sourceGraph.getLocalGraphDevice ();
689  local_map_type local_col_map = sourceGraph.getColMap ()->getLocalMap ();
690 
691  // Setting this to zero tells the caller to expect a possibly
692  // different ("nonconstant") number of packets per local index
693  // (i.e., a possibly different number of entries per row).
694  constant_num_packets = 0;
695 
696  const size_t num_export_lids (export_lids.extent (0));
697  TEUCHOS_TEST_FOR_EXCEPTION
698  (num_export_lids != size_t (num_packets_per_lid.extent (0)),
699  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
700  << num_export_lids << " != num_packets_per_lid.extent(0) = "
701  << num_packets_per_lid.extent (0) << ".");
702  if (num_export_lids != 0) {
703  TEUCHOS_TEST_FOR_EXCEPTION
704  (num_packets_per_lid.data () == nullptr, std::invalid_argument,
705  prefix << "num_export_lids = "<< num_export_lids << " != 0, but "
706  "num_packets_per_lid.data() = "
707  << num_packets_per_lid.data () << " == NULL.");
708  }
709 
710  if (num_export_lids == 0) {
711  exports = exports_view_type ("exports", 0);
712  return;
713  }
714 
715  // Array of offsets into the pack buffer.
716  View<size_t*, buffer_device_type> offsets ("offsets", num_export_lids + 1);
717 
718  // Compute number of packets per LID (row to send), as well as
719  // corresponding offsets (the prefix sum of the packet counts).
720  const size_t count =
721  computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
722  local_graph.row_map, export_lids, export_pids);
723 
724  // Resize the output pack buffer if needed.
725  if (count > size_t (exports.extent (0))) {
726  exports = exports_view_type ("exports", count);
727  if (debug) {
728  std::ostringstream os;
729  os << "*** exports resized to " << count << std::endl;
730  std::cerr << os.str ();
731  }
732  }
733  if (debug) {
734  std::ostringstream os;
735  os << "*** count: " << count << ", exports.extent(0): "
736  << exports.extent (0) << std::endl;
737  std::cerr << os.str ();
738  }
739 
740  // If exports has nonzero length at this point, then the graph has
741  // at least one entry to pack. Thus, if packing process ranks, we
742  // had better have at least one process rank to pack.
743  TEUCHOS_TEST_FOR_EXCEPTION
744  (pack_pids && exports.extent (0) != 0 &&
745  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
746  "pack_pids is true, and exports.extent(0) = " <<
747  exports.extent (0) << " != 0, meaning that we need to pack at least "
748  "one graph entry, but export_pids.extent(0) = 0.");
749 
750  exports.modify_device ();
751  auto exports_d = exports.view_device ();
752  do_pack<packet_type, local_graph_device_type, local_map_type, buffer_device_type>
753  (local_graph, local_col_map, exports_d, num_packets_per_lid,
754  export_lids, export_pids, offsets, pack_pids);
755  // If we got this far, we succeeded.
756 }
757 
758 } // namespace PackCrsGraphImpl
759 
760 template<typename LO, typename GO, typename NT>
761 void
762 packCrsGraph (const CrsGraph<LO, GO, NT>& sourceGraph,
763  Teuchos::Array<typename CrsGraph<LO,GO,NT>::packet_type>& exports,
764  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
765  const Teuchos::ArrayView<const LO>& exportLIDs,
766  size_t& constantNumPackets)
767 {
768  using Kokkos::HostSpace;
769  using Kokkos::MemoryUnmanaged;
770  using Kokkos::View;
771  using crs_graph_type = CrsGraph<LO, GO, NT>;
772  using packet_type = typename crs_graph_type::packet_type;
773  using BDT = typename crs_graph_type::buffer_device_type;
774 
775  // Convert all Teuchos::Array to Kokkos::View
776 
777  // This is an output array, so we don't have to copy to device here.
778  // However, we'll have to remember to copy back to host when done.
779  BDT outputDevice;
780  View<size_t*, BDT> num_packets_per_lid_d =
782  numPacketsPerLID.getRawPtr (),
783  numPacketsPerLID.size (), false,
784  "num_packets_per_lid");
785  // This is an input array, so we have to copy to device here.
786  // However, we never need to copy it back to host.
787  View<const LO*, BDT> export_lids_d =
789  exportLIDs.getRawPtr (),
790  exportLIDs.size (), true,
791  "export_lids");
792  View<const int*, BDT> export_pids_d;
793  Kokkos::DualView<packet_type*, BDT> exports_dv;
794  constexpr bool pack_pids = false;
795 
796  static_assert
797  (std::is_same<
798  typename decltype (num_packets_per_lid_d)::non_const_value_type,
799  size_t>::value,
800  "num_packets_per_lid_d's non_const_value_type should be size_t.");
801  static_assert
802  (std::is_same<
803  typename decltype (num_packets_per_lid_d)::device_type,
804  BDT>::value,
805  "num_packets_per_lid_d's BDT should be size_t.");
806  static_assert
807  (std::is_same<
808  typename decltype (export_lids_d)::device_type,
809  BDT>::value,
810  "export_lids_d's device_type should be BDT.");
811  static_assert
812  (std::is_same<
813  typename decltype (export_pids_d)::non_const_value_type,
814  int>::value,
815  "export_pids_d's non_const_value_type should be int.");
816  static_assert
817  (std::is_same<
818  typename decltype (export_pids_d)::device_type,
819  BDT>::value,
820  "export_pids_d's device_type should be BDT.");
821 
822  PackCrsGraphImpl::packCrsGraph
823  (sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
824  export_pids_d, constantNumPackets, pack_pids);
825 
826  // The counts are an output of packCrsGraph, so we have to copy
827  // them back to host.
828  View<size_t*, HostSpace, MemoryUnmanaged>
829  num_packets_per_lid_h (numPacketsPerLID.getRawPtr (),
830  numPacketsPerLID.size ());
831  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d);
832 
833  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
834  // exports_dv above, then we have two host copies for exports_h.
835 
836  // The exports are an output of packCrsGraph, so we have to
837  // copy them back to host.
838  if (static_cast<size_t> (exports.size ()) !=
839  static_cast<size_t> (exports_dv.extent (0))) {
840  exports.resize (exports_dv.extent (0));
841  }
842  View<packet_type*, HostSpace, MemoryUnmanaged>
843  exports_h (exports.getRawPtr (), exports.size ());
844  Kokkos::deep_copy (exports_h, exports_dv.d_view);
845 }
846 
849 template<typename LO, typename GO, typename NT>
850 void
852  const Kokkos::DualView<
853  const LO*,
855  >& export_lids,
856  const Kokkos::DualView<
857  const int*,
859  >& export_pids,
860  Kokkos::DualView<
862  typename CrsGraph<LO,GO,NT>::buffer_device_type>& exports,
863  Kokkos::DualView<
864  size_t*,
866  > num_packets_per_lid,
867  size_t& constant_num_packets,
868  const bool pack_pids)
869 {
870  using Kokkos::View;
871  using crs_graph_type = CrsGraph<LO,GO,NT>;
872  using BDT = typename crs_graph_type::buffer_device_type;
873  using PT = typename crs_graph_type::packet_type;
874  using exports_dual_view_type = Kokkos::DualView<PT*, BDT>;
875  using LGT = typename crs_graph_type::local_graph_device_type;
876  using LMT = typename crs_graph_type::map_type::local_map_type;
877  const char prefix[] = "Tpetra::Details::packCrsGraphNew: ";
878 
879  const LGT local_graph = sourceGraph.getLocalGraphDevice ();
880  const LMT local_col_map = sourceGraph.getColMap ()->getLocalMap ();
881 
882  // Setting this to zero tells the caller to expect a possibly
883  // different ("nonconstant") number of packets per local index
884  // (i.e., a possibly different number of entries per row).
885  constant_num_packets = 0;
886 
887  const size_t num_export_lids =
888  static_cast<size_t> (export_lids.extent (0));
889  TEUCHOS_TEST_FOR_EXCEPTION
890  (num_export_lids !=
891  static_cast<size_t> (num_packets_per_lid.extent (0)),
892  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
893  << num_export_lids << " != num_packets_per_lid.extent(0) = "
894  << num_packets_per_lid.extent (0) << ".");
895  TEUCHOS_TEST_FOR_EXCEPTION
896  (num_export_lids != 0 &&
897  num_packets_per_lid.view_device ().data () == nullptr,
898  std::invalid_argument, prefix << "num_export_lids = "<< num_export_lids
899  << " != 0, but num_packets_per_lid.view_device().data() = nullptr.");
900 
901  if (num_export_lids == 0) {
902  exports = exports_dual_view_type ();
903  return;
904  }
905 
906  // Array of offsets into the pack buffer.
907  using offsets_type = Kokkos::View<size_t*, BDT>;
908  offsets_type offsets ("offsets", num_export_lids + 1);
909 
910  // Compute number of packets per LID (row to send), as well as
911  // corresponding offsets (the prefix sum of the packet counts).
912  num_packets_per_lid.clear_sync_state ();
913  num_packets_per_lid.modify_device ();
914  using PackCrsGraphImpl::computeNumPacketsAndOffsets;
915  const size_t count =
916  computeNumPacketsAndOffsets (offsets, num_packets_per_lid.view_device (),
917  local_graph.row_map,
918  export_lids.view_device (),
919  export_pids.view_device ());
920 
921  // Resize the output pack buffer if needed.
922  if (count > static_cast<size_t> (exports.extent (0))) {
923  exports = exports_dual_view_type ("exports", count);
924  }
925 
926  // If exports has nonzero length at this point, then the graph has
927  // at least one entry to pack. Thus, if packing process ranks, we
928  // had better have at least one process rank to pack.
929  TEUCHOS_TEST_FOR_EXCEPTION
930  (pack_pids && exports.extent (0) != 0 &&
931  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
932  "pack_pids is true, and exports.extent(0) = " <<
933  exports.extent (0) << " != 0, meaning that we need to pack at least "
934  "one graph entry, but export_pids.extent(0) = 0.");
935 
936  exports.modify_device ();
937  using PackCrsGraphImpl::do_pack;
938  do_pack<PT, LGT, LMT, BDT> (local_graph, local_col_map,
939  exports.view_device (),
940  num_packets_per_lid.view_device (),
941  export_lids.view_device (),
942  export_pids.view_device (),
943  offsets, pack_pids);
944 }
945 
/// \brief Pack the requested rows of \c sourceGraph into \c exports_dv,
///   forwarding to PackCrsGraphImpl::packCrsGraph with \c pack_pids = true.
///
/// NOTE(review): this listing skips source lines 948 and 951-952, so the
/// function name and the template arguments of the Kokkos::DualView
/// parameter are not visible here.  The explicit instantiation in
/// TPETRA_DETAILS_PACKCRSGRAPH_INSTANT with the same parameter list names
/// this function Details::packCrsGraphWithOwningPIDs -- confirm against
/// the original header.
///
/// \param sourceGraph [in] The graph handed to the pack routine.
/// \param exports_dv [in/out] Pack buffer, passed through unchanged to
///   PackCrsGraphImpl::packCrsGraph.
/// \param numPacketsPerLID [out] Host array of per-row packet counts.
///   Its incoming contents are not copied to device; it is overwritten
///   by the deep_copy at the end of this function.
/// \param exportLIDs [in] Host array of local row indices; copied to
///   device and never copied back.
/// \param sourcePIDs [in] Host array of process ranks; copied to device
///   and never copied back.
/// \param constantNumPackets [out] Passed through by reference to
///   PackCrsGraphImpl::packCrsGraph.
946 template<typename LO, typename GO, typename NT>
947 void
949 (const CrsGraph<LO, GO, NT>& sourceGraph,
950  Kokkos::DualView<
953  >& exports_dv,
954  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
955  const Teuchos::ArrayView<const LO>& exportLIDs,
956  const Teuchos::ArrayView<const int>& sourcePIDs,
957  size_t& constantNumPackets)
958 {
959  using Kokkos::HostSpace;
960  using Kokkos::MemoryUnmanaged;
961  using Kokkos::View;
962  using crs_graph_type = CrsGraph<LO, GO, NT>;
963  using buffer_device_type = typename crs_graph_type::buffer_device_type;
964 
965  // Convert all Teuchos::Array to Kokkos::View
966 
967  // This is an output array, so we don't have to copy to device here.
968  // However, we'll have to remember to copy back to host when done.
     // (The fourth argument, 'false', is the helper's 'copy' flag.)
969  View<size_t*, buffer_device_type> num_packets_per_lid_d =
970  create_mirror_view_from_raw_host_array (buffer_device_type (),
971  numPacketsPerLID.getRawPtr (),
972  numPacketsPerLID.size (), false,
973  "num_packets_per_lid");
974 
975  // This is an input array, so we have to copy to device here.
976  // However, we never need to copy it back to host.
     // (Here the 'copy' flag is 'true'.)
977  View<const LO*, buffer_device_type> export_lids_d =
978  create_mirror_view_from_raw_host_array (buffer_device_type (),
979  exportLIDs.getRawPtr (),
980  exportLIDs.size (), true,
981  "export_lids");
982  // This is an input array, so we have to copy to device here.
983  // However, we never need to copy it back to host.
984  View<const int*, buffer_device_type> export_pids_d =
985  create_mirror_view_from_raw_host_array (buffer_device_type (),
986  sourcePIDs.getRawPtr (),
987  sourcePIDs.size (), true,
988  "export_pids");
     // This entry point always requests that process ranks be packed.
989  constexpr bool pack_pids = true;
990  PackCrsGraphImpl::packCrsGraph
991  (sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
992  export_pids_d, constantNumPackets, pack_pids);
993 
994  // The counts are an output of packCrsGraph, so we
995  // have to copy them back to host.
     // Wrap the caller's raw host array in an unmanaged View so that
     // deep_copy writes straight into numPacketsPerLID's storage.
996  View<size_t*, HostSpace, MemoryUnmanaged> num_packets_per_lid_h
997  (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ());
998  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d);
999 }
1000 
1001 } // namespace Details
1002 } // namespace Tpetra
1003 
// Explicit-instantiation macro.  For one (LO, GO, NT) combination it
// emits explicit instantiations of three function templates:
//   - Details::packCrsGraph
//   - Details::packCrsGraphNew
//   - Details::packCrsGraphWithOwningPIDs
// The last instantiation already ends with ';' inside the macro body, so
// an invocation does not need a trailing semicolon of its own.
// NOTE(review): 'Details::' and 'CrsGraph' are not namespace-qualified,
// so the macro presumably must be expanded inside namespace Tpetra --
// confirm against the instantiation files.
1004 #define TPETRA_DETAILS_PACKCRSGRAPH_INSTANT( LO, GO, NT ) \
1005  template void \
1006  Details::packCrsGraph<LO, GO, NT> ( \
1007  const CrsGraph<LO, GO, NT>&, \
1008  Teuchos::Array<CrsGraph<LO,GO,NT>::packet_type>&, \
1009  const Teuchos::ArrayView<size_t>&, \
1010  const Teuchos::ArrayView<const LO>&, \
1011  size_t&); \
1012  template void \
1013  Details::packCrsGraphNew<LO, GO, NT> ( \
1014  const CrsGraph<LO, GO, NT>&, \
1015  const Kokkos::DualView< \
1016  const LO*, \
1017  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1018  const Kokkos::DualView< \
1019  const int*, \
1020  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1021  Kokkos::DualView< \
1022  CrsGraph<LO,GO,NT>::packet_type*, \
1023  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1024  Kokkos::DualView< \
1025  size_t*, \
1026  CrsGraph<LO,GO,NT>::buffer_device_type>, \
1027  size_t&, \
1028  const bool); \
1029  template void \
1030  Details::packCrsGraphWithOwningPIDs<LO, GO, NT> ( \
1031  const CrsGraph<LO, GO, NT>&, \
1032  Kokkos::DualView<CrsGraph<LO,GO,NT>::packet_type*, CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1033  const Teuchos::ArrayView<size_t>&, \
1034  const Teuchos::ArrayView<const LO>&, \
1035  const Teuchos::ArrayView<const int>&, \
1036  size_t&);
1037 
1038 #endif // TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
Declaration of the Tpetra::CrsGraph class.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
CountsViewType::non_const_value_type computeNumPacketsAndOffsets(const OutputOffsetsViewType &outputOffsets, const CountsViewType &counts, const InputOffsetsViewType &rowOffsets, const InputLocalRowIndicesViewType &lclRowInds, const InputLocalRowPidsViewType &lclRowPids)
Compute the number of packets and offsets for the pack procedure.
void do_pack(const LocalGraph &local_graph, const LocalMap &local_map, const Kokkos::View< Packet *, BufferDeviceType > &exports, const typename PackTraits< size_t >::input_array_type &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type &export_lids, const typename PackTraits< int >::input_array_type &source_pids, const Kokkos::View< const size_t *, BufferDeviceType > &offsets, const bool pack_pids)
Perform the pack operation for the graph.
KOKKOS_FUNCTION size_t packRow(const LocalMapType &col_map, const Kokkos::View< Packet *, BufferDeviceType > &exports, const InputLidsType &lids_in, const InputPidsType &pids_in, const size_t offset, const size_t num_ent, const bool pack_pids)
Packs a single row of the CrsGraph.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
local_graph_device_type getLocalGraphDevice() const
Get the local graph.
"Local" part of Map suitable for Kokkos kernels.
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
Compute the number of packets and offsets for the pack procedure.
Implementation details of Tpetra.
void packCrsGraph(const CrsGraph< LO, GO, NT > &sourceGraph, Teuchos::Array< typename CrsGraph< LO, GO, NT >::packet_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse graph for communication.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void packCrsGraphNew(const CrsGraph< LO, GO, NT > &sourceGraph, const Kokkos::DualView< const LO *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportLIDs, const Kokkos::DualView< const int *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportPIDs, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports, Kokkos::DualView< size_t *, typename CrsGraph< LO, GO, NT >::buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, const bool pack_pids)
Pack specified entries of the given local sparse graph for communication, for "new" DistObject interf...
void packCrsGraphWithOwningPIDs(const CrsGraph< LO, GO, NT > &sourceGraph, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse graph for communication.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Traits class for packing / unpacking data of type T.