Tpetra parallel linear algebra  Version of the Day
Tpetra_Import_Util2.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_IMPORT_UTIL2_HPP
43 #define TPETRA_IMPORT_UTIL2_HPP
44 
49 
50 #include "Tpetra_ConfigDefs.hpp"
52 #include "Tpetra_Import.hpp"
53 #include "Tpetra_HashTable.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Util.hpp"
56 #include "Tpetra_Distributor.hpp"
57 #include "Kokkos_DualView.hpp"
58 #include <Teuchos_Array.hpp>
59 #include <utility>
60 
61 // Tpetra::CrsMatrix uses the functions below in its implementation.
62 // To avoid a circular include issue, only include the declarations
63 // for CrsMatrix. We will include the definition after the functions
64 // here have been defined.
66 
67 namespace { // (anonymous)
68 
69  template<class T, class D>
70  Kokkos::View<T*, D, Kokkos::MemoryUnmanaged>
71  getNonconstView (const Teuchos::ArrayView<T>& x)
72  {
73  typedef Kokkos::View<T*, D, Kokkos::MemoryUnmanaged> view_type;
74  typedef typename view_type::size_type size_type;
75  const size_type numEnt = static_cast<size_type> (x.size ());
76  return view_type (x.getRawPtr (), numEnt);
77  }
78 
79  template<class T, class D>
80  Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged>
81  getConstView (const Teuchos::ArrayView<const T>& x)
82  {
83  typedef Kokkos::View<const T*, D, Kokkos::MemoryUnmanaged> view_type;
84  typedef typename view_type::size_type size_type;
85  const size_type numEnt = static_cast<size_type> (x.size ());
86  return view_type (x.getRawPtr (), numEnt);
87  }
88 
89  // For a given Kokkos (execution or memory) space, return both its
90  // execution space, and the corresponding host execution space.
91  template<class Space>
92  struct GetHostExecSpace {
93  typedef typename Space::execution_space execution_space;
94  typedef typename Kokkos::View<int*, execution_space>::HostMirror::execution_space host_execution_space;
95  };
96 
97 } // namespace (anonymous)
98 
99 namespace Tpetra {
100 namespace Import_Util {
101 
116 template<typename Scalar,
117  typename LocalOrdinal,
118  typename GlobalOrdinal,
119  typename Node>
120 void
121 packAndPrepareWithOwningPIDs (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
122  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
123  Kokkos::DualView<char*, typename Node::device_type>& exports,
124  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
125  size_t& constantNumPackets,
126  Distributor &distor,
127  const Teuchos::ArrayView<const int>& SourcePids);
128 
144 template<typename Scalar,
145  typename LocalOrdinal,
146  typename GlobalOrdinal,
147  typename Node>
148 size_t
149 unpackAndCombineWithOwningPIDsCount (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
150  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
151  const Teuchos::ArrayView<const char> &imports,
152  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
153  size_t constantNumPackets,
154  Distributor &distor,
155  CombineMode combineMode,
156  size_t numSameIDs,
157  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
158  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs);
159 
174 template<typename Scalar,
175  typename LocalOrdinal,
176  typename GlobalOrdinal,
177  typename Node>
178 void
179 unpackAndCombineIntoCrsArrays (const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& SourceMatrix,
180  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
181  const Teuchos::ArrayView<const char>& imports,
182  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
183  size_t constantNumPackets,
184  Distributor &distor,
185  CombineMode combineMode,
186  size_t numSameIDs,
187  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
188  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
189  size_t TargetNumRows,
190  size_t TargetNumNonzeros,
191  int MyTargetPID,
192  const Teuchos::ArrayView<size_t>& rowPointers,
193  const Teuchos::ArrayView<GlobalOrdinal>& columnIndices,
194  const Teuchos::ArrayView<Scalar>& values,
195  const Teuchos::ArrayView<const int>& SourcePids,
196  Teuchos::Array<int>& TargetPids);
197 
200 template<typename Scalar, typename Ordinal>
201 void
202 sortCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
203  const Teuchos::ArrayView<Ordinal>& CRS_colind,
204  const Teuchos::ArrayView<Scalar>&CRS_vals);
205 
206 
207 template<typename rowptr_array_type, typename colind_array_type, typename vals_array_type>
208 void
209 sortCrsEntries (const rowptr_array_type& CRS_rowptr,
210  const colind_array_type& CRS_colind,
211  const vals_array_type& CRS_vals);
212 
217 template<typename Scalar, typename Ordinal>
218 void
219 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t>& CRS_rowptr,
220  const Teuchos::ArrayView<Ordinal>& CRS_colind,
221  const Teuchos::ArrayView<Scalar>& CRS_vals);
222 
238 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
239 void
240 lowCommunicationMakeColMapAndReindex (const Teuchos::ArrayView<const size_t> &rowPointers,
241  const Teuchos::ArrayView<LocalOrdinal> &columnIndices_LID,
242  const Teuchos::ArrayView<GlobalOrdinal> &columnIndices_GID,
243  const Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & domainMap,
244  const Teuchos::ArrayView<const int> &owningPids,
245  Teuchos::Array<int> &remotePids,
246  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap);
247 
248 
249 } // namespace Import_Util
250 } // namespace Tpetra
251 
252 
253 //
254 // Implementations
255 //
256 
257 namespace { // (anonymous)
258 
275  template<class LO, class GO, class D>
276  size_t
277  packRowCount (const size_t numEnt,
278  const size_t numBytesPerValue)
279  {
281 
282  if (numEnt == 0) {
283  // Empty rows always take zero bytes, to ensure sparsity.
284  return 0;
285  }
286  else {
287  // We store the number of entries as a local index (LO).
288  LO numEntLO = 0; // packValueCount wants this.
289  GO gid;
290  int lid;
291  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
292  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
293  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (lid);
294  const size_t valsLen = numEnt * numBytesPerValue;
295  return numEntLen + gidsLen + pidsLen + valsLen;
296  }
297  }
298 
299  template<class LO, class D>
300  size_t
301  unpackRowCount (const typename Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
302  const size_t offset,
303  const size_t numBytes,
304  const size_t numBytesPerValue)
305  {
306  using Kokkos::subview;
308  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
309  typedef typename input_buffer_type::size_type size_type;
310 
311  if (numBytes == 0) {
312  // Empty rows always take zero bytes, to ensure sparsity.
313  return static_cast<size_t> (0);
314  }
315  else {
316  LO numEntLO = 0;
317  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
318 #ifdef HAVE_TPETRA_DEBUG
319  TEUCHOS_TEST_FOR_EXCEPTION(
320  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
321  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
322  << ".");
323 #endif // HAVE_TPETRA_DEBUG
324  const std::pair<size_type, size_type> rng (offset, offset + theNumBytes);
325  input_buffer_type inBuf = subview (imports, rng); // imports (offset, theNumBytes);
326  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
327  (void)actualNumBytes;
328 #ifdef HAVE_TPETRA_DEBUG
329  TEUCHOS_TEST_FOR_EXCEPTION(
330  theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
331  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
332  << ".");
333 #endif // HAVE_TPETRA_DEBUG
334  return static_cast<size_t> (numEntLO);
335  }
336  }
337 
338  // Return the number of bytes packed.
339  template<class ST, class LO, class GO, class D>
340  size_t
341  packRow (const typename Tpetra::Details::PackTraits<LO, D>::output_buffer_type& exports,
342  const size_t offset,
343  const size_t numEnt,
347  const size_t numBytesPerValue)
348  {
349  using Kokkos::subview;
351  // NOTE (mfh 02 Feb 2015) This assumes that output_buffer_type is
352  // the same, no matter what type we're packing. It's a reasonable
353  // assumption, given that we go through the trouble of PackTraits
354  // just so that we can pack data of different types in the same
355  // buffer.
356  typedef typename PackTraits<LO, D>::output_buffer_type output_buffer_type;
357  typedef typename output_buffer_type::size_type size_type;
358  typedef typename std::pair<size_type, size_type> pair_type;
359 
360  if (numEnt == 0) {
361  // Empty rows always take zero bytes, to ensure sparsity.
362  return 0;
363  }
364 
365  const GO gid = 0; // packValueCount wants this
366  const LO numEntLO = static_cast<size_t> (numEnt);
367  const int pid = 0; // packValueCount wants this
368 
369  const size_t numEntBeg = offset;
370  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
371  const size_t gidsBeg = numEntBeg + numEntLen;
372  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
373  const size_t pidsBeg = gidsBeg + gidsLen;
374  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
375  const size_t valsBeg = pidsBeg + pidsLen;
376  const size_t valsLen = numEnt * numBytesPerValue;
377 
378  output_buffer_type numEntOut =
379  subview (exports, pair_type (numEntBeg, numEntBeg + numEntLen));
380  output_buffer_type gidsOut =
381  subview (exports, pair_type (gidsBeg, gidsBeg + gidsLen));
382  output_buffer_type pidsOut =
383  subview (exports, pair_type (pidsBeg, pidsBeg + pidsLen));
384  output_buffer_type valsOut =
385  subview (exports, pair_type (valsBeg, valsBeg + valsLen));
386 
387  size_t numBytesOut = 0;
388  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
389  numBytesOut += PackTraits<GO, D>::packArray (gidsOut, gidsIn, numEnt);
390  numBytesOut += PackTraits<int, D>::packArray (pidsOut, pidsIn, numEnt);
391  numBytesOut += PackTraits<ST, D>::packArray (valsOut, valsIn, numEnt);
392 
393  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
394  TEUCHOS_TEST_FOR_EXCEPTION(
395  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
396  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
397  << expectedNumBytes << ".");
398 
399  return numBytesOut;
400  }
401 
402  // Return the number of bytes actually read / used.
403  template<class ST, class LO, class GO, class D>
404  size_t
405  unpackRow (const typename Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
409  const size_t offset,
410  const size_t numBytes,
411  const size_t numEnt,
412  const size_t numBytesPerValue)
413  {
414  using Kokkos::subview;
416  // NOTE (mfh 02 Feb 2015) This assumes that input_buffer_type is
417  // the same, no matter what type we're unpacking. It's a
418  // reasonable assumption, given that we go through the trouble of
419  // PackTraits just so that we can pack data of different types in
420  // the same buffer.
421  typedef typename PackTraits<LO, D>::input_buffer_type input_buffer_type;
422  typedef typename input_buffer_type::size_type size_type;
423  typedef typename std::pair<size_type, size_type> pair_type;
424 
425  if (numBytes == 0) {
426  // Rows with zero bytes always have zero entries.
427  return 0;
428  }
429  TEUCHOS_TEST_FOR_EXCEPTION(
430  static_cast<size_t> (imports.dimension_0 ()) <= offset, std::logic_error,
431  "unpackRow: imports.dimension_0() = " << imports.dimension_0 () <<
432  " <= offset = " << offset << ".");
433  TEUCHOS_TEST_FOR_EXCEPTION(
434  static_cast<size_t> (imports.dimension_0 ()) < offset + numBytes,
435  std::logic_error, "unpackRow: imports.dimension_0() = "
436  << imports.dimension_0 () << " < offset + numBytes = "
437  << (offset + numBytes) << ".");
438 
439  const GO gid = 0; // packValueCount wants this
440  const LO lid = 0; // packValueCount wants this
441  const int pid = 0; // packValueCount wants this
442 
443  const size_t numEntBeg = offset;
444  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
445  const size_t gidsBeg = numEntBeg + numEntLen;
446  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
447  const size_t pidsBeg = gidsBeg + gidsLen;
448  const size_t pidsLen = numEnt * PackTraits<int, D>::packValueCount (pid);
449  const size_t valsBeg = pidsBeg + pidsLen;
450  const size_t valsLen = numEnt * numBytesPerValue;
451 
452  input_buffer_type numEntIn = subview (imports, pair_type (numEntBeg, numEntBeg + numEntLen));
453  input_buffer_type gidsIn = subview (imports, pair_type (gidsBeg, gidsBeg + gidsLen));
454  input_buffer_type pidsIn = subview (imports, pair_type (pidsBeg, pidsBeg + pidsLen));
455  input_buffer_type valsIn = subview (imports, pair_type (valsBeg, valsBeg + valsLen));
456 
457  size_t numBytesOut = 0;
458  LO numEntOut;
459  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
460  TEUCHOS_TEST_FOR_EXCEPTION(
461  static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
462  "unpackRow: Expected number of entries " << numEnt
463  << " != actual number of entries " << numEntOut << ".");
464 
465  numBytesOut += PackTraits<GO, D>::unpackArray (gidsOut, gidsIn, numEnt);
466  numBytesOut += PackTraits<int, D>::unpackArray (pidsOut, pidsIn, numEnt);
467  numBytesOut += PackTraits<ST, D>::unpackArray (valsOut, valsIn, numEnt);
468 
469  TEUCHOS_TEST_FOR_EXCEPTION(
470  numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
471  << numBytesOut << " != numBytes = " << numBytes << ".");
472  const size_t expectedNumBytes = numEntLen + gidsLen + pidsLen + valsLen;
473  TEUCHOS_TEST_FOR_EXCEPTION(
474  numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
475  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
476  << expectedNumBytes << ".");
477  return numBytesOut;
478  }
479 
480  // mfh 28 Apr 2016: Sometimes we have a raw host array, and we need
481  // to make a Kokkos::View out of it that lives in a certain memory
482  // space. We don't want to make a deep copy of the input array if
483  // we don't need to, but if the memory spaces are different, we need
484  // to. The following code does that. The struct is an
485  // implementation detail, and the "free" function
486  // get1DConstViewOfUnmanagedArray is the interface to call.
487 
488  template<class ST, class DT,
489  const bool outputIsHostMemory =
490  std::is_same<typename DT::memory_space, Kokkos::HostSpace>::value>
491  struct Get1DConstViewOfUnmanagedHostArray {};
492 
493  template<class ST, class DT>
494  struct Get1DConstViewOfUnmanagedHostArray<ST, DT, true> {
495  typedef Kokkos::View<const ST*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> output_view_type;
496 
497  static output_view_type
498  getView (const char /* label */ [], const ST* x_raw, const size_t x_len)
499  {
500  // We can return the input array, wrapped as an unmanaged View.
501  // Ignore the label, since unmanaged Views don't have labels.
502  return output_view_type (x_raw, x_len);
503  }
504  };
505 
506  template<class ST, class DT>
507  struct Get1DConstViewOfUnmanagedHostArray<ST, DT, false> {
508  typedef Kokkos::View<const ST*, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> input_view_type;
509  typedef Kokkos::View<const ST*, DT> output_view_type;
510 
511  static output_view_type
512  getView (const char label[], const ST* x_raw, const size_t x_len)
513  {
514  input_view_type x_in (x_raw, x_len);
515  // The memory spaces are different, so we have to create a new
516  // View which is a deep copy of the input array.
517  //
518  // FIXME (mfh 28 Apr 2016) This needs to be converted to
519  // std::string, else the compiler can't figure out what
520  // constructor we're calling.
521  Kokkos::View<ST*, DT> x_out (std::string (label), x_len);
522  Kokkos::deep_copy (x_out, x_in);
523  return x_out;
524  }
525  };
526 
527  template<class ST, class DT>
528  typename Get1DConstViewOfUnmanagedHostArray<ST, DT>::output_view_type
529  get1DConstViewOfUnmanagedHostArray (const char label[], const ST* x_raw, const size_t x_len)
530  {
531  return Get1DConstViewOfUnmanagedHostArray<ST, DT>::getView (label, x_raw, x_len);
532  }
533 
534 } // namespace (anonymous)
535 
536 
537 namespace Tpetra {
538 namespace Import_Util {
539 
540 template<typename Scalar,
541  typename LocalOrdinal,
542  typename GlobalOrdinal,
543  typename Node>
544 void
546  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
547  Kokkos::DualView<char*, typename Node::device_type>& exports,
548  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
549  size_t& constantNumPackets,
550  Distributor &distor,
551  const Teuchos::ArrayView<const int>& SourcePids)
552 {
554  using Kokkos::MemoryUnmanaged;
555  using Kokkos::subview;
556  using Kokkos::View;
557  using Teuchos::Array;
558  using Teuchos::ArrayView;
559  using Teuchos::as;
560  using Teuchos::RCP;
561  typedef LocalOrdinal LO;
562  typedef GlobalOrdinal GO;
564  typedef typename matrix_type::impl_scalar_type ST;
565  typedef typename Node::device_type device_type;
566  typedef typename device_type::execution_space execution_space;
567  // The device type of the HostMirror of a device_type View. This
568  // might not necessarily be Kokkos::HostSpace! In particular, if
569  // device_type::memory_space is CudaUVMSpace and the CMake option
570  // Kokkos_ENABLE_Cuda_UVM is ON, the corresponding HostMirror's
571  // memory space is also CudaUVMSpace.
572  typedef typename Kokkos::DualView<char*, device_type>::t_host::device_type HDS;
573  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
574  typedef typename ArrayView<const LO>::size_type size_type;
575  typedef std::pair<typename View<int*, HDS>::size_type,
576  typename View<int*, HDS>::size_type> pair_type;
577  const char prefix[] = "Tpetra::Import_Util::packAndPrepareWithOwningPIDs: ";
578 
579  // FIXME (mfh 03 Jan 2015) Currently, it might be the case that if a
580  // graph or matrix owns no entries on a process, then it reports
581  // that it is neither locally nor globally indexed, even if the
582  // graph or matrix has a column Map. This should change once we get
583  // rid of lazy initialization in CrsGraph and CrsMatrix.
584  TEUCHOS_TEST_FOR_EXCEPTION(
585  ! SourceMatrix.isLocallyIndexed (), std::invalid_argument,
586  prefix << "SourceMatrix must be locally indexed.");
587  TEUCHOS_TEST_FOR_EXCEPTION(
588  SourceMatrix.getColMap ().is_null (), std::logic_error,
589  prefix << "The source matrix claims to be locally indexed, but its column "
590  "Map is null. This should never happen. Please report this bug to the "
591  "Tpetra developers.");
592  const size_type numExportLIDs = exportLIDs.size ();
593  TEUCHOS_TEST_FOR_EXCEPTION(
594  numExportLIDs != numPacketsPerLID.size (), std::invalid_argument, prefix
595  << "exportLIDs.size() = " << numExportLIDs << "!= numPacketsPerLID.size() "
596  << " = " << numPacketsPerLID.size () << ".");
597  TEUCHOS_TEST_FOR_EXCEPTION(
598  static_cast<size_t> (SourcePids.size ()) != SourceMatrix.getColMap ()->getNodeNumElements (),
599  std::invalid_argument, prefix << "SourcePids.size() = "
600  << SourcePids.size ()
601  << "!= SourceMatrix.getColMap()->getNodeNumElements() = "
602  << SourceMatrix.getColMap ()->getNodeNumElements () << ".");
603 
604  // This tells the caller that different rows may have different
605  // numbers of entries. That is, the number of packets per LID might
606  // not be a constant.
607  constantNumPackets = 0;
608 
609  // Compute the number of bytes ("packets") per row to pack. While
610  // we're at it, compute the total number of matrix entries to send,
611  // and the max number of entries in any of the rows we're sending.
612  size_t totalNumBytes = 0;
613  size_t totalNumEntries = 0;
614  size_t maxRowLength = 0;
615  for (size_type i = 0; i < numExportLIDs; ++i) {
616  const LO lclRow = exportLIDs[i];
617  const size_t numEnt = SourceMatrix.getNumEntriesInLocalRow (lclRow);
618 
619  // The 'if' branch implicitly assumes that packRowCount() returns
620  // zero if numEnt == 0.
621  size_t numBytesPerValue = 0;
622  if (numEnt > 0) {
623  // Get a locally indexed view of the current row's data. If the
624  // current row has > 0 entries, we need an entry in order to
625  // figure out the byte count of the packed row. (We really only
626  // need it if ST's size is determined at run time.)
627  ArrayView<const Scalar> valsView;
628  ArrayView<const LO> lidsView;
629  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
630  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
631  View<const ST*, Kokkos::HostSpace, MemoryUnmanaged> valsViewK (valsViewRaw, valsView.size ());
632  TEUCHOS_TEST_FOR_EXCEPTION(
633  static_cast<size_t> (valsViewK.dimension_0 ()) != numEnt,
634  std::logic_error, prefix << "Local row " << i << " claims to have "
635  << numEnt << "entry/ies, but the View returned by getLocalRowView() "
636  "has " << valsViewK.dimension_0 () << " entry/ies. This should never "
637  "happen. Please report this bug to the Tpetra developers.");
638 
639  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
640  // space here for now, this doesn't assume UVM. That may change
641  // in the future, if we ever start packing on the device.
642  //
643  // FIXME (mfh 28 Apr 2016) For now, we assume that the value
644  // returned by packValueCount is independent of the memory
645  // space. This assumption helps with #227.
646  numBytesPerValue = PackTraits<ST, Kokkos::HostSpace>::packValueCount (valsViewK(0));
647  }
648 
649  // FIXME (mfh 28 Apr 2016) For now, we assume that the value
650  // returned by packRowCount is independent of the memory space.
651  // This assumption helps with #227.
652  const size_t numBytes =
653  packRowCount<LO, GO, Kokkos::HostSpace> (numEnt, numBytesPerValue);
654  numPacketsPerLID[i] = numBytes;
655  totalNumBytes += numBytes;
656  totalNumEntries += numEnt;
657  maxRowLength = std::max (maxRowLength, numEnt);
658  }
659 
660  // We use a "struct of arrays" approach to packing each row's
661  // entries. All the column indices (as global indices) go first,
662  // then all their owning process ranks, and then the values.
663  if (totalNumEntries > 0) {
664  if (static_cast<size_t> (exports.dimension_0 ()) !=
665  static_cast<size_t> (totalNumBytes)) {
666  // FIXME (26 Apr 2016) Fences around (UVM) allocations only
667  // temporarily needed for #227 debugging. Should be able to
668  // remove them after that's fixed.
669  execution_space::fence ();
670  exports = Kokkos::DualView<char*, device_type> ("exports", totalNumBytes);
671  execution_space::fence ();
672  }
673 
674  // mfh 26 Apr 2016: The code below currently fills on host. We
675  // may change this in the future.
676  exports.template modify<Kokkos::HostSpace> ();
677 
678  // FIXME (mfh 28 Apr 2016) We take subviews in a loop, so it might
679  // pay to use an unmanaged View to reduce reference count update
680  // overhead. On the other hand, with CudaUVMSpace, it's not
681  // obvious what the type of "the unmanaged version of exportsK"
682  // should be. It's memory space is not Kokkos::HostSpace, for
683  // example! What we really need is a "create_unmanaged_view"
684  // function that works like Kokkos::Compat::create_const_view.
685  // For now, I'll just use the managed View.
686 
687  //auto exportsK_managed = exports.template view<Kokkos::HostSpace> ();
688  //View<char*, Kokkos::HostSpace, MemoryUnmanaged> exportsK = exportsK_managed;
689 
690  auto exports_h = exports.template view<Kokkos::HostSpace> ();
691 
692  // Current position (in bytes) in the 'exports' output array.
693  size_t offset = 0;
694 
695  // For each row of the matrix owned by the calling process, pack
696  // that row's column indices and values into the exports array.
697 
698  // Locally indexed matrices always have a column Map.
699  const map_type& colMap = * (SourceMatrix.getColMap ());
700 
701  // Temporary buffers for a copy of the column gids/pids
702  typename View<GO*, device_type>::HostMirror gids;
703  typename View<int*, device_type>::HostMirror pids;
704  {
705  GO gid;
706  int pid;
707  gids = PackTraits<GO, HDS>::allocateArray (gid, maxRowLength, "gids");
708  pids = PackTraits<int, HDS>::allocateArray (pid, maxRowLength, "pids");
709  }
710 
711  for (size_type i = 0; i < numExportLIDs; i++) {
712  const LO lclRow = exportLIDs[i];
713 
714  // Get a locally indexed view of the current row's data.
715  ArrayView<const Scalar> valsView;
716  ArrayView<const LO> lidsView;
717  SourceMatrix.getLocalRowView (lclRow, lidsView, valsView);
718  const ST* valsViewRaw = reinterpret_cast<const ST*> (valsView.getRawPtr ());
719 
720  // This is just a shallow copy if not CUDA, else a deep copy
721  // (into HDS, namely Device<Cuda, CudaUVMSpace>) if CUDA.
722  //
723  // FIXME (mfh 28 Apr 2016) This is slow for the CUDA case, but
724  // it should be correct.
725  auto valsViewK = get1DConstViewOfUnmanagedHostArray<ST, HDS> ("valsViewK", valsViewRaw, static_cast<size_t> (valsView.size ()));
726  const size_t numEnt = static_cast<size_t> (valsViewK.dimension_0 ());
727 
728  // NOTE (mfh 07 Feb 2015) Since we're using the host memory
729  // space here for now, this doesn't assume UVM. That may change
730  // in the future, if we ever start packing on the device.
731  //
732  // FIXME (mfh 28 Apr 2016) For now, we assume that the value
733  // returned by packValueCount is independent of the memory
734  // space. This assumption helps with #227.
735  const size_t numBytesPerValue = numEnt == 0 ?
736  static_cast<size_t> (0) :
737  PackTraits<ST, Kokkos::HostSpace>::packValueCount (valsViewK(0));
738 
739  // Convert column indices as LIDs to column indices as GIDs.
740  auto gidsView = subview (gids, pair_type (0, numEnt));
741  auto pidsView = subview (pids, pair_type (0, numEnt));
742  for (size_t k = 0; k < numEnt; ++k) {
743  gidsView(k) = colMap.getGlobalElement (lidsView[k]);
744  pidsView(k) = SourcePids[lidsView[k]];
745  }
746 
747  // Copy the row's data into the current spot in the exports array.
748  const size_t numBytes =
749  packRow<ST, LO, GO, HDS> (exports_h, offset, numEnt,
750  gidsView, pidsView, valsViewK,
751  numBytesPerValue);
752  // Keep track of how many bytes we packed.
753  offset += numBytes;
754  }
755 
756 #ifdef HAVE_TPETRA_DEBUG
757  TEUCHOS_TEST_FOR_EXCEPTION(
758  offset != totalNumBytes, std::logic_error, prefix << "At end of method, "
759  "the final offset (in bytes) " << offset << " does not equal the total "
760  "number of bytes packed " << totalNumBytes << ". Please report this bug "
761  "to the Tpetra developers.");
762 #endif // HAVE_TPETRA_DEBUG
763  }
764 }
765 
766 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
767 size_t
769  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
770  const Teuchos::ArrayView<const char> &imports,
771  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
772  size_t constantNumPackets,
773  Distributor &distor,
774  CombineMode combineMode,
775  size_t numSameIDs,
776  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
777  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
778 {
779  using Kokkos::MemoryUnmanaged;
780  using Kokkos::View;
781  typedef LocalOrdinal LO;
782  typedef GlobalOrdinal GO;
783  typedef CrsMatrix<Scalar, LO, GO, Node> matrix_type;
784  typedef typename matrix_type::impl_scalar_type ST;
785  typedef typename Teuchos::ArrayView<const LO>::size_type size_type;
786  typedef typename Node::execution_space execution_space;
787  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
788  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
789 
790  TEUCHOS_TEST_FOR_EXCEPTION(
791  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
792  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
793  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
794  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
795  // process, then the matrix is neither locally nor globally indexed.
796  const bool locallyIndexed = SourceMatrix.isLocallyIndexed ();
797  TEUCHOS_TEST_FOR_EXCEPTION(
798  ! locallyIndexed, std::invalid_argument, prefix << "The input CrsMatrix "
799  "'SourceMatrix' must be locally indexed.");
800  TEUCHOS_TEST_FOR_EXCEPTION(
801  importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
802  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
803  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
804 
805  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
806  imports.size ());
807 
808  // Number of matrix entries to unpack (returned by this function).
809  size_t nnz = 0;
810 
811  // Count entries copied directly from the source matrix without permuting.
812  for (size_t sourceLID = 0; sourceLID < numSameIDs; ++sourceLID) {
813  const LO srcLID = static_cast<LO> (sourceLID);
814  nnz += SourceMatrix.getNumEntriesInLocalRow (srcLID);
815  }
816 
817  // Count entries copied directly from the source matrix with permuting.
818  const size_type numPermuteToLIDs = permuteToLIDs.size ();
819  for (size_type p = 0; p < numPermuteToLIDs; ++p) {
820  nnz += SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[p]);
821  }
822 
823  // Count entries received from other MPI processes.
824  size_t offset = 0;
825  const size_type numImportLIDs = importLIDs.size ();
826  for (size_type i = 0; i < numImportLIDs; ++i) {
827  const size_t numBytes = numPacketsPerLID[i];
828  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
829  // values, if it has one) for the (possibly run-time-depenendent)
830  // number of bytes of one of its entries.
831  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
832  numBytes, sizeof (ST));
833  nnz += numEnt;
834  offset += numBytes;
835  }
836  return nnz;
837 }
838 
839 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
840 void
842  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
843  const Teuchos::ArrayView<const char>& imports,
844  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
845  const size_t constantNumPackets,
846  Distributor& distor,
847  const CombineMode combineMode,
848  const size_t numSameIDs,
849  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
850  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
851  size_t TargetNumRows,
852  size_t TargetNumNonzeros,
853  const int MyTargetPID,
854  const Teuchos::ArrayView<size_t>& CSR_rowptr,
855  const Teuchos::ArrayView<GlobalOrdinal>& CSR_colind,
856  const Teuchos::ArrayView<Scalar>& CSR_vals,
857  const Teuchos::ArrayView<const int>& SourcePids,
858  Teuchos::Array<int>& TargetPids)
859 {
861  using Kokkos::MemoryUnmanaged;
862  using Kokkos::subview;
863  using Kokkos::View;
864  using Teuchos::ArrayRCP;
865  using Teuchos::ArrayView;
866  using Teuchos::as;
867  using Teuchos::av_reinterpret_cast;
868  using Teuchos::outArg;
869  using Teuchos::REDUCE_MAX;
870  using Teuchos::reduceAll;
871  typedef LocalOrdinal LO;
872  typedef GlobalOrdinal GO;
873  typedef typename Node::execution_space execution_space;
874  typedef typename GetHostExecSpace<execution_space>::host_execution_space HES;
876  typedef typename matrix_type::impl_scalar_type ST;
877  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
878  typedef typename ArrayView<const LO>::size_type size_type;
879  //typedef std::pair<typename Kokkos::View<int*, HES>::size_type,
880  // typename Kokkos::View<int*, HES>::size_type> pair_type;
881  const char prefix[] = "Tpetra::Import_Util::unpackAndCombineIntoCrsArrays: ";
882 
883  const size_t N = TargetNumRows;
884  const size_t mynnz = TargetNumNonzeros;
885  // In the case of reduced communicators, the SourceMatrix won't have
886  // the right "MyPID", so thus we have to supply it.
887  const int MyPID = MyTargetPID;
888 
889  TEUCHOS_TEST_FOR_EXCEPTION(
890  TargetNumRows + 1 != static_cast<size_t> (CSR_rowptr.size ()),
891  std::invalid_argument, prefix << "CSR_rowptr.size() = " <<
892  CSR_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
893  TEUCHOS_TEST_FOR_EXCEPTION(
894  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
895  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
896  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
897  const size_type numImportLIDs = importLIDs.size ();
898  TEUCHOS_TEST_FOR_EXCEPTION(
899  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
900  prefix << "importLIDs.size() = " << numImportLIDs << " != "
901  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
902 
903  // Kokkos::View of the input buffer of bytes to unpack.
904  View<const char*, HES, MemoryUnmanaged> importsK (imports.getRawPtr (),
905  imports.size ());
906  // Zero the rowptr
907  for (size_t i = 0; i< N+1; ++i) {
908  CSR_rowptr[i] = 0;
909  }
910 
911  // SameIDs: Always first, always in the same place
912  for (size_t i = 0; i < numSameIDs; ++i) {
913  CSR_rowptr[i] = SourceMatrix.getNumEntriesInLocalRow (static_cast<LO> (i));
914  }
915 
916  // PermuteIDs: Still local, but reordered
917  size_t numPermuteIDs = permuteToLIDs.size ();
918  for (size_t i = 0; i < numPermuteIDs; ++i) {
919  CSR_rowptr[permuteToLIDs[i]] =
920  SourceMatrix.getNumEntriesInLocalRow (permuteFromLIDs[i]);
921  }
922 
923  // Setup CSR_rowptr for remotes
924  {
925  size_t offset = 0;
926  for (size_type k = 0; k < numImportLIDs; ++k) {
927  const size_t numBytes = numPacketsPerLID[k];
928  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
929  // values, if it has one) for the (possibly run-time -
930  // depenendent) number of bytes of one of its entries.
931  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset,
932  numBytes, sizeof (ST));
933  CSR_rowptr[importLIDs[k]] += numEnt;
934  offset += numBytes;
935  }
936  }
937 
938  // If multiple processes contribute to the same row, we may need to
939  // update row offsets. This tracks that.
940  Teuchos::Array<size_t> NewStartRow (N + 1);
941 
942  // Turn row length into a real CSR_rowptr
943  size_t last_len = CSR_rowptr[0];
944  CSR_rowptr[0] = 0;
945  for (size_t i = 1; i < N+1; ++i) {
946  size_t new_len = CSR_rowptr[i];
947  CSR_rowptr[i] = last_len + CSR_rowptr[i-1];
948  NewStartRow[i] = CSR_rowptr[i];
949  last_len = new_len;
950  }
951 
952  TEUCHOS_TEST_FOR_EXCEPTION(
953  CSR_rowptr[N] != mynnz, std::invalid_argument, prefix << "CSR_rowptr[last]"
954  " = " << CSR_rowptr[N] << "!= mynnz = " << mynnz << ".");
955 
956  // Preseed TargetPids with -1 for local
957  if (static_cast<size_t> (TargetPids.size ()) != mynnz) {
958  TargetPids.resize (mynnz);
959  }
960  TargetPids.assign (mynnz, -1);
961 
962  // Grab pointers for SourceMatrix
963  ArrayRCP<const size_t> Source_rowptr_RCP;
964  ArrayRCP<const LO> Source_colind_RCP;
965  ArrayRCP<const Scalar> Source_vals_RCP;
966  SourceMatrix.getAllValues (Source_rowptr_RCP, Source_colind_RCP, Source_vals_RCP);
967  ArrayView<const size_t> Source_rowptr = Source_rowptr_RCP ();
968  ArrayView<const LO> Source_colind = Source_colind_RCP ();
969  ArrayView<const Scalar> Source_vals = Source_vals_RCP ();
970 
971  const map_type& sourceColMap = * (SourceMatrix.getColMap());
972  ArrayView<const GO> globalColElts = sourceColMap.getNodeElementList();
973 
974  // SameIDs: Copy the data over
975  for (size_t i = 0; i < numSameIDs; ++i) {
976  size_t FromRow = Source_rowptr[i];
977  size_t ToRow = CSR_rowptr[i];
978  NewStartRow[i] += Source_rowptr[i+1] - Source_rowptr[i];
979 
980  for (size_t j = Source_rowptr[i]; j < Source_rowptr[i+1]; ++j) {
981  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
982  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
983  TargetPids[ToRow + j - FromRow] =
984  (SourcePids[Source_colind[j]] != MyPID) ?
985  SourcePids[Source_colind[j]] : -1;
986  }
987  }
988 
989  size_t numBytesPerValue = 0;
990  if (PackTraits<ST, HES>::compileTimeSize) {
991  ST val; // assume that ST is default constructible
992  numBytesPerValue = PackTraits<ST, HES>::packValueCount (val);
993  }
994  else {
995  // Since the packed data come from the source matrix, we can use
996  // the source matrix to get the number of bytes per Scalar value
997  // stored in the matrix. This assumes that all Scalar values in
998  // the source matrix require the same number of bytes. If the
999  // source matrix has no entries on the calling process, then we
1000  // have to ask the target matrix (via the output CSR arrays). If
1001  // the target matrix has no entries on input on the calling
1002  // process, then hope that some process does have some idea how
1003  // big a Scalar value is. Of course, if no processes have any
1004  // entries, then no values should be packed (though this does
1005  // assume that in our packing scheme, rows with zero entries take
1006  // zero bytes).
1007  if (Source_rowptr.size () == 0 || Source_rowptr[Source_rowptr.size () - 1] == 0) {
1008  numBytesPerValue = PackTraits<ST, HES>::packValueCount (CSR_vals[0]);
1009  }
1010  else {
1011  numBytesPerValue = PackTraits<ST, HES>::packValueCount (Source_vals[0]);
1012  }
1013  size_t lclNumBytesPerValue = numBytesPerValue;
1014  Teuchos::RCP<const Teuchos::Comm<int> > comm = SourceMatrix.getComm ();
1015  reduceAll<int, size_t> (*comm, REDUCE_MAX, lclNumBytesPerValue,
1016  outArg (numBytesPerValue));
1017  }
1018 
1019  // PermuteIDs: Copy the data over
1020  for (size_t i = 0; i < numPermuteIDs; ++i) {
1021  LO FromLID = permuteFromLIDs[i];
1022  size_t FromRow = Source_rowptr[FromLID];
1023  size_t ToRow = CSR_rowptr[permuteToLIDs[i]];
1024 
1025  NewStartRow[permuteToLIDs[i]] += Source_rowptr[FromLID+1]-Source_rowptr[FromLID];
1026 
1027  for (size_t j = Source_rowptr[FromLID]; j < Source_rowptr[FromLID+1]; ++j) {
1028  CSR_vals[ToRow + j - FromRow] = Source_vals[j];
1029  CSR_colind[ToRow + j - FromRow] = globalColElts[Source_colind[j]];
1030  TargetPids[ToRow + j - FromRow] =
1031  (SourcePids[Source_colind[j]] != MyPID) ?
1032  SourcePids[Source_colind[j]] : -1;
1033  }
1034  }
1035 
1036  // RemoteIDs: Loop structure following UnpackAndCombine
1037  if (imports.size () > 0) {
1038  size_t offset = 0;
1039 #ifdef HAVE_TPETRA_DEBUG
1040  int lclErr = 0;
1041 #endif
1042 
1043  for (size_t i = 0; i < static_cast<size_t> (numImportLIDs); ++i) {
1044  const size_t numBytes = numPacketsPerLID[i];
1045  if (numBytes == 0) {
1046  // Empty buffer for that row means that the row is empty.
1047  continue;
1048  }
1049  // FIXME (mfh 07 Feb 2015) Ask the matrix (rather, one of its
1050  // values, if it has one) for the (possibly run-time -
1051  // depenendent) number of bytes of one of its entries.
1052  const size_t numEnt = unpackRowCount<LO, HES> (importsK, offset, numBytes,
1053  numBytesPerValue);
1054  const LO lclRow = importLIDs[i];
1055  const size_t StartRow = NewStartRow[lclRow];
1056  NewStartRow[lclRow] += numEnt;
1057 
1058  View<GO*, HES, MemoryUnmanaged> gidsOut =
1059  getNonconstView<GO, HES> (CSR_colind (StartRow, numEnt));
1060  View<int*, HES, MemoryUnmanaged> pidsOut =
1061  getNonconstView<int, HES> (TargetPids (StartRow, numEnt));
1062  ArrayView<Scalar> valsOutS = CSR_vals (StartRow, numEnt);
1063  View<ST*, HES, MemoryUnmanaged> valsOut =
1064  getNonconstView<ST, HES> (av_reinterpret_cast<ST> (valsOutS));
1065 
1066  const size_t numBytesOut =
1067  unpackRow<ST, LO, GO, HES> (gidsOut, pidsOut, valsOut, importsK,
1068  offset, numBytes, numEnt, numBytesPerValue);
1069  if (numBytesOut != numBytes) {
1070 #ifdef HAVE_TPETRA_DEBUG
1071  lclErr = 1;
1072 #endif
1073  break;
1074  }
1075 
1076  // Correct target PIDs.
1077  for (size_t j = 0; j < numEnt; ++j) {
1078  const int pid = pidsOut[j];
1079  pidsOut[j] = (pid != MyPID) ? pid : -1;
1080  }
1081  offset += numBytes;
1082  }
1083 #ifdef HAVE_TPETRA_DEBUG
1084  TEUCHOS_TEST_FOR_EXCEPTION(
1085  offset != static_cast<size_t> (imports.size ()), std::logic_error, prefix
1086  << "After unpacking and counting all the import packets, the final offset"
1087  " in bytes " << offset << " != imports.size() = " << imports.size () <<
1088  ". Please report this bug to the Tpetra developers.");
1089  TEUCHOS_TEST_FOR_EXCEPTION(
1090  lclErr != 0, std::logic_error, prefix << "numBytes != numBytesOut "
1091  "somewhere in unpack loop. This should never happen. "
1092  "Please report this bug to the Tpetra developers.");
1093 #endif // HAVE_TPETRA_DEBUG
1094  }
1095 }
1096 
1097 // Note: This should get merged with the other Tpetra sort routines eventually.
1098 template<typename Scalar, typename Ordinal>
1099 void
1100 sortCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1101  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1102  const Teuchos::ArrayView<Scalar> &CRS_vals)
1103 {
1104  // For each row, sort column entries from smallest to largest.
1105  // Use shell sort. Stable sort so it is fast if indices are already sorted.
1106  // Code copied from Epetra_CrsMatrix::SortEntries()
1107  size_t NumRows = CRS_rowptr.size()-1;
1108  size_t nnz = CRS_colind.size();
1109 
1110  for(size_t i = 0; i < NumRows; i++){
1111  size_t start=CRS_rowptr[i];
1112  if(start >= nnz) continue;
1113 
1114  Scalar* locValues = &CRS_vals[start];
1115  size_t NumEntries = CRS_rowptr[i+1] - start;
1116  Ordinal* locIndices = &CRS_colind[start];
1117 
1118  Ordinal n = NumEntries;
1119  Ordinal m = 1;
1120  while (m<n) m = m*3+1;
1121  m /= 3;
1122 
1123  while(m > 0) {
1124  Ordinal max = n - m;
1125  for(Ordinal j = 0; j < max; j++) {
1126  for(Ordinal k = j; k >= 0; k-=m) {
1127  if(locIndices[k+m] >= locIndices[k])
1128  break;
1129  Scalar dtemp = locValues[k+m];
1130  locValues[k+m] = locValues[k];
1131  locValues[k] = dtemp;
1132  Ordinal itemp = locIndices[k+m];
1133  locIndices[k+m] = locIndices[k];
1134  locIndices[k] = itemp;
1135  }
1136  }
1137  m = m/3;
1138  }
1139  }
1140 }
1141 
1142 
1143 template<typename rowptr_array_type, typename colind_array_type, typename vals_array_type>
1144 void
1145 sortCrsEntries (const rowptr_array_type& CRS_rowptr,
1146  const colind_array_type& CRS_colind,
1147  const vals_array_type& CRS_vals) {
1148  // For each row, sort column entries from smallest to largest.
1149  // Use shell sort. Stable sort so it is fast if indices are already sorted.
1150  // Code copied from Epetra_CrsMatrix::SortEntries()
1151  // NOTE: This should not be taken as a particularly efficient way to sort
1152  // rows of matrices in parallel. But it is correct, so that's something.
1153  size_t NumRows = CRS_rowptr.dimension_0()-1;
1154  size_t nnz = CRS_colind.dimension_0();
1155  typedef typename colind_array_type::traits::non_const_value_type Ordinal;
1156  typedef typename vals_array_type::traits::non_const_value_type Scalar;
1157 
1158  typedef size_t index_type; // what this function was using; not my choice
1159  typedef typename vals_array_type::execution_space execution_space;
1160  // Specify RangePolicy explicitly, in order to use correct execution
1161  // space. See #1345.
1162  typedef Kokkos::RangePolicy<execution_space, index_type> range_type;
1163 
1164  Kokkos::parallel_for (range_type (0, NumRows), KOKKOS_LAMBDA (const size_t i) {
1165  size_t start=CRS_rowptr(i);
1166  if(start < nnz) {
1167  size_t NumEntries = CRS_rowptr(i+1) - start;
1168 
1169  Ordinal n = (Ordinal) NumEntries;
1170  Ordinal m = 1;
1171  while (m<n) m = m*3+1;
1172  m /= 3;
1173 
1174  while(m > 0) {
1175  Ordinal max = n - m;
1176  for(Ordinal j = 0; j < max; j++) {
1177  for(Ordinal k = j; k >= 0; k-=m) {
1178  size_t sk = start+k;
1179  if(CRS_colind(sk+m) >= CRS_colind(sk))
1180  break;
1181  Scalar dtemp = CRS_vals(sk+m);
1182  CRS_vals(sk+m) = CRS_vals(sk);
1183  CRS_vals(sk) = dtemp;
1184  Ordinal itemp = CRS_colind(sk+m);
1185  CRS_colind(sk+m) = CRS_colind(sk);
1186  CRS_colind(sk) = itemp;
1187  }
1188  }
1189  m = m/3;
1190  }
1191  }
1192  });
1193 }
1194 
1195 
1196 
1197 
1198 
1199 // Note: This should get merged with the other Tpetra sort routines eventually.
1200 template<typename Scalar, typename Ordinal>
1201 void
1202 sortAndMergeCrsEntries (const Teuchos::ArrayView<size_t> &CRS_rowptr,
1203  const Teuchos::ArrayView<Ordinal> & CRS_colind,
1204  const Teuchos::ArrayView<Scalar> &CRS_vals)
1205 {
1206  // For each row, sort column entries from smallest to largest,
1207  // merging column ids that are identify by adding values. Use shell
1208  // sort. Stable sort so it is fast if indices are already sorted.
1209  // Code copied from Epetra_CrsMatrix::SortEntries()
1210 
1211  size_t NumRows = CRS_rowptr.size()-1;
1212  size_t nnz = CRS_colind.size();
1213  size_t new_curr=CRS_rowptr[0], old_curr=CRS_rowptr[0];
1214 
1215  for(size_t i = 0; i < NumRows; i++){
1216  size_t old_rowptr_i=CRS_rowptr[i];
1217  CRS_rowptr[i] = old_curr;
1218  if(old_rowptr_i >= nnz) continue;
1219 
1220  Scalar* locValues = &CRS_vals[old_rowptr_i];
1221  size_t NumEntries = CRS_rowptr[i+1] - old_rowptr_i;
1222  Ordinal* locIndices = &CRS_colind[old_rowptr_i];
1223 
1224  // Sort phase
1225  Ordinal n = NumEntries;
1226  Ordinal m = n/2;
1227 
1228  while(m > 0) {
1229  Ordinal max = n - m;
1230  for(Ordinal j = 0; j < max; j++) {
1231  for(Ordinal k = j; k >= 0; k-=m) {
1232  if(locIndices[k+m] >= locIndices[k])
1233  break;
1234  Scalar dtemp = locValues[k+m];
1235  locValues[k+m] = locValues[k];
1236  locValues[k] = dtemp;
1237  Ordinal itemp = locIndices[k+m];
1238  locIndices[k+m] = locIndices[k];
1239  locIndices[k] = itemp;
1240  }
1241  }
1242  m = m/2;
1243  }
1244 
1245  // Merge & shrink
1246  for(size_t j=old_rowptr_i; j < CRS_rowptr[i+1]; j++) {
1247  if(j > old_rowptr_i && CRS_colind[j]==CRS_colind[new_curr-1]) {
1248  CRS_vals[new_curr-1] += CRS_vals[j];
1249  }
1250  else if(new_curr==j) {
1251  new_curr++;
1252  }
1253  else {
1254  CRS_colind[new_curr] = CRS_colind[j];
1255  CRS_vals[new_curr] = CRS_vals[j];
1256  new_curr++;
1257  }
1258  }
1259  old_curr=new_curr;
1260  }
1261 
1262  CRS_rowptr[NumRows] = new_curr;
1263 }
1264 
1265 template <typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1266 void
1267 lowCommunicationMakeColMapAndReindex (const Teuchos::ArrayView<const size_t> &rowptr,
1268  const Teuchos::ArrayView<LocalOrdinal> &colind_LID,
1269  const Teuchos::ArrayView<GlobalOrdinal> &colind_GID,
1270  const Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> >& domainMapRCP,
1271  const Teuchos::ArrayView<const int> &owningPIDs,
1272  Teuchos::Array<int> &remotePIDs,
1273  Teuchos::RCP<const Tpetra::Map<LocalOrdinal,GlobalOrdinal,Node> > & colMap)
1274 {
1275  using Teuchos::rcp;
1276  typedef LocalOrdinal LO;
1277  typedef GlobalOrdinal GO;
1278  typedef Tpetra::global_size_t GST;
1279  typedef Tpetra::Map<LO, GO, Node> map_type;
1280  const char prefix[] = "lowCommunicationMakeColMapAndReindex: ";
1281 
1282  // The domainMap is an RCP because there is a shortcut for a
1283  // (common) special case to return the columnMap = domainMap.
1284  const map_type& domainMap = *domainMapRCP;
1285 
1286  // Scan all column indices and sort into two groups:
1287  // Local: those whose GID matches a GID of the domain map on this processor and
1288  // Remote: All others.
1289  const size_t numDomainElements = domainMap.getNodeNumElements ();
1290  Teuchos::Array<bool> LocalGIDs;
1291  if (numDomainElements > 0) {
1292  LocalGIDs.resize (numDomainElements, false); // Assume domain GIDs are not local
1293  }
1294 
1295  // In principle it is good to have RemoteGIDs and RemotGIDList be as
1296  // long as the number of remote GIDs on this processor, but this
1297  // would require two passes through the column IDs, so we make it
1298  // the max of 100 and the number of block rows.
1299  //
1300  // FIXME (mfh 11 Feb 2015) Tpetra::Details::HashTable can hold at
1301  // most INT_MAX entries, but it's possible to have more rows than
1302  // that (if size_t is 64 bits and int is 32 bits).
1303  const size_t numMyRows = rowptr.size () - 1;
1304  const int hashsize = std::max (static_cast<int> (numMyRows), 100);
1305 
1306  Tpetra::Details::HashTable<GO, LO> RemoteGIDs (hashsize);
1307  Teuchos::Array<GO> RemoteGIDList;
1308  RemoteGIDList.reserve (hashsize);
1309  Teuchos::Array<int> PIDList;
1310  PIDList.reserve (hashsize);
1311 
1312  // Here we start using the *LocalOrdinal* colind_LID array. This is
1313  // safe even if both columnIndices arrays are actually the same
1314  // (because LocalOrdinal==GO). For *local* GID's set
1315  // colind_LID with with their LID in the domainMap. For *remote*
1316  // GIDs, we set colind_LID with (numDomainElements+NumRemoteColGIDs)
1317  // before the increment of the remote count. These numberings will
1318  // be separate because no local LID is greater than
1319  // numDomainElements.
1320 
1321  size_t NumLocalColGIDs = 0;
1322  LO NumRemoteColGIDs = 0;
1323  for (size_t i = 0; i < numMyRows; ++i) {
1324  for(size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1325  const GO GID = colind_GID[j];
1326  // Check if GID matches a row GID
1327  const LO LID = domainMap.getLocalElement (GID);
1328  if(LID != -1) {
1329  const bool alreadyFound = LocalGIDs[LID];
1330  if (! alreadyFound) {
1331  LocalGIDs[LID] = true; // There is a column in the graph associated with this domain map GID
1332  NumLocalColGIDs++;
1333  }
1334  colind_LID[j] = LID;
1335  }
1336  else {
1337  const LO hash_value = RemoteGIDs.get (GID);
1338  if (hash_value == -1) { // This means its a new remote GID
1339  const int PID = owningPIDs[j];
1340  TEUCHOS_TEST_FOR_EXCEPTION(
1341  PID == -1, std::invalid_argument, prefix << "Cannot figure out if "
1342  "PID is owned.");
1343  colind_LID[j] = static_cast<LO> (numDomainElements + NumRemoteColGIDs);
1344  RemoteGIDs.add (GID, NumRemoteColGIDs);
1345  RemoteGIDList.push_back (GID);
1346  PIDList.push_back (PID);
1347  NumRemoteColGIDs++;
1348  }
1349  else {
1350  colind_LID[j] = static_cast<LO> (numDomainElements + hash_value);
1351  }
1352  }
1353  }
1354  }
1355 
1356  // Possible short-circuit: If all domain map GIDs are present as
1357  // column indices, then set ColMap=domainMap and quit.
1358  if (domainMap.getComm ()->getSize () == 1) {
1359  // Sanity check: When there is only one process, there can be no
1360  // remoteGIDs.
1361  TEUCHOS_TEST_FOR_EXCEPTION(
1362  NumRemoteColGIDs != 0, std::runtime_error, prefix << "There is only one "
1363  "process in the domain Map's communicator, which means that there are no "
1364  "\"remote\" indices. Nevertheless, some column indices are not in the "
1365  "domain Map.");
1366  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1367  // In this case, we just use the domainMap's indices, which is,
1368  // not coincidently, what we clobbered colind with up above
1369  // anyway. No further reindexing is needed.
1370  colMap = domainMapRCP;
1371  return;
1372  }
1373  }
1374 
1375  // Now build the array containing column GIDs
1376  // Build back end, containing remote GIDs, first
1377  const LO numMyCols = NumLocalColGIDs + NumRemoteColGIDs;
1378  Teuchos::Array<GO> ColIndices;
1379  GO* RemoteColIndices = NULL;
1380  if (numMyCols > 0) {
1381  ColIndices.resize (numMyCols);
1382  if (NumLocalColGIDs != static_cast<size_t> (numMyCols)) {
1383  RemoteColIndices = &ColIndices[NumLocalColGIDs]; // Points to back half of ColIndices
1384  }
1385  }
1386 
1387  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1388  RemoteColIndices[i] = RemoteGIDList[i];
1389  }
1390 
1391  // Build permute array for *remote* reindexing.
1392  Teuchos::Array<LO> RemotePermuteIDs (NumRemoteColGIDs);
1393  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1394  RemotePermuteIDs[i]=i;
1395  }
1396 
1397  // Sort External column indices so that all columns coming from a
1398  // given remote processor are contiguous. This is a sort with two
1399  // auxillary arrays: RemoteColIndices and RemotePermuteIDs.
1400  Tpetra::sort3 (PIDList.begin (), PIDList.end (),
1401  ColIndices.begin () + NumLocalColGIDs,
1402  RemotePermuteIDs.begin ());
1403 
1404  // Stash the RemotePIDs.
1405  //
1406  // Note: If Teuchos::Array had a shrink_to_fit like std::vector,
1407  // we'd call it here.
1408  remotePIDs = PIDList;
1409 
1410  // Sort external column indices so that columns from a given remote
1411  // processor are not only contiguous but also in ascending
1412  // order. NOTE: I don't know if the number of externals associated
1413  // with a given remote processor is known at this point ... so I
1414  // count them here.
1415 
1416  // NTS: Only sort the RemoteColIndices this time...
1417  LO StartCurrent = 0, StartNext = 1;
1418  while (StartNext < NumRemoteColGIDs) {
1419  if (PIDList[StartNext]==PIDList[StartNext-1]) {
1420  StartNext++;
1421  }
1422  else {
1423  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1424  ColIndices.begin () + NumLocalColGIDs + StartNext,
1425  RemotePermuteIDs.begin () + StartCurrent);
1426  StartCurrent = StartNext;
1427  StartNext++;
1428  }
1429  }
1430  Tpetra::sort2 (ColIndices.begin () + NumLocalColGIDs + StartCurrent,
1431  ColIndices.begin () + NumLocalColGIDs + StartNext,
1432  RemotePermuteIDs.begin () + StartCurrent);
1433 
1434  // Reverse the permutation to get the information we actually care about
1435  Teuchos::Array<LO> ReverseRemotePermuteIDs (NumRemoteColGIDs);
1436  for (LO i = 0; i < NumRemoteColGIDs; ++i) {
1437  ReverseRemotePermuteIDs[RemotePermuteIDs[i]] = i;
1438  }
1439 
1440  // Build permute array for *local* reindexing.
1441  bool use_local_permute = false;
1442  Teuchos::Array<LO> LocalPermuteIDs (numDomainElements);
1443 
1444  // Now fill front end. Two cases:
1445  //
1446  // (1) If the number of Local column GIDs is the same as the number
1447  // of Local domain GIDs, we can simply read the domain GIDs into
1448  // the front part of ColIndices, otherwise
1449  //
1450  // (2) We step through the GIDs of the domainMap, checking to see if
1451  // each domain GID is a column GID. we want to do this to
1452  // maintain a consistent ordering of GIDs between the columns
1453  // and the domain.
1454  Teuchos::ArrayView<const GO> domainGlobalElements = domainMap.getNodeElementList();
1455  if (static_cast<size_t> (NumLocalColGIDs) == numDomainElements) {
1456  if (NumLocalColGIDs > 0) {
1457  // Load Global Indices into first numMyCols elements column GID list
1458  std::copy (domainGlobalElements.begin (), domainGlobalElements.end (),
1459  ColIndices.begin ());
1460  }
1461  }
1462  else {
1463  LO NumLocalAgain = 0;
1464  use_local_permute = true;
1465  for (size_t i = 0; i < numDomainElements; ++i) {
1466  if (LocalGIDs[i]) {
1467  LocalPermuteIDs[i] = NumLocalAgain;
1468  ColIndices[NumLocalAgain++] = domainGlobalElements[i];
1469  }
1470  }
1471  TEUCHOS_TEST_FOR_EXCEPTION(
1472  static_cast<size_t> (NumLocalAgain) != NumLocalColGIDs,
1473  std::runtime_error, prefix << "Local ID count test failed.");
1474  }
1475 
1476  // Make column Map
1477  const GST minus_one = Teuchos::OrdinalTraits<GST>::invalid ();
1478  colMap = rcp (new map_type (minus_one, ColIndices, domainMap.getIndexBase (),
1479  domainMap.getComm (), domainMap.getNode ()));
1480 
1481  // Low-cost reindex of the matrix
1482  for (size_t i = 0; i < numMyRows; ++i) {
1483  for (size_t j = rowptr[i]; j < rowptr[i+1]; ++j) {
1484  const LO ID = colind_LID[j];
1485  if (static_cast<size_t> (ID) < numDomainElements) {
1486  if (use_local_permute) {
1487  colind_LID[j] = LocalPermuteIDs[colind_LID[j]];
1488  }
1489  // In the case where use_local_permute==false, we just copy
1490  // the DomainMap's ordering, which it so happens is what we
1491  // put in colind_LID to begin with.
1492  }
1493  else {
1494  colind_LID[j] = NumLocalColGIDs + ReverseRemotePermuteIDs[colind_LID[j]-numDomainElements];
1495  }
1496  }
1497  }
1498 }
1499 
1500 } // namespace Import_Util
1501 } // namespace Tpetra
1502 
1503 // We can include the definitions for Tpetra::CrsMatrix now that the above
1504 // functions have been defined. For ETI, this isn't necessary, so we just
1505 // including the generated hpp
1506 #include "Tpetra_CrsMatrix.hpp"
1507 
1508 #endif // TPETRA_IMPORT_UTIL_HPP
void sortAndMergeCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort and merge the entries of the (raw CSR) matrix by column index within each row.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void unpackAndCombineIntoCrsArrays(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, int MyTargetPID, const Teuchos::ArrayView< size_t > &rowPointers, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices, const Teuchos::ArrayView< Scalar > &values, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort3(const IT1 &first1, const IT1 &last1, const IT2 &first2, const IT3 &first3)
Sort the first array, and apply the same permutation to the second and third arrays.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
Declaration of the Tpetra::CrsMatrix class.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
size_t unpackAndCombineWithOwningPIDsCount(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &importLIDs, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteToLIDs, const Teuchos::ArrayView< const LocalOrdinal > &permuteFromLIDs)
Special version of Tpetra::CrsMatrix::unpackAndCombine that also unpacks owning process ranks...
Kokkos::View< char *, D, Kokkos::MemoryUnmanaged > output_buffer_type
The type of an output buffer of bytes.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
void sortCrsEntries(const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< Ordinal > &CRS_colind, const Teuchos::ArrayView< Scalar > &CRS_vals)
Sort the entries of the (raw CSR) matrix by column index within each row.
void lowCommunicationMakeColMapAndReindex(const Teuchos::ArrayView< const size_t > &rowPointers, const Teuchos::ArrayView< LocalOrdinal > &columnIndices_LID, const Teuchos::ArrayView< GlobalOrdinal > &columnIndices_GID, const Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::ArrayView< const int > &owningPids, Teuchos::Array< int > &remotePids, Teuchos::RCP< const Tpetra::Map< LocalOrdinal, GlobalOrdinal, Node > > &colMap)
lowCommunicationMakeColMapAndReindex
size_t global_size_t
Global size_t object.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Kokkos::View< const value_type *, D, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
void packAndPrepareWithOwningPIDs(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &SourceMatrix, const Teuchos::ArrayView< const LocalOrdinal > &exportLIDs, Kokkos::DualView< char *, typename Node::device_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, size_t &constantNumPackets, Distributor &distor, const Teuchos::ArrayView< const int > &SourcePids)
Special version of Tpetra::CrsMatrix::packAndPrepare that also packs owning process ranks...
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
Kokkos::View< const char *, D, Kokkos::MemoryUnmanaged > input_buffer_type
The type of an input buffer of bytes.
size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const
Returns the current number of entries on this node in the specified local row.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
ValueType get(const KeyType key)
Get the value corresponding to the given key.
A parallel distribution of indices over processes.
void add(const KeyType key, const ValueType value)
Add a key and its value to the hash table.
Kokkos::View< value_type *, D, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
Stand-alone utility functions and macros.
Teuchos::RCP< const map_type > getColMap() const
The Map that describes the column distribution in this matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which the matrix is distributed.
bool isLocallyIndexed() const
Whether the matrix is locally indexed on the calling process.