Tpetra parallel linear algebra  Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
41 #define TPETRA_MULTIVECTOR_DEF_HPP
42 
50 
51 #include "Tpetra_Core.hpp"
52 #include "Tpetra_Util.hpp"
53 #include "Tpetra_Vector.hpp"
57 #include "Tpetra_Details_fill.hpp"
65 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
66 # include "Teuchos_SerialDenseMatrix.hpp"
67 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
68 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
69 #include "KokkosCompat_View.hpp"
70 #include "KokkosBlas.hpp"
71 #include "KokkosKernels_Utils.hpp"
72 #include "Kokkos_Random.hpp"
73 #include "Kokkos_ArithTraits.hpp"
74 #include <memory>
75 #include <sstream>
76 
77 #ifdef HAVE_TPETRA_INST_FLOAT128
78 namespace Kokkos {
79  // FIXME (mfh 04 Sep 2015) Just a stub for now!
80  template<class Generator>
81  struct rand<Generator, __float128> {
82  static KOKKOS_INLINE_FUNCTION __float128 max ()
83  {
84  return static_cast<__float128> (1.0);
85  }
86  static KOKKOS_INLINE_FUNCTION __float128
87  draw (Generator& gen)
88  {
89  // Half the smallest normalized double, is the scaling factor of
90  // the lower-order term in the double-double representation.
91  const __float128 scalingFactor =
92  static_cast<__float128> (std::numeric_limits<double>::min ()) /
93  static_cast<__float128> (2.0);
94  const __float128 higherOrderTerm = static_cast<__float128> (gen.drand ());
95  const __float128 lowerOrderTerm =
96  static_cast<__float128> (gen.drand ()) * scalingFactor;
97  return higherOrderTerm + lowerOrderTerm;
98  }
99  static KOKKOS_INLINE_FUNCTION __float128
100  draw (Generator& gen, const __float128& range)
101  {
102  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
103  const __float128 scalingFactor =
104  static_cast<__float128> (std::numeric_limits<double>::min ()) /
105  static_cast<__float128> (2.0);
106  const __float128 higherOrderTerm =
107  static_cast<__float128> (gen.drand (range));
108  const __float128 lowerOrderTerm =
109  static_cast<__float128> (gen.drand (range)) * scalingFactor;
110  return higherOrderTerm + lowerOrderTerm;
111  }
112  static KOKKOS_INLINE_FUNCTION __float128
113  draw (Generator& gen, const __float128& start, const __float128& end)
114  {
115  // FIXME (mfh 05 Sep 2015) Not sure if this is right.
116  const __float128 scalingFactor =
117  static_cast<__float128> (std::numeric_limits<double>::min ()) /
118  static_cast<__float128> (2.0);
119  const __float128 higherOrderTerm =
120  static_cast<__float128> (gen.drand (start, end));
121  const __float128 lowerOrderTerm =
122  static_cast<__float128> (gen.drand (start, end)) * scalingFactor;
123  return higherOrderTerm + lowerOrderTerm;
124  }
125  };
126 } // namespace Kokkos
127 #endif // HAVE_TPETRA_INST_FLOAT128
128 
129 namespace { // (anonymous)
130 
145  template<class ST, class LO, class GO, class NT>
147  allocDualView (const size_t lclNumRows,
148  const size_t numCols,
149  const bool zeroOut = true,
150  const bool allowPadding = false)
151  {
152  using ::Tpetra::Details::Behavior;
153  using Kokkos::AllowPadding;
154  using Kokkos::view_alloc;
155  using Kokkos::WithoutInitializing;
156  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
157  typedef typename dual_view_type::t_dev dev_view_type;
158  // This needs to be a string and not a char*, if given as an
159  // argument to Kokkos::view_alloc. This is because view_alloc
160  // also allows a raw pointer as its first argument. See
161  // https://github.com/kokkos/kokkos/issues/434.
162  const std::string label ("MV::DualView");
163  const bool debug = Behavior::debug ();
164 
165  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
166  // creation of the DualView's Views works around
167  // Kokkos::DualView's current inability to accept an
168  // AllocationProperties initial argument (as Kokkos::View does).
169  // However, the work-around is harmless, since it does what the
170  // (currently nonexistent) equivalent DualView constructor would
171  // have done anyway.
172 
173  dev_view_type d_view;
174  if (zeroOut) {
175  if (allowPadding) {
176  d_view = dev_view_type (view_alloc (label, AllowPadding),
177  lclNumRows, numCols);
178  }
179  else {
180  d_view = dev_view_type (view_alloc (label),
181  lclNumRows, numCols);
182  }
183  }
184  else {
185  if (allowPadding) {
186  d_view = dev_view_type (view_alloc (label,
187  WithoutInitializing,
188  AllowPadding),
189  lclNumRows, numCols);
190  }
191  else {
192  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
193  lclNumRows, numCols);
194  }
195  if (debug) {
196  // Filling with NaN is a cheap and effective way to tell if
197  // downstream code is trying to use a MultiVector's data
198  // without them having been initialized. ArithTraits lets us
199  // call nan() even if the scalar type doesn't define it; it
200  // just returns some undefined value in the latter case. This
201  // won't hurt anything because by setting zeroOut=false, users
202  // already agreed that they don't care about the contents of
203  // the MultiVector.
204  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
205  KokkosBlas::fill (d_view, nan);
206  }
207  }
208  if (debug) {
209  TEUCHOS_TEST_FOR_EXCEPTION
210  (static_cast<size_t> (d_view.extent (0)) != lclNumRows ||
211  static_cast<size_t> (d_view.extent (1)) != numCols, std::logic_error,
212  "allocDualView: d_view's dimensions actual dimensions do not match "
213  "requested dimensions. d_view is " << d_view.extent (0) << " x " <<
214  d_view.extent (1) << "; requested " << lclNumRows << " x " << numCols
215  << ". Please report this bug to the Tpetra developers.");
216  }
217 
218  dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
219  // Whether or not the user cares about the initial contents of the
220  // MultiVector, the device and host views are out of sync. We
221  // prefer to work in device memory. The way to ensure this
222  // happens is to mark the device view as modified.
223  dv.modify_device ();
224 
225  return dv;
226  }
227 
228  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
229  //
230  // T: The type of the entries of the View.
231  // ExecSpace: The Kokkos execution space.
232  template<class T, class ExecSpace>
233  struct MakeUnmanagedView {
234  // The 'false' part of the branch carefully ensures that this
235  // won't attempt to use a host execution space that hasn't been
236  // initialized. For example, if Kokkos::OpenMP is disabled and
237  // Kokkos::Threads is enabled, the latter is always the default
238  // execution space of Kokkos::HostSpace, even when ExecSpace is
239  // Kokkos::Serial. That's why we go through the trouble of asking
240  // Kokkos::DualView what _its_ space is. That seems to work
241  // around this default execution space issue.
242  //
243  typedef typename Kokkos::Impl::if_c<
244  Kokkos::Impl::SpaceAccessibility<
245  typename ExecSpace::memory_space,
246  Kokkos::HostSpace>::accessible,
247  typename ExecSpace::device_type,
248  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
249  typedef Kokkos::LayoutLeft array_layout;
250  typedef Kokkos::View<T*, array_layout, host_exec_space,
251  Kokkos::MemoryUnmanaged> view_type;
252 
253  static view_type getView (const Teuchos::ArrayView<T>& x_in)
254  {
255  const size_t numEnt = static_cast<size_t> (x_in.size ());
256  if (numEnt == 0) {
257  return view_type ();
258  } else {
259  return view_type (x_in.getRawPtr (), numEnt);
260  }
261  }
262  };
263 
264  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
265  // taking a subview of a 0 x N DualView incorrectly always results
266  // in a 0 x 0 DualView.
267  template<class DualViewType>
268  DualViewType
269  takeSubview (const DualViewType& X,
270  const Kokkos::Impl::ALL_t&,
271  const std::pair<size_t, size_t>& colRng)
272  {
273  if (X.extent (0) == 0 && X.extent (1) != 0) {
274  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
275  }
276  else {
277  return subview (X, Kokkos::ALL (), colRng);
278  }
279  }
280 
281  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
282  // taking a subview of a 0 x N DualView incorrectly always results
283  // in a 0 x 0 DualView.
284  template<class DualViewType>
285  DualViewType
286  takeSubview (const DualViewType& X,
287  const std::pair<size_t, size_t>& rowRng,
288  const std::pair<size_t, size_t>& colRng)
289  {
290  if (X.extent (0) == 0 && X.extent (1) != 0) {
291  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
292  }
293  else {
294  return subview (X, rowRng, colRng);
295  }
296  }
297 
298  template<class DualViewType>
299  size_t
300  getDualViewStride (const DualViewType& dv)
301  {
302  // FIXME (mfh 15 Mar 2019) DualView doesn't have a stride
303  // method yet, but its Views do.
304  const size_t LDA = dv.d_view.stride (1);
305  const size_t numRows = dv.extent (0);
306 
307  if (LDA == 0) {
308  return (numRows == 0) ? size_t (1) : numRows;
309  }
310  else {
311  return LDA;
312  }
313  }
314 
315  template<class ViewType>
316  size_t
317  getViewStride (const ViewType& view)
318  {
319  const size_t LDA = view.stride (1);
320  const size_t numRows = view.extent (0);
321 
322  if (LDA == 0) {
323  return (numRows == 0) ? size_t (1) : numRows;
324  }
325  else {
326  return LDA;
327  }
328  }
329 
330  template <class impl_scalar_type, class buffer_device_type>
331  bool
332  runKernelOnHost (
333  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports
334  )
335  {
336  if (! imports.need_sync_device ()) {
337  return false; // most up-to-date on device
338  }
339  else { // most up-to-date on host,
340  // but if large enough, worth running on device anyway
341  size_t localLengthThreshold =
343  return imports.extent(0) <= localLengthThreshold;
344  }
345  }
346 
347 
348  template <class SC, class LO, class GO, class NT>
349  bool
350  runKernelOnHost (const ::Tpetra::MultiVector<SC, LO, GO, NT>& X)
351  {
352  if (! X.need_sync_device ()) {
353  return false; // most up-to-date on device
354  }
355  else { // most up-to-date on host
356  // but if large enough, worth running on device anyway
357  size_t localLengthThreshold =
359  return X.getLocalLength () <= localLengthThreshold;
360  }
361  }
362 
363  template <class SC, class LO, class GO, class NT>
364  void
365  multiVectorNormImpl (typename ::Tpetra::MultiVector<SC, LO, GO, NT>::mag_type norms[],
368  {
369  using namespace Tpetra;
372  using val_type = typename MV::impl_scalar_type;
373  using mag_type = typename MV::mag_type;
374  using dual_view_type = typename MV::dual_view_type;
375 
376  auto map = X.getMap ();
377  auto comm = map.is_null () ? nullptr : map->getComm ().getRawPtr ();
378  auto whichVecs = getMultiVectorWhichVectors (X);
379  const bool isConstantStride = X.isConstantStride ();
380  const bool isDistributed = X.isDistributed ();
381 
382  const bool runOnHost = runKernelOnHost (X);
383  if (runOnHost) {
384  using view_type = typename dual_view_type::t_host;
385  using array_layout = typename view_type::array_layout;
386  using device_type = typename view_type::device_type;
387 
388  auto X_lcl = X.getLocalViewHost(Access::ReadOnly);
389  normImpl<val_type, array_layout, device_type,
390  mag_type> (norms, X_lcl, whichNorm, whichVecs,
391  isConstantStride, isDistributed, comm);
392  }
393  else {
394  using view_type = typename dual_view_type::t_dev;
395  using array_layout = typename view_type::array_layout;
396  using device_type = typename view_type::device_type;
397 
398  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
399  normImpl<val_type, array_layout, device_type,
400  mag_type> (norms, X_lcl, whichNorm, whichVecs,
401  isConstantStride, isDistributed, comm);
402  }
403  }
404 } // namespace (anonymous)
405 
406 
407 namespace Tpetra {
408 
409  namespace Details {
410  template <typename DstView, typename SrcView>
411  struct AddAssignFunctor {
412  // This functor would be better as a lambda, but CUDA cannot compile
413  // lambdas in protected functions. It compiles fine with the functor.
414  AddAssignFunctor(DstView &tgt_, SrcView &src_) : tgt(tgt_), src(src_) {}
415 
416  KOKKOS_INLINE_FUNCTION void
417  operator () (const size_t k) const { tgt(k) += src(k); }
418 
419  DstView tgt;
420  SrcView src;
421  };
422  }
423 
424  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
425  bool
427  vectorIndexOutOfRange (const size_t VectorIndex) const {
428  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
429  }
430 
431  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
433  MultiVector () :
434  base_type (Teuchos::rcp (new map_type ()))
435  {}
436 
437  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
439  MultiVector (const Teuchos::RCP<const map_type>& map,
440  const size_t numVecs,
441  const bool zeroOut) : /* default is true */
442  base_type (map)
443  {
444  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
445 
446  const size_t lclNumRows = this->getLocalLength ();
447  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
448  origView_ = view_;
449  owningView_ = view_;
450  }
451 
452  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
455  const Teuchos::DataAccess copyOrView) :
456  base_type (source),
457  view_ (source.view_),
458  origView_ (source.origView_),
459  owningView_ (source.owningView_),
460  whichVectors_ (source.whichVectors_)
461  {
463  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
464  "const Teuchos::DataAccess): ";
465 
466  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
467 
468  if (copyOrView == Teuchos::Copy) {
469  // Reuse the conveniently already existing function that creates
470  // a deep copy.
471  MV cpy = createCopy (source);
472  this->view_ = cpy.view_;
473  this->origView_ = cpy.origView_;
474  this->owningView_ = cpy.owningView_;
475  this->whichVectors_ = cpy.whichVectors_;
476  }
477  else if (copyOrView == Teuchos::View) {
478  }
479  else {
480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
481  true, std::invalid_argument, "Second argument 'copyOrView' has an "
482  "invalid value " << copyOrView << ". Valid values include "
483  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
484  << Teuchos::View << ".");
485  }
486  }
487 
488  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
490  MultiVector (const Teuchos::RCP<const map_type>& map,
491  const dual_view_type& view) :
492  base_type (map),
493  view_ (view),
494  origView_ (view),
495  owningView_ (view)
496  {
497  const char tfecfFuncName[] = "MultiVector(Map,DualView): ";
498  const size_t lclNumRows_map = map.is_null () ? size_t (0) :
499  map->getNodeNumElements ();
500  const size_t lclNumRows_view = view.extent (0);
501  const size_t LDA = getDualViewStride (view);
502 
503  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
504  (LDA < lclNumRows_map || lclNumRows_map != lclNumRows_view,
505  std::invalid_argument, "Kokkos::DualView does not match Map. "
506  "map->getNodeNumElements() = " << lclNumRows_map
507  << ", view.extent(0) = " << lclNumRows_view
508  << ", and getStride() = " << LDA << ".");
509 
510  using ::Tpetra::Details::Behavior;
511  const bool debug = Behavior::debug ();
512  if (debug) {
513  using ::Tpetra::Details::checkGlobalDualViewValidity;
514  std::ostringstream gblErrStrm;
515  const bool verbose = Behavior::verbose ();
516  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
517  const bool gblValid =
518  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
519  comm.getRawPtr ());
520  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
521  (! gblValid, std::runtime_error, gblErrStrm.str ());
522  }
523  }
524 
525 
526  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
528  MultiVector (const Teuchos::RCP<const map_type>& map,
529  const typename dual_view_type::t_dev& d_view) :
530  base_type (map)
531  {
532  using Teuchos::ArrayRCP;
533  using Teuchos::RCP;
534  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
535 
536  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
537 
538  const size_t LDA = getViewStride (d_view);
539  const size_t lclNumRows = map->getNodeNumElements ();
540  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
541  (LDA < lclNumRows, std::invalid_argument, "Map does not match "
542  "Kokkos::View. map->getNodeNumElements() = " << lclNumRows
543  << ", View's column stride = " << LDA
544  << ", and View's extent(0) = " << d_view.extent (0) << ".");
545 
546  auto h_view = Kokkos::create_mirror_view (d_view);
547  view_ = dual_view_type (d_view, h_view);
548 
549  using ::Tpetra::Details::Behavior;
550  const bool debug = Behavior::debug ();
551  if (debug) {
552  using ::Tpetra::Details::checkGlobalDualViewValidity;
553  std::ostringstream gblErrStrm;
554  const bool verbose = Behavior::verbose ();
555  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
556  const bool gblValid =
557  checkGlobalDualViewValidity (&gblErrStrm, view_, verbose,
558  comm.getRawPtr ());
559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
560  (! gblValid, std::runtime_error, gblErrStrm.str ());
561  }
562  // The user gave us a device view. In order to respect its
563  // initial contents, we mark the DualView as "modified on device."
564  // That way, the next sync will synchronize it with the host view.
565  view_.modify_device ();
566  origView_ = view_;
567  owningView_ = view_;
568  }
569 
570  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
572  MultiVector (const Teuchos::RCP<const map_type>& map,
573  const dual_view_type& view,
574  const dual_view_type& origView) :
575  base_type (map),
576  view_ (view),
577  origView_ (origView),
578  owningView_ (origView)
579  {
580  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
581 
582  const size_t LDA = getDualViewStride (origView);
583  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
584  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
585  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
586  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
587  << ". This may also mean that the input origView's first dimension (number "
588  "of rows = " << origView.extent (0) << ") does not not match the number "
589  "of entries in the Map on the calling process.");
590 
591  using ::Tpetra::Details::Behavior;
592  const bool debug = Behavior::debug ();
593  if (debug) {
594  using ::Tpetra::Details::checkGlobalDualViewValidity;
595  std::ostringstream gblErrStrm;
596  const bool verbose = Behavior::verbose ();
597  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
598  const bool gblValid_0 =
599  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
600  comm.getRawPtr ());
601  const bool gblValid_1 =
602  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
603  comm.getRawPtr ());
604  const bool gblValid = gblValid_0 && gblValid_1;
605  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
606  (! gblValid, std::runtime_error, gblErrStrm.str ());
607  }
608  }
609 
610 
611  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
613  MultiVector (const Teuchos::RCP<const map_type>& map,
614  const dual_view_type& view,
615  const Teuchos::ArrayView<const size_t>& whichVectors) :
616  base_type (map),
617  view_ (view),
618  origView_ (view),
619  owningView_ (view),
620  whichVectors_ (whichVectors.begin (), whichVectors.end ())
621  {
622  using Kokkos::ALL;
623  using Kokkos::subview;
624  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
625 
626  using ::Tpetra::Details::Behavior;
627  const bool debug = Behavior::debug ();
628  if (debug) {
629  using ::Tpetra::Details::checkGlobalDualViewValidity;
630  std::ostringstream gblErrStrm;
631  const bool verbose = Behavior::verbose ();
632  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
633  const bool gblValid =
634  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
635  comm.getRawPtr ());
636  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
637  (! gblValid, std::runtime_error, gblErrStrm.str ());
638  }
639 
640  const size_t lclNumRows = map.is_null () ? size_t (0) :
641  map->getNodeNumElements ();
642  // Check dimensions of the input DualView. We accept that Kokkos
643  // might not allow construction of a 0 x m (Dual)View with m > 0,
644  // so we only require the number of rows to match if the
645  // (Dual)View has more than zero columns. Likewise, we only
646  // require the number of columns to match if the (Dual)View has
647  // more than zero rows.
648  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
649  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
650  std::invalid_argument, "view.extent(0) = " << view.extent (0)
651  << " < map->getNodeNumElements() = " << lclNumRows << ".");
652  if (whichVectors.size () != 0) {
653  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
654  view.extent (1) != 0 && view.extent (1) == 0,
655  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
656  " = " << whichVectors.size () << " > 0.");
657  size_t maxColInd = 0;
658  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
659  for (size_type k = 0; k < whichVectors.size (); ++k) {
660  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
661  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
662  std::invalid_argument, "whichVectors[" << k << "] = "
663  "Teuchos::OrdinalTraits<size_t>::invalid().");
664  maxColInd = std::max (maxColInd, whichVectors[k]);
665  }
666  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
667  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
668  std::invalid_argument, "view.extent(1) = " << view.extent (1)
669  << " <= max(whichVectors) = " << maxColInd << ".");
670  }
671 
672  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
673  // zero strides, so modify in that case.
674  const size_t LDA = getDualViewStride (view);
675  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
676  (LDA < lclNumRows, std::invalid_argument,
677  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
678 
679  if (whichVectors.size () == 1) {
680  // If whichVectors has only one entry, we don't need to bother
681  // with nonconstant stride. Just shift the view over so it
682  // points to the desired column.
683  //
684  // NOTE (mfh 10 May 2014) This is a special case where we set
685  // origView_ just to view that one column, not all of the
686  // original columns. This ensures that the use of origView_ in
687  // offsetView works correctly.
688  //
689  // However, owningView_ is not a subview.
690  const std::pair<size_t, size_t> colRng (whichVectors[0],
691  whichVectors[0] + 1);
692  origView_ = view_;
693  view_ = takeSubview (view_, ALL (), colRng);
694  // whichVectors_.size() == 0 means "constant stride."
695  whichVectors_.clear ();
696  }
697  }
698 
699  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
701  MultiVector (const Teuchos::RCP<const map_type>& map,
702  const dual_view_type& view,
703  const dual_view_type& origView,
704  const Teuchos::ArrayView<const size_t>& whichVectors) :
705  base_type (map),
706  view_ (view),
707  origView_ (origView),
708  owningView_ (origView),
709  whichVectors_ (whichVectors.begin (), whichVectors.end ())
710  {
711  using Kokkos::ALL;
712  using Kokkos::subview;
713  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
714 
715  using ::Tpetra::Details::Behavior;
716  const bool debug = Behavior::debug ();
717  if (debug) {
718  using ::Tpetra::Details::checkGlobalDualViewValidity;
719  std::ostringstream gblErrStrm;
720  const bool verbose = Behavior::verbose ();
721  const auto comm = map.is_null () ? Teuchos::null : map->getComm ();
722  const bool gblValid_0 =
723  checkGlobalDualViewValidity (&gblErrStrm, view, verbose,
724  comm.getRawPtr ());
725  const bool gblValid_1 =
726  checkGlobalDualViewValidity (&gblErrStrm, origView, verbose,
727  comm.getRawPtr ());
728  const bool gblValid = gblValid_0 && gblValid_1;
729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
730  (! gblValid, std::runtime_error, gblErrStrm.str ());
731  }
732 
733  const size_t lclNumRows = this->getLocalLength ();
734  // Check dimensions of the input DualView. We accept that Kokkos
735  // might not allow construction of a 0 x m (Dual)View with m > 0,
736  // so we only require the number of rows to match if the
737  // (Dual)View has more than zero columns. Likewise, we only
738  // require the number of columns to match if the (Dual)View has
739  // more than zero rows.
740  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
741  view.extent (1) != 0 && static_cast<size_t> (view.extent (0)) < lclNumRows,
742  std::invalid_argument, "view.extent(0) = " << view.extent (0)
743  << " < map->getNodeNumElements() = " << lclNumRows << ".");
744  if (whichVectors.size () != 0) {
745  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
746  view.extent (1) != 0 && view.extent (1) == 0,
747  std::invalid_argument, "view.extent(1) = 0, but whichVectors.size()"
748  " = " << whichVectors.size () << " > 0.");
749  size_t maxColInd = 0;
750  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
751  for (size_type k = 0; k < whichVectors.size (); ++k) {
752  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
753  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
754  std::invalid_argument, "whichVectors[" << k << "] = "
755  "Teuchos::OrdinalTraits<size_t>::invalid().");
756  maxColInd = std::max (maxColInd, whichVectors[k]);
757  }
758  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
759  view.extent (1) != 0 && static_cast<size_t> (view.extent (1)) <= maxColInd,
760  std::invalid_argument, "view.extent(1) = " << view.extent (1)
761  << " <= max(whichVectors) = " << maxColInd << ".");
762  }
763 
764  // If extent(1) is 0, the stride might be 0. BLAS doesn't like
765  // zero strides, so modify in that case.
766  const size_t LDA = getDualViewStride (origView);
767  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
768  (LDA < lclNumRows, std::invalid_argument, "Map and DualView origView "
769  "do not match. LDA = " << LDA << " < this->getLocalLength() = " <<
770  lclNumRows << ". origView.extent(0) = " << origView.extent(0)
771  << ", origView.stride(1) = " << origView.d_view.stride(1) << ".");
772 
773  if (whichVectors.size () == 1) {
774  // If whichVectors has only one entry, we don't need to bother
775  // with nonconstant stride. Just shift the view over so it
776  // points to the desired column.
777  //
778  // NOTE (mfh 10 May 2014) This is a special case where we set
779  // origView_ just to view that one column, not all of the
780  // original columns. This ensures that the use of origView_ in
781  // offsetView works correctly.
782  const std::pair<size_t, size_t> colRng (whichVectors[0],
783  whichVectors[0] + 1);
784  view_ = takeSubview (view_, ALL (), colRng);
785  origView_ = takeSubview (origView_, ALL (), colRng);
786  // whichVectors_.size() == 0 means "constant stride."
787  whichVectors_.clear ();
788  }
789  }
790 
791  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
793  MultiVector (const Teuchos::RCP<const map_type>& map,
794  const Teuchos::ArrayView<const Scalar>& data,
795  const size_t LDA,
796  const size_t numVecs) :
797  base_type (map)
798  {
799  typedef LocalOrdinal LO;
800  typedef GlobalOrdinal GO;
801  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
802  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
803 
804  // Deep copy constructor, constant stride (NO whichVectors_).
805  // There is no need for a deep copy constructor with nonconstant stride.
806 
807  const size_t lclNumRows =
808  map.is_null () ? size_t (0) : map->getNodeNumElements ();
809  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
810  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
811  "map->getNodeNumElements() = " << lclNumRows << ".");
812  if (numVecs != 0) {
813  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
814  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
815  (static_cast<size_t> (data.size ()) < minNumEntries,
816  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
817  "entries, given the input Map and number of vectors in the MultiVector."
818  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
819  "map->getNodeNumElements () = " << minNumEntries << ".");
820  }
821 
822  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
823  //Note: X_out will be completely overwritten
824  auto X_out = this->getLocalViewDevice(Access::OverwriteAll);
825  origView_ = view_;
826  owningView_ = view_;
827 
828  // Make an unmanaged host Kokkos::View of the input data. First
829  // create a View (X_in_orig) with the original stride. Then,
830  // create a subview (X_in) with the right number of columns.
831  const impl_scalar_type* const X_in_raw =
832  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
833  Kokkos::View<const impl_scalar_type**,
834  Kokkos::LayoutLeft,
835  Kokkos::HostSpace,
836  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
837  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
838  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
839 
840  // If LDA != X_out's column stride, then we need to copy one
841  // column at a time; Kokkos::deep_copy refuses to work in that
842  // case.
843  const size_t outStride =
844  X_out.extent (1) == 0 ? size_t (1) : X_out.stride (1);
845  if (LDA == outStride) { // strides are the same; deep_copy once
846  // This only works because MultiVector uses LayoutLeft.
847  // We would need a custom copy functor otherwise.
848  Kokkos::deep_copy (X_out, X_in);
849  }
850  else { // strides differ; copy one column at a time
851  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
852  out_col_view_type;
853  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
854  in_col_view_type;
855  for (size_t j = 0; j < numVecs; ++j) {
856  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
857  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
858  Kokkos::deep_copy (X_out_j, X_in_j);
859  }
860  }
861  }
862 
863  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
865  MultiVector (const Teuchos::RCP<const map_type>& map,
866  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
867  const size_t numVecs) :
868  base_type (map)
869  {
870  typedef impl_scalar_type IST;
871  typedef LocalOrdinal LO;
872  typedef GlobalOrdinal GO;
873  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
874  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
875 
876  const size_t lclNumRows =
877  map.is_null () ? size_t (0) : map->getNodeNumElements ();
878  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
879  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
880  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
881  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
882  for (size_t j = 0; j < numVecs; ++j) {
883  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
884  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
885  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
886  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
887  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
888  << ".");
889  }
890 
891  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
892  auto X_out = this->getLocalViewDevice(Access::ReadWrite);
893 
894  // Make sure that the type of a single input column has the same
895  // array layout as each output column does, so we can deep_copy.
896  using array_layout = typename decltype (X_out)::array_layout;
897  using input_col_view_type = typename Kokkos::View<const IST*,
898  array_layout,
899  Kokkos::HostSpace,
900  Kokkos::MemoryUnmanaged>;
901 
902  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
903  for (size_t j = 0; j < numVecs; ++j) {
904  Teuchos::ArrayView<const IST> X_j_av =
905  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
906  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
907  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
908  Kokkos::deep_copy (X_j_out, X_j_in);
909  }
910  origView_ = view_;
912  }
913 
914  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
916  isConstantStride () const {
917  return whichVectors_.empty ();
918  }
919 
920  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921  size_t
923  getLocalLength () const
924  {
925  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
926  return static_cast<size_t> (0);
927  } else {
928  return this->getMap ()->getNodeNumElements ();
929  }
930  }
931 
932  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
935  getGlobalLength () const
936  {
937  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
938  return static_cast<size_t> (0);
939  } else {
940  return this->getMap ()->getGlobalNumElements ();
941  }
942  }
943 
944  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
945  size_t
947  getStride () const
948  {
949  return isConstantStride () ? getDualViewStride (origView_) : size_t (0);
950  }
951 
952  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
953  bool
956  {
957  //Don't actually get a view, just get pointers.
958  auto thisData = view_.h_view.data();
959  auto otherData = other.view_.h_view.data();
960  size_t thisSpan = view_.h_view.span();
961  size_t otherSpan = other.view_.h_view.span();
962  return (otherData <= thisData && thisData < otherData + otherSpan)
963  || (thisData <= otherData && otherData < thisData + thisSpan);
964  }
965 
966  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
967  bool
969  checkSizes (const SrcDistObject& sourceObj)
970  {
971  // Check whether the source object is a MultiVector. If not, then
972  // we can't even compare sizes, so it's definitely not OK to
973  // Import or Export from it.
975  const MV* src = dynamic_cast<const MV*> (&sourceObj);
976  if (src == nullptr) {
977  return false;
978  }
979  else {
980  // The target of the Import or Export calls checkSizes() in
981  // DistObject::doTransfer(). By that point, we've already
982  // constructed an Import or Export object using the two
983  // multivectors' Maps, which means that (hopefully) we've
984  // already checked other attributes of the multivectors. Thus,
985  // all we need to do here is check the number of columns.
986  return src->getNumVectors () == this->getNumVectors ();
987  }
988  }
989 
990  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
991  size_t
993  constantNumberOfPackets () const {
994  return this->getNumVectors ();
995  }
996 
997  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
998  void
1001  (const SrcDistObject& sourceObj,
1002  const size_t numSameIDs,
1003  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1004  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1005  const CombineMode CM)
1006  {
1007  using ::Tpetra::Details::Behavior;
1009  using ::Tpetra::Details::ProfilingRegion;
1010  using std::endl;
1011  using KokkosRefactor::Details::permute_array_multi_column;
1012  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
1013  using Kokkos::Compat::create_const_view;
1015  const char tfecfFuncName[] = "copyAndPermute: ";
1016  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
1017 
1018  const bool verbose = Behavior::verbose ();
1019  std::unique_ptr<std::string> prefix;
1020  if (verbose) {
1021  auto map = this->getMap ();
1022  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1023  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1024  std::ostringstream os;
1025  os << "Proc " << myRank << ": MV::copyAndPermute: ";
1026  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1027  os << "Start" << endl;
1028  std::cerr << os.str ();
1029  }
1030 
1031  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1032  (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0),
1033  std::logic_error, "permuteToLIDs.extent(0) = "
1034  << permuteToLIDs.extent (0) << " != permuteFromLIDs.extent(0) = "
1035  << permuteFromLIDs.extent (0) << ".");
1036 
1037  // We've already called checkSizes(), so this cast must succeed.
1038  MV& sourceMV = const_cast<MV &>(dynamic_cast<const MV&> (sourceObj));
1039  const size_t numCols = this->getNumVectors ();
1040 
1041  // sourceMV doesn't belong to us, so we can't sync it. Do the
1042  // copying where it's currently sync'd.
1043  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1044  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1045  std::logic_error, "Input MultiVector needs sync to both host "
1046  "and device.");
1047  const bool copyOnHost = runKernelOnHost(sourceMV);
1048  if (verbose) {
1049  std::ostringstream os;
1050  os << *prefix << "copyOnHost=" << (copyOnHost ? "true" : "false") << endl;
1051  std::cerr << os.str ();
1052  }
1053 
1054  if (verbose) {
1055  std::ostringstream os;
1056  os << *prefix << "Copy" << endl;
1057  std::cerr << os.str ();
1058  }
1059 
1060  // TODO (mfh 15 Sep 2013) When we replace
1061  // KokkosClassic::MultiVector with a Kokkos::View, there are two
1062  // ways to copy the data:
1063  //
1064  // 1. Get a (sub)view of each column and call deep_copy on that.
1065  // 2. Write a custom kernel to copy the data.
1066  //
1067  // The first is easier, but the second might be more performant in
1068  // case we decide to use layouts other than LayoutLeft. It might
1069  // even make sense to hide whichVectors_ in an entirely new layout
1070  // for Kokkos Views.
1071 
1072  // Copy rows [0, numSameIDs-1] of the local multivectors.
1073  //
1074  // Note (ETP 2 Jul 2014) We need to always copy one column at a
1075  // time, even when both multivectors are constant-stride, since
1076  // deep_copy between strided subviews with more than one column
1077  // doesn't currently work.
1078 
1079  // FIXME (mfh 04 Feb 2019) Need to optimize for the case where
1080  // both source and target are constant stride and have multiple
1081  // columns.
1082  if (numSameIDs > 0) {
1083  const std::pair<size_t, size_t> rows (0, numSameIDs);
1084  if (copyOnHost) {
1085  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1086  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1087 
1088  for (size_t j = 0; j < numCols; ++j) {
1089  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1090  const size_t srcCol =
1091  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1092 
1093  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
1094  auto src_j = Kokkos::subview (src_h, rows, srcCol);
1095  if (CM == ADD_ASSIGN) {
1096  // Sum src_j into tgt_j
1097  using range_t =
1098  Kokkos::RangePolicy<typename Node::execution_space, size_t>;
1099  range_t rp(0,numSameIDs);
1100  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1101  aaf(tgt_j, src_j);
1102  Kokkos::parallel_for(rp, aaf);
1103  }
1104  else {
1105  // Copy src_j into tgt_j
1106  Kokkos::deep_copy (tgt_j, src_j);
1107  }
1108  }
1109  }
1110  else { // copy on device
1111  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1112  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1113 
1114  for (size_t j = 0; j < numCols; ++j) {
1115  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
1116  const size_t srcCol =
1117  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
1118 
1119  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
1120  auto src_j = Kokkos::subview (src_d, rows, srcCol);
1121  if (CM == ADD_ASSIGN) {
1122  // Sum src_j into tgt_j
1123  using range_t =
1124  Kokkos::RangePolicy<typename Node::execution_space, size_t>;
1125  range_t rp(0,numSameIDs);
1126  Tpetra::Details::AddAssignFunctor<decltype(tgt_j), decltype(src_j)>
1127  aaf(tgt_j, src_j);
1128  Kokkos::parallel_for(rp, aaf);
1129  }
1130  else {
1131  // Copy src_j into tgt_j
1132  Kokkos::deep_copy (tgt_j, src_j);
1133  }
1134  }
1135  }
1136  }
1137 
1138 
1139  // For the remaining GIDs, execute the permutations. This may
1140  // involve noncontiguous access of both source and destination
1141  // vectors, depending on the LID lists.
1142  //
1143  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
1144  // the same process, this merges their values by replacement of
1145  // the last encountered GID, not by the specified merge rule
1146  // (such as ADD).
1147 
1148  // If there are no permutations, we are done
1149  if (permuteFromLIDs.extent (0) == 0 ||
1150  permuteToLIDs.extent (0) == 0) {
1151  if (verbose) {
1152  std::ostringstream os;
1153  os << *prefix << "No permutations. Done!" << endl;
1154  std::cerr << os.str ();
1155  }
1156  return;
1157  }
1158 
1159  if (verbose) {
1160  std::ostringstream os;
1161  os << *prefix << "Permute" << endl;
1162  std::cerr << os.str ();
1163  }
1164 
1165  // We could in theory optimize for the case where exactly one of
1166  // them is constant stride, but we don't currently do that.
1167  const bool nonConstStride =
1168  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
1169 
1170  if (verbose) {
1171  std::ostringstream os;
1172  os << *prefix << "nonConstStride="
1173  << (nonConstStride ? "true" : "false") << endl;
1174  std::cerr << os.str ();
1175  }
1176 
1177  // We only need the "which vectors" arrays if either the source or
1178  // target MV is not constant stride. Since we only have one
1179  // kernel that must do double-duty, we have to create a "which
1180  // vectors" array for the MV that _is_ constant stride.
1181  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
1182  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
1183  if (nonConstStride) {
1184  if (this->whichVectors_.size () == 0) {
1185  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
1186  tmpTgt.modify_host ();
1187  for (size_t j = 0; j < numCols; ++j) {
1188  tmpTgt.h_view(j) = j;
1189  }
1190  if (! copyOnHost) {
1191  tmpTgt.sync_device ();
1192  }
1193  tgtWhichVecs = tmpTgt;
1194  }
1195  else {
1196  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
1197  tgtWhichVecs =
1198  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
1199  "tgtWhichVecs",
1200  copyOnHost);
1201  }
1202  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1203  (static_cast<size_t> (tgtWhichVecs.extent (0)) !=
1204  this->getNumVectors (),
1205  std::logic_error, "tgtWhichVecs.extent(0) = " <<
1206  tgtWhichVecs.extent (0) << " != this->getNumVectors() = " <<
1207  this->getNumVectors () << ".");
1209  if (sourceMV.whichVectors_.size () == 0) {
1210  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
1211  tmpSrc.modify_host ();
1212  for (size_t j = 0; j < numCols; ++j) {
1213  tmpSrc.h_view(j) = j;
1214  }
1215  if (! copyOnHost) {
1216  tmpSrc.sync_device ();
1217  }
1218  srcWhichVecs = tmpSrc;
1219  }
1220  else {
1221  Teuchos::ArrayView<const size_t> srcWhichVecsT =
1222  sourceMV.whichVectors_ ();
1223  srcWhichVecs =
1224  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
1225  "srcWhichVecs",
1226  copyOnHost);
1227  }
1228  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1229  (static_cast<size_t> (srcWhichVecs.extent (0)) !=
1230  sourceMV.getNumVectors (), std::logic_error,
1231  "srcWhichVecs.extent(0) = " << srcWhichVecs.extent (0)
1232  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
1233  << ".");
1234  }
1235 
1236  if (copyOnHost) { // permute on host too
1237  if (verbose) {
1238  std::ostringstream os;
1239  os << *prefix << "Get permute LIDs on host" << std::endl;
1240  std::cerr << os.str ();
1241  }
1242  auto tgt_h = this->getLocalViewHost(Access::ReadWrite);
1243  auto src_h = sourceMV.getLocalViewHost(Access::ReadOnly);
1244 
1245  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
1246  auto permuteToLIDs_h = create_const_view (permuteToLIDs.view_host ());
1247  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
1248  auto permuteFromLIDs_h =
1249  create_const_view (permuteFromLIDs.view_host ());
1250 
1251  if (verbose) {
1252  std::ostringstream os;
1253  os << *prefix << "Permute on host" << endl;
1254  std::cerr << os.str ();
1255  }
1256  if (nonConstStride) {
1257  // No need to sync first, because copyOnHost argument to
1258  // getDualViewCopyFromArrayView puts them in the right place.
1259  auto tgtWhichVecs_h =
1260  create_const_view (tgtWhichVecs.view_host ());
1261  auto srcWhichVecs_h =
1262  create_const_view (srcWhichVecs.view_host ());
1263  if (CM == ADD_ASSIGN) {
1264  using op_type = KokkosRefactor::Details::AddOp;
1265  permute_array_multi_column_variable_stride (tgt_h, src_h,
1266  permuteToLIDs_h,
1267  permuteFromLIDs_h,
1268  tgtWhichVecs_h,
1269  srcWhichVecs_h, numCols,
1270  op_type());
1271  }
1272  else {
1273  using op_type = KokkosRefactor::Details::InsertOp;
1274  permute_array_multi_column_variable_stride (tgt_h, src_h,
1275  permuteToLIDs_h,
1276  permuteFromLIDs_h,
1277  tgtWhichVecs_h,
1278  srcWhichVecs_h, numCols,
1279  op_type());
1280  }
1281  }
1282  else {
1283  if (CM == ADD_ASSIGN) {
1284  using op_type = KokkosRefactor::Details::AddOp;
1285  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1286  permuteFromLIDs_h, numCols, op_type());
1287  }
1288  else {
1289  using op_type = KokkosRefactor::Details::InsertOp;
1290  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1291  permuteFromLIDs_h, numCols, op_type());
1292  }
1293  }
1294  }
1295  else { // permute on device
1296  if (verbose) {
1297  std::ostringstream os;
1298  os << *prefix << "Get permute LIDs on device" << endl;
1299  std::cerr << os.str ();
1300  }
1301  auto tgt_d = this->getLocalViewDevice(Access::ReadWrite);
1302  auto src_d = sourceMV.getLocalViewDevice(Access::ReadOnly);
1303 
1304  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_device () );
1305  auto permuteToLIDs_d = create_const_view (permuteToLIDs.view_device ());
1306  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_device () );
1307  auto permuteFromLIDs_d =
1308  create_const_view (permuteFromLIDs.view_device ());
1309 
1310  if (verbose) {
1311  std::ostringstream os;
1312  os << *prefix << "Permute on device" << endl;
1313  std::cerr << os.str ();
1314  }
1315  if (nonConstStride) {
1316  // No need to sync first, because copyOnHost argument to
1317  // getDualViewCopyFromArrayView puts them in the right place.
1318  auto tgtWhichVecs_d = create_const_view (tgtWhichVecs.view_device ());
1319  auto srcWhichVecs_d = create_const_view (srcWhichVecs.view_device ());
1320  if (CM == ADD_ASSIGN) {
1321  using op_type = KokkosRefactor::Details::AddOp;
1322  permute_array_multi_column_variable_stride (tgt_d, src_d,
1323  permuteToLIDs_d,
1324  permuteFromLIDs_d,
1325  tgtWhichVecs_d,
1326  srcWhichVecs_d, numCols,
1327  op_type());
1328  }
1329  else {
1330  using op_type = KokkosRefactor::Details::InsertOp;
1331  permute_array_multi_column_variable_stride (tgt_d, src_d,
1332  permuteToLIDs_d,
1333  permuteFromLIDs_d,
1334  tgtWhichVecs_d,
1335  srcWhichVecs_d, numCols,
1336  op_type());
1337  }
1338  }
1339  else {
1340  if (CM == ADD_ASSIGN) {
1341  using op_type = KokkosRefactor::Details::AddOp;
1342  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1343  permuteFromLIDs_d, numCols, op_type());
1344  }
1345  else {
1346  using op_type = KokkosRefactor::Details::InsertOp;
1347  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1348  permuteFromLIDs_d, numCols, op_type());
1349  }
1350  }
1351  }
1352 
1353  if (verbose) {
1354  std::ostringstream os;
1355  os << *prefix << "Done!" << endl;
1356  std::cerr << os.str ();
1357  }
1358  }
1359 
1360  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1361  void
1364  (const SrcDistObject& sourceObj,
1365  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1366  Kokkos::DualView<impl_scalar_type*, buffer_device_type>& exports,
1367  Kokkos::DualView<size_t*, buffer_device_type> /* numExportPacketsPerLID */,
1368  size_t& constantNumPackets)
1369  {
1370  using ::Tpetra::Details::Behavior;
1371  using ::Tpetra::Details::ProfilingRegion;
1373  using Kokkos::Compat::create_const_view;
1374  using Kokkos::Compat::getKokkosViewDeepCopy;
1375  using std::endl;
1377  const char tfecfFuncName[] = "packAndPrepare: ";
1378  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1379 
1380  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1381  // have the option to check indices. We do so when Tpetra is in
1382  // debug mode. It is in debug mode by default in a debug build,
1383  // but you may control this at run time, before launching the
1384  // executable, by setting the TPETRA_DEBUG environment variable to
1385  // "1" (or "TRUE").
1386  const bool debugCheckIndices = Behavior::debug ();
1387  // mfh 03 Aug 2017, 27 Sep 2017: Set the TPETRA_VERBOSE
1388  // environment variable to "1" (or "TRUE") for copious debug
1389  // output to std::cerr on every MPI process. This is unwise for
1390  // runs with large numbers of MPI processes.
1391  const bool printDebugOutput = Behavior::verbose ();
1392 
1393  std::unique_ptr<std::string> prefix;
1394  if (printDebugOutput) {
1395  auto map = this->getMap ();
1396  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1397  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1398  std::ostringstream os;
1399  os << "Proc " << myRank << ": MV::packAndPrepare: ";
1400  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1401  os << "Start" << endl;
1402  std::cerr << os.str ();
1403  }
1404 
1405  // We've already called checkSizes(), so this cast must succeed.
1406  MV& sourceMV = const_cast<MV&>(dynamic_cast<const MV&> (sourceObj));
1408  const size_t numCols = sourceMV.getNumVectors ();
1409 
1410  // This spares us from needing to fill numExportPacketsPerLID.
1411  // Setting constantNumPackets to a nonzero value signals that
1412  // all packets have the same number of entries.
1413  constantNumPackets = numCols;
1415  // If we have no exports, there is nothing to do. Make sure this
1416  // goes _after_ setting constantNumPackets correctly.
1417  if (exportLIDs.extent (0) == 0) {
1418  if (printDebugOutput) {
1419  std::ostringstream os;
1420  os << *prefix << "No exports on this proc, DONE" << std::endl;
1421  std::cerr << os.str ();
1422  }
1423  return;
1424  }
1425 
1426  /* The layout in the export for MultiVectors is as follows:
1427  exports = { all of the data from row exportLIDs.front() ;
1428  ....
1429  all of the data from row exportLIDs.back() }
1430  This doesn't have the best locality, but is necessary because
1431  the data for a Packet (all data associated with an LID) is
1432  required to be contiguous. */
1433 
1434  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1435  // packing scheme in the above comment? The data going to a
1436  // particular process must be contiguous, of course, but those
1437  // data could include entries from multiple LIDs. DistObject just
1438  // needs to know how to index into that data. Kokkos is good at
1439  // decoupling storage intent from data layout choice.
1440 
1441  const size_t numExportLIDs = exportLIDs.extent (0);
1442  const size_t newExportsSize = numCols * numExportLIDs;
1443  if (printDebugOutput) {
1444  std::ostringstream os;
1445  os << *prefix << "realloc: "
1446  << "numExportLIDs: " << numExportLIDs
1447  << ", exports.extent(0): " << exports.extent (0)
1448  << ", newExportsSize: " << newExportsSize << std::endl;
1449  std::cerr << os.str ();
1450  }
1451  reallocDualViewIfNeeded (exports, newExportsSize, "exports");
1452 
1453  // mfh 04 Feb 2019: sourceMV doesn't belong to us, so we can't
1454  // sync it. Pack it where it's currently sync'd.
1455  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1456  (sourceMV.need_sync_device () && sourceMV.need_sync_host (),
1457  std::logic_error, "Input MultiVector needs sync to both host "
1458  "and device.");
1459  const bool packOnHost = runKernelOnHost(sourceMV);
1460  if (printDebugOutput) {
1461  std::ostringstream os;
1462  os << *prefix << "packOnHost=" << (packOnHost ? "true" : "false") << endl;
1463  std::cerr << os.str ();
1464  }
1465 
1466  // Mark 'exports' here, since we might have resized it above.
1467  // Resizing currently requires calling the constructor, which
1468  // clears out the 'modified' flags.
1469  if (packOnHost) {
1470  // nde 06 Feb 2020: If 'exports' does not require resize
1471  // when reallocDualViewIfNeeded is called, the modified flags
1472  // are not cleared out. This can result in host and device views
1473  // being out-of-sync, resuling in an error in exports.modify_* calls.
1474  // Clearing the sync flags prevents this possible case.
1475  exports.clear_sync_state ();
1476  exports.modify_host ();
1477  }
1478  else {
1479  // nde 06 Feb 2020: If 'exports' does not require resize
1480  // when reallocDualViewIfNeeded is called, the modified flags
1481  // are not cleared out. This can result in host and device views
1482  // being out-of-sync, resuling in an error in exports.modify_* calls.
1483  // Clearing the sync flags prevents this possible case.
1484  exports.clear_sync_state ();
1485  exports.modify_device ();
1486  }
1487 
1488  if (numCols == 1) { // special case for one column only
1489  // MultiVector always represents a single column with constant
1490  // stride, but it doesn't hurt to implement both cases anyway.
1491  //
1492  // ETP: I'm not sure I agree with the above statement. Can't a single-
1493  // column multivector be a subview of another multi-vector, in which case
1494  // sourceMV.whichVectors_[0] != 0 ? I think we have to handle that case
1495  // separately.
1496  //
1497  // mfh 18 Jan 2016: In answer to ETP's comment above:
1498  // MultiVector treats single-column MultiVectors created using a
1499  // "nonconstant stride constructor" as a special case, and makes
1500  // them constant stride (by making whichVectors_ have length 0).
1501  if (sourceMV.isConstantStride ()) {
1502  using KokkosRefactor::Details::pack_array_single_column;
1503  if (printDebugOutput) {
1504  std::ostringstream os;
1505  os << *prefix << "Pack numCols=1 const stride" << endl;
1506  std::cerr << os.str ();
1507  }
1508  if (packOnHost) {
1509  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1510  pack_array_single_column (exports.view_host (),
1511  src_host,
1512  exportLIDs.view_host (),
1513  0,
1514  debugCheckIndices);
1515  }
1516  else { // pack on device
1517  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1518  pack_array_single_column (exports.view_device (),
1519  src_dev,
1520  exportLIDs.view_device (),
1521  0,
1522  debugCheckIndices);
1523  }
1524  }
1525  else {
1526  using KokkosRefactor::Details::pack_array_single_column;
1527  if (printDebugOutput) {
1528  std::ostringstream os;
1529  os << *prefix << "Pack numCols=1 nonconst stride" << endl;
1530  std::cerr << os.str ();
1531  }
1532  if (packOnHost) {
1533  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1534  pack_array_single_column (exports.view_host (),
1535  src_host,
1536  exportLIDs.view_host (),
1537  sourceMV.whichVectors_[0],
1538  debugCheckIndices);
1539  }
1540  else { // pack on device
1541  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1542  pack_array_single_column (exports.view_device (),
1543  src_dev,
1544  exportLIDs.view_device (),
1545  sourceMV.whichVectors_[0],
1546  debugCheckIndices);
1547  }
1548  }
1549  }
1550  else { // the source MultiVector has multiple columns
1551  if (sourceMV.isConstantStride ()) {
1552  using KokkosRefactor::Details::pack_array_multi_column;
1553  if (printDebugOutput) {
1554  std::ostringstream os;
1555  os << *prefix << "Pack numCols=" << numCols << " const stride" << endl;
1556  std::cerr << os.str ();
1557  }
1558  if (packOnHost) {
1559  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1560  pack_array_multi_column (exports.view_host (),
1561  src_host,
1562  exportLIDs.view_host (),
1563  numCols,
1564  debugCheckIndices);
1565  }
1566  else { // pack on device
1567  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1568  pack_array_multi_column (exports.view_device (),
1569  src_dev,
1570  exportLIDs.view_device (),
1571  numCols,
1572  debugCheckIndices);
1573  }
1574  }
1575  else {
1576  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1577  if (printDebugOutput) {
1578  std::ostringstream os;
1579  os << *prefix << "Pack numCols=" << numCols << " nonconst stride"
1580  << endl;
1581  std::cerr << os.str ();
1582  }
1583  // FIXME (mfh 04 Feb 2019) Creating a Kokkos::View for
1584  // whichVectors_ can be expensive, but pack and unpack for
1585  // nonconstant-stride MultiVectors is slower anyway.
1586  using IST = impl_scalar_type;
1587  using DV = Kokkos::DualView<IST*, device_type>;
1588  using HES = typename DV::t_host::execution_space;
1589  using DES = typename DV::t_dev::execution_space;
1590  Teuchos::ArrayView<const size_t> whichVecs = sourceMV.whichVectors_ ();
1591  if (packOnHost) {
1592  auto src_host = sourceMV.getLocalViewHost(Access::ReadOnly);
1593  pack_array_multi_column_variable_stride
1594  (exports.view_host (),
1595  src_host,
1596  exportLIDs.view_host (),
1597  getKokkosViewDeepCopy<HES> (whichVecs),
1598  numCols,
1599  debugCheckIndices);
1600  }
1601  else { // pack on device
1602  auto src_dev = sourceMV.getLocalViewDevice(Access::ReadOnly);
1603  pack_array_multi_column_variable_stride
1604  (exports.view_device (),
1605  src_dev,
1606  exportLIDs.view_device (),
1607  getKokkosViewDeepCopy<DES> (whichVecs),
1608  numCols,
1609  debugCheckIndices);
1610  }
1611  }
1612  }
1613 
1614  if (printDebugOutput) {
1615  std::ostringstream os;
1616  os << *prefix << "Done!" << endl;
1617  std::cerr << os.str ();
1618  }
1619 
1620  }
1621 
1622 
1623  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1624  template <class NO>
1625  typename std::enable_if<std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1626  typename NO::device_type::memory_space>::value,
1627  bool>::type
1629  reallocImportsIfNeededImpl (const size_t newSize,
1630  const bool verbose,
1631  const std::string* prefix,
1632  const bool areRemoteLIDsContiguous,
1633  const CombineMode CM)
1634  {
1635  // This implementation of reallocImportsIfNeeded is an
1636  // optimization that is specific to MultiVector. We check if the
1637  // imports_ view can be aliased to the end of the data view_. If
1638  // that is the case, we can skip the unpackAndCombine call.
1639 
1640  if (verbose) {
1641  std::ostringstream os;
1642  os << *prefix << "Realloc (if needed) imports_ from "
1643  << this->imports_.extent (0) << " to " << newSize << std::endl;
1644  std::cerr << os.str ();
1645  }
1646 
1647  bool reallocated = false;
1648 
1649  using IST = impl_scalar_type;
1650  using DV = Kokkos::DualView<IST*, Kokkos::LayoutLeft, buffer_device_type>;
1651 
1652  // Conditions for aliasing memory:
1653  // - When both sides of the dual view are in the same memory
1654  // space, we do not need to worry about syncing things.
1655  // - When both memory spaces are different, we only alias if this
1656  // does not incur additional sync'ing.
1657  // - The remote LIDs need to be contiguous, so that we do not need
1658  // to reorder received information.
1659  // - CombineMode needs to be INSERT.
1660  // - The number of vectors needs to be 1, otherwise we need to
1661  // reorder the received data.
1662  if ((dual_view_type::impl_dualview_is_single_device::value ||
1663  (Details::Behavior::assumeMpiIsCudaAware () && !this->need_sync_device()) ||
1664  (!Details::Behavior::assumeMpiIsCudaAware () && !this->need_sync_host())) &&
1665  areRemoteLIDsContiguous &&
1666  (CM == INSERT || CM == REPLACE) &&
1667  (getNumVectors() == 1) &&
1668  (newSize > 0)) {
1669 
1670  size_t offset = getLocalLength () - newSize;
1671  reallocated = this->imports_.d_view.data() != view_.d_view.data() + offset;
1672  if (reallocated) {
1673  typedef std::pair<size_t, size_t> range_type;
1674  this->imports_ = DV(view_,
1675  range_type (offset, getLocalLength () ),
1676  0);
1677 
1678  if (verbose) {
1679  std::ostringstream os;
1680  os << *prefix << "Aliased imports_ to MV.view_" << std::endl;
1681  std::cerr << os.str ();
1682  }
1683  }
1684  return reallocated;
1685  }
1686  {
1688  reallocated =
1689  reallocDualViewIfNeeded (this->unaliased_imports_, newSize, "imports");
1690  if (verbose) {
1691  std::ostringstream os;
1692  os << *prefix << "Finished realloc'ing unaliased_imports_" << std::endl;
1693  std::cerr << os.str ();
1694  }
1695  this->imports_ = this->unaliased_imports_;
1696  }
1697  return reallocated;
1698  }
1699 
1700  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1701  template <class NO>
1702  typename std::enable_if<!std::is_same<typename Tpetra::Details::DefaultTypes::CommBufferMemorySpace<typename NO::execution_space>::type,
1703  typename NO::device_type::memory_space>::value,
1704  bool>::type
1706  reallocImportsIfNeededImpl (const size_t newSize,
1707  const bool verbose,
1708  const std::string* prefix,
1709  const bool areRemoteLIDsContiguous,
1710  const CombineMode CM)
1711  {
1712  return DistObject<Scalar, LocalOrdinal, GlobalOrdinal, Node>::reallocImportsIfNeeded(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1713  }
1714 
1715  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1716  bool
1718  reallocImportsIfNeeded(const size_t newSize,
1719  const bool verbose,
1720  const std::string* prefix,
1721  const bool areRemoteLIDsContiguous,
1722  const CombineMode CM) {
1724  return reallocImportsIfNeededImpl<Node>(newSize, verbose, prefix, areRemoteLIDsContiguous, CM);
1725  }
1726 
1727  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1728  bool
1731  return (this->imports_.d_view.data() + this->imports_.d_view.extent(0) ==
1732  view_.d_view.data() + view_.d_view.extent(0));
1733  }
1734 
1735 
1736  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1737  void
1740  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1741  Kokkos::DualView<impl_scalar_type*, buffer_device_type> imports,
1742  Kokkos::DualView<size_t*, buffer_device_type> /* numPacketsPerLID */,
1743  const size_t constantNumPackets,
1744  const CombineMode CM)
1745  {
1746  using ::Tpetra::Details::Behavior;
1747  using ::Tpetra::Details::ProfilingRegion;
1748  using KokkosRefactor::Details::unpack_array_multi_column;
1749  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1750  using Kokkos::Compat::getKokkosViewDeepCopy;
1751  using std::endl;
1752  const char longFuncName[] = "Tpetra::MultiVector::unpackAndCombine";
1753  const char tfecfFuncName[] = "unpackAndCombine: ";
1754  ProfilingRegion regionUAC (longFuncName);
1755 
1756  // mfh 09 Sep 2016, 26 Sep 2017: The pack and unpack functions now
1757  // have the option to check indices. We do so when Tpetra is in
1758  // debug mode. It is in debug mode by default in a debug build,
1759  // but you may control this at run time, before launching the
1760  // executable, by setting the TPETRA_DEBUG environment variable to
1761  // "1" (or "TRUE").
1762  const bool debugCheckIndices = Behavior::debug ();
1763 
1764  const bool printDebugOutput = Behavior::verbose ();
1765  std::unique_ptr<std::string> prefix;
1766  if (printDebugOutput) {
1767  auto map = this->getMap ();
1768  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1769  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1770  std::ostringstream os;
1771  os << "Proc " << myRank << ": " << longFuncName << ": ";
1772  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1773  os << "Start" << endl;
1774  std::cerr << os.str ();
1775  }
1776 
1777  // If we have no imports, there is nothing to do
1778  if (importLIDs.extent (0) == 0) {
1779  if (printDebugOutput) {
1780  std::ostringstream os;
1781  os << *prefix << "No imports. Done!" << endl;
1782  std::cerr << os.str ();
1783  }
1784  return;
1785  }
1786 
1787  // Check, whether imports_ is a subview of the MV view.
1788  if (importsAreAliased()) {
1789  if (printDebugOutput) {
1790  std::ostringstream os;
1791  os << *prefix << "Skipping unpack, since imports_ is aliased to MV.view_. Done!" << endl;
1792  std::cerr << os.str ();
1793  }
1794  return;
1795  }
1796 
1797 
1798  const size_t numVecs = getNumVectors ();
1799  if (debugCheckIndices) {
1800  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1801  (static_cast<size_t> (imports.extent (0)) !=
1802  numVecs * importLIDs.extent (0),
1803  std::runtime_error,
1804  "imports.extent(0) = " << imports.extent (0)
1805  << " != getNumVectors() * importLIDs.extent(0) = " << numVecs
1806  << " * " << importLIDs.extent (0) << " = "
1807  << numVecs * importLIDs.extent (0) << ".");
1808 
1809  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1810  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1811  "constantNumPackets input argument must be nonzero.");
1812 
1813  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1814  (static_cast<size_t> (numVecs) !=
1815  static_cast<size_t> (constantNumPackets),
1816  std::runtime_error, "constantNumPackets must equal numVecs.");
1817  }
1818 
1819  // mfh 12 Apr 2016, 04 Feb 2019: Decide where to unpack based on
1820  // the memory space in which the imports buffer was last modified and
1821  // the size of the imports buffer.
1822  // DistObject::doTransferNew decides where it was last modified (based on
1823  // whether communication buffers used were on host or device).
1824  const bool unpackOnHost = runKernelOnHost(imports);
1825  if (unpackOnHost) {
1826  if (this->imports_.need_sync_host()) this->imports_.sync_host();
1827  }
1828  else {
1829  if (this->imports_.need_sync_device()) this->imports_.sync_device();
1830  }
1831 
1832  if (printDebugOutput) {
1833  std::ostringstream os;
1834  os << *prefix << "unpackOnHost=" << (unpackOnHost ? "true" : "false")
1835  << endl;
1836  std::cerr << os.str ();
1837  }
1838 
1839  // We have to sync before modifying, because this method may read
1840  // as well as write (depending on the CombineMode).
1841  auto imports_d = imports.view_device ();
1842  auto imports_h = imports.view_host ();
1843  auto importLIDs_d = importLIDs.view_device ();
1844  auto importLIDs_h = importLIDs.view_host ();
1845 
1846  Kokkos::DualView<size_t*, device_type> whichVecs;
1847  if (! isConstantStride ()) {
1848  Kokkos::View<const size_t*, Kokkos::HostSpace,
1849  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1850  numVecs);
1851  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1852  if (unpackOnHost) {
1853  whichVecs.modify_host ();
1854  Kokkos::deep_copy (whichVecs.view_host (), whichVecsIn);
1855  }
1856  else {
1857  whichVecs.modify_device ();
1858  Kokkos::deep_copy (whichVecs.view_device (), whichVecsIn);
1859  }
1860  }
1861  auto whichVecs_d = whichVecs.view_device ();
1862  auto whichVecs_h = whichVecs.view_host ();
1863 
1864  /* The layout in the export for MultiVectors is as follows:
1865  imports = { all of the data from row exportLIDs.front() ;
1866  ....
1867  all of the data from row exportLIDs.back() }
1868  This doesn't have the best locality, but is necessary because
1869  the data for a Packet (all data associated with an LID) is
1870  required to be contiguous. */
1871 
1872  if (numVecs > 0 && importLIDs.extent (0) > 0) {
1873  using dev_exec_space = typename dual_view_type::t_dev::execution_space;
1874  using host_exec_space = typename dual_view_type::t_host::execution_space;
1875 
1876  // This fixes GitHub Issue #4418.
1877  const bool use_atomic_updates = unpackOnHost ?
1878  host_exec_space::concurrency () != 1 :
1879  dev_exec_space::concurrency () != 1;
1880 
1881  if (printDebugOutput) {
1882  std::ostringstream os;
1883  os << *prefix << "Unpack: " << combineModeToString (CM) << endl;
1884  std::cerr << os.str ();
1885  }
1886 
1887  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1888  // custom combine modes, start editing here.
1889 
1890  if (CM == INSERT || CM == REPLACE) {
1891  using op_type = KokkosRefactor::Details::InsertOp;
1892  if (isConstantStride ()) {
1893  if (unpackOnHost) {
1894  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1895  unpack_array_multi_column (host_exec_space (),
1896  X_h, imports_h, importLIDs_h,
1897  op_type (), numVecs,
1898  use_atomic_updates,
1899  debugCheckIndices);
1900 
1901  }
1902  else { // unpack on device
1903  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1904  unpack_array_multi_column (dev_exec_space (),
1905  X_d, imports_d, importLIDs_d,
1906  op_type (), numVecs,
1907  use_atomic_updates,
1908  debugCheckIndices);
1909  }
1910  }
1911  else { // not constant stride
1912  if (unpackOnHost) {
1913  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1914  unpack_array_multi_column_variable_stride (host_exec_space (),
1915  X_h, imports_h,
1916  importLIDs_h,
1917  whichVecs_h,
1918  op_type (),
1919  numVecs,
1920  use_atomic_updates,
1921  debugCheckIndices);
1922  }
1923  else { // unpack on device
1924  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1925  unpack_array_multi_column_variable_stride (dev_exec_space (),
1926  X_d, imports_d,
1927  importLIDs_d,
1928  whichVecs_d,
1929  op_type (),
1930  numVecs,
1931  use_atomic_updates,
1932  debugCheckIndices);
1933  }
1934  }
1935  }
1936  else if (CM == ADD || CM == ADD_ASSIGN) {
1937  using op_type = KokkosRefactor::Details::AddOp;
1938  if (isConstantStride ()) {
1939  if (unpackOnHost) {
1940  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1941  unpack_array_multi_column (host_exec_space (),
1942  X_h, imports_h, importLIDs_h,
1943  op_type (), numVecs,
1944  use_atomic_updates,
1945  debugCheckIndices);
1946  }
1947  else { // unpack on device
1948  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1949  unpack_array_multi_column (dev_exec_space (),
1950  X_d, imports_d, importLIDs_d,
1951  op_type (), numVecs,
1952  use_atomic_updates,
1953  debugCheckIndices);
1954  }
1955  }
1956  else { // not constant stride
1957  if (unpackOnHost) {
1958  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1959  unpack_array_multi_column_variable_stride (host_exec_space (),
1960  X_h, imports_h,
1961  importLIDs_h,
1962  whichVecs_h,
1963  op_type (),
1964  numVecs,
1965  use_atomic_updates,
1966  debugCheckIndices);
1967  }
1968  else { // unpack on device
1969  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1970  unpack_array_multi_column_variable_stride (dev_exec_space (),
1971  X_d, imports_d,
1972  importLIDs_d,
1973  whichVecs_d,
1974  op_type (),
1975  numVecs,
1976  use_atomic_updates,
1977  debugCheckIndices);
1978  }
1979  }
1980  }
1981  else if (CM == ABSMAX) {
1982  using op_type = KokkosRefactor::Details::AbsMaxOp;
1983  if (isConstantStride ()) {
1984  if (unpackOnHost) {
1985  auto X_h = this->getLocalViewHost(Access::ReadWrite);
1986  unpack_array_multi_column (host_exec_space (),
1987  X_h, imports_h, importLIDs_h,
1988  op_type (), numVecs,
1989  use_atomic_updates,
1990  debugCheckIndices);
1991  }
1992  else { // unpack on device
1993  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
1994  unpack_array_multi_column (dev_exec_space (),
1995  X_d, imports_d, importLIDs_d,
1996  op_type (), numVecs,
1997  use_atomic_updates,
1998  debugCheckIndices);
1999  }
2000  }
2001  else {
2002  if (unpackOnHost) {
2003  auto X_h = this->getLocalViewHost(Access::ReadWrite);
2004  unpack_array_multi_column_variable_stride (host_exec_space (),
2005  X_h, imports_h,
2006  importLIDs_h,
2007  whichVecs_h,
2008  op_type (),
2009  numVecs,
2010  use_atomic_updates,
2011  debugCheckIndices);
2012  }
2013  else { // unpack on device
2014  auto X_d = this->getLocalViewDevice(Access::ReadWrite);
2015  unpack_array_multi_column_variable_stride (dev_exec_space (),
2016  X_d, imports_d,
2017  importLIDs_d,
2018  whichVecs_d,
2019  op_type (),
2020  numVecs,
2021  use_atomic_updates,
2022  debugCheckIndices);
2023  }
2024  }
2025  }
2026  else {
2027  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2028  (true, std::logic_error, "Invalid CombineMode");
2029  }
2030  }
2031  else {
2032  if (printDebugOutput) {
2033  std::ostringstream os;
2034  os << *prefix << "Nothing to unpack" << endl;
2035  std::cerr << os.str ();
2036  }
2037  }
2038 
2039  if (printDebugOutput) {
2040  std::ostringstream os;
2041  os << *prefix << "Done!" << endl;
2042  std::cerr << os.str ();
2043  }
2044  }
2045 
2046  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2047  size_t
2049  getNumVectors () const
2050  {
2051  if (isConstantStride ()) {
2052  return static_cast<size_t> (view_.extent (1));
2053  } else {
2054  return static_cast<size_t> (whichVectors_.size ());
2055  }
2056  }
2057 
2058  namespace { // (anonymous)
2059 
2060  template<class RV>
2061  void
2062  gblDotImpl (const RV& dotsOut,
2063  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2064  const bool distributed)
2065  {
2066  using Teuchos::REDUCE_MAX;
2067  using Teuchos::REDUCE_SUM;
2068  using Teuchos::reduceAll;
2069  typedef typename RV::non_const_value_type dot_type;
2070 
2071  const size_t numVecs = dotsOut.extent (0);
2072 
2073  // If the MultiVector is distributed over multiple processes, do
2074  // the distributed (interprocess) part of the dot product. We
2075  // assume that the MPI implementation can read from and write to
2076  // device memory.
2077  //
2078  // replaceMap() may have removed some processes. Those
2079  // processes have a null Map. They must not participate in any
2080  // collective operations. We ask first whether the Map is null,
2081  // because isDistributed() defers that question to the Map. We
2082  // still compute and return local dots for processes not
2083  // participating in collective operations; those probably don't
2084  // make any sense, but it doesn't hurt to do them, since it's
2085  // illegal to call dot() on those processes anyway.
2086  if (distributed && ! comm.is_null ()) {
2087  // The calling process only participates in the collective if
2088  // both the Map and its Comm on that process are nonnull.
2089  const int nv = static_cast<int> (numVecs);
2090  const bool commIsInterComm = ::Tpetra::Details::isInterComm (*comm);
2091 
2092  if (commIsInterComm) {
2093  // If comm is an intercomm, then we may not alias input and
2094  // output buffers, so we have to make a copy of the local
2095  // sum.
2096  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
2097  Kokkos::deep_copy (lclDots, dotsOut);
2098  const dot_type* const lclSum = lclDots.data ();
2099  dot_type* const gblSum = dotsOut.data ();
2100  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2101  }
2102  else {
2103  dot_type* const inout = dotsOut.data ();
2104  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
2105  }
2106  }
2107  }
2108  } // namespace (anonymous)
2109 
2110  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2111  void
2114  const Kokkos::View<dot_type*, Kokkos::HostSpace>& dots) const
2115  {
2116  using ::Tpetra::Details::Behavior;
2117  using Kokkos::subview;
2118  using Teuchos::Comm;
2119  using Teuchos::null;
2120  using Teuchos::RCP;
2121  // View of all the dot product results.
2122  typedef Kokkos::View<dot_type*, Kokkos::HostSpace> RV;
2123  typedef typename dual_view_type::t_dev_const XMV;
2124  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
2125 
2126  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
2127 
2128  const size_t numVecs = this->getNumVectors ();
2129  if (numVecs == 0) {
2130  return; // nothing to do
2131  }
2132  const size_t lclNumRows = this->getLocalLength ();
2133  const size_t numDots = static_cast<size_t> (dots.extent (0));
2134  const bool debug = Behavior::debug ();
2135 
2136  if (debug) {
2137  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
2138  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2139  (! compat, std::invalid_argument, "'*this' MultiVector is not "
2140  "compatible with the input MultiVector A. We only test for this "
2141  "in debug mode.");
2142  }
2143 
2144  // FIXME (mfh 11 Jul 2014) These exception tests may not
2145  // necessarily be thrown on all processes consistently. We should
2146  // instead pass along error state with the inner product. We
2147  // could do this by setting an extra slot to
2148  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
2149  // final sum should be
2150  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2151  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2152  lclNumRows != A.getLocalLength (), std::runtime_error,
2153  "MultiVectors do not have the same local length. "
2154  "this->getLocalLength() = " << lclNumRows << " != "
2155  "A.getLocalLength() = " << A.getLocalLength () << ".");
2156  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2157  numVecs != A.getNumVectors (), std::runtime_error,
2158  "MultiVectors must have the same number of columns (vectors). "
2159  "this->getNumVectors() = " << numVecs << " != "
2160  "A.getNumVectors() = " << A.getNumVectors () << ".");
2161  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2162  numDots != numVecs, std::runtime_error,
2163  "The output array 'dots' must have the same number of entries as the "
2164  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2165  numDots << " != this->getNumVectors() = " << numVecs << ".");
2166 
2167  const std::pair<size_t, size_t> colRng (0, numVecs);
2168  RV dotsOut = subview (dots, colRng);
2169  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2170  this->getMap ()->getComm ();
2171 
2172  auto thisView = this->getLocalViewDevice(Access::ReadOnly);
2173  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
2174 
2175  ::Tpetra::Details::lclDot<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
2176  this->whichVectors_.getRawPtr (),
2177  A.whichVectors_.getRawPtr (),
2178  this->isConstantStride (), A.isConstantStride ());
2179  gblDotImpl (dotsOut, comm, this->isDistributed ());
2180  }
2181 
2182  namespace { // (anonymous)
2183  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2185  multiVectorSingleColumnDot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& x,
2187  {
2188  using ::Tpetra::Details::ProfilingRegion;
2190  using dot_type = typename MV::dot_type;
2191  ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
2192 
2193  auto map = x.getMap ();
2194  Teuchos::RCP<const Teuchos::Comm<int> > comm =
2195  map.is_null () ? Teuchos::null : map->getComm ();
2196  if (comm.is_null ()) {
2197  return Kokkos::ArithTraits<dot_type>::zero ();
2198  }
2199  else {
2200  using LO = LocalOrdinal;
2201  // The min just ensures that we don't overwrite memory that
2202  // doesn't belong to us, in the erroneous input case where x
2203  // and y have different numbers of rows.
2204  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
2205  y.getLocalLength ()));
2206  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
2207  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
2208  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
2209 
2210  // All non-unary kernels are executed on the device as per Tpetra policy. Sync to device if needed.
2211  //const_cast<MV&>(x).sync_device ();
2212  //const_cast<MV&>(y).sync_device ();
2213 
2214  auto x_2d = x.getLocalViewDevice(Access::ReadOnly);
2215  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
2216  auto y_2d = y.getLocalViewDevice(Access::ReadOnly);
2217  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
2218  lclDot = KokkosBlas::dot (x_1d, y_1d);
2219 
2220  if (x.isDistributed ()) {
2221  using Teuchos::outArg;
2222  using Teuchos::REDUCE_SUM;
2223  using Teuchos::reduceAll;
2224  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
2225  }
2226  else {
2227  gblDot = lclDot;
2228  }
2229  return gblDot;
2230  }
2231  }
2232  } // namespace (anonymous)
2233 
2234 
2236  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2237  void
2240  const Teuchos::ArrayView<dot_type>& dots) const
2241  {
2242  const char tfecfFuncName[] = "dot: ";
2243  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
2245  const size_t numVecs = this->getNumVectors ();
2246  const size_t lclNumRows = this->getLocalLength ();
2247  const size_t numDots = static_cast<size_t> (dots.size ());
2248 
2249  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
2250  // not necessarily be thrown on all processes consistently. We
2251  // keep them for now, because MultiVector's unit tests insist on
2252  // them. In the future, we should instead pass along error state
2253  // with the inner product. We could do this by setting an extra
2254  // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
2255  // The final sum should be
2256  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
2257  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2258  (lclNumRows != A.getLocalLength (), std::runtime_error,
2259  "MultiVectors do not have the same local length. "
2260  "this->getLocalLength() = " << lclNumRows << " != "
2261  "A.getLocalLength() = " << A.getLocalLength () << ".");
2262  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2263  (numVecs != A.getNumVectors (), std::runtime_error,
2264  "MultiVectors must have the same number of columns (vectors). "
2265  "this->getNumVectors() = " << numVecs << " != "
2266  "A.getNumVectors() = " << A.getNumVectors () << ".");
2267  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2268  (numDots != numVecs, std::runtime_error,
2269  "The output array 'dots' must have the same number of entries as the "
2270  "number of columns (vectors) in *this and A. dots.extent(0) = " <<
2271  numDots << " != this->getNumVectors() = " << numVecs << ".");
2272 
2273  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
2274  const dot_type gblDot = multiVectorSingleColumnDot (*this, A);
2275  dots[0] = gblDot;
2276  }
2277  else {
2278  this->dot (A, Kokkos::View<dot_type*, Kokkos::HostSpace>(dots.getRawPtr (), numDots));
2279  }
2280  }
2281 
2282  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2283  void
2285  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
2286  {
2288  using ::Tpetra::Details::NORM_TWO;
2289  using ::Tpetra::Details::ProfilingRegion;
2290  ProfilingRegion region ("Tpetra::MV::norm2 (host output)");
2291 
2292  // The function needs to be able to sync X.
2293  MV& X = const_cast<MV&> (*this);
2294  multiVectorNormImpl (norms.getRawPtr (), X, NORM_TWO);
2295  }
2296 
2297  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2298  void
2300  norm2 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2301  {
2302  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2303  this->norm2 (norms_av);
2304  }
2305 
2307  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2308  void
2310  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2311  {
2313  using ::Tpetra::Details::NORM_ONE;
2314  using ::Tpetra::Details::ProfilingRegion;
2315  ProfilingRegion region ("Tpetra::MV::norm1 (host output)");
2316 
2317  // The function needs to be able to sync X.
2318  MV& X = const_cast<MV&> (*this);
2319  multiVectorNormImpl (norms.data (), X, NORM_ONE);
2320  }
2321 
2322  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2323  void
2325  norm1 (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2326  {
2327  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2328  this->norm1 (norms_av);
2329  }
2330 
2331  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2332  void
2334  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2335  {
2337  using ::Tpetra::Details::NORM_INF;
2338  using ::Tpetra::Details::ProfilingRegion;
2339  ProfilingRegion region ("Tpetra::MV::normInf (host output)");
2340 
2341  // The function needs to be able to sync X.
2342  MV& X = const_cast<MV&> (*this);
2343  multiVectorNormImpl (norms.getRawPtr (), X, NORM_INF);
2344  }
2345 
2346  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2347  void
2349  normInf (const Kokkos::View<mag_type*, Kokkos::HostSpace>& norms) const
2350  {
2351  Teuchos::ArrayView<mag_type> norms_av (norms.data (), norms.extent (0));
2352  this->normInf (norms_av);
2353  }
2354 
2355  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2356  void
2358  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2359  {
2360  // KR FIXME Overload this method to take a View.
2361  using Kokkos::ALL;
2362  using Kokkos::subview;
2363  using Teuchos::Comm;
2364  using Teuchos::RCP;
2365  using Teuchos::reduceAll;
2366  using Teuchos::REDUCE_SUM;
2367  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2368 
2369  const size_t lclNumRows = this->getLocalLength ();
2370  const size_t numVecs = this->getNumVectors ();
2371  const size_t numMeans = static_cast<size_t> (means.size ());
2372 
2373  TEUCHOS_TEST_FOR_EXCEPTION(
2374  numMeans != numVecs, std::runtime_error,
2375  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2376  << " != this->getNumVectors() = " << numVecs << ".");
2377 
2378  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2379  const std::pair<size_t, size_t> colRng (0, numVecs);
2380 
2381  // Make sure that the final output view has the same layout as the
2382  // temporary view's HostMirror. Left or Right doesn't matter for
2383  // a 1-D array anyway; this is just to placate the compiler.
2384  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
2385  typedef Kokkos::View<impl_scalar_type*,
2386  typename local_view_type::HostMirror::array_layout,
2387  Kokkos::HostSpace,
2388  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2389  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2391  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2392  this->getMap ()->getComm ();
2393 
2394  // If we need sync to device, then host has the most recent version.
2395  const bool useHostVersion = this->need_sync_device ();
2396  if (useHostVersion) {
2397  // DualView was last modified on host, so run the local kernel there.
2398  auto X_lcl = subview (getLocalViewHost(Access::ReadOnly),
2399  rowRng, Kokkos::ALL ());
2400  // Compute the local sum of each column.
2401  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2402  if (isConstantStride ()) {
2403  KokkosBlas::sum (lclSums, X_lcl);
2404  }
2405  else {
2406  for (size_t j = 0; j < numVecs; ++j) {
2407  const size_t col = whichVectors_[j];
2408  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2409  }
2410  }
2411 
2412  // If there are multiple MPI processes, the all-reduce reads
2413  // from lclSums, and writes to meansOut. Otherwise, we just
2414  // copy lclSums into meansOut.
2415  if (! comm.is_null () && this->isDistributed ()) {
2416  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2417  lclSums.data (), meansOut.data ());
2418  }
2419  else {
2420  Kokkos::deep_copy (meansOut, lclSums);
2421  }
2422  }
2423  else {
2424  // DualView was last modified on device, so run the local kernel there.
2425  auto X_lcl = subview (this->getLocalViewDevice(Access::ReadOnly),
2426  rowRng, Kokkos::ALL ());
2427 
2428  // Compute the local sum of each column.
2429  Kokkos::View<impl_scalar_type*, Kokkos::HostSpace> lclSums ("MV::meanValue tmp", numVecs);
2430  if (isConstantStride ()) {
2431  KokkosBlas::sum (lclSums, X_lcl);
2432  }
2433  else {
2434  for (size_t j = 0; j < numVecs; ++j) {
2435  const size_t col = whichVectors_[j];
2436  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2437  }
2438  }
2439 
2440  // If there are multiple MPI processes, the all-reduce reads
2441  // from lclSums, and writes to meansOut. (We assume that MPI
2442  // can read device memory.) Otherwise, we just copy lclSums
2443  // into meansOut.
2444  if (! comm.is_null () && this->isDistributed ()) {
2445  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2446  lclSums.data (), meansOut.data ());
2447  }
2448  else {
2449  Kokkos::deep_copy (meansOut, lclSums);
2450  }
2451  }
2452 
2453  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2454  // to the magnitude type, since operator/ (std::complex<T>, int)
2455  // isn't necessarily defined.
2456  const impl_scalar_type OneOverN =
2457  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2458  for (size_t k = 0; k < numMeans; ++k) {
2459  meansOut(k) = meansOut(k) * OneOverN;
2460  }
2461  }
2462 
2463 
2464  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2465  void
2467  randomize ()
2468  {
2469  typedef impl_scalar_type IST;
2470  typedef Kokkos::Details::ArithTraits<IST> ATS;
2471  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2472  typedef typename pool_type::generator_type generator_type;
2473 
2474  const IST max = Kokkos::rand<generator_type, IST>::max ();
2475  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2476 
2477  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2478  }
2479 
2480 
2481  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2482  void
2484  randomize (const Scalar& minVal, const Scalar& maxVal)
2485  {
2486  typedef impl_scalar_type IST;
2487  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2488 
2489  // Seed the pseudorandom number generator using the calling
2490  // process' rank. This helps decorrelate different process'
2491  // pseudorandom streams. It's not perfect but it's effective and
2492  // doesn't require MPI communication. The seed also includes bits
2493  // from the standard library's rand().
2494  //
2495  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2496  // The code below just makes a new seed each time.
2497 
2498  const uint64_t myRank =
2499  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2500  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
2501  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2502 
2503  pool_type rand_pool (seed);
2504  const IST max = static_cast<IST> (maxVal);
2505  const IST min = static_cast<IST> (minVal);
2506 
2507  auto thisView = this->getLocalViewDevice(Access::OverwriteAll);
2508 
2509  if (isConstantStride ()) {
2510  Kokkos::fill_random (thisView, rand_pool, min, max);
2511  }
2512  else {
2513  const size_t numVecs = getNumVectors ();
2514  for (size_t k = 0; k < numVecs; ++k) {
2515  const size_t col = whichVectors_[k];
2516  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2517  Kokkos::fill_random (X_k, rand_pool, min, max);
2518  }
2519  }
2520  }
2521 
2522  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2523  void
2525  putScalar (const Scalar& alpha)
2526  {
2527  using ::Tpetra::Details::ProfilingRegion;
2528  using ::Tpetra::Details::Blas::fill;
2529  using DES = typename dual_view_type::t_dev::execution_space;
2530  using HES = typename dual_view_type::t_host::execution_space;
2531  using LO = LocalOrdinal;
2532  ProfilingRegion region ("Tpetra::MultiVector::putScalar");
2533 
2534  // We need this cast for cases like Scalar = std::complex<T> but
2535  // impl_scalar_type = Kokkos::complex<T>.
2536  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2537  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
2538  const LO numVecs = static_cast<LO> (this->getNumVectors ());
2539 
2540  // Modify the most recently updated version of the data. This
2541  // avoids sync'ing, which could violate users' expectations.
2542  //
2543  // If we need sync to device, then host has the most recent version.
2544  const bool runOnHost = runKernelOnHost(*this);
2545  if (! runOnHost) {
2546  auto X = this->getLocalViewDevice(Access::OverwriteAll);
2547  if (this->isConstantStride ()) {
2548  fill (DES (), X, theAlpha, lclNumRows, numVecs);
2549  }
2550  else {
2551  fill (DES (), X, theAlpha, lclNumRows, numVecs,
2552  this->whichVectors_.getRawPtr ());
2553  }
2554  }
2555  else { // last modified in host memory, so modify data there.
2556  auto X = this->getLocalViewHost(Access::OverwriteAll);
2557  if (this->isConstantStride ()) {
2558  fill (HES (), X, theAlpha, lclNumRows, numVecs);
2559  }
2560  else {
2561  fill (HES (), X, theAlpha, lclNumRows, numVecs,
2562  this->whichVectors_.getRawPtr ());
2563  }
2564  }
2565  }
2566 
2567 
2568  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2569  void
2571  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2572  {
2573  using Teuchos::ArrayRCP;
2574  using Teuchos::Comm;
2575  using Teuchos::RCP;
2576  using ST = Scalar;
2577  using LO = LocalOrdinal;
2578  using GO = GlobalOrdinal;
2579 
2580  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2581  // it might work if the MV is a column view of another MV.
2582  // However, things might go wrong when restoring the original
2583  // Map, so we don't allow this case for now.
2584  TEUCHOS_TEST_FOR_EXCEPTION(
2585  ! this->isConstantStride (), std::logic_error,
2586  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2587  "if the MultiVector is a column view of another MultiVector (that is, if "
2588  "isConstantStride() == false).");
2589 
2590  // Case 1: current Map and new Map are both nonnull on this process.
2591  // Case 2: current Map is nonnull, new Map is null.
2592  // Case 3: current Map is null, new Map is nonnull.
2593  // Case 4: both Maps are null: forbidden.
2594  //
2595  // Case 1 means that we don't have to do anything on this process,
2596  // other than assign the new Map. (We always have to do that.)
2597  // It's an error for the user to supply a Map that requires
2598  // resizing in this case.
2599  //
2600  // Case 2 means that the calling process is in the current Map's
2601  // communicator, but will be excluded from the new Map's
2602  // communicator. We don't have to do anything on the calling
2603  // process; just leave whatever data it may have alone.
2604  //
2605  // Case 3 means that the calling process is excluded from the
2606  // current Map's communicator, but will be included in the new
2607  // Map's communicator. This means we need to (re)allocate the
2608  // local DualView if it does not have the right number of rows.
2609  // If the new number of rows is nonzero, we'll fill the newly
2610  // allocated local data with zeros, as befits a projection
2611  // operation.
2612  //
2613  // The typical use case for Case 3 is that the MultiVector was
2614  // first created with the Map with more processes, then that Map
2615  // was replaced with a Map with fewer processes, and finally the
2616  // original Map was restored on this call to replaceMap.
2617 
2618 #ifdef HAVE_TEUCHOS_DEBUG
2619  // mfh 28 Mar 2013: We can't check for compatibility across the
2620  // whole communicator, unless we know that the current and new
2621  // Maps are nonnull on _all_ participating processes.
2622  // TEUCHOS_TEST_FOR_EXCEPTION(
2623  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2624  // std::invalid_argument, "Tpetra::MultiVector::project: "
2625  // "If the input Map's communicator is compatible (has the same number of "
2626  // "processes as) the current Map's communicator, then the two Maps must be "
2627  // "compatible. The replaceMap() method is not for data redistribution; "
2628  // "use Import or Export for that purpose.");
2629 
2630  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2631  // of the Map, in case the process counts don't match.
2632 #endif // HAVE_TEUCHOS_DEBUG
2633 
2634  if (this->getMap ().is_null ()) { // current Map is null
2635  // If this->getMap() is null, that means that this MultiVector
2636  // has already had replaceMap happen to it. In that case, just
2637  // reallocate the DualView with the right size.
2638 
2639  TEUCHOS_TEST_FOR_EXCEPTION(
2640  newMap.is_null (), std::invalid_argument,
2641  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2642  "This probably means that the input Map is incorrect.");
2643 
2644  // Case 3: current Map is null, new Map is nonnull.
2645  // Reallocate the DualView with the right dimensions.
2646  const size_t newNumRows = newMap->getNodeNumElements ();
2647  const size_t origNumRows = view_.extent (0);
2648  const size_t numCols = this->getNumVectors ();
2649 
2650  if (origNumRows != newNumRows || view_.extent (1) != numCols) {
2651  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2652  }
2653  }
2654  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2655  // I am an excluded process. Reinitialize my data so that I
2656  // have 0 rows. Keep the number of columns as before.
2657  const size_t newNumRows = static_cast<size_t> (0);
2658  const size_t numCols = this->getNumVectors ();
2659  view_ = allocDualView<ST, LO, GO, Node> (newNumRows, numCols);
2660  }
2661 
2662  this->map_ = newMap;
2663  }
2664 
2665  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2666  void
2668  scale (const Scalar& alpha)
2669  {
2670  using Kokkos::ALL;
2671  using IST = impl_scalar_type;
2672 
2673  const IST theAlpha = static_cast<IST> (alpha);
2674  if (theAlpha == Kokkos::ArithTraits<IST>::one ()) {
2675  return; // do nothing
2676  }
2677  const size_t lclNumRows = getLocalLength ();
2678  const size_t numVecs = getNumVectors ();
2679  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2680  const std::pair<size_t, size_t> colRng (0, numVecs);
2681 
2682  // We can't substitute putScalar(0.0) for scale(0.0), because the
2683  // former will overwrite NaNs present in the MultiVector. The
2684  // semantics of this call require multiplying them by 0, which
2685  // IEEE 754 requires to be NaN.
2686 
2687  // If we need sync to device, then host has the most recent version.
2688  const bool useHostVersion = need_sync_device ();
2689  if (useHostVersion) {
2690  auto Y_lcl = Kokkos::subview (getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2691  if (isConstantStride ()) {
2692  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2693  }
2694  else {
2695  for (size_t k = 0; k < numVecs; ++k) {
2696  const size_t Y_col = whichVectors_[k];
2697  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2698  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2699  }
2700  }
2701  }
2702  else { // work on device
2703  auto Y_lcl = Kokkos::subview (getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2704  if (isConstantStride ()) {
2705  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2706  }
2707  else {
2708  for (size_t k = 0; k < numVecs; ++k) {
2709  const size_t Y_col = isConstantStride () ? k : whichVectors_[k];
2710  auto Y_k = Kokkos::subview (Y_lcl, ALL (), Y_col);
2711  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2712  }
2713  }
2714  }
2715  }
2716 
2717 
2718  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2719  void
2721  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2722  {
2723  const size_t numVecs = this->getNumVectors ();
2724  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2725  TEUCHOS_TEST_FOR_EXCEPTION(
2726  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2727  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2728  << numVecs << ".");
2729 
2730  // Use a DualView to copy the scaling constants onto the device.
2731  using k_alphas_type = Kokkos::DualView<impl_scalar_type*, device_type>;
2732  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
2733  k_alphas.modify_host ();
2734  for (size_t i = 0; i < numAlphas; ++i) {
2735  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2736  }
2737  k_alphas.sync_device ();
2738  // Invoke the scale() overload that takes a device View of coefficients.
2739  this->scale (k_alphas.view_device ());
2740  }
2741 
2742  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2743  void
2745  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2746  {
2747  using Kokkos::ALL;
2748  using Kokkos::subview;
2749 
2750  const size_t lclNumRows = this->getLocalLength ();
2751  const size_t numVecs = this->getNumVectors ();
2752  TEUCHOS_TEST_FOR_EXCEPTION(
2753  static_cast<size_t> (alphas.extent (0)) != numVecs,
2754  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2755  "alphas.extent(0) = " << alphas.extent (0)
2756  << " != this->getNumVectors () = " << numVecs << ".");
2757  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2758  const std::pair<size_t, size_t> colRng (0, numVecs);
2759 
2760  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2761  // type of the return value of subview. This is because if we
2762  // switch the array layout from LayoutLeft to LayoutRight
2763  // (preferred for performance of block operations), the types
2764  // below won't be valid. (A view of a column of a LayoutRight
2765  // multivector has LayoutStride, not LayoutLeft.)
2766 
2767  // If we need sync to device, then host has the most recent version.
2768  const bool useHostVersion = this->need_sync_device ();
2769  if (useHostVersion) {
2770  // Work in host memory. This means we need to create a host
2771  // mirror of the input View of coefficients.
2772  auto alphas_h = Kokkos::create_mirror_view (alphas);
2773  Kokkos::deep_copy (alphas_h, alphas);
2774 
2775  auto Y_lcl = subview (this->getLocalViewHost(Access::ReadWrite), rowRng, ALL ());
2776  if (isConstantStride ()) {
2777  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2778  }
2779  else {
2780  for (size_t k = 0; k < numVecs; ++k) {
2781  const size_t Y_col = this->isConstantStride () ? k :
2782  this->whichVectors_[k];
2783  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2784  // We don't have to use the entire 1-D View here; we can use
2785  // the version that takes a scalar coefficient.
2786  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2787  }
2788  }
2789  }
2790  else { // Work in device memory, using the input View 'alphas' directly.
2791  auto Y_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
2792  if (isConstantStride ()) {
2793  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2794  }
2795  else {
2796  // FIXME (mfh 15 Mar 2019) We need one coefficient at a time,
2797  // as values on host, so copy them to host. Another approach
2798  // would be to fix scal() so that it takes a 0-D View as the
2799  // second argument.
2800  auto alphas_h = Kokkos::create_mirror_view (alphas);
2801  Kokkos::deep_copy (alphas_h, alphas);
2802 
2803  for (size_t k = 0; k < numVecs; ++k) {
2804  const size_t Y_col = this->isConstantStride () ? k :
2805  this->whichVectors_[k];
2806  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2807  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2808  }
2809  }
2810  }
2811  }
2812 
2813  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2814  void
2816  scale (const Scalar& alpha,
2818  {
2819  using Kokkos::ALL;
2820  using Kokkos::subview;
2821  const char tfecfFuncName[] = "scale: ";
2822 
2823  const size_t lclNumRows = getLocalLength ();
2824  const size_t numVecs = getNumVectors ();
2825 
2826  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2827  lclNumRows != A.getLocalLength (), std::invalid_argument,
2828  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2829  << A.getLocalLength () << ".");
2830  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2831  numVecs != A.getNumVectors (), std::invalid_argument,
2832  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2833  << A.getNumVectors () << ".");
2834 
2835  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2836  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2837  const std::pair<size_t, size_t> colRng (0, numVecs);
2838 
2839  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2840  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2841  auto Y_lcl = subview (Y_lcl_orig, rowRng, ALL ());
2842  auto X_lcl = subview (X_lcl_orig, rowRng, ALL ());
2843 
2844  if (isConstantStride () && A.isConstantStride ()) {
2845  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2846  }
2847  else {
2848  // Make sure that Kokkos only uses the local length for add.
2849  for (size_t k = 0; k < numVecs; ++k) {
2850  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2851  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2852  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2853  auto X_k = subview (X_lcl, ALL (), X_col);
2854 
2855  KokkosBlas::scal (Y_k, theAlpha, X_k);
2856  }
2857  }
2858  }
2859 
2860 
2861 
2862  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2863  void
2866  {
2867  const char tfecfFuncName[] = "reciprocal: ";
2868 
2869  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2870  getLocalLength () != A.getLocalLength (), std::runtime_error,
2871  "MultiVectors do not have the same local length. "
2872  "this->getLocalLength() = " << getLocalLength ()
2873  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2874  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2875  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2876  ": MultiVectors do not have the same number of columns (vectors). "
2877  "this->getNumVectors() = " << getNumVectors ()
2878  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2879 
2880  const size_t numVecs = getNumVectors ();
2881 
2882  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2883  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2884 
2885  if (isConstantStride () && A.isConstantStride ()) {
2886  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2887  }
2888  else {
2889  using Kokkos::ALL;
2890  using Kokkos::subview;
2891  for (size_t k = 0; k < numVecs; ++k) {
2892  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2893  auto vector_k = subview (this_view_dev, ALL (), this_col);
2894  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2895  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2896  KokkosBlas::reciprocal (vector_k, vector_Ak);
2897  }
2898  }
2899  }
2900 
2901  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2902  void
2905  {
2906  const char tfecfFuncName[] = "abs";
2907 
2908  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2909  getLocalLength () != A.getLocalLength (), std::runtime_error,
2910  ": MultiVectors do not have the same local length. "
2911  "this->getLocalLength() = " << getLocalLength ()
2912  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2913  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2914  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2915  ": MultiVectors do not have the same number of columns (vectors). "
2916  "this->getNumVectors() = " << getNumVectors ()
2917  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2918  const size_t numVecs = getNumVectors ();
2919 
2920  auto this_view_dev = this->getLocalViewDevice(Access::ReadWrite);
2921  auto A_view_dev = A.getLocalViewDevice(Access::ReadOnly);
2922 
2923  if (isConstantStride () && A.isConstantStride ()) {
2924  KokkosBlas::abs (this_view_dev, A_view_dev);
2925  }
2926  else {
2927  using Kokkos::ALL;
2928  using Kokkos::subview;
2929 
2930  for (size_t k=0; k < numVecs; ++k) {
2931  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
2932  auto vector_k = subview (this_view_dev, ALL (), this_col);
2933  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
2934  auto vector_Ak = subview (A_view_dev, ALL (), A_col);
2935  KokkosBlas::abs (vector_k, vector_Ak);
2936  }
2937  }
2938  }
2939 
2940  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2941  void
2943  update (const Scalar& alpha,
2945  const Scalar& beta)
2946  {
2947  const char tfecfFuncName[] = "update: ";
2948  using Kokkos::subview;
2949  using Kokkos::ALL;
2950 
2951  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
2952 
2953  const size_t lclNumRows = getLocalLength ();
2954  const size_t numVecs = getNumVectors ();
2955 
2956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2957  lclNumRows != A.getLocalLength (), std::invalid_argument,
2958  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2959  << A.getLocalLength () << ".");
2960  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2961  numVecs != A.getNumVectors (), std::invalid_argument,
2962  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2963  << A.getNumVectors () << ".");
2964 
2965  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2966  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
2967  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2968  const std::pair<size_t, size_t> colRng (0, numVecs);
2969 
2970  auto Y_lcl_orig = this->getLocalViewDevice(Access::ReadWrite);
2971  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2972  auto X_lcl_orig = A.getLocalViewDevice(Access::ReadOnly);
2973  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2974 
2975  // The device memory of *this is about to be modified
2976  if (isConstantStride () && A.isConstantStride ()) {
2977  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
2978  }
2979  else {
2980  // Make sure that Kokkos only uses the local length for add.
2981  for (size_t k = 0; k < numVecs; ++k) {
2982  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2983  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2984  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2985  auto X_k = subview (X_lcl, ALL (), X_col);
2986 
2987  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
2988  }
2989  }
2990  }
2991 
2992  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2993  void
2995  update (const Scalar& alpha,
2997  const Scalar& beta,
2999  const Scalar& gamma)
3000  {
3001  using Kokkos::ALL;
3002  using Kokkos::subview;
3003 
3004  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3005 
3006  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3007 
3008  const size_t lclNumRows = this->getLocalLength ();
3009  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3010  lclNumRows != A.getLocalLength (), std::invalid_argument,
3011  "The input MultiVector A has " << A.getLocalLength () << " local "
3012  "row(s), but this MultiVector has " << lclNumRows << " local "
3013  "row(s).");
3014  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3015  lclNumRows != B.getLocalLength (), std::invalid_argument,
3016  "The input MultiVector B has " << B.getLocalLength () << " local "
3017  "row(s), but this MultiVector has " << lclNumRows << " local "
3018  "row(s).");
3019  const size_t numVecs = getNumVectors ();
3020  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3021  A.getNumVectors () != numVecs, std::invalid_argument,
3022  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3023  "but this MultiVector has " << numVecs << " column(s).");
3024  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3025  B.getNumVectors () != numVecs, std::invalid_argument,
3026  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3027  "but this MultiVector has " << numVecs << " column(s).");
3028 
3029  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3030  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3031  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3032 
3033  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
3034  const std::pair<size_t, size_t> colRng (0, numVecs);
3035 
3036  // Prefer 'auto' over specifying the type explicitly. This avoids
3037  // issues with a subview possibly having a different type than the
3038  // original view.
3039  auto C_lcl = subview (this->getLocalViewDevice(Access::ReadWrite), rowRng, ALL ());
3040  auto A_lcl = subview (A.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3041  auto B_lcl = subview (B.getLocalViewDevice(Access::ReadOnly), rowRng, ALL ());
3042 
3043  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3044  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3045  }
3046  else {
3047  // Some input (or *this) is not constant stride,
3048  // so perform the update one column at a time.
3049  for (size_t k = 0; k < numVecs; ++k) {
3050  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3051  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3052  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3053  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3054  theBeta, subview (B_lcl, rowRng, B_col),
3055  theGamma, subview (C_lcl, rowRng, this_col));
3056  }
3057  }
3058  }
3059 
3060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3061  Teuchos::ArrayRCP<const Scalar>
3063  getData (size_t j) const
3064  {
3065  using Kokkos::ALL;
3066  using IST = impl_scalar_type;
3067  const char tfecfFuncName[] = "getData: ";
3068 
3069  // Any MultiVector method that called the (classic) Kokkos Node's
3070  // viewBuffer or viewBufferNonConst methods always implied a
3071  // device->host synchronization. Thus, we synchronize here as
3072  // well.
3073 
3074  auto hostView = getLocalViewHost(Access::ReadOnly);
3075  const size_t col = isConstantStride () ? j : whichVectors_[j];
3076  auto hostView_j = Kokkos::subview (hostView, ALL (), col);
3077  Teuchos::ArrayRCP<const IST> dataAsArcp =
3078  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3079 
3080  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3081  (static_cast<size_t> (hostView_j.extent (0)) <
3082  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3083  "hostView_j.extent(0) = " << hostView_j.extent (0)
3084  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3085  "Please report this bug to the Tpetra developers.");
3086 
3087  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3088  }
3089 
3090  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3091  Teuchos::ArrayRCP<Scalar>
3093  getDataNonConst (size_t j)
3094  {
3095  using Kokkos::ALL;
3096  using Kokkos::subview;
3097  using IST = impl_scalar_type;
3098  const char tfecfFuncName[] = "getDataNonConst: ";
3099 
3100  auto hostView = getLocalViewHost(Access::ReadWrite);
3101  const size_t col = isConstantStride () ? j : whichVectors_[j];
3102  auto hostView_j = subview (hostView, ALL (), col);
3103  Teuchos::ArrayRCP<IST> dataAsArcp =
3104  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3105 
3106  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3107  (static_cast<size_t> (hostView_j.extent (0)) <
3108  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3109  "hostView_j.extent(0) = " << hostView_j.extent (0)
3110  << " < dataAsArcp.size() = " << dataAsArcp.size () << ". "
3111  "Please report this bug to the Tpetra developers.");
3112 
3113  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3114  }
3115 
3116  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3117  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3119  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3120  {
3121  using Teuchos::RCP;
3123 
3124  // Check whether the index set in cols is contiguous. If it is,
3125  // use the more efficient Range1D version of subCopy.
3126  bool contiguous = true;
3127  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3128  for (size_t j = 1; j < numCopyVecs; ++j) {
3129  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3130  contiguous = false;
3131  break;
3132  }
3133  }
3134  if (contiguous && numCopyVecs > 0) {
3135  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3136  }
3137  else {
3138  RCP<const MV> X_sub = this->subView (cols);
3139  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3140  Y->assign (*X_sub);
3141  return Y;
3142  }
3143  }
3144 
3145  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3146  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3148  subCopy (const Teuchos::Range1D &colRng) const
3149  {
3150  using Teuchos::RCP;
3152 
3153  RCP<const MV> X_sub = this->subView (colRng);
3154  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3155  Y->assign (*X_sub);
3156  return Y;
3157  }
3158 
3159  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3160  size_t
3162  getOrigNumLocalRows () const {
3163  return origView_.extent (0);
3164  }
3165 
3166  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3167  size_t
3169  getOrigNumLocalCols () const {
3170  return origView_.extent (1);
3171  }
3172 
3173  template <class Scalar, class LO, class GO, class Node>
3176  const Teuchos::RCP<const map_type>& subMap,
3177  const local_ordinal_type rowOffset) :
3178  base_type (subMap)
3179  {
3180  using Kokkos::ALL;
3181  using Kokkos::subview;
3182  using Teuchos::outArg;
3183  using Teuchos::RCP;
3184  using Teuchos::rcp;
3185  using Teuchos::reduceAll;
3186  using Teuchos::REDUCE_MIN;
3187  using std::endl;
3189  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3190  const char suffix[] = "Please report this bug to the Tpetra developers.";
3191  int lclGood = 1;
3192  int gblGood = 1;
3193  std::unique_ptr<std::ostringstream> errStrm;
3194  const bool debug = ::Tpetra::Details::Behavior::debug ();
3195  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
3196 
3197  // Be careful to use the input Map's communicator, not X's. The
3198  // idea is that, on return, *this is a subview of X, using the
3199  // input Map.
3200  const auto comm = subMap->getComm ();
3201  TEUCHOS_ASSERT( ! comm.is_null () );
3202  const int myRank = comm->getRank ();
3203 
3204  const LO lclNumRowsBefore = static_cast<LO> (X.getLocalLength ());
3205  const LO numCols = static_cast<LO> (X.getNumVectors ());
3206  const LO newNumRows = static_cast<LO> (subMap->getNodeNumElements ());
3207  if (verbose) {
3208  std::ostringstream os;
3209  os << "Proc " << myRank << ": " << prefix
3210  << "X: {lclNumRows: " << lclNumRowsBefore
3211  << ", origLclNumRows: " << X.getOrigNumLocalRows ()
3212  << ", numCols: " << numCols << "}, "
3213  << "subMap: {lclNumRows: " << newNumRows << "}" << endl;
3214  std::cerr << os.str ();
3215  }
3216  // We ask for the _original_ number of rows in X, because X could
3217  // be a shorter (fewer rows) view of a longer MV. (For example, X
3218  // could be a domain Map view of a column Map MV.)
3219  const bool tooManyElts =
3220  newNumRows + rowOffset > static_cast<LO> (X.getOrigNumLocalRows ());
3221  if (tooManyElts) {
3222  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3223  *errStrm << " Proc " << myRank << ": subMap->getNodeNumElements() (="
3224  << newNumRows << ") + rowOffset (=" << rowOffset
3225  << ") > X.getOrigNumLocalRows() (=" << X.getOrigNumLocalRows ()
3226  << ")." << endl;
3227  lclGood = 0;
3228  TEUCHOS_TEST_FOR_EXCEPTION
3229  (! debug && tooManyElts, std::invalid_argument,
3230  prefix << errStrm->str () << suffix);
3231  }
3232 
3233  if (debug) {
3234  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3235  if (gblGood != 1) {
3236  std::ostringstream gblErrStrm;
3237  const std::string myErrStr =
3238  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3239  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3240  TEUCHOS_TEST_FOR_EXCEPTION
3241  (true, std::invalid_argument, gblErrStrm.str ());
3242  }
3243  }
3244 
3245  using range_type = std::pair<LO, LO>;
3246  const range_type origRowRng
3247  (rowOffset, static_cast<LO> (X.origView_.extent (0)));
3248  const range_type rowRng
3249  (rowOffset, rowOffset + newNumRows);
3250 
3251  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3252  dual_view_type newView = subview (X.view_, rowRng, ALL ());
3253 
3254  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3255  // handling subviews of degenerate Views quite so well. For some
3256  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3257  // 0. We work around by creating a new empty DualView of the
3258  // desired (degenerate) dimensions.
3259  if (newOrigView.extent (0) == 0 &&
3260  newOrigView.extent (1) != X.origView_.extent (1)) {
3261  newOrigView =
3262  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3263  }
3264  if (newView.extent (0) == 0 &&
3265  newView.extent (1) != X.view_.extent (1)) {
3266  newView =
3267  allocDualView<Scalar, LO, GO, Node> (0, X.getNumVectors ());
3268  }
3269 
3270  MV subViewMV = X.isConstantStride () ?
3271  MV (subMap, newView, newOrigView) :
3272  MV (subMap, newView, newOrigView, X.whichVectors_ ());
3273 
3274  subViewMV.owningView_ = X.owningView_;
3275 
3276  if (debug) {
3277  const LO lclNumRowsRet = static_cast<LO> (subViewMV.getLocalLength ());
3278  const LO numColsRet = static_cast<LO> (subViewMV.getNumVectors ());
3279  if (newNumRows != lclNumRowsRet || numCols != numColsRet) {
3280  lclGood = 0;
3281  if (errStrm.get () == nullptr) {
3282  errStrm = std::unique_ptr<std::ostringstream> (new std::ostringstream);
3283  }
3284  *errStrm << " Proc " << myRank <<
3285  ": subMap.getNodeNumElements(): " << newNumRows <<
3286  ", subViewMV.getLocalLength(): " << lclNumRowsRet <<
3287  ", X.getNumVectors(): " << numCols <<
3288  ", subViewMV.getNumVectors(): " << numColsRet << endl;
3289  }
3290  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
3291  if (gblGood != 1) {
3292  std::ostringstream gblErrStrm;
3293  if (myRank == 0) {
3294  gblErrStrm << prefix << "Returned MultiVector has the wrong local "
3295  "dimensions on one or more processes:" << endl;
3296  }
3297  const std::string myErrStr =
3298  errStrm.get () != nullptr ? errStrm->str () : std::string ("");
3299  ::Tpetra::Details::gathervPrint (gblErrStrm, myErrStr, *comm);
3300  gblErrStrm << suffix << endl;
3301  TEUCHOS_TEST_FOR_EXCEPTION
3302  (true, std::invalid_argument, gblErrStrm.str ());
3303  }
3304  }
3305 
3306  if (verbose) {
3307  std::ostringstream os;
3308  os << "Proc " << myRank << ": " << prefix << "Call op=" << endl;
3309  std::cerr << os.str ();
3310  }
3311 
3312  *this = subViewMV; // shallow copy
3313 
3314  if (verbose) {
3315  std::ostringstream os;
3316  os << "Proc " << myRank << ": " << prefix << "Done!" << endl;
3317  std::cerr << os.str ();
3318  }
3319  }
3320 
3321  template <class Scalar, class LO, class GO, class Node>
3324  const map_type& subMap,
3325  const size_t rowOffset) :
3326  MultiVector (X, Teuchos::RCP<const map_type> (new map_type (subMap)),
3327  static_cast<local_ordinal_type> (rowOffset))
3328  {}
3329 
3330  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3331  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3333  offsetView (const Teuchos::RCP<const map_type>& subMap,
3334  const size_t offset) const
3335  {
3337  return Teuchos::rcp (new MV (*this, *subMap, offset));
3338  }
3339 
3340  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3341  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3343  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3344  const size_t offset)
3345  {
3347  return Teuchos::rcp (new MV (*this, *subMap, offset));
3348  }
3349 
3350  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3351  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3353  subView (const Teuchos::ArrayView<const size_t>& cols) const
3354  {
3355  using Teuchos::Array;
3356  using Teuchos::rcp;
3358 
3359  const size_t numViewCols = static_cast<size_t> (cols.size ());
3360  TEUCHOS_TEST_FOR_EXCEPTION(
3361  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3362  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3363  "contain at least one entry, but cols.size() = " << cols.size ()
3364  << " == 0.");
3365 
3366  // Check whether the index set in cols is contiguous. If it is,
3367  // use the more efficient Range1D version of subView.
3368  bool contiguous = true;
3369  for (size_t j = 1; j < numViewCols; ++j) {
3370  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3371  contiguous = false;
3372  break;
3373  }
3374  }
3375  if (contiguous) {
3376  if (numViewCols == 0) {
3377  // The output MV has no columns, so there is nothing to view.
3378  return rcp (new MV (this->getMap (), numViewCols));
3379  } else {
3380  // Use the more efficient contiguous-index-range version.
3381  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3382  }
3383  }
3384 
3385  if (isConstantStride ()) {
3386  return rcp (new MV (this->getMap (), view_, origView_, cols));
3387  }
3388  else {
3389  Array<size_t> newcols (cols.size ());
3390  for (size_t j = 0; j < numViewCols; ++j) {
3391  newcols[j] = whichVectors_[cols[j]];
3392  }
3393  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3394  }
3395  }
3396 
3397 
3398  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3399  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3401  subView (const Teuchos::Range1D& colRng) const
3402  {
3403  using ::Tpetra::Details::Behavior;
3404  using Kokkos::ALL;
3405  using Kokkos::subview;
3406  using Teuchos::Array;
3407  using Teuchos::RCP;
3408  using Teuchos::rcp;
3410  const char tfecfFuncName[] = "subView(Range1D): ";
3411 
3412  const size_t lclNumRows = this->getLocalLength ();
3413  const size_t numVecs = this->getNumVectors ();
3414  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3415  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3416  // "at least one vector.");
3417  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3418  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3419  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3420  << numVecs << ".");
3421  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3422  numVecs != 0 && colRng.size () != 0 &&
3423  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3424  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3425  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3426  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3427  "[0, " << numVecs << "].");
3428 
3429  RCP<const MV> X_ret; // the MultiVector subview to return
3430 
3431  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3432  // broken for the case of views with zero rows. I will brutally
3433  // enforce that the subview has the correct dimensions. In
3434  // particular, in the case of zero rows, I will, if necessary,
3435  // create a new dual_view_type with zero rows and the correct
3436  // number of columns. In a debug build, I will use an all-reduce
3437  // to ensure that it has the correct dimensions on all processes.
3438 
3439  const std::pair<size_t, size_t> rows (0, lclNumRows);
3440  if (colRng.size () == 0) {
3441  const std::pair<size_t, size_t> cols (0, 0); // empty range
3442  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3443  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3444  }
3445  else {
3446  // Returned MultiVector is constant stride only if *this is.
3447  if (isConstantStride ()) {
3448  const std::pair<size_t, size_t> cols (colRng.lbound (),
3449  colRng.ubound () + 1);
3450  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3451  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3452  }
3453  else {
3454  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3455  // We're only asking for one column, so the result does have
3456  // constant stride, even though this MultiVector does not.
3457  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3458  whichVectors_[0] + colRng.ubound () + 1);
3459  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3460  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3461  }
3462  else {
3463  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3464  whichVectors_.begin () + colRng.ubound () + 1);
3465  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3466  }
3467  }
3468  }
3469 
3470  const bool debug = Behavior::debug ();
3471  if (debug) {
3472  using Teuchos::Comm;
3473  using Teuchos::outArg;
3474  using Teuchos::REDUCE_MIN;
3475  using Teuchos::reduceAll;
3476 
3477  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
3478  Teuchos::null : this->getMap ()->getComm ();
3479  if (! comm.is_null ()) {
3480  int lclSuccess = 1;
3481  int gblSuccess = 1;
3482 
3483  if (X_ret.is_null ()) {
3484  lclSuccess = 0;
3485  }
3486  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3487  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3488  (lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3489  "MultiVector; the return value of this method) is null on some MPI "
3490  "process in this MultiVector's communicator. This should never "
3491  "happen. Please report this bug to the Tpetra developers.");
3492  if (! X_ret.is_null () &&
3493  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3494  lclSuccess = 0;
3495  }
3496  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess,
3497  outArg (gblSuccess));
3498  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3499  (lclSuccess != 1, std::logic_error, "X_ret->getNumVectors() != "
3500  "colRng.size(), on at least one MPI process in this MultiVector's "
3501  "communicator. This should never happen. "
3502  "Please report this bug to the Tpetra developers.");
3503  }
3504  }
3505  return X_ret;
3506  }
3507 
3508 
3509  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3510  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3512  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3513  {
3515  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3516  }
3517 
3518 
3519  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3520  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3522  subViewNonConst (const Teuchos::Range1D &colRng)
3523  {
3525  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3526  }
3527 
3528 
3529  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3532  const size_t j)
3533  : base_type (X.getMap ())
3534  {
3535  using Kokkos::subview;
3536  typedef std::pair<size_t, size_t> range_type;
3537  const char tfecfFuncName[] = "MultiVector(const MultiVector&, const size_t): ";
3538 
3539  const size_t numCols = X.getNumVectors ();
3540  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3541  (j >= numCols, std::invalid_argument, "Input index j (== " << j
3542  << ") exceeds valid column index range [0, " << numCols << " - 1].");
3543  const size_t jj = X.isConstantStride () ?
3544  static_cast<size_t> (j) :
3545  static_cast<size_t> (X.whichVectors_[j]);
3546  this->view_ = takeSubview (X.view_, Kokkos::ALL (), range_type (jj, jj+1));
3547  this->origView_ = X.origView_;
3548  this->owningView_ = X.owningView_;
3549 
3550  // mfh 31 Jul 2017: It would be unwise to execute concurrent
3551  // Export or Import operations with different subviews of a
3552  // MultiVector. Thus, it is safe to reuse communication buffers.
3553  // See #1560 discussion.
3554  //
3555  // We only need one column's worth of buffer for imports_ and
3556  // exports_. Taking subviews now ensures that their lengths will
3557  // be exactly what we need, so we won't have to resize them later.
3558  {
3559  const size_t newSize = X.imports_.extent (0) / numCols;
3560  const size_t offset = jj*newSize;
3561  auto newImports = X.imports_;
3562  newImports.d_view = subview (X.imports_.d_view,
3563  range_type (offset, offset+newSize));
3564  newImports.h_view = subview (X.imports_.h_view,
3565  range_type (offset, offset+newSize));
3566  this->imports_ = newImports;
3567  }
3568  {
3569  const size_t newSize = X.exports_.extent (0) / numCols;
3570  const size_t offset = jj*newSize;
3571  auto newExports = X.exports_;
3572  newExports.d_view = subview (X.exports_.d_view,
3573  range_type (offset, offset+newSize));
3574  newExports.h_view = subview (X.exports_.h_view,
3575  range_type (offset, offset+newSize));
3576  this->exports_ = newExports;
3577  }
3578  // These two DualViews already either have the right number of
3579  // entries, or zero entries. This means that we don't need to
3580  // resize them.
3583  }
3584 
3585 
3586  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3587  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3589  getVector (const size_t j) const
3590  {
3592  return Teuchos::rcp (new V (*this, j));
3593  }
3594 
3595 
3596  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3597  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
3599  getVectorNonConst (const size_t j)
3600  {
3602  return Teuchos::rcp (new V (*this, j));
3603  }
3604 
3605 
3606  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3607  void
3609  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3610  {
3611  using IST = impl_scalar_type;
3612  using input_view_type = Kokkos::View<IST**, Kokkos::LayoutLeft,
3613  Kokkos::HostSpace,
3614  Kokkos::MemoryUnmanaged>;
3615  const char tfecfFuncName[] = "get1dCopy: ";
3616 
3617  const size_t numRows = this->getLocalLength ();
3618  const size_t numCols = this->getNumVectors ();
3619 
3620  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3621  (LDA < numRows, std::runtime_error,
3622  "LDA = " << LDA << " < numRows = " << numRows << ".");
3623  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3624  (numRows > size_t (0) && numCols > size_t (0) &&
3625  size_t (A.size ()) < LDA * (numCols - 1) + numRows,
3626  std::runtime_error,
3627  "A.size() = " << A.size () << ", but its size must be at least "
3628  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3629 
3630  const std::pair<size_t, size_t> rowRange (0, numRows);
3631  const std::pair<size_t, size_t> colRange (0, numCols);
3632 
3633  input_view_type A_view_orig (reinterpret_cast<IST*> (A.getRawPtr ()),
3634  LDA, numCols);
3635  auto A_view = Kokkos::subview (A_view_orig, rowRange, colRange);
3636 
3648  if (this->need_sync_host() && this->need_sync_device()) {
3651  throw std::runtime_error("Tpetra::MultiVector: A non-const view is alive outside and we cannot give a copy where host or device view can be modified outside");
3652  }
3653  else {
3654  const bool useHostView = owningView_.h_view.use_count() >= owningView_.d_view.use_count();
3655  if (this->isConstantStride ()) {
3656  if (useHostView) {
3657  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3658  Kokkos::deep_copy (A_view, srcView_host);
3659  } else {
3660  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3661  Kokkos::deep_copy (A_view, srcView_device);
3662  }
3663  }
3664  else {
3665  for (size_t j = 0; j < numCols; ++j) {
3666  const size_t srcCol = this->whichVectors_[j];
3667  auto dstColView = Kokkos::subview (A_view, rowRange, j);
3668 
3669  if (useHostView) {
3670  auto srcView_host = this->getLocalViewHost(Access::ReadOnly);
3671  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
3672  Kokkos::deep_copy (dstColView, srcColView_host);
3673  } else {
3674  auto srcView_device = this->getLocalViewDevice(Access::ReadOnly);
3675  auto srcColView_device = Kokkos::subview (srcView_device, rowRange, srcCol);
3676  Kokkos::deep_copy (dstColView, srcColView_device);
3677  }
3678  }
3679  }
3680  }
3681  }
3682 
3683 
3684  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3685  void
3687  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3688  {
3690  const char tfecfFuncName[] = "get2dCopy: ";
3691  const size_t numRows = this->getLocalLength ();
3692  const size_t numCols = this->getNumVectors ();
3693 
3694  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3695  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3696  std::runtime_error, "Input array of pointers must contain as many "
3697  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3698  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3699 
3700  if (numRows != 0 && numCols != 0) {
3701  // No side effects until we've validated the input.
3702  for (size_t j = 0; j < numCols; ++j) {
3703  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3704  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3705  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3706  "the input array of arrays is not long enough to fit all entries in "
3707  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3708  << " < getLocalLength() = " << numRows << ".");
3709  }
3710 
3711  // We've validated the input, so it's safe to start copying.
3712  for (size_t j = 0; j < numCols; ++j) {
3713  Teuchos::RCP<const V> X_j = this->getVector (j);
3714  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3715  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3716  }
3717  }
3718  }
3719 
3720 
3721  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3722  Teuchos::ArrayRCP<const Scalar>
3724  get1dView () const
3725  {
3726  if (getLocalLength () == 0 || getNumVectors () == 0) {
3727  return Teuchos::null;
3728  } else {
3729  TEUCHOS_TEST_FOR_EXCEPTION(
3730  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3731  "get1dView: This MultiVector does not have constant stride, so it is "
3732  "not possible to view its data as a single array. You may check "
3733  "whether a MultiVector has constant stride by calling "
3734  "isConstantStride().");
3735  // Since get1dView() is and was always marked const, I have to
3736  // cast away const here in order not to break backwards
3737  // compatibility.
3738  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3739  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3740  Kokkos::Compat::persistingView (X_lcl);
3741  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3742  }
3743  }
3744 
3745  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3746  Teuchos::ArrayRCP<Scalar>
3749  {
3750  if (this->getLocalLength () == 0 || this->getNumVectors () == 0) {
3751  return Teuchos::null;
3752  }
3753  else {
3754  TEUCHOS_TEST_FOR_EXCEPTION
3755  (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3756  "get1dViewNonConst: This MultiVector does not have constant stride, "
3757  "so it is not possible to view its data as a single array. You may "
3758  "check whether a MultiVector has constant stride by calling "
3759  "isConstantStride().");
3760  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3761  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3762  Kokkos::Compat::persistingView (X_lcl);
3763  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3764  }
3765  }
3766 
3767  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3768  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
3771  {
3772  auto X_lcl = getLocalViewHost(Access::ReadWrite);
3773 
3774  // Don't use the row range here on the outside, in order to avoid
3775  // a strided return type (in case Kokkos::subview is conservative
3776  // about that). Instead, use the row range for the column views
3777  // in the loop.
3778  const size_t myNumRows = this->getLocalLength ();
3779  const size_t numCols = this->getNumVectors ();
3780  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3781 
3782  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
3783  for (size_t j = 0; j < numCols; ++j) {
3784  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3785  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3786  Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
3787  Kokkos::Compat::persistingView (X_lcl_j);
3788  views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
3789  }
3790  return views;
3791  }
3792 
3793  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3796  getLocalViewHost(Access::ReadOnlyStruct) const
3797  {
3798  //returning dual_view_type::t_host::const_type
3799  if (owningView_.d_view.use_count() > owningView_.h_view.use_count()) {
3800  const bool debug = Details::Behavior::debug();
3801  const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3802  " host while a device view is alive";
3803  if (debug) {
3804  std::cout << "Rank " << this->getMap()->getComm()->getRank()
3805  << ": " << msg << std::endl;
3806  }
3807  throw std::runtime_error(msg);
3808  }
3809  owningView_.sync_host();
3810  return view_.view_host();
3811  }
3812 
3813  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3816  getLocalViewHost(Access::ReadWriteStruct)
3817  {
3818  //returning dual_view_type::t_host::type
3819  if (owningView_.d_view.use_count() > owningView_.h_view.use_count()) {
3820  const bool debug = Details::Behavior::debug();
3821  const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3822  " host while a device view is alive";
3823  if (debug) {
3824  std::cout << "Rank " << this->getMap()->getComm()->getRank()
3825  << ": " << msg << std::endl;
3826  }
3827  throw std::runtime_error(msg);
3828  }
3829  owningView_.sync_host();
3830  owningView_.modify_host();
3831  return view_.view_host();
3832  }
3833 
3834  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3837  getLocalViewHost(Access::OverwriteAllStruct)
3838  {
3839  //returning dual_view_type::t_host::type
3840  if (owningView_.h_view != view_.h_view) {
3841  // view is a subview of owningView_; for safety, need to do ReadWrite
3842  return getLocalViewHost(Access::ReadWrite);
3843  }
3844 
3845  if (owningView_.d_view.use_count() > owningView_.h_view.use_count()) {
3846  const bool debug = Details::Behavior::debug();
3847  const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3848  " host while a device view is alive";
3849  if (debug) {
3850  std::cout << "Rank " << this->getMap()->getComm()->getRank()
3851  << ": " << msg << std::endl;
3852  }
3853  throw std::runtime_error(msg);
3854  }
3855  owningView_.clear_sync_state();
3856  owningView_.modify_host();
3857  return view_.view_host();
3858  }
3859 
3860  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3863  getLocalViewDevice(Access::ReadOnlyStruct) const
3864  {
3865  //returning dual_view_type::t_dev::const_type
3866  if (owningView_.h_view.use_count() > owningView_.d_view.use_count()) {
3867  const bool debug = Details::Behavior::debug();
3868  const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3869  " device while a host view is alive";
3870  if (debug) {
3871  std::cout << "Rank " << this->getMap()->getComm()->getRank()
3872  << ": " << msg << std::endl;
3873  }
3874  throw std::runtime_error(msg);
3875  }
3876  owningView_.sync_device();
3877  return view_.view_device();
3878  }
3879 
3880  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3883  getLocalViewDevice(Access::ReadWriteStruct)
3884  {
3885  //returning dual_view_type::t_dev::type
3886  if(owningView_.h_view.use_count() > owningView_.d_view.use_count()) {
3887  const bool debug = Details::Behavior::debug();
3888  const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3889  " device while a host view is alive";
3890  if (debug) {
3891  std::cout << "Rank " << this->getMap()->getComm()->getRank()
3892  << ": " << msg << std::endl;
3893  }
3894  throw std::runtime_error(msg);
3895  }
3896  owningView_.sync_device();
3897  owningView_.modify_device();
3898  return view_.view_device();
3899  }
3900 
3901  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3904  getLocalViewDevice(Access::OverwriteAllStruct)
3905  {
3906  //returning dual_view_type::t_dev::type
3907  if (owningView_.h_view != view_.h_view) {
3908  // view_ is a subview of owningView_; for safety, need to do ReadWrite
3909  return getLocalViewDevice(Access::ReadWrite);
3910  }
3911 
3912  if (owningView_.h_view.use_count() > owningView_.d_view.use_count()) {
3913  const bool debug = Details::Behavior::debug();
3914  const char msg[] = "Tpetra::MultiVector: Cannot access data on "
3915  " device while a host view is alive";
3916  if (debug) {
3917  std::cout << "Rank " << this->getMap()->getComm()->getRank()
3918  << ": " << msg << std::endl;
3919  }
3920  throw std::runtime_error(msg);
3921  }
3922  owningView_.clear_sync_state();
3923  owningView_.modify_device();
3924  return view_.view_device();
3925  }
3926 
3927  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3928  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
3930  get2dView () const
3931  {
3932  // Since get2dView() is and was always marked const, I have to
3933  // cast away const here in order not to break backwards
3934  // compatibility.
3935  auto X_lcl = getLocalViewHost(Access::ReadOnly);
3936 
3937  // Don't use the row range here on the outside, in order to avoid
3938  // a strided return type (in case Kokkos::subview is conservative
3939  // about that). Instead, use the row range for the column views
3940  // in the loop.
3941  const size_t myNumRows = this->getLocalLength ();
3942  const size_t numCols = this->getNumVectors ();
3943  const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
3944 
3945  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
3946  for (size_t j = 0; j < numCols; ++j) {
3947  const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
3948  auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
3949  Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
3950  Kokkos::Compat::persistingView (X_lcl_j);
3951  views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
3952  }
3953  return views;
3954  }
3955 
3956  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3957  void
3959  multiply (Teuchos::ETransp transA,
3960  Teuchos::ETransp transB,
3961  const Scalar& alpha,
3964  const Scalar& beta)
3965  {
3966  using ::Tpetra::Details::ProfilingRegion;
3967  using Teuchos::CONJ_TRANS;
3968  using Teuchos::NO_TRANS;
3969  using Teuchos::TRANS;
3970  using Teuchos::RCP;
3971  using Teuchos::rcp;
3972  using Teuchos::rcpFromRef;
3973  using std::endl;
3974  using ATS = Kokkos::ArithTraits<impl_scalar_type>;
3975  using LO = local_ordinal_type;
3976  using STS = Teuchos::ScalarTraits<Scalar>;
3978  const char tfecfFuncName[] = "multiply: ";
3979  ProfilingRegion region ("Tpetra::MV::multiply");
3980 
3981  // This routine performs a variety of matrix-matrix multiply
3982  // operations, interpreting the MultiVector (this-aka C , A and B)
3983  // as 2D matrices. Variations are due to the fact that A, B and C
3984  // can be locally replicated or globally distributed MultiVectors
3985  // and that we may or may not operate with the transpose of A and
3986  // B. Possible cases are:
3987  //
3988  // Operations # Cases Notes
3989  // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
3990  // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
3991  // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
3992  //
3993  // The following operations are not meaningful for 1-D
3994  // distributions:
3995  //
3996  // u1) C(local) = A^T(distr) * B^T(distr) 1
3997  // u2) C(local) = A (distr) * B^X(distr) 2
3998  // u3) C(distr) = A^X(local) * B^X(local) 4
3999  // u4) C(distr) = A^X(local) * B^X(distr) 4
4000  // u5) C(distr) = A^T(distr) * B^X(local) 2
4001  // u6) C(local) = A^X(distr) * B^X(local) 4
4002  // u7) C(distr) = A^X(distr) * B^X(local) 4
4003  // u8) C(local) = A^X(local) * B^X(distr) 4
4004  //
4005  // Total number of cases: 32 (= 2^5).
4006 
4007  impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below
4008 
4009  const bool A_is_local = ! A.isDistributed ();
4010  const bool B_is_local = ! B.isDistributed ();
4011  const bool C_is_local = ! this->isDistributed ();
4012 
4013  // In debug mode, check compatibility of local dimensions. We
4014  // only do this in debug mode, since it requires an all-reduce
4015  // to ensure correctness on all processses. It's entirely
4016  // possible that only some processes may have incompatible local
4017  // dimensions. Throwing an exception only on those processes
4018  // could cause this method to hang.
4019  const bool debug = ::Tpetra::Details::Behavior::debug ();
4020  if (debug) {
4021  auto myMap = this->getMap ();
4022  if (! myMap.is_null () && ! myMap->getComm ().is_null ()) {
4023  using Teuchos::REDUCE_MIN;
4024  using Teuchos::reduceAll;
4025  using Teuchos::outArg;
4026 
4027  auto comm = myMap->getComm ();
4028  const size_t A_nrows =
4029  (transA != NO_TRANS) ? A.getNumVectors () : A.getLocalLength ();
4030  const size_t A_ncols =
4031  (transA != NO_TRANS) ? A.getLocalLength () : A.getNumVectors ();
4032  const size_t B_nrows =
4033  (transB != NO_TRANS) ? B.getNumVectors () : B.getLocalLength ();
4034  const size_t B_ncols =
4035  (transB != NO_TRANS) ? B.getLocalLength () : B.getNumVectors ();
4036 
4037  const bool lclBad = this->getLocalLength () != A_nrows ||
4038  this->getNumVectors () != B_ncols || A_ncols != B_nrows;
4039 
4040  const int myRank = comm->getRank ();
4041  std::ostringstream errStrm;
4042  if (this->getLocalLength () != A_nrows) {
4043  errStrm << "Proc " << myRank << ": this->getLocalLength()="
4044  << this->getLocalLength () << " != A_nrows=" << A_nrows
4045  << "." << std::endl;
4046  }
4047  if (this->getNumVectors () != B_ncols) {
4048  errStrm << "Proc " << myRank << ": this->getNumVectors()="
4049  << this->getNumVectors () << " != B_ncols=" << B_ncols
4050  << "." << std::endl;
4051  }
4052  if (A_ncols != B_nrows) {
4053  errStrm << "Proc " << myRank << ": A_ncols="
4054  << A_ncols << " != B_nrows=" << B_nrows
4055  << "." << std::endl;
4056  }
4057 
4058  const int lclGood = lclBad ? 0 : 1;
4059  int gblGood = 0;
4060  reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));
4061  if (gblGood != 1) {
4062  std::ostringstream os;
4063  ::Tpetra::Details::gathervPrint (os, errStrm.str (), *comm);
4064 
4065  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4066  (true, std::runtime_error, "Inconsistent local dimensions on at "
4067  "least one process in this object's communicator." << std::endl
4068  << "Operation: "
4069  << "C(" << (C_is_local ? "local" : "distr") << ") = "
4070  << alpha << "*A"
4071  << (transA == Teuchos::TRANS ? "^T" :
4072  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4073  << "(" << (A_is_local ? "local" : "distr") << ") * "
4074  << beta << "*B"
4075  << (transA == Teuchos::TRANS ? "^T" :
4076  (transA == Teuchos::CONJ_TRANS ? "^H" : ""))
4077  << "(" << (B_is_local ? "local" : "distr") << ")." << std::endl
4078  << "Global dimensions: C(" << this->getGlobalLength () << ", "
4079  << this->getNumVectors () << "), A(" << A.getGlobalLength ()
4080  << ", " << A.getNumVectors () << "), B(" << B.getGlobalLength ()
4081  << ", " << B.getNumVectors () << ")." << std::endl
4082  << os.str ());
4083  }
4084  }
4085  }
4086 
4087  // Case 1: C(local) = A^X(local) * B^X(local)
4088  const bool Case1 = C_is_local && A_is_local && B_is_local;
4089  // Case 2: C(local) = A^T(distr) * B (distr)
4090  const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
4091  transA != NO_TRANS &&
4092  transB == NO_TRANS;
4093  // Case 3: C(distr) = A (distr) * B^X(local)
4094  const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
4095  transA == NO_TRANS;
4096 
4097  // Test that we are considering a meaningful case
4098  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4099  (! Case1 && ! Case2 && ! Case3, std::runtime_error,
4100  "Multiplication of op(A) and op(B) into *this is not a "
4101  "supported use case.");
4102 
4103  if (beta != STS::zero () && Case2) {
4104  // If Case2, then C is local and contributions must be summed
4105  // across all processes. However, if beta != 0, then accumulate
4106  // beta*C into the sum. When summing across all processes, we
4107  // only want to accumulate this once, so set beta == 0 on all
4108  // processes except Process 0.
4109  const int myRank = this->getMap ()->getComm ()->getRank ();
4110  if (myRank != 0) {
4111  beta_local = ATS::zero ();
4112  }
4113  }
4114 
4115  // We only know how to do matrix-matrix multiplies if all the
4116  // MultiVectors have constant stride. If not, we have to make
4117  // temporary copies of those MultiVectors (including possibly
4118  // *this) that don't have constant stride.
4119  RCP<MV> C_tmp;
4120  if (! isConstantStride ()) {
4121  C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
4122  } else {
4123  C_tmp = rcp (this, false);
4124  }
4125 
4126  RCP<const MV> A_tmp;
4127  if (! A.isConstantStride ()) {
4128  A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
4129  } else {
4130  A_tmp = rcpFromRef (A);
4131  }
4132 
4133  RCP<const MV> B_tmp;
4134  if (! B.isConstantStride ()) {
4135  B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
4136  } else {
4137  B_tmp = rcpFromRef (B);
4138  }
4139 
4140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4141  (! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
4142  ! A_tmp->isConstantStride (), std::logic_error,
4143  "Failed to make temporary constant-stride copies of MultiVectors.");
4144 
4145  {
4146  const LO A_lclNumRows = A_tmp->getLocalLength ();
4147  const LO A_numVecs = A_tmp->getNumVectors ();
4148  auto A_lcl = A_tmp->getLocalViewDevice(Access::ReadOnly);
4149  auto A_sub = Kokkos::subview (A_lcl,
4150  std::make_pair (LO (0), A_lclNumRows),
4151  std::make_pair (LO (0), A_numVecs));
4152 
4153 
4154  const LO B_lclNumRows = B_tmp->getLocalLength ();
4155  const LO B_numVecs = B_tmp->getNumVectors ();
4156  auto B_lcl = B_tmp->getLocalViewDevice(Access::ReadOnly);
4157  auto B_sub = Kokkos::subview (B_lcl,
4158  std::make_pair (LO (0), B_lclNumRows),
4159  std::make_pair (LO (0), B_numVecs));
4160 
4161  const LO C_lclNumRows = C_tmp->getLocalLength ();
4162  const LO C_numVecs = C_tmp->getNumVectors ();
4163 
4164  auto C_lcl = C_tmp->getLocalViewDevice(Access::ReadWrite);
4165  auto C_sub = Kokkos::subview (C_lcl,
4166  std::make_pair (LO (0), C_lclNumRows),
4167  std::make_pair (LO (0), C_numVecs));
4168  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
4169  (transA == Teuchos::TRANS ? 'T' : 'C'));
4170  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
4171  (transB == Teuchos::TRANS ? 'T' : 'C'));
4172  const impl_scalar_type alpha_IST (alpha);
4173 
4174  ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
4175 
4176  KokkosBlas::gemm (&ctransA, &ctransB, alpha_IST, A_sub, B_sub,
4177  beta_local, C_sub);
4178  }
4179 
4180  if (! isConstantStride ()) {
4181  ::Tpetra::deep_copy (*this, *C_tmp); // Copy the result back into *this.
4182  }
4183 
4184  // Dispose of (possibly) extra copies of A and B.
4185  A_tmp = Teuchos::null;
4186  B_tmp = Teuchos::null;
4187 
4188  // If Case 2 then sum up *this and distribute it to all processes.
4189  if (Case2) {
4190  this->reduce ();
4191  }
4192  }
4193 
4194  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4195  void
4197  elementWiseMultiply (Scalar scalarAB,
4200  Scalar scalarThis)
4201  {
4202  using Kokkos::ALL;
4203  using Kokkos::subview;
4204  const char tfecfFuncName[] = "elementWiseMultiply: ";
4205 
4206  const size_t lclNumRows = this->getLocalLength ();
4207  const size_t numVecs = this->getNumVectors ();
4208 
4209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4210  (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
4211  std::runtime_error, "MultiVectors do not have the same local length.");
4212  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4213  numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
4214  "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
4215  << ".");
4216 
4217  auto this_view = this->getLocalViewDevice(Access::ReadWrite);
4218  auto A_view = A.getLocalViewDevice(Access::ReadOnly);
4219  auto B_view = B.getLocalViewDevice(Access::ReadOnly);
4220 
4221  if (isConstantStride () && B.isConstantStride ()) {
4222  // A is just a Vector; it only has one column, so it always has
4223  // constant stride.
4224  //
4225  // If both *this and B have constant stride, we can do an
4226  // element-wise multiply on all columns at once.
4227  KokkosBlas::mult (scalarThis,
4228  this_view,
4229  scalarAB,
4230  subview (A_view, ALL (), 0),
4231  B_view);
4232  }
4233  else {
4234  for (size_t j = 0; j < numVecs; ++j) {
4235  const size_t C_col = isConstantStride () ? j : whichVectors_[j];
4236  const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
4237  KokkosBlas::mult (scalarThis,
4238  subview (this_view, ALL (), C_col),
4239  scalarAB,
4240  subview (A_view, ALL (), 0),
4241  subview (B_view, ALL (), B_col));
4242  }
4243  }
4244  }
4245 
4246  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4247  void
4249  reduce ()
4250  {
4252  using ::Tpetra::Details::ProfilingRegion;
4253  ProfilingRegion region ("Tpetra::MV::reduce");
4254 
4255  const auto map = this->getMap ();
4256  if (map.get () == nullptr) {
4257  return;
4258  }
4259  const auto comm = map->getComm ();
4260  if (comm.get () == nullptr) {
4261  return;
4262  }
4263 
4264  // Avoid giving device buffers to MPI. Even if MPI handles them
4265  // correctly, doing so may not perform well.
4266  const bool changed_on_device = this->need_sync_host ();
4267  if (changed_on_device) {
4268  // NOTE (mfh 17 Mar 2019) If we ever get rid of UVM, then device
4269  // and host will be separate allocations. In that case, it may
4270  // pay to do the all-reduce from device to host.
4271  Kokkos::fence(); // for UVM getLocalViewDevice is UVM which can be read as host by allReduceView, so we must not read until device is fenced
4272  auto X_lcl = this->getLocalViewDevice(Access::ReadWrite);
4273  allReduceView (X_lcl, X_lcl, *comm);
4274  }
4275  else {
4276  auto X_lcl = this->getLocalViewHost(Access::ReadWrite);
4277  allReduceView (X_lcl, X_lcl, *comm);
4278  }
4279  }
4280 
4281  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4282  void
4284  replaceLocalValue (const LocalOrdinal lclRow,
4285  const size_t col,
4286  const impl_scalar_type& ScalarValue)
4287  {
4289  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4290  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4291  TEUCHOS_TEST_FOR_EXCEPTION(
4292  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4293  std::runtime_error,
4294  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4295  << " is invalid. The range of valid row indices on this process "
4296  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4297  << ", " << maxLocalIndex << "].");
4298  TEUCHOS_TEST_FOR_EXCEPTION(
4299  vectorIndexOutOfRange(col),
4300  std::runtime_error,
4301  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4302  << " of the multivector is invalid.");
4303  }
4304 
4305  auto view = this->getLocalViewHost(Access::ReadWrite);
4306  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4307  view (lclRow, colInd) = ScalarValue;
4308  }
4309 
4310 
4311  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4312  void
4314  sumIntoLocalValue (const LocalOrdinal lclRow,
4315  const size_t col,
4316  const impl_scalar_type& value,
4317  const bool atomic)
4318  {
4320  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4321  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4322  TEUCHOS_TEST_FOR_EXCEPTION(
4323  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4324  std::runtime_error,
4325  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4326  << " is invalid. The range of valid row indices on this process "
4327  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4328  << ", " << maxLocalIndex << "].");
4329  TEUCHOS_TEST_FOR_EXCEPTION(
4330  vectorIndexOutOfRange(col),
4331  std::runtime_error,
4332  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4333  << " of the multivector is invalid.");
4334  }
4335 
4336  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4337 
4338  auto view = this->getLocalViewHost(Access::ReadWrite);
4339  if (atomic) {
4340  Kokkos::atomic_add (& (view(lclRow, colInd)), value);
4341  }
4342  else {
4343  view(lclRow, colInd) += value;
4344  }
4345  }
4346 
4347 
4348  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4349  void
4351  replaceGlobalValue (const GlobalOrdinal gblRow,
4352  const size_t col,
4353  const impl_scalar_type& ScalarValue)
4354  {
4355  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4356  // touches the RCP's reference count, which isn't thread safe.
4357  const LocalOrdinal lclRow = this->map_->getLocalElement (gblRow);
4358 
4360  const char tfecfFuncName[] = "replaceGlobalValue: ";
4361  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4362  (lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4363  std::runtime_error,
4364  "Global row index " << gblRow << " is not present on this process "
4365  << this->getMap ()->getComm ()->getRank () << ".");
4366  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4367  (this->vectorIndexOutOfRange (col), std::runtime_error,
4368  "Vector index " << col << " of the MultiVector is invalid.");
4369  }
4370 
4371  this->replaceLocalValue (lclRow, col, ScalarValue);
4372  }
4373 
4374  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4375  void
4377  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4378  const size_t col,
4379  const impl_scalar_type& value,
4380  const bool atomic)
4381  {
4382  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4383  // touches the RCP's reference count, which isn't thread safe.
4384  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4385 
4387  TEUCHOS_TEST_FOR_EXCEPTION(
4388  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4389  std::runtime_error,
4390  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4391  << " is not present on this process "
4392  << this->getMap ()->getComm ()->getRank () << ".");
4393  TEUCHOS_TEST_FOR_EXCEPTION(
4394  vectorIndexOutOfRange(col),
4395  std::runtime_error,
4396  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4397  << " of the multivector is invalid.");
4398  }
4399 
4400  this->sumIntoLocalValue (lclRow, col, value, atomic);
4401  }
4402 
4403  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4404  template <class T>
4405  Teuchos::ArrayRCP<T>
4407  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
4408  size_t j) const
4409  {
4410  typedef Kokkos::DualView<impl_scalar_type*,
4411  typename dual_view_type::array_layout,
4412  execution_space> col_dual_view_type;
4413  const size_t col = isConstantStride () ? j : whichVectors_[j];
4414  col_dual_view_type X_col =
4415  Kokkos::subview (view_, Kokkos::ALL (), col);
4416  return Kokkos::Compat::persistingView (X_col.d_view);
4417  }
4418 
4419 
4420 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
4421  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4422  void
4424  clear_sync_state () {
4425  owningView_.clear_sync_state ();
4426  }
4427 
4428  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4429  void
4431  sync_host () {
4432  owningView_.sync_host ();
4433  }
4434 
4435  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4436  void
4438  sync_device () {
4439  owningView_.sync_device ();
4440  }
4441 #endif // TPETRA_ENABLE_DEPRECATED_CODE
4442 
4443  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4444  bool
4446  need_sync_host () const {
4447  return owningView_.need_sync_host ();
4448  }
4449 
4450  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4451  bool
4453  need_sync_device () const {
4454  return owningView_.need_sync_device ();
4455  }
4456 
4457 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
4458  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4459  void
4461  modify_device () {
4462  owningView_.modify_device ();
4463  }
4464 
4465  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4466  void
4468  modify_host () {
4469  owningView_.modify_host ();
4470  }
4471 #endif // TPETRA_ENABLE_DEPRECATED_CODE
4472 
4473 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
4474  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4477  getLocalViewDevice () const {
4478  return view_.view_device ();
4479  }
4480 
4481  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4484  getLocalViewHost () const {
4485  return view_.view_host ();
4486  }
4487 #endif // TPETRA_ENABLE_DEPRECATED_CODE
4488 
4489  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4490  std::string
4492  descriptionImpl (const std::string& className) const
4493  {
4494  using Teuchos::TypeNameTraits;
4495 
4496  std::ostringstream out;
4497  out << "\"" << className << "\": {";
4498  out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
4499  << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
4500  << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
4501  << ", Node" << Node::name ()
4502  << "}, ";
4503  if (this->getObjectLabel () != "") {
4504  out << "Label: \"" << this->getObjectLabel () << "\", ";
4505  }
4506  out << ", numRows: " << this->getGlobalLength ();
4507  if (className != "Tpetra::Vector") {
4508  out << ", numCols: " << this->getNumVectors ()
4509  << ", isConstantStride: " << this->isConstantStride ();
4510  }
4511  if (this->isConstantStride ()) {
4512  out << ", columnStride: " << this->getStride ();
4513  }
4514  out << "}";
4515 
4516  return out.str ();
4517  }
4518 
4519  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4520  std::string
4522  description () const
4523  {
4524  return this->descriptionImpl ("Tpetra::MultiVector");
4525  }
4526 
4527  namespace Details
4528  {
4529  template<typename ViewType>
4530  void print_vector(Teuchos::FancyOStream & out, const char* prefix, const ViewType& v)
4531  {
4532  using std::endl;
4533  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace, typename ViewType::memory_space>::accessible,
4534  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a host-accessible view.");
4535  static_assert(ViewType::rank == 2,
4536  "Tpetra::MultiVector::localDescribeToString: Details::print_vector should be given a rank-2 view.");
4537  // The square braces [] and their contents are in Matlab
4538  // format, so users may copy and paste directly into Matlab.
4539  out << "Values("<<prefix<<"): " << std::endl
4540  << "[";
4541  const size_t numRows = v.extent(0);
4542  const size_t numCols = v.extent(1);
4543  if (numCols == 1) {
4544  for (size_t i = 0; i < numRows; ++i) {
4545  out << v(i,0);
4546  if (i + 1 < numRows) {
4547  out << "; ";
4548  }
4549  }
4550  }
4551  else {
4552  for (size_t i = 0; i < numRows; ++i) {
4553  for (size_t j = 0; j < numCols; ++j) {
4554  out << v(i,j);
4555  if (j + 1 < numCols) {
4556  out << ", ";
4557  }
4558  }
4559  if (i + 1 < numRows) {
4560  out << "; ";
4561  }
4562  }
4563  }
4564  out << "]" << endl;
4565  }
4566  }
4567 
4568  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4569  std::string
4571  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
4572  {
4573  typedef LocalOrdinal LO;
4574  using std::endl;
4575 
4576  if (vl <= Teuchos::VERB_LOW) {
4577  return std::string ();
4578  }
4579  auto map = this->getMap ();
4580  if (map.is_null ()) {
4581  return std::string ();
4582  }
4583  auto outStringP = Teuchos::rcp (new std::ostringstream ());
4584  auto outp = Teuchos::getFancyOStream (outStringP);
4585  Teuchos::FancyOStream& out = *outp;
4586  auto comm = map->getComm ();
4587  const int myRank = comm->getRank ();
4588  const int numProcs = comm->getSize ();
4589 
4590  out << "Process " << myRank << " of " << numProcs << ":" << endl;
4591  Teuchos::OSTab tab1 (out);
4592 
4593  // VERB_MEDIUM and higher prints getLocalLength()
4594  const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
4595  out << "Local number of rows: " << lclNumRows << endl;
4596 
4597  if (vl > Teuchos::VERB_MEDIUM) {
4598  // VERB_HIGH and higher prints isConstantStride() and getStride().
4599  // The first is only relevant if the Vector has multiple columns.
4600  if (this->getNumVectors () != static_cast<size_t> (1)) {
4601  out << "isConstantStride: " << this->isConstantStride () << endl;
4602  }
4603  if (this->isConstantStride ()) {
4604  out << "Column stride: " << this->getStride () << endl;
4605  }
4606 
4607  if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
4608  // VERB_EXTREME prints values. If we're not doing univied memory, we'll
4610  // so we can't use our regular accessor functins
4611 
4612  // NOTE: This is an occasion where we do *not* want the auto-sync stuff
4613  // to trigger (since this function is conceptually const)
4614  auto X_dev = view_.view_device();
4615  auto X_host = view_.view_host();
4616 
4617  if(X_dev.data() == X_host.data()) {
4618  // One single allocation
4619  Details::print_vector(out,"unified",X_host);
4620  }
4621  else {
4622  Details::print_vector(out,"host",X_host);
4623  if(X_dev.span_is_contiguous())
4624  {
4625  auto X_dev_on_host = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), X_dev);
4626  Details::print_vector(out,"dev",X_dev_on_host);
4627  }
4628  else
4629  {
4630  auto X_contig = Tpetra::Details::TempView::toLayout<decltype(X_dev), Kokkos::LayoutLeft>(X_dev);
4631  auto X_dev_on_host = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), X_contig);
4632  Details::print_vector(out,"dev",X_dev_on_host);
4633  }
4634  }
4635  }
4636  }
4637  out.flush (); // make sure the ostringstream got everything
4638  return outStringP->str ();
4639  }
4640 
4641  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4642  void
4644  describeImpl (Teuchos::FancyOStream& out,
4645  const std::string& className,
4646  const Teuchos::EVerbosityLevel verbLevel) const
4647  {
4648  using Teuchos::TypeNameTraits;
4649  using Teuchos::VERB_DEFAULT;
4650  using Teuchos::VERB_NONE;
4651  using Teuchos::VERB_LOW;
4652  using std::endl;
4653  const Teuchos::EVerbosityLevel vl =
4654  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4655 
4656  if (vl == VERB_NONE) {
4657  return; // don't print anything
4658  }
4659  // If this Vector's Comm is null, then the Vector does not
4660  // participate in collective operations with the other processes.
4661  // In that case, it is not even legal to call this method. The
4662  // reasonable thing to do in that case is nothing.
4663  auto map = this->getMap ();
4664  if (map.is_null ()) {
4665  return;
4666  }
4667  auto comm = map->getComm ();
4668  if (comm.is_null ()) {
4669  return;
4670  }
4671 
4672  const int myRank = comm->getRank ();
4673 
4674  // Only Process 0 should touch the output stream, but this method
4675  // in general may need to do communication. Thus, we may need to
4676  // preserve the current tab level across multiple "if (myRank ==
4677  // 0) { ... }" inner scopes. This is why we sometimes create
4678  // OSTab instances by pointer, instead of by value. We only need
4679  // to create them by pointer if the tab level must persist through
4680  // multiple inner scopes.
4681  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4682 
4683  // VERB_LOW and higher prints the equivalent of description().
4684  if (myRank == 0) {
4685  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4686  out << "\"" << className << "\":" << endl;
4687  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4688  {
4689  out << "Template parameters:" << endl;
4690  Teuchos::OSTab tab2 (out);
4691  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4692  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4693  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4694  << "Node: " << Node::name () << endl;
4695  }
4696  if (this->getObjectLabel () != "") {
4697  out << "Label: \"" << this->getObjectLabel () << "\", ";
4698  }
4699  out << "Global number of rows: " << this->getGlobalLength () << endl;
4700  if (className != "Tpetra::Vector") {
4701  out << "Number of columns: " << this->getNumVectors () << endl;
4702  }
4703  // getStride() may differ on different processes, so it (and
4704  // isConstantStride()) properly belong to per-process data.
4705  }
4706 
4707  // This is collective over the Map's communicator.
4708  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4709  const std::string lclStr = this->localDescribeToString (vl);
4710  ::Tpetra::Details::gathervPrint (out, lclStr, *comm);
4711  }
4712  }
4713 
4714  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4715  void
4717  describe (Teuchos::FancyOStream &out,
4718  const Teuchos::EVerbosityLevel verbLevel) const
4719  {
4720  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4721  }
4722 
4723  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4724  void
4726  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4727  {
4728  replaceMap (newMap);
4729  }
4730 
4731  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4732  void
4735  {
4736  using ::Tpetra::Details::localDeepCopy;
4737  const char prefix[] = "Tpetra::MultiVector::assign: ";
4738 
4739  TEUCHOS_TEST_FOR_EXCEPTION
4740  (this->getGlobalLength () != src.getGlobalLength () ||
4741  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4742  prefix << "Global dimensions of the two Tpetra::MultiVector "
4743  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4744  << "," << src.getNumVectors () << "], and *this has dimensions ["
4745  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4746  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4747  TEUCHOS_TEST_FOR_EXCEPTION
4748  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4749  prefix << "The local row counts of the two Tpetra::MultiVector "
4750  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4751  << " and *this has " << this->getLocalLength () << " row(s).");
4752 
4753 
4754  // See #1510. We're writing to *this, so we don't care about its
4755  // contents in either memory space, and we don't want
4756  // DualView::modify to complain about "concurrent modification" of
4757  // host and device Views.
4758 
4762 
4763  // If need sync to device, then host has most recent version.
4764  const bool src_last_updated_on_host = src.need_sync_device ();
4765 
4766  if (src_last_updated_on_host) {
4767  localDeepCopy (this->getLocalViewHost(Access::ReadWrite),
4768  src.getLocalViewHost(Access::ReadOnly),
4769  this->isConstantStride (),
4770  src.isConstantStride (),
4771  this->whichVectors_ (),
4772  src.whichVectors_ ());
4773  }
4774  else {
4775  localDeepCopy (this->getLocalViewDevice(Access::ReadWrite),
4776  src.getLocalViewDevice(Access::ReadOnly),
4777  this->isConstantStride (),
4778  src.isConstantStride (),
4779  this->whichVectors_ (),
4780  src.whichVectors_ ());
4781  }
4782  }
4783 
4784  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4785  template<class T>
4786  Teuchos::RCP<MultiVector<T, LocalOrdinal, GlobalOrdinal, Node> >
4788  convert () const
4789  {
4790  using Teuchos::RCP;
4791 
4792  auto newMV = Teuchos::rcp(new MultiVector<T,LocalOrdinal,GlobalOrdinal,Node>(this->getMap(),
4793  this->getNumVectors(),
4794  /*zeroOut=*/false));
4795  Tpetra::deep_copy(*newMV, *this);
4796  return newMV;
4797  }
4798 
4799  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4800  bool
4803  {
4804  using ::Tpetra::Details::PackTraits;
4805 
4806  const size_t l1 = this->getLocalLength();
4807  const size_t l2 = vec.getLocalLength();
4808  if ((l1!=l2) || (this->getNumVectors() != vec.getNumVectors())) {
4809  return false;
4810  }
4811 
4812  return true;
4813  }
4814 
4815  template <class ST, class LO, class GO, class NT>
4818  std::swap(mv.map_, this->map_);
4819  std::swap(mv.view_, this->view_);
4820  std::swap(mv.origView_, this->origView_);
4821  std::swap(mv.owningView_, this->owningView_);
4822  std::swap(mv.whichVectors_, this->whichVectors_);
4823  }
4824 
4825 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4826  template <class ST, class LO, class GO, class NT>
4827  void
4829  const Teuchos::SerialDenseMatrix<int, ST>& src)
4830  {
4831  using ::Tpetra::Details::localDeepCopy;
4832  using MV = MultiVector<ST, LO, GO, NT>;
4833  using IST = typename MV::impl_scalar_type;
4834  using input_view_type =
4835  Kokkos::View<const IST**, Kokkos::LayoutLeft,
4836  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4837  using pair_type = std::pair<LO, LO>;
4838 
4839  const LO numRows = static_cast<LO> (src.numRows ());
4840  const LO numCols = static_cast<LO> (src.numCols ());
4841 
4842  TEUCHOS_TEST_FOR_EXCEPTION
4843  (numRows != static_cast<LO> (dst.getLocalLength ()) ||
4844  numCols != static_cast<LO> (dst.getNumVectors ()),
4845  std::invalid_argument, "Tpetra::deep_copy: On Process "
4846  << dst.getMap ()->getComm ()->getRank () << ", dst is "
4847  << dst.getLocalLength () << " x " << dst.getNumVectors ()
4848  << ", but src is " << numRows << " x " << numCols << ".");
4849 
4850  const IST* src_raw = reinterpret_cast<const IST*> (src.values ());
4851  input_view_type src_orig (src_raw, src.stride (), numCols);
4852  input_view_type src_view =
4853  Kokkos::subview (src_orig, pair_type (0, numRows), Kokkos::ALL ());
4854 
4855  constexpr bool src_isConstantStride = true;
4856  Teuchos::ArrayView<const size_t> srcWhichVectors (nullptr, 0);
4857  localDeepCopy (dst.getLocalViewHost(Access::ReadWrite),
4858  src_view,
4859  dst.isConstantStride (),
4860  src_isConstantStride,
4861  getMultiVectorWhichVectors (dst),
4862  srcWhichVectors);
4863  }
4864 
4865  template <class ST, class LO, class GO, class NT>
4866  void
4867  deep_copy (Teuchos::SerialDenseMatrix<int, ST>& dst,
4868  const MultiVector<ST, LO, GO, NT>& src)
4869  {
4870  using ::Tpetra::Details::localDeepCopy;
4871  using MV = MultiVector<ST, LO, GO, NT>;
4872  using IST = typename MV::impl_scalar_type;
4873  using output_view_type =
4874  Kokkos::View<IST**, Kokkos::LayoutLeft,
4875  Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
4876  using pair_type = std::pair<LO, LO>;
4877 
4878  const LO numRows = static_cast<LO> (dst.numRows ());
4879  const LO numCols = static_cast<LO> (dst.numCols ());
4880 
4881  TEUCHOS_TEST_FOR_EXCEPTION
4882  (numRows != static_cast<LO> (src.getLocalLength ()) ||
4883  numCols != static_cast<LO> (src.getNumVectors ()),
4884  std::invalid_argument, "Tpetra::deep_copy: On Process "
4885  << src.getMap ()->getComm ()->getRank () << ", src is "
4886  << src.getLocalLength () << " x " << src.getNumVectors ()
4887  << ", but dst is " << numRows << " x " << numCols << ".");
4888 
4889  IST* dst_raw = reinterpret_cast<IST*> (dst.values ());
4890  output_view_type dst_orig (dst_raw, dst.stride (), numCols);
4891  auto dst_view =
4892  Kokkos::subview (dst_orig, pair_type (0, numRows), Kokkos::ALL ());
4893 
4894  constexpr bool dst_isConstantStride = true;
4895  Teuchos::ArrayView<const size_t> dstWhichVectors (nullptr, 0);
4896 
4897  // Prefer the host version of src's data.
4898  localDeepCopy (dst_view,
4899  src.getLocalViewHost(Access::ReadOnly),
4900  dst_isConstantStride,
4901  src.isConstantStride (),
4902  dstWhichVectors,
4903  getMultiVectorWhichVectors (src));
4904  }
4905 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4906 
4907  template <class Scalar, class LO, class GO, class NT>
4908  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT> >
4909  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
4910  size_t numVectors)
4911  {
4913  return Teuchos::rcp (new MV (map, numVectors));
4914  }
4915 
4916  template <class ST, class LO, class GO, class NT>
4919  {
4920  typedef MultiVector<ST, LO, GO, NT> MV;
4921  MV cpy (src.getMap (), src.getNumVectors (), false);
4922  cpy.assign (src);
4923  return cpy;
4924  }
4925 
4926 } // namespace Tpetra
4927 
4928 //
4929 // Explicit instantiation macro
4930 //
4931 // Must be expanded from within the Tpetra namespace!
4932 //
4933 
4934 #ifdef HAVE_TPETRACORE_TEUCHOSNUMERICS
4935 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4936  template class MultiVector< SCALAR , LO , GO , NODE >; \
4937  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
4938  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors); \
4939  template void deep_copy (MultiVector<SCALAR, LO, GO, NODE>& dst, const Teuchos::SerialDenseMatrix<int, SCALAR>& src); \
4940  template void deep_copy (Teuchos::SerialDenseMatrix<int, SCALAR>& dst, const MultiVector<SCALAR, LO, GO, NODE>& src);
4941 
4942 #else
4943 # define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
4944  template class MultiVector< SCALAR , LO , GO , NODE >; \
4945  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src);
4946 
4947 #endif // HAVE_TPETRACORE_TEUCHOSNUMERICS
4948 
4949 
4950 #define TPETRA_MULTIVECTOR_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
4951  \
4952  template Teuchos::RCP< MultiVector< SO , LO , GO , NODE > > \
4953  MultiVector< SI , LO , GO , NODE >::convert< SO > () const;
4954 
4955 
4956 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Functions for initializing and finalizing Tpetra.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
All-reduce a 1-D or 2-D Kokkos::View.
Declaration of functions for checking whether a given pointer is accessible from a given Kokkos execu...
Declaration and definition of Tpetra::Details::Blas::fill, an implementation detail of Tpetra::MultiV...
void fill(const ExecutionSpace &execSpace, const ViewType &X, const ValueType &alpha, const IndexType numRows, const IndexType numCols)
Fill the entries of the given 1-D or 2-D Kokkos::View with the given scalar value alpha.
Declaration of a function that prints strings from each process.
Declaration of Tpetra::Details::isInterComm.
Declaration and definition of Tpetra::Details::lclDot, an implementation detail of Tpetra::MultiVecto...
Declaration and definition of Tpetra::Details::normImpl, which is an implementation detail of Tpetra:...
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
Stand-alone utility functions and macros.
static bool assumeMpiIsCudaAware()
Whether to assume that MPI is CUDA aware.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t multivectorKernelLocationThreshold()
the threshold for transitioning from device to host
Base class for distributed Tpetra objects that support data redistribution.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
Kokkos::DualView< size_t *, buffer_device_type > numImportPacketsPerLID_
Number of packets to receive for each receive operation.
Kokkos::DualView< packet_type *, buffer_device_type > exports_
Buffer from which packed data are exported (sent to other processes).
Kokkos::DualView< packet_type *, buffer_device_type > imports_
Buffer into which packed data are imported (received from other processes).
Kokkos::DualView< size_t *, buffer_device_type > numExportPacketsPerLID_
Number of packets to send for each send operation.
Teuchos::RCP< const map_type > map_
The Map over which this object is distributed.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
A parallel distribution of indices over processes.
One or more distributed dense vectors.
void reciprocal(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise reciprocal values of input Multi-vector in target, this(i,j) = 1/A(i,...
dual_view_type origView_
The "original view" of the MultiVector's data.
void normInf(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the infinity-norm of each vector (column), storing the result in a host View.
void get1dCopy(const Teuchos::ArrayView< Scalar > &A, const size_t LDA) const
Fill the given array with a copy of this multivector's local values.
size_t getStride() const
Stride between columns in the multivector.
virtual size_t constantNumberOfPackets() const
Number of packets to send per LID.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
virtual std::string description() const
A simple one-line description of this object.
void reduce()
Sum values of a locally replicated multivector across all processes.
void randomize()
Set all values in the multivector to pseudorandom numbers.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes owning zero rows from the Map and their communicator.
typename map_type::local_ordinal_type local_ordinal_type
The type of local indices that this class uses.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subView(const Teuchos::Range1D &colRng) const
Return a const MultiVector with const views of selected columns.
void dot(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Teuchos::ArrayView< dot_type > &dots) const
Compute the dot product of each corresponding pair of vectors (columns) in A and B.
void describeImpl(Teuchos::FancyOStream &out, const std::string &className, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Implementation of describe() for this class, and its subclass Vector.
Teuchos::RCP< const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetView(const Teuchos::RCP< const map_type > &subMap, const size_t offset) const
Return a const view of a subset of rows.
Teuchos::ArrayRCP< Scalar > get1dViewNonConst()
Nonconst persisting (1-D) view of this multivector's local values.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &src)
Copy the contents of src into *this (deep copy).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
void meanValue(const Teuchos::ArrayView< impl_scalar_type > &means) const
Compute mean (average) value of each column.
std::string descriptionImpl(const std::string &className) const
Implementation of description() for this class, and its subclass Vector.
void multiply(Teuchos::ETransp transA, Teuchos::ETransp transB, const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Scalar &beta)
Matrix-matrix multiplication: this = beta*this + alpha*op(A)*op(B).
bool need_sync_device() const
Whether this MultiVector needs synchronization to the device.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM)
Perform copies and permutations that are local to the calling (MPI) process.
void replaceLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using local (row) index.
void swap(MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &mv)
Swap contents of mv with contents of *this.
size_t getLocalLength() const
Local number of rows on the calling process.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subCopy(const Teuchos::Range1D &colRng) const
Return a MultiVector with copies of selected columns.
typename Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Teuchos::ArrayRCP< const Scalar > getData(size_t j) const
Const view of the local values in a particular vector of this multivector.
bool need_sync_host() const
Whether this MultiVector needs synchronization to the host.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
void norm2(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the two-norm of each vector (column), storing the result in a host View.
typename Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
Teuchos::ArrayRCP< const Scalar > get1dView() const
Const persisting (1-D) view of this multivector's local values.
Teuchos::ArrayRCP< T > getSubArrayRCP(Teuchos::ArrayRCP< T > arr, size_t j) const
Persisting view of j-th column in the given ArrayRCP.
void replaceGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value)
Replace value in host memory, using global row index.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, device_type > dual_view_type
Kokkos::DualView specialization used by this class.
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool areRemoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print the object with the given verbosity level to a FancyOStream.
void elementWiseMultiply(Scalar scalarAB, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarThis)
Multiply a Vector A elementwise by a MultiVector B.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type owningView_
The true original DualView - it owns the memory of this MultiVector, and was not constructed as a sub...
void sumIntoLocalValue(const LocalOrdinal lclRow, const size_t col, const impl_scalar_type &val, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using local row index.
void replaceMap(const Teuchos::RCP< const map_type > &map)
Replace the underlying Map in place.
virtual bool checkSizes(const SrcDistObject &sourceObj)
Whether data redistribution between sourceObj and this object is legal.
Teuchos::RCP< MultiVector< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another MultiVector with the same entries, but converted to a different Scalar type T.
MultiVector()
Default constructor: makes a MultiVector with no rows or columns.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
typename device_type::execution_space execution_space
Type of the (new) Kokkos execution space.
void abs(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A)
Put element-wise absolute values of input Multi-vector in target: A = abs(this)
void norm1(const Kokkos::View< mag_type *, Kokkos::HostSpace > &norms) const
Compute the one-norm of each vector (column), storing the result in a host view.
void get2dCopy(const Teuchos::ArrayView< const Teuchos::ArrayView< Scalar > > &ArrayOfPtrs) const
Fill the given array with a copy of this multivector's local values.
void sumIntoGlobalValue(const GlobalOrdinal gblRow, const size_t col, const impl_scalar_type &value, const bool atomic=useAtomicUpdatesByDefault)
Update (+=) a value in host memory, using global row index.
bool aliases(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &other) const
Whether this multivector's memory might alias other. This is conservative: if either this or other is...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > subViewNonConst(const Teuchos::Range1D &colRng)
Return a MultiVector with views of selected columns.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
Teuchos::ArrayRCP< Scalar > getDataNonConst(size_t j)
View of the local values in a particular vector of this multivector.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getOrigNumLocalCols() const
"Original" number of columns in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< Scalar > > get2dViewNonConst()
Return non-const persisting pointers to values.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > offsetViewNonConst(const Teuchos::RCP< const map_type > &subMap, const size_t offset)
Return a nonconst view of a subset of rows.
std::string localDescribeToString(const Teuchos::EVerbosityLevel vl) const
Print the calling process' verbose describe() information to the returned string.
bool isSameSize(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &vec) const
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
Teuchos::ArrayRCP< Teuchos::ArrayRCP< const Scalar > > get2dView() const
Return const persisting pointers to values.
dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
void update(const Scalar &alpha, const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta)
Update: this = beta*this + alpha*A.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
Implementation details of Tpetra.
int local_ordinal_type
Default value of Scalar template parameter.
void normImpl(MagnitudeType norms[], const Kokkos::View< const ValueType **, ArrayLayout, DeviceType > &X, const EWhichNorm whichNorm, const Teuchos::ArrayView< const size_t > &whichVecs, const bool isConstantStride, const bool isDistributed, const Teuchos::Comm< int > *comm)
Implementation of MultiVector norms.
static void allReduceView(const OutputViewType &output, const InputViewType &input, const Teuchos::Comm< int > &comm)
All-reduce from input Kokkos::View to output Kokkos::View.
EWhichNorm
Input argument for normImpl() (which see).
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
MultiVector< ST, LO, GO, NT > createCopy(const MultiVector< ST, LO, GO, NT > &src)
Return a deep copy of the given MultiVector.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.