Tpetra parallel linear algebra  Version of the Day
Tpetra_MultiVector_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MULTIVECTOR_DEF_HPP
43 #define TPETRA_MULTIVECTOR_DEF_HPP
44 
52 
53 #include "Tpetra_Util.hpp"
54 #include "Tpetra_Vector.hpp"
55 #include "Tpetra_Details_gathervPrint.hpp"
58 #include "Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp"
59 
60 #include "KokkosCompat_View.hpp"
61 #include "Kokkos_MV_GEMM.hpp"
62 #include "Kokkos_Blas1_MV.hpp"
63 #include "Kokkos_Blas1.hpp"
64 #include "Kokkos_Random.hpp"
65 
#ifdef HAVE_TPETRA_INST_FLOAT128
namespace Kokkos {
  // Partial specialization of Kokkos::rand for __float128.
  //
  // FIXME (mfh 04 Sep 2015) Just a stub for now!
  //
  // Each draw() builds its result "double-double" style: one random
  // double supplies the high-order bits, and a second random double,
  // scaled to lie below the least significant bit of the first,
  // supplies extra low-order bits.
  template<class Generator>
  struct rand<Generator, __float128> {
    //! Upper bound of the generator's range.
    static KOKKOS_INLINE_FUNCTION __float128 max ()
    {
      return static_cast<__float128> (1.0);
    }

    //! Draw a random value in [0, 1).
    static KOKKOS_INLINE_FUNCTION __float128
    draw (Generator& gen)
    {
      // Half the smallest normalized double is the scaling factor of
      // the lower-order term in the double-double representation.
      const __float128 loScale =
        static_cast<__float128> (std::numeric_limits<double>::min ()) /
        static_cast<__float128> (2.0);
      const __float128 hiBits = static_cast<__float128> (gen.drand ());
      const __float128 loBits =
        static_cast<__float128> (gen.drand ()) * loScale;
      return hiBits + loBits;
    }

    //! Draw a random value in [0, range).
    static KOKKOS_INLINE_FUNCTION __float128
    draw (Generator& gen, const __float128& range)
    {
      // FIXME (mfh 05 Sep 2015) Not sure if this is right.
      const __float128 loScale =
        static_cast<__float128> (std::numeric_limits<double>::min ()) /
        static_cast<__float128> (2.0);
      const __float128 hiBits =
        static_cast<__float128> (gen.drand (range));
      const __float128 loBits =
        static_cast<__float128> (gen.drand (range)) * loScale;
      return hiBits + loBits;
    }

    //! Draw a random value in [start, end).
    static KOKKOS_INLINE_FUNCTION __float128
    draw (Generator& gen, const __float128& start, const __float128& end)
    {
      // FIXME (mfh 05 Sep 2015) Not sure if this is right.
      const __float128 loScale =
        static_cast<__float128> (std::numeric_limits<double>::min ()) /
        static_cast<__float128> (2.0);
      const __float128 hiBits =
        static_cast<__float128> (gen.drand (start, end));
      const __float128 loBits =
        static_cast<__float128> (gen.drand (start, end)) * loScale;
      return hiBits + loBits;
    }
  };
} // namespace Kokkos
#endif // HAVE_TPETRA_INST_FLOAT128
117 
118 namespace { // (anonymous)
119 
// Allocate and return the Kokkos::DualView that backs a
// Tpetra::MultiVector with the given local dimensions.
// (NOTE(review): the return-type line -- original line 135 -- is
// missing from this listing; presumably it is
// typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type.
// Confirm against the original header.)
//
// lclNumRows: number of rows on the calling process.
// numCols: number of columns (vectors).
// zeroOut: if true, zero-fill the allocation; if false, leave it
//   uninitialized (debug builds fill with NaN instead, to expose
//   reads of uninitialized data).
// allowPadding: if true, let Kokkos pad the allocation's layout.
134  template<class ST, class LO, class GO, class NT>
136  allocDualView (const size_t lclNumRows,
137  const size_t numCols,
138  const bool zeroOut = true,
139  const bool allowPadding = false)
140  {
141  using Kokkos::AllowPadding;
142  using Kokkos::view_alloc;
143  using Kokkos::WithoutInitializing;
144  typedef typename Tpetra::MultiVector<ST, LO, GO, NT>::dual_view_type dual_view_type;
145  typedef typename dual_view_type::t_dev dev_view_type;
146  // This needs to be a string and not a char*, if given as an
147  // argument to Kokkos::view_alloc. This is because view_alloc
148  // also allows a raw pointer as its first argument. See
149  // https://github.com/kokkos/kokkos/issues/434.
150  const std::string label ("MV::DualView");
151 
152  // NOTE (mfh 18 Feb 2015, 12 Apr 2015, 22 Sep 2016) Our separate
153  // creation of the DualView's Views works around
154  // Kokkos::DualView's current inability to accept an
155  // AllocationProperties initial argument (as Kokkos::View does).
156  // However, the work-around is harmless, since it does what the
157  // (currently nonexistent) equivalent DualView constructor would
158  // have done anyway.
159 
// Allocate the device View first; the four branches differ only in
// which allocation properties they pass to view_alloc.
160  dev_view_type d_view;
161  if (zeroOut) {
162  if (allowPadding) {
163  d_view = dev_view_type (view_alloc (label, AllowPadding),
164  lclNumRows, numCols);
165  }
166  else {
167  d_view = dev_view_type (view_alloc (label),
168  lclNumRows, numCols);
169  }
170  }
171  else {
172  if (allowPadding) {
173  d_view = dev_view_type (view_alloc (label,
174  WithoutInitializing,
175  AllowPadding),
176  lclNumRows, numCols);
177  }
178  else {
179  d_view = dev_view_type (view_alloc (label, WithoutInitializing),
180  lclNumRows, numCols);
181  }
182 #ifdef HAVE_TPETRA_DEBUG
183  // Filling with NaN is a cheap and effective way to tell if
184  // downstream code is trying to use a MultiVector's data without
185  // them having been initialized. ArithTraits lets us call nan()
186  // even if the scalar type doesn't define it; it just returns some
187  // undefined value in the latter case. This won't hurt anything
188  // because by setting zeroOut=false, users already agreed that
189  // they don't care about the contents of the MultiVector.
190  const ST nan = Kokkos::Details::ArithTraits<ST>::nan ();
191  KokkosBlas::fill (d_view, nan);
192 #endif // HAVE_TPETRA_DEBUG
193  }
194 #ifdef HAVE_TPETRA_DEBUG
195  TEUCHOS_TEST_FOR_EXCEPTION
196  (static_cast<size_t> (d_view.dimension_0 ()) != lclNumRows ||
197  static_cast<size_t> (d_view.dimension_1 ()) != numCols, std::logic_error,
198  "allocDualView: d_view's dimensions actual dimensions do not match "
199  "requested dimensions. d_view is " << d_view.dimension_0 () << " x " <<
200  d_view.dimension_1 () << "; requested " << lclNumRows << " x " << numCols
201  << ". Please report this bug to the Tpetra developers.");
202 #endif // HAVE_TPETRA_DEBUG
203 
// The host View mirrors the device View; no copy happens here.
204  typename dual_view_type::t_host h_view = Kokkos::create_mirror_view (d_view);
205 
206  dual_view_type dv (d_view, h_view);
207  // Whether or not the user cares about the initial contents of the
208  // MultiVector, the device and host views are out of sync. We
209  // prefer to work in device memory. The way to ensure this
210  // happens is to mark the device view as modified.
211  dv.template modify<typename dev_view_type::memory_space> ();
212 
213  return dv;
214  }
215 
216  // Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
217  //
218  // T: The type of the entries of the View.
219  // ExecSpace: The Kokkos execution space.
220  template<class T, class ExecSpace>
221  struct MakeUnmanagedView {
222  // The 'false' part of the branch carefully ensures that this
223  // won't attempt to use a host execution space that hasn't been
224  // initialized. For example, if Kokkos::OpenMP is disabled and
225  // Kokkos::Threads is enabled, the latter is always the default
226  // execution space of Kokkos::HostSpace, even when ExecSpace is
227  // Kokkos::Serial. That's why we go through the trouble of asking
228  // Kokkos::DualView what _its_ space is. That seems to work
229  // around this default execution space issue.
230  //
231  // NOTE (mfh 29 Jan 2016): See kokkos/kokkos#178 for why we use
232  // a memory space, rather than an execution space, as the first
233  // argument of VerifyExecutionCanAccessMemorySpace.
234  typedef typename Kokkos::Impl::if_c<
235  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
236  typename ExecSpace::memory_space,
237  Kokkos::HostSpace>::value,
238  typename ExecSpace::device_type,
239  typename Kokkos::DualView<T*, ExecSpace>::host_mirror_space>::type host_exec_space;
240  typedef Kokkos::LayoutLeft array_layout;
// The resulting View type: unmanaged (non-owning), 1-D, LayoutLeft,
// living in the host-accessible space chosen above.
241  typedef Kokkos::View<T*, array_layout, host_exec_space,
242  Kokkos::MemoryUnmanaged> view_type;
243 
// Wrap the ArrayView's raw pointer in an unmanaged View.  An empty
// input yields a default-constructed (empty) View rather than a
// View of a possibly-null pointer.
244  static view_type getView (const Teuchos::ArrayView<T>& x_in)
245  {
246  const size_t numEnt = static_cast<size_t> (x_in.size ());
247  if (numEnt == 0) {
248  return view_type ();
249  } else {
250  return view_type (x_in.getRawPtr (), numEnt);
251  }
252  }
253  };
254 
255  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
256  // taking a subview of a 0 x N DualView incorrectly always results
257  // in a 0 x 0 DualView.
258  template<class DualViewType>
259  DualViewType
260  takeSubview (const DualViewType& X,
261 //We will move the ALL_t to the Kokkos namespace eventually, this is a workaround for testing the new View implementation
262 #ifdef KOKKOS_USING_EXPERIMENTAL_VIEW
263  const Kokkos::Experimental::Impl::ALL_t&,
264 #else
265  const Kokkos::ALL&,
266 #endif
267  const std::pair<size_t, size_t>& colRng)
268  {
269  if (X.dimension_0 () == 0 && X.dimension_1 () != 0) {
270  return DualViewType ("MV::DualView", 0, colRng.second - colRng.first);
271  }
272  else {
273  return subview (X, Kokkos::ALL (), colRng);
274  }
275  }
276 
277  // mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
278  // taking a subview of a 0 x N DualView incorrectly always results
279  // in a 0 x 0 DualView.
// mfh 14 Apr 2015: Work-around for bug in Kokkos::subview, where
// taking a subview of a 0 x N DualView incorrectly always results
// in a 0 x 0 DualView.  This overload takes both a row range and a
// column range.
template<class DualViewType>
DualViewType
takeSubview (const DualViewType& X,
             const std::pair<size_t, size_t>& rowRng,
             const std::pair<size_t, size_t>& colRng)
{
  const bool emptyRowsNonzeroCols =
    X.dimension_0 () == 0 && X.dimension_1 () != 0;
  if (emptyRowsNonzeroCols) {
    // Dodge the Kokkos bug: build a fresh 0 x (#requested columns)
    // DualView instead of calling subview.
    const size_t newNumCols = colRng.second - colRng.first;
    return DualViewType ("MV::DualView", 0, newNumCols);
  }
  return subview (X, rowRng, colRng);
}
293 } // namespace (anonymous)
294 
295 
296 namespace Tpetra {
297 
298  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
299  bool
300  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
301  vectorIndexOutOfRange (const size_t VectorIndex) const {
302  return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
303  }
304 
// Default constructor: a MultiVector over a default-constructed
// (empty) Map.
305  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
306  MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic>::
307  MultiVector () :
308  base_type (Teuchos::rcp (new map_type ()))
309  {}
310 
// Constructor: allocate a MultiVector with numVecs columns over the
// given Map, optionally zero-initializing the entries.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 312 -- is missing from this listing.)
311  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
313  MultiVector (const Teuchos::RCP<const map_type>& map,
314  const size_t numVecs,
315  const bool zeroOut) : /* default is true */
316  base_type (map)
317  {
318  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,numVecs,zeroOut)");
319 
320  const size_t lclNumRows = this->getLocalLength ();
// Allocate the backing DualView; origView_ aliases the whole
// allocation (constant stride).
321  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (lclNumRows, numVecs, zeroOut);
322  origView_ = view_;
323  }
324 
// Copy constructor with shallow (view) semantics: shares the
// source's views and its nonconstant-stride column list.
// (NOTE(review): the signature lines -- original lines 326-327 --
// are missing from this listing; presumably
// "MultiVector (const MultiVector& source)".)
325  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
328  base_type (source),
329  view_ (source.view_),
330  origView_ (source.origView_),
331  whichVectors_ (source.whichVectors_)
332  {}
333 
// Two-argument "copy" constructor: Teuchos::Copy requests a deep
// copy; Teuchos::View shares the source's views (the member
// initializers already did that, so the View branch is a no-op).
// (NOTE(review): the signature lines -- original lines 335-336 --
// are missing from this listing.)
334  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
337  const Teuchos::DataAccess copyOrView) :
338  base_type (source),
339  view_ (source.view_),
340  origView_ (source.origView_),
341  whichVectors_ (source.whichVectors_)
342  {
344  const char tfecfFuncName[] = "MultiVector(const MultiVector&, "
345  "const Teuchos::DataAccess): ";
346 
347  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV 2-arg \"copy\" ctor");
348 
349  if (copyOrView == Teuchos::Copy) {
350  // Reuse the conveniently already existing function that creates
351  // a deep copy.
352  MV cpy = createCopy (source);
353  this->view_ = cpy.view_;
354  this->origView_ = cpy.origView_;
355  this->whichVectors_ = cpy.whichVectors_;
356  }
357  else if (copyOrView == Teuchos::View) {
358  }
359  else {
360  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
361  true, std::invalid_argument, "Second argument 'copyOrView' has an "
362  "invalid value " << copyOrView << ". Valid values include "
363  "Teuchos::Copy = " << Teuchos::Copy << " and Teuchos::View = "
364  << Teuchos::View << ".");
365  }
366  }
367 
// Constructor from a Map and an existing DualView (view semantics).
// Validates that the view's column stride is at least the local
// number of rows from the Map.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 369 -- is missing from this listing.)
368  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
370  MultiVector (const Teuchos::RCP<const map_type>& map,
371  const dual_view_type& view) :
372  base_type (map),
373  view_ (view),
374  origView_ (view)
375  {
376  const char tfecfFuncName[] = "MultiVector(map,view): ";
377 
378  // Get stride of view: if second dimension is 0, the
379  // stride might be 0, so take view_dimension instead.
380  size_t stride[8];
381  origView_.stride (stride);
382  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
383  origView_.dimension_0 ();
384  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
385  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
386  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
387  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
388  << ". This may also mean that the input view's first dimension (number "
389  "of rows = " << view.dimension_0 () << ") does not not match the number "
390  "of entries in the Map on the calling process.");
391  }
392 
393 
// Constructor from a Map and a device-only View.  Allocates a host
// mirror (without copying) and marks the device side modified, so
// the next sync copies device -> host.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 395 -- is missing from this listing.)
394  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
396  MultiVector (const Teuchos::RCP<const map_type>& map,
397  const typename dual_view_type::t_dev& d_view) :
398  base_type (map)
399  {
400  using Teuchos::ArrayRCP;
401  using Teuchos::RCP;
402  const char tfecfFuncName[] = "MultiVector(map,d_view): ";
403 
404  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,d_view)");
405 
406  // Get stride of view: if second dimension is 0, the stride might
407  // be 0, so take view_dimension instead.
408  size_t stride[8];
409  d_view.stride (stride);
410  const size_t LDA = (d_view.dimension_1 () > 1) ? stride[1] :
411  d_view.dimension_0 ();
412  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
413  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
414  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::View's "
415  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
416  << ". This may also mean that the input view's first dimension (number "
417  "of rows = " << d_view.dimension_0 () << ") does not not match the "
418  "number of entries in the Map on the calling process.");
419 
420  // The difference between create_mirror and create_mirror_view, is
421  // that the latter copies to host. We don't necessarily want to
422  // do that; we just want to allocate the memory.
423  view_ = dual_view_type (d_view, Kokkos::create_mirror (d_view));
424  // The user gave us a device view. We take it as canonical, which
425  // means we mark it as "modified," so that the next sync will
426  // synchronize it with the host view.
427  this->template modify<device_type> ();
428  origView_ = view_;
429  }
430 
// Constructor from a Map, a (possibly offset) view, and the original
// view it was taken from.  origView is kept so offsetView can later
// recover rows outside the current view.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 432 -- is missing from this listing.)
431  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
433  MultiVector (const Teuchos::RCP<const map_type>& map,
434  const dual_view_type& view,
435  const dual_view_type& origView) :
436  base_type (map),
437  view_ (view),
438  origView_ (origView)
439  {
440  const char tfecfFuncName[] = "MultiVector(map,view,origView): ";
441 
442  // Get stride of view: if second dimension is 0, the
443  // stride might be 0, so take view_dimension instead.
444  size_t stride[8];
445  origView_.stride (stride);
446  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
447  origView_.dimension_0 ();
448  const size_t lclNumRows = this->getLocalLength (); // comes from the Map
449  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
450  LDA < lclNumRows, std::invalid_argument, "The input Kokkos::DualView's "
451  "column stride LDA = " << LDA << " < getLocalLength() = " << lclNumRows
452  << ". This may also mean that the input origView's first dimension (number "
453  "of rows = " << origView.dimension_0 () << ") does not not match the number "
454  "of entries in the Map on the calling process.");
455  }
456 
457 
// Constructor from a Map, a view, and a list of column indices
// (nonconstant-stride view of selected columns).  A one-entry
// whichVectors list degenerates to a constant-stride single-column
// view.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 459 -- is missing from this listing.)
458  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
460  MultiVector (const Teuchos::RCP<const map_type>& map,
461  const dual_view_type& view,
462  const Teuchos::ArrayView<const size_t>& whichVectors) :
463  base_type (map),
464  view_ (view),
465  origView_ (view),
466  whichVectors_ (whichVectors.begin (), whichVectors.end ())
467  {
468  using Kokkos::ALL;
469  using Kokkos::subview;
470  const char tfecfFuncName[] = "MultiVector(map,view,whichVectors): ";
471 
472  const size_t lclNumRows = map.is_null () ? size_t (0) :
473  map->getNodeNumElements ();
474  // Check dimensions of the input DualView. We accept that Kokkos
475  // might not allow construction of a 0 x m (Dual)View with m > 0,
476  // so we only require the number of rows to match if the
477  // (Dual)View has more than zero columns. Likewise, we only
478  // require the number of columns to match if the (Dual)View has
479  // more than zero rows.
480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
481  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_0 ()) < lclNumRows,
482  std::invalid_argument, "view.dimension_0() = " << view.dimension_0 ()
483  << " < map->getNodeNumElements() = " << lclNumRows << ".");
484  if (whichVectors.size () != 0) {
// NOTE(review): the condition below is tautologically false
// (x != 0 && x == 0 can never hold), so this check never fires.
// The intent was probably "view.dimension_1() == 0" alone (a
// zero-column view with a nonempty whichVectors list) -- confirm
// and fix.
485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
486  view.dimension_1 () != 0 && view.dimension_1 () == 0,
487  std::invalid_argument, "view.dimension_1() = 0, but whichVectors.size()"
488  " = " << whichVectors.size () << " > 0.");
489  size_t maxColInd = 0;
490  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
491  for (size_type k = 0; k < whichVectors.size (); ++k) {
492  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
493  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
494  std::invalid_argument, "whichVectors[" << k << "] = "
495  "Teuchos::OrdinalTraits<size_t>::invalid().");
496  maxColInd = std::max (maxColInd, whichVectors[k]);
497  }
498  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
499  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_1 ()) <= maxColInd,
500  std::invalid_argument, "view.dimension_1() = " << view.dimension_1 ()
501  << " <= max(whichVectors) = " << maxColInd << ".");
502  }
503 
504  // Get stride of view: if second dimension is 0, the
505  // stride might be 0, so take view_dimension instead.
506  size_t stride[8];
507  origView_.stride (stride);
508  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
509  origView_.dimension_0 ();
510  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
511  LDA < lclNumRows, std::invalid_argument,
512  "LDA = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
513 
514  if (whichVectors.size () == 1) {
515  // If whichVectors has only one entry, we don't need to bother
516  // with nonconstant stride. Just shift the view over so it
517  // points to the desired column.
518  //
519  // NOTE (mfh 10 May 2014) This is a special case where we set
520  // origView_ just to view that one column, not all of the
521  // original columns. This ensures that the use of origView_ in
522  // offsetView works correctly.
523  const std::pair<size_t, size_t> colRng (whichVectors[0],
524  whichVectors[0] + 1);
525  view_ = takeSubview (view_, ALL (), colRng);
526  origView_ = takeSubview (origView_, ALL (), colRng);
527  // whichVectors_.size() == 0 means "constant stride."
528  whichVectors_.clear ();
529  }
530  }
531 
// Constructor from a Map, a view, the original view it came from,
// and a list of column indices (nonconstant-stride case with an
// explicit origView for offsetView support).
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 533 -- is missing from this listing.)
532  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
534  MultiVector (const Teuchos::RCP<const map_type>& map,
535  const dual_view_type& view,
536  const dual_view_type& origView,
537  const Teuchos::ArrayView<const size_t>& whichVectors) :
538  base_type (map),
539  view_ (view),
540  origView_ (origView),
541  whichVectors_ (whichVectors.begin (), whichVectors.end ())
542  {
543  using Kokkos::ALL;
544  using Kokkos::subview;
545  const char tfecfFuncName[] = "MultiVector(map,view,origView,whichVectors): ";
546 
547  const size_t lclNumRows = this->getLocalLength ();
548  // Check dimensions of the input DualView. We accept that Kokkos
549  // might not allow construction of a 0 x m (Dual)View with m > 0,
550  // so we only require the number of rows to match if the
551  // (Dual)View has more than zero columns. Likewise, we only
552  // require the number of columns to match if the (Dual)View has
553  // more than zero rows.
554  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
555  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_0 ()) < lclNumRows,
556  std::invalid_argument, "view.dimension_0() = " << view.dimension_0 ()
557  << " < map->getNodeNumElements() = " << lclNumRows << ".");
558  if (whichVectors.size () != 0) {
// NOTE(review): the condition below is tautologically false
// (x != 0 && x == 0 can never hold), so this check never fires.
// The intent was probably "view.dimension_1() == 0" alone --
// confirm and fix.
559  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
560  view.dimension_1 () != 0 && view.dimension_1 () == 0,
561  std::invalid_argument, "view.dimension_1() = 0, but whichVectors.size()"
562  " = " << whichVectors.size () << " > 0.");
563  size_t maxColInd = 0;
564  typedef Teuchos::ArrayView<const size_t>::size_type size_type;
565  for (size_type k = 0; k < whichVectors.size (); ++k) {
566  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
567  whichVectors[k] == Teuchos::OrdinalTraits<size_t>::invalid (),
568  std::invalid_argument, "whichVectors[" << k << "] = "
569  "Teuchos::OrdinalTraits<size_t>::invalid().");
570  maxColInd = std::max (maxColInd, whichVectors[k]);
571  }
572  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
573  view.dimension_1 () != 0 && static_cast<size_t> (view.dimension_1 ()) <= maxColInd,
574  std::invalid_argument, "view.dimension_1() = " << view.dimension_1 ()
575  << " <= max(whichVectors) = " << maxColInd << ".");
576  }
577  // Get stride of view: if second dimension is 0, the
578  // stride might be 0, so take view_dimension instead.
579  size_t stride[8];
580  origView_.stride (stride);
581  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
582  origView_.dimension_0 ();
583  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
584  LDA < lclNumRows, std::invalid_argument, "Input DualView's column stride"
585  " = " << LDA << " < this->getLocalLength() = " << lclNumRows << ".");
586 
587  if (whichVectors.size () == 1) {
588  // If whichVectors has only one entry, we don't need to bother
589  // with nonconstant stride. Just shift the view over so it
590  // points to the desired column.
591  //
592  // NOTE (mfh 10 May 2014) This is a special case where we set
593  // origView_ just to view that one column, not all of the
594  // original columns. This ensures that the use of origView_ in
595  // offsetView works correctly.
596  const std::pair<size_t, size_t> colRng (whichVectors[0],
597  whichVectors[0] + 1);
598  view_ = takeSubview (view_, ALL (), colRng);
599  origView_ = takeSubview (origView_, ALL (), colRng);
600  // whichVectors_.size() == 0 means "constant stride."
601  whichVectors_.clear ();
602  }
603  }
604 
// Deep-copy constructor from host data in a Teuchos::ArrayView with
// column stride LDA.  Copies column by column when the input stride
// differs from the allocated view's stride.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 606 -- is missing from this listing.)
605  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
607  MultiVector (const Teuchos::RCP<const map_type>& map,
608  const Teuchos::ArrayView<const Scalar>& data,
609  const size_t LDA,
610  const size_t numVecs) :
611  base_type (map)
612  {
613  typedef LocalOrdinal LO;
614  typedef GlobalOrdinal GO;
615  const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs): ";
616  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView,LDA,numVecs)");
617 
618  // Deep copy constructor, constant stride (NO whichVectors_).
619  // There is no need for a deep copy constructor with nonconstant stride.
620 
621  const size_t lclNumRows =
622  map.is_null () ? size_t (0) : map->getNodeNumElements ();
623  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
624  (LDA < lclNumRows, std::invalid_argument, "LDA = " << LDA << " < "
625  "map->getNodeNumElements() = " << lclNumRows << ".");
626  if (numVecs != 0) {
// The last column only needs lclNumRows entries, not a full LDA.
627  const size_t minNumEntries = LDA * (numVecs - 1) + lclNumRows;
628  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
629  (static_cast<size_t> (data.size ()) < minNumEntries,
630  std::invalid_argument, "Input Teuchos::ArrayView does not have enough "
631  "entries, given the input Map and number of vectors in the MultiVector."
632  " data.size() = " << data.size () << " < (LDA*(numVecs-1)) + "
633  "map->getNodeNumElements () = " << minNumEntries << ".");
634  }
635 
636  this->view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
637  this->template modify<device_type> ();
638  auto X_out = this->template getLocalView<device_type> ();
639  origView_ = view_;
640 
641  // Make an unmanaged host Kokkos::View of the input data. First
642  // create a View (X_in_orig) with the original stride. Then,
643  // create a subview (X_in) with the right number of columns.
644  const impl_scalar_type* const X_in_raw =
645  reinterpret_cast<const impl_scalar_type*> (data.getRawPtr ());
646  Kokkos::View<const impl_scalar_type**,
647  Kokkos::LayoutLeft,
648  Kokkos::HostSpace,
649  Kokkos::MemoryUnmanaged> X_in_orig (X_in_raw, LDA, numVecs);
650  const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
651  auto X_in = Kokkos::subview (X_in_orig, rowRng, Kokkos::ALL ());
652 
653  // If LDA != X_out's column stride, then we need to copy one
654  // column at a time; Kokkos::deep_copy refuses to work in that
655  // case.
656  size_t outStrides[8];
657  X_out.stride (outStrides);
658  const size_t outStride = (X_out.dimension_1 () > 1) ? outStrides[1] :
659  X_out.dimension_0 ();
660  if (LDA == outStride) { // strides are the same; deep_copy once
661  // This only works because MultiVector uses LayoutLeft.
662  // We would need a custom copy functor otherwise.
663  Kokkos::deep_copy (X_out, X_in);
664  }
665  else { // strides differ; copy one column at a time
666  typedef decltype (Kokkos::subview (X_out, Kokkos::ALL (), 0))
667  out_col_view_type;
668  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0))
669  in_col_view_type;
670  for (size_t j = 0; j < numVecs; ++j) {
671  out_col_view_type X_out_j = Kokkos::subview (X_out, Kokkos::ALL (), j);
672  in_col_view_type X_in_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
673  Kokkos::deep_copy (X_out_j, X_in_j);
674  }
675  }
676  }
677 
// Deep-copy constructor from an array of per-column host arrays.
// Each ArrayOfPtrs[j] supplies column j; every column must have at
// least lclNumRows entries.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 679 -- is missing from this listing.)
678  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
680  MultiVector (const Teuchos::RCP<const map_type>& map,
681  const Teuchos::ArrayView<const Teuchos::ArrayView<const Scalar> >& ArrayOfPtrs,
682  const size_t numVecs) :
683  base_type (map)
684  {
685  typedef impl_scalar_type IST;
686  typedef LocalOrdinal LO;
687  typedef GlobalOrdinal GO;
688  const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,numVecs): ";
689  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV ctor (map,Teuchos::ArrayView of ArrayView,numVecs)");
690 
691  const size_t lclNumRows =
692  map.is_null () ? size_t (0) : map->getNodeNumElements ();
693  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
694  (numVecs < 1 || numVecs != static_cast<size_t> (ArrayOfPtrs.size ()),
695  std::runtime_error, "Either numVecs (= " << numVecs << ") < 1, or "
696  "ArrayOfPtrs.size() (= " << ArrayOfPtrs.size () << ") != numVecs.");
697  for (size_t j = 0; j < numVecs; ++j) {
698  Teuchos::ArrayView<const Scalar> X_j_av = ArrayOfPtrs[j];
699  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
700  static_cast<size_t> (X_j_av.size ()) < lclNumRows,
701  std::invalid_argument, "ArrayOfPtrs[" << j << "].size() = "
702  << X_j_av.size () << " < map->getNodeNumElements() = " << lclNumRows
703  << ".");
704  }
705 
706  view_ = allocDualView<Scalar, LO, GO, Node> (lclNumRows, numVecs);
707  this->template modify<device_type> ();
708  auto X_out = this->getLocalView<device_type> ();
709 
710  // Make sure that the type of a single input column has the same
711  // array layout as each output column does, so we can deep_copy.
712  typedef typename decltype (X_out)::array_layout array_layout;
713  typedef typename Kokkos::View<const IST*,
714  array_layout,
715  Kokkos::HostSpace,
716  Kokkos::MemoryUnmanaged> input_col_view_type;
717 
// Copy each input column into the corresponding column of the
// freshly allocated view.
718  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
719  for (size_t j = 0; j < numVecs; ++j) {
720  Teuchos::ArrayView<const IST> X_j_av =
721  Teuchos::av_reinterpret_cast<const IST> (ArrayOfPtrs[j]);
722  input_col_view_type X_j_in (X_j_av.getRawPtr (), lclNumRows);
723  auto X_j_out = Kokkos::subview (X_out, rowRng, j);
724  Kokkos::deep_copy (X_j_out, X_j_in);
725  }
726  origView_ = view_;
727  }
728 
729  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
732 
// Whether this MultiVector views its columns at constant stride.
// An empty whichVectors_ list means "constant stride."
// (NOTE(review): the signature lines -- original lines 734-735 --
// are missing from this listing.)
733  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
736  return whichVectors_.empty ();
737  }
738 
// Number of rows owned by the calling process, taken from the Map;
// zero if the Map is null (possible after replaceMap()).
// (NOTE(review): the signature lines -- original lines 741-742 --
// are missing from this listing.)
739  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
740  size_t
743  {
744  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
745  return static_cast<size_t> (0);
746  } else {
747  return this->getMap ()->getNodeNumElements ();
748  }
749  }
750 
// Global number of rows, taken from the Map; zero if the Map is
// null (possible after replaceMap()).
// (NOTE(review): the signature lines -- original lines 752-754 --
// are missing from this listing.)
751  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
755  {
756  if (this->getMap ().is_null ()) { // possible, due to replaceMap().
757  return static_cast<size_t> (0);
758  } else {
759  return this->getMap ()->getGlobalNumElements ();
760  }
761  }
762 
// Column stride (LDA) of the original view, if this MultiVector has
// constant stride; zero otherwise.
// (NOTE(review): the "MultiVector<...>::" qualifier line -- original
// line 765 -- is missing from this listing.)
763  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
764  size_t
766  getStride () const
767  {
768  if (isConstantStride ()) {
769  // Get stride of view: if second dimension is 0, the
770  // stride might be 0, so take view_dimension instead.
771  size_t stride[8];
772  origView_.stride (stride);
773  const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] :
774  origView_.dimension_0 ();
775  return LDA;
776  }
777  else {
778  return static_cast<size_t> (0);
779  }
780  }
781 
// DistObject hook: whether sourceObj is compatible as an
// Import/Export source.  Returns false if it is not a MultiVector
// of the same type; otherwise requires matching column counts.
// (NOTE(review): original line 790 is missing from this listing;
// presumably it declares this_type, e.g.
// "typedef MultiVector<...> this_type;".)
782  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
783  bool
785  checkSizes (const SrcDistObject& sourceObj)
786  {
787  // Check whether the source object is a MultiVector. If not, then
788  // we can't even compare sizes, so it's definitely not OK to
789  // Import or Export from it.
791  const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
792  if (src == NULL) {
793  return false;
794  } else {
795  // The target of the Import or Export calls checkSizes() in
796  // DistObject::doTransfer(). By that point, we've already
797  // constructed an Import or Export object using the two
798  // multivectors' Maps, which means that (hopefully) we've
799  // already checked other attributes of the multivectors. Thus,
800  // all we need to do here is check the number of columns.
801  return src->getNumVectors () == this->getNumVectors ();
802  }
803  }
804 
805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
806  size_t
// Number of "packets" (entries) sent or received per local row is
// constant for a MultiVector: one entry per column.  DistObject uses
// this to size communication buffers without a per-LID count array.
809  return this->getNumVectors ();
810  }
811 
812  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
// \brief Local phase of an Import or Export: copy the leading
//   numSameIDs rows directly, then permute the remaining rows given
//   by the two LID lists.  Called on the target by
//   DistObject::doTransfer() after checkSizes() has passed.
//
// \param sourceObj [in] Source of the transfer; checkSizes() already
//   verified it is a MultiVector with the same number of columns, so
//   the dynamic_cast below must succeed.
// \param numSameIDs [in] Number of leading rows whose local index is
//   the same in source and target; these are copied column by column.
// \param permuteToLIDs [in] Target local row indices of the permute.
// \param permuteFromLIDs [in] Source local row indices of the permute;
//   must have the same length as permuteToLIDs.
//
// Whether we work on host or device is dictated by the SOURCE: this
// method may not modify (hence may not sync) sourceObj, so it must use
// whichever side of sourceObj's DualView was most recently updated.
813  void
815  copyAndPermuteNew (const SrcDistObject& sourceObj,
816  const size_t numSameIDs,
817  const Kokkos::DualView<const LocalOrdinal*, device_type>& permuteToLIDs,
818  const Kokkos::DualView<const LocalOrdinal*, device_type>& permuteFromLIDs)
819  {
821  using ::Tpetra::Details::ProfilingRegion;
822  using KokkosRefactor::Details::permute_array_multi_column;
823  using KokkosRefactor::Details::permute_array_multi_column_variable_stride;
824  using Kokkos::Compat::create_const_view;
825  typedef typename dual_view_type::t_dev::memory_space DMS;
826  typedef typename dual_view_type::t_host::memory_space HMS;
828  const char tfecfFuncName[] = "copyAndPermuteNew: ";
829  ProfilingRegion regionCAP ("Tpetra::MultiVector::copyAndPermute");
830 
831  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
832  (permuteToLIDs.dimension_0 () != permuteFromLIDs.dimension_0 (),
833  std::runtime_error, "permuteToLIDs.dimension_0() = "
834  << permuteToLIDs.dimension_0 () << " != permuteFromLIDs.dimension_0() = "
835  << permuteFromLIDs.dimension_0 () << ".");
836 
837  // We've already called checkSizes(), so this cast must succeed.
838  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
839  const size_t numCols = this->getNumVectors ();
840 
841  // The input sourceObj controls whether we copy on host or device.
842  // This is because this method is not allowed to modify sourceObj,
843  // so it may not sync sourceObj; it must use the most recently
844  // updated version (host or device) of sourceObj's data.
845  //
846  // If we need sync to device, then host has the most recent version.
847  const bool copyOnHost = sourceMV.template need_sync<device_type> ();
848 
// Unlike the source, *this* may (and must) be synced and marked
// modified on whichever side we are about to write.
849  if (copyOnHost) {
850  this->template sync<Kokkos::HostSpace> ();
851  this->template modify<Kokkos::HostSpace> ();
852  }
853  else {
854  this->template sync<device_type> ();
855  this->template modify<device_type> ();
856  }
857 
858  // TODO (mfh 15 Sep 2013) When we replace
859  // KokkosClassic::MultiVector with a Kokkos::View, there are two
860  // ways to copy the data:
861  //
862  // 1. Get a (sub)view of each column and call deep_copy on that.
863  // 2. Write a custom kernel to copy the data.
864  //
865  // The first is easier, but the second might be more performant in
866  // case we decide to use layouts other than LayoutLeft.  It might
867  // even make sense to hide whichVectors_ in an entirely new layout
868  // for Kokkos Views.
869 
870  // Copy rows [0, numSameIDs-1] of the local multivectors.
871  //
872  // Note (ETP 2 Jul 2014) We need to always copy one column at a
873  // time, even when both multivectors are constant-stride, since
874  // deep_copy between strided subviews with more than one column
875  // doesn't currently work.
876 
877  if (numSameIDs > 0) {
878  const std::pair<size_t, size_t> rows (0, numSameIDs);
879  if (copyOnHost) {
880  auto tgt_h = this->template getLocalView<HMS> ();
881  auto src_h = create_const_view (sourceMV.template getLocalView<HMS> ());
882 
883  for (size_t j = 0; j < numCols; ++j) {
// Map logical column j to the physical column in each view; the
// two MultiVectors may independently be noncontiguous column views.
884  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
885  const size_t srcCol =
886  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
887 
888  auto tgt_j = Kokkos::subview (tgt_h, rows, tgtCol);
889  auto src_j = Kokkos::subview (src_h, rows, srcCol);
890  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
891  }
892  }
893  else { // copy on device
894  auto tgt_d = this->template getLocalView<DMS> ();
895  auto src_d = create_const_view (sourceMV.template getLocalView<DMS> ());
896 
897  for (size_t j = 0; j < numCols; ++j) {
898  const size_t tgtCol = isConstantStride () ? j : whichVectors_[j];
899  const size_t srcCol =
900  sourceMV.isConstantStride () ? j : sourceMV.whichVectors_[j];
901 
902  auto tgt_j = Kokkos::subview (tgt_d, rows, tgtCol);
903  auto src_j = Kokkos::subview (src_d, rows, srcCol);
904  Kokkos::deep_copy (tgt_j, src_j); // Copy src_j into tgt_j
905  }
906  }
907  }
908 
909  // For the remaining GIDs, execute the permutations.  This may
910  // involve noncontiguous access of both source and destination
911  // vectors, depending on the LID lists.
912  //
913  // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
914  // the same process, this merges their values by replacement of
915  // the last encountered GID, not by the specified merge rule
916  // (such as ADD).
917 
918  // If there are no permutations, we are done
919  if (permuteFromLIDs.dimension_0 () == 0 ||
920  permuteToLIDs.dimension_0 () == 0) {
921  return;
922  }
923 
924  // This gets around const-ness of the DualView input.  In
925  // particular, it gives us freedom to sync them where we need
926  // them.
927  Kokkos::DualView<const LocalOrdinal*, device_type> permuteToLIDs_nc =
928  permuteToLIDs;
929  Kokkos::DualView<const LocalOrdinal*, device_type> permuteFromLIDs_nc =
930  permuteFromLIDs;
931 
932  // We could in theory optimize for the case where exactly one of
933  // them is constant stride, but we don't currently do that.
934  const bool nonConstStride =
935  ! this->isConstantStride () || ! sourceMV.isConstantStride ();
936 
937  // We only need the "which vectors" arrays if either the source or
938  // target MV is not constant stride.  Since we only have one
939  // kernel that must do double-duty, we have to create a "which
940  // vectors" array for the MV that _is_ constant stride.
941  Kokkos::DualView<const size_t*, device_type> srcWhichVecs;
942  Kokkos::DualView<const size_t*, device_type> tgtWhichVecs;
943  if (nonConstStride) {
944  if (this->whichVectors_.size () == 0) {
// Target is constant stride: build the identity column map [0..numCols).
945  Kokkos::DualView<size_t*, device_type> tmpTgt ("tgtWhichVecs", numCols);
946  tmpTgt.template modify<HMS> ();
947  for (size_t j = 0; j < numCols; ++j) {
948  tmpTgt.h_view(j) = j;
949  }
950  if (! copyOnHost) {
951  tmpTgt.template sync<DMS> ();
952  }
953  tgtWhichVecs = tmpTgt;
954  }
955  else {
956  Teuchos::ArrayView<const size_t> tgtWhichVecsT = this->whichVectors_ ();
957  tgtWhichVecs =
958  getDualViewCopyFromArrayView<size_t, device_type> (tgtWhichVecsT,
959  "tgtWhichVecs",
960  copyOnHost);
961  }
962  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
963  (static_cast<size_t> (tgtWhichVecs.dimension_0 ()) !=
964  this->getNumVectors (),
965  std::logic_error, "tgtWhichVecs.dimension_0() = " <<
966  tgtWhichVecs.dimension_0 () << " != this->getNumVectors() = " <<
967  this->getNumVectors () << ".");
968 
969  if (sourceMV.whichVectors_.size () == 0) {
// Source is constant stride: same identity column map as above.
970  Kokkos::DualView<size_t*, device_type> tmpSrc ("srcWhichVecs", numCols);
971  tmpSrc.template modify<HMS> ();
972  for (size_t j = 0; j < numCols; ++j) {
973  tmpSrc.h_view(j) = j;
974  }
975  if (! copyOnHost) {
976  tmpSrc.template sync<DMS> ();
977  }
978  srcWhichVecs = tmpSrc;
979  }
980  else {
981  Teuchos::ArrayView<const size_t> srcWhichVecsT =
982  sourceMV.whichVectors_ ();
983  srcWhichVecs =
984  getDualViewCopyFromArrayView<size_t, device_type> (srcWhichVecsT,
985  "srcWhichVecs",
986  copyOnHost);
987  }
988  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
989  (static_cast<size_t> (srcWhichVecs.dimension_0 ()) !=
990  sourceMV.getNumVectors (), std::logic_error,
991  "srcWhichVecs.dimension_0() = " << srcWhichVecs.dimension_0 ()
992  << " != sourceMV.getNumVectors() = " << sourceMV.getNumVectors ()
993  << ".");
994  }
995 
996  if (copyOnHost) {
997  auto tgt_h = this->template getLocalView<HMS> ();
998  auto src_h = create_const_view (sourceMV.template getLocalView<HMS> ());
999  permuteToLIDs_nc.template sync<HMS> ();
1000  auto permuteToLIDs_h =
1001  create_const_view (permuteToLIDs_nc.template view<HMS> ());
1002  permuteFromLIDs_nc.template sync<HMS> ();
1003  auto permuteFromLIDs_h =
1004  create_const_view (permuteFromLIDs_nc.template view<HMS> ());
1005 
1006  if (nonConstStride) {
1007  // No need to sync first, because copyOnHost argument to
1008  // getDualViewCopyFromArrayView puts them in the right place.
1009  auto tgtWhichVecs_h =
1010  create_const_view (tgtWhichVecs.template view<HMS> ());
1011  auto srcWhichVecs_h =
1012  create_const_view (srcWhichVecs.template view<HMS> ());
1013  permute_array_multi_column_variable_stride (tgt_h, src_h,
1014  permuteToLIDs_h,
1015  permuteFromLIDs_h,
1016  tgtWhichVecs_h,
1017  srcWhichVecs_h, numCols);
1018  }
1019  else {
1020  permute_array_multi_column (tgt_h, src_h, permuteToLIDs_h,
1021  permuteFromLIDs_h, numCols);
1022  }
1023  }
1024  else { // copy on device
1025  auto tgt_d = this->template getLocalView<DMS> ();
1026  auto src_d = create_const_view (sourceMV.template getLocalView<DMS> ());
1027  permuteToLIDs_nc.template sync<DMS> ();
1028  auto permuteToLIDs_d =
1029  create_const_view (permuteToLIDs_nc.template view<DMS> ());
1030  permuteFromLIDs_nc.template sync<DMS> ();
1031  auto permuteFromLIDs_d =
1032  create_const_view (permuteFromLIDs_nc.template view<DMS> ());
1033 
1034  if (nonConstStride) {
1035  // No need to sync first, because copyOnHost argument to
1036  // getDualViewCopyFromArrayView puts them in the right place.
1037  auto tgtWhichVecs_d =
1038  create_const_view (tgtWhichVecs.template view<DMS> ());
1039  auto srcWhichVecs_d =
1040  create_const_view (srcWhichVecs.template view<DMS> ());
1041  permute_array_multi_column_variable_stride (tgt_d, src_d,
1042  permuteToLIDs_d,
1043  permuteFromLIDs_d,
1044  tgtWhichVecs_d,
1045  srcWhichVecs_d, numCols);
1046  }
1047  else {
1048  permute_array_multi_column (tgt_d, src_d, permuteToLIDs_d,
1049  permuteFromLIDs_d, numCols);
1050  }
1051  }
1052  }
1053 
1054  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
// \brief Pack the source MultiVector's data for an Import or Export:
//   for each entry of exportLIDs, all columns of that row are packed
//   contiguously into 'exports'.
//
// \param sourceObj [in] Source MultiVector; checkSizes() has already
//   verified the cast below will succeed.
// \param exportLIDs [in] Local row indices to pack.  Where this was
//   last modified (host vs device) decides where we pack.
// \param exports [in/out] Pack buffer; reallocated here if its size
//   does not match numCols * numExportLIDs.
// \param constantNumPackets [out] Set to the number of columns, which
//   signals to DistObject that every LID packs the same count and
//   that numExportPacketsPerLID need not be filled.
1055  void
1057  packAndPrepareNew (const SrcDistObject& sourceObj,
1058  const Kokkos::DualView<const local_ordinal_type*, device_type> &exportLIDs,
1059  Kokkos::DualView<impl_scalar_type*, device_type>& exports,
1060  const Kokkos::DualView<size_t*, device_type>& /* numExportPacketsPerLID */,
1061  size_t& constantNumPackets,
1062  Distributor & /* distor */ )
1063  {
1064  using ::Tpetra::Details::ProfilingRegion;
1065  using Kokkos::Compat::create_const_view;
1066  using Kokkos::Compat::getKokkosViewDeepCopy;
1068  typedef impl_scalar_type IST;
1069  typedef typename Kokkos::DualView<IST*, device_type>::t_host::memory_space
1070  host_memory_space;
1071  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::memory_space
1072  dev_memory_space;
1073  typedef typename Kokkos::DualView<IST*, device_type>::t_host::execution_space
1074  host_execution_space;
1075  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::execution_space
1076  dev_execution_space;
1077  ProfilingRegion regionPAP ("Tpetra::MultiVector::packAndPrepare");
1078 
1079  // TODO (mfh 09 Sep 2016): The pack and unpack functions now have
1080  // the option to check indices.  We do so in a debug build.  At
1081  // some point, it would make sense to shift this to a run-time
1082  // option, controlled by environment variable.
1083 #ifdef HAVE_TPETRA_DEBUG
1084  constexpr bool debugCheckIndices = true;
1085 #else
1086  constexpr bool debugCheckIndices = false;
1087 #endif // HAVE_TPETRA_DEBUG
1088 
1089  const bool printDebugOutput = false;
1090  if (printDebugOutput) {
1091  std::cerr << "$$$ MV::packAndPrepareNew" << std::endl;
1092  }
1093  // We've already called checkSizes(), so this cast must succeed.
1094  const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
1095 
1096  // mfh 12 Apr 2016: packAndPrepareNew decides where to pack based
1097  // on the memory space in which exportLIDs was last modified.
1098  // (DistObject::doTransferNew decides this currently.)
1099  //
1100  // We unfortunately can't change the source object sourceMV.
1101  // Thus, we can't sync it to the memory space where we want to
1102  // pack communication buffers.  As a result, for example, if
1103  // exportLIDs wants us to pack on host, but sourceMV was last
1104  // modified on device, we have to allocate a temporary host
1105  // version and copy back to host before we can pack.  Similarly,
1106  // if exportLIDs wants us to pack on device, but sourceMV was last
1107  // modified on host, we have to allocate a temporary device
1108  // version and copy back to device before we can pack.
1109  const bool packOnHost =
1110  exportLIDs.modified_host () > exportLIDs.modified_device ();
1111  auto src_dev = sourceMV.template getLocalView<dev_memory_space> ();
1112  auto src_host = sourceMV.template getLocalView<host_memory_space> ();
1113  if (packOnHost) {
1114  if (sourceMV.template need_sync<Kokkos::HostSpace> ()) {
1115  // sourceMV was most recently updated on device; copy to host.
1116  // Allocate a new host mirror.  We'll use it for packing below.
1117  src_host = decltype (src_host) ("MV::DualView::h_view",
1118  src_dev.dimension_0 (),
1119  src_dev.dimension_1 ());
1120  Kokkos::deep_copy (src_host, src_dev);
1121  }
1122  }
1123  else { // pack on device
1124  if (sourceMV.template need_sync<device_type> ()) {
1125  // sourceMV was most recently updated on host; copy to device.
1126  // Allocate a new "device mirror."  We'll use it for packing below.
1127  src_dev = decltype (src_dev) ("MV::DualView::d_view",
1128  src_host.dimension_0 (),
1129  src_host.dimension_1 ());
1130  Kokkos::deep_copy (src_dev, src_host);
1131  }
1132  }
1133 
1134  const size_t numCols = sourceMV.getNumVectors ();
1135 
1136  // This spares us from needing to fill numExportPacketsPerLID.
1137  // Setting constantNumPackets to a nonzero value signals that
1138  // all packets have the same number of entries.
1139  constantNumPackets = numCols;
1140 
1141  // If we have no exports, there is nothing to do.  Make sure this
1142  // goes _after_ setting constantNumPackets correctly.
1143  if (exportLIDs.dimension_0 () == 0) {
1144  if (printDebugOutput) {
1145  std::cerr << "$$$ MV::packAndPrepareNew DONE" << std::endl;
1146  }
1147  return;
1148  }
1149 
1150  /* The layout in the export for MultiVectors is as follows:
1151  exports = { all of the data from row exportLIDs.front() ;
1152  ....
1153  all of the data from row exportLIDs.back() }
1154  This doesn't have the best locality, but is necessary because
1155  the data for a Packet (all data associated with an LID) is
1156  required to be contiguous. */
1157 
1158  // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
1159  // packing scheme in the above comment?  The data going to a
1160  // particular process must be contiguous, of course, but those
1161  // data could include entries from multiple LIDs.  DistObject just
1162  // needs to know how to index into that data.  Kokkos is good at
1163  // decoupling storage intent from data layout choice.
1164 
1165  if (printDebugOutput) {
1166  std::cerr << "$$$ MV::packAndPrepareNew realloc" << std::endl;
1167  }
1168 
1169  const size_t numExportLIDs = exportLIDs.dimension_0 ();
1170  const size_t newExportsSize = numCols * numExportLIDs;
1171  if (static_cast<size_t> (exports.dimension_0 ()) != newExportsSize) {
1172  if (printDebugOutput) {
1173  std::ostringstream os;
1174  const int myRank = this->getMap ()->getComm ()->getRank ();
1175  os << "$$$ MV::packAndPrepareNew (Proc " << myRank << ") realloc "
1176  "exports from " << exports.dimension_0 () << " to " << newExportsSize
1177  << std::endl;
1178  std::cerr << os.str ();
1179  }
1180  // Resize 'exports' to fit.
// NOTE(review): fences bracket the reallocation, presumably so no
// asynchronous kernel still uses the old buffer -- confirm intent.
1181  execution_space::fence ();
1182  exports = Kokkos::DualView<impl_scalar_type*, device_type> ("exports", newExportsSize);
1183  execution_space::fence ();
1184  }
1185 
1186  // Mark 'exports' here, since we might resize it above.  Resizing
1187  // currently requires calling the constructor, which clears out
1188  // the 'modified' flags.
1189  if (packOnHost) {
1190  exports.template modify<host_memory_space> ();
1191  }
1192  else {
1193  exports.template modify<dev_memory_space> ();
1194  }
1195 
1196  if (numCols == 1) { // special case for one column only
1197  // MultiVector always represents a single column with constant
1198  // stride, but it doesn't hurt to implement both cases anyway.
1199  //
1200  // ETP:  I'm not sure I agree with the above statement.  Can't a single-
1201  // column multivector be a subview of another multi-vector, in which case
1202  // sourceMV.whichVectors_[0] != 0 ?  I think we have to handle that case
1203  // separately.
1204  //
1205  // mfh 18 Jan 2016: In answer to ETP's comment above:
1206  // MultiVector treats single-column MultiVectors created using a
1207  // "nonconstant stride constructor" as a special case, and makes
1208  // them constant stride (by making whichVectors_ have length 0).
1209  if (sourceMV.isConstantStride ()) {
1210  using KokkosRefactor::Details::pack_array_single_column;
1211  if (printDebugOutput) {
1212  std::cerr << "$$$ MV::packAndPrepareNew pack numCols=1 const stride" << std::endl;
1213  }
1214  if (packOnHost) {
1215  pack_array_single_column (exports.template view<host_memory_space> (),
1216  create_const_view (src_host),
1217  exportLIDs.template view<host_memory_space> (),
1218  0,
1219  debugCheckIndices);
1220  }
1221  else { // pack on device
1222  pack_array_single_column (exports.template view<dev_memory_space> (),
1223  create_const_view (src_dev),
1224  exportLIDs.template view<dev_memory_space> (),
1225  0,
1226  debugCheckIndices);
1227  }
1228  }
1229  else {
1230  using KokkosRefactor::Details::pack_array_single_column;
1231  if (printDebugOutput) {
1232  std::cerr << "$$$ MV::packAndPrepareNew pack numCols=1 nonconst stride" << std::endl;
1233  }
1234  if (packOnHost) {
1235  pack_array_single_column (exports.template view<host_memory_space> (),
1236  create_const_view (src_host),
1237  exportLIDs.template view<host_memory_space> (),
1238  sourceMV.whichVectors_[0],
1239  debugCheckIndices);
1240  }
1241  else { // pack on device
1242  pack_array_single_column (exports.template view<dev_memory_space> (),
1243  create_const_view (src_dev),
1244  exportLIDs.template view<dev_memory_space> (),
1245  sourceMV.whichVectors_[0],
1246  debugCheckIndices);
1247  }
1248  }
1249  }
1250  else { // the source MultiVector has multiple columns
1251  if (sourceMV.isConstantStride ()) {
1252  using KokkosRefactor::Details::pack_array_multi_column;
1253  if (printDebugOutput) {
1254  std::cerr << "$$$ MV::packAndPrepareNew pack numCols>1 const stride" << std::endl;
1255  }
1256  if (packOnHost) {
1257  pack_array_multi_column (exports.template view<host_memory_space> (),
1258  create_const_view (src_host),
1259  exportLIDs.template view<host_memory_space> (),
1260  numCols,
1261  debugCheckIndices);
1262  }
1263  else { // pack on device
1264  pack_array_multi_column (exports.template view<dev_memory_space> (),
1265  create_const_view (src_dev),
1266  exportLIDs.template view<dev_memory_space> (),
1267  numCols,
1268  debugCheckIndices);
1269  }
1270  }
1271  else {
1272  using KokkosRefactor::Details::pack_array_multi_column_variable_stride;
1273  if (printDebugOutput) {
1274  std::cerr << "$$$ MV::packAndPrepareNew pack numCols>1 nonconst stride" << std::endl;
1275  }
1276  if (packOnHost) {
1277  pack_array_multi_column_variable_stride (exports.template view<host_memory_space> (),
1278  create_const_view (src_host),
1279  exportLIDs.template view<host_memory_space> (),
1280  getKokkosViewDeepCopy<host_execution_space> (sourceMV.whichVectors_ ()),
1281  numCols,
1282  debugCheckIndices);
1283  }
1284  else { // pack on device
1285  pack_array_multi_column_variable_stride (exports.template view<dev_memory_space> (),
1286  create_const_view (src_dev),
1287  exportLIDs.template view<dev_memory_space> (),
1288  getKokkosViewDeepCopy<dev_execution_space> (sourceMV.whichVectors_ ()),
1289  numCols,
1290  debugCheckIndices);
1291  }
1292  }
1293  }
1294 
1295  if (printDebugOutput) {
1296  std::cerr << "$$$ MV::packAndPrepareNew DONE" << std::endl;
1297  }
1298  }
1299 
1300 
1301  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
// \brief Unpack the 'imports' buffer from an Import or Export and
//   combine its entries into *this, row by row, using the given
//   CombineMode.
//
// \param importLIDs [in] Local row indices to receive data; must have
//   been last modified in the same memory space as 'imports'.
// \param imports [in] Receive buffer: for each entry of importLIDs,
//   all columns of that row, stored contiguously.
// \param constantNumPackets [in] Number of entries per LID; must equal
//   the number of columns (checked in a debug build).
// \param CM [in] How to combine incoming values with existing ones:
//   INSERT/REPLACE overwrite, ADD sums, ABSMAX takes the larger
//   absolute value.  Any other mode throws std::invalid_argument.
//
// Where we unpack (host vs device) follows where 'imports' was last
// modified; *this is synced and marked modified there, because a
// non-overwriting CombineMode reads as well as writes.
1302  void
1304  unpackAndCombineNew (const Kokkos::DualView<const local_ordinal_type*, device_type>& importLIDs,
1305  const Kokkos::DualView<const impl_scalar_type*, device_type>& imports,
1306  const Kokkos::DualView<const size_t*, device_type>& /* numPacketsPerLID */,
1307  const size_t constantNumPackets,
1308  Distributor& /* distor */,
1309  const CombineMode CM)
1310  {
1311  using ::Tpetra::Details::ProfilingRegion;
1312  using KokkosRefactor::Details::unpack_array_multi_column;
1313  using KokkosRefactor::Details::unpack_array_multi_column_variable_stride;
1314  using Kokkos::Compat::getKokkosViewDeepCopy;
1315  typedef impl_scalar_type IST;
1316  typedef typename Kokkos::DualView<IST*, device_type>::t_host::memory_space
1317  host_memory_space;
1318  typedef typename Kokkos::DualView<IST*, device_type>::t_dev::memory_space
1319  dev_memory_space;
1320  const char tfecfFuncName[] = "unpackAndCombineNew: ";
1321  const char suffix[] = " Please report this bug to the Tpetra developers.";
1322  ProfilingRegion regionUAC ("Tpetra::MultiVector::unpackAndCombine");
1323 
1324  // TODO (mfh 09 Sep 2016): The pack and unpack functions now have
1325  // the option to check indices.  We do so in a debug build.  At
1326  // some point, it would make sense to shift this to a run-time
1327  // option, controlled by environment variable.
1328 #ifdef HAVE_TPETRA_DEBUG
1329  constexpr bool debugCheckIndices = true;
1330 #else
1331  constexpr bool debugCheckIndices = false;
1332 #endif // HAVE_TPETRA_DEBUG
1333 
1334  // If we have no imports, there is nothing to do
1335  if (importLIDs.dimension_0 () == 0) {
1336  return;
1337  }
1338 
1339  const size_t numVecs = getNumVectors ();
1340 #ifdef HAVE_TPETRA_DEBUG
1341  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1342  static_cast<size_t> (imports.dimension_0 ()) !=
1343  numVecs * importLIDs.dimension_0 (),
1344  std::runtime_error,
1345  "imports.dimension_0() = " << imports.dimension_0 ()
1346  << " != getNumVectors() * importLIDs.dimension_0() = " << numVecs
1347  << " * " << importLIDs.dimension_0 () << " = "
1348  << numVecs * importLIDs.dimension_0 () << ".");
1349 
1350  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1351  (constantNumPackets == static_cast<size_t> (0), std::runtime_error,
1352  "constantNumPackets input argument must be nonzero.");
1353 
1354  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1355  (static_cast<size_t> (numVecs) !=
1356  static_cast<size_t> (constantNumPackets),
1357  std::runtime_error, "constantNumPackets must equal numVecs.");
1358 #endif // HAVE_TPETRA_DEBUG
1359 
1360  // mfh 12 Apr 2016: Decide where to unpack based on the memory
1361  // space in which the imports buffer was last modified.
1362  // DistObject::doTransferNew gets to decide this.  We currently
1363  // require importLIDs to match (its most recent version must be in
1364  // the same memory space as imports' most recent version).
1365  const bool unpackOnHost =
1366  imports.modified_host () > imports.modified_device ();
1367  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1368  (unpackOnHost && importLIDs.modified_host () < importLIDs.modified_device (),
1369  std::logic_error, "The 'imports' buffer was last modified on host, "
1370  "but importLIDs was last modified on device." << suffix);
1371  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1372  (! unpackOnHost && importLIDs.modified_host () > importLIDs.modified_device (),
1373  std::logic_error, "The 'imports' buffer was last modified on device, "
1374  "but importLIDs was last modified on host." << suffix);
1375 
1376  // We have to sync before modifying, because this method may read
1377  // as well as write (depending on the CombineMode).  This matters
1378  // because copyAndPermute may have modified *this in the other
1379  // memory space.
1380  if (unpackOnHost) {
1381  this->template sync<host_memory_space> ();
1382  this->template modify<host_memory_space> ();
1383  }
1384  else { // unpack on device
1385  this->template sync<dev_memory_space> ();
1386  this->template modify<dev_memory_space> ();
1387  }
// Get both sides of each DualView up front; only the side matching
// unpackOnHost is actually used below.
1388  auto X_d = this->template getLocalView<dev_memory_space> ();
1389  auto X_h = this->template getLocalView<host_memory_space> ();
1390  auto imports_d = imports.template view<dev_memory_space> ();
1391  auto imports_h = imports.template view<host_memory_space> ();
1392  auto importLIDs_d = importLIDs.template view<dev_memory_space> ();
1393  auto importLIDs_h = importLIDs.template view<host_memory_space> ();
1394 
// If *this is a noncontiguous column view, copy whichVectors_ into a
// DualView on the side where we will unpack.
1395  Kokkos::DualView<size_t*, device_type> whichVecs;
1396  if (! isConstantStride ()) {
1397  Kokkos::View<const size_t*, host_memory_space,
1398  Kokkos::MemoryUnmanaged> whichVecsIn (whichVectors_.getRawPtr (),
1399  numVecs);
1400  whichVecs = Kokkos::DualView<size_t*, device_type> ("whichVecs", numVecs);
1401  if (unpackOnHost) {
1402  whichVecs.template modify<host_memory_space> ();
1403  Kokkos::deep_copy (whichVecs.template view<host_memory_space> (), whichVecsIn);
1404  }
1405  else {
1406  whichVecs.template modify<dev_memory_space> ();
1407  Kokkos::deep_copy (whichVecs.template view<dev_memory_space> (), whichVecsIn);
1408  }
1409  }
1410  auto whichVecs_d = whichVecs.template view<dev_memory_space> ();
1411  auto whichVecs_h = whichVecs.template view<host_memory_space> ();
1412 
1413  /* The layout in the export for MultiVectors is as follows:
1414  imports = { all of the data from row exportLIDs.front() ;
1415  ....
1416  all of the data from row exportLIDs.back() }
1417  This doesn't have the best locality, but is necessary because
1418  the data for a Packet (all data associated with an LID) is
1419  required to be contiguous. */
1420 
1421  if (numVecs > 0 && importLIDs.dimension_0 () > 0) {
1422  // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
1423  // custom combine modes, start editing here.  Also, if you trust
1424  // inlining, it would be nice to condense this code by using a
1425  // binary function object f in the pack functors.
1426  if (CM == INSERT || CM == REPLACE) {
1427  KokkosRefactor::Details::InsertOp<execution_space> op;
1428 
1429  if (isConstantStride ()) {
1430  if (unpackOnHost) {
1431  unpack_array_multi_column (X_h, imports_h, importLIDs_h, op,
1432  numVecs, debugCheckIndices);
1433 
1434  }
1435  else { // unpack on device
1436  unpack_array_multi_column (X_d, imports_d, importLIDs_d, op,
1437  numVecs, debugCheckIndices);
1438  }
1439  }
1440  else { // not constant stride
1441  if (unpackOnHost) {
1442  unpack_array_multi_column_variable_stride (X_h, imports_h,
1443  importLIDs_h,
1444  whichVecs_h, op,
1445  numVecs,
1446  debugCheckIndices);
1447  }
1448  else { // unpack on device
1449  unpack_array_multi_column_variable_stride (X_d, imports_d,
1450  importLIDs_d,
1451  whichVecs_d, op,
1452  numVecs,
1453  debugCheckIndices);
1454  }
1455  }
1456  }
1457  else if (CM == ADD) {
1458  KokkosRefactor::Details::AddOp<execution_space> op;
1459 
1460  if (isConstantStride ()) {
1461  if (unpackOnHost) {
1462  unpack_array_multi_column (X_h, imports_h, importLIDs_h, op,
1463  numVecs, debugCheckIndices);
1464  }
1465  else { // unpack on device
1466  unpack_array_multi_column (X_d, imports_d, importLIDs_d, op,
1467  numVecs, debugCheckIndices);
1468  }
1469  }
1470  else { // not constant stride
1471  if (unpackOnHost) {
1472  unpack_array_multi_column_variable_stride (X_h, imports_h,
1473  importLIDs_h,
1474  whichVecs_h, op,
1475  numVecs,
1476  debugCheckIndices);
1477  }
1478  else { // unpack on device
1479  unpack_array_multi_column_variable_stride (X_d, imports_d,
1480  importLIDs_d,
1481  whichVecs_d, op,
1482  numVecs,
1483  debugCheckIndices);
1484  }
1485  }
1486  }
1487  else if (CM == ABSMAX) {
1488  KokkosRefactor::Details::AbsMaxOp<execution_space> op;
1489 
1490  if (isConstantStride ()) {
1491  if (unpackOnHost) {
1492  unpack_array_multi_column (X_h, imports_h, importLIDs_h, op,
1493  numVecs, debugCheckIndices);
1494  }
1495  else { // unpack on device
1496  unpack_array_multi_column (X_d, imports_d, importLIDs_d, op,
1497  numVecs, debugCheckIndices);
1498  }
1499  }
1500  else {
1501  if (unpackOnHost) {
1502  unpack_array_multi_column_variable_stride (X_h, imports_h,
1503  importLIDs_h,
1504  whichVecs_h, op,
1505  numVecs,
1506  debugCheckIndices);
1507  }
1508  else { // unpack on device
1509  unpack_array_multi_column_variable_stride (X_d, imports_d,
1510  importLIDs_d,
1511  whichVecs_d, op,
1512  numVecs,
1513  debugCheckIndices);
1514  }
1515  }
1516  }
1517  else {
1518  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1519  (CM != ADD && CM != REPLACE && CM != INSERT && CM != ABSMAX,
1520  std::invalid_argument, "Invalid CombineMode: " << CM << ".  Valid "
1521  "CombineMode values are ADD, REPLACE, INSERT, and ABSMAX.");
1522  }
1523  }
1524  }
1525 
1526  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1527  size_t
// Number of vectors (columns): for a constant-stride MultiVector this
// is the second dimension of the local view; for a noncontiguous
// column view, it is the number of selected columns in whichVectors_.
1530  {
1531  if (isConstantStride ()) {
1532  return static_cast<size_t> (view_.dimension_1 ());
1533  } else {
1534  return static_cast<size_t> (whichVectors_.size ());
1535  }
1536  }
1537 
1538  namespace { // (anonymous)
1539 
1540  template<class RV, class XMV>
1541  void
1542  lclDotImpl (const RV& dotsOut,
1543  const XMV& X_lcl,
1544  const XMV& Y_lcl,
1545  const size_t lclNumRows,
1546  const size_t numVecs,
1547  const Teuchos::ArrayView<const size_t>& whichVecsX,
1548  const Teuchos::ArrayView<const size_t>& whichVecsY,
1549  const bool constantStrideX,
1550  const bool constantStrideY)
1551  {
1552  using Kokkos::ALL;
1553  using Kokkos::subview;
1554  typedef typename RV::non_const_value_type dot_type;
1555 #ifdef HAVE_TPETRA_DEBUG
1556  const char prefix[] = "Tpetra::MultiVector::lclDotImpl: ";
1557 #endif // HAVE_TPETRA_DEBUG
1558 
1559  static_assert (Kokkos::Impl::is_view<RV>::value,
1560  "Tpetra::MultiVector::lclDotImpl: "
1561  "The first argument dotsOut is not a Kokkos::View.");
1562  static_assert (RV::rank == 1, "Tpetra::MultiVector::lclDotImpl: "
1563  "The first argument dotsOut must have rank 1.");
1564  static_assert (Kokkos::Impl::is_view<XMV>::value,
1565  "Tpetra::MultiVector::lclDotImpl: The type of the 2nd and "
1566  "3rd arguments (X_lcl and Y_lcl) is not a Kokkos::View.");
1567  static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclDotImpl: "
1568  "X_lcl and Y_lcl must have rank 2.");
1569 
1570  // In case the input dimensions don't match, make sure that we
1571  // don't overwrite memory that doesn't belong to us, by using
1572  // subset views with the minimum dimensions over all input.
1573  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
1574  const std::pair<size_t, size_t> colRng (0, numVecs);
1575  RV theDots = subview (dotsOut, colRng);
1576  XMV X = subview (X_lcl, rowRng, Kokkos::ALL());
1577  XMV Y = subview (Y_lcl, rowRng, Kokkos::ALL());
1578 
1579 #ifdef HAVE_TPETRA_DEBUG
1580  if (lclNumRows != 0) {
1581  TEUCHOS_TEST_FOR_EXCEPTION
1582  (X.dimension_0 () != lclNumRows, std::logic_error, prefix <<
1583  "X.dimension_0() = " << X.dimension_0 () << " != lclNumRows "
1584  "= " << lclNumRows << ". "
1585  "Please report this bug to the Tpetra developers.");
1586  TEUCHOS_TEST_FOR_EXCEPTION
1587  (Y.dimension_0 () != lclNumRows, std::logic_error, prefix <<
1588  "Y.dimension_0() = " << Y.dimension_0 () << " != lclNumRows "
1589  "= " << lclNumRows << ". "
1590  "Please report this bug to the Tpetra developers.");
1591  // If a MultiVector is constant stride, then numVecs should
1592  // equal its View's number of columns. Otherwise, numVecs
1593  // should be less than its View's number of columns.
1594  TEUCHOS_TEST_FOR_EXCEPTION
1595  (constantStrideX &&
1596  (X.dimension_0 () != lclNumRows || X.dimension_1 () != numVecs),
1597  std::logic_error, prefix << "X is " << X.dimension_0 () << " x " <<
1598  X.dimension_1 () << " (constant stride), which differs from the "
1599  "local dimensions " << lclNumRows << " x " << numVecs << ". "
1600  "Please report this bug to the Tpetra developers.");
1601  TEUCHOS_TEST_FOR_EXCEPTION
1602  (! constantStrideX &&
1603  (X.dimension_0 () != lclNumRows || X.dimension_1 () < numVecs),
1604  std::logic_error, prefix << "X is " << X.dimension_0 () << " x " <<
1605  X.dimension_1 () << " (NOT constant stride), but the local "
1606  "dimensions are " << lclNumRows << " x " << numVecs << ". "
1607  "Please report this bug to the Tpetra developers.");
1608  TEUCHOS_TEST_FOR_EXCEPTION
1609  (constantStrideY &&
1610  (Y.dimension_0 () != lclNumRows || Y.dimension_1 () != numVecs),
1611  std::logic_error, prefix << "Y is " << Y.dimension_0 () << " x " <<
1612  Y.dimension_1 () << " (constant stride), which differs from the "
1613  "local dimensions " << lclNumRows << " x " << numVecs << ". "
1614  "Please report this bug to the Tpetra developers.");
1615  TEUCHOS_TEST_FOR_EXCEPTION
1616  (! constantStrideY &&
1617  (Y.dimension_0 () != lclNumRows || Y.dimension_1 () < numVecs),
1618  std::logic_error, prefix << "Y is " << Y.dimension_0 () << " x " <<
1619  Y.dimension_1 () << " (NOT constant stride), but the local "
1620  "dimensions are " << lclNumRows << " x " << numVecs << ". "
1621  "Please report this bug to the Tpetra developers.");
1622  }
1623 #endif // HAVE_TPETRA_DEBUG
1624 
1625  if (lclNumRows == 0) {
1626  const dot_type zero = Kokkos::Details::ArithTraits<dot_type>::zero ();
1627  Kokkos::deep_copy(theDots, zero);
1628  }
1629  else { // lclNumRows != 0
1630  if (constantStrideX && constantStrideY) {
1631  if(X.dimension_1() == 1) {
1632  typename RV::non_const_value_type result =
1633  KokkosBlas::dot (Kokkos::subview(X,Kokkos::ALL(),0),
1634  Kokkos::subview(Y,Kokkos::ALL(),0));
1635  Kokkos::deep_copy(theDots,result);
1636  }
1637  else {
1638  KokkosBlas::dot (theDots, X, Y);
1639  }
1640  }
1641  else { // not constant stride
1642  // NOTE (mfh 15 Jul 2014) This does a kernel launch for
1643  // every column. It might be better to have a kernel that
1644  // does the work all at once. On the other hand, we don't
1645  // prioritize performance of MultiVector views of
1646  // noncontiguous columns.
1647  for (size_t k = 0; k < numVecs; ++k) {
1648  const size_t X_col = constantStrideX ? k : whichVecsX[k];
1649  const size_t Y_col = constantStrideY ? k : whichVecsY[k];
1650  KokkosBlas::dot (subview (theDots, k), subview (X, ALL (), X_col),
1651  subview (Y, ALL (), Y_col));
1652  } // for each column
1653  } // constantStride
1654  } // lclNumRows != 0
1655  }
1656 
1657  template<class RV>
1658  void
1659  gblDotImpl (const RV& dotsOut,
1660  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
1661  const bool distributed)
1662  {
1663  using Teuchos::REDUCE_MAX;
1664  using Teuchos::REDUCE_SUM;
1665  using Teuchos::reduceAll;
1666  typedef typename RV::non_const_value_type dot_type;
1667 
1668  const size_t numVecs = dotsOut.dimension_0 ();
1669 
1670  // If the MultiVector is distributed over multiple processes, do
1671  // the distributed (interprocess) part of the dot product. We
1672  // assume that the MPI implementation can read from and write to
1673  // device memory.
1674  //
1675  // replaceMap() may have removed some processes. Those
1676  // processes have a null Map. They must not participate in any
1677  // collective operations. We ask first whether the Map is null,
1678  // because isDistributed() defers that question to the Map. We
1679  // still compute and return local dots for processes not
1680  // participating in collective operations; those probably don't
1681  // make any sense, but it doesn't hurt to do them, since it's
1682  // illegal to call dot() on those processes anyway.
1683  if (distributed && ! comm.is_null ()) {
1684  // The calling process only participates in the collective if
1685  // both the Map and its Comm on that process are nonnull.
1686  const int nv = static_cast<int> (numVecs);
1687  const bool commIsInterComm = Details::isInterComm (*comm);
1688 
1689  if (commIsInterComm) {
1690  // If comm is an intercomm, then we may not alias input and
1691  // output buffers, so we have to make a copy of the local
1692  // sum.
1693  typename RV::non_const_type lclDots (Kokkos::ViewAllocateWithoutInitializing ("tmp"), numVecs);
1694  Kokkos::deep_copy (lclDots, dotsOut);
1695  const dot_type* const lclSum = lclDots.ptr_on_device ();
1696  dot_type* const gblSum = dotsOut.ptr_on_device ();
1697  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
1698  }
1699  else {
1700  dot_type* const inout = dotsOut.ptr_on_device ();
1701  reduceAll<int, dot_type> (*comm, REDUCE_SUM, nv, inout, inout);
1702  }
1703  }
1704  }
1705  } // namespace (anonymous)
1706 
// Kokkos::View overload of dot(): compute the dot product of *this
// with another MultiVector A, one result per column, writing results
// into the device View 'dots'.  The per-process part runs through
// lclDotImpl; the interprocess sum runs through gblDotImpl.
// NOTE (review): the qualified method name and its first parameter
// ("const MultiVector& A", per the uses of A below) were elided by
// the listing generator; the numbered lines are kept unchanged.
1707  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1708  void
1711  const Kokkos::View<dot_type*, device_type>& dots) const
1712  {
1713  using Kokkos::create_mirror_view;
1714  using Kokkos::subview;
1715  using Teuchos::Comm;
1716  using Teuchos::null;
1717  using Teuchos::RCP;
1718  // View of all the dot product results.
1719  typedef Kokkos::View<dot_type*, device_type> RV;
1721  const char tfecfFuncName[] = "Tpetra::MultiVector::dot: ";
1722 
1723  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Kokkos::View)");
1724 
1725  const size_t numVecs = this->getNumVectors ();
1726  if (numVecs == 0) {
1727  return; // nothing to do
1728  }
1729  const size_t lclNumRows = this->getLocalLength ();
1730  const size_t numDots = static_cast<size_t> (dots.dimension_0 ());
1731 
1732 #ifdef HAVE_TPETRA_DEBUG
// Map compatibility is only verified in debug builds.
1733  {
1734  const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
1735  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1736  ! compat, std::invalid_argument, "Tpetra::MultiVector::dot: *this is "
1737  "not compatible with the input MultiVector A. We only test for this "
1738  "in a debug build.");
1739  }
1740 #endif // HAVE_TPETRA_DEBUG
1741 
1742  // FIXME (mfh 11 Jul 2014) These exception tests may not
1743  // necessarily be thrown on all processes consistently. We should
1744  // instead pass along error state with the inner product. We
1745  // could do this by setting an extra slot to
1746  // Kokkos::Details::ArithTraits<dot_type>::one() on error. The
1747  // final sum should be
1748  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1749  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1750  lclNumRows != A.getLocalLength (), std::runtime_error,
1751  "MultiVectors do not have the same local length. "
1752  "this->getLocalLength() = " << lclNumRows << " != "
1753  "A.getLocalLength() = " << A.getLocalLength () << ".");
1754  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1755  numVecs != A.getNumVectors (), std::runtime_error,
1756  "MultiVectors must have the same number of columns (vectors). "
1757  "this->getNumVectors() = " << numVecs << " != "
1758  "A.getNumVectors() = " << A.getNumVectors () << ".");
1759  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1760  numDots != numVecs, std::runtime_error,
1761  "The output array 'dots' must have the same number of entries as the "
1762  "number of columns (vectors) in *this and A. dots.dimension_0() = " <<
1763  numDots << " != this->getNumVectors() = " << numVecs << ".");
1764 
1765  const std::pair<size_t, size_t> colRng (0, numVecs);
1766  RV dotsOut = subview (dots, colRng);
1767  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
1768  this->getMap ()->getComm ();
1769 
1770  // If we need sync to device, then host has the most recent
1771  // version. A is a guest of this method, so we should sync it.
1772  // Thus, let A control where execution happens.
1773  const bool useHostVersion = A.template need_sync<device_type> ();
1774  if (useHostVersion) {
1775  // A was last modified on host, so run the local kernel there.
1776  // This means we need a host mirror of the array of norms too.
1777  typedef typename dual_view_type::t_host XMV;
1778  typedef typename XMV::memory_space cur_memory_space;
1779 
1780  // I consider it more polite to sync *this, then to sync A.
1781  // A is a "guest" of this method, and is passed in const.
1782  const_cast<MV*> (this)->template sync<cur_memory_space> ();
1783  auto thisView = this->template getLocalView<cur_memory_space> ();
1784  auto A_view = A.template getLocalView<cur_memory_space> ();
1785 
1786  lclDotImpl<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1787  this->whichVectors_, A.whichVectors_,
1788  this->isConstantStride (), A.isConstantStride ());
// Local results live on host here: mirror them so the global
// reduction writes host memory, then copy back to the device View.
1789  auto dotsOutHost = Kokkos::create_mirror_view (dotsOut);
1790  Kokkos::deep_copy (dotsOutHost, dotsOut);
1791  gblDotImpl (dotsOutHost, comm, this->isDistributed ());
1792  Kokkos::deep_copy (dotsOut, dotsOutHost);
1793  }
1794  else {
1795  // A was last modified on device, so run the local kernel there.
1796  typedef typename dual_view_type::t_dev XMV;
1797  typedef typename XMV::memory_space cur_memory_space;
1798 
1799  // I consider it more polite to sync *this, then to sync A.
1800  // A is a "guest" of this method, and is passed in const.
1801  //
1802  // Yes, "const" is a lie.
1803  const_cast<MV*> (this)->template sync<cur_memory_space> ();
1804  auto thisView = this->template getLocalView<cur_memory_space> ();
1805  auto A_view = A.template getLocalView<cur_memory_space> ();
1806 
1807  lclDotImpl<RV, XMV> (dotsOut, thisView, A_view, lclNumRows, numVecs,
1808  this->whichVectors_, A.whichVectors_,
1809  this->isConstantStride (), A.isConstantStride ());
// Reduce in place on device; assumes MPI can read device memory
// (stated in gblDotImpl's comments).
1810  gblDotImpl (dotsOut, comm, this->isDistributed ());
1811  }
1812  }
1813 
1814  namespace { // (anonymous)
// Optimized single-column dot product: computes the global dot
// product of the first columns of x and y.  NOTE (review): the
// return type, function name (multiVectorSingleColumnDot, per the
// ProfilingRegion label and the call site in dot(ArrayView)), and
// parameter list were elided by the listing generator; the numbered
// lines below are kept unchanged.
1815  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1819  {
1820  using Teuchos::Comm;
1821  using Teuchos::RCP;
1823  typedef typename MV::dot_type dot_type;
1824  typedef typename MV::dual_view_type::t_dev::memory_space dev_memory_space;
1825  typedef typename MV::dual_view_type::t_host::memory_space host_memory_space;
1826  ::Tpetra::Details::ProfilingRegion region ("Tpetra::multiVectorSingleColumnDot");
1827 
1828  RCP<const typename MV::map_type> map = x.getMap ();
1829  RCP<const Comm<int> > comm = map.is_null () ? Teuchos::null : map->getComm ();
1830  if (comm.is_null ()) {
// Processes with a null Comm (e.g., excluded by replaceMap())
// return zero and do not participate in the reduction below.
1831  return Kokkos::ArithTraits<dot_type>::zero ();
1832  }
1833  else {
1834  typedef LocalOrdinal LO;
1835  // The min just ensures that we don't overwrite memory that
1836  // doesn't belong to us, in the erroneous input case where x
1837  // and y have different numbers of rows.
1838  const LO lclNumRows = static_cast<LO> (std::min (x.getLocalLength (),
1839  y.getLocalLength ()));
1840  const Kokkos::pair<LO, LO> rowRng (0, lclNumRows);
1841  dot_type lclDot = Kokkos::ArithTraits<dot_type>::zero ();
1842  dot_type gblDot = Kokkos::ArithTraits<dot_type>::zero ();
1843 
1844  // This function is meant to be called by x. Therefore, we
1845  // can sync x, but we can't sync y. y is a "guest" of this
1846  // method. Thus, we let y control where execution happens.
1847  // If we need sync to device, then data were last modified on
1848  // host, so we should run on host.
1849 
1850  auto x_vec = x.getVectorNonConst (0);
1851  auto y_vec = y.getVector (0);
1852 
1853  //const bool runOnHost = false;
1854  const bool runOnHost = y_vec->template need_sync<dev_memory_space> ();
1855  // const_cast<MV&> (y).template sync<dev_memory_space> ();
1856  // const_cast<MV&> (x).template sync<dev_memory_space> ();
1857 
// The two branches below are identical except for the memory space
// in which the local dot product is computed.
1858  if (runOnHost) {
1859  typedef host_memory_space cur_memory_space;
1860  // x is nonconst, so we may sync x where we need to sync it.
1861  x_vec->template sync<cur_memory_space> ();
1862  auto x_2d = x_vec->template getLocalView<cur_memory_space> ();
1863  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
1864  auto y_2d = y_vec->template getLocalView<cur_memory_space> ();
1865  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
1866  lclDot = KokkosBlas::dot (x_1d, y_1d);
1867  }
1868  else { // run on device
1869  typedef dev_memory_space cur_memory_space;
1870  // x is nonconst, so we may sync x where we need to sync it.
1871  x_vec->template sync<cur_memory_space> ();
1872  auto x_2d = x_vec->template getLocalView<cur_memory_space> ();
1873  auto x_1d = Kokkos::subview (x_2d, rowRng, 0);
1874  auto y_2d = y_vec->template getLocalView<cur_memory_space> ();
1875  auto y_1d = Kokkos::subview (y_2d, rowRng, 0);
1876  lclDot = KokkosBlas::dot (x_1d, y_1d);
1877  }
1878 
// Interprocess sum of the local result, only if x is distributed.
1879  if (x.isDistributed ()) {
1880  using Teuchos::outArg;
1881  using Teuchos::REDUCE_SUM;
1882  using Teuchos::reduceAll;
1883  reduceAll<int, dot_type> (*comm, REDUCE_SUM, lclDot, outArg (gblDot));
1884  }
1885  else {
1886  gblDot = lclDot;
1887  }
1888  return gblDot;
1889  }
1890  }
1891  } // namespace (anonymous)
1892 
1893 
1894 
// Teuchos::ArrayView overload of dot(): validates dimensions, then
// either takes the optimized single-column path or delegates to the
// Kokkos::View overload through a temporary device buffer.
// NOTE (review): the qualified method name and the first parameter
// ("const MultiVector& A", per the uses of A below) were elided by
// the listing generator; the numbered lines are kept unchanged.
1895  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1896  void
1899  const Teuchos::ArrayView<dot_type>& dots) const
1900  {
1902  const char tfecfFuncName[] = "dot: ";
1903  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::dot (Teuchos::ArrayView)");
1904 
1905  const size_t numVecs = this->getNumVectors ();
1906  const size_t lclNumRows = this->getLocalLength ();
1907  const size_t numDots = static_cast<size_t> (dots.size ());
1908 
1909  // FIXME (mfh 11 Jul 2014, 31 May 2017) These exception tests may
1910  // not necessarily be thrown on all processes consistently. We
1911  // keep them for now, because MultiVector's unit tests insist on
1912  // them. In the future, we should instead pass along error state
1913  // with the inner product. We could do this by setting an extra
1914  // slot to Kokkos::Details::ArithTraits<dot_type>::one() on error.
1915  // The final sum should be
1916  // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
1917  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1918  (lclNumRows != A.getLocalLength (), std::runtime_error,
1919  "MultiVectors do not have the same local length. "
1920  "this->getLocalLength() = " << lclNumRows << " != "
1921  "A.getLocalLength() = " << A.getLocalLength () << ".");
1922  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1923  (numVecs != A.getNumVectors (), std::runtime_error,
1924  "MultiVectors must have the same number of columns (vectors). "
1925  "this->getNumVectors() = " << numVecs << " != "
1926  "A.getNumVectors() = " << A.getNumVectors () << ".");
1927  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1928  (numDots != numVecs, std::runtime_error,
1929  "The output array 'dots' must have the same number of entries as the "
1930  "number of columns (vectors) in *this and A. dots.dimension_0() = " <<
1931  numDots << " != this->getNumVectors() = " << numVecs << ".");
1932 
// Fast path: exactly one column and contiguous storage on both sides.
1933  if (numVecs == 1 && this->isConstantStride () && A.isConstantStride ()) {
1934  const dot_type gblDot = multiVectorSingleColumnDot (const_cast<MV&> (*this), A);
1935  dots[0] = gblDot;
1936  }
1937  else {
1938  // FIXME (mfh 02 Jun 2017) Use the version of KokkosBlas::dot
1939  // that takes a host View as output (for the dot product
1940  // results). This will avoid the temporary buffer allocation
1941  // and the copy from device to host.
1942  typedef Kokkos::View<dot_type*, device_type> dev_dots_view_type;
1943  typedef MakeUnmanagedView<dot_type, device_type> view_getter_type;
1944  typedef typename view_getter_type::view_type host_dots_view_type;
1945 
1946  host_dots_view_type dotsHostView (dots.getRawPtr (), numDots);
1947  dev_dots_view_type dotsDevView ("MV::dot tmp", numDots);
1948  this->dot (A, dotsDevView); // Do the computation on the device.
1949  Kokkos::deep_copy (dotsHostView, dotsDevView); // Bring back result to host
1950  }
1951  }
1952 
1953 
// Teuchos::ArrayView overload of norm2(): wraps the caller's host
// array in an unmanaged View, computes via the Kokkos::View overload
// (which runs normImpl), then copies results back to the host array.
// NOTE (review): the qualified method name line was elided by the
// listing generator; the numbered lines are kept unchanged.
1954  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1955  void
1957  norm2 (const Teuchos::ArrayView<mag_type>& norms) const
1958  {
1959  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
1960  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
1961  typedef typename view_getter_type::view_type host_norms_view_type;
1962 
1963  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::norm2 (Teuchos::ArrayView)");
1964 
1965  const size_t numNorms = static_cast<size_t> (norms.size ());
1966  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
1967  dev_norms_view_type normsDevView ("MV::norm2 tmp", numNorms);
1968  this->norm2 (normsDevView); // Do the computation on the device.
1969  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
1970  }
1971 
1972 
// Kokkos::View overload of norm2(): delegates to normImpl with the
// two-norm tag.  NOTE (review): the qualified method name line was
// elided by the listing generator; the numbered lines are unchanged.
1973  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1974  void
1976  norm2 (const Kokkos::View<mag_type*, device_type>& norms) const
1977  {
1978  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::norm2 (Kokkos::View)");
1979  this->normImpl (norms, NORM_TWO);
1980  }
1981 
1982 
// Deprecated weighted norm.  Computes, per column j, the square root
// of (sum over rows of the nrm2w-squared local contribution) divided
// by the global vector length N.  The weights MultiVector supplies
// either one shared weight column or one column per vector.
// NOTE (review): the qualified method name ("normWeighted", per
// tfecfFuncName below) and its first parameter (the weights
// MultiVector) were elided by the listing generator; the numbered
// lines are kept unchanged.  The exact per-entry weighting formula is
// defined by KokkosBlas::nrm2w_squared -- confirm against its docs.
1983  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
1984  void TPETRA_DEPRECATED
1987  const Teuchos::ArrayView<mag_type> &norms) const
1988  {
1989  using Kokkos::ALL;
1990  using Kokkos::subview;
1991  using Teuchos::Comm;
1992  using Teuchos::null;
1993  using Teuchos::RCP;
1994  using Teuchos::reduceAll;
1995  using Teuchos::REDUCE_SUM;
1996  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
1997  typedef Kokkos::Details::ArithTraits<mag_type> ATM;
1998  typedef Kokkos::View<mag_type*, device_type> norms_view_type;
2000  const char tfecfFuncName[] = "normWeighted: ";
2001 
2002  const size_t numVecs = this->getNumVectors ();
2003  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2004  static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
2005  "norms.size() = " << norms.size () << " != this->getNumVectors() = "
2006  << numVecs << ".");
2007 
// OneW: a single weight column is shared by all columns of *this.
2008  const bool OneW = (weights.getNumVectors () == 1);
2009  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2010  ! OneW && weights.getNumVectors () != numVecs, std::runtime_error,
2011  "The input MultiVector of weights must contain either one column, "
2012  "or must have the same number of columns as *this. "
2013  "weights.getNumVectors() = " << weights.getNumVectors ()
2014  << " and this->getNumVectors() = " << numVecs << ".");
2015 
2016 #ifdef HAVE_TPETRA_DEBUG
2017  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2018  ! this->getMap ()->isCompatible (*weights.getMap ()), std::runtime_error,
2019  "MultiVectors do not have compatible Maps:" << std::endl
2020  << "this->getMap(): " << std::endl << *this->getMap()
2021  << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
2022 #else
// Release build: only the cheap local-length check is performed.
2023  const size_t lclNumRows = this->getLocalLength ();
2024  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2025  lclNumRows != weights.getLocalLength (), std::runtime_error,
2026  "MultiVectors do not have the same local length.");
2027 #endif // HAVE_TPETRA_DEBUG
2028 
2029  norms_view_type lclNrms ("lclNrms", numVecs);
2030 
2031  // FIXME (mfh 18 May 2016) Yes, I know "const" is a lie.
2032  const_cast<MV*> (this)->template sync<device_type> ();
2033  const_cast<MV&> (weights).template sync<device_type> ();
2034 
2035  auto X_lcl = this->template getLocalView<device_type> ();
2036  auto W_lcl = weights.template getLocalView<device_type> ();
2037 
2038  if (isConstantStride () && ! OneW) {
2039  KokkosBlas::nrm2w_squared (lclNrms, X_lcl, W_lcl);
2040  }
2041  else {
// Per-column kernel launches: needed for noncontiguous column views,
// or when the single weight column is shared by every column.
2042  for (size_t j = 0; j < numVecs; ++j) {
2043  const size_t X_col = this->isConstantStride () ? j :
2044  this->whichVectors_[j];
2045  const size_t W_col = OneW ? static_cast<size_t> (0) :
2046  (weights.isConstantStride () ? j : weights.whichVectors_[j]);
2047  KokkosBlas::nrm2w_squared (subview (lclNrms, j),
2048  subview (X_lcl, ALL (), X_col),
2049  subview (W_lcl, ALL (), W_col));
2050  }
2051  }
2052 
2053  const mag_type OneOverN =
2054  ATM::one () / static_cast<mag_type> (this->getGlobalLength ());
2055  RCP<const Comm<int> > comm = this->getMap ().is_null () ?
2056  Teuchos::null : this->getMap ()->getComm ();
2057 
2058  if (! comm.is_null () && this->isDistributed ()) {
2059  // Assume that MPI can access device memory.
2060  reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2061  lclNrms.ptr_on_device (), norms.getRawPtr ());
2062  for (size_t k = 0; k < numVecs; ++k) {
2063  norms[k] = ATM::sqrt (norms[k] * OneOverN);
2064  }
2065  }
2066  else {
// Not distributed (or no Comm): copy the local results to host and
// finish the scaling and square root there.
2067  typename norms_view_type::HostMirror lclNrms_h =
2068  Kokkos::create_mirror_view (lclNrms);
2069  Kokkos::deep_copy (lclNrms_h, lclNrms);
2070  for (size_t k = 0; k < numVecs; ++k) {
2071  norms[k] = ATM::sqrt (ATS::magnitude (lclNrms_h(k)) * OneOverN);
2072  }
2073  }
2074  }
2075 
2076 
// Teuchos::ArrayView overload of norm1(): wraps the caller's host
// array in an unmanaged View, computes via the Kokkos::View overload,
// then copies the results back to the host array.
// NOTE (review): the qualified method name line was elided by the
// listing generator; the numbered lines are kept unchanged.
2077  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2078  void
2080  norm1 (const Teuchos::ArrayView<mag_type>& norms) const
2081  {
2082  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
2083  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
2084  typedef typename view_getter_type::view_type host_norms_view_type;
2085 
2086  const size_t numNorms = static_cast<size_t> (norms.size ());
2087  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
2088  dev_norms_view_type normsDevView ("MV::norm1 tmp", numNorms);
2089  this->norm1 (normsDevView); // Do the computation on the device.
2090  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
2091  }
2092 
2093 
// Kokkos::View overload of norm1(): delegates to normImpl with the
// one-norm tag.  NOTE (review): the qualified method name line was
// elided by the listing generator; the numbered lines are unchanged.
2094  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2095  void
2097  norm1 (const Kokkos::View<mag_type*, device_type>& norms) const
2098  {
2099  this->normImpl (norms, NORM_ONE);
2100  }
2101 
2102 
// Teuchos::ArrayView overload of normInf(): wraps the caller's host
// array in an unmanaged View, computes via the Kokkos::View overload,
// then copies the results back to the host array.
// NOTE (review): the qualified method name line was elided by the
// listing generator; the numbered lines are kept unchanged.
2103  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2104  void
2106  normInf (const Teuchos::ArrayView<mag_type>& norms) const
2107  {
2108  typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
2109  typedef MakeUnmanagedView<mag_type, device_type> view_getter_type;
2110  typedef typename view_getter_type::view_type host_norms_view_type;
2111 
2112  const size_t numNorms = static_cast<size_t> (norms.size ());
2113  host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
2114  dev_norms_view_type normsDevView ("MV::normInf tmp", numNorms);
2115  this->normInf (normsDevView); // Do the computation on the device.
2116  Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
2117  }
2118 
2119 
// Kokkos::View overload of normInf(): delegates to normImpl with the
// infinity-norm tag.  NOTE (review): the qualified method name line
// was elided by the listing generator; the numbered lines are
// unchanged.
2120  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2121  void
2123  normInf (const Kokkos::View<mag_type*, device_type>& norms) const
2124  {
2125  this->normImpl (norms, NORM_INF);
2126  }
2127 
2128  namespace { // (anonymous)
2129 
// Tags selecting which norm lclNormImpl / gblNormImpl compute.
// NOTE (review): the "enum EWhichNormImpl {" opening line was elided
// by the listing generator (the type name appears in the parameter
// lists below); the enumerator lines are kept unchanged.
2132  IMPL_NORM_ONE, //<! Use the one-norm
2133  IMPL_NORM_TWO, //<! Use the two-norm
2134  IMPL_NORM_INF //<! Use the infinity-norm
2135  };
2136 
// Compute the local (per-process) norm of each column of X_lcl,
// one result per column in normsOut.  For IMPL_NORM_TWO this stores
// the *squared* norm; gblNormImpl takes the square root after the
// interprocess sum.
2137  template<class RV, class XMV>
2138  void
2139  lclNormImpl (const RV& normsOut,
2140  const XMV& X_lcl,
2141  const size_t lclNumRows,
2142  const size_t numVecs,
2143  const Teuchos::ArrayView<const size_t>& whichVecs,
2144  const bool constantStride,
2145  const EWhichNormImpl whichNorm)
2146  {
2147  using Kokkos::ALL;
2148  using Kokkos::subview;
2149  typedef typename RV::non_const_value_type mag_type;
2150 
2151  static_assert (Kokkos::Impl::is_view<RV>::value,
2152  "Tpetra::MultiVector::lclNormImpl: "
2153  "The first argument RV is not a Kokkos::View.");
2154  static_assert (RV::rank == 1, "Tpetra::MultiVector::lclNormImpl: "
2155  "The first argument normsOut must have rank 1.");
2156  static_assert (Kokkos::Impl::is_view<XMV>::value,
2157  "Tpetra::MultiVector::lclNormImpl: "
2158  "The second argument X_lcl is not a Kokkos::View.");
2159  static_assert (XMV::rank == 2, "Tpetra::MultiVector::lclNormImpl: "
2160  "The second argument X_lcl must have rank 2.");
2161 
2162  // In case the input dimensions don't match, make sure that we
2163  // don't overwrite memory that doesn't belong to us, by using
2164  // subset views with the minimum dimensions over all input.
2165  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2166  const std::pair<size_t, size_t> colRng (0, numVecs);
2167  RV theNorms = subview (normsOut, colRng);
2168  XMV X = subview (X_lcl, rowRng, Kokkos::ALL());
2169 
2170  // mfh 10 Mar 2015: Kokkos::(Dual)View subviews don't quite
2171  // behave how you think when they have zero rows. In that case,
2172  // it returns a 0 x 0 (Dual)View.
2173  TEUCHOS_TEST_FOR_EXCEPTION(
2174  lclNumRows != 0 && constantStride && ( \
2175  ( X.dimension_0 () != lclNumRows ) ||
2176  ( X.dimension_1 () != numVecs ) ),
2177  std::logic_error, "Constant Stride X's dimensions are " << X.dimension_0 () << " x "
2178  << X.dimension_1 () << ", which differ from the local dimensions "
2179  << lclNumRows << " x " << numVecs << ". Please report this bug to "
2180  "the Tpetra developers.");
2181 
2182  TEUCHOS_TEST_FOR_EXCEPTION(
2183  lclNumRows != 0 && !constantStride && ( \
2184  ( X.dimension_0 () != lclNumRows ) ||
2185  ( X.dimension_1 () < numVecs ) ),
2186  std::logic_error, "Strided X's dimensions are " << X.dimension_0 () << " x "
2187  << X.dimension_1 () << ", which are incompatible with the local dimensions "
2188  << lclNumRows << " x " << numVecs << ". Please report this bug to "
2189  "the Tpetra developers.");
2190 
2191  if (lclNumRows == 0) {
// Degenerate case: no local rows, so every local norm is zero.
2192  const mag_type zeroMag = Kokkos::Details::ArithTraits<mag_type>::zero ();
2193  Kokkos::deep_copy(theNorms, zeroMag);
2194  }
2195  else { // lclNumRows != 0
2196  if (constantStride) {
2197  if (whichNorm == IMPL_NORM_INF) {
2198  KokkosBlas::nrmInf (theNorms, X);
2199  }
2200  else if (whichNorm == IMPL_NORM_ONE) {
2201  KokkosBlas::nrm1 (theNorms, X);
2202  }
2203  else if (whichNorm == IMPL_NORM_TWO) {
2204  KokkosBlas::nrm2_squared (theNorms, X);
2205  }
2206  else {
2207  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
2208  }
2209  }
2210  else { // not constant stride
2211  // NOTE (mfh 15 Jul 2014) This does a kernel launch for
2212  // every column. It might be better to have a kernel that
2213  // does the work all at once. On the other hand, we don't
2214  // prioritize performance of MultiVector views of
2215  // noncontiguous columns.
2216  for (size_t k = 0; k < numVecs; ++k) {
2217  const size_t X_col = constantStride ? k : whichVecs[k];
2218  if (whichNorm == IMPL_NORM_INF) {
2219  KokkosBlas::nrmInf (subview (theNorms, k),
2220  subview (X, ALL (), X_col));
2221  }
2222  else if (whichNorm == IMPL_NORM_ONE) {
2223  KokkosBlas::nrm1 (subview (theNorms, k),
2224  subview (X, ALL (), X_col));
2225  }
2226  else if (whichNorm == IMPL_NORM_TWO) {
2227  KokkosBlas::nrm2_squared (subview (theNorms, k),
2228  subview (X, ALL (), X_col));
2229  }
2230  else {
2231  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Should never get here!");
2232  }
2233  } // for each column
2234  } // constantStride
2235  } // lclNumRows != 0
2236  }
2237 
// Finish the norm across processes: REDUCE_MAX for the infinity
// norm, REDUCE_SUM otherwise; then, for the two-norm, replace each
// entry of normsOut with its square root in place.
2238  template<class RV>
2239  void
2240  gblNormImpl (const RV& normsOut,
2241  const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
2242  const bool distributed,
2243  const EWhichNormImpl whichNorm)
2244  {
2245  using Teuchos::REDUCE_MAX;
2246  using Teuchos::REDUCE_SUM;
2247  using Teuchos::reduceAll;
2248  typedef typename RV::non_const_value_type mag_type;
2249 
2250  const size_t numVecs = normsOut.dimension_0 ();
2251 
2252  // If the MultiVector is distributed over multiple processes, do
2253  // the distributed (interprocess) part of the norm. We assume
2254  // that the MPI implementation can read from and write to device
2255  // memory.
2256  //
2257  // replaceMap() may have removed some processes. Those processes
2258  // have a null Map. They must not participate in any collective
2259  // operations. We ask first whether the Map is null, because
2260  // isDistributed() defers that question to the Map. We still
2261  // compute and return local norms for processes not participating
2262  // in collective operations; those probably don't make any sense,
2263  // but it doesn't hurt to do them, since it's illegal to call
2264  // norm*() on those processes anyway.
2265  if (distributed && ! comm.is_null ()) {
2266  // The calling process only participates in the collective if
2267  // both the Map and its Comm on that process are nonnull.
2268  //
2269  // MPI doesn't allow aliasing of arguments, so we have to make
2270  // a copy of the local sum.
2271  RV lclNorms ("MV::normImpl lcl", numVecs);
2272  Kokkos::deep_copy (lclNorms, normsOut);
2273  const mag_type* const lclSum = lclNorms.ptr_on_device ();
2274  mag_type* const gblSum = normsOut.ptr_on_device ();
2275  const int nv = static_cast<int> (numVecs);
2276  if (whichNorm == IMPL_NORM_INF) {
2277  reduceAll<int, mag_type> (*comm, REDUCE_MAX, nv, lclSum, gblSum);
2278  } else {
2279  reduceAll<int, mag_type> (*comm, REDUCE_SUM, nv, lclSum, gblSum);
2280  }
2281  }
2282 
2283  if (whichNorm == IMPL_NORM_TWO) {
2284  // Replace the norm-squared results with their square roots in
2285  // place, to get the final output. If the device memory and
2286  // the host memory are the same, it probably doesn't pay to
2287  // launch a parallel kernel for that, since there isn't enough
2288  // parallelism for the typical MultiVector case.
2289  const bool inHostMemory =
2290  Kokkos::Impl::is_same<typename RV::memory_space,
2291  typename RV::host_mirror_space::memory_space>::value;
2292  if (inHostMemory) {
2293  for (size_t j = 0; j < numVecs; ++j) {
2294  normsOut(j) = Kokkos::Details::ArithTraits<mag_type>::sqrt (normsOut(j));
2295  }
2296  }
2297  else {
2298  // There's not as much parallelism now, but that's OK. The
2299  // point of doing parallel dispatch here is to keep the norm
2300  // results on the device, thus avoiding a copy to the host and
2301  // back again.
2302  KokkosBlas::Impl::SquareRootFunctor<RV> f (normsOut);
2303  typedef typename RV::execution_space execution_space;
2304  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2305  Kokkos::parallel_for (range_type (0, numVecs), f);
2306  }
2307  }
2308  }
2309 
2310  } // namespace (anonymous)
2311 
// Compute the norm (1-norm, 2-norm, or inf-norm, selected by
// whichNorm) of each column of this MultiVector, writing the results
// into the device View 'norms'.  The local kernel runs in whichever
// memory space (host or device) holds the most recently modified copy
// of the data; gblNormImpl then finishes the global reduction over
// the Map's communicator.
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2314) is missing from this extraction of the file.
2312  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2313  void
2315  normImpl (const Kokkos::View<mag_type*, device_type>& norms,
2316  const EWhichNorm whichNorm) const
2317  {
2318  using Kokkos::create_mirror_view;
2319  using Kokkos::subview;
2320  using Teuchos::Comm;
2321  using Teuchos::null;
2322  using Teuchos::RCP;
2323  // View of all the norm results.
2324  typedef Kokkos::View<mag_type*, device_type> RV;
2325 
2326  const size_t numVecs = this->getNumVectors ();
2327  if (numVecs == 0) {
2328  return; // nothing to do
2329  }
2330  const size_t lclNumRows = this->getLocalLength ();
2331  const size_t numNorms = static_cast<size_t> (norms.dimension_0 ());
// 'norms' may be longer than needed; only the first numVecs entries
// are written (via the subview below), but it must not be shorter.
2332  TEUCHOS_TEST_FOR_EXCEPTION(
2333  numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::normImpl: "
2334  "'norms' must have at least as many entries as the number of vectors in "
2335  "*this. norms.dimension_0() = " << numNorms << " < this->getNumVectors()"
2336  " = " << numVecs << ".");
2337 
2338  const std::pair<size_t, size_t> colRng (0, numVecs);
2339  RV normsOut = subview (norms, colRng);
2340 
// Translate the public norm enum into the enum the local kernel
// (lclNormImpl) understands.  Anything that is not NORM_ONE or
// NORM_TWO is treated as the infinity norm.
2341  EWhichNormImpl lclNormType;
2342  if (whichNorm == NORM_ONE) {
2343  lclNormType = IMPL_NORM_ONE;
2344  } else if (whichNorm == NORM_TWO) {
2345  lclNormType = IMPL_NORM_TWO;
2346  } else {
2347  lclNormType = IMPL_NORM_INF;
2348  }
2349 
// A null Map (possible after replaceMap on an excluded process)
// means there is no communicator; gblNormImpl handles a null comm.
2350  RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
2351  this->getMap ()->getComm ();
2352 
2353  // If we need sync to device, then host has the most recent version.
2354  const bool useHostVersion = this->template need_sync<device_type> ();
2355  if (useHostVersion) {
2356  // DualView was last modified on host, so run the local kernel there.
2357  // This means we need a host mirror of the array of norms too.
2358  typedef typename dual_view_type::t_host XMV;
2359  typedef typename XMV::memory_space cur_memory_space;
2360 
2361  auto thisView = this->template getLocalView<cur_memory_space> ();
2362  lclNormImpl<RV, XMV> (normsOut, thisView, lclNumRows, numVecs,
2363  this->whichVectors_, this->isConstantStride (),
2364  lclNormType);
// Do the global reduction on a host mirror of the (device) norm
// View, then copy the final results back into normsOut.
// NOTE(review): presumably this is so gblNormImpl can operate on
// host-accessible memory in this branch -- confirm against
// gblNormImpl's requirements.
2365  auto normsOutHost = Kokkos::create_mirror_view (normsOut);
2366  Kokkos::deep_copy (normsOutHost, normsOut);
2367  gblNormImpl (normsOutHost, comm, this->isDistributed (), lclNormType);
2368  Kokkos::deep_copy (normsOut, normsOutHost);
2369  }
2370  else {
2371  // DualView was last modified on device, so run the local kernel there.
2372  typedef typename dual_view_type::t_dev XMV;
2373  typedef typename XMV::memory_space cur_memory_space;
2374 
2375  auto thisView = this->template getLocalView<cur_memory_space> ();
2376  lclNormImpl<RV, XMV> (normsOut, thisView, lclNumRows, numVecs,
2377  this->whichVectors_, this->isConstantStride (),
2378  lclNormType);
2379  gblNormImpl (normsOut, comm, this->isDistributed (), lclNormType);
2380  }
2381  }
2382 
// Compute the arithmetic mean of each column of this MultiVector:
// sum each column locally (on host or device, wherever the data was
// most recently modified), all-reduce the sums over the Map's
// communicator, and divide by the global vector length.  The results
// go into the caller-provided host array 'means', which must have
// exactly getNumVectors() entries (std::runtime_error otherwise).
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2385) is missing from this extraction of the file.
2383  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2384  void
2386  meanValue (const Teuchos::ArrayView<impl_scalar_type>& means) const
2387  {
2388  // KR FIXME Overload this method to take a View.
2389  using Kokkos::ALL;
2390  using Kokkos::subview;
2391  using Teuchos::Comm;
2392  using Teuchos::RCP;
2393  using Teuchos::reduceAll;
2394  using Teuchos::REDUCE_SUM;
2395  typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
2396 
2397  const size_t lclNumRows = this->getLocalLength ();
2398  const size_t numVecs = this->getNumVectors ();
2399  const size_t numMeans = static_cast<size_t> (means.size ());
2400 
2401  TEUCHOS_TEST_FOR_EXCEPTION(
2402  numMeans != numVecs, std::runtime_error,
2403  "Tpetra::MultiVector::meanValue: means.size() = " << numMeans
2404  << " != this->getNumVectors() = " << numVecs << ".");
2405 
2406  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2407  const std::pair<size_t, size_t> colRng (0, numVecs);
2408 
2409  // Make sure that the final output view has the same layout as the
2410  // temporary view's HostMirror. Left or Right doesn't matter for
2411  // a 1-D array anyway; this is just to placate the compiler.
2412  typedef Kokkos::View<impl_scalar_type*, device_type> local_view_type;
// Unmanaged host View wrapping the caller's raw output array, so we
// can deep_copy / reduce directly into it without owning it.
2413  typedef Kokkos::View<impl_scalar_type*,
2414  typename local_view_type::HostMirror::array_layout,
2415  Kokkos::HostSpace,
2416  Kokkos::MemoryTraits<Kokkos::Unmanaged> > host_local_view_type;
2417  host_local_view_type meansOut (means.getRawPtr (), numMeans);
2418 
// A null Map (possible after replaceMap on an excluded process)
// means there is no communicator to reduce over.
2419  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
2420  this->getMap ()->getComm ();
2421 
2422  // If we need sync to device, then host has the most recent version.
2423  const bool useHostVersion = this->template need_sync<device_type> ();
2424  if (useHostVersion) {
2425  // DualView was last modified on host, so run the local kernel there.
2426  auto X_lcl = subview (this->template getLocalView<Kokkos::HostSpace> (),
2427  rowRng, Kokkos::ALL ());
2428  // Compute the local sum of each column.
2429  typename local_view_type::HostMirror lclSums ("MV::meanValue tmp", numVecs);
2430  if (isConstantStride ()) {
2431  KokkosBlas::sum (lclSums, X_lcl);
2432  }
2433  else {
// Noncontiguous column view: sum each selected column separately.
2434  for (size_t j = 0; j < numVecs; ++j) {
2435  const size_t col = whichVectors_[j];
2436  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2437  }
2438  }
2439 
2440  // If there are multiple MPI processes, the all-reduce reads
2441  // from lclSums, and writes to meansOut. Otherwise, we just
2442  // copy lclSums into meansOut.
2443  if (! comm.is_null () && this->isDistributed ()) {
2444  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2445  lclSums.ptr_on_device (), meansOut.ptr_on_device ());
2446  }
2447  else {
2448  Kokkos::deep_copy (meansOut, lclSums);
2449  }
2450  }
2451  else {
2452  // DualView was last modified on device, so run the local kernel there.
2453  auto X_lcl = subview (this->template getLocalView<device_type> (),
2454  rowRng, Kokkos::ALL ());
2455 
2456  // Compute the local sum of each column.
2457  local_view_type lclSums ("MV::meanValue tmp", numVecs);
2458  if (isConstantStride ()) {
2459  KokkosBlas::sum (lclSums, X_lcl);
2460  }
2461  else {
2462  for (size_t j = 0; j < numVecs; ++j) {
2463  const size_t col = whichVectors_[j];
2464  KokkosBlas::sum (subview (lclSums, j), subview (X_lcl, ALL (), col));
2465  }
2466  }
2467 
2468  // If there are multiple MPI processes, the all-reduce reads
2469  // from lclSums, and writes to meansOut. (We assume that MPI
2470  // can read device memory.) Otherwise, we just copy lclSums
2471  // into meansOut.
2472  if (! comm.is_null () && this->isDistributed ()) {
2473  reduceAll (*comm, REDUCE_SUM, static_cast<int> (numVecs),
2474  lclSums.ptr_on_device (), meansOut.ptr_on_device ());
2475  }
2476  else {
2477  Kokkos::deep_copy (meansOut, lclSums);
2478  }
2479  }
2480 
2481  // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
2482  // to the magnitude type, since operator/ (std::complex<T>, int)
2483  // isn't necessarily defined.
2484  const impl_scalar_type OneOverN =
2485  ATS::one () / static_cast<mag_type> (this->getGlobalLength ());
2486  for (size_t k = 0; k < numMeans; ++k) {
2487  meansOut(k) = meansOut(k) * OneOverN;
2488  }
2489  }
2490 
2491 
// Fill this MultiVector with pseudorandom values spanning the full
// range of the XorShift64 generator's value type: [-max, max] for
// signed types, [0, max] otherwise.  Delegates to the two-argument
// randomize(min, max) overload below.
//
// NOTE(review): the "MultiVector<...>::randomize ()" signature lines
// (original lines 2494-2495) are missing from this extraction of the
// file.
2492  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2493  void
2496  {
2497  typedef impl_scalar_type IST;
2498  typedef Kokkos::Details::ArithTraits<IST> ATS;
2499  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2500  typedef typename pool_type::generator_type generator_type;
2501 
// Largest value the pool's generator can produce for this scalar
// type; use it (and its negation, when the type is signed) as the
// sampling bounds.
2502  const IST max = Kokkos::rand<generator_type, IST>::max ();
2503  const IST min = ATS::is_signed ? IST (-max) : ATS::zero ();
2504 
2505  this->randomize (static_cast<Scalar> (min), static_cast<Scalar> (max));
2506  }
2507 
2508 
// Fill this MultiVector with pseudorandom values in [minVal, maxVal].
// The generator pool is seeded from std::rand() plus the calling
// process' rank, so different MPI processes get (mostly) decorrelated
// streams without communication.  The fill always runs on device;
// both DualView modified flags are cleared first and the device side
// is then marked modified.
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2511) is missing from this extraction of the file.
2509  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2510  void
2512  randomize (const Scalar& minVal, const Scalar& maxVal)
2513  {
2514  typedef impl_scalar_type IST;
2515  typedef Kokkos::Random_XorShift64_Pool<typename device_type::execution_space> pool_type;
2516 
2517  // Seed the pseudorandom number generator using the calling
2518  // process' rank. This helps decorrelate different process'
2519  // pseudorandom streams. It's not perfect but it's effective and
2520  // doesn't require MPI communication. The seed also includes bits
2521  // from the standard library's rand().
2522  //
2523  // FIXME (mfh 07 Jan 2015) Should we save the seed for later use?
2524  // The code below just makes a new seed each time.
2525 
2526  const uint64_t myRank =
2527  static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
2528  uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
// The pool takes a 32-bit seed; keep only the low 32 bits.
2529  unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
2530 
2531  pool_type rand_pool (seed);
2532  const IST max = static_cast<IST> (maxVal);
2533  const IST min = static_cast<IST> (minVal);
2534 
2535  // See #1510. In case diag has already been marked modified on
2536  // host or device, we need to clear those flags, since the code
2537  // below works on device.
2538  this->view_.modified_device () = 0;
2539  this->view_.modified_host () = 0;
2540 
2541  this->template modify<device_type> ();
2542  auto thisView = this->getLocalView<device_type> ();
2543 
2544  if (isConstantStride ()) {
2545  Kokkos::fill_random (thisView, rand_pool, min, max);
2546  }
2547  else {
// Noncontiguous column view: fill each selected column separately.
2548  const size_t numVecs = getNumVectors ();
2549  for (size_t k = 0; k < numVecs; ++k) {
2550  const size_t col = whichVectors_[k];
2551  auto X_k = Kokkos::subview (thisView, Kokkos::ALL (), col);
2552  Kokkos::fill_random (X_k, rand_pool, min, max);
2553  }
2554  }
2555  }
2556 
2557 
// Set every entry of this MultiVector to alpha.  The assignment
// happens in whichever memory space (host or device) holds the most
// recently modified copy of the data, so no sync is needed; that side
// is then marked modified.
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2560) is missing from this extraction of the file.
2558  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2559  void
2561  putScalar (const Scalar& alpha)
2562  {
2563  using Kokkos::ALL;
2564  using Kokkos::deep_copy;
2565  using Kokkos::subview;
2566  typedef typename device_type::memory_space DMS;
2567  typedef Kokkos::HostSpace HMS;
2568 
2569  // We need this cast for cases like Scalar = std::complex<T> but
2570  // impl_scalar_type = Kokkos::complex<T>.
2571  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2572  const size_t lclNumRows = this->getLocalLength ();
2573  const size_t numVecs = this->getNumVectors ();
2574  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2575  const std::pair<size_t, size_t> colRng (0, numVecs);
2576 
2577  // Modify the most recently updated version of the data. This
2578  // avoids sync'ing, which could violate users' expectations.
2579  //
2580  // If we need sync to device, then host has the most recent version.
2581  const bool useHostVersion = this->template need_sync<device_type> ();
2582 
2583  if (! useHostVersion) { // last modified in device memory
2584  this->template modify<DMS> (); // we are about to modify on the device
2585  auto X = subview (this->template getLocalView<DMS> (), rowRng, ALL ());
// Single-column case: fill just that 1-D column view.
2586  if (numVecs == 1) {
2587  auto X_0 = subview (X, ALL (), static_cast<size_t> (0));
2588  deep_copy (X_0, theAlpha);
2589  }
2590  else if (isConstantStride ()) {
2591  deep_copy (X, theAlpha);
2592  }
2593  else {
// Noncontiguous column view: fill each selected column separately.
2594  for (size_t k = 0; k < numVecs; ++k) {
2595  const size_t col = whichVectors_[k];
2596  auto X_k = subview (X, ALL (), col);
2597  deep_copy (X_k, theAlpha);
2598  }
2599  }
2600  }
2601  else { // last modified in host memory, so modify data there.
2602  this->template modify<HMS> (); // we are about to modify on the host
2603  auto X = subview (this->template getLocalView<HMS> (), rowRng, ALL ());
2604  if (numVecs == 1) {
2605  auto X_0 = subview (X, ALL (), static_cast<size_t> (0));
2606  deep_copy (X_0, theAlpha);
2607  }
2608  else if (isConstantStride ()) {
2609  deep_copy (X, theAlpha);
2610  }
2611  else {
2612  for (size_t k = 0; k < numVecs; ++k) {
2613  const size_t col = whichVectors_[k];
2614  auto X_k = subview (X, ALL (), col);
2615  deep_copy (X_k, theAlpha);
2616  }
2617  }
2618  }
2619  }
2620 
2621 
// Replace this MultiVector's Map with newMap, reallocating the local
// DualView when the calling process enters or leaves the Map's
// communicator (see the Case 1-4 discussion below).  Only works for
// constant-stride MultiVectors (std::logic_error otherwise); throws
// std::invalid_argument if both the current and new Maps are null.
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2624) is missing from this extraction of the file.
2622  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2623  void
2625  replaceMap (const Teuchos::RCP<const map_type>& newMap)
2626  {
2627  using Teuchos::ArrayRCP;
2628  using Teuchos::Comm;
2629  using Teuchos::RCP;
2630 
2631  // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
2632  // it might work if the MV is a column view of another MV.
2633  // However, things might go wrong when restoring the original
2634  // Map, so we don't allow this case for now.
2635  TEUCHOS_TEST_FOR_EXCEPTION(
2636  ! this->isConstantStride (), std::logic_error,
2637  "Tpetra::MultiVector::replaceMap: This method does not currently work "
2638  "if the MultiVector is a column view of another MultiVector (that is, if "
2639  "isConstantStride() == false).");
2640 
2641  // Case 1: current Map and new Map are both nonnull on this process.
2642  // Case 2: current Map is nonnull, new Map is null.
2643  // Case 3: current Map is null, new Map is nonnull.
2644  // Case 4: both Maps are null: forbidden.
2645  //
2646  // Case 1 means that we don't have to do anything on this process,
2647  // other than assign the new Map. (We always have to do that.)
2648  // It's an error for the user to supply a Map that requires
2649  // resizing in this case.
2650  //
2651  // Case 2 means that the calling process is in the current Map's
2652  // communicator, but will be excluded from the new Map's
2653  // communicator. We don't have to do anything on the calling
2654  // process; just leave whatever data it may have alone.
2655  //
2656  // Case 3 means that the calling process is excluded from the
2657  // current Map's communicator, but will be included in the new
2658  // Map's communicator. This means we need to (re)allocate the
2659  // local DualView if it does not have the right number of rows.
2660  // If the new number of rows is nonzero, we'll fill the newly
2661  // allocated local data with zeros, as befits a projection
2662  // operation.
2663  //
2664  // The typical use case for Case 3 is that the MultiVector was
2665  // first created with the Map with more processes, then that Map
2666  // was replaced with a Map with fewer processes, and finally the
2667  // original Map was restored on this call to replaceMap.
2668 
2669 #ifdef HAVE_TEUCHOS_DEBUG
2670  // mfh 28 Mar 2013: We can't check for compatibility across the
2671  // whole communicator, unless we know that the current and new
2672  // Maps are nonnull on _all_ participating processes.
2673  // TEUCHOS_TEST_FOR_EXCEPTION(
2674  // origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
2675  // std::invalid_argument, "Tpetra::MultiVector::project: "
2676  // "If the input Map's communicator is compatible (has the same number of "
2677  // "processes as) the current Map's communicator, then the two Maps must be "
2678  // "compatible. The replaceMap() method is not for data redistribution; "
2679  // "use Import or Export for that purpose.");
2680 
2681  // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
2682  // of the Map, in case the process counts don't match.
2683 #endif // HAVE_TEUCHOS_DEBUG
2684 
2685  if (this->getMap ().is_null ()) { // current Map is null
2686  // If this->getMap() is null, that means that this MultiVector
2687  // has already had replaceMap happen to it. In that case, just
2688  // reallocate the DualView with the right size.
2689 
2690  TEUCHOS_TEST_FOR_EXCEPTION(
2691  newMap.is_null (), std::invalid_argument,
2692  "Tpetra::MultiVector::replaceMap: both current and new Maps are null. "
2693  "This probably means that the input Map is incorrect.");
2694 
2695  // Case 3: current Map is null, new Map is nonnull.
2696  // Reallocate the DualView with the right dimensions.
2697  const size_t newNumRows = newMap->getNodeNumElements ();
2698  const size_t origNumRows = view_.dimension_0 ();
2699  const size_t numCols = this->getNumVectors ();
2700 
// Only reallocate if the local dimensions actually changed.
2701  if (origNumRows != newNumRows || view_.dimension_1 () != numCols) {
2702  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
2703  }
2704  }
2705  else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
2706  // I am an excluded process. Reinitialize my data so that I
2707  // have 0 rows. Keep the number of columns as before.
2708  const size_t newNumRows = static_cast<size_t> (0);
2709  const size_t numCols = this->getNumVectors ();
2710  view_ = allocDualView<Scalar, LocalOrdinal, GlobalOrdinal, Node> (newNumRows, numCols);
2711  }
2712 
// Case 1 (both nonnull) falls through to here with no reallocation;
// only the Map itself is replaced.
2713  this->map_ = newMap;
2714  }
2715 
// Scale this MultiVector in place: *this := alpha * (*this).  A
// no-op when alpha == 1.  Runs on host or device, whichever holds
// the most recently modified copy of the data.  Note that
// scale(0) is deliberately NOT putScalar(0): multiplying by zero
// must propagate NaNs per IEEE 754 (see comment below).
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2718) is missing from this extraction of the file.
2716  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2717  void
2719  scale (const Scalar& alpha)
2720  {
2721  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2722  if (theAlpha == Kokkos::Details::ArithTraits<impl_scalar_type>::one ()) {
2723  return; // do nothing
2724  }
2725  const size_t lclNumRows = this->getLocalLength ();
2726  const size_t numVecs = this->getNumVectors ();
2727  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2728  const std::pair<size_t, size_t> colRng (0, numVecs);
2729 
2730  // We can't substitute putScalar(0.0) for scale(0.0), because the
2731  // former will overwrite NaNs present in the MultiVector. The
2732  // semantics of this call require multiplying them by 0, which
2733  // IEEE 754 requires to be NaN.
2734 
2735  // If we need sync to device, then host has the most recent version.
2736  const bool useHostVersion = this->template need_sync<device_type> ();
2737  if (useHostVersion) {
2738  auto Y_lcl =
2739  Kokkos::subview (this->template getLocalView<Kokkos::HostSpace> (),
2740  rowRng, Kokkos::ALL ());
2741  if (isConstantStride ()) {
2742  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2743  }
2744  else {
// Noncontiguous column view: scale each selected column separately.
2745  for (size_t k = 0; k < numVecs; ++k) {
2746  const size_t Y_col = this->isConstantStride () ? k :
2747  this->whichVectors_[k];
2748  auto Y_k = Kokkos::subview (Y_lcl, Kokkos::ALL (), Y_col);
2749  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2750  }
2751  }
2752  }
2753  else { // work on device
2754  auto Y_lcl =
2755  Kokkos::subview (this->template getLocalView<device_type> (),
2756  rowRng, Kokkos::ALL ());
2757  if (isConstantStride ()) {
2758  KokkosBlas::scal (Y_lcl, theAlpha, Y_lcl);
2759  }
2760  else {
2761  for (size_t k = 0; k < numVecs; ++k) {
2762  const size_t Y_col = this->isConstantStride () ? k :
2763  this->whichVectors_[k];
2764  auto Y_k = Kokkos::subview (Y_lcl, Kokkos::ALL (), Y_col);
2765  KokkosBlas::scal (Y_k, theAlpha, Y_k);
2766  }
2767  }
2768  }
2769  }
2770 
2771 
// Scale each column j of this MultiVector in place by alphas[j].
// 'alphas' must have exactly getNumVectors() entries
// (std::invalid_argument otherwise).  The coefficients are staged
// through a DualView so the actual work can run on device; this
// overload simply marshals the host array and delegates to the
// overload that takes a device View of coefficients.
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2774) is missing from this extraction of the file.
2772  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2773  void
2775  scale (const Teuchos::ArrayView<const Scalar>& alphas)
2776  {
2777  const size_t numVecs = this->getNumVectors ();
2778  const size_t numAlphas = static_cast<size_t> (alphas.size ());
2779  TEUCHOS_TEST_FOR_EXCEPTION(
2780  numAlphas != numVecs, std::invalid_argument, "Tpetra::MultiVector::"
2781  "scale: alphas.size() = " << numAlphas << " != this->getNumVectors() = "
2782  << numVecs << ".");
2783 
2784  // Use a DualView to copy the scaling constants onto the device.
2785  typedef Kokkos::DualView<impl_scalar_type*, device_type> k_alphas_type ;
2786  k_alphas_type k_alphas ("alphas::tmp", numAlphas);
// Fill the host side, then sync to device before handing off.
2787  k_alphas.template modify<typename k_alphas_type::host_mirror_space> ();
2788  for (size_t i = 0; i < numAlphas; ++i) {
2789  k_alphas.h_view(i) = static_cast<impl_scalar_type> (alphas[i]);
2790  }
2791  k_alphas.template sync<typename k_alphas_type::memory_space> ();
2792  // Invoke the scale() overload that takes a device View of coefficients.
2793  this->scale (k_alphas.d_view);
2794  }
2795 
// Scale each column j of this MultiVector in place by alphas(j),
// where 'alphas' is a device View with exactly getNumVectors()
// entries (std::invalid_argument otherwise).  Works on host or
// device, whichever holds the most recently modified copy of the
// data; the host path first copies 'alphas' to a host mirror.
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2798) is missing from this extraction of the file.
2796  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2797  void
2799  scale (const Kokkos::View<const impl_scalar_type*, device_type>& alphas)
2800  {
2801  using Kokkos::ALL;
2802  using Kokkos::subview;
2803 
2804  const size_t lclNumRows = this->getLocalLength ();
2805  const size_t numVecs = this->getNumVectors ();
2806  TEUCHOS_TEST_FOR_EXCEPTION(
2807  static_cast<size_t> (alphas.dimension_0 ()) != numVecs,
2808  std::invalid_argument, "Tpetra::MultiVector::scale(alphas): "
2809  "alphas.dimension_0() = " << alphas.dimension_0 ()
2810  << " != this->getNumVectors () = " << numVecs << ".");
2811  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2812  const std::pair<size_t, size_t> colRng (0, numVecs);
2813 
2814  // NOTE (mfh 08 Apr 2015) We prefer to let the compiler deduce the
2815  // type of the return value of subview. This is because if we
2816  // switch the array layout from LayoutLeft to LayoutRight
2817  // (preferred for performance of block operations), the types
2818  // below won't be valid. (A view of a column of a LayoutRight
2819  // multivector has LayoutStride, not LayoutLeft.)
2820 
2821  // If we need sync to device, then host has the most recent version.
2822  const bool useHostVersion = this->template need_sync<device_type> ();
2823  if (useHostVersion) {
2824  // Work in host memory. This means we need to create a host
2825  // mirror of the input View of coefficients.
2826  auto alphas_h = Kokkos::create_mirror_view (alphas);
2827  typedef typename decltype (alphas_h)::memory_space cur_memory_space;
2828  Kokkos::deep_copy (alphas_h, alphas);
2829 
2830  auto Y_lcl =
2831  Kokkos::subview (this->template getLocalView<cur_memory_space> (),
2832  rowRng, Kokkos::ALL ());
2833  if (isConstantStride ()) {
2834  KokkosBlas::scal (Y_lcl, alphas_h, Y_lcl);
2835  }
2836  else {
// Noncontiguous column view: scale each selected column separately.
2837  for (size_t k = 0; k < numVecs; ++k) {
2838  const size_t Y_col = this->isConstantStride () ? k :
2839  this->whichVectors_[k];
2840  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2841  // We don't have to use the entire 1-D View here; we can use
2842  // the version that takes a scalar coefficient.
2843  KokkosBlas::scal (Y_k, alphas_h(k), Y_k);
2844  }
2845  }
2846  }
2847  else { // Work in device memory, using the input View 'alphas' directly.
2848  auto Y_lcl =
2849  Kokkos::subview (this->template getLocalView<device_type> (),
2850  rowRng, Kokkos::ALL());
2851  if (isConstantStride ()) {
2852  KokkosBlas::scal (Y_lcl, alphas, Y_lcl);
2853  }
2854  else {
2855  for (size_t k = 0; k < numVecs; ++k) {
2856  const size_t Y_col = this->isConstantStride () ? k :
2857  this->whichVectors_[k];
2858  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2859  //
2860  // FIXME (mfh 08 Apr 2015) This assumes UVM. It would be
2861  // better to fix scal() so that it takes a 0-D View as the
2862  // second argument.
2863  //
2864  KokkosBlas::scal (Y_k, alphas(k), Y_k);
2865  }
2866  }
2867  }
2868  }
2869 
// Scale into this MultiVector from another: *this := alpha * A,
// elementwise.  A must have the same local length and number of
// vectors as *this (std::invalid_argument otherwise).  Runs on host
// or device, whichever holds *this's most recently modified data;
// *this is synced and marked modified there, while A is read
// without being synced (A is the "guest" -- see comment below).
//
// NOTE(review): the "MultiVector<...>::" qualifier line (original
// line 2872) and the second-parameter line declaring A (original
// line 2874, presumably "const MultiVector<...>& A") are missing
// from this extraction of the file.
2870  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2871  void
2873  scale (const Scalar& alpha,
2875  {
2876  using Kokkos::ALL;
2877  using Kokkos::subview;
2878  const char tfecfFuncName[] = "scale: ";
2879 
2880  const size_t lclNumRows = getLocalLength ();
2881  const size_t numVecs = getNumVectors ();
2882 
2883  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2884  lclNumRows != A.getLocalLength (), std::invalid_argument,
2885  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
2886  << A.getLocalLength () << ".");
2887  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2888  numVecs != A.getNumVectors (), std::invalid_argument,
2889  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
2890  << A.getNumVectors () << ".");
2891 
2892  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
2893  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
2894  const std::pair<size_t, size_t> colRng (0, numVecs);
2895 
2896  typedef typename dual_view_type::t_dev dev_view_type;
2897  typedef typename dual_view_type::t_host host_view_type;
2898 
2899  // If we need sync to device, then host has the most recent version.
2900  const bool useHostVersion = this->template need_sync<device_type> ();
2901  if (useHostVersion) {
2902  // Work on host, where A's data were most recently modified. A
2903  // is a "guest" of this method, so it's more polite to sync
2904  // *this, than to sync A.
2905  typedef typename host_view_type::memory_space cur_memory_space;
2906 
2907  this->template sync<cur_memory_space> ();
2908  this->template modify<cur_memory_space> ();
2909  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
2910  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
2911  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2912  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2913 
2914  if (isConstantStride () && A.isConstantStride ()) {
2915  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2916  }
2917  else {
2918  // Make sure that Kokkos only uses the local length for add.
// At least one side is a noncontiguous column view; process the
// selected columns one at a time.
2919  for (size_t k = 0; k < numVecs; ++k) {
2920  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2921  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2922  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2923  auto X_k = subview (X_lcl, ALL (), X_col);
2924 
2925  KokkosBlas::scal (Y_k, theAlpha, X_k);
2926  }
2927  }
2928  }
2929  else { // work on device
2930  typedef typename dev_view_type::memory_space cur_memory_space;
2931 
2932  this->template sync<cur_memory_space> ();
2933  this->template modify<cur_memory_space> ();
2934  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
2935  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
2936  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
2937  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
2938 
2939  if (isConstantStride () && A.isConstantStride ()) {
2940  KokkosBlas::scal (Y_lcl, theAlpha, X_lcl);
2941  }
2942  else {
2943  // Make sure that Kokkos only uses the local length for add.
2944  for (size_t k = 0; k < numVecs; ++k) {
2945  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
2946  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
2947  auto Y_k = subview (Y_lcl, ALL (), Y_col);
2948  auto X_k = subview (X_lcl, ALL (), X_col);
2949 
2950  KokkosBlas::scal (Y_k, theAlpha, X_k);
2951  }
2952  }
2953  }
2954  }
2955 
2956 
// Set this MultiVector to the elementwise reciprocal of A:
// (*this)(i,j) := 1 / A(i,j).  A must have the same local length and
// number of vectors as *this (std::runtime_error otherwise).  Both
// *this and A are synced to device and the work runs there via
// KokkosBlas::reciprocal.
//
// NOTE(review): the signature lines (original lines 2959-2960,
// presumably "MultiVector<...>::reciprocal (const MultiVector<...>&
// A)") and original line 2962 (presumably the "typedef ... MV;" that
// the const_cast below relies on) are missing from this extraction
// of the file.
2957  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
2958  void
2961  {
2963  const char tfecfFuncName[] = "reciprocal: ";
2964 
2965  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2966  getLocalLength () != A.getLocalLength (), std::runtime_error,
2967  "MultiVectors do not have the same local length. "
2968  "this->getLocalLength() = " << getLocalLength ()
2969  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
2970  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
2971  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
2972  ": MultiVectors do not have the same number of columns (vectors). "
2973  "this->getNumVectors() = " << getNumVectors ()
2974  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
2975 
2976  // FIXME (mfh 07 Jan 2015) See note on two-argument scale() above.
2977 
2978  const size_t numVecs = getNumVectors ();
2979  try {
2980  this->template sync<device_type> ();
2981  this->template modify<device_type> ();
2982  // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
2983  // responsibility to sync A.
2984  //
2985  // FIXME (mfh 18 May 2016) It's rude to sync the input
2986  // argument, since it is marked const.
2987  const_cast<MV&> (A).template sync<device_type> ();
2988 
2989  auto this_view_dev = this->template getLocalView<device_type> ();
2990  auto A_view_dev = A.template getLocalView<device_type> ();
2991 
2992  if (isConstantStride () && A.isConstantStride ()) {
2993  KokkosBlas::reciprocal (this_view_dev, A_view_dev);
2994  }
2995  else {
// At least one side is a noncontiguous column view; process the
// selected columns one at a time.
2996  using Kokkos::ALL;
2997  using Kokkos::subview;
2998  typedef Kokkos::View<impl_scalar_type*, device_type> view_type;
2999 
3000  for (size_t k = 0; k < numVecs; ++k) {
3001  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3002  view_type vector_k = subview (this_view_dev, ALL (), this_col);
3003  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3004  view_type vector_Ak = subview (A_view_dev, ALL (), A_col);
3005  KokkosBlas::reciprocal (vector_k, vector_Ak);
3006  }
3007  }
3008  }
// Re-throw any Kokkos error with this method's context attached.
3009  catch (std::runtime_error &e) {
3010  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
3011  ": Caught exception from Kokkos: " << e.what () << std::endl);
3012  }
3013  }
3014 
3015 
// Set this MultiVector to the elementwise absolute value (magnitude)
// of A: (*this)(i,j) := |A(i,j)|.  A must have the same local length
// and number of vectors as *this (std::runtime_error otherwise).
// Both *this and A are synced to device and the work runs there via
// KokkosBlas::abs.
//
// NOTE(review): the signature lines (original lines 3018-3019,
// presumably "MultiVector<...>::abs (const MultiVector<...>& A)")
// and original line 3021 (presumably the "typedef ... MV;" that the
// const_cast below relies on) are missing from this extraction of
// the file.
3016  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3017  void
3020  {
3022  const char tfecfFuncName[] = "abs";
3023  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3024  getLocalLength () != A.getLocalLength (), std::runtime_error,
3025  ": MultiVectors do not have the same local length. "
3026  "this->getLocalLength() = " << getLocalLength ()
3027  << " != A.getLocalLength() = " << A.getLocalLength () << ".");
3028  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3029  A.getNumVectors () != this->getNumVectors (), std::runtime_error,
3030  ": MultiVectors do not have the same number of columns (vectors). "
3031  "this->getNumVectors() = " << getNumVectors ()
3032  << " != A.getNumVectors() = " << A.getNumVectors () << ".");
3033 
3034  // FIXME (mfh 07 Jan 2015) See note on two-argument scale() above.
3035 
3036  const size_t numVecs = getNumVectors ();
3037 
3038  this->template sync<device_type> ();
3039  this->template modify<device_type> ();
3040  // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
3041  // responsibility to sync A.
3042  //
3043  // FIXME (mfh 18 May 2016) It's rude to sync the input
3044  // argument, since it is marked const.
3045  const_cast<MV&> (A).template sync<device_type> ();
3046 
3047  auto this_view_dev = this->template getLocalView<device_type> ();
3048  auto A_view_dev = A.template getLocalView<device_type> ();
3049 
3050  if (isConstantStride () && A.isConstantStride ()) {
3051  KokkosBlas::abs (this_view_dev, A_view_dev);
3052  }
3053  else {
// At least one side is a noncontiguous column view; process the
// selected columns one at a time.
3054  using Kokkos::ALL;
3055  using Kokkos::subview;
3056  typedef Kokkos::View<impl_scalar_type*, device_type> view_type;
3057 
3058  for (size_t k=0; k < numVecs; ++k) {
3059  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3060  view_type vector_k = subview (this_view_dev, ALL (), this_col);
3061  const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
3062  view_type vector_Ak = subview (A_view_dev, ALL (), A_col);
3063  KokkosBlas::abs (vector_k, vector_Ak);
3064  }
3065  }
3066  }
3067 
3068 
// Three-argument update: *this = beta*(*this) + alpha*A, entrywise.
// Work runs on host or device, whichever holds this MultiVector's most
// recently modified data, so the input A (a "guest") need not be sync'd.
// Throws std::invalid_argument if A's local row count or column count
// differs from *this.
// NOTE(review): this rendering omits the qualified method name and the
// "const MultiVector& A" parameter line (original lines 3071/3073).
3069  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3070  void
3072  update (const Scalar& alpha,
3074  const Scalar& beta)
3075  {
3076  using Kokkos::ALL;
3077  using Kokkos::subview;
3078  const char tfecfFuncName[] = "update: ";
3079 
3080  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta)");
3081 
3082  const size_t lclNumRows = getLocalLength ();
3083  const size_t numVecs = getNumVectors ();
3084 
// Validate dimensions before touching any data.
3085  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3086  lclNumRows != A.getLocalLength (), std::invalid_argument,
3087  "this->getLocalLength() = " << lclNumRows << " != A.getLocalLength() = "
3088  << A.getLocalLength () << ".");
3089  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3090  numVecs != A.getNumVectors (), std::invalid_argument,
3091  "this->getNumVectors() = " << numVecs << " != A.getNumVectors() = "
3092  << A.getNumVectors () << ".");
3093 
// Convert user Scalar coefficients to the internal impl_scalar_type.
3094  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3095  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3096  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
// NOTE(review): colRng appears to be unused in this method.
3097  const std::pair<size_t, size_t> colRng (0, numVecs);
3098 
3099  typedef typename dual_view_type::t_dev dev_view_type;
3100  typedef typename dual_view_type::t_host host_view_type;
3101 
3102  // If we need sync to device, then host has the most recent version.
3103  const bool useHostVersion = this->template need_sync<device_type> ();
3104  if (useHostVersion) {
3105  // Work on host, where A's data were most recently modified. A
3106  // is a "guest" of this method, so it's more polite to sync
3107  // *this, than to sync A.
3108  typedef typename host_view_type::memory_space cur_memory_space;
3109 
3110  this->template sync<cur_memory_space> ();
3111  this->template modify<cur_memory_space> ();
3112  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
3113  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3114  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
3115  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3116 
// Constant-stride case: one axpby call over the whole 2-D view.
3117  if (isConstantStride () && A.isConstantStride ()) {
3118  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3119  }
3120  else {
3121  // Make sure that Kokkos only uses the local length for add.
3122  for (size_t k = 0; k < numVecs; ++k) {
3123  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3124  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3125  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3126  auto X_k = subview (X_lcl, ALL (), X_col);
3127 
3128  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3129  }
3130  }
3131  }
3132  else { // work on device
// Device branch mirrors the host branch above; kept separate only
// because the memory space (and getLocalView's return type) differs.
3133  // A is a "guest" of this method, so it's more polite to sync
3134  // *this, than to sync A.
3135  typedef typename dev_view_type::memory_space cur_memory_space;
3136  this->template sync<cur_memory_space> ();
3137  this->template modify<cur_memory_space> ();
3138  auto Y_lcl_orig = this->template getLocalView<cur_memory_space> ();
3139  auto Y_lcl = subview (Y_lcl_orig, rowRng, Kokkos::ALL ());
3140  auto X_lcl_orig = A.template getLocalView<cur_memory_space> ();
3141  auto X_lcl = subview (X_lcl_orig, rowRng, Kokkos::ALL ());
3142 
3143  if (isConstantStride () && A.isConstantStride ()) {
3144  KokkosBlas::axpby (theAlpha, X_lcl, theBeta, Y_lcl);
3145  }
3146  else {
3147  // Make sure that Kokkos only uses the local length for add.
3148  for (size_t k = 0; k < numVecs; ++k) {
3149  const size_t Y_col = this->isConstantStride () ? k : this->whichVectors_[k];
3150  const size_t X_col = A.isConstantStride () ? k : A.whichVectors_[k];
3151  auto Y_k = subview (Y_lcl, ALL (), Y_col);
3152  auto X_k = subview (X_lcl, ALL (), X_col);
3153 
3154  KokkosBlas::axpby (theAlpha, X_k, theBeta, Y_k);
3155  }
3156  }
3157  }
3158  }
3159 
// Five-argument update: *this = gamma*(*this) + alpha*A + beta*B, entrywise.
// Unlike the three-argument overload, this always works on device, and
// therefore (const_cast) syncs the guest inputs A and B to device.
// Throws std::invalid_argument on any row- or column-count mismatch.
// NOTE(review): this rendering omits the qualified method name and the
// "const MultiVector& A" / "const MultiVector& B" parameter lines
// (original lines 3162/3164/3166).
3160  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3161  void
3163  update (const Scalar& alpha,
3165  const Scalar& beta,
3167  const Scalar& gamma)
3168  {
3169  using Kokkos::ALL;
3170  using Kokkos::subview;
3172  const char tfecfFuncName[] = "update(alpha,A,beta,B,gamma): ";
3173 
3174  ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::update(alpha,A,beta,B,gamma)");
3175 
3176  const size_t lclNumRows = this->getLocalLength ();
3177  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3178  lclNumRows != A.getLocalLength (), std::invalid_argument,
3179  "The input MultiVector A has " << A.getLocalLength () << " local "
3180  "row(s), but this MultiVector has " << lclNumRows << " local "
3181  "row(s).");
3182  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3183  lclNumRows != B.getLocalLength (), std::invalid_argument,
3184  "The input MultiVector B has " << B.getLocalLength () << " local "
3185  "row(s), but this MultiVector has " << lclNumRows << " local "
3186  "row(s).");
3187  const size_t numVecs = getNumVectors ();
3188  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3189  A.getNumVectors () != numVecs, std::invalid_argument,
3190  "The input MultiVector A has " << A.getNumVectors () << " column(s), "
3191  "but this MultiVector has " << numVecs << " column(s).");
3192  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3193  B.getNumVectors () != numVecs, std::invalid_argument,
3194  "The input MultiVector B has " << B.getNumVectors () << " column(s), "
3195  "but this MultiVector has " << numVecs << " column(s).");
3196 
3197  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3198  const impl_scalar_type theBeta = static_cast<impl_scalar_type> (beta);
3199  const impl_scalar_type theGamma = static_cast<impl_scalar_type> (gamma);
3200 
3201  // We're lucky if *this, A, and B are all sync'd to the same
3202  // memory space. If not, we have to sync _something_. Unlike
3203  // three-argument update() or (say) dot(), we may have to sync one
3204  // of the inputs. For now, we just sync _everything_ to device.
3205  this->template sync<device_type> ();
3206  const_cast<MV&> (A).template sync<device_type> ();
3207  const_cast<MV&> (B).template sync<device_type> ();
3208 
3209  // This method modifies *this.
3210  this->template modify<device_type> ();
3211 
3212  const std::pair<size_t, size_t> rowRng (0, lclNumRows);
// NOTE(review): colRng appears to be unused in this method.
3213  const std::pair<size_t, size_t> colRng (0, numVecs);
3214 
3215  // Prefer 'auto' over specifying the type explicitly. This avoids
3216  // issues with a subview possibly having a different type than the
3217  // original view.
3218  auto C_lcl = subview (this->template getLocalView<device_type> (), rowRng, ALL ());
3219  auto A_lcl = subview (A.template getLocalView<device_type> (), rowRng, ALL ());
3220  auto B_lcl = subview (B.template getLocalView<device_type> (), rowRng, ALL ());
3221 
// Fast path: all three operands have constant stride, so one fused
// KokkosBlas::update handles the whole 2-D view.
3222  if (isConstantStride () && A.isConstantStride () && B.isConstantStride ()) {
3223  KokkosBlas::update (theAlpha, A_lcl, theBeta, B_lcl, theGamma, C_lcl);
3224  }
3225  else {
3226  // Some input (or *this) is not constant stride,
3227  // so perform the update one column at a time.
3228  for (size_t k = 0; k < numVecs; ++k) {
3229  const size_t this_col = isConstantStride () ? k : whichVectors_[k];
3230  const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
3231  const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
3232  KokkosBlas::update (theAlpha, subview (A_lcl, rowRng, A_col),
3233  theBeta, subview (B_lcl, rowRng, B_col),
3234  theGamma, subview (C_lcl, rowRng, this_col));
3235  }
3236  }
3237  }
3238 
// getData(j): return a persisting, read-only host view of column j as a
// Teuchos::ArrayRCP<const Scalar>.  Forces a device->host sync (legacy
// semantics of the classic Kokkos Node's viewBuffer).
// NOTE(review): this rendering omits the qualified method name and the
// "typedef ... MV" line (original lines 3241/3246).
3239  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3240  Teuchos::ArrayRCP<const Scalar>
3242  getData (size_t j) const
3243  {
3244  using Kokkos::ALL;
3245  using Kokkos::subview;
3247 
3248  // Any MultiVector method that called the (classic) Kokkos Node's
3249  // viewBuffer or viewBufferNonConst methods always implied a
3250  // device->host synchronization. Thus, we synchronize here as
3251  // well.
3252  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3253 
3254  // Get a host view of the entire MultiVector's data.
3255  auto hostView = this->template getLocalView<Kokkos::HostSpace> ();
3256 
3257  // Get a subview of column j.
// Translate the caller-visible column index j into the actual stored
// column, for non-constant-stride (column-selected) MultiVectors.
3258  const size_t col = isConstantStride () ? j : whichVectors_[j];
3259  auto hostView_j = subview (hostView, ALL (), col);
3260 
3261  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
3262  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3263  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3264 
3265 #ifdef HAVE_TPETRA_DEBUG
3266  TEUCHOS_TEST_FOR_EXCEPTION
3267  (static_cast<size_t> (hostView_j.dimension_0 ()) <
3268  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3269  "Tpetra::MultiVector::getData: hostView_j.dimension_0() = "
3270  << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
3271  << dataAsArcp.size () << ". "
3272  "Please report this bug to the Tpetra developers.");
3273 #endif // HAVE_TPETRA_DEBUG
3274 
// Reinterpret impl_scalar_type as the user-facing Scalar type.
3275  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3276  }
3277 
// getDataNonConst(j): like getData(j), but returns a writable
// ArrayRCP<Scalar> view of column j, and additionally marks the host
// copy as modified since the caller is expected to write through it.
// NOTE(review): this rendering omits the qualified method name and the
// "getDataNonConst (size_t j)" signature line (original lines 3280/3281).
3278  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3279  Teuchos::ArrayRCP<Scalar>
3282  {
3283  using Kokkos::ALL;
3284  using Kokkos::subview;
3286 
3287  // Any MultiVector method that called the (classic) Kokkos Node's
3288  // viewBuffer or viewBufferNonConst methods always implied a
3289  // device->host synchronization. Thus, we synchronize here as
3290  // well.
3291  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3292 
3293  // Calling getDataNonConst() implies that the user plans to modify
3294  // the values in the MultiVector, so we mark the host data as modified.
3295  this->template modify<Kokkos::HostSpace> ();
3296 
3297  // Get a host view of the entire MultiVector's data.
3298  auto hostView = this->template getLocalView<Kokkos::HostSpace> ();
3299 
3300  // Get a subview of column j.
// Map the caller-visible column index to the stored column index.
3301  const size_t col = isConstantStride () ? j : whichVectors_[j];
3302  auto hostView_j = subview (hostView, ALL (), col);
3303 
3304  // Wrap up the subview of column j in an ArrayRCP<const impl_scalar_type>.
3305  Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
3306  Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
3307 
3308 #ifdef HAVE_TPETRA_DEBUG
3309  TEUCHOS_TEST_FOR_EXCEPTION
3310  (static_cast<size_t> (hostView_j.dimension_0 ()) <
3311  static_cast<size_t> (dataAsArcp.size ()), std::logic_error,
3312  "Tpetra::MultiVector::getDataNonConst: hostView_j.dimension_0() = "
3313  << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
3314  << dataAsArcp.size () << ". "
3315  "Please report this bug to the Tpetra developers.");
3316 #endif // HAVE_TPETRA_DEBUG
3317 
3318  return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
3319  }
3320 
// Assignment operator with view (shallow-copy) semantics: after this,
// *this and source alias the same data.  Self-assignment is a no-op.
// NOTE(review): this rendering omits the qualified operator= signature
// lines (original lines 3322-3324).
3321  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3325  {
3326  if (this != &source) {
// Copy the base (DistObject) part first: Map, etc.
3327  base_type::operator= (source);
3328  //
3329  // operator= implements view semantics (shallow copy).
3330  //
3331 
3332  // Kokkos::View operator= also implements view semantics.
3333  view_ = source.view_;
3334  origView_ = source.origView_;
3335 
3336  // NOTE (mfh 24 Mar 2014) Christian wrote here that assigning
3337  // whichVectors_ is "probably not ok" (probably constitutes deep
3338  // copy). I would say that it's OK, because whichVectors_ is
3339  // immutable (from the user's perspective); it's analogous to
3340  // the dimensions or stride. Once we make whichVectors_ a
3341  // Kokkos::View instead of a Teuchos::Array, all debate will go
3342  // away and we will unquestionably have view semantics.
3343  whichVectors_ = source.whichVectors_;
3344  }
3345  return *this;
3346  }
3347 
3348 
// subCopy(cols): deep-copy the columns listed in cols into a new
// MultiVector.  If the indices are contiguous, defers to the more
// efficient Range1D overload; otherwise views the columns and assigns.
3349  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3350  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3352  subCopy (const Teuchos::ArrayView<const size_t>& cols) const
3353  {
3354  using Teuchos::RCP;
3356 
3357  // Check whether the index set in cols is contiguous. If it is,
3358  // use the more efficient Range1D version of subCopy.
3359  bool contiguous = true;
3360  const size_t numCopyVecs = static_cast<size_t> (cols.size ());
3361  for (size_t j = 1; j < numCopyVecs; ++j) {
3362  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3363  contiguous = false;
3364  break;
3365  }
3366  }
3367  if (contiguous && numCopyVecs > 0) {
3368  return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
3369  }
3370  else {
// NOTE(review): if cols is empty, control reaches here and subView
// throws (it requires at least one column) -- confirm intended.
3371  RCP<const MV> X_sub = this->subView (cols);
// Allocate the result without zero-initializing (false), since assign
// overwrites every entry.
3372  RCP<MV> Y (new MV (this->getMap (), numCopyVecs, false));
3373  Y->assign (*X_sub);
3374  return Y;
3375  }
3376  }
3377 
// subCopy(colRng): deep-copy the contiguous column range colRng into a
// new MultiVector (view the range, then assign into fresh storage).
3378  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3379  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3381  subCopy (const Teuchos::Range1D &colRng) const
3382  {
3383  using Teuchos::RCP;
3385 
3386  RCP<const MV> X_sub = this->subView (colRng);
// false: skip zero-initialization; assign overwrites every entry.
3387  RCP<MV> Y (new MV (this->getMap (), static_cast<size_t> (colRng.size ()), false));
3388  Y->assign (*X_sub);
3389  return Y;
3390  }
3391 
// getOrigNumLocalRows: local row count of the *original* view
// (origView_), i.e. before any offset view was taken.
// NOTE(review): this rendering omits the qualified method name and the
// opening brace (original lines 3394/3395).
3392  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3393  size_t
3396  return origView_.dimension_0 ();
3397  }
3398 
// getOrigNumLocalCols: column count of the *original* view (origView_),
// i.e. before any column-subset view was taken.
// NOTE(review): this rendering omits the qualified method name and the
// opening brace (original lines 3401/3402).
3399  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3400  size_t
3403  return origView_.dimension_1 ();
3404  }
3405 
// "Offset view" constructor: build a MultiVector that views rows
// [offset, offset + subMap.getNodeNumElements()) of the input X, with
// subMap as its Map.  The result shares X's data (shallow view).
// NOTE(review): this rendering omits the qualified constructor name and
// the "const MultiVector& X" parameter line (original lines 3407/3408).
3406  template <class Scalar, class LO, class GO, class Node, const bool classic>
3409  const map_type& subMap,
3410  const size_t offset) :
3411  base_type (Teuchos::null) // to be replaced below
3412  {
3413  using Kokkos::ALL;
3414  using Kokkos::subview;
3415  using Teuchos::RCP;
3416  using Teuchos::rcp;
3418  const char prefix[] = "Tpetra::MultiVector constructor (offsetView): ";
3419 
3420  const size_t newNumRows = subMap.getNodeNumElements ();
// NOTE(review): tooManyElts is computed from getOrigNumLocalRows(), but
// the exception below tests getLocalLength(); if those differ (X is
// itself an offset view), a request could trip one check but not the
// other -- confirm this asymmetry is intentional.
3421  const bool tooManyElts = newNumRows + offset > X.getOrigNumLocalRows ();
3422  if (tooManyElts) {
3423  const int myRank = X.getMap ()->getComm ()->getRank ();
3424  TEUCHOS_TEST_FOR_EXCEPTION(
3425  newNumRows + offset > X.getLocalLength (), std::runtime_error,
3426  prefix << "Invalid input Map. The input Map owns " << newNumRows <<
3427  " entries on process " << myRank << ". offset = " << offset << ". "
3428  "Yet, the MultiVector contains only " << X.getOrigNumLocalRows () <<
3429  " rows on this process.");
3430  }
3431 
// In debug builds, record X's dimensions/stride/host pointer so we can
// verify below that taking the offset view did not mutate X.
3432 #ifdef HAVE_TPETRA_DEBUG
3433  const size_t strideBefore =
3434  X.isConstantStride () ? X.getStride () : static_cast<size_t> (0);
3435  const size_t lclNumRowsBefore = X.getLocalLength ();
3436  const size_t numColsBefore = X.getNumVectors ();
3437  const impl_scalar_type* hostPtrBefore =
3438  X.template getLocalView<Kokkos::HostSpace> ().ptr_on_device ();
3439 #endif // HAVE_TPETRA_DEBUG
3440 
3441  const std::pair<size_t, size_t> origRowRng (offset, X.origView_.dimension_0 ());
3442  const std::pair<size_t, size_t> rowRng (offset, offset + newNumRows);
3443 
3444  dual_view_type newOrigView = subview (X.origView_, origRowRng, ALL ());
3445  // FIXME (mfh 29 Sep 2016) If we just use X.view_ here, it breaks
3446  // CrsMatrix's Gauss-Seidel implementation (which assumes the
3447  // ability to create domain Map views of column Map MultiVectors,
3448  // and then get the original column Map MultiVector out again).
3449  // If we just use X.origView_ here, it breaks the fix for #46.
3450  // The test for offset == 0 is a hack that makes both tests pass,
3451  // but doesn't actually fix the more general issue. In
3452  // particular, the right way to fix Gauss-Seidel would be to fix
3453  // #385; that would make "getting the original column Map
3454  // MultiVector out again" unnecessary.
3455  dual_view_type newView = subview (offset == 0 ? X.origView_ : X.view_, rowRng, ALL ());
3456 
3457  // NOTE (mfh 06 Jan 2015) Work-around to deal with Kokkos not
3458  // handling subviews of degenerate Views quite so well. For some
3459  // reason, the ([0,0], [0,2]) subview of a 0 x 2 DualView is 0 x
3460  // 0. We work around by creating a new empty DualView of the
3461  // desired (degenerate) dimensions.
3462  if (newOrigView.dimension_0 () == 0 &&
3463  newOrigView.dimension_1 () != X.origView_.dimension_1 ()) {
3464  newOrigView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
3465  X.getNumVectors ());
3466  }
3467  if (newView.dimension_0 () == 0 &&
3468  newView.dimension_1 () != X.view_.dimension_1 ()) {
3469  newView = allocDualView<Scalar, LO, GO, Node> (size_t (0),
3470  X.getNumVectors ());
3471  }
3472 
// Build the view MultiVector, propagating X's column selection
// (whichVectors_) when X is not constant stride.
3473  MV subViewMV = X.isConstantStride () ?
3474  MV (Teuchos::rcp (new map_type (subMap)), newView, newOrigView) :
3475  MV (Teuchos::rcp (new map_type (subMap)), newView, newOrigView, X.whichVectors_ ());
3476 
// Debug-only sanity checks: X unchanged, and the returned view has the
// expected row count, stride, and column count.
3477 #ifdef HAVE_TPETRA_DEBUG
3478  const size_t strideAfter = X.isConstantStride () ?
3479  X.getStride () :
3480  static_cast<size_t> (0);
3481  const size_t lclNumRowsAfter = X.getLocalLength ();
3482  const size_t numColsAfter = X.getNumVectors ();
3483  const impl_scalar_type* hostPtrAfter =
3484  X.template getLocalView<Kokkos::HostSpace> ().ptr_on_device ();
3485 
3486  const size_t strideRet = subViewMV.isConstantStride () ?
3487  subViewMV.getStride () :
3488  static_cast<size_t> (0);
3489  const size_t lclNumRowsRet = subViewMV.getLocalLength ();
3490  const size_t numColsRet = subViewMV.getNumVectors ();
3491 
3492  const char suffix[] = ". This should never happen. Please report this "
3493  "bug to the Tpetra developers.";
3494 
3495  TEUCHOS_TEST_FOR_EXCEPTION(
3496  lclNumRowsRet != subMap.getNodeNumElements (),
3497  std::logic_error, prefix << "Returned MultiVector has a number of rows "
3498  "different than the number of local indices in the input Map. "
3499  "lclNumRowsRet: " << lclNumRowsRet << ", subMap.getNodeNumElements(): "
3500  << subMap.getNodeNumElements () << suffix);
3501  TEUCHOS_TEST_FOR_EXCEPTION(
3502  strideBefore != strideAfter || lclNumRowsBefore != lclNumRowsAfter ||
3503  numColsBefore != numColsAfter || hostPtrBefore != hostPtrAfter,
3504  std::logic_error, prefix << "Original MultiVector changed dimensions, "
3505  "stride, or host pointer after taking offset view. strideBefore: " <<
3506  strideBefore << ", strideAfter: " << strideAfter << ", lclNumRowsBefore: "
3507  << lclNumRowsBefore << ", lclNumRowsAfter: " << lclNumRowsAfter <<
3508  ", numColsBefore: " << numColsBefore << ", numColsAfter: " <<
3509  numColsAfter << ", hostPtrBefore: " << hostPtrBefore << ", hostPtrAfter: "
3510  << hostPtrAfter << suffix);
3511  TEUCHOS_TEST_FOR_EXCEPTION(
3512  strideBefore != strideRet, std::logic_error, prefix << "Returned "
3513  "MultiVector has different stride than original MultiVector. "
3514  "strideBefore: " << strideBefore << ", strideRet: " << strideRet <<
3515  ", numColsBefore: " << numColsBefore << ", numColsRet: " << numColsRet
3516  << suffix);
3517  TEUCHOS_TEST_FOR_EXCEPTION(
3518  numColsBefore != numColsRet, std::logic_error,
3519  prefix << "Returned MultiVector has a different number of columns than "
3520  "original MultiVector. numColsBefore: " << numColsBefore << ", "
3521  "numColsRet: " << numColsRet << suffix);
3522 #endif // HAVE_TPETRA_DEBUG
3523 
3524  *this = subViewMV; // shallow copy
3525  }
3526 
// offsetView: const wrapper around the offset-view constructor; returns
// a read-only MultiVector viewing rows [offset, offset + |subMap|).
// NOTE(review): this rendering omits the qualified method name and the
// "typedef ... MV" line (original lines 3529/3533).
3527  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3528  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3530  offsetView (const Teuchos::RCP<const map_type>& subMap,
3531  const size_t offset) const
3532  {
3534  return Teuchos::rcp (new MV (*this, *subMap, offset));
3535  }
3536 
// offsetViewNonConst: non-const counterpart of offsetView; the returned
// MultiVector shares (and may modify) this object's data.
// NOTE(review): this rendering omits the qualified method name and the
// "typedef ... MV" line (original lines 3539/3543).
3537  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3538  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3540  offsetViewNonConst (const Teuchos::RCP<const map_type>& subMap,
3541  const size_t offset)
3542  {
3544  return Teuchos::rcp (new MV (*this, *subMap, offset));
3545  }
3546 
// subView(cols): return a shallow (view-semantics) MultiVector over the
// listed columns.  Requires at least one column; contiguous index sets
// are routed to the more efficient Range1D overload.
3547  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3548  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3550  subView (const Teuchos::ArrayView<const size_t>& cols) const
3551  {
3552  using Teuchos::Array;
3553  using Teuchos::rcp;
3555 
3556  const size_t numViewCols = static_cast<size_t> (cols.size ());
3557  TEUCHOS_TEST_FOR_EXCEPTION(
3558  numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
3559  "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
3560  "contain at least one entry, but cols.size() = " << cols.size ()
3561  << " == 0.");
3562 
3563  // Check whether the index set in cols is contiguous. If it is,
3564  // use the more efficient Range1D version of subView.
3565  bool contiguous = true;
3566  for (size_t j = 1; j < numViewCols; ++j) {
3567  if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
3568  contiguous = false;
3569  break;
3570  }
3571  }
3572  if (contiguous) {
// NOTE(review): this branch is unreachable -- the numViewCols < 1
// check above already threw for an empty cols array.
3573  if (numViewCols == 0) {
3574  // The output MV has no columns, so there is nothing to view.
3575  return rcp (new MV (this->getMap (), numViewCols));
3576  } else {
3577  // Use the more efficient contiguous-index-range version.
3578  return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
3579  }
3580  }
3581 
// Non-contiguous case: build a column-selected (non-constant-stride)
// view.  If *this already selects columns, compose the selections.
3582  if (isConstantStride ()) {
3583  return rcp (new MV (this->getMap (), view_, origView_, cols));
3584  }
3585  else {
3586  Array<size_t> newcols (cols.size ());
3587  for (size_t j = 0; j < numViewCols; ++j) {
3588  newcols[j] = whichVectors_[cols[j]];
3589  }
3590  return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
3591  }
3592  }
3593 
3594 
// subView(colRng): return a shallow (view-semantics) MultiVector over
// the contiguous column range colRng.  Handles the empty range, the
// constant-stride case, and two non-constant-stride cases (single
// column vs. multiple columns).
3595  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3596  Teuchos::RCP<const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3598  subView (const Teuchos::Range1D& colRng) const
3599  {
3600  using Kokkos::ALL;
3601  using Kokkos::subview;
3602  using Teuchos::Array;
3603  using Teuchos::RCP;
3604  using Teuchos::rcp;
3606  const char tfecfFuncName[] = "subView(Range1D): ";
3607 
3608  const size_t lclNumRows = this->getLocalLength ();
3609  const size_t numVecs = this->getNumVectors ();
3610  // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3611  // colRng.size() == 0, std::runtime_error, prefix << "Range must include "
3612  // "at least one vector.");
3613  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3614  static_cast<size_t> (colRng.size ()) > numVecs, std::runtime_error,
3615  "colRng.size() = " << colRng.size () << " > this->getNumVectors() = "
3616  << numVecs << ".");
3617  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3618  numVecs != 0 && colRng.size () != 0 &&
3619  (colRng.lbound () < static_cast<Teuchos::Ordinal> (0) ||
3620  static_cast<size_t> (colRng.ubound ()) >= numVecs),
3621  std::invalid_argument, "Nonempty input range [" << colRng.lbound () <<
3622  "," << colRng.ubound () << "] exceeds the valid range of column indices "
3623  "[0, " << numVecs << "].");
3624 
3625  RCP<const MV> X_ret; // the MultiVector subview to return
3626 
3627  // FIXME (mfh 14 Apr 2015) Apparently subview on DualView is still
3628  // broken for the case of views with zero rows. I will brutally
3629  // enforce that the subview has the correct dimensions. In
3630  // particular, in the case of zero rows, I will, if necessary,
3631  // create a new dual_view_type with zero rows and the correct
3632  // number of columns. In a debug build, I will use an all-reduce
3633  // to ensure that it has the correct dimensions on all processes.
3634 
3635  const std::pair<size_t, size_t> rows (0, lclNumRows);
3636  if (colRng.size () == 0) {
// Empty column range: return a view with zero columns.
3637  const std::pair<size_t, size_t> cols (0, 0); // empty range
3638  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3639  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3640  }
3641  else {
3642  // Returned MultiVector is constant stride only if *this is.
3643  if (isConstantStride ()) {
3644  const std::pair<size_t, size_t> cols (colRng.lbound (),
3645  colRng.ubound () + 1);
3646  dual_view_type X_sub = takeSubview (this->view_, ALL (), cols);
3647  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3648  }
3649  else {
3650  if (static_cast<size_t> (colRng.size ()) == static_cast<size_t> (1)) {
3651  // We're only asking for one column, so the result does have
3652  // constant stride, even though this MultiVector does not.
// NOTE(review): the stored column is computed as whichVectors_[0] +
// colRng.lbound(), which equals the presumably-intended
// whichVectors_[colRng.lbound()] only if whichVectors_ holds
// consecutive indices -- confirm with the Tpetra developers.
3653  const std::pair<size_t, size_t> col (whichVectors_[0] + colRng.lbound (),
3654  whichVectors_[0] + colRng.ubound () + 1);
3655  dual_view_type X_sub = takeSubview (view_, ALL (), col);
3656  X_ret = rcp (new MV (this->getMap (), X_sub, origView_));
3657  }
3658  else {
// Multiple columns of a non-constant-stride MV: slice whichVectors_.
3659  Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
3660  whichVectors_.begin () + colRng.ubound () + 1);
3661  X_ret = rcp (new MV (this->getMap (), view_, origView_, which));
3662  }
3663  }
3664  }
3665 
// Debug-only collective checks that every process produced a non-null
// view with the expected number of columns.
3666 #ifdef HAVE_TPETRA_DEBUG
3667  using Teuchos::Comm;
3668  using Teuchos::outArg;
3669  using Teuchos::REDUCE_MIN;
3670  using Teuchos::reduceAll;
3671 
3672  RCP<const Comm<int> > comm = this->getMap ().is_null () ? Teuchos::null :
3673  this->getMap ()->getComm ();
3674  if (! comm.is_null ()) {
3675  int lclSuccess = 1;
3676  int gblSuccess = 1;
3677 
3678  if (X_ret.is_null ()) {
3679  lclSuccess = 0;
3680  }
3681  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3682  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3683  lclSuccess != 1, std::logic_error, "X_ret (the subview of this "
3684  "MultiVector; the return value of this method) is null on some MPI "
3685  "process in this MultiVector's communicator. This should never "
3686  "happen. Please report this bug to the Tpetra developers.");
3687 
3688  if (! X_ret.is_null () &&
3689  X_ret->getNumVectors () != static_cast<size_t> (colRng.size ())) {
3690  lclSuccess = 0;
3691  }
3692  reduceAll<int, int> (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess));
3693  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3694  lclSuccess != 1, std::logic_error,
3695  "X_ret->getNumVectors() != colRng.size(), on at least one MPI process "
3696  "in this MultiVector's communicator. This should never happen. "
3697  "Please report this bug to the Tpetra developers.");
3698  }
3699 #endif // HAVE_TPETRA_DEBUG
3700 
3701  return X_ret;
3702  }
3703 
3704 
// subViewNonConst(cols): non-const variant of subView(cols); forwards
// to the const version and casts away constness of the result.
// NOTE(review): this rendering omits the qualified method name and the
// "typedef ... MV" line (original lines 3707/3710).
3705  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3706  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3708  subViewNonConst (const Teuchos::ArrayView<const size_t> &cols)
3709  {
3711  return Teuchos::rcp_const_cast<MV> (this->subView (cols));
3712  }
3713 
3714 
// subViewNonConst(colRng): non-const variant of subView(colRng);
// forwards to the const version and casts away constness of the result.
// NOTE(review): this rendering omits the qualified method name and the
// "typedef ... MV" line (original lines 3717/3720).
3715  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3716  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3718  subViewNonConst (const Teuchos::Range1D &colRng)
3719  {
3721  return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
3722  }
3723 
3724 
// getVector(j): return a read-only Vector that views column j of this
// MultiVector (shallow; shares storage).  Debug builds range-check j.
3725  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3726  Teuchos::RCP<const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3728  getVector (const size_t j) const
3729  {
3730  using Kokkos::ALL;
3731  using Kokkos::subview;
3732  using Teuchos::rcp;
3734 
3735 #ifdef HAVE_TPETRA_DEBUG
3736  const char tfecfFuncName[] = "getVector(NonConst): ";
3737  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3738  this->vectorIndexOutOfRange (j), std::runtime_error, "Input index j (== "
3739  << j << ") exceeds valid range [0, " << this->getNumVectors ()
3740  << " - 1].");
3741 #endif // HAVE_TPETRA_DEBUG
// Map the caller-visible column index to the stored column index.
3742  const size_t jj = this->isConstantStride () ?
3743  static_cast<size_t> (j) :
3744  static_cast<size_t> (this->whichVectors_[j]);
3745  const std::pair<size_t, size_t> rng (jj, jj+1);
3746  return rcp (new V (this->getMap (),
3747  takeSubview (this->view_, ALL (), rng),
3748  origView_));
3749  }
3750 
3751 
// getVectorNonConst(j): non-const variant of getVector(j); forwards to
// the const version and casts away constness of the result.
// NOTE(review): this rendering omits the qualified method name and the
// "typedef ... V" line (original lines 3754/3757).
3752  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3753  Teuchos::RCP<Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node, classic> >
3755  getVectorNonConst (const size_t j)
3756  {
3758  return Teuchos::rcp_const_cast<V> (this->getVector (j));
3759  }
3760 
3761 
// get1dCopy: copy all local entries into the caller-supplied array A in
// column-major order with leading dimension LDA, one column at a time.
// Reads from whichever memory space (host or device) holds the most
// recent data, so no sync is performed.  Throws std::runtime_error if
// LDA < local row count or A is too small.
3762  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3763  void
3765  get1dCopy (const Teuchos::ArrayView<Scalar>& A, const size_t LDA) const
3766  {
3767  typedef decltype (this->template getLocalView<device_type> ())
3768  dev_view_type;
3769  typedef decltype (this->template getLocalView<Kokkos::HostSpace> ())
3770  host_view_type;
3771  typedef impl_scalar_type IST;
// Unmanaged host view type used to wrap each destination column of A.
3772  typedef Kokkos::View<IST*, typename host_view_type::array_layout,
3773  Kokkos::HostSpace, Kokkos::MemoryUnmanaged> input_col_type;
3774  const char tfecfFuncName[] = "get1dCopy: ";
3775 
3776  const size_t numRows = this->getLocalLength ();
3777  const size_t numCols = this->getNumVectors ();
3778  const std::pair<size_t, size_t> rowRange (0, numRows);
3779 
3780  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3781  LDA < numRows, std::runtime_error,
3782  "LDA = " << LDA << " < numRows = " << numRows << ".");
3783  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3784  numRows > static_cast<size_t> (0) &&
3785  numCols > static_cast<size_t> (0) &&
3786  static_cast<size_t> (A.size ()) < LDA * (numCols - 1) + numRows,
3787  std::runtime_error,
3788  "A.size() = " << A.size () << ", but its size must be at least "
3789  << (LDA * (numCols - 1) + numRows) << " to hold all the entries.");
3790 
3791  // FIXME (mfh 22 Jul 2014, 10 Dec 2014) Currently, it doesn't work
3792  // to do a 2-D copy, even if this MultiVector has constant stride.
3793  // This is because Kokkos can't currently tell the difference
3794  // between padding (which permits a single deep_copy for the whole
3795  // 2-D View) and stride > numRows (which does NOT permit a single
3796  // deep_copy for the whole 2-D View). Carter is working on this,
3797  // but for now, the temporary fix is to copy one column at a time.
3798 
3799  // Use the most recently updated version of this MultiVector's
3800  // data. This avoids sync'ing, which could violate users'
3801  // expectations.
3802  //
3803  // If we need sync to device, then host has the most recent version.
3804  const bool useHostVersion = this->template need_sync<device_type> ();
3805 
3806  dev_view_type srcView_dev;
3807  host_view_type srcView_host;
3808  if (useHostVersion) {
3809  srcView_host = this->template getLocalView<Kokkos::HostSpace> ();
3810  }
3811  else {
3812  srcView_dev = this->template getLocalView<device_type> ();
3813  }
3814 
3815  for (size_t j = 0; j < numCols; ++j) {
3816  const size_t srcCol =
3817  this->isConstantStride () ? j : this->whichVectors_[j];
3818  const size_t dstCol = j;
// Wrap the j-th destination column of A (length numRows at stride LDA)
// in an unmanaged host view so deep_copy can target it.
3819  IST* const dstColRaw =
3820  reinterpret_cast<IST*> (A.getRawPtr () + LDA * dstCol);
3821  input_col_type dstColView (dstColRaw, numRows);
3822 
3823  if (useHostVersion) {
3824  auto srcColView_host = Kokkos::subview (srcView_host, rowRange, srcCol);
// NOTE(review): the message below swaps the variable names; the views
// actually compared are dstColView and srcColView_host.
3825  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3826  dstColView.dimension_0 () != srcColView_host.dimension_0 (),
3827  std::logic_error, ": srcColView and dstColView_host have different "
3828  "dimensions. Please report this bug to the Tpetra developers.");
3829  Kokkos::deep_copy (dstColView, srcColView_host);
3830  }
3831  else {
3832  auto srcColView_dev = Kokkos::subview (srcView_dev, rowRange, srcCol);
// NOTE(review): same name swap here (dstColView vs srcColView_dev).
3833  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3834  dstColView.dimension_0 () != srcColView_dev.dimension_0 (),
3835  std::logic_error, ": srcColView and dstColView_dev have different "
3836  "dimensions. Please report this bug to the Tpetra developers.");
3837  Kokkos::deep_copy (dstColView, srcColView_dev);
3838  }
3839  }
3840  }
3841 
3842 
// get2dCopy: copy each column of this MultiVector into the matching
// entry of ArrayOfPtrs (one array per column), validating all lengths
// before performing any copies (no side effects on bad input).
3843  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3844  void
3846  get2dCopy (const Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> >& ArrayOfPtrs) const
3847  {
3849  const char tfecfFuncName[] = "get2dCopy: ";
3850  const size_t numRows = this->getLocalLength ();
3851  const size_t numCols = this->getNumVectors ();
3852 
3853  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3854  static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
3855  std::runtime_error, "Input array of pointers must contain as many "
3856  "entries (arrays) as the MultiVector has columns. ArrayOfPtrs.size() = "
3857  << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
3858 
3859  if (numRows != 0 && numCols != 0) {
3860  // No side effects until we've validated the input.
3861  for (size_t j = 0; j < numCols; ++j) {
3862  const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
3863  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3864  dstLen < numRows, std::invalid_argument, "Array j = " << j << " of "
3865  "the input array of arrays is not long enough to fit all entries in "
3866  "that column of the MultiVector. ArrayOfPtrs[j].size() = " << dstLen
3867  << " < getLocalLength() = " << numRows << ".");
3868  }
3869 
3870  // We've validated the input, so it's safe to start copying.
// Delegate each column's copy to get1dCopy via a single-column view.
3871  for (size_t j = 0; j < numCols; ++j) {
3872  Teuchos::RCP<const V> X_j = this->getVector (j);
3873  const size_t LDA = static_cast<size_t> (ArrayOfPtrs[j].size ());
3874  X_j->get1dCopy (ArrayOfPtrs[j], LDA);
3875  }
3876  }
3877  }
3878 
3879  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
3880  Teuchos::ArrayRCP<const Scalar>
3882  get1dView () const
3883  {
3884  if (getLocalLength () == 0 || getNumVectors () == 0) {
3885  return Teuchos::null;
3886  } else {
3888 
3889  TEUCHOS_TEST_FOR_EXCEPTION(
3890  ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
3891  "get1dView: This MultiVector does not have constant stride, so it is "
3892  "not possible to view its data as a single array. You may check "
3893  "whether a MultiVector has constant stride by calling "
3894  "isConstantStride().");
3895  // mfh 18 May 2016: get?dView() and get?dViewNonConst() (replace
3896  // ? with 1 or 2) have always been device->host synchronization
3897  // points, since <= 2012. We retain this behavior for backwards
3898  // compatibility.
3899  //
3900  // Yes, "const" is a lie.
3901  const_cast<MV*> (this)->template sync<Kokkos::HostSpace> ();
3902  // Both get1dView() and get1dViewNonConst() return a host view
3903  // of the data.
3904  auto X_lcl = this->template getLocalView<Kokkos::HostSpace> ();
3905  Teuchos::ArrayRCP<const impl_scalar_type> dataAsArcp =
3906  Kokkos::Compat::persistingView (X_lcl);
3907  return Teuchos::arcp_reinterpret_cast<const Scalar> (dataAsArcp);
3908  }
3909  }
3910 
  /// \brief Nonconst variant of get1dView(): a persisting host view of
  ///   the data as one flat array.  Requires constant stride; returns
  ///   Teuchos::null for an empty MultiVector.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  Teuchos::ArrayRCP<Scalar>
  {
    if (getLocalLength () == 0 || getNumVectors () == 0) {
      return Teuchos::null;
    } else {
      TEUCHOS_TEST_FOR_EXCEPTION
        (! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
         "get1dViewNonConst: This MultiVector does not have constant stride, "
         "so it is not possible to view its data as a single array. You may "
         "check whether a MultiVector has constant stride by calling "
         "isConstantStride().");
      // get1dView() and get1dViewNonConst() have always been
      // device->host synchronization points, since <= 2012. We
      // retain this behavior for backwards compatibility.
      this->template sync<Kokkos::HostSpace> ();
      // Both get1dView() and get1dViewNonConst() return a host view
      // of the data.
      auto X_lcl = this->template getLocalView<Kokkos::HostSpace> ();
      // Wrap the host View in a reference-counted array, then cast
      // impl_scalar_type back to the user-facing Scalar type.
      Teuchos::ArrayRCP<impl_scalar_type> dataAsArcp =
        Kokkos::Compat::persistingView (X_lcl);
      return Teuchos::arcp_reinterpret_cast<Scalar> (dataAsArcp);
    }
  }
3937 
  /// \brief Return persisting host views of this MultiVector's data,
  ///   one ArrayRCP per column.
  ///
  /// Syncs to host and marks the host copy as modified, since callers
  /// of the NonConst variant are expected to change the data.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
  {
    // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
    // (replace ? with 1 or 2) have always been device->host
    // synchronization points, since <= 2012. We retain this behavior
    // for backwards compatibility.
    this->sync<Kokkos::HostSpace> ();
    // When users call the NonConst variants, it implies that they
    // want to change the data. Thus, it is appropriate to mark this
    // MultiVector as modified.
    this->modify<Kokkos::HostSpace> ();

    const size_t myNumRows = this->getLocalLength ();
    const size_t numCols = this->getNumVectors ();
    const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
    // Don't use the row range here on the outside, in order to avoid
    // a strided return type (in case Kokkos::subview is conservative
    // about that). Instead, use the row range for the column views
    // in the loop.
    auto X_lcl = this->getLocalView<Kokkos::HostSpace> ();

    Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
    for (size_t j = 0; j < numCols; ++j) {
      // Translate the logical column index j to the physical column,
      // which may differ when the MultiVector is a noncontiguous view.
      const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
      auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
      Teuchos::ArrayRCP<impl_scalar_type> X_lcl_j_arcp =
        Kokkos::Compat::persistingView (X_lcl_j);
      views[j] = Teuchos::arcp_reinterpret_cast<Scalar> (X_lcl_j_arcp);
    }
    return views;
  }
3972 
  /// \brief Return persisting const host views of this MultiVector's
  ///   data, one ArrayRCP per column.
  ///
  /// Like get2dViewNonConst(), this syncs to host, but it does not
  /// mark the host copy as modified.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
  get2dView () const
  {
    // NOTE (mfh 16 May 2016) get?dView() and get?dViewNonConst()
    // (replace ? with 1 or 2) have always been device->host
    // synchronization points, since <= 2012. We retain this behavior
    // for backwards compatibility.
    //
    // Since get2dView() is and was (unfortunately) always marked
    // const, I have to cast away const here in order not to break
    // backwards compatibility.
    const_cast<this_type*> (this)->sync<Kokkos::HostSpace> ();

    const size_t myNumRows = this->getLocalLength ();
    const size_t numCols = this->getNumVectors ();
    const Kokkos::pair<size_t, size_t> rowRange (0, myNumRows);
    // Don't use the row range here on the outside, in order to avoid
    // a strided return type (in case Kokkos::subview is conservative
    // about that). Instead, use the row range for the column views
    // in the loop.
    auto X_lcl = this->getLocalView<Kokkos::HostSpace> ();

    Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
    for (size_t j = 0; j < numCols; ++j) {
      // Translate the logical column index j to the physical column,
      // which may differ when the MultiVector is a noncontiguous view.
      const size_t col = this->isConstantStride () ? j : this->whichVectors_[j];
      auto X_lcl_j = Kokkos::subview (X_lcl, rowRange, col);
      Teuchos::ArrayRCP<const impl_scalar_type> X_lcl_j_arcp =
        Kokkos::Compat::persistingView (X_lcl_j);
      views[j] = Teuchos::arcp_reinterpret_cast<const Scalar> (X_lcl_j_arcp);
    }
    return views;
  }
4008 
  /// \brief Dense matrix-matrix multiply:
  ///   *this = beta*(*this) + alpha*op(A)*op(B),
  ///   where op(X) is X, X^T, or X^H as selected by transA / transB,
  ///   implemented via Tpetra::Details::Blas::gemm on the local views.
  ///
  /// Only three distribution patterns are supported (see the case
  /// analysis below); anything else throws std::runtime_error.
  /// Non-constant-stride operands (including *this) are deep-copied to
  /// constant-stride temporaries before the GEMM.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  void
  multiply (Teuchos::ETransp transA,
            Teuchos::ETransp transB,
            const Scalar& alpha,
            const Scalar& beta)
  {
    using Teuchos::CONJ_TRANS;
    using Teuchos::NO_TRANS;
    using Teuchos::TRANS;
    using Teuchos::RCP;
    using Teuchos::rcp;
    using Teuchos::rcpFromRef;
    typedef Kokkos::Details::ArithTraits<impl_scalar_type> ATS;
    typedef Teuchos::ScalarTraits<Scalar> STS;
    const char errPrefix[] = "Tpetra::MultiVector::multiply: ";

    ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::multiply");

    // This routine performs a variety of matrix-matrix multiply
    // operations, interpreting the MultiVector (this-aka C , A and B)
    // as 2D matrices. Variations are due to the fact that A, B and C
    // can be local replicated or global distributed MultiVectors and
    // that we may or may not operate with the transpose of A and B.
    // Possible cases are:
    //
    // Operations # Cases Notes
    // 1) C(local) = A^X(local) * B^X(local) 4 X=Trans or Not, no comm needed
    // 2) C(local) = A^T(distr) * B (distr) 1 2-D dot product, replicate C
    // 3) C(distr) = A (distr) * B^X(local) 2 2-D vector update, no comm needed
    //
    // The following operations are not meaningful for 1-D
    // distributions:
    //
    // u1) C(local) = A^T(distr) * B^T(distr) 1
    // u2) C(local) = A (distr) * B^X(distr) 2
    // u3) C(distr) = A^X(local) * B^X(local) 4
    // u4) C(distr) = A^X(local) * B^X(distr) 4
    // u5) C(distr) = A^T(distr) * B^X(local) 2
    // u6) C(local) = A^X(distr) * B^X(local) 4
    // u7) C(distr) = A^X(distr) * B^X(local) 4
    // u8) C(local) = A^X(local) * B^X(distr) 4
    //
    // Total number of cases: 32 (= 2^5).

    impl_scalar_type beta_local = beta; // local copy of beta; might be reassigned below

    // In a debug build, check compatibility of local dimensions. We
    // only do this in a debug build, since we have to do an
    // all-reduce to ensure correctness on all processses. It's
    // entirely possible that only some processes may have
    // incompatible local dimensions. Throwing an exception only on
    // those processes could cause this method to hang.
#ifdef HAVE_TPETRA_DEBUG
    if (! this->getMap ().is_null () && ! this->getMap ()->getComm ().is_null ()) {
      // Dimensions of op(A) and op(B): transposing swaps rows/columns.
      const size_t A_nrows = (transA != NO_TRANS) ? A.getNumVectors() : A.getLocalLength();
      const size_t A_ncols = (transA != NO_TRANS) ? A.getLocalLength() : A.getNumVectors();
      const size_t B_nrows = (transB != NO_TRANS) ? B.getNumVectors() : B.getLocalLength();
      const size_t B_ncols = (transB != NO_TRANS) ? B.getLocalLength() : B.getNumVectors();

      const bool lclBad = this->getLocalLength () != A_nrows ||
        this->getNumVectors () != B_ncols || A_ncols != B_nrows;

      const int lclGood = lclBad ? 0 : 1;
      int gblGood = 0;

      using Teuchos::REDUCE_MIN;
      using Teuchos::reduceAll;
      using Teuchos::outArg;

      // All-reduce so that every process throws (or not) consistently.
      auto comm = this->getMap ()->getComm (); // not null; see above
      reduceAll<int, int> (*comm, REDUCE_MIN, lclGood, outArg (gblGood));

      TEUCHOS_TEST_FOR_EXCEPTION
        (gblGood != 1, std::runtime_error, errPrefix << "Local dimensions of "
         "*this, op(A), and op(B) are not consistent on at least one process "
         "in this object's communicator.");
    }
#endif // HAVE_TPETRA_DEBUG

    const bool A_is_local = ! A.isDistributed ();
    const bool B_is_local = ! B.isDistributed ();
    const bool C_is_local = ! this->isDistributed ();
    // Case 1: C(local) = A^X(local) * B^X(local)
    const bool Case1 = C_is_local && A_is_local && B_is_local;
    // Case 2: C(local) = A^T(distr) * B (distr)
    const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
      transA != NO_TRANS &&
      transB == NO_TRANS;
    // Case 3: C(distr) = A (distr) * B^X(local)
    const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
      transA == NO_TRANS;

    // Test that we are considering a meaningful case
    TEUCHOS_TEST_FOR_EXCEPTION(
      ! Case1 && ! Case2 && ! Case3, std::runtime_error, errPrefix
      << "Multiplication of op(A) and op(B) into *this is not a "
      "supported use case.");

    if (beta != STS::zero () && Case2) {
      // If Case2, then C is local and contributions must be summed
      // across all processes. However, if beta != 0, then accumulate
      // beta*C into the sum. When summing across all processes, we
      // only want to accumulate this once, so set beta == 0 on all
      // processes except Process 0.
      const int myRank = this->getMap ()->getComm ()->getRank ();
      if (myRank != 0) {
        beta_local = ATS::zero ();
      }
    }

    // We only know how to do matrix-matrix multiplies if all the
    // MultiVectors have constant stride. If not, we have to make
    // temporary copies of those MultiVectors (including possibly
    // *this) that don't have constant stride.
    RCP<MV> C_tmp;
    if (! isConstantStride ()) {
      C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
    } else {
      // Non-owning RCP: operate on *this in place.
      C_tmp = rcp (this, false);
    }

    RCP<const MV> A_tmp;
    if (! A.isConstantStride ()) {
      A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
    } else {
      A_tmp = rcpFromRef (A);
    }

    RCP<const MV> B_tmp;
    if (! B.isConstantStride ()) {
      B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
    } else {
      B_tmp = rcpFromRef (B);
    }

    TEUCHOS_TEST_FOR_EXCEPTION(
      ! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
      ! A_tmp->isConstantStride (), std::logic_error, errPrefix
      << "Failed to make temporary constant-stride copies of MultiVectors.");

    {
      typedef LocalOrdinal LO;

      // Take device subviews restricted to the actual local dimensions
      // (the allocations may be larger), then call GEMM on them.
      const LO A_lclNumRows = A_tmp->getLocalLength ();
      const LO A_numVecs = A_tmp->getNumVectors ();
      auto A_lcl = A_tmp->template getLocalView<device_type> ();
      auto A_sub = Kokkos::subview (A_lcl,
                                    std::make_pair (LO (0), A_lclNumRows),
                                    std::make_pair (LO (0), A_numVecs));
      const LO B_lclNumRows = B_tmp->getLocalLength ();
      const LO B_numVecs = B_tmp->getNumVectors ();
      auto B_lcl = B_tmp->template getLocalView<device_type> ();
      auto B_sub = Kokkos::subview (B_lcl,
                                    std::make_pair (LO (0), B_lclNumRows),
                                    std::make_pair (LO (0), B_numVecs));
      const LO C_lclNumRows = C_tmp->getLocalLength ();
      const LO C_numVecs = C_tmp->getNumVectors ();
      auto C_lcl = C_tmp->template getLocalView<device_type> ();
      auto C_sub = Kokkos::subview (C_lcl,
                                    std::make_pair (LO (0), C_lclNumRows),
                                    std::make_pair (LO (0), C_numVecs));
      // Encode the transpose modes as BLAS-style characters.
      const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' :
                            (transA == Teuchos::TRANS ? 'T' : 'C'));
      const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' :
                            (transB == Teuchos::TRANS ? 'T' : 'C'));
      const impl_scalar_type alpha_IST (alpha);

      ::Tpetra::Details::ProfilingRegion regionGemm ("Tpetra::MV::multiply-call-gemm");
      ::Tpetra::Details::Blas::gemm (ctransA, ctransB, alpha_IST, A_sub,
                                     B_sub, beta_local, C_sub);
    }

    if (! isConstantStride ()) {
      deep_copy (*this, *C_tmp); // Copy the result back into *this.
    }

    // Dispose of (possibly) extra copies of A and B.
    A_tmp = Teuchos::null;
    B_tmp = Teuchos::null;

    // If Case 2 then sum up *this and distribute it to all processes.
    if (Case2) {
      this->reduce ();
    }
  }
4199 
  /// \brief Element-wise multiply:
  ///   this(i,j) = scalarThis*this(i,j) + scalarAB*A(i)*B(i,j),
  ///   where A is a single-column Vector (only its column 0 is used)
  ///   scaled against every column of B, via KokkosBlas::mult.
  ///
  /// Throws std::runtime_error if local lengths of *this, A, and B
  /// differ, or if *this and B have different numbers of columns.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  void
  elementWiseMultiply (Scalar scalarAB,
                       Scalar scalarThis)
  {
    using Kokkos::ALL;
    using Kokkos::subview;
    // Prefix used by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macros.
    const char tfecfFuncName[] = "elementWiseMultiply: ";

    const size_t lclNumRows = this->getLocalLength ();
    const size_t numVecs = this->getNumVectors ();

    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
      (lclNumRows != A.getLocalLength () || lclNumRows != B.getLocalLength (),
       std::runtime_error, "MultiVectors do not have the same local length.");
    TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
      numVecs != B.getNumVectors (), std::runtime_error, "this->getNumVectors"
      "() = " << numVecs << " != B.getNumVectors() = " << B.getNumVectors ()
      << ".");

    // FIXME (mfh 18 May 2016) It would be rude to sync A and B here,
    // because they are guests of this method. Instead, the polite
    // thing to do would be to copy them (if necessary) so we get
    // their most recently updated version. *this should always
    // control where execution happens.

    // Run on device: sync everything to device memory and mark *this
    // as modified there.  (const_cast because A and B are const
    // arguments but sync mutates their DualView state.)
    this->template sync<device_type> ();
    this->template modify<device_type> ();
    const_cast<V&> (A).template sync<device_type> ();
    const_cast<MV&> (B).template sync<device_type> ();
    auto this_view = this->template getLocalView<device_type> ();
    auto A_view = A.template getLocalView<device_type> ();
    auto B_view = B.template getLocalView<device_type> ();

    if (isConstantStride () && B.isConstantStride ()) {
      // A is just a Vector; it only has one column, so it always has
      // constant stride.
      //
      // If both *this and B have constant stride, we can do an
      // element-wise multiply on all columns at once.
      KokkosBlas::mult (scalarThis,
                        this_view,
                        scalarAB,
                        subview (A_view, ALL (), 0),
                        B_view);
    }
    else {
      // Otherwise, process one (physical) column at a time.
      for (size_t j = 0; j < numVecs; ++j) {
        const size_t C_col = isConstantStride () ? j : whichVectors_[j];
        const size_t B_col = B.isConstantStride () ? j : B.whichVectors_[j];
        KokkosBlas::mult (scalarThis,
                          subview (this_view, ALL (), C_col),
                          scalarAB,
                          subview (A_view, ALL (), 0),
                          subview (B_view, ALL (), B_col));
      }
    }
  }
4263 
  /// \brief Sum the entries of this locally replicated (NOT
  ///   distributed) MultiVector across all processes in its Map's
  ///   communicator, so that afterwards every process holds the same
  ///   summed values.
  ///
  /// Throws std::runtime_error if the MultiVector is distributed, or
  /// if the local row count does not fit in int (MPI count limit).
  /// Operates on whichever side (host or device) holds the most
  /// recently updated data, and leaves *this unsynchronized.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  void
  {
    using Teuchos::reduceAll;
    using Teuchos::REDUCE_SUM;
    // View types for the device resp. host side of the DualView.
    typedef decltype (this->template getLocalView<device_type> ())
      dev_view_type;
    typedef decltype (this->template getLocalView<Kokkos::HostSpace> ())
      host_view_type;

    ::Tpetra::Details::ProfilingRegion region ("Tpetra::MV::reduce");

    TEUCHOS_TEST_FOR_EXCEPTION
      (this->isDistributed (), std::runtime_error,
       "Tpetra::MultiVector::reduce should only be called with locally "
       "replicated or otherwise not distributed MultiVector objects.");
    const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
    if (comm.getSize () == 1) {
      return; // MultiVector is already "reduced" to one process
    }

    const size_t lclNumRows = getLocalLength ();
    const size_t numCols = getNumVectors ();
    const size_t totalAllocSize = lclNumRows * numCols;

    // FIXME (mfh 16 June 2014) This exception will cause deadlock if
    // it triggers on only some processes. We don't have a good way
    // to pack this result into the all-reduce below, but this would
    // be a good reason to set a "local error flag" and find other
    // opportunities to let it propagate.
    TEUCHOS_TEST_FOR_EXCEPTION(
      lclNumRows > static_cast<size_t> (std::numeric_limits<int>::max ()),
      std::runtime_error, "Tpetra::MultiVector::reduce: On Process " <<
      comm.getRank () << ", the number of local rows " << lclNumRows <<
      " does not fit in int.");

    //
    // Use MPI to sum the entries across all local blocks.
    //

    // Get the most recently updated version of the data.
    //
    // If we need sync to device, then host has the most recent version.
    const bool useHostVersion = this->template need_sync<device_type> ();
    // Only one of these (the most recently updated one) is active.
    dev_view_type srcView_dev;
    host_view_type srcView_host;
    if (useHostVersion) {
      srcView_host = this->template getLocalView<Kokkos::HostSpace> ();
      if (lclNumRows != static_cast<size_t> (srcView_host.dimension_0 ())) {
        // Make sure the number of rows is correct. If not, take a subview.
        const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
        srcView_host = Kokkos::subview (srcView_host, rowRng, Kokkos::ALL ());
      }
    }
    else {
      srcView_dev = this->template getLocalView<device_type> ();
      if (lclNumRows != static_cast<size_t> (srcView_dev.dimension_0 ())) {
        // Make sure the number of rows is correct. If not, take a subview.
        const Kokkos::pair<size_t, size_t> rowRng (0, lclNumRows);
        srcView_dev = Kokkos::subview (srcView_dev, rowRng, Kokkos::ALL ());
      }
    }

    // If this MultiVector's local data are stored contiguously, we
    // can use the local View as the source buffer in the
    // MPI_Allreduce. Otherwise, we have to allocate a temporary
    // source buffer and pack.
    const bool contig = isConstantStride () && getStride () == lclNumRows;
    dev_view_type srcBuf_dev;
    host_view_type srcBuf_host;
    if (useHostVersion) {
      if (contig) {
        srcBuf_host = srcView_host; // use the View as-is
      }
      else { // need to repack into a contiguous buffer
        srcBuf_host = decltype (srcBuf_host) ("srcBuf", lclNumRows, numCols);
        Kokkos::deep_copy (srcBuf_host, srcView_host);
      }
    }
    else { // use device version
      if (contig) {
        srcBuf_dev = srcView_dev; // use the View as-is
      }
      else { // need to repack into a contiguous buffer
        srcBuf_dev = decltype (srcBuf_dev) ("srcBuf", lclNumRows, numCols);
        Kokkos::deep_copy (srcBuf_dev, srcView_dev);
      }
    }

    // Check expected invariant of the above block of code. At this
    // point, either the srcBuf of choice points to the srcView of
    // choice, or it has the right allocation size.
    {
      // Use >=, not ==, because if srcBuf just points to srcView,
      // then srcView may actually be bigger than what we need.
      const bool correct =
        (useHostVersion && static_cast<size_t> (srcBuf_host.size ()) >= totalAllocSize) ||
        (! useHostVersion && static_cast<size_t> (srcBuf_dev.size ()) >= totalAllocSize);
      TEUCHOS_TEST_FOR_EXCEPTION
        (! correct, std::logic_error, "Tpetra::MultiVector::reduce: Violated "
         "invariant of temporary source buffer construction. Please report "
         "this bug to the Tpetra developers.");
    }

    // MPI requires that the send and receive buffers don't alias one
    // another, so we have to copy temporary storage for the result.
    //
    // We expect that MPI implementations will know how to read device
    // pointers.
    dev_view_type tgtBuf_dev;
    host_view_type tgtBuf_host;
    if (useHostVersion) {
      tgtBuf_host = host_view_type ("tgtBuf", lclNumRows, numCols);
    }
    else {
      tgtBuf_dev = dev_view_type ("tgtBuf", lclNumRows, numCols);
    }

    const int reduceCount = static_cast<int> (totalAllocSize);
    if (useHostVersion) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (static_cast<size_t> (tgtBuf_host.size ()) < totalAllocSize,
         std::logic_error, "Tpetra::MultiVector::reduce: tgtBuf_host.size() = "
         << tgtBuf_host.size () << " < lclNumRows*numCols = " << totalAllocSize
         << ". Please report this bug to the Tpetra developers.");
      reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
                                        srcBuf_host.ptr_on_device (),
                                        tgtBuf_host.ptr_on_device ());
    }
    else { // use device version
      TEUCHOS_TEST_FOR_EXCEPTION
        (static_cast<size_t> (tgtBuf_dev.size ()) < totalAllocSize,
         std::logic_error, "Tpetra::MultiVector::reduce: tgtBuf_dev.size() = "
         << tgtBuf_dev.size () << " < lclNumRows*numCols = " << totalAllocSize
         << ". Please report this bug to the Tpetra developers.");
      reduceAll<int, impl_scalar_type> (comm, REDUCE_SUM, reduceCount,
                                        srcBuf_dev.ptr_on_device (),
                                        tgtBuf_dev.ptr_on_device ());
    }

    // Write back the results to *this.
    if (useHostVersion) {
      this->template modify<Kokkos::HostSpace> ();
      if (contig || isConstantStride ()) {
        Kokkos::deep_copy (srcView_host, tgtBuf_host);
      }
      else { // need to copy one column at a time
        for (size_t j = 0; j < numCols; ++j) {
          auto X_j_out = Kokkos::subview (srcView_host, Kokkos::ALL (), j);
          auto X_j_in = Kokkos::subview (tgtBuf_host, Kokkos::ALL (), j);
          Kokkos::deep_copy (X_j_out, X_j_in);
        }
      }
    }
    else { // use device version
      this->template modify<device_type> ();
      if (contig || isConstantStride ()) {
        Kokkos::deep_copy (srcView_dev, tgtBuf_dev);
      }
      else { // need to copy one column at a time
        for (size_t j = 0; j < numCols; ++j) {
          auto X_j_out = Kokkos::subview (srcView_dev, Kokkos::ALL (), j);
          auto X_j_in = Kokkos::subview (tgtBuf_dev, Kokkos::ALL (), j);
          Kokkos::deep_copy (X_j_out, X_j_in);
        }
      }
    }

    // We leave *this unsynchronized.
  }
4437 
4438 
4439  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4440  void
4442  replaceLocalValue (const LocalOrdinal lclRow,
4443  const size_t col,
4444  const impl_scalar_type& ScalarValue) const
4445  {
4446 #ifdef HAVE_TPETRA_DEBUG
4447  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4448  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4449  TEUCHOS_TEST_FOR_EXCEPTION(
4450  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4451  std::runtime_error,
4452  "Tpetra::MultiVector::replaceLocalValue: row index " << lclRow
4453  << " is invalid. The range of valid row indices on this process "
4454  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4455  << ", " << maxLocalIndex << "].");
4456  TEUCHOS_TEST_FOR_EXCEPTION(
4457  vectorIndexOutOfRange(col),
4458  std::runtime_error,
4459  "Tpetra::MultiVector::replaceLocalValue: vector index " << col
4460  << " of the multivector is invalid.");
4461 #endif
4462  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4463  view_.h_view (lclRow, colInd) = ScalarValue;
4464  }
4465 
4466 
4467  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4468  void
4470  sumIntoLocalValue (const LocalOrdinal lclRow,
4471  const size_t col,
4472  const impl_scalar_type& value,
4473  const bool atomic) const
4474  {
4475 #ifdef HAVE_TPETRA_DEBUG
4476  const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
4477  const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
4478  TEUCHOS_TEST_FOR_EXCEPTION(
4479  lclRow < minLocalIndex || lclRow > maxLocalIndex,
4480  std::runtime_error,
4481  "Tpetra::MultiVector::sumIntoLocalValue: row index " << lclRow
4482  << " is invalid. The range of valid row indices on this process "
4483  << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
4484  << ", " << maxLocalIndex << "].");
4485  TEUCHOS_TEST_FOR_EXCEPTION(
4486  vectorIndexOutOfRange(col),
4487  std::runtime_error,
4488  "Tpetra::MultiVector::sumIntoLocalValue: vector index " << col
4489  << " of the multivector is invalid.");
4490 #endif
4491  const size_t colInd = isConstantStride () ? col : whichVectors_[col];
4492  if (atomic) {
4493  Kokkos::atomic_add (& (view_.h_view(lclRow, colInd)), value);
4494  }
4495  else {
4496  view_.h_view (lclRow, colInd) += value;
4497  }
4498  }
4499 
4500 
4501  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4502  void
4504  replaceGlobalValue (const GlobalOrdinal gblRow,
4505  const size_t col,
4506  const impl_scalar_type& ScalarValue) const
4507  {
4508  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4509  // touches the RCP's reference count, which isn't thread safe.
4510  const LocalOrdinal MyRow = this->map_->getLocalElement (gblRow);
4511 #ifdef HAVE_TPETRA_DEBUG
4512  TEUCHOS_TEST_FOR_EXCEPTION(
4513  MyRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4514  std::runtime_error,
4515  "Tpetra::MultiVector::replaceGlobalValue: Global row index " << gblRow
4516  << "is not present on this process "
4517  << this->getMap ()->getComm ()->getRank () << ".");
4518  TEUCHOS_TEST_FOR_EXCEPTION(
4519  vectorIndexOutOfRange (col), std::runtime_error,
4520  "Tpetra::MultiVector::replaceGlobalValue: Vector index " << col
4521  << " of the multivector is invalid.");
4522 #endif // HAVE_TPETRA_DEBUG
4523  this->replaceLocalValue (MyRow, col, ScalarValue);
4524  }
4525 
4526  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4527  void
4529  sumIntoGlobalValue (const GlobalOrdinal globalRow,
4530  const size_t col,
4531  const impl_scalar_type& value,
4532  const bool atomic) const
4533  {
4534  // mfh 23 Nov 2015: Use map_ and not getMap(), because the latter
4535  // touches the RCP's reference count, which isn't thread safe.
4536  const LocalOrdinal lclRow = this->map_->getLocalElement (globalRow);
4537 #ifdef HAVE_TEUCHOS_DEBUG
4538  TEUCHOS_TEST_FOR_EXCEPTION(
4539  lclRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid (),
4540  std::runtime_error,
4541  "Tpetra::MultiVector::sumIntoGlobalValue: Global row index " << globalRow
4542  << "is not present on this process "
4543  << this->getMap ()->getComm ()->getRank () << ".");
4544  TEUCHOS_TEST_FOR_EXCEPTION(
4545  vectorIndexOutOfRange(col),
4546  std::runtime_error,
4547  "Tpetra::MultiVector::sumIntoGlobalValue: Vector index " << col
4548  << " of the multivector is invalid.");
4549 #endif
4550  this->sumIntoLocalValue (lclRow, col, value, atomic);
4551  }
4552 
  /// \brief Return a persisting ArrayRCP viewing the device side of
  ///   column j of this MultiVector's DualView.
  ///
  /// \note The arr argument is not used by this implementation; the
  ///   view is taken directly from view_.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  template <class T>
  Teuchos::ArrayRCP<T>
  getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
                  size_t j) const
  {
    // DualView type of a single column of the MultiVector.
    typedef Kokkos::DualView<impl_scalar_type*,
      typename dual_view_type::array_layout,
      execution_space> col_dual_view_type;
    // Translate the logical column index j to the physical column.
    const size_t col = isConstantStride () ? j : whichVectors_[j];
    col_dual_view_type X_col =
      Kokkos::subview (view_, Kokkos::ALL (), col);
    // Wrap the device view (d_view) in a reference-counted array.
    return Kokkos::Compat::persistingView (X_col.d_view);
  }
4568 
  /// \brief Return (by value) the Kokkos::DualView holding this
  ///   MultiVector's local data.
  template <class Scalar,
            class LocalOrdinal,
            class GlobalOrdinal,
            class Node,
            const bool classic>
  getDualView () const {
    return view_;
  }
4579 
  /// \brief Build the one-line description string shared by
  ///   description() and the Vector subclass.
  ///
  /// \param className Name to print as the object's class (e.g.
  ///   "Tpetra::MultiVector" or "Tpetra::Vector"); column information
  ///   is suppressed for "Tpetra::Vector".
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  std::string
  descriptionImpl (const std::string& className) const
  {
    using Teuchos::TypeNameTraits;

    std::ostringstream out;
    out << "\"" << className << "\": {";
    // NOTE(review): "<< ", Node"" appears to be missing a ": " before
    // the node name, and the ", numRows" below follows a trailing
    // "}, " (or "\", ") separator, yielding a doubled separator in the
    // output -- confirm whether this formatting is intended.
    out << "Template parameters: {Scalar: " << TypeNameTraits<Scalar>::name ()
        << ", LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name ()
        << ", GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name ()
        << ", Node" << Node::name ()
        << "}, ";
    if (this->getObjectLabel () != "") {
      out << "Label: \"" << this->getObjectLabel () << "\", ";
    }
    out << ", numRows: " << this->getGlobalLength ();
    // A Vector always has exactly one column, so column count and
    // stride flag are only printed for the general MultiVector case.
    if (className != "Tpetra::Vector") {
      out << ", numCols: " << this->getNumVectors ()
          << ", isConstantStride: " << this->isConstantStride ();
    }
    if (this->isConstantStride ()) {
      out << ", columnStride: " << this->getStride ();
    }
    out << "}";

    return out.str ();
  }
4609 
  /// \brief One-line description of this MultiVector (Teuchos::
  ///   Describable interface); delegates to descriptionImpl().
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  std::string
  {
    return this->descriptionImpl ("Tpetra::MultiVector");
  }
4617 
  /// \brief Return this process's contribution to the verbose
  ///   description, as a string.
  ///
  /// Returns the empty string at VERB_LOW or below, or if the Map is
  /// null.  At VERB_MEDIUM and above it reports the local length; at
  /// higher levels it adds stride information, and at VERB_EXTREME it
  /// prints the local values in Matlab bracket format.
  ///
  /// \param vl Verbosity level controlling how much detail to emit.
  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
  std::string
  localDescribeToString (const Teuchos::EVerbosityLevel vl) const
  {
    typedef LocalOrdinal LO;
    using std::endl;

    if (vl <= Teuchos::VERB_LOW) {
      return std::string ();
    }
    auto map = this->getMap ();
    if (map.is_null ()) {
      return std::string ();
    }
    // Accumulate output in a string stream, so the caller can decide
    // how to combine contributions from different processes.
    auto outStringP = Teuchos::rcp (new std::ostringstream ());
    auto outp = Teuchos::getFancyOStream (outStringP);
    Teuchos::FancyOStream& out = *outp;
    auto comm = map->getComm ();
    const int myRank = comm->getRank ();
    const int numProcs = comm->getSize ();

    out << "Process " << myRank << " of " << numProcs << ":" << endl;
    Teuchos::OSTab tab1 (out);

    // VERB_MEDIUM and higher prints getLocalLength()
    const LO lclNumRows = static_cast<LO> (this->getLocalLength ());
    out << "Local number of rows: " << lclNumRows << endl;

    if (vl > Teuchos::VERB_MEDIUM) {
      // VERB_HIGH and higher prints isConstantStride() and getStride().
      // The first is only relevant if the Vector has multiple columns.
      if (this->getNumVectors () != static_cast<size_t> (1)) {
        out << "isConstantStride: " << this->isConstantStride () << endl;
      }
      if (this->isConstantStride ()) {
        out << "Column stride: " << this->getStride () << endl;
      }

      if (vl > Teuchos::VERB_HIGH && lclNumRows > 0) {
        // VERB_EXTREME prints values. Get a host View of the
        // Vector's local data, so we can print it. (Don't assume
        // that we can access device data directly in host code.)
        auto X_dual = this->getDualView ();
        typename dual_view_type::t_host X_host;
        if (X_dual.template need_sync<Kokkos::HostSpace> ()) {
          // Device memory has the latest version. Don't actually
          // sync to host; that changes the Vector's state, and may
          // change future computations (that use the data's current
          // place to decide where to compute). Instead, create a
          // temporary host copy and print that.
          auto X_dev = this->template getLocalView<device_type> ();
          auto X_host_copy = Kokkos::create_mirror_view (X_dev);
          Kokkos::deep_copy (X_host_copy, X_dev);
          X_host = X_host_copy;
        }
        else {
          // Either host and device are in sync, or host has the
          // latest version of the Vector's data. Thus, we can use
          // the host version directly.
          X_host = this->template getLocalView<Kokkos::HostSpace> ();
        }
        // The square braces [] and their contents are in Matlab
        // format, so users may copy and paste directly into Matlab.
        out << "Values: " << endl
            << "[";
        const LO numCols = static_cast<LO> (this->getNumVectors ());
        if (numCols == 1) {
          // Single column: print as a column vector, rows separated
          // by "; ".
          for (LO i = 0; i < lclNumRows; ++i) {
            out << X_host(i,0);
            if (i + 1 < lclNumRows) {
              out << "; ";
            }
          }
        }
        else {
          // Multiple columns: entries within a row separated by ", ",
          // rows separated by "; ".
          for (LO i = 0; i < lclNumRows; ++i) {
            for (LO j = 0; j < numCols; ++j) {
              out << X_host(i,j);
              if (j + 1 < numCols) {
                out << ", ";
              }
            }
            if (i + 1 < lclNumRows) {
              out << "; ";
            }
          }
        }
        out << "]" << endl;
      }
    }

    out.flush (); // make sure the ostringstream got everything
    return outStringP->str ();
  }
4713 
4714  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4715  void
4717  describeImpl (Teuchos::FancyOStream& out,
4718  const std::string& className,
4719  const Teuchos::EVerbosityLevel verbLevel) const
4720  {
4721  using Teuchos::TypeNameTraits;
4722  using Teuchos::VERB_DEFAULT;
4723  using Teuchos::VERB_NONE;
4724  using Teuchos::VERB_LOW;
4725  using std::endl;
4726  const Teuchos::EVerbosityLevel vl =
4727  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
4728 
4729  if (vl == VERB_NONE) {
4730  return; // don't print anything
4731  }
4732  // If this Vector's Comm is null, then the Vector does not
4733  // participate in collective operations with the other processes.
4734  // In that case, it is not even legal to call this method. The
4735  // reasonable thing to do in that case is nothing.
4736  auto map = this->getMap ();
4737  if (map.is_null ()) {
4738  return;
4739  }
4740  auto comm = map->getComm ();
4741  if (comm.is_null ()) {
4742  return;
4743  }
4744 
4745  const int myRank = comm->getRank ();
4746 
4747  // Only Process 0 should touch the output stream, but this method
4748  // in general may need to do communication. Thus, we may need to
4749  // preserve the current tab level across multiple "if (myRank ==
4750  // 0) { ... }" inner scopes. This is why we sometimes create
4751  // OSTab instances by pointer, instead of by value. We only need
4752  // to create them by pointer if the tab level must persist through
4753  // multiple inner scopes.
4754  Teuchos::RCP<Teuchos::OSTab> tab0, tab1;
4755 
4756  // VERB_LOW and higher prints the equivalent of description().
4757  if (myRank == 0) {
4758  tab0 = Teuchos::rcp (new Teuchos::OSTab (out));
4759  out << "\"" << className << "\":" << endl;
4760  tab1 = Teuchos::rcp (new Teuchos::OSTab (out));
4761  {
4762  out << "Template parameters:" << endl;
4763  Teuchos::OSTab tab2 (out);
4764  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
4765  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
4766  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
4767  << "Node: " << Node::name () << endl;
4768  }
4769  if (this->getObjectLabel () != "") {
4770  out << "Label: \"" << this->getObjectLabel () << "\", ";
4771  }
4772  out << "Global number of rows: " << this->getGlobalLength () << endl;
4773  if (className != "Tpetra::Vector") {
4774  out << "Number of columns: " << this->getNumVectors () << endl;
4775  }
4776  // getStride() may differ on different processes, so it (and
4777  // isConstantStride()) properly belong to per-process data.
4778  }
4779 
4780  // This is collective over the Map's communicator.
4781  if (vl > VERB_LOW) { // VERB_MEDIUM, VERB_HIGH, or VERB_EXTREME
4782  const std::string lclStr = this->localDescribeToString (vl);
4783  Tpetra::Details::gathervPrint (out, lclStr, *comm);
4784  }
4785  }
4786 
4787  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4788  void
4790  describe (Teuchos::FancyOStream &out,
4791  const Teuchos::EVerbosityLevel verbLevel) const
4792  {
4793  this->describeImpl (out, "Tpetra::MultiVector", verbLevel);
4794  }
4795 
4796  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4797  void
4799  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
4800  {
4801  replaceMap (newMap);
4802  }
4803 
4804  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, const bool classic>
4805  void
4808  {
4809  typedef LocalOrdinal LO;
4810  typedef device_type DT;
4811  typedef typename dual_view_type::host_mirror_space::device_type HMDT;
4812  const char prefix[] = "Tpetra::deep_copy (MultiVector): ";
4813  const bool debug = false;
4814 
4815  TEUCHOS_TEST_FOR_EXCEPTION
4816  (this->getGlobalLength () != src.getGlobalLength () ||
4817  this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
4818  prefix << "Global dimensions of the two Tpetra::MultiVector "
4819  "objects do not match. src has dimensions [" << src.getGlobalLength ()
4820  << "," << src.getNumVectors () << "], and *this has dimensions ["
4821  << this->getGlobalLength () << "," << this->getNumVectors () << "].");
4822  // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
4823  TEUCHOS_TEST_FOR_EXCEPTION
4824  (this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
4825  prefix << "The local row counts of the two Tpetra::MultiVector "
4826  "objects do not match. src has " << src.getLocalLength () << " row(s) "
4827  << " and *this has " << this->getLocalLength () << " row(s).");
4828 
4829  // See #1510. We're writing to *this, so we don't care about its
4830  // contents in either memory space, and we don't want
4831  // DualView::modify to complain about "concurrent modification" of
4832  // host and device Views.
4833  this->view_.modified_device () = 0;
4834  this->view_.modified_host () = 0;
4835 
4836  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4837  std::cout << "*** MultiVector::assign: ";
4838  }
4839 
4840  if (src.isConstantStride () && this->isConstantStride ()) {
4841  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4842  std::cout << "Both *this and src have constant stride" << std::endl;
4843  }
4844 
4845  // If we need sync to device, then host has the most recent version.
4846  const bool useHostVersion = src.template need_sync<device_type> ();
4847 
4848  if (useHostVersion) { // Host memory has the most recent version of src.
4849  this->template modify<HMDT> (); // We are about to modify dst on host.
4850  // Copy from src to dst on host.
4851  Details::localDeepCopyConstStride (this->template getLocalView<HMDT> (),
4852  src.template getLocalView<HMDT> ());
4853  this->template sync<DT> (); // Sync dst from host to device.
4854  }
4855  else { // Device memory has the most recent version of src.
4856  this->template modify<DT> (); // We are about to modify dst on device.
4857  // Copy from src to dst on device.
4858  Details::localDeepCopyConstStride (this->template getLocalView<DT> (),
4859  src.template getLocalView<DT> ());
4860  this->template sync<HMDT> (); // Sync dst from device to host.
4861  }
4862  }
4863  else {
4864  if (this->isConstantStride ()) {
4865  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4866  std::cout << "Only *this has constant stride";
4867  }
4868 
4869  const LO numWhichVecs = static_cast<LO> (src.whichVectors_.size ());
4870  const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
4871 
4872  // We can't sync src, since it is only an input argument.
4873  // Thus, we have to use the most recently modified version of
4874  // src, device or host.
4875  //
4876  // If we need sync to device, then host has the most recent version.
4877  const bool useHostVersion = src.template need_sync<device_type> ();
4878  if (useHostVersion) { // host version of src most recently modified
4879  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4880  std::cout << "; Copy from host version of src" << std::endl;
4881  }
4882  // Copy from the host version of src.
4883  //
4884  // whichVecs tells the kernel which vectors (columns) of src
4885  // to copy. Fill whichVecs on the host, and use it there.
4886  typedef Kokkos::View<LO*, HMDT> whichvecs_type;
4887  whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
4888  for (LO i = 0; i < numWhichVecs; ++i) {
4889  srcWhichVecs(i) = static_cast<LO> (src.whichVectors_[i]);
4890  }
4891  // Copy from the selected vectors of src to dst, on the
4892  // host. The function ignores its dstWhichVecs argument in
4893  // this case.
4894  Details::localDeepCopy (this->template getLocalView<HMDT> (),
4895  src.template getLocalView<HMDT> (),
4896  true, false, srcWhichVecs, srcWhichVecs);
4897  // Sync dst back to the device, since we only copied on the host.
4898  this->template sync<DT> ();
4899  }
4900  else { // copy from the device version of src
4901  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4902  std::cout << "; Copy from device version of src" << std::endl;
4903  }
4904  // Copy from the device version of src.
4905  //
4906  // whichVecs tells the kernel which vectors (columns) of src
4907  // to copy. Fill whichVecs on the host, and sync to device.
4908  typedef Kokkos::DualView<LO*, DT> whichvecs_type;
4909  whichvecs_type srcWhichVecs (whichVecsLabel, numWhichVecs);
4910  srcWhichVecs.template modify<HMDT> ();
4911  for (LO i = 0; i < numWhichVecs; ++i) {
4912  srcWhichVecs.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
4913  }
4914  // Sync the host version of srcWhichVecs to the device.
4915  srcWhichVecs.template sync<DT> ();
4916 
4917  // Mark the device version of dst's DualView as modified.
4918  this->template modify<DT> ();
4919 
4920  // Copy from the selected vectors of src to dst, on the
4921  // device. The function ignores its dstWhichVecs argument
4922  // in this case.
4923  Details::localDeepCopy (this->template getLocalView<DT> (),
4924  src.template getLocalView<DT> (),
4925  true, false, srcWhichVecs.d_view,
4926  srcWhichVecs.d_view);
4927  // Sync *this' DualView to the host. This is cheaper than
4928  // repeating the above copy from src to *this on the host.
4929  this->template sync<HMDT> ();
4930  }
4931 
4932  }
4933  else { // dst is NOT constant stride
4934  if (src.isConstantStride ()) {
4935  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4936  std::cout << "Only src has constant stride" << std::endl;
4937  }
4938 
4939  // If we need sync to device, then host has the most recent version.
4940  const bool useHostVersion = src.template need_sync<device_type> ();
4941  if (useHostVersion) { // src most recently modified on host
4942  // Copy from the host version of src.
4943  //
4944  // whichVecs tells the kernel which vectors (columns) of src
4945  // to copy. Fill whichVecs on the host, and use it there.
4946  typedef Kokkos::View<LO*, HMDT> whichvecs_type;
4947  const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4948  whichvecs_type whichVecs ("MV::deep_copy::whichVecs", numWhichVecs);
4949  for (LO i = 0; i < numWhichVecs; ++i) {
4950  whichVecs(i) = static_cast<LO> (this->whichVectors_[i]);
4951  }
4952  // Copy from src to the selected vectors of dst, on the
4953  // host. The functor ignores its 4th arg in this case.
4954  Details::localDeepCopy (this->template getLocalView<HMDT> (),
4955  src.template getLocalView<HMDT> (),
4956  this->isConstantStride (),
4957  src.isConstantStride (),
4958  whichVecs, whichVecs);
4959  // Sync dst back to the device, since we only copied on the host.
4960  //
4961  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4962  // don't actually belong to dst's view.
4963  this->template sync<DT> ();
4964  }
4965  else { // Copy from the device version of src.
4966  // whichVecs tells the kernel which vectors (columns) of dst
4967  // to copy. Fill whichVecs on the host, and sync to device.
4968  typedef Kokkos::DualView<LO*, DT> whichvecs_type;
4969  const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
4970  const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
4971  whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
4972  whichVecs.template modify<HMDT> ();
4973  for (LO i = 0; i < numWhichVecs; ++i) {
4974  whichVecs.h_view(i) = this->whichVectors_[i];
4975  }
4976  // Sync the host version of whichVecs to the device.
4977  whichVecs.template sync<DT> ();
4978 
4979  // Copy src to the selected vectors of dst, on the device.
4980  Details::localDeepCopy (this->template getLocalView<DT> (),
4981  src.template getLocalView<DT> (),
4982  this->isConstantStride (),
4983  src.isConstantStride (),
4984  whichVecs.d_view, whichVecs.d_view);
4985  // We can't sync src and repeat the above copy on the
4986  // host, so sync dst back to the host.
4987  //
4988  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
4989  // don't actually belong to dst's view.
4990  this->template sync<HMDT> ();
4991  }
4992  }
4993  else { // neither src nor dst have constant stride
4994  if (debug && this->getMap ()->getComm ()->getRank () == 0) {
4995  std::cout << "Neither *this nor src has constant stride" << std::endl;
4996  }
4997 
4998  // If we need sync to device, then host has the most recent version.
4999  const bool useHostVersion = src.template need_sync<device_type> ();
5000  if (useHostVersion) { // copy from the host version of src
5001  const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
5002  Kokkos::View<LO*, HMDT> whichVectorsDst ("dstWhichVecs", dstNumWhichVecs);
5003  for (LO i = 0; i < dstNumWhichVecs; ++i) {
5004  whichVectorsDst(i) = this->whichVectors_[i];
5005  }
5006 
5007  // Use the destination MultiVector's LocalOrdinal type here.
5008  const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
5009  Kokkos::View<LO*, HMDT> whichVectorsSrc ("srcWhichVecs", srcNumWhichVecs);
5010  for (LO i = 0; i < srcNumWhichVecs; ++i) {
5011  whichVectorsSrc(i) = src.whichVectors_[i];
5012  }
5013 
5014  // Copy from the selected vectors of src to the selected
5015  // vectors of dst, on the host.
5016  Details::localDeepCopy (this->template getLocalView<HMDT> (),
5017  src.template getLocalView<HMDT> (),
5018  this->isConstantStride (),
5019  src.isConstantStride (),
5020  whichVectorsDst, whichVectorsSrc);
5021 
5022  // We can't sync src and repeat the above copy on the
5023  // host, so sync dst back to the host.
5024  //
5025  // FIXME (mfh 29 Jul 2014) This may overwrite columns that
5026  // don't actually belong to dst's view.
5027  this->template sync<HMDT> ();
5028  }
5029  else { // copy from the device version of src
5030  // whichVectorsDst tells the kernel which columns of dst
5031  // to copy. Fill it on the host, and sync to device.
5032  const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
5033  Kokkos::DualView<LO*, DT> whichVecsDst ("MV::deep_copy::whichVecsDst",
5034  dstNumWhichVecs);
5035  whichVecsDst.template modify<HMDT> ();
5036  for (LO i = 0; i < dstNumWhichVecs; ++i) {
5037  whichVecsDst.h_view(i) = static_cast<LO> (this->whichVectors_[i]);
5038  }
5039  // Sync the host version of whichVecsDst to the device.
5040  whichVecsDst.template sync<DT> ();
5041 
5042  // whichVectorsSrc tells the kernel which vectors
5043  // (columns) of src to copy. Fill it on the host, and
5044  // sync to device. Use the destination MultiVector's
5045  // LocalOrdinal type here.
5046  const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
5047  Kokkos::DualView<LO*, DT> whichVecsSrc ("MV::deep_copy::whichVecsSrc",
5048  srcNumWhichVecs);
5049  whichVecsSrc.template modify<HMDT> ();
5050  for (LO i = 0; i < srcNumWhichVecs; ++i) {
5051  whichVecsSrc.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
5052  }
5053  // Sync the host version of whichVecsSrc to the device.
5054  whichVecsSrc.template sync<DT> ();
5055 
5056  // Copy from the selected vectors of src to the selected
5057  // vectors of dst, on the device.
5058  Details::localDeepCopy (this->template getLocalView<DT> (),
5059  src.template getLocalView<DT> (),
5060  this->isConstantStride (),
5061  src.isConstantStride (),
5062  whichVecsDst.d_view, whichVecsSrc.d_view);
5063  }
5064  }
5065  }
5066  }
5067  }
5068 
5069  template <class Scalar, class LO, class GO, class NT, const bool classic>
5070  Teuchos::RCP<MultiVector<Scalar, LO, GO, NT, classic> >
5071  createMultiVector (const Teuchos::RCP<const Map<LO, GO, NT> >& map,
5072  size_t numVectors)
5073  {
5075  return Teuchos::rcp (new MV (map, numVectors));
5076  }
5077 
5078  template <class ST, class LO, class GO, class NT, const bool classic>
5081  {
5083  MV cpy (src.getMap (), src.getNumVectors (), false);
5084  cpy.assign (src);
5085  return cpy;
5086  }
5087 
5088 } // namespace Tpetra
5089 
5090 //
5091 // Explicit instantiation macro
5092 //
5093 // Must be expanded from within the Tpetra namespace!
5094 //
5095 
// Explicit-instantiation macro: instantiates the MultiVector class
// template and its nonmember createCopy / createMultiVector helpers
// for one (SCALAR, LO, GO, NODE) combination.  Must be expanded from
// within the Tpetra namespace.
#define TPETRA_MULTIVECTOR_INSTANT(SCALAR,LO,GO,NODE) \
  template class MultiVector< SCALAR , LO , GO , NODE >; \
  template MultiVector< SCALAR , LO , GO , NODE > createCopy( const MultiVector< SCALAR , LO , GO , NODE >& src); \
  template Teuchos::RCP<MultiVector< SCALAR , LO , GO , NODE > > createMultiVector (const Teuchos::RCP<const Map<LO, GO, NODE> >& map, size_t numVectors);
5100 
5101 #endif // TPETRA_MULTIVECTOR_DEF_HPP
Namespace Tpetra contains the class and methods constituting the Tpetra library.
size_t getStride() const
Stride between columns in the multivector.
device_type::execution_space execution_space
The Kokkos execution space.
MultiVector< ST, LO, GO, NT, classic > createCopy(const MultiVector< ST, LO, GO, NT, classic > &src)
Return a deep copy of the given MultiVector.
EWhichNormImpl
Input argument for localNormImpl() (which see).
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Teuchos::Array< size_t > whichVectors_
Indices of columns this multivector is viewing.
One or more distributed dense vectors.
void deep_copy(MultiVector< DS, DL, DG, DN, dstClassic > &dst, const MultiVector< SS, SL, SG, SN, srcClassic > &src)
Copy the contents of the MultiVector src into dst.
Teuchos::RCP< const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > getVector(const size_t j) const
Return a Vector which is a const view of column j.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object's Map.
dual_view_type view_
The Kokkos::DualView containing the MultiVector's data.
Kokkos::Details::InnerProductSpaceTraits< impl_scalar_type >::dot_type dot_type
Type of an inner ("dot") product result.
Node::device_type device_type
The Kokkos Device type.
Teuchos::RCP< MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > createMultiVector(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &map, const size_t numVectors)
Nonmember MultiVector "constructor": Create a MultiVector from a given Map.
size_t getNodeNumElements() const
The number of elements belonging to the calling process.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
size_t global_size_t
Global size_t object.
Insert new values that don&#39;t currently exist.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
size_t getLocalLength() const
Local number of rows on the calling process.
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
Declaration of Tpetra::Details::isInterComm.
global_size_t getGlobalLength() const
Global number of rows in the multivector.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Replace old value with maximum of magnitudes of old and new values.
Abstract base class for objects that can be the source of an Import or Export operation.
Replace existing values with new values.
Kokkos::Details::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
Kokkos::Details::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
Kokkos::DualView< impl_scalar_type **, Kokkos::LayoutLeft, typename execution_space::execution_space > dual_view_type
Kokkos::DualView specialization used by this class.
A parallel distribution of indices over processes.
size_t getNumVectors() const
Number of columns in the multivector.
A distributed dense vector.
Stand-alone utility functions and macros.
void assign(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > &src)
Copy the contents of src into *this (deep copy).
size_t getOrigNumLocalRows() const
"Original" number of rows in the (local) data.
EWhichNorm
Input argument for normImpl() (which see).
Teuchos::RCP< Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node, classic > > getVectorNonConst(const size_t j)
Return a Vector which is a nonconst view of column j.
bool isDistributed() const
Whether this is a globally distributed object.
dual_view_type origView_
The "original view" of the MultiVector&#39;s data.