Tpetra parallel linear algebra  Version of the Day
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_CRSMATRIX_DEF_HPP
41 #define TPETRA_CRSMATRIX_DEF_HPP
42 
50 
51 #include "Tpetra_Import_Util.hpp"
52 #include "Tpetra_Import_Util2.hpp"
53 #include "Tpetra_RowMatrix.hpp"
54 #include "Tpetra_LocalCrsMatrixOperator.hpp"
55 
62 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
67 #include "KokkosSparse_getDiagCopy.hpp"
71 #include "Tpetra_Details_packCrsMatrix.hpp"
72 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
74 #include "Teuchos_FancyOStream.hpp"
75 #include "Teuchos_RCP.hpp"
76 #include "Teuchos_DataAccess.hpp"
77 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
78 #include "KokkosBlas.hpp"
79 
80 #include <memory>
81 #include <sstream>
82 #include <typeinfo>
83 #include <utility>
84 #include <vector>
85 
86 using Teuchos::rcpFromRef;
87 
88 namespace Tpetra {
89 
90 namespace { // (anonymous)
91 
92  template<class T, class BinaryFunction>
93  T atomic_binary_function_update (volatile T* const dest,
94  const T& inputVal,
95  BinaryFunction f)
96  {
97  T oldVal = *dest;
98  T assume;
99 
100  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
101  // POWER architectures, because 'newval' depends on 'assume',
102  // which depends on 'oldVal', which depends on '*dest'. This
103  // sets up a chain of read dependencies that should ensure
104  // correct behavior given a sane memory model.
105  do {
106  assume = oldVal;
107  T newVal = f (assume, inputVal);
108  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
109  } while (assume != oldVal);
110 
111  return oldVal;
112  }
113 } // namespace (anonymous)
114 
115 //
116 // Users must never rely on anything in the Details namespace.
117 //
118 namespace Details {
119 
129 template<class Scalar>
130 struct AbsMax {
132  Scalar operator() (const Scalar& x, const Scalar& y) {
133  typedef Teuchos::ScalarTraits<Scalar> STS;
134  return std::max (STS::magnitude (x), STS::magnitude (y));
135  }
136 };
137 
138 } // namespace Details
139 } // namespace Tpetra
140 
141 namespace Tpetra {
142 
// Constructor taking a row Map and a single maxNumEntriesPerRow value.
//
// A new crs_graph_type is created from (rowMap, maxNumEntriesPerRow,
// pftype, params) and stored in both myGraph_ and staticGraph_, after
// which resumeFill(params) is called.  A std::exception escaping the
// graph constructor is reported through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC as std::runtime_error, with the
// original e.what() text appended to the message.
143  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
145  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
146  size_t maxNumEntriesPerRow,
147  const ProfileType pftype,
148  const Teuchos::RCP<Teuchos::ParameterList>& params) :
149  dist_object_type (rowMap)
150  {
151  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t, "
152  "ProfileType[, RCP<ParameterList>]): ";
153  Teuchos::RCP<crs_graph_type> graph;
154  try {
155  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
156  pftype, params));
157  }
158  catch (std::exception& e) {
159  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
160  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
161  "size_t, ProfileType[, RCP<ParameterList>]) threw an exception: "
162  << e.what ());
163  }
164  // myGraph_ not null means that the matrix owns the graph. That's
165  // different from the const CrsGraph constructor, where the matrix
166  // does _not_ own the graph.
167  myGraph_ = graph;
168  staticGraph_ = myGraph_;
169  resumeFill (params);
171  }
172 
// Constructor taking a row Map and an ArrayView of size_t values
// (numEntPerRowToAlloc).
//
// A new crs_graph_type is created from (rowMap, numEntPerRowToAlloc,
// pftype, params) and stored in both myGraph_ and staticGraph_, after
// which resumeFill(params) is called.  A std::exception escaping the
// graph constructor is reported through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC as std::runtime_error, with the
// original e.what() text appended to the message.
173  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
175  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
176  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
177  const ProfileType pftype,
178  const Teuchos::RCP<Teuchos::ParameterList>& params) :
179  dist_object_type (rowMap)
180  {
181  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
182  "ArrayView<const size_t>, ProfileType[, RCP<ParameterList>]): ";
183  Teuchos::RCP<crs_graph_type> graph;
184  try {
185  using Teuchos::rcp;
186  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
187  pftype, params));
188  }
189  catch (std::exception& e) {
190  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
191  (true, std::runtime_error, "CrsGraph constructor "
192  "(RCP<const Map>, ArrayView<const size_t>, "
193  "ProfileType[, RCP<ParameterList>]) threw an exception: "
194  << e.what ());
195  }
196  // myGraph_ not null means that the matrix owns the graph. That's
197  // different from the const CrsGraph constructor, where the matrix
198  // does _not_ own the graph.
199  myGraph_ = graph;
200  staticGraph_ = graph;
201  resumeFill (params);
203  }
204 
205 
// Constructor taking a row Map, a column Map, and a single
// maxNumEntPerRow value.
//
// After two sanity checks that staticGraph_ and myGraph_ are still null,
// a new crs_graph_type is created from (rowMap, colMap, maxNumEntPerRow,
// pftype, params) and stored in both myGraph_ and staticGraph_; then
// resumeFill(params) is called.  A std::exception escaping the graph
// constructor is reported through TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
// as std::runtime_error, with the original e.what() text appended.
206  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
208  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
209  const Teuchos::RCP<const map_type>& colMap,
210  const size_t maxNumEntPerRow,
211  const ProfileType pftype,
212  const Teuchos::RCP<Teuchos::ParameterList>& params) :
213  dist_object_type (rowMap)
214  {
215  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
216  "RCP<const Map>, size_t, ProfileType[, RCP<ParameterList>]): ";
217  const char suffix[] =
218  " Please report this bug to the Tpetra developers.";
219 
220  // An artifact of debugging something a while back.
   // Both graph pointers are expected to be null here; a non-null
   // value is reported as std::logic_error.
221  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
222  (! staticGraph_.is_null (), std::logic_error,
223  "staticGraph_ is not null at the beginning of the constructor."
224  << suffix);
225  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
226  (! myGraph_.is_null (), std::logic_error,
227  "myGraph_ is not null at the beginning of the constructor."
228  << suffix);
229  Teuchos::RCP<crs_graph_type> graph;
230  try {
231  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
232  maxNumEntPerRow,
233  pftype, params));
234  }
235  catch (std::exception& e) {
236  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
237  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
238  "RCP<const Map>, size_t, ProfileType[, RCP<ParameterList>]) threw an "
239  "exception: " << e.what ());
240  }
241  // myGraph_ not null means that the matrix owns the graph. That's
242  // different from the const CrsGraph constructor, where the matrix
243  // does _not_ own the graph.
244  myGraph_ = graph;
245  staticGraph_ = myGraph_;
246  resumeFill (params);
248  }
249 
// Constructor taking a row Map, a column Map, and an ArrayView of
// size_t values (numEntPerRowToAlloc).
//
// A new crs_graph_type is created from (rowMap, colMap,
// numEntPerRowToAlloc, pftype, params) and stored in both myGraph_ and
// staticGraph_, after which resumeFill(params) is called.  A
// std::exception escaping the graph constructor is reported through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC as std::runtime_error, with the
// original e.what() text appended to the message.
250  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
252  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
253  const Teuchos::RCP<const map_type>& colMap,
254  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
255  const ProfileType pftype,
256  const Teuchos::RCP<Teuchos::ParameterList>& params) :
257  dist_object_type (rowMap)
258  {
259  const char tfecfFuncName[] =
260  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
261  "ArrayView<const size_t>, ProfileType[, RCP<ParameterList>]): ";
262  Teuchos::RCP<crs_graph_type> graph;
263  try {
264  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
265  numEntPerRowToAlloc,
266  pftype, params));
267  }
268  catch (std::exception& e) {
269  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
270  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
271  "RCP<const Map>, ArrayView<const size_t>, ProfileType[, "
272  "RCP<ParameterList>]) threw an exception: " << e.what ());
273  }
274  // myGraph_ not null means that the matrix owns the graph. That's
275  // different from the const CrsGraph constructor, where the matrix
276  // does _not_ own the graph.
277  myGraph_ = graph;
278  staticGraph_ = graph;
279  resumeFill (params);
281  }
282 
283 
// Constructor taking an existing graph, which must be fill complete.
//
// staticGraph_ is set to the input graph (myGraph_ is not assigned in
// this constructor) and storageStatus_ to Details::STORAGE_1D_PACKED.
// A values view with graph->lclIndsPacked_wdv.extent(0) entries is
// allocated and used for both valuesPacked_wdv and valuesUnpacked_wdv.
// The params argument is unnamed and unused.
//
// Reports std::runtime_error through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC if the graph is null or not
// fill complete.
//
// NOTE(review): the member initializer evaluates graph->getRowMap()
// before the graph.is_null() test in the body runs, so a null graph is
// dereferenced first -- confirm whether the null check is reachable.
284  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
286  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
287  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
288  dist_object_type (graph->getRowMap ()),
289  staticGraph_ (graph),
290  storageStatus_ (Details::STORAGE_1D_PACKED)
291  {
292  using std::endl;
293  typedef typename local_matrix_device_type::values_type values_type;
294  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
295  "RCP<ParameterList>]): ";
296  const bool verbose = Details::Behavior::verbose("CrsMatrix");
297 
   // prefix is only created, and only dereferenced, when verbose is true.
298  std::unique_ptr<std::string> prefix;
299  if (verbose) {
300  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
301  std::ostringstream os;
302  os << *prefix << "Start" << endl;
303  std::cerr << os.str ();
304  }
305 
306  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
307  (graph.is_null (), std::runtime_error, "Input graph is null.");
308  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
309  (! graph->isFillComplete (), std::runtime_error, "Input graph "
310  "is not fill complete. You must call fillComplete on the "
311  "graph before using it to construct a CrsMatrix. Note that "
312  "calling resumeFill on the graph makes it not fill complete, "
313  "even if you had previously called fillComplete. In that "
314  "case, you must call fillComplete on the graph again.");
315 
316  // The graph is fill complete, so it is locally indexed and has a
317  // fixed structure. This means we can allocate the (1-D) array of
318  // values and build the local matrix right now. Note that the
319  // local matrix's number of columns comes from the column Map, not
320  // the domain Map.
321 
322  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
323  if (verbose) {
324  std::ostringstream os;
325  os << *prefix << "Allocate values: " << numEnt << endl;
326  std::cerr << os.str ();
327  }
328 
   // One value slot per packed local index; the packed and unpacked
   // members are both assigned from the same values_wdv_type object.
329  values_type val ("Tpetra::CrsMatrix::values", numEnt);
330  valuesPacked_wdv = values_wdv_type(val);
331  valuesUnpacked_wdv = valuesPacked_wdv;
332 
333  // FIXME (22 Jun 2016) I would very much like to get rid of
334  // k_values1D_ at some point. I find it confusing to have all
335  // these extra references lying around.
336 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
337 
339 
340  if (verbose) {
341  std::ostringstream os;
342  os << *prefix << "Done" << endl;
343  std::cerr << os.str ();
344  }
345  }
346 
// Constructor that combines the values of an existing matrix with a
// given fill-complete graph.
//
// staticGraph_ is set to the input graph and storageStatus_ is copied
// from matrix.storageStatus_.  valuesPacked_wdv and valuesUnpacked_wdv
// are built from the corresponding members of `matrix`, using the range
// arguments 0 and graph->lclIndsPacked_wdv.extent(0) (respectively
// graph->lclIndsUnpacked_wdv.extent(0)).  params is not referenced in
// the body shown here.
//
// Reports std::runtime_error through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC if the graph is null or not
// fill complete.
//
// NOTE(review): tfecfFuncName below is the same text the
// (graph, values) constructor uses, although this constructor reads
// from a `matrix` argument -- looks like a copy/paste leftover; confirm.
// NOTE(review): the member initializer evaluates graph->getRowMap()
// before the graph.is_null() test in the body runs -- confirm.
347  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
350  const Teuchos::RCP<const crs_graph_type>& graph,
351  const Teuchos::RCP<Teuchos::ParameterList>& params) :
352  dist_object_type (graph->getRowMap ()),
353  staticGraph_ (graph),
354  storageStatus_ (matrix.storageStatus_)
355  {
356  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
357  "local_matrix_device_type::values_type, "
358  "[,RCP<ParameterList>]): ";
359  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
360  (graph.is_null (), std::runtime_error, "Input graph is null.");
361  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
362  (! graph->isFillComplete (), std::runtime_error, "Input graph "
363  "is not fill complete. You must call fillComplete on the "
364  "graph before using it to construct a CrsMatrix. Note that "
365  "calling resumeFill on the graph makes it not fill complete, "
366  "even if you had previously called fillComplete. In that "
367  "case, you must call fillComplete on the graph again.");
368 
   // Packed values: sized by the graph's packed local indices.
369  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
370  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
371 
   // Unpacked values: sized by the graph's unpacked local indices.
372  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
373  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
374 
376  }
377 
378 
// Constructor taking an existing fill-complete graph and a
// caller-supplied values view.
//
// staticGraph_ is set to the input graph and storageStatus_ to
// Details::STORAGE_1D_PACKED.  valuesPacked_wdv is built from `values`
// and valuesUnpacked_wdv is assigned from valuesPacked_wdv.  The
// params argument is unnamed and unused.  No check relating the size
// of `values` to the graph appears in this body.
//
// Reports std::runtime_error through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC if the graph is null or not
// fill complete.
//
// NOTE(review): the member initializer evaluates graph->getRowMap()
// before the graph.is_null() test in the body runs -- confirm whether
// the null check is reachable.
379  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
381  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
382  const typename local_matrix_device_type::values_type& values,
383  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
384  dist_object_type (graph->getRowMap ()),
385  staticGraph_ (graph),
386  storageStatus_ (Details::STORAGE_1D_PACKED)
387  {
388  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
389  "local_matrix_device_type::values_type, "
390  "[,RCP<ParameterList>]): ";
391  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
392  (graph.is_null (), std::runtime_error, "Input graph is null.");
393  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
394  (! graph->isFillComplete (), std::runtime_error, "Input graph "
395  "is not fill complete. You must call fillComplete on the "
396  "graph before using it to construct a CrsMatrix. Note that "
397  "calling resumeFill on the graph makes it not fill complete, "
398  "even if you had previously called fillComplete. In that "
399  "case, you must call fillComplete on the graph again.");
400 
401  // The graph is fill complete, so it is locally indexed and has a
402  // fixed structure. The caller-supplied values are wrapped here
403  // rather than allocated. Note that the
404  // local matrix's number of columns comes from the column Map, not
405  // the domain Map.
406 
407  valuesPacked_wdv = values_wdv_type(values);
408  valuesUnpacked_wdv = valuesPacked_wdv;
409 
410  // FIXME (22 Jun 2016) I would very much like to get rid of
411  // k_values1D_ at some point. I find it confusing to have all
412  // these extra references lying around.
413  // KDDKDD ALMOST THERE, MARK!
414 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
415 
417  }
418 
// Constructor taking row and column Maps plus three views: rowPointers,
// columnIndices and values.
//
// Steps, in order:
//   1. Require values.extent(0) == columnIndices.extent(0)
//      (std::invalid_argument otherwise).
//   2. In debug mode only, and if rowPointers is non-empty, require the
//      last entry of rowPointers to equal both of those extents.
//   3. Create a crs_graph_type from (rowMap, colMap, rowPointers,
//      columnIndices, params); a std::exception from that constructor
//      is reported as std::runtime_error.
//   4. Check that the new graph's local graph has the same extents as
//      the inputs (std::logic_error otherwise).
//   5. Store the graph in myGraph_ and staticGraph_, and build
//      valuesPacked_wdv / valuesUnpacked_wdv from `values`.
//
// storageStatus_ is initialized to Details::STORAGE_1D_PACKED.
419  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
421  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
422  const Teuchos::RCP<const map_type>& colMap,
423  const typename local_graph_device_type::row_map_type& rowPointers,
424  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
425  const typename local_matrix_device_type::values_type& values,
426  const Teuchos::RCP<Teuchos::ParameterList>& params) :
427  dist_object_type (rowMap),
428  storageStatus_ (Details::STORAGE_1D_PACKED)
429  {
430  using Details::getEntryOnHost;
431  using Teuchos::RCP;
432  using std::endl;
433  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
434  "RCP<const Map>, ptr, ind, val[, params]): ";
435  const char suffix[] =
436  ". Please report this bug to the Tpetra developers.";
437  const bool debug = Details::Behavior::debug("CrsMatrix");
438  const bool verbose = Details::Behavior::verbose("CrsMatrix");
439 
   // prefix is only created, and only dereferenced, when verbose is true.
440  std::unique_ptr<std::string> prefix;
441  if (verbose) {
442  prefix = this->createPrefix(
443  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
444  std::ostringstream os;
445  os << *prefix << "Start" << endl;
446  std::cerr << os.str ();
447  }
448 
449  // Check the user's input. Note that this might throw only on
450  // some processes but not others, causing deadlock. We prefer
451  // deadlock due to exceptions to segfaults, because users can
452  // catch exceptions.
453  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
454  (values.extent(0) != columnIndices.extent(0),
455  std::invalid_argument, "values.extent(0)=" << values.extent(0)
456  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
457  << ".");
   // Debug-only consistency check: the last rowPointers entry is read
   // with getEntryOnHost and compared against both array extents.
458  if (debug && rowPointers.extent(0) != 0) {
459  const size_t numEnt =
460  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
461  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
462  (numEnt != size_t(columnIndices.extent(0)) ||
463  numEnt != size_t(values.extent(0)),
464  std::invalid_argument, "Last entry of rowPointers says that "
465  "the matrix has " << numEnt << " entr"
466  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
467  "columnIndices and values don't match this. "
468  "columnIndices.extent(0)=" << columnIndices.extent (0)
469  << " and values.extent(0)=" << values.extent (0) << ".");
470  }
471 
472  RCP<crs_graph_type> graph;
473  try {
474  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
475  columnIndices, params));
476  }
477  catch (std::exception& e) {
478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
479  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
480  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
481  << e.what ());
482  }
483  // The newly created CrsGraph _must_ have a local graph at this
484  // point. We don't really care whether CrsGraph's constructor
485  // deep-copies or shallow-copies the input, but the dimensions
486  // have to be right. That's how we tell whether the CrsGraph has
487  // a local graph.
488  auto lclGraph = graph->getLocalGraphDevice ();
489  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
490  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
491  lclGraph.entries.extent (0) != columnIndices.extent (0),
492  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
493  "ind[, params]) did not set the local graph correctly." << suffix);
494  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
495  (lclGraph.entries.extent (0) != values.extent (0),
496  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
497  "params]) did not set the local graph correctly. "
498  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
499  << " != values.extent(0) = " << values.extent (0) << suffix);
500 
501  // myGraph_ not null means that the matrix owns the graph. This
502  // is true because the column indices come in as nonconst,
503  // implying shared ownership.
504  myGraph_ = graph;
505  staticGraph_ = graph;
506 
507  // The graph may not be fill complete yet. However, it is locally
508  // indexed (since we have a column Map) and has a fixed structure
509  // (due to the input arrays). The caller-supplied values are
510  // wrapped here rather than allocated.
511  // Note that the local matrix's number of columns comes from the
512  // column Map, not the domain Map.
513 
514  valuesPacked_wdv = values_wdv_type(values);
515  valuesUnpacked_wdv = valuesPacked_wdv;
516 
517  // FIXME (22 Jun 2016) I would very much like to get rid of
518  // k_values1D_ at some point. I find it confusing to have all
519  // these extra references lying around.
520 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
521 
523  if (verbose) {
524  std::ostringstream os;
525  os << *prefix << "Done" << endl;
526  std::cerr << os.str();
527  }
528  }
529 
// Constructor taking row and column Maps plus three Teuchos::ArrayRCP
// arrays: ptr, ind and val.
//
// A crs_graph_type is created from (rowMap, colMap, ptr, ind, params)
// and stored in myGraph_ and staticGraph_; a std::exception from that
// constructor is reported as std::runtime_error.  The local graph's
// row_map / entries extents are then required to match ptr.size() and
// ind.size() (std::logic_error otherwise).  Finally the values are
// obtained by passing `val`, reinterpreted as impl_scalar_type, to
// getKokkosViewDeepCopy<device_type>, and the result is used for both
// valuesPacked_wdv and valuesUnpacked_wdv.
//
// storageStatus_ is initialized to Details::STORAGE_1D_PACKED.  The
// size of `val` is not compared against `ind` in this body.
530  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
532  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
533  const Teuchos::RCP<const map_type>& colMap,
534  const Teuchos::ArrayRCP<size_t>& ptr,
535  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
536  const Teuchos::ArrayRCP<Scalar>& val,
537  const Teuchos::RCP<Teuchos::ParameterList>& params) :
538  dist_object_type (rowMap),
539  storageStatus_ (Details::STORAGE_1D_PACKED)
540  {
541  using Kokkos::Compat::getKokkosViewDeepCopy;
542  using Teuchos::av_reinterpret_cast;
543  using Teuchos::RCP;
544  using values_type = typename local_matrix_device_type::values_type;
545  using IST = impl_scalar_type;
546  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
547  "RCP<const Map>, ptr, ind, val[, params]): ";
548 
549  RCP<crs_graph_type> graph;
550  try {
551  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
552  ind, params));
553  }
554  catch (std::exception& e) {
555  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
556  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
557  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
558  "RCP<ParameterList>]) threw an exception: " << e.what ());
559  }
560  // myGraph_ not null means that the matrix owns the graph. This
561  // is true because the column indices come in as nonconst,
562  // implying shared ownership.
563  myGraph_ = graph;
564  staticGraph_ = graph;
565 
566  // The graph may not be fill complete yet. However, it is locally
567  // indexed (since we have a column Map) and has a fixed structure
568  // (due to the input arrays). This means we can allocate the
569  // (1-D) array of values and build the local matrix right now.
570  // Note that the local matrix's number of columns comes from the
571  // column Map, not the domain Map.
572 
573  // The graph _must_ have a local graph at this point. We don't
574  // really care whether CrsGraph's constructor deep-copies or
575  // shallow-copies the input, but the dimensions have to be right.
576  // That's how we tell whether the CrsGraph has a local graph.
577  auto lclGraph = staticGraph_->getLocalGraphDevice ();
578  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
579  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
580  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
581  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
582  "ptr, ind[, params]) did not set the local graph correctly. "
583  "Please report this bug to the Tpetra developers.");
584 
   // val () yields a view of the whole ArrayRCP; it is reinterpreted
   // as IST before being handed to getKokkosViewDeepCopy.
585  values_type valIn =
586  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
587  valuesPacked_wdv = values_wdv_type(valIn);
588  valuesUnpacked_wdv = valuesPacked_wdv;
589 
590  // FIXME (22 Jun 2016) I would very much like to get rid of
591  // k_values1D_ at some point. I find it confusing to have all
592  // these extra references lying around.
593 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
594 
596  }
597 
// Constructor taking row and column Maps and a local matrix; produces
// a matrix whose fillComplete_ flag is initialized to true.
//
// A crs_graph_type is created from (rowMap, colMap, lclMatrix.graph,
// params); a std::exception from that constructor is reported as
// std::runtime_error, and a graph that is not fill complete as
// std::logic_error.  The graph is stored in myGraph_ and staticGraph_,
// and lclMatrix.values is used for valuesPacked_wdv and
// valuesUnpacked_wdv.  computeGlobalConstants() is called unless params
// is non-null and its "compute global constants" entry is false
// (the entry defaults to true).  The final two checks assert that the
// object reports itself as fill complete and not fill active.
598  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
600  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
601  const Teuchos::RCP<const map_type>& colMap,
602  const local_matrix_device_type& lclMatrix,
603  const Teuchos::RCP<Teuchos::ParameterList>& params) :
604  dist_object_type (rowMap),
605  storageStatus_ (Details::STORAGE_1D_PACKED),
606  fillComplete_ (true)
607  {
608  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
609  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
610  const char suffix[] =
611  " Please report this bug to the Tpetra developers.";
612 
613  Teuchos::RCP<crs_graph_type> graph;
614  try {
615  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
616  lclMatrix.graph, params));
617  }
618  catch (std::exception& e) {
619  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
620  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
621  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
622  "exception: " << e.what ());
623  }
624  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
625  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
626  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
627  "did not produce a fill-complete graph. Please report this bug to the "
628  "Tpetra developers.");
629  // myGraph_ not null means that the matrix owns the graph. This
630  // is true because the column indices come in as nonconst through
631  // the matrix, implying shared ownership.
632  myGraph_ = graph;
633  staticGraph_ = graph;
634 
635  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
636  valuesUnpacked_wdv = valuesPacked_wdv;
637 
638 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
639 
   // A null params means "use the default", which is to compute.
640  const bool callComputeGlobalConstants = params.get () == nullptr ||
641  params->get ("compute global constants", true);
642  if (callComputeGlobalConstants) {
643  this->computeGlobalConstants ();
644  }
645 
646  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
647  (isFillActive (), std::logic_error,
648  "At the end of a CrsMatrix constructor that should produce "
649  "a fillComplete matrix, isFillActive() is true." << suffix);
650  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
651  (! isFillComplete (), std::logic_error, "At the end of a "
652  "CrsMatrix constructor that should produce a fillComplete "
653  "matrix, isFillComplete() is false." << suffix);
655  }
656 
// Constructor taking a local matrix and four Maps (row, column, domain,
// range); produces a matrix whose fillComplete_ flag is initialized to
// true.
//
// A crs_graph_type is created from (lclMatrix.graph, rowMap, colMap,
// domainMap, rangeMap, params); a std::exception from that constructor
// is reported as std::runtime_error, and a graph that is not fill
// complete as std::logic_error.  The graph is stored in myGraph_ and
// staticGraph_, and lclMatrix.values is used for valuesPacked_wdv and
// valuesUnpacked_wdv.  computeGlobalConstants() is called unless params
// is non-null and its "compute global constants" entry is false
// (the entry defaults to true).  The final two checks assert that the
// object reports itself as fill complete and not fill active.
//
// NOTE(review): the message strings list the Maps before the local
// matrix/graph, whereas the actual argument order puts lclMatrix first.
657  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
659  CrsMatrix (const local_matrix_device_type& lclMatrix,
660  const Teuchos::RCP<const map_type>& rowMap,
661  const Teuchos::RCP<const map_type>& colMap,
662  const Teuchos::RCP<const map_type>& domainMap,
663  const Teuchos::RCP<const map_type>& rangeMap,
664  const Teuchos::RCP<Teuchos::ParameterList>& params) :
665  dist_object_type (rowMap),
666  storageStatus_ (Details::STORAGE_1D_PACKED),
667  fillComplete_ (true)
668  {
669  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
670  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
671  "local_matrix_device_type[, RCP<ParameterList>]): ";
672  const char suffix[] =
673  " Please report this bug to the Tpetra developers.";
674 
675  Teuchos::RCP<crs_graph_type> graph;
676  try {
677  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
678  domainMap, rangeMap, params));
679  }
680  catch (std::exception& e) {
681  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
682  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
683  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
684  "RCP<ParameterList>]) threw an exception: " << e.what ());
685  }
686  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
687  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
688  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
689  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
690  "not produce a fillComplete graph." << suffix);
691  // myGraph_ not null means that the matrix owns the graph. This
692  // is true because the column indices come in as nonconst through
693  // the matrix, implying shared ownership.
694  myGraph_ = graph;
695  staticGraph_ = graph;
696 
697  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
698  valuesUnpacked_wdv = valuesPacked_wdv;
699 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
700 
   // A null params means "use the default", which is to compute.
701  const bool callComputeGlobalConstants = params.get () == nullptr ||
702  params->get ("compute global constants", true);
703  if (callComputeGlobalConstants) {
704  this->computeGlobalConstants ();
705  }
706 
707  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
708  (isFillActive (), std::logic_error,
709  "At the end of a CrsMatrix constructor that should produce "
710  "a fillComplete matrix, isFillActive() is true." << suffix);
711  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
712  (! isFillComplete (), std::logic_error, "At the end of a "
713  "CrsMatrix constructor that should produce a fillComplete "
714  "matrix, isFillComplete() is false." << suffix);
716  }
717 
// Constructor taking a local matrix, four Maps (row, column, domain,
// range), an Import and an Export; produces a matrix whose
// fillComplete_ flag is initialized to true.
//
// A crs_graph_type is created from (lclMatrix.graph, rowMap, colMap,
// domainMap, rangeMap, importer, exporter, params); a std::exception
// from that constructor is reported as std::runtime_error, and a graph
// that is not fill complete as std::logic_error.  The graph is stored
// in myGraph_ and staticGraph_, and lclMatrix.values is used for
// valuesPacked_wdv and valuesUnpacked_wdv.  computeGlobalConstants() is
// called unless params is non-null and its "compute global constants"
// entry is false (the entry defaults to true).  The final two checks
// assert that the object reports itself as fill complete and not fill
// active.
718  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
720  CrsMatrix (const local_matrix_device_type& lclMatrix,
721  const Teuchos::RCP<const map_type>& rowMap,
722  const Teuchos::RCP<const map_type>& colMap,
723  const Teuchos::RCP<const map_type>& domainMap,
724  const Teuchos::RCP<const map_type>& rangeMap,
725  const Teuchos::RCP<const import_type>& importer,
726  const Teuchos::RCP<const export_type>& exporter,
727  const Teuchos::RCP<Teuchos::ParameterList>& params) :
728  dist_object_type (rowMap),
729  storageStatus_ (Details::STORAGE_1D_PACKED),
730  fillComplete_ (true)
731  {
732  using Teuchos::rcp;
733  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
734  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
735  const char suffix[] =
736  " Please report this bug to the Tpetra developers.";
737 
738  Teuchos::RCP<crs_graph_type> graph;
739  try {
740  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
741  domainMap, rangeMap, importer,
742  exporter, params));
743  }
744  catch (std::exception& e) {
745  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
746  (true, std::runtime_error, "CrsGraph constructor "
747  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
748  "params) threw: " << e.what ());
749  }
750  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
751  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
752  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
753  "Export, params) did not produce a fill-complete graph. "
754  "Please report this bug to the Tpetra developers.");
755  // myGraph_ not null means that the matrix owns the graph. This
756  // is true because the column indices come in as nonconst through
757  // the matrix, implying shared ownership.
758  myGraph_ = graph;
759  staticGraph_ = graph;
760 
761  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
762  valuesUnpacked_wdv = valuesPacked_wdv;
763 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
764 
   // A null params means "use the default", which is to compute.
765  const bool callComputeGlobalConstants = params.get () == nullptr ||
766  params->get ("compute global constants", true);
767  if (callComputeGlobalConstants) {
768  this->computeGlobalConstants ();
769  }
770 
771  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
772  (isFillActive (), std::logic_error,
773  "At the end of a CrsMatrix constructor that should produce "
774  "a fillComplete matrix, isFillActive() is true." << suffix);
775  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
776  (! isFillComplete (), std::logic_error, "At the end of a "
777  "CrsMatrix constructor that should produce a fillComplete "
778  "matrix, isFillComplete() is false." << suffix);
780  }
781 
// Constructor that builds a matrix from `source`, either copying or
// viewing its values according to copyOrView.
//
// Construction is first delegated to the (graph, values) constructor
// with source.getCrsGraph() and source.getLocalValuesView().  The body
// then requires source.isFillComplete() (std::invalid_argument
// otherwise) and branches on copyOrView:
//   - Teuchos::Copy: allocate a new uninitialized values view of the
//     same extent, deep-copy the source values into it, install it as
//     valuesPacked_wdv / valuesUnpacked_wdv, and call fillComplete with
//     the source's domain and range Maps.
//   - Teuchos::View: nothing further is done.
//   - anything else: std::invalid_argument.
//
// NOTE(review): the inner `if (source.isFillComplete ())` is always
// true at that point, because the check at the top of the body has
// already rejected the opposite case.
// NOTE(review): the delegated constructor runs before that check, so
// the "must be fillComplete" message here is only reached if the
// delegated constructor did not already report a problem -- confirm.
782  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
785  const Teuchos::DataAccess copyOrView)
786  : CrsMatrix (source.getCrsGraph (), source.getLocalValuesView ())
787  {
788  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
789  "const CrsMatrix&, const Teuchos::DataAccess): ";
790  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
791  (! source.isFillComplete (), std::invalid_argument,
792  "Source graph must be fillComplete().");
793 
794  if (copyOrView == Teuchos::Copy) {
795  using values_type = typename local_matrix_device_type::values_type;
796  values_type vals = source.getLocalValuesView ();
797  using Kokkos::view_alloc;
798  using Kokkos::WithoutInitializing;
   // Skip initialization: every entry is overwritten by deep_copy below.
799  values_type newvals (view_alloc ("val", WithoutInitializing),
800  vals.extent (0));
801  Kokkos::deep_copy (newvals, vals);
802  valuesPacked_wdv = values_wdv_type(newvals);
803  valuesUnpacked_wdv = valuesPacked_wdv;
804 // k_values1D_ = newvals;
805  if (source.isFillComplete ()) {
806  fillComplete (source.getDomainMap (), source.getRangeMap ());
807  }
808  }
809  else if (copyOrView == Teuchos::View) {
810  return;
811  }
812  else {
813  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
814  (true, std::invalid_argument, "Second argument 'copyOrView' "
815  "has an invalid value " << copyOrView << ". Valid values "
816  "include Teuchos::Copy = " << Teuchos::Copy << " and "
817  "Teuchos::View = " << Teuchos::View << ".");
818  }
819  }
820 
// Exchanges state between this matrix and `crs_matrix`, member by member,
// using std::swap. (The listing omits the line with the qualified function
// name and parameter list.)
//
// Members exchanged: the cached import/export multivectors, both graph
// pointers, the packed and unpacked value containers, the storage status,
// the fillComplete flag, the nonlocal entries, and the cached Frobenius norm.
// NOTE(review): only the members listed here are swapped; whether the class
// has further state that should also be exchanged is not visible from here.
821  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
822  void
825  {
826  std::swap(crs_matrix.importMV_, this->importMV_);
827  std::swap(crs_matrix.exportMV_, this->exportMV_);
828  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
829  std::swap(crs_matrix.myGraph_, this->myGraph_);
830  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
831  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
832  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
833  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
834  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
835  std::swap(crs_matrix.frobNorm_, this->frobNorm_);
836  }
837 
// Returns the communicator of the matrix's graph; forwards to
// getCrsGraphRef().getComm().
838  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
839  Teuchos::RCP<const Teuchos::Comm<int> >
841  getComm () const {
842  return getCrsGraphRef ().getComm ();
843  }
844 
// Returns the profile type reported by the matrix's graph; forwards to
// getCrsGraphRef().getProfileType(). (The return-type line is missing from
// this listing.)
845  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
848  getProfileType () const {
849  return this->getCrsGraphRef ().getProfileType ();
850  }
851 
// Returns the matrix's own fillComplete_ flag.
852  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
853  bool
855  isFillComplete () const {
856  return fillComplete_;
857  }
858 
// Returns the negation of fillComplete_, so exactly one of isFillActive()
// and isFillComplete() is true at any time.
859  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
860  bool
862  isFillActive () const {
863  return ! fillComplete_;
864  }
865 
// Reports whether the matrix's graph has optimized storage; forwards to
// getCrsGraphRef().isStorageOptimized().
866  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
867  bool
869  isStorageOptimized () const {
870  return this->getCrsGraphRef ().isStorageOptimized ();
871  }
872 
// Reports whether the matrix's graph is locally indexed; forwards to
// getCrsGraphRef().isLocallyIndexed().
873  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
874  bool
876  isLocallyIndexed () const {
877  return getCrsGraphRef ().isLocallyIndexed ();
878  }
879 
// Reports whether the matrix's graph is globally indexed; forwards to
// getCrsGraphRef().isGloballyIndexed().
880  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
881  bool
883  isGloballyIndexed () const {
884  return getCrsGraphRef ().isGloballyIndexed ();
885  }
886 
// Reports whether the matrix's graph has a column Map; forwards to
// getCrsGraphRef().hasColMap().
887  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
888  bool
890  hasColMap () const {
891  return getCrsGraphRef ().hasColMap ();
892  }
893 
// Returns the graph's global entry count; forwards to
// getCrsGraphRef().getGlobalNumEntries(). (The return-type line is missing
// from this listing.)
894  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
897  getGlobalNumEntries () const {
898  return getCrsGraphRef ().getGlobalNumEntries ();
899  }
900 
// Returns the graph's per-process ("node") entry count; forwards to
// getCrsGraphRef().getNodeNumEntries().
901  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
902  size_t
904  getNodeNumEntries () const {
905  return getCrsGraphRef ().getNodeNumEntries ();
906  }
907 
// Returns the graph's global row count; forwards to
// getCrsGraphRef().getGlobalNumRows(). (The return-type line is missing from
// this listing.)
908  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
911  getGlobalNumRows () const {
912  return getCrsGraphRef ().getGlobalNumRows ();
913  }
914 
// Returns the graph's global column count; forwards to
// getCrsGraphRef().getGlobalNumCols(). (The return-type line is missing from
// this listing.)
915  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
918  getGlobalNumCols () const {
919  return getCrsGraphRef ().getGlobalNumCols ();
920  }
921 
// Returns the graph's per-process ("node") row count; forwards to
// getCrsGraphRef().getNodeNumRows().
922  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
923  size_t
925  getNodeNumRows () const {
926  return getCrsGraphRef ().getNodeNumRows ();
927  }
928 
// Returns the graph's per-process ("node") column count; forwards to
// getCrsGraphRef().getNodeNumCols().
929  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
930  size_t
932  getNodeNumCols () const {
933  return getCrsGraphRef ().getNodeNumCols ();
934  }
935 
936 
// Returns the number of entries the graph reports for the row with global
// index `globalRow`; forwards to getCrsGraphRef().getNumEntriesInGlobalRow().
937  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
938  size_t
940  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
941  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
942  }
943 
// Returns the number of entries the graph reports for the row with local
// index `localRow`; forwards to getCrsGraphRef().getNumEntriesInLocalRow().
944  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
945  size_t
947  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
948  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
949  }
950 
// Forwards to getCrsGraphRef().getGlobalMaxNumRowEntries().
// NOTE(review): the line carrying this function's name is missing from the
// listing; presumably this is getGlobalMaxNumRowEntries() const, matching the
// graph method it calls -- confirm against the original header.
951  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
952  size_t
955  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
956  }
957 
// Returns the graph's per-process ("node") maximum number of entries in any
// row; forwards to getCrsGraphRef().getNodeMaxNumRowEntries().
958  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
959  size_t
961  getNodeMaxNumRowEntries () const {
962  return getCrsGraphRef ().getNodeMaxNumRowEntries ();
963  }
964 
// Returns the index base of the matrix's row Map (getRowMap()->getIndexBase()).
// NOTE(review): dereferences the row Map pointer without a null check.
965  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
966  GlobalOrdinal
968  getIndexBase () const {
969  return getRowMap ()->getIndexBase ();
970  }
971 
// Returns the row Map of the matrix's graph; forwards to
// getCrsGraphRef().getRowMap().
972  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
973  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
975  getRowMap () const {
976  return getCrsGraphRef ().getRowMap ();
977  }
978 
// Returns the column Map of the matrix's graph; forwards to
// getCrsGraphRef().getColMap().
979  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
980  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
982  getColMap () const {
983  return getCrsGraphRef ().getColMap ();
984  }
985 
// Returns the domain Map of the matrix's graph; forwards to
// getCrsGraphRef().getDomainMap().
986  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
987  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
989  getDomainMap () const {
990  return getCrsGraphRef ().getDomainMap ();
991  }
992 
// Returns the range Map of the matrix's graph; forwards to
// getCrsGraphRef().getRangeMap().
993  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
994  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
996  getRangeMap () const {
997  return getCrsGraphRef ().getRangeMap ();
998  }
999 
// Returns the matrix's graph as a RowGraph pointer: staticGraph_ when it is
// non-null, otherwise myGraph_ (which may itself be null; no check is made).
1000  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1001  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
1003  getGraph () const {
1004  if (staticGraph_ != Teuchos::null) {
1005  return staticGraph_;
1006  }
1007  return myGraph_;
1008  }
1009 
// Returns the matrix's graph as a CrsGraph pointer: staticGraph_ when it is
// non-null, otherwise myGraph_ (which may itself be null; no check is made).
1010  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1011  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
1013  getCrsGraph () const {
1014  if (staticGraph_ != Teuchos::null) {
1015  return staticGraph_;
1016  }
1017  return myGraph_;
1018  }
1019 
// Returns a reference to the matrix's graph: *staticGraph_ when that pointer
// is non-null, otherwise *myGraph_. (The return-type line is missing from
// this listing.)
//
// The null check on myGraph_ is compiled in only when HAVE_TPETRA_DEBUG is
// defined; in that case a std::logic_error is reported if both pointers are
// null. Without HAVE_TPETRA_DEBUG, myGraph_ is dereferenced unchecked.
1020  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1023  getCrsGraphRef () const
1024  {
1025 #ifdef HAVE_TPETRA_DEBUG
1026  constexpr bool debug = true;
1027 #else
1028  constexpr bool debug = false;
1029 #endif // HAVE_TPETRA_DEBUG
1030 
1031  if (! this->staticGraph_.is_null ()) {
1032  return * (this->staticGraph_);
1033  }
1034  else {
1035  if (debug) {
1036  const char tfecfFuncName[] = "getCrsGraphRef: ";
1037  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1038  (this->myGraph_.is_null (), std::logic_error,
1039  "Both staticGraph_ and myGraph_ are null. "
1040  "Please report this bug to the Tpetra developers.");
1041  }
1042  return * (this->myGraph_);
1043  }
1044  }
1045 
// Deprecated accessor, compiled only when TPETRA_ENABLE_DEPRECATED_CODE is
// defined. It simply returns getLocalMatrixDevice(). (The return-type line is
// missing from this listing.)
1046 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1047  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1049  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1050  getLocalMatrix () const
1051  {
1052  return getLocalMatrixDevice();
1053  }
1054 #endif // TPETRA_ENABLE_DEPRECATED_CODE
// Builds and returns a local_matrix_device_type on each call from:
//   - the number of elements in staticGraph_'s column Map (numCols),
//   - the device view of the packed values, requested with ReadWrite access,
//   - staticGraph_'s local device graph.
// (The return-type line is missing from this listing.)
// NOTE(review): uses staticGraph_ directly rather than getCrsGraphRef(), and
// dereferences it and its column Map without null checks.
1056  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1059  getLocalMatrixDevice () const
1060  {
1061  auto numCols = staticGraph_->getColMap()->getNodeNumElements();
1062  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
1063  numCols,
1064  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
1065  staticGraph_->getLocalGraphDevice());
1066  }
1067 
// Host counterpart of getLocalMatrixDevice(): builds and returns a
// local_matrix_host_type on each call from the column Map's element count,
// the host view of the packed values (ReadWrite access), and staticGraph_'s
// local host graph.
// NOTE(review): dereferences staticGraph_ and its column Map without null
// checks.
1068  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1069  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
1071  getLocalMatrixHost () const
1072  {
1073  auto numCols = staticGraph_->getColMap()->getNodeNumElements();
1074  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
1075  valuesPacked_wdv.getHostView(Access::ReadWrite),
1076  staticGraph_->getLocalGraphHost());
1077  }
1078 
// Returns a shared_ptr to a local_multiply_op_type wrapping a freshly built
// local device matrix. (The line with the function name is missing from this
// listing; the kernel label below identifies it as getLocalMultiplyOperator.)
//
// When both HAVE_TPETRACORE_CUDA and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE are
// defined, Node is KokkosCudaWrapperNode, and the local entry count fits in
// LocalOrdinal, the operator is additionally given a LocalOrdinal-typed copy
// of the row offsets (ordinalRowptrs), created on first use and then reused.
1079 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1080  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1081  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1084  {
1085  auto localMatrix = getLocalMatrixDevice();
1086 #ifdef HAVE_TPETRACORE_CUDA
1087 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
1088  if(this->getNodeNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()) &&
1089  std::is_same<Node, Kokkos::Compat::KokkosCudaWrapperNode>::value)
1090  {
// A null data pointer means the converted row offsets have not been built yet.
1091  if(this->ordinalRowptrs.data() == nullptr)
1092  {
1093  auto originalRowptrs = localMatrix.graph.row_map;
1094  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1095  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1096  this->ordinalRowptrs = ordinal_rowptrs_type(
1097  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1098  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1099  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1100  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1101  KOKKOS_LAMBDA(LocalOrdinal i)
1102  {
1103  ordinalRowptrs_(i) = originalRowptrs(i);
1104  });
1105  }
1106  //return local operator using ordinalRowptrs
1107  return std::make_shared<local_multiply_op_type>(
1108  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1109  }
1110 #endif
1111 #endif
// Fallback path: operator without the converted row offsets.
// NOTE(review): this calls getLocalMatrixDevice() a second time instead of
// reusing `localMatrix` from the top of the function.
1112 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1113  return std::make_shared<local_multiply_op_type>(
1114  std::make_shared<local_matrix_device_type>(
1115  getLocalMatrixDevice()));
1116  }
// Reports whether the matrix uses a static (not matrix-owned) graph, which
// this implementation defines as myGraph_ being null.
1118  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1119  bool
1121  isStaticGraph () const {
1122  return myGraph_.is_null ();
1123  }
1124 
// Always returns true.
1125  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1126  bool
1128  hasTransposeApply () const {
1129  return true;
1130  }
1131 
// Always returns true.
1132  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1133  bool
1135  supportsRowViews () const {
1136  return true;
1137  }
1138 
// Allocates the matrix's unpacked values storage, first allocating the
// graph's indices when the caller says they are not allocated yet.
//
// Parameters:
//   lg      - passed through to myGraph_->allocateIndices(); also printed as
//             "Local"/"Global" in verbose output.
//   gas     - the caller's claim about whether the graph's indices are
//             already allocated; GraphNotYetAllocated triggers allocation.
//   verbose - when true, progress messages are written to std::cerr.
//
// Consistency checks between `gas`, staticGraph_ and myGraph_ run only when
// Behavior::debug("CrsMatrix") is true; failures are reported as
// std::logic_error. Exceptions from allocateIndices() are re-reported as
// std::runtime_error. The row-offsets length check further down is not
// guarded by `debug` and therefore always runs.
// (The listing omits the qualified-name line and one using-declaration.)
1139  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1140  void
1142  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1143  const bool verbose)
1144  {
1145  using Details::Behavior;
1147  using std::endl;
1148  const char tfecfFuncName[] = "allocateValues: ";
1149  const char suffix[] =
1150  " Please report this bug to the Tpetra developers.";
1151  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1152 
// `prefix` is only created in verbose mode; every later dereference of it is
// inside an `if (verbose)` block.
1153  std::unique_ptr<std::string> prefix;
1154  if (verbose) {
1155  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1156  std::ostringstream os;
1157  os << *prefix << "lg: "
1158  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1159  << ", gas: Graph"
1160  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1161  << "Allocated" << endl;
1162  std::cerr << os.str();
1163  }
1164 
1165  const bool debug = Behavior::debug("CrsMatrix");
1166  if (debug) {
1167  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1168  (this->staticGraph_.is_null (), std::logic_error,
1169  "staticGraph_ is null." << suffix);
1170 
1171  // If the graph indices are already allocated, then gas should be
1172  // GraphAlreadyAllocated. Otherwise, gas should be
1173  // GraphNotYetAllocated.
1174  if ((gas == GraphAlreadyAllocated) !=
1175  staticGraph_->indicesAreAllocated ()) {
// The three fragments are combined below, with "not " inserted in different
// places, to produce the message for each direction of the mismatch.
1176  const char err1[] = "The caller has asserted that the graph "
1177  "is ";
1178  const char err2[] = "already allocated, but the static graph "
1179  "says that its indices are ";
1180  const char err3[] = "already allocated. ";
1181  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1182  (gas == GraphAlreadyAllocated &&
1183  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1184  err1 << err2 << "not " << err3 << suffix);
1185  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1186  (gas != GraphAlreadyAllocated &&
1187  staticGraph_->indicesAreAllocated (), std::logic_error,
1188  err1 << "not " << err2 << err3 << suffix);
1189  }
1190 
1191  // If the graph is unallocated, then it had better be a
1192  // matrix-owned graph. ("Matrix-owned graph" means that the
1193  // matrix gets to define the graph structure. If the CrsMatrix
1194  // constructor that takes an RCP<const CrsGraph> was used, then
1195  // the matrix does _not_ own the graph.)
1196  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1197  (! this->staticGraph_->indicesAreAllocated () &&
1198  this->myGraph_.is_null (), std::logic_error,
1199  "The static graph says that its indices are not allocated, "
1200  "but the graph is not owned by the matrix." << suffix);
1201  }
1202 
1203  if (gas == GraphNotYetAllocated) {
1204  if (debug) {
1205  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1206  (this->myGraph_.is_null (), std::logic_error,
1207  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1208  }
// Allocate the graph's indices; any exception is re-reported as a
// std::runtime_error carrying this function's prefix.
1209  try {
1210  this->myGraph_->allocateIndices (lg, verbose);
1211  }
1212  catch (std::exception& e) {
1213  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1214  (true, std::runtime_error, "CrsGraph::allocateIndices "
1215  "threw an exception: " << e.what ());
1216  }
1217  catch (...) {
1218  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1219  (true, std::runtime_error, "CrsGraph::allocateIndices "
1220  "threw an exception not a subclass of std::exception.");
1221  }
1222  }
1223 
1224  // Allocate matrix values.
1225  // "Static profile" means that the number of matrix entries in
1226  // each row was fixed at the time the CrsMatrix constructor was
1227  // called. This lets us use 1-D storage for the matrix's
1228  // values. ("1-D storage" means the same as that used by the
1229  // three arrays in the compressed sparse row storage format.)
1230 
1231  if (debug) {
1232  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1233  (this->staticGraph_.is_null (), std::logic_error,
1234  "this->getProfileType() == StaticProfile, but staticGraph_ "
1235  "is null." << suffix);
1236  }
1237 
1238  const size_t lclNumRows = this->staticGraph_->getNodeNumRows ();
1239  typename Graph::local_graph_device_type::row_map_type k_ptrs =
1240  this->staticGraph_->rowPtrsUnpacked_dev_;
// This length check is unconditional (not inside `if (debug)`).
1241  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1242  (k_ptrs.extent (0) != lclNumRows+1, std::logic_error,
1243  "With StaticProfile, row offsets array has length "
1244  << k_ptrs.extent (0) << " != (lclNumRows+1) = "
1245  << (lclNumRows+1) << ".");
1246 
// The last entry of the host copy of the unpacked row offsets gives the
// number of value slots to allocate.
1247  const size_t lclTotalNumEntries =
1248  this->staticGraph_->rowPtrsUnpacked_host_(lclNumRows);
1249 
1250  // Allocate array of (packed???) matrix values.
1251  using values_type = typename local_matrix_device_type::values_type;
1252  if (verbose) {
1253  std::ostringstream os;
1254  os << *prefix << "Allocate values_wdv: Pre "
1255  << valuesUnpacked_wdv.extent(0) << ", post "
1256  << lclTotalNumEntries << endl;
1257  std::cerr << os.str();
1258  }
1259 // this->k_values1D_ =
1260  valuesUnpacked_wdv = values_wdv_type(
1261  values_type("Tpetra::CrsMatrix::values",
1262  lclTotalNumEntries));
1263  }
1264 
// Returns, through the three output arguments, read-only views of the row
// offsets and packed column indices held by the matrix's graph, and of the
// matrix's unpacked values (host view, reinterpreted as Scalar).
//
// Errors are reported as std::runtime_error when the incoming columnIndices
// and values differ in size, when getCrsGraph() is null, or when either graph
// accessor throws a std::exception (other exception types are not caught
// here).
// NOTE(review): the size check at the top is applied to the arguments as
// passed in, before they are overwritten -- confirm that this is intended for
// output parameters.
1265  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1266  void
1267  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1268  getAllValues (Teuchos::ArrayRCP<const size_t>& rowPointers,
1269  Teuchos::ArrayRCP<const LocalOrdinal>& columnIndices,
1270  Teuchos::ArrayRCP<const Scalar>& values) const
1271  {
1272  using Teuchos::RCP;
1273  const char tfecfFuncName[] = "getAllValues: ";
1274  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1275  columnIndices.size () != values.size (), std::runtime_error,
1276  "Requires that columnIndices and values are the same size.");
1277 
1278  RCP<const crs_graph_type> relevantGraph = getCrsGraph ();
1279  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1280  relevantGraph.is_null (), std::runtime_error,
1281  "Requires that getCrsGraph() is not null.");
1282  try {
1283  rowPointers = relevantGraph->getNodeRowPtrs ();
1284  }
1285  catch (std::exception &e) {
1286  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1287  true, std::runtime_error,
1288  "Caught exception while calling graph->getNodeRowPtrs(): "
1289  << e.what ());
1290  }
1291  try {
1292  columnIndices = relevantGraph->getNodePackedIndices ();
1293  }
1294  catch (std::exception &e) {
1295  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1296  true, std::runtime_error,
1297  "Caught exception while calling graph->getNodePackedIndices(): "
1298  << e.what ());
1299  }
// The values come from the host view of valuesUnpacked_wdv, requested with
// read-only access, and are then reinterpreted from impl_scalar_type to Scalar.
1300  Teuchos::ArrayRCP<const impl_scalar_type> vals =
1301 // Kokkos::Compat::persistingView (k_values1D_);
1302  Kokkos::Compat::persistingView (valuesUnpacked_wdv.getHostView(Access::ReadOnly));
1303  values = Teuchos::arcp_reinterpret_cast<const Scalar> (vals);
1304  }
1305 
// Returns, through `values`, a mutable view of the matrix's unpacked values,
// reinterpreted from impl_scalar_type to Scalar.
//
// Reports std::runtime_error when getCrsGraph() is null.
//
// The values are taken from the host view of valuesUnpacked_wdv, requested
// with ReadWrite access because the caller receives a non-const array. This
// matches the const overload of getAllValues, which reads the same container
// with ReadOnly access; the earlier k_values1D_ member is no longer assigned
// anywhere in the visible code, so reading it here would not reflect the
// matrix's current values.
// (The listing omits the line with the qualified class name.)
1306  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1307  void
1309  getAllValues(Teuchos::ArrayRCP<Scalar>& values) {
1310  using Teuchos::RCP;
1311  const char tfecfFuncName[] = "getAllValues: ";
1312  RCP<const crs_graph_type> relevantGraph = getCrsGraph ();
1313  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1314  relevantGraph.is_null (), std::runtime_error,
1315  "Requires that getCrsGraph() is not null.");
1316  Teuchos::ArrayRCP<impl_scalar_type> vals =
1317  Kokkos::Compat::persistingView (valuesUnpacked_wdv.getHostView(Access::ReadWrite));
1318  values = Teuchos::arcp_reinterpret_cast<Scalar> (vals);
1319  }
1320 
1321 
1322  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1323  void
1325  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1326  {
1328  using ::Tpetra::Details::getEntryOnHost;
1329  using Teuchos::arcp_const_cast;
1330  using Teuchos::Array;
1331  using Teuchos::ArrayRCP;
1332  using Teuchos::null;
1333  using Teuchos::RCP;
1334  using Teuchos::rcp;
1335  using std::endl;
1336  using row_map_type = typename local_graph_device_type::row_map_type;
1337  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1338  using values_type = typename local_matrix_device_type::values_type;
1339  Details::ProfilingRegion regionFLGAM
1340  ("Tpetra::CrsGraph::fillLocalGraphAndMatrix");
1341 
1342  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1343  "fillComplete or expertStaticFillComplete): ";
1344  const char suffix[] =
1345  " Please report this bug to the Tpetra developers.";
1346  const bool debug = Details::Behavior::debug("CrsMatrix");
1347  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1348 
1349  std::unique_ptr<std::string> prefix;
1350  if (verbose) {
1351  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1352  std::ostringstream os;
1353  os << *prefix << endl;
1354  std::cerr << os.str ();
1355  }
1356 
1357  if (debug) {
1358  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1359  // matrix owns the graph, which means myGraph_ is not null.
1360  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1361  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1362  "(myGraph_) is null. This means that the matrix has a "
1363  "const (a.k.a. \"static\") graph. fillComplete or "
1364  "expertStaticFillComplete should never call "
1365  "fillLocalGraphAndMatrix in that case." << suffix);
1366  }
1367 
1368  const size_t lclNumRows = this->getNodeNumRows ();
1369 
1370  // This method's goal is to fill in the three arrays (compressed
1371  // sparse row format) that define the sparse graph's and matrix's
1372  // structure, and the sparse matrix's values.
1373  //
1374  // Get references to the data in myGraph_, so we can modify them
1375  // as well. Note that we only call fillLocalGraphAndMatrix() if
1376  // the matrix owns the graph, which means myGraph_ is not null.
1377 
1378  typedef decltype (myGraph_->k_numRowEntries_) row_entries_type;
1379 
1380  // StaticProfile means that the matrix's column indices and
1381  // values are currently stored in a 1-D format, with row offsets
1382  // in rowPtrsUnpacked_ and local column indices in lclIndsUnpacked_wdv.
1383 
1384  // StaticProfile also means that the graph's array of row
1385  // offsets must already be allocated.
1386  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1387  myGraph_->rowPtrsUnpacked_dev_;
1388 
1389  if (debug) {
1390  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1391  (curRowOffsets.extent (0) == 0, std::logic_error,
1392  "(StaticProfile branch) curRowOffsets.extent(0) == 0.");
1393  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1394  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1395  "(StaticProfile branch) curRowOffsets.extent(0) = "
1396  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1397  << (lclNumRows + 1) << ".");
1398  const size_t numOffsets = curRowOffsets.extent (0);
1399  const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
1400  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1401  (numOffsets != 0 &&
1402  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1403  std::logic_error, "(StaticProfile branch) numOffsets = " <<
1404  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1405  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1406  << numOffsets << ") = " << valToCheck << ".");
1407  }
1408 
1409  if (myGraph_->getNodeNumEntries() !=
1410  myGraph_->getNodeAllocationSize()) {
1411 
1412  // Use the nonconst version of row_map_type for k_ptrs,
1413  // because row_map_type is const and we need to modify k_ptrs here.
1414  typename row_map_type::non_const_type k_ptrs;
1415  row_map_type k_ptrs_const;
1416  lclinds_1d_type k_inds;
1417  values_type k_vals;
1419  if (verbose) {
1420  std::ostringstream os;
1421  const auto numEnt = myGraph_->getNodeNumEntries();
1422  const auto allocSize = myGraph_->getNodeAllocationSize();
1423  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1424  << ", allocSize=" << allocSize << endl;
1425  std::cerr << os.str ();
1426  }
1427  // The matrix's current 1-D storage is "unpacked." This means
1428  // the row offsets may differ from what the final row offsets
1429  // should be. This could happen, for example, if the user
1430  // specified StaticProfile in the constructor and set an upper
1431  // bound on the number of entries per row, but didn't fill all
1432  // those entries.
1433  if (debug && curRowOffsets.extent (0) != 0) {
1434  const size_t numOffsets =
1435  static_cast<size_t> (curRowOffsets.extent (0));
1436  const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
1437  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1438  (static_cast<size_t> (valToCheck) !=
1439  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1440  std::logic_error, "(StaticProfile unpacked branch) Before "
1441  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1442  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1443  " = " << valuesUnpacked_wdv.extent (0) << ".");
1444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1445  (static_cast<size_t> (valToCheck) !=
1446  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1447  std::logic_error, "(StaticProfile unpacked branch) Before "
1448  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1449  << ") = " << valToCheck
1450  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1451  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1452  }
1453  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1454  // the array of valid entry counts per row.
1455 
1456  // Total number of entries in the matrix on the calling
1457  // process. We will compute this in the loop below. It's
1458  // cheap to compute and useful as a sanity check.
1459  size_t lclTotalNumEntries = 0;
1460  {
1461  // Allocate the packed row offsets array. We use a nonconst
1462  // temporary (packedRowOffsets) here, because k_ptrs is
1463  // const. We will assign packedRowOffsets to k_ptrs below.
1464  if (verbose) {
1465  std::ostringstream os;
1466  os << *prefix << "Allocate packed row offsets: "
1467  << (lclNumRows+1) << endl;
1468  std::cerr << os.str ();
1469  }
1470  typename row_map_type::non_const_type
1471  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1472  typename row_entries_type::const_type numRowEnt_h =
1473  myGraph_->k_numRowEntries_;
1474  // We're computing offsets on device. This function can
1475  // handle numRowEnt_h being a host View.
1476  lclTotalNumEntries =
1477  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1478  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1479  // to use packedRowOffsets in the loop above and assign here.
1480  k_ptrs = packedRowOffsets;
1481  k_ptrs_const = k_ptrs;
1482  }
1483 
1484  if (debug) {
1485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1486  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1487  std::logic_error,
1488  "(StaticProfile unpacked branch) After packing k_ptrs, "
1489  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1490  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1491  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1492  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1493  (valToCheck != lclTotalNumEntries, std::logic_error,
1494  "(StaticProfile unpacked branch) After filling k_ptrs, "
1495  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1496  << " != total number of entries on the calling process = "
1497  << lclTotalNumEntries << ".");
1498  }
1499 
1500  // Allocate the arrays of packed column indices and values.
1501  if (verbose) {
1502  std::ostringstream os;
1503  os << *prefix << "Allocate packed local column indices: "
1504  << lclTotalNumEntries << endl;
1505  std::cerr << os.str ();
1506  }
1507  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1508  if (verbose) {
1509  std::ostringstream os;
1510  os << *prefix << "Allocate packed values: "
1511  << lclTotalNumEntries << endl;
1512  std::cerr << os.str ();
1513  }
1514  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1515 
1516  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1517  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1518  // the packed row offsets array k_ptrs that we created above.
1519  //
1520  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1521  // need to keep around the unpacked row offsets, column
1522  // indices, and values arrays.
1523 
1524  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1525  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1526  using inds_packer_type = pack_functor<
1527  typename Graph::local_graph_device_type::entries_type::non_const_type,
1528  typename Graph::local_inds_dualv_type::t_dev::const_type,
1529  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1530  typename Graph::local_graph_device_type::row_map_type>;
1531  inds_packer_type indsPacker (
1532  k_inds,
1533  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1534  k_ptrs, curRowOffsets);
1535  using exec_space = typename decltype (k_inds)::execution_space;
1536  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1537  Kokkos::parallel_for
1538  ("Tpetra::CrsMatrix pack column indices",
1539  range_type (0, lclNumRows), indsPacker);
1540 
1541  // Pack the values from unpacked valuesUnpacked_wdv into packed
1542  // k_vals. We will replace valuesPacked_wdv below.
1543  using vals_packer_type = pack_functor<
1544  typename values_type::non_const_type,
1545  typename values_type::const_type,
1546  typename row_map_type::non_const_type,
1547  typename row_map_type::const_type>;
1548  vals_packer_type valsPacker (
1549  k_vals,
1550  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1551  k_ptrs, curRowOffsets);
1552  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1553  range_type (0, lclNumRows), valsPacker);
1554 
1555  if (debug) {
1556  const char myPrefix[] = "(StaticProfile \"Optimize Storage\""
1557  "=true branch) After packing, ";
1558  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1559  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1560  << "k_ptrs.extent(0) = 0. This probably means that "
1561  "rowPtrsUnpacked_ was never allocated.");
1562  if (k_ptrs.extent (0) != 0) {
1563  const size_t numOffsets (k_ptrs.extent (0));
1564  const auto valToCheck =
1565  getEntryOnHost (k_ptrs, numOffsets - 1);
1566  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1567  (size_t (valToCheck) != k_vals.extent (0),
1568  std::logic_error, myPrefix <<
1569  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1570  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1571  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1572  (size_t (valToCheck) != k_inds.extent (0),
1573  std::logic_error, myPrefix <<
1574  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1575  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1576  }
1577  }
1578  // Build the local graph.
1579  myGraph_->setRowPtrsPacked(k_ptrs_const);
1580  myGraph_->lclIndsPacked_wdv =
1581  typename crs_graph_type::local_inds_wdv_type(k_inds);
1582  valuesPacked_wdv = values_wdv_type(k_vals);
1583  }
1584  else { // We don't have to pack, so just set the pointers.
1585  myGraph_->setRowPtrsPacked(myGraph_->rowPtrsUnpacked_dev_);
1586  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1587  valuesPacked_wdv = valuesUnpacked_wdv;
1588 
1589  if (verbose) {
1590  std::ostringstream os;
1591  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1592  << myGraph_->rowPtrsUnpacked_host_.extent(0) << ", lclIndsUnpacked_wdv: "
1593  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1594  << valuesUnpacked_wdv.extent(0) << endl;
1595  std::cerr << os.str();
1596  }
1597 
1598  if (debug) {
1599  const char myPrefix[] =
1600  "(StaticProfile \"Optimize Storage\"=false branch) ";
1601  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1602  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1603  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1604  "that rowPtrsUnpacked_ was never allocated.");
1605  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1606  const size_t numOffsets (myGraph_->rowPtrsUnpacked_host_.extent (0));
1607  const auto valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets - 1);
1608  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1609  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1610  std::logic_error, myPrefix <<
1611  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1612  << " != valuesPacked_wdv.extent(0) = "
1613  << valuesPacked_wdv.extent (0) << ".");
1614  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1615  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1616  std::logic_error, myPrefix <<
1617  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1618  << " != myGraph_->lclIndsPacked.extent(0) = "
1619  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1620  }
1621  }
1622  }
1623 
1624  if (debug) {
1625  const char myPrefix[] = "After packing, ";
1626  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1627  (size_t (myGraph_->rowPtrsUnpacked_host_.extent (0)) != size_t (lclNumRows + 1),
1628  std::logic_error, myPrefix << "myGraph_->rowPtrsUnpacked_host_.extent(0) = "
1629  << myGraph_->rowPtrsUnpacked_host_.extent (0) << " != lclNumRows+1 = " <<
1630  (lclNumRows+1) << ".");
1631  if (myGraph_->rowPtrsUnpacked_host_.extent (0) != 0) {
1632  const size_t numOffsets (myGraph_->rowPtrsUnpacked_host_.extent (0));
1633  const size_t valToCheck = myGraph_->rowPtrsUnpacked_host_(numOffsets-1);
1634  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1635  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1636  std::logic_error, myPrefix << "k_ptrs_const(" <<
1637  (numOffsets-1) << ") = " << valToCheck
1638  << " != valuesPacked_wdv.extent(0) = "
1639  << valuesPacked_wdv.extent (0) << ".");
1640  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1641  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1642  std::logic_error, myPrefix << "k_ptrs_const(" <<
1643  (numOffsets-1) << ") = " << valToCheck
1644  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1645  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1646  }
1647  }
1648 
1649  // May we ditch the old allocations for the packed (and otherwise
1650  // "optimized") allocations, later in this routine? Optimize
1651  // storage if the graph is not static, or if the graph already has
1652  // optimized storage.
1653  const bool defaultOptStorage =
1654  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1655  const bool requestOptimizedStorage =
1656  (! params.is_null () &&
1657  params->get ("Optimize Storage", defaultOptStorage)) ||
1658  (params.is_null () && defaultOptStorage);
1659 
1660  // The graph has optimized storage when indices are allocated,
1661  // myGraph_->k_numRowEntries_ is empty, and there are more than
1662  // zero rows on this process. It's impossible for the graph to
1663  // have dynamic profile (getProfileType() == DynamicProfile) and
1664  // be optimized (isStorageOptimized()).
1665  if (requestOptimizedStorage) {
1666  // Free the old, unpacked, unoptimized allocations.
1667  // Change the graph from dynamic to static allocation profile
1668 
1669  // Free graph data structures that are only needed for
1670  // unpacked 1-D storage.
1671  if (verbose) {
1672  std::ostringstream os;
1673  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1674  << myGraph_->k_numRowEntries_.extent(0) << endl;
1675  std::cerr << os.str();
1676  }
1677 
1678  myGraph_->k_numRowEntries_ = row_entries_type ();
1679 
1680  // Keep the new 1-D packed allocations.
1681  myGraph_->setRowPtrsUnpacked(myGraph_->rowPtrsPacked_dev_);
1682  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1683  valuesUnpacked_wdv = valuesPacked_wdv;
1684 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1685 
1686  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1687  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1688  }
1689  else {
1690  if (verbose) {
1691  std::ostringstream os;
1692  os << *prefix << "User requested NOT to optimize storage"
1693  << endl;
1694  std::cerr << os.str();
1695  }
1696  }
1697  }
1698 
// fillLocalMatrix: produce the packed values array for a matrix whose
// graph is staticGraph_. Only the matrix's value storage and
// storageStatus_ are written here; the graph is only read.
//
// Steps, as visible in the body:
//  1. Read the entry count, the allocation size, and the packed row
//  offsets (rowPtrsPacked_dev_) from staticGraph_.
//  2. Resolve the "Optimize Storage" request from params. The request is
//  cleared again if staticGraph_ does not report optimized storage.
//  3. If entry count != allocation size, compute packed row offsets from
//  staticGraph_->k_numRowEntries_, allocate a values array of exactly
//  that many entries, and copy the values row by row with a
//  Kokkos::parallel_for. Otherwise valuesPacked_wdv is simply set to
//  valuesUnpacked_wdv.
//  4. If optimized storage is still requested, valuesUnpacked_wdv is
//  replaced by valuesPacked_wdv and storageStatus_ becomes
//  Details::STORAGE_1D_PACKED.
//
// params: may be null. Only its "Optimize Storage" entry is read here.
1699  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1700  void
1702  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1703  {
1704  using ::Tpetra::Details::ProfilingRegion;
1705  using Teuchos::ArrayRCP;
1706  using Teuchos::Array;
1707  using Teuchos::null;
1708  using Teuchos::RCP;
1709  using Teuchos::rcp;
1710  using std::endl;
1711  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1712  using non_const_row_map_type = typename row_map_type::non_const_type;
1713  using values_type = typename local_matrix_device_type::values_type;
1714  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1715  const size_t lclNumRows = getNodeNumRows();
1716 
1717  const bool verbose = Details::Behavior::verbose("CrsMatrix");
// prefix is only created (and only dereferenced) when verbose is true.
1718  std::unique_ptr<std::string> prefix;
1719  if (verbose) {
1720  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1721  std::ostringstream os;
1722  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1723  std::cerr << os.str ();
1724  }
1725 
1726  // The goals of this routine are first, to allocate and fill
1727  // packed 1-D storage (see below for an explanation) in the vals
1728  // array, and second, to give vals to the local matrix and
1729  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1730  // row offsets, within the scope of this routine, since we're only
1731  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1732  // fill both the graph and the matrix at the same time).
1733 
1734  // get data from staticGraph_
1735  size_t nodeNumEntries = staticGraph_->getNodeNumEntries ();
1736  size_t nodeNumAllocated = staticGraph_->getNodeAllocationSize ();
// k_rowPtrs is passed to the pack functor below as the last of its two
// row-offset arguments (after the freshly computed tmpk_ptrs).
1737  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1738 
// NOTE(review): k_ptrs is assigned in the packing branch below but is not
// read again anywhere in this function.
1739  row_map_type k_ptrs; // "packed" row offsets array
1740  values_type k_vals; // "packed" values array
1741 
1742  // May we ditch the old allocations for the packed (and otherwise
1743  // "optimized") allocations, later in this routine? Request
1744  // optimized storage by default.
// The flag starts true and is cleared only when params is non-null and its
// "Optimize Storage" entry (falling back to default_OptimizeStorage)
// evaluates to false. With a null params the default is not consulted.
1745  bool requestOptimizedStorage = true;
1746  const bool default_OptimizeStorage =
1747  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1748  if (! params.is_null() &&
1749  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1750  requestOptimizedStorage = false;
1751  }
1752  // If we're not allowed to change a static graph, then we can't
1753  // change the storage of the matrix, either. This means that if
1754  // the graph's storage isn't already optimized, we can't optimize
1755  // the matrix's storage either. Check and give warning, as
1756  // appropriate.
1757  if (! staticGraph_->isStorageOptimized () &&
1758  requestOptimizedStorage) {
// NOTE(review): the line naming the macro/function that consumes the
// argument list below is not visible in this listing. Because the
// assignment after it is meant to run, it presumably reports a warning
// rather than always throwing - confirm against the real file.
1760  (true, std::runtime_error, "You requested optimized storage "
1761  "by setting the \"Optimize Storage\" flag to \"true\" in "
1762  "the ParameterList, or by virtue of default behavior. "
1763  "However, the associated CrsGraph was filled separately and "
1764  "requested not to optimize storage. Therefore, the "
1765  "CrsMatrix cannot optimize storage.");
1766  requestOptimizedStorage = false;
1767  }
1768 
1769  using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1770 
1771  // StaticProfile means that the matrix's values are currently
1772  // stored in a 1-D format. However, this format is "unpacked";
1773  // it doesn't necessarily have the same row offsets as indicated
1774  // by the ptrs array returned by allocRowPtrs. This could
1775  // happen, for example, if the user specified StaticProfile in
1776  // the constructor and fixed the number of matrix entries in
1777  // each row, but didn't fill all those entries.
1778  //
1779  // As above, we don't need to keep the "packed" row offsets
1780  // array ptrs here, but we do need it here temporarily, so we
1781  // have to allocate it. We'll free ptrs later in this method.
1782  //
1783  // Note that this routine checks whether storage has already
1784  // been packed. This is a common case for solution of nonlinear
1785  // PDEs using the finite element method, as long as the
1786  // structure of the sparse matrix does not change between linear
1787  // solves.
// "Entry count differs from allocation size" is the test used here for
// "storage still needs packing".
1788  if (nodeNumEntries != nodeNumAllocated) {
1789  if (verbose) {
1790  std::ostringstream os;
1791  os << *prefix << "Unpacked 1-D storage: numEnt="
1792  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1793  << endl;
1794  std::cerr << os.str();
1795  }
1796  // We have to pack the 1-D storage, since the user didn't fill
1797  // up all requested storage.
1798  if (verbose) {
1799  std::ostringstream os;
1800  os << *prefix << "Allocate packed row offsets: "
1801  << (lclNumRows+1) << endl;
1802  std::cerr << os.str();
1803  }
// One offset per local row plus a final end offset (lclNumRows+1 entries).
1804  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1805  lclNumRows+1);
1806  // Total number of entries in the matrix on the calling
1807  // process. We will compute this in the loop below. It's
1808  // cheap to compute and useful as a sanity check.
1809  size_t lclTotalNumEntries = 0;
1810  k_ptrs = tmpk_ptrs;
1811  {
1812  typename row_entries_type::const_type numRowEnt_h =
1813  staticGraph_->k_numRowEntries_;
1814  // This function can handle the counts being a host View.
// The value returned by computeOffsetsFromCounts is used below as the
// length of the packed values array.
1815  lclTotalNumEntries =
1816  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1817  }
1818 
1819  // Allocate the "packed" values array.
1820  // It has exactly the right number of entries.
1821  if (verbose) {
1822  std::ostringstream os;
1823  os << *prefix << "Allocate packed values: "
1824  << lclTotalNumEntries << endl;
1825  std::cerr << os.str ();
1826  }
1827  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1829  // Pack values_wdv into k_vals. We will replace values_wdv below.
// Functor arguments, in order: destination values (k_vals), a read-only
// device view of the current values, the new row offsets (tmpk_ptrs), and
// the graph's row offsets read above (k_rowPtrs).
1830  pack_functor<
1831  typename values_type::non_const_type,
1832  typename values_type::const_type,
1833  typename row_map_type::non_const_type,
1834  typename row_map_type::const_type> valsPacker
1835  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1836  tmpk_ptrs, k_rowPtrs);
1837 
// The functor is run over the index range [0, lclNumRows) in k_vals's
// execution space.
1838  using exec_space = typename decltype (k_vals)::execution_space;
1839  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1840  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1841  range_type (0, lclNumRows), valsPacker);
1842  valuesPacked_wdv = values_wdv_type(k_vals);
1843  }
1844  else { // We don't have to pack, so just set the pointer.
1845  valuesPacked_wdv = valuesUnpacked_wdv;
1846  if (verbose) {
1847  std::ostringstream os;
1848  os << *prefix << "Storage already packed: "
1849  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1850  std::cerr << os.str();
1851  }
1852  }
1853 
1854  // May we ditch the old allocations for the packed one?
// When the request was cleared above, valuesUnpacked_wdv and
// storageStatus_ are left as they were.
1855  if (requestOptimizedStorage) {
1856  // The user requested optimized storage, so we can dump the
1857  // unpacked 1-D storage, and keep the packed storage.
1858  valuesUnpacked_wdv = valuesPacked_wdv;
1859 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1860  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1861  }
1862  }
1863 
1864  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1865  void
1866  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
1867  insertIndicesAndValues (crs_graph_type& graph,
1868  RowInfo& rowInfo,
1869  const typename crs_graph_type::SLocalGlobalViews& newInds,
1870  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1871  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1872  const ELocalGlobal lg,
1873  const ELocalGlobal I)
1874  {
1875  const size_t oldNumEnt = rowInfo.numEntries;
1876  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1877 
1878  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1879  // that probably relates to scalar_type vs. impl_scalar_type
1880  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1881  // details; look for GCC_WORKAROUND macro definition.
1882  if (numInserted > 0) {
1883  const size_t startOffset = oldNumEnt;
1884  memcpy (&oldRowVals[startOffset], &newRowVals[0],
1885  numInserted * sizeof (impl_scalar_type));
1886  }
1887  }
1888 
1889  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1890  void
1892  insertLocalValues (const LocalOrdinal lclRow,
1893  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1894  const Teuchos::ArrayView<const Scalar>& values)
1895  {
1896  using std::endl;
1897  const char tfecfFuncName[] = "insertLocalValues: ";
1898 
1899  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1900  (! this->isFillActive (), std::runtime_error,
1901  "Fill is not active. After calling fillComplete, you must call "
1902  "resumeFill before you may insert entries into the matrix again.");
1903  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1904  (this->isStaticGraph (), std::runtime_error,
1905  "Cannot insert indices with static graph; use replaceLocalValues() "
1906  "instead.");
1907  // At this point, we know that myGraph_ is nonnull.
1908  crs_graph_type& graph = * (this->myGraph_);
1909  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1910  (graph.colMap_.is_null (), std::runtime_error,
1911  "Cannot insert local indices without a column map.");
1912  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1913  (graph.isGloballyIndexed (),
1914  std::runtime_error, "Graph indices are global; use "
1915  "insertGlobalValues().");
1916  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1917  (values.size () != indices.size (), std::runtime_error,
1918  "values.size() = " << values.size ()
1919  << " != indices.size() = " << indices.size () << ".");
1920  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1921  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1922  "Local row index " << lclRow << " does not belong to this process.");
1923 
1924  if (! graph.indicesAreAllocated ()) {
1925  // We only allocate values at most once per process, so it's OK
1926  // to check TPETRA_VERBOSE here.
1927  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1928  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1929  }
1930 
1931 #ifdef HAVE_TPETRA_DEBUG
1932  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1933  // In a debug build, test whether any of the given column indices
1934  // are not in the column Map. Keep track of the invalid column
1935  // indices so we can tell the user about them.
1936  {
1937  using Teuchos::toString;
1938 
1939  const map_type& colMap = * (graph.colMap_);
1940  Teuchos::Array<LocalOrdinal> badColInds;
1941  bool allInColMap = true;
1942  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1943  if (! colMap.isNodeLocalElement (indices[k])) {
1944  allInColMap = false;
1945  badColInds.push_back (indices[k]);
1946  }
1947  }
1948  if (! allInColMap) {
1949  std::ostringstream os;
1950  os << "You attempted to insert entries in owned row " << lclRow
1951  << ", at the following column indices: " << toString (indices)
1952  << "." << endl;
1953  os << "Of those, the following indices are not in the column Map on "
1954  "this process: " << toString (badColInds) << "." << endl << "Since "
1955  "the matrix has a column Map already, it is invalid to insert "
1956  "entries at those locations.";
1957  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1958  (true, std::invalid_argument, os.str ());
1959  }
1960  }
1961 #endif // HAVE_TPETRA_DEBUG
1962 
1963  RowInfo rowInfo = graph.getRowInfo (lclRow);
1964 
1965  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1966  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1967  valsView[offset] += values[k]; };
1968  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1969  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1970  }
1971 
1972  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1973  void
1975  insertLocalValues (const LocalOrdinal localRow,
1976  const LocalOrdinal numEnt,
1977  const Scalar vals[],
1978  const LocalOrdinal cols[])
1979  {
1980  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1981  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1982  this->insertLocalValues (localRow, colsT, valsT);
1983  }
1984 
// Insert entries, addressed by global column index, into one row of the
// graph and accumulate the matching values into the matrix.
//
// NOTE(review): the listing lines carrying the function name and the first
// parameter are not visible here. The body refers to that parameter as
// `graph`, and tfecfFuncName below names the routine
// "insertGlobalValuesImpl".
//
// rowInfo: description of the target row. It is overwritten with a fresh
//  graph.getRowInfo() result if this call has to allocate storage.
// gblColInds: global column indices to insert (numInputEnt entries).
// vals: values to insert (numInputEnt entries).
// numInputEnt: number of entries in gblColInds and vals.
//
// In HAVE_TPETRA_DEBUG builds, the row's entry count after insertion is
// compared with (count before + number inserted); on a mismatch a
// diagnostic is assembled and std::logic_error is raised.
1985  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1986  void
1989  RowInfo& rowInfo,
1990  const GlobalOrdinal gblColInds[],
1991  const impl_scalar_type vals[],
1992  const size_t numInputEnt)
1993  {
1994 #ifdef HAVE_TPETRA_DEBUG
1995  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
// origNumEnt is queried from the graph; curNumEnt comes from the caller's
// rowInfo. Only curNumEnt enters the consistency check below; origNumEnt
// is printed in the diagnostic.
1996  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1997  const size_t curNumEnt = rowInfo.numEntries;
1998 #endif // HAVE_TPETRA_DEBUG
1999 
2000  if (! graph.indicesAreAllocated ()) {
2001  // We only allocate values at most once per process, so it's OK
2002  // to check TPETRA_VERBOSE here.
2003  using ::Tpetra::Details::Behavior;
2004  const bool verbose = Behavior::verbose("CrsMatrix");
2005  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
2006  // mfh 23 Jul 2017: allocateValues invalidates existing
2007  // getRowInfo results. Once we get rid of lazy graph
2008  // allocation, we'll be able to move the getRowInfo call outside
2009  // of this method.
2010  rowInfo = graph.getRowInfo (rowInfo.localRow);
2011  }
// The callback receives (k, start, offset) and adds vals[k] into position
// `offset` of the row's host values view; `start` is unused.
2013  auto valsView = this->getValuesViewHostNonConst(rowInfo);
2014  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
2015  valsView[offset] += vals[k];
2016  };
2017  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
// The return value of insertGlobalIndicesImpl is captured only in debug
// builds; in other builds the call is a plain expression statement.
2018 #ifdef HAVE_TPETRA_DEBUG
2019  //numInserted is only used inside the debug code below.
2020  auto numInserted =
2021 #endif
2022  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
2023 
2024 #ifdef HAVE_TPETRA_DEBUG
2025  size_t newNumEnt = curNumEnt + numInserted;
2026  const size_t chkNewNumEnt =
2027  graph.getNumEntriesInLocalRow (rowInfo.localRow);
2028  if (chkNewNumEnt != newNumEnt) {
// Mismatch: collect the inputs and, where row views are supported, the
// row's current contents, then raise std::logic_error.
2029  std::ostringstream os;
2030  os << std::endl << "newNumEnt = " << newNumEnt
2031  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
2032  << ") = " << chkNewNumEnt << "." << std::endl
2033  << "\torigNumEnt: " << origNumEnt << std::endl
2034  << "\tnumInputEnt: " << numInputEnt << std::endl
2035  << "\tgblColInds: [";
2036  for (size_t k = 0; k < numInputEnt; ++k) {
2037  os << gblColInds[k];
2038  if (k + size_t (1) < numInputEnt) {
2039  os << ",";
2040  }
2041  }
2042  os << "]" << std::endl
2043  << "\tvals: [";
2044  for (size_t k = 0; k < numInputEnt; ++k) {
2045  os << vals[k];
2046  if (k + size_t (1) < numInputEnt) {
2047  os << ",";
2048  }
2049  }
2050  os << "]" << std::endl;
2051 
2052  if (this->supportsRowViews ()) {
2053  values_host_view_type vals2;
2054  if (this->isGloballyIndexed ()) {
2055  global_inds_host_view_type gblColInds2;
2056  const GlobalOrdinal gblRow =
2057  graph.rowMap_->getGlobalElement (rowInfo.localRow);
2058  if (gblRow ==
2059  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
2060  os << "Local row index " << rowInfo.localRow << " is invalid!"
2061  << std::endl;
2062  }
2063  else {
// An exception from getGlobalRowView is caught and its message is folded
// into the diagnostic instead of propagating.
2064  bool getViewThrew = false;
2065  try {
2066  this->getGlobalRowView (gblRow, gblColInds2, vals2);
2067  }
2068  catch (std::exception& e) {
2069  getViewThrew = true;
2070  os << "getGlobalRowView threw exception:" << std::endl
2071  << e.what () << std::endl;
2072  }
2073  if (! getViewThrew) {
2074  os << "\tNew global column indices: ";
2075  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
2076  os << gblColInds2[jjj] << " ";
2077  os << std::endl;
2078  os << "\tNew values: ";
2079  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
2080  os << vals2[jjj] << " ";
2081  os << std::endl;
2082  }
2083  }
2084  }
2085  else if (this->isLocallyIndexed ()) {
// Unlike the globally indexed case above, this getLocalRowView call is
// not wrapped in a try/catch.
2086  local_inds_host_view_type lclColInds2;
2087  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
2088  os << "\tNew local column indices: ";
2089  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
2090  os << lclColInds2[jjj] << " ";
2091  os << std::endl;
2092  os << "\tNew values: ";
2093  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
2094  os << vals2[jjj] << " ";
2095  os << std::endl;
2096  }
2097  }
2098 
2099  os << "Please report this bug to the Tpetra developers.";
2100  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2101  (true, std::logic_error, os.str ());
2102  }
2103 #endif // HAVE_TPETRA_DEBUG
2104  }
2105 
// Insert entries into the row with global index gblRow, using global
// column indices.
//
// gblRow: global index of the target row.
// indices: global column indices to insert.
// values: values to insert. Its length is compared with that of indices
//  only in HAVE_TPETRA_DEBUG builds.
//
// Behavior, as visible in the body:
//  - If gblRow is not in the row Map on this process, the input is passed
//  to insertNonownedGlobalValues() and nothing else happens here.
//  - If gblRow is owned and the matrix has a static graph,
//  std::runtime_error is raised.
//  - If gblRow is owned and the graph has a column Map, every index must
//  be in that Map on this process; otherwise std::invalid_argument is
//  raised and nothing is inserted.
//  - Otherwise the input is forwarded to insertGlobalValuesImpl().
2106  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2107  void
2109  insertGlobalValues (const GlobalOrdinal gblRow,
2110  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2111  const Teuchos::ArrayView<const Scalar>& values)
2112  {
2113  using Teuchos::toString;
2114  using std::endl;
2115  typedef impl_scalar_type IST;
2116  typedef LocalOrdinal LO;
2117  typedef GlobalOrdinal GO;
2118  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2119  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
2120  const char tfecfFuncName[] = "insertGlobalValues: ";
2121 
2122 #ifdef HAVE_TPETRA_DEBUG
2123  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2124  (values.size () != indices.size (), std::runtime_error,
2125  "values.size() = " << values.size () << " != indices.size() = "
2126  << indices.size () << ".");
2127 #endif // HAVE_TPETRA_DEBUG
2128 
2129  // getRowMap() is not thread safe, because it increments RCP's
2130  // reference count. getCrsGraphRef() is thread safe.
2131  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
// OTLO::invalid() is the value compared against below to detect "row not
// owned by this process".
2132  const LO lclRow = rowMap.getLocalElement (gblRow);
2133 
2134  if (lclRow == OTLO::invalid ()) {
2135  // Input row is _not_ owned by the calling process.
2136  //
2137  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2138  // is not in the row Map, it doesn't matter whether or not the
2139  // graph is static; the data just get stashed for later use by
2140  // globalAssemble().
2141  this->insertNonownedGlobalValues (gblRow, indices, values);
2142  }
2143  else { // Input row _is_ owned by the calling process
2144  if (this->isStaticGraph ()) {
2145  // Uh oh! Not allowed to insert into owned rows in that case.
2146  const int myRank = rowMap.getComm ()->getRank ();
2147  const int numProcs = rowMap.getComm ()->getSize ();
2148  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2149  (true, std::runtime_error,
2150  "The matrix was constructed with a constant (\"static\") graph, "
2151  "yet the given global row index " << gblRow << " is in the row "
2152  "Map on the calling process (with rank " << myRank << ", of " <<
2153  numProcs << " process(es)). In this case, you may not insert "
2154  "new entries into rows owned by the calling process.");
2155  }
2156 
2157  crs_graph_type& graph = * (this->myGraph_);
// The caller's Scalar array is reinterpreted as impl_scalar_type (IST)
// without copying.
2158  const IST* const inputVals =
2159  reinterpret_cast<const IST*> (values.getRawPtr ());
2160  const GO* const inputGblColInds = indices.getRawPtr ();
2161  const size_t numInputEnt = indices.size ();
2162  RowInfo rowInfo = graph.getRowInfo (lclRow);
2163 
2164  // If the matrix has a column Map, check at this point whether
2165  // the column indices belong to the column Map.
2166  //
2167  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2168  // test to the CrsGraph method, since it may have to do this
2169  // anyway.
2170  if (! graph.colMap_.is_null ()) {
2171  const map_type& colMap = * (graph.colMap_);
2172  // In a debug build, keep track of the nonowned ("bad") column
2173  // indices, so that we can display them in the exception
2174  // message. In a release build, just ditch the loop early if
2175  // we encounter a nonowned column index.
2176 #ifdef HAVE_TPETRA_DEBUG
2177  Teuchos::Array<GO> badColInds;
2178 #endif // HAVE_TPETRA_DEBUG
2179  const size_type numEntriesToInsert = indices.size ();
2180  bool allInColMap = true;
2181  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2182  if (! colMap.isNodeGlobalElement (indices[k])) {
2183  allInColMap = false;
2184 #ifdef HAVE_TPETRA_DEBUG
2185  badColInds.push_back (indices[k]);
2186 #else
2187  break;
2188 #endif // HAVE_TPETRA_DEBUG
2189  }
2190  }
// The check is all-or-nothing: one index outside the column Map rejects
// the whole call before anything is inserted.
2191  if (! allInColMap) {
2192  std::ostringstream os;
2193  os << "You attempted to insert entries in owned row " << gblRow
2194  << ", at the following column indices: " << toString (indices)
2195  << "." << endl;
2196 #ifdef HAVE_TPETRA_DEBUG
2197  os << "Of those, the following indices are not in the column Map "
2198  "on this process: " << toString (badColInds) << "." << endl
2199  << "Since the matrix has a column Map already, it is invalid "
2200  "to insert entries at those locations.";
2201 #else
2202  os << "At least one of those indices is not in the column Map "
2203  "on this process." << endl << "It is invalid to insert into "
2204  "columns not in the column Map on the process that owns the "
2205  "row.";
2206 #endif // HAVE_TPETRA_DEBUG
2207  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2208  (true, std::invalid_argument, os.str ());
2209  }
2210  }
2211 
2212  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2213  inputVals, numInputEnt);
2214  }
2215  }
2216 
2217 
2218  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2219  void
2221  insertGlobalValues (const GlobalOrdinal globalRow,
2222  const LocalOrdinal numEnt,
2223  const Scalar vals[],
2224  const GlobalOrdinal inds[])
2225  {
2226  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2227  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2228  this->insertGlobalValues (globalRow, indsT, valsT);
2229  }
2230 
2231 
// Like insertGlobalValues, but column indices that are not in the column
// Map on this process are skipped instead of causing an exception.
//
// NOTE(review): the listing lines carrying the class qualifier and the
// function name are not visible here; tfecfFuncName below names the
// routine "insertGlobalValuesFiltered".
//
// gblRow: global index of the target row.
// indices: global column indices to insert.
// values: values to insert.
// debug: when true, enables the length check on the two arrays and the
//  loop-invariant checks inside the filtering loops.
//
// Behavior, as visible in the body:
//  - Row not in the row Map here: forwarded to insertNonownedGlobalValues().
//  - Owned row with a static graph: std::runtime_error.
//  - Owned row, column Map present, graph locally indexed: each maximal run
//  of consecutive indices that are in the column Map is converted to
//  local indices and passed to insertLocalValues().
//  - Owned row, column Map present, graph not locally indexed: each such
//  run is passed to insertGlobalValuesImpl() with a fresh RowInfo.
//  - Owned row, no column Map: everything goes to insertGlobalValuesImpl().
2232  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2233  void
2236  const GlobalOrdinal gblRow,
2237  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2238  const Teuchos::ArrayView<const Scalar>& values,
2239  const bool debug)
2240  {
2241  typedef impl_scalar_type IST;
2242  typedef LocalOrdinal LO;
2243  typedef GlobalOrdinal GO;
2244  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2245  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2246 
2247  if (debug) {
2248  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2249  (values.size () != indices.size (), std::runtime_error,
2250  "values.size() = " << values.size () << " != indices.size() = "
2251  << indices.size () << ".");
2252  }
2253 
2254  // getRowMap() is not thread safe, because it increments RCP's
2255  // reference count. getCrsGraphRef() is thread safe.
2256  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2257  const LO lclRow = rowMap.getLocalElement (gblRow);
2258  if (lclRow == OTLO::invalid ()) {
2259  // Input row is _not_ owned by the calling process.
2260  //
2261  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2262  // is not in the row Map, it doesn't matter whether or not the
2263  // graph is static; the data just get stashed for later use by
2264  // globalAssemble().
2265  this->insertNonownedGlobalValues (gblRow, indices, values);
2266  }
2267  else { // Input row _is_ owned by the calling process
2268  if (this->isStaticGraph ()) {
2269  // Uh oh! Not allowed to insert into owned rows in that case.
2270  const int myRank = rowMap.getComm ()->getRank ();
2271  const int numProcs = rowMap.getComm ()->getSize ();
2272  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2273  (true, std::runtime_error,
2274  "The matrix was constructed with a constant (\"static\") graph, "
2275  "yet the given global row index " << gblRow << " is in the row "
2276  "Map on the calling process (with rank " << myRank << ", of " <<
2277  numProcs << " process(es)). In this case, you may not insert "
2278  "new entries into rows owned by the calling process.");
2279  }
2280 
2281  crs_graph_type& graph = * (this->myGraph_);
// The caller's Scalar array is reinterpreted as impl_scalar_type (IST)
// without copying.
2282  const IST* const inputVals =
2283  reinterpret_cast<const IST*> (values.getRawPtr ());
2284  const GO* const inputGblColInds = indices.getRawPtr ();
2285  const size_t numInputEnt = indices.size ();
2286  RowInfo rowInfo = graph.getRowInfo (lclRow);
2287 
2288  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2289  // This branch is similar in function to the following branch, but for
2290  // the special case that the target graph is locally indexed (and the
2291  // profile type is StaticProfile). In this case, we cannot simply filter
2292  // out global indices that don't exist on the receiving process and
2293  // insert the remaining (global) indices, but we must convert them (the
2294  // remaining global indices) to local and call `insertLocalValues`.
2295  const map_type& colMap = * (graph.colMap_);
2296  size_t curOffset = 0;
2297  while (curOffset < numInputEnt) {
2298  // Find a sequence of input indices that are in the column Map on the
2299  // calling process. Doing a sequence at a time, instead of one at a
2300  // time, amortizes some overhead.
// NOTE(review): lclIndices is constructed anew on every pass of the
// while loop.
2301  Teuchos::Array<LO> lclIndices;
2302  size_t endOffset = curOffset;
2303  for ( ; endOffset < numInputEnt; ++endOffset) {
2304  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2305  if (lclIndex != OTLO::invalid())
2306  lclIndices.push_back(lclIndex);
2307  else
2308  break;
2309  }
2310  // curOffset, endOffset: half-exclusive range of indices in the column
2311  // Map on the calling process. If endOffset == curOffset, the range is
2312  // empty.
// NOTE(review): the size_t difference is stored in an LO, which is an
// implicit narrowing conversion if LO is narrower than size_t.
2313  const LO numIndInSeq = (endOffset - curOffset);
2314  if (numIndInSeq != 0) {
// Values for the run are taken from the original Scalar ArrayView
// (values), not from inputVals.
2315  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2316  }
2317  // Invariant before the increment line: Either endOffset ==
2318  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2319  // on the calling process.
2320  if (debug) {
2321  const bool invariant = endOffset == numInputEnt ||
2322  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2323  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2324  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2325  }
// The +1 steps over the index that ended the run (the one not in the
// column Map), which is how that entry gets filtered out.
2326  curOffset = endOffset + 1;
2327  }
2328  }
2329  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2330  const map_type& colMap = * (graph.colMap_);
2331  size_t curOffset = 0;
2332  while (curOffset < numInputEnt) {
2333  // Find a sequence of input indices that are in the column
2334  // Map on the calling process. Doing a sequence at a time,
2335  // instead of one at a time, amortizes some overhead.
2336  size_t endOffset = curOffset;
2337  for ( ; endOffset < numInputEnt &&
2338  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2339  ++endOffset)
2340  {}
2341  // curOffset, endOffset: half-exclusive range of indices in
2342  // the column Map on the calling process. If endOffset ==
2343  // curOffset, the range is empty.
// NOTE(review): same size_t -> LO narrowing as in the branch above.
2344  const LO numIndInSeq = (endOffset - curOffset);
2345  if (numIndInSeq != 0) {
2346  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2347  this->insertGlobalValuesImpl (graph, rowInfo,
2348  inputGblColInds + curOffset,
2349  inputVals + curOffset,
2350  numIndInSeq);
2351  }
2352  // Invariant before the increment line: Either endOffset ==
2353  // numInputEnt, or inputGblColInds[endOffset] is not in the
2354  // column Map on the calling process.
2355  if (debug) {
2356  const bool invariant = endOffset == numInputEnt ||
2357  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2358  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2359  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2360  }
// As above: step over the index that ended the run.
2361  curOffset = endOffset + 1;
2362  }
2363  }
2364  else { // we don't have a column Map.
2365  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2366  inputVals, numInputEnt);
2367  }
2368  }
2369  }
// Call insertGlobalValuesFiltered(gblRow, indices, values, debug) and, if
// it throws a std::exception, rethrow the failure as std::runtime_error
// with a message built here.
//
// NOTE(review): the listing lines carrying the class qualifier and this
// function's own name are not visible here.
//
// gblRow, indices, values, debug: forwarded unchanged.
// prefix: C string used at the start of the verbose message. It is only
//  dereferenced when verbose is true.
// verbose: when true, the message also lists the row index, the column
//  indices and the values (via verbosePrintArray).
2371  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2372  void
2375  const GlobalOrdinal gblRow,
2376  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2377  const Teuchos::ArrayView<const Scalar>& values,
2378  const char* const prefix,
2379  const bool debug,
2380  const bool verbose)
2381  {
2383  using std::endl;
2384 
2385  try {
2386  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2387  }
2388  catch(std::exception& e) {
2389  std::ostringstream os;
2390  if (verbose) {
// NOTE(review): the initializer of maxNumToPrint is not visible in this
// listing.
2391  const size_t maxNumToPrint =
// NOTE(review): prefix is declared `const char* const`, so `*prefix` is a
// single char and only the first character of the prefix string reaches
// the message. Streaming `prefix` itself looks like the intent - confirm.
2393  os << *prefix << ": insertGlobalValuesFiltered threw an "
2394  "exception: " << e.what() << endl
2395  << "Global row index: " << gblRow << endl;
2396  verbosePrintArray(os, indices, "Global column indices",
2397  maxNumToPrint);
2398  os << endl;
2399  verbosePrintArray(os, values, "Values", maxNumToPrint);
2400  os << endl;
2401  }
2402  else {
// The non-verbose message starts with ": " and does not use prefix.
2403  os << ": insertGlobalValuesFiltered threw an exception: "
2404  << e.what();
2405  }
// The original exception's dynamic type is not preserved; callers see
// std::runtime_error carrying the text assembled above.
2406  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2407  }
2408  }
2409 
// Overwrite existing entries of one row with new values, where the
// entries are addressed by local column index.
//
// NOTE(review): the listing lines carrying the function name and the
// declaration of `rowVals` (the array written below) are not visible here.
//
// graph: graph describing the row's column indices.
// rowInfo: description of the row; rowInfo.numEntries bounds the search.
// inds: local column indices to look up (numElts entries).
// newVals: replacement values (numElts entries).
// numElts: number of entries in inds and newVals.
//
// Returns the number of input indices found in the row, i.e. the number of
// values written. Input indices that are not found are skipped without
// error. If the graph is globally indexed and has no column Map, the
// return value is Teuchos::OrdinalTraits<LO>::invalid(). If the graph is
// neither locally nor globally indexed, nothing is written and 0 is
// returned.
2410  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2411  LocalOrdinal
2414  const crs_graph_type& graph,
2415  const RowInfo& rowInfo,
2416  const LocalOrdinal inds[],
2417  const impl_scalar_type newVals[],
2418  const LocalOrdinal numElts)
2419  {
2420  typedef LocalOrdinal LO;
2421  typedef GlobalOrdinal GO;
2422  const bool sorted = graph.isSorted ();
2423 
2424  size_t hint = 0; // Guess for the current index k into rowVals
2425  LO numValid = 0; // number of valid local column indices
2426 
2427  if (graph.isLocallyIndexed ()) {
2428  // Get a view of the column indices in the row. This amortizes
2429  // the cost of getting the view over all the entries of inds.
2430  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2431 
2432  for (LO j = 0; j < numElts; ++j) {
2433  const LO lclColInd = inds[j];
2434  const size_t offset =
2435  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2436  lclColInd, hint, sorted);
// An offset equal to rowInfo.numEntries is treated as "not found".
2437  if (offset != rowInfo.numEntries) {
2438  rowVals[offset] = newVals[j];
// The next search is hinted to start just past this hit.
2439  hint = offset + 1;
2440  ++numValid;
2441  }
2442  }
2443  }
2444  else if (graph.isGloballyIndexed ()) {
2445  if (graph.colMap_.is_null ()) {
2446  return Teuchos::OrdinalTraits<LO>::invalid ();
2447  }
// NOTE(review): colMap is declared without `&`, so this copy-initializes
// a map_type object; other blocks in this file bind `const map_type&`.
// Probably meant to be a reference - confirm.
2448  const map_type colMap = * (graph.colMap_);
2449 
2450  // Get a view of the column indices in the row. This amortizes
2451  // the cost of getting the view over all the entries of inds.
2452  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2453 
2454  for (LO j = 0; j < numElts; ++j) {
// The row stores global indices, so each input local index is first
// converted to a global index through the column Map.
2455  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2456  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2457  const size_t offset =
2458  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2459  gblColInd, hint, sorted);
2460  if (offset != rowInfo.numEntries) {
2461  rowVals[offset] = newVals[j];
2462  hint = offset + 1;
2463  ++numValid;
2464  }
2465  }
2466  }
2467  }
2468  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2469  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2470  // to be neither locally nor globally indexed on a process.
2471  // This means that the graph or matrix has no entries on that
2472  // process. Epetra also works like this. It's related to lazy
2473  // allocation (on first insertion, not at graph / matrix
2474  // construction). Lazy allocation will go away because it is
2475  // not thread scalable.
2476 
2477  return numValid;
2478  }
2479 
// Teuchos::ArrayView convenience overload of replaceLocalValues.
//
// localRow: local index of the row to modify.
// lclCols:  local column indices of the entries to overwrite.
// vals:     replacement values; must have as many entries as lclCols.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the two views differ in
// size; otherwise returns whatever the raw-array overload of
// replaceLocalValues returns.
//
// NOTE(review): the class-qualifier line (numbering 2482) is missing from
// this listing.
2480  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2481  LocalOrdinal
2483  replaceLocalValues (const LocalOrdinal localRow,
2484  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2485  const Teuchos::ArrayView<const Scalar>& vals)
2486  {
2487  typedef LocalOrdinal LO;
2488 
2489  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2490  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2491  return Teuchos::OrdinalTraits<LO>::invalid ();
2492  }
2493  const LO* const inputInds = lclCols.getRawPtr ();
2494  const Scalar* const inputVals = vals.getRawPtr ();
      // The raw-array overload takes the values before the column indices.
2495  return this->replaceLocalValues (localRow, numInputEnt,
2496  inputVals, inputInds);
2497  }
2498 
// Kokkos::View overload that forwards to the raw-array replaceLocalValues.
//
// localRow:  local index of the row to modify.
// inputInds: local column indices of the entries to overwrite.
// inputVals: replacement values; must have the same extent as inputInds.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the extents differ;
// otherwise returns the result of the raw-array overload.
//
// NOTE(review): the lines with the return-type qualifier and the qualified
// function name are missing from this listing (numbering gaps at 2500 and
// 2502-2503).
2499  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2501  local_ordinal_type
2504  const local_ordinal_type localRow,
2505  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2506  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2507  {
2508  using LO = local_ordinal_type;
      // NOTE(review): extent(0) is converted to LO implicitly here, whereas
      // the other View overloads in this file use static_cast<LO>.
2509  const LO numInputEnt = inputInds.extent(0);
2510  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2511  return Teuchos::OrdinalTraits<LO>::invalid();
2512  }
      // Reinterpret the impl_scalar_type data as Scalar for the raw-array
      // overload.  Assumes the two types have the same representation --
      // TODO confirm.
2513  const Scalar* const inVals =
2514  reinterpret_cast<const Scalar*>(inputVals.data());
2515  return this->replaceLocalValues(localRow, numInputEnt,
2516  inVals, inputInds.data());
2517  }
2518 
// Raw-array overload of replaceLocalValues: overwrite existing entries of
// one locally indexed row.
//
// localRow:  local index of the row to modify.
// numEnt:    number of entries in inputVals and inputCols.
// inputVals: replacement values.
// inputCols: local column indices of the entries to overwrite.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
// staticGraph_ is null; 0 if getRowInfo reports an invalid local row;
// otherwise the result of replaceLocalValuesImpl.
//
// NOTE(review): the class-qualifier line (numbering 2521) is missing from
// this listing.
2519  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2520  LocalOrdinal
2522  replaceLocalValues (const LocalOrdinal localRow,
2523  const LocalOrdinal numEnt,
2524  const Scalar inputVals[],
2525  const LocalOrdinal inputCols[])
2526  {
2527  typedef impl_scalar_type IST;
2528  typedef LocalOrdinal LO;
2529 
2530  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2531  // Fill must be active and the "nonconst" graph must exist.
2532  return Teuchos::OrdinalTraits<LO>::invalid ();
2533  }
2534  const crs_graph_type& graph = * (this->staticGraph_);
2535  const RowInfo rowInfo = graph.getRowInfo (localRow);
2536 
2537  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2538  // The calling process does not own this row, so it is not
2539  // allowed to modify its values.
2540  return static_cast<LO> (0);
2541  }
      // Writable host view of this row's values.
2542  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
      // Reinterpret the Scalar input as impl_scalar_type for the Impl
      // routine.  Assumes the two types have the same representation --
      // TODO confirm.
2543  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2544  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2545  inputCols, inVals, numEnt);
2546  }
2547 
// Overwrite the values of existing entries in one row, addressed by global
// column index.
//
// The input indices are wrapped in an ArrayView and handed to
// graph.findGlobalIndices together with a callback.  For each invocation
// (k, start, offset) the callback stores newVals[k] into rowVals[offset];
// the `start` argument is ignored.  The return value is whatever
// findGlobalIndices returns.
//
// NOTE(review): the lines with the qualified function name and the first
// parameter are missing from this listing (numbering jumps 2549 -> 2552).
// The body writes through `rowVals`, presumably that dropped parameter;
// callers in this file pass `curRowVals.data ()` first -- confirm.
2548  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2549  LocalOrdinal
2552  const crs_graph_type& graph,
2553  const RowInfo& rowInfo,
2554  const GlobalOrdinal inds[],
2555  const impl_scalar_type newVals[],
2556  const LocalOrdinal numElts)
2557  {
2558  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2559  auto fun =
2560  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2561  rowVals[offset] = newVals[k];
2562  };
      // std::ref makes the std::function hold a reference to `fun` instead
      // of a copy of the closure.
2563  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2564  return graph.findGlobalIndices(rowInfo, indsT, cb);
2565  }
2566 
// Teuchos::ArrayView convenience overload of replaceGlobalValues.
//
// globalRow:       global index of the row to modify.
// inputGblColInds: global column indices of the entries to overwrite.
// inputVals:       replacement values; must match inputGblColInds in size.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the two views differ in
// size; otherwise returns the result of the raw-array overload.
//
// NOTE(review): the class-qualifier line (numbering 2569) is missing from
// this listing.
2567  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2568  LocalOrdinal
2570  replaceGlobalValues (const GlobalOrdinal globalRow,
2571  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2572  const Teuchos::ArrayView<const Scalar>& inputVals)
2573  {
2574  typedef LocalOrdinal LO;
2575 
2576  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2577  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2578  return Teuchos::OrdinalTraits<LO>::invalid ();
2579  }
      // The raw-array overload takes the values before the column indices.
2580  return this->replaceGlobalValues (globalRow, numInputEnt,
2581  inputVals.getRawPtr (),
2582  inputGblColInds.getRawPtr ());
2583  }
2584 
// Raw-array overload of replaceGlobalValues: overwrite existing entries of
// the row with global index globalRow.
//
// globalRow:       global index of the row to modify.
// numEnt:          number of entries in inputVals and inputGblColInds.
// inputVals:       replacement values.
// inputGblColInds: global column indices of the entries to overwrite.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
// staticGraph_ is null; 0 if getRowInfoFromGlobalRowIndex reports an invalid
// local row; otherwise the result of replaceGlobalValuesImpl.
//
// NOTE(review): the class-qualifier line (numbering 2587) is missing from
// this listing.
2585  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2586  LocalOrdinal
2588  replaceGlobalValues (const GlobalOrdinal globalRow,
2589  const LocalOrdinal numEnt,
2590  const Scalar inputVals[],
2591  const GlobalOrdinal inputGblColInds[])
2592  {
2593  typedef impl_scalar_type IST;
2594  typedef LocalOrdinal LO;
2595 
2596  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2597  // Fill must be active and the "nonconst" graph must exist.
2598  return Teuchos::OrdinalTraits<LO>::invalid ();
2599  }
2600  const crs_graph_type& graph = * (this->staticGraph_);
2601 
2602  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2603  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2604  // The input global row has no valid local row on the calling
2605  // process, so the calling process replaced 0 entries.
2606  return static_cast<LO> (0);
2607  }
2608 
      // Writable host view of this row's values.
2609  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
      // Reinterpret the Scalar input as impl_scalar_type for the Impl
      // routine.  Assumes the two types have the same representation --
      // TODO confirm.
2610  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2611  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2612  inputGblColInds, inVals, numEnt);
2613  }
// Kokkos::View overload that forwards to the raw-array replaceGlobalValues.
//
// globalRow: global index of the row to modify.
// inputInds: global column indices of the entries to overwrite.
// inputVals: replacement values; must have the same extent as inputInds.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the extents differ;
// otherwise returns the result of the raw-array overload.
//
// NOTE(review): the lines with the return-type qualifier and the qualified
// function name are missing from this listing (numbering gaps at 2616 and
// 2618-2619).
2615  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2617  local_ordinal_type
2620  const global_ordinal_type globalRow,
2621  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2622  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2623  {
      // NOTE(review): the comment below talks about a static_assert, but no
      // static_assert appears in this function as shown; it may be stale.
2624  // We use static_assert here to check the template parameters,
2625  // rather than std::enable_if (e.g., on the return value, to
2626  // enable compilation only if the template parameters match the
2627  // desired attributes). This turns obscure link errors into
2628  // clear compilation errors. It also makes the return value a
2629  // lot easier to see.
2630  using LO = local_ordinal_type;
2631  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2632  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2633  return Teuchos::OrdinalTraits<LO>::invalid();
2634  }
      // Reinterpret the impl_scalar_type data as Scalar for the raw-array
      // overload.  Assumes the two types have the same representation --
      // TODO confirm.
2635  const Scalar* const inVals =
2636  reinterpret_cast<const Scalar*>(inputVals.data());
2637  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2638  inputInds.data());
2639  }
2640 
// Add input values into existing entries of one row, addressed by global
// column index.
//
// For each j in [0, numElts), inds[j] is located in the row (after being
// translated to a local index when the graph is locally indexed) and
// newVals[j] is added to the matching position of rowVals.  Indices that
// cannot be translated or are not present in the row are skipped.
//
// atomic: if true the addition uses Kokkos::atomic_add, otherwise a plain +=.
//
// Returns the number of input indices that were found.  Returns 0 when the
// graph is locally indexed without a column Map, or when it is neither
// locally nor globally indexed.
//
// NOTE(review): the lines with the qualified function name and the first
// parameter are missing from this listing (numbering jumps 2642 -> 2645).
// The body writes through `rowVals`, presumably that dropped parameter;
// callers in this file pass `curRowVals.data ()` first -- confirm.
2641  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2642  LocalOrdinal
2645  const crs_graph_type& graph,
2646  const RowInfo& rowInfo,
2647  const GlobalOrdinal inds[],
2648  const impl_scalar_type newVals[],
2649  const LocalOrdinal numElts,
2650  const bool atomic)
2651  {
2652  typedef LocalOrdinal LO;
2653  typedef GlobalOrdinal GO;
2654 
2655  const bool sorted = graph.isSorted ();
2656 
2657  size_t hint = 0; // guess at the index's relative offset in the row
2658  LO numValid = 0; // number of valid input column indices
2659 
2660  if (graph.isLocallyIndexed ()) {
2661  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2662  // pointer does NOT change its reference count. Thus, this
2663  // code is still thread safe.
2664  if (graph.colMap_.is_null ()) {
2665  // NO input column indices are valid in this case, since if
2666  // the column Map is null on the calling process, then the
2667  // calling process owns no graph entries.
2668  return numValid;
2669  }
2670  const map_type& colMap = * (graph.colMap_);
2672  // Get a view of the column indices in the row. This amortizes
2673  // the cost of getting the view over all the entries of inds.
2674  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2675  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2676 
2677  for (LO j = 0; j < numElts; ++j) {
      // Translate the global input index to a local one; skip it if the
      // column Map returns the invalid marker.
2678  const LO lclColInd = colMap.getLocalElement (inds[j]);
2679  if (lclColInd != LINV) {
2680  const size_t offset =
2681  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2682  lclColInd, hint, sorted);
      // offset == rowInfo.numEntries is treated as "not found".
2683  if (offset != rowInfo.numEntries) {
2684  if (atomic) {
2685  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2686  }
2687  else {
2688  rowVals[offset] += newVals[j];
2689  }
      // Use the position after this match as the guess for the next lookup.
2690  hint = offset + 1;
2691  numValid++;
2692  }
2693  }
2694  }
2695  }
2696  else if (graph.isGloballyIndexed ()) {
2697  // Get a view of the column indices in the row. This amortizes
2698  // the cost of getting the view over all the entries of inds.
2699  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2700 
2701  for (LO j = 0; j < numElts; ++j) {
2702  const GO gblColInd = inds[j];
2703  const size_t offset =
2704  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2705  gblColInd, hint, sorted);
2706  if (offset != rowInfo.numEntries) {
2707  if (atomic) {
2708  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2709  }
2710  else {
2711  rowVals[offset] += newVals[j];
2712  }
2713  hint = offset + 1;
2714  numValid++;
2715  }
2716  }
2717  }
2718  // If the graph is neither locally nor globally indexed on the
2719  // calling process, that means the calling process has no graph
2720  // entries. Thus, none of the input column indices are valid.
2721 
2722  return numValid;
2723  }
2724 
// Teuchos::ArrayView convenience overload of sumIntoGlobalValues.
//
// gblRow:          global index of the row to modify.
// inputGblColInds: global column indices of the entries to add into.
// inputVals:       values to add; must match inputGblColInds in size.
// atomic:          forwarded unchanged to the raw-array overload.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the two views differ in
// size; otherwise returns the result of the raw-array overload.
//
// NOTE(review): the class-qualifier line (numbering 2727) is missing from
// this listing.
2725  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2726  LocalOrdinal
2728  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2729  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2730  const Teuchos::ArrayView<const Scalar>& inputVals,
2731  const bool atomic)
2732  {
2733  typedef LocalOrdinal LO;
2734 
2735  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2736  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2737  return Teuchos::OrdinalTraits<LO>::invalid ();
2738  }
      // The raw-array overload takes the values before the column indices.
2739  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2740  inputVals.getRawPtr (),
2741  inputGblColInds.getRawPtr (),
2742  atomic);
2743  }
2744 
// Raw-array overload of sumIntoGlobalValues: add values into the row with
// global index gblRow.
//
// gblRow:          global index of the row to modify.
// numInputEnt:     number of entries in inputVals and inputGblColInds.
// inputVals:       values to add.
// inputGblColInds: global column indices of the entries to add into.
// atomic:          forwarded to sumIntoGlobalValuesImpl for owned rows.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
// staticGraph_ is null.  If getRowInfoFromGlobalRowIndex reports an invalid
// local row, the input is handed to insertNonownedGlobalValues and
// numInputEnt is returned.  Otherwise returns the result of
// sumIntoGlobalValuesImpl.
//
// NOTE(review): the class-qualifier line (numbering 2747) is missing from
// this listing.
2745  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2746  LocalOrdinal
2748  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2749  const LocalOrdinal numInputEnt,
2750  const Scalar inputVals[],
2751  const GlobalOrdinal inputGblColInds[],
2752  const bool atomic)
2753  {
2754  typedef impl_scalar_type IST;
2755  typedef LocalOrdinal LO;
2756  typedef GlobalOrdinal GO;
2757 
2758  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2759  // Fill must be active and the "nonconst" graph must exist.
2760  return Teuchos::OrdinalTraits<LO>::invalid ();
2761  }
2762  const crs_graph_type& graph = * (this->staticGraph_);
2763 
2764  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2765  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2766  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be
2767  // thread safe in a debug build, in part because it uses
2768  // Teuchos::ArrayView, and in part because of the data structure
2769  // used to stash outgoing entries.
2770  using Teuchos::ArrayView;
      // Wrap the raw arrays in ArrayViews; a zero-length input is given a
      // null pointer instead of the caller's pointer.
2771  ArrayView<const GO> inputGblColInds_av(
2772  numInputEnt == 0 ? nullptr : inputGblColInds,
2773  numInputEnt);
2774  ArrayView<const Scalar> inputVals_av(
2775  numInputEnt == 0 ? nullptr :
2776  inputVals, numInputEnt);
2777  // gblRow is not in the row Map on the calling process, so stash
2778  // the given entries away in a separate data structure.
2779  // globalAssemble() (called during fillComplete()) will exchange
2780  // that data and sum it in using sumIntoGlobalValues().
2781  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2782  inputVals_av);
2783  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2784  // since we won't know whether the given indices were valid
2785  // until globalAssemble (called in fillComplete) is called.
2786  // That's why insertNonownedGlobalValues doesn't return
2787  // anything. Just for consistency, I'll return the number of
2788  // entries that the user gave us.
2789  return numInputEnt;
2790  }
2791  else { // input row is in the row Map on the calling process
      // Writable host view of this row's values.
2792  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
      // Reinterpret the Scalar input as impl_scalar_type for the Impl
      // routine.  Assumes the two types have the same representation --
      // TODO confirm.
2793  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2794  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2795  inputGblColInds, inVals,
2796  numInputEnt, atomic);
2797  }
2798  }
2799 
2800  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2801  LocalOrdinal
2802  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2803  transformLocalValues (const LocalOrdinal lclRow,
2804  const LocalOrdinal numInputEnt,
2805  const impl_scalar_type inputVals[],
2806  const LocalOrdinal inputCols[],
2807  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2808  const bool atomic)
2809  {
2810  using Tpetra::Details::OrdinalTraits;
2811  typedef LocalOrdinal LO;
2812 
2813  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2814  // Fill must be active and the "nonconst" graph must exist.
2815  return Teuchos::OrdinalTraits<LO>::invalid ();
2816  }
2817  const crs_graph_type& graph = * (this->staticGraph_);
2818  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2819 
2820  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2821  // The calling process does not own this row, so it is not
2822  // allowed to modify its values.
2823  return static_cast<LO> (0);
2824  }
2825  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2826  return this->transformLocalValues (curRowVals.data (), graph,
2827  rowInfo, inputCols, inputVals,
2828  numInputEnt, f, atomic);
2829  }
2830 
2831  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2832  LocalOrdinal
2833  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2834  transformGlobalValues (const GlobalOrdinal gblRow,
2835  const LocalOrdinal numInputEnt,
2836  const impl_scalar_type inputVals[],
2837  const GlobalOrdinal inputCols[],
2838  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2839  const bool atomic)
2840  {
2841  using Tpetra::Details::OrdinalTraits;
2842  typedef LocalOrdinal LO;
2843 
2844  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2845  // Fill must be active and the "nonconst" graph must exist.
2846  return OrdinalTraits<LO>::invalid ();
2847  }
2848  const crs_graph_type& graph = * (this->staticGraph_);
2849  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2850 
2851  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2852  // The calling process does not own this row, so it is not
2853  // allowed to modify its values.
2854  return static_cast<LO> (0);
2855  }
2856  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2857  return this->transformGlobalValues (curRowVals.data (), graph,
2858  rowInfo, inputCols, inputVals,
2859  numInputEnt, f, atomic);
2860  }
2861 
// Row-level transform: combine input values into existing entries of one
// row with the binary function f, addressed by local column index.
//
// For each j in [0, numElts), inds[j] is located in the row (after being
// translated to a global index when the graph is globally indexed); when it
// is found at position `offset`, rowVals[offset] becomes
// f (rowVals[offset], newVals[j]).  Indices that cannot be translated or
// are not present in the row are skipped.
//
// atomic: if true the update goes through atomic_binary_function_update,
//         otherwise it is a plain read-modify-write.
//
// Returns the number of input indices that were found.  Returns 0 when the
// graph is globally indexed without a column Map, or when it is neither
// locally nor globally indexed.
//
// NOTE(review): the lines with the qualified function name and the first
// parameter are missing from this listing (numbering jumps 2863 -> 2866).
// The body writes through `rowVals`, presumably that dropped parameter;
// the public transformLocalValues in this file passes `curRowVals.data ()`
// first -- confirm.
2862  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2863  LocalOrdinal
2866  const crs_graph_type& graph,
2867  const RowInfo& rowInfo,
2868  const LocalOrdinal inds[],
2869  const impl_scalar_type newVals[],
2870  const LocalOrdinal numElts,
2871  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2872  const bool atomic)
2873  {
2874  typedef impl_scalar_type ST;
2875  typedef LocalOrdinal LO;
2876  typedef GlobalOrdinal GO;
2877 
2878  //if (newVals.extent (0) != inds.extent (0)) {
2879  // The sizes of the input arrays must match.
2880  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2881  //}
2882  //const LO numElts = static_cast<LO> (inds.extent (0));
2883  const bool sorted = graph.isSorted ();
2884 
2885  LO numValid = 0; // number of valid input column indices
2886  size_t hint = 0; // Guess for the current index k into rowVals
2887 
2888  if (graph.isLocallyIndexed ()) {
2889  // Get a view of the column indices in the row. This amortizes
2890  // the cost of getting the view over all the entries of inds.
2891  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2892 
2893  for (LO j = 0; j < numElts; ++j) {
2894  const LO lclColInd = inds[j];
2895  const size_t offset =
2896  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2897  lclColInd, hint, sorted);
      // offset == rowInfo.numEntries is treated as "not found".
2898  if (offset != rowInfo.numEntries) {
2899  if (atomic) {
2900  // NOTE (mfh 30 Nov 2015) The commented-out code is
2901  // wrong because another thread may have changed
2902  // rowVals[offset] between those two lines of code.
2903  //
2904  //const ST newVal = f (rowVals[offset], newVals[j]);
2905  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2907  volatile ST* const dest = &rowVals[offset];
2908  (void) atomic_binary_function_update (dest, newVals[j], f);
2909  }
2910  else {
2911  // use binary function f
2912  rowVals[offset] = f (rowVals[offset], newVals[j]);
2913  }
      // Use the position after this match as the guess for the next lookup.
2914  hint = offset + 1;
2915  ++numValid;
2916  }
2917  }
2918  }
2919  else if (graph.isGloballyIndexed ()) {
2920  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2921  // pointer does NOT change its reference count. Thus, this
2922  // code is still thread safe.
2923  if (graph.colMap_.is_null ()) {
2924  // NO input column indices are valid in this case. Either
2925  // the column Map hasn't been set yet (so local indices
2926  // don't exist yet), or the calling process owns no graph
2927  // entries.
2928  return numValid;
2929  }
2930  const map_type& colMap = * (graph.colMap_);
2931  // Get a view of the column indices in the row. This amortizes
2932  // the cost of getting the view over all the entries of inds.
2933  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2934 
2935  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2936  for (LO j = 0; j < numElts; ++j) {
      // Translate the local input index to a global one; skip it if the
      // column Map returns the invalid marker.
2937  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2938  if (gblColInd != GINV) {
2939  const size_t offset =
2940  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2941  gblColInd, hint, sorted);
2942  if (offset != rowInfo.numEntries) {
2943  if (atomic) {
2944  // NOTE (mfh 30 Nov 2015) The commented-out code is
2945  // wrong because another thread may have changed
2946  // rowVals[offset] between those two lines of code.
2947  //
2948  //const ST newVal = f (rowVals[offset], newVals[j]);
2949  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2950 
2951  volatile ST* const dest = &rowVals[offset];
2952  (void) atomic_binary_function_update (dest, newVals[j], f);
2953  }
2954  else {
2955  // use binary function f
2956  rowVals[offset] = f (rowVals[offset], newVals[j]);
2957  }
2958  hint = offset + 1;
2959  numValid++;
2960  }
2961  }
2962  }
2963  }
2964  // If the graph is neither locally nor globally indexed on the
2965  // calling process, that means the calling process has no graph
2966  // entries. Thus, none of the input column indices are valid.
2967 
2968  return numValid;
2969  }
2970 
2971  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2972  LocalOrdinal
2973  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2974  transformGlobalValues (impl_scalar_type rowVals[],
2975  const crs_graph_type& graph,
2976  const RowInfo& rowInfo,
2977  const GlobalOrdinal inds[],
2978  const impl_scalar_type newVals[],
2979  const LocalOrdinal numElts,
2980  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2981  const bool atomic)
2982  {
2983  typedef impl_scalar_type ST;
2984  typedef LocalOrdinal LO;
2985  typedef GlobalOrdinal GO;
2986 
2987  //if (newVals.extent (0) != inds.extent (0)) {
2988  // The sizes of the input arrays must match.
2989  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2990  //}
2991  //const LO numElts = static_cast<LO> (inds.extent (0));
2992  const bool sorted = graph.isSorted ();
2993 
2994  LO numValid = 0; // number of valid input column indices
2995  size_t hint = 0; // Guess for the current index k into rowVals
2996 
2997  if (graph.isGloballyIndexed ()) {
2998  // Get a view of the column indices in the row. This amortizes
2999  // the cost of getting the view over all the entries of inds.
3000  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
3002  for (LO j = 0; j < numElts; ++j) {
3003  const GO gblColInd = inds[j];
3004  const size_t offset =
3005  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3006  gblColInd, hint, sorted);
3007  if (offset != rowInfo.numEntries) {
3008  if (atomic) {
3009  // NOTE (mfh 30 Nov 2015) The commented-out code is
3010  // wrong because another thread may have changed
3011  // rowVals[offset] between those two lines of code.
3012  //
3013  //const ST newVal = f (rowVals[offset], newVals[j]);
3014  //Kokkos::atomic_assign (&rowVals[offset], newVal);
3015 
3016  volatile ST* const dest = &rowVals[offset];
3017  (void) atomic_binary_function_update (dest, newVals[j], f);
3018  }
3019  else {
3020  // use binary function f
3021  rowVals[offset] = f (rowVals[offset], newVals[j]);
3022  }
3023  hint = offset + 1;
3024  ++numValid;
3025  }
3026  }
3027  }
3028  else if (graph.isLocallyIndexed ()) {
3029  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
3030  // pointer does NOT change its reference count. Thus, this
3031  // code is still thread safe.
3032  if (graph.colMap_.is_null ()) {
3033  // NO input column indices are valid in this case. Either the
3034  // column Map hasn't been set yet (so local indices don't
3035  // exist yet), or the calling process owns no graph entries.
3036  return numValid;
3037  }
3038  const map_type& colMap = * (graph.colMap_);
3039  // Get a view of the column indices in the row. This amortizes
3040  // the cost of getting the view over all the entries of inds.
3041  auto colInds = graph.getLocalIndsViewHost (rowInfo);
3043  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
3044  for (LO j = 0; j < numElts; ++j) {
3045  const LO lclColInd = colMap.getLocalElement (inds[j]);
3046  if (lclColInd != LINV) {
3047  const size_t offset =
3048  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3049  lclColInd, hint, sorted);
3050  if (offset != rowInfo.numEntries) {
3051  if (atomic) {
3052  // NOTE (mfh 30 Nov 2015) The commented-out code is
3053  // wrong because another thread may have changed
3054  // rowVals[offset] between those two lines of code.
3055  //
3056  //const ST newVal = f (rowVals[offset], newVals[j]);
3057  //Kokkos::atomic_assign (&rowVals[offset], newVal);
3058 
3059  volatile ST* const dest = &rowVals[offset];
3060  (void) atomic_binary_function_update (dest, newVals[j], f);
3061  }
3062  else {
3063  // use binary function f
3064  rowVals[offset] = f (rowVals[offset], newVals[j]);
3065  }
3066  hint = offset + 1;
3067  numValid++;
3068  }
3069  }
3070  }
3071  }
3072  // If the graph is neither locally nor globally indexed on the
3073  // calling process, that means the calling process has no graph
3074  // entries. Thus, none of the input column indices are valid.
3076  return numValid;
3077  }
3078 
// Add input values into existing entries of one row, addressed by local
// column index.
//
// For each j in [0, numElts), inds[j] is located in the row (after being
// translated to a global index when the graph is globally indexed) and
// newVals[j] is added to the matching position of rowVals.  Indices that
// cannot be translated or are not present in the row are skipped.
//
// atomic: if true the addition uses Kokkos::atomic_add, otherwise a plain +=.
//
// Returns the number of input indices that were found, or
// Teuchos::OrdinalTraits<LO>::invalid() when the graph is globally indexed
// but has no column Map.  If the graph is neither locally nor globally
// indexed, nothing is written and 0 is returned.
//
// NOTE(review): the lines with the qualified function name and the first
// parameter are missing from this listing (numbering jumps 3080 -> 3083).
// The body writes through `rowVals`, presumably that dropped parameter;
// callers in this file pass `curRowVals.data ()` first -- confirm.
3079  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3080  LocalOrdinal
3083  const crs_graph_type& graph,
3084  const RowInfo& rowInfo,
3085  const LocalOrdinal inds[],
3086  const impl_scalar_type newVals[],
3087  const LocalOrdinal numElts,
3088  const bool atomic)
3089  {
3090  typedef LocalOrdinal LO;
3091  typedef GlobalOrdinal GO;
3092 
3093  const bool sorted = graph.isSorted ();
3094 
3095  size_t hint = 0; // Guess for the current index k into rowVals
3096  LO numValid = 0; // number of valid local column indices
3097 
3098  if (graph.isLocallyIndexed ()) {
3099  // Get a view of the column indices in the row. This amortizes
3100  // the cost of getting the view over all the entries of inds.
3101  auto colInds = graph.getLocalIndsViewHost (rowInfo);
3102 
3103  for (LO j = 0; j < numElts; ++j) {
3104  const LO lclColInd = inds[j];
3105  const size_t offset =
3106  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3107  lclColInd, hint, sorted);
      // offset == rowInfo.numEntries is treated as "not found".
3108  if (offset != rowInfo.numEntries) {
3109  if (atomic) {
3110  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3111  }
3112  else {
3113  rowVals[offset] += newVals[j];
3114  }
      // Use the position after this match as the guess for the next lookup.
3115  hint = offset + 1;
3116  ++numValid;
3117  }
3118  }
3119  }
3120  else if (graph.isGloballyIndexed ()) {
      // The input local indices must be translated to global indices,
      // which needs a column Map.
3121  if (graph.colMap_.is_null ()) {
3122  return Teuchos::OrdinalTraits<LO>::invalid ();
3123  }
      // NOTE(review): this copy-initializes a map_type object from the
      // column Map; other functions in this file bind `const map_type&`
      // instead.  Consider a reference here.
3124  const map_type colMap = * (graph.colMap_);
3125 
3126  // Get a view of the column indices in the row. This amortizes
3127  // the cost of getting the view over all the entries of inds.
3128  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
3129 
3130  for (LO j = 0; j < numElts; ++j) {
3131  const GO gblColInd = colMap.getGlobalElement (inds[j]);
      // Skip input indices that the column Map cannot translate.
3132  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3133  const size_t offset =
3134  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3135  gblColInd, hint, sorted);
3136  if (offset != rowInfo.numEntries) {
3137  if (atomic) {
3138  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3139  }
3140  else {
3141  rowVals[offset] += newVals[j];
3142  }
3143  hint = offset + 1;
3144  ++numValid;
3145  }
3146  }
3147  }
3148  }
3149  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3150  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3151  // to be neither locally nor globally indexed on a process.
3152  // This means that the graph or matrix has no entries on that
3153  // process. Epetra also works like this. It's related to lazy
3154  // allocation (on first insertion, not at graph / matrix
3155  // construction). Lazy allocation will go away because it is
3156  // not thread scalable.
3158  return numValid;
3159  }
3160 
// Teuchos::ArrayView convenience overload of sumIntoLocalValues.
//
// localRow: local index of the row to modify.
// indices:  local column indices of the entries to add into.
// values:   values to add; must have as many entries as indices.
// atomic:   forwarded unchanged to the raw-array overload.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the two views differ in
// size; otherwise returns the result of the raw-array overload.
//
// NOTE(review): the class-qualifier line (numbering 3163) is missing from
// this listing.
3161  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3162  LocalOrdinal
3164  sumIntoLocalValues (const LocalOrdinal localRow,
3165  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3166  const Teuchos::ArrayView<const Scalar>& values,
3167  const bool atomic)
3168  {
3169  using LO = local_ordinal_type;
3170  const LO numInputEnt = static_cast<LO>(indices.size());
3171  if (static_cast<LO>(values.size()) != numInputEnt) {
3172  return Teuchos::OrdinalTraits<LO>::invalid();
3173  }
3174  const LO* const inputInds = indices.getRawPtr();
3175  const scalar_type* const inputVals = values.getRawPtr();
      // The raw-array overload takes the values before the column indices.
3176  return this->sumIntoLocalValues(localRow, numInputEnt,
3177  inputVals, inputInds, atomic);
3178  }
3179 
// Kokkos::View overload that forwards to the raw-array sumIntoLocalValues.
//
// localRow:  local index of the row to modify.
// inputInds: local column indices of the entries to add into.
// inputVals: values to add; must have the same extent as inputInds.
// atomic:    forwarded unchanged to the raw-array overload.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if the extents differ;
// otherwise returns the result of the raw-array overload.
//
// NOTE(review): the lines with the return-type qualifier and the qualified
// function name are missing from this listing (numbering gaps at 3181 and
// 3183-3184).
3180  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3182  local_ordinal_type
3185  const local_ordinal_type localRow,
3186  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3187  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3188  const bool atomic)
3189  {
3190  using LO = local_ordinal_type;
3191  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3192  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3193  return Teuchos::OrdinalTraits<LO>::invalid();
3194  }
      // Reinterpret the impl_scalar_type data as scalar_type for the
      // raw-array overload.  Assumes the two types have the same
      // representation -- TODO confirm.
3195  const scalar_type* inVals =
3196  reinterpret_cast<const scalar_type*>(inputVals.data());
3197  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3198  inputInds.data(), atomic);
3199  }
3200 
// Raw-array overload of sumIntoLocalValues: add values into existing
// entries of one locally indexed row.
//
// localRow: local index of the row to modify.
// numEnt:   number of entries in vals and cols.
// vals:     values to add.
// cols:     local column indices of the entries to add into.
// atomic:   forwarded unchanged to sumIntoLocalValuesImpl.
//
// Returns Teuchos::OrdinalTraits<LO>::invalid() if fill is not active or
// staticGraph_ is null; 0 if getRowInfo reports an invalid local row;
// otherwise the result of sumIntoLocalValuesImpl.
//
// NOTE(review): the class-qualifier line (numbering 3203) is missing from
// this listing.
3201  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3202  LocalOrdinal
3204  sumIntoLocalValues (const LocalOrdinal localRow,
3205  const LocalOrdinal numEnt,
3206  const Scalar vals[],
3207  const LocalOrdinal cols[],
3208  const bool atomic)
3209  {
3210  typedef impl_scalar_type IST;
3211  typedef LocalOrdinal LO;
3212 
3213  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3214  // Fill must be active and the "nonconst" graph must exist.
3215  return Teuchos::OrdinalTraits<LO>::invalid ();
3216  }
3217  const crs_graph_type& graph = * (this->staticGraph_);
3218  const RowInfo rowInfo = graph.getRowInfo (localRow);
3219 
3220  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3221  // The calling process does not own this row, so it is not
3222  // allowed to modify its values.
3223  return static_cast<LO> (0);
3224  }
      // Writable host view of this row's values.
3225  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
      // Reinterpret the Scalar input as impl_scalar_type for the Impl
      // routine.  Assumes the two types have the same representation --
      // TODO confirm.
3226  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3227  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3228  cols, inputVals, numEnt, atomic);
3229  }
3230 
// Return a read-only host view of the values belonging to one row.
//
// rowinfo: supplies the row's starting offset (offset1D) and allocated
//          length (allocSize) within valuesUnpacked_wdv.
//
// Returns a default-constructed view if rowinfo.allocSize is 0 or
// valuesUnpacked_wdv has extent 0; otherwise the host subview requested
// with (offset1D, allocSize, Access::ReadOnly).
//
// NOTE(review): the qualifier lines of the return type and of the function
// name (numbering 3232 and 3234) are missing from this listing.
3231  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3233  values_dualv_type::t_host::const_type
3235  getValuesViewHost (const RowInfo& rowinfo) const
3236  {
3237  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3238  return typename values_dualv_type::t_host::const_type ();
3239  else
3240  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3241  rowinfo.allocSize,
3242  Access::ReadOnly);
3243  }
3244 
// Return a writable host view of the values belonging to one row.
//
// rowinfo: supplies the row's starting offset (offset1D) and allocated
//          length (allocSize) within valuesUnpacked_wdv.
//
// Returns a default-constructed view if rowinfo.allocSize is 0 or
// valuesUnpacked_wdv has extent 0; otherwise the host subview requested
// with (offset1D, allocSize, Access::ReadWrite).
//
// NOTE(review): the qualifier lines of the return type and of the function
// name (numbering 3246 and 3248) are missing from this listing.
3245  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3247  values_dualv_type::t_host
3249  getValuesViewHostNonConst (const RowInfo& rowinfo)
3250  {
3251  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3252  return typename values_dualv_type::t_host ();
3253  else
3254  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3255  rowinfo.allocSize,
3256  Access::ReadWrite);
3257  }
3258 
// Return a read-only device view of the values belonging to one row.
//
// rowinfo: supplies the row's starting offset (offset1D) and allocated
//          length (allocSize) within valuesUnpacked_wdv.
//
// Returns a default-constructed view if rowinfo.allocSize is 0 or
// valuesUnpacked_wdv has extent 0; otherwise the device subview requested
// with (offset1D, allocSize, Access::ReadOnly).
//
// NOTE(review): the qualifier lines of the return type and of the function
// name (numbering 3260 and 3262) are missing from this listing.
3259  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3261  values_dualv_type::t_dev::const_type
3263  getValuesViewDevice (const RowInfo& rowinfo) const
3264  {
3265  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3266  return typename values_dualv_type::t_dev::const_type ();
3267  else
3268  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3269  rowinfo.allocSize,
3270  Access::ReadOnly);
3271  }
3272 
// Return a writable device view of the values belonging to one row.
//
// rowinfo: supplies the row's starting offset (offset1D) and allocated
//          length (allocSize) within valuesUnpacked_wdv.
//
// Returns a default-constructed view if rowinfo.allocSize is 0 or
// valuesUnpacked_wdv has extent 0; otherwise the device subview requested
// with (offset1D, allocSize, Access::ReadWrite).
//
// NOTE(review): the qualifier lines of the return type and of the function
// name (numbering 3274 and 3276) are missing from this listing.
3273  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3275  values_dualv_type::t_dev
3277  getValuesViewDeviceNonConst (const RowInfo& rowinfo)
3278  {
3279  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3280  return typename values_dualv_type::t_dev ();
3281  else
3282  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3283  rowinfo.allocSize,
3284  Access::ReadWrite);
3285  }
3286 
3287 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated: Teuchos::ArrayView over the values stored for one row.
//
// rowinfo supplies the row's offset (offset1D) and allocated length
// (allocSize) in valuesUnpacked_wdv.
//
// Returns an ArrayView of allocSize entries pointing at the data of a
// ReadOnly host subview, or an empty ArrayView when the row has no
// allocation or valuesUnpacked_wdv has zero extent.  The host subview
// object is a local that goes out of scope on return, so the ArrayView
// holds only a raw pointer (see the "unsafe" remark below).
//
// In HAVE_TPETRA_DEBUG builds, a row range that runs past the end of
// valuesUnpacked_wdv is reported as std::range_error.
3288  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3289  Teuchos::ArrayView<const typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>
3291  getView (RowInfo rowinfo) const
3292  {
3293  using Kokkos::MemoryUnmanaged;
3294  using Kokkos::View;
3295  using Teuchos::ArrayView;
3296  using ST = impl_scalar_type;
3297  using range_type = std::pair<size_t, size_t>;
3298 
3299  if (valuesUnpacked_wdv.extent (0) != 0 && rowinfo.allocSize > 0) {
3300 
3301 #ifdef HAVE_TPETRA_DEBUG
3302  TEUCHOS_TEST_FOR_EXCEPTION(
3303  rowinfo.offset1D + rowinfo.allocSize > valuesUnpacked_wdv.extent (0),
3304  std::range_error, "Tpetra::CrsMatrix::getView: Invalid access "
3305  "to 1-D storage of values." << std::endl << "rowinfo.offset1D (" <<
3306  rowinfo.offset1D << ") + rowinfo.allocSize (" << rowinfo.allocSize <<
3307  ") > valuesUnpacked_wdv.extent(0) (" << valuesUnpacked_wdv.extent (0)
3308  << ").");
3309 #endif // HAVE_TPETRA_DEBUG
3310 
// NOTE(review): `range` is not referenced again in this function; it
// looks like a leftover from an earlier implementation.
3311  range_type range (rowinfo.offset1D, rowinfo.offset1D + rowinfo.allocSize);
// NOTE(review): the comment below describes an unmanaged-subview
// strategy on k_values1D_, but the code that follows simply asks
// valuesUnpacked_wdv for a host subview -- the comment appears stale.
3312  // mfh 23 Nov 2015: Don't just create a subview of k_values1D_
3313  // directly, because that first creates a _managed_ subview,
3314  // then returns an unmanaged version of that. That touches the
3315  // reference count, which costs performance in a measurable way.
3316  // Instead, we create a temporary unmanaged view, then create
3317  // the subview from that.
3318  // KDDKDD UVM REMOVAL This method is unsafe and deprecated
3319  auto sv = valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3320  rowinfo.allocSize,
3321  Access::ReadOnly);
// allocSize > 0 is already guaranteed by the enclosing `if`, so the
// nullptr arm of this conditional is not taken here.
3322  const ST* const sv_raw = (rowinfo.allocSize == 0) ? nullptr : sv.data ();
3323  return ArrayView<const ST> (sv_raw, rowinfo.allocSize);
3324  }
3325  else {
3326  return ArrayView<impl_scalar_type> ();
3327  }
3328  }
3329 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3330 
3331 
// Copy one row's local column indices and values into caller-provided
// host views (Kokkos-view overload).
//
// NOTE(review): the declarator lines carrying the class qualifier, the
// method name and the first parameter are absent from this listing; the
// name is taken from tfecfFuncName and the first parameter (localRow)
// from its uses in the body.
//
// indices    - output; receives the local column indices.
// values     - output; receives the matching values.
// numEntries - output; set to the number of entries in the row.
//
// Reports std::runtime_error when the matrix has no column Map, or when
// either output view is smaller than the row's entry count.
3332  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3333  void
3336  nonconst_local_inds_host_view_type &indices,
3337  nonconst_values_host_view_type &values,
3338  size_t& numEntries) const
3339  {
3340  using Teuchos::ArrayView;
3341  using Teuchos::av_reinterpret_cast;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3342  const char tfecfFuncName[] = "getLocalRowCopy: ";
3343 
3344  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3345  (! this->hasColMap (), std::runtime_error,
3346  "The matrix does not have a column Map yet. This means we don't have "
3347  "local indices for columns yet, so it doesn't make sense to call this "
3348  "method. If the matrix doesn't have a column Map yet, you should call "
3349  "fillComplete on it first.");
3350 
3351  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3352  const size_t theNumEntries = rowinfo.numEntries;
3353  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3354  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3355  static_cast<size_t> (values.size ()) < theNumEntries,
3356  std::runtime_error, "Row with local index " << localRow << " has " <<
3357  theNumEntries << " entry/ies, but indices.size() = " <<
3358  indices.size () << " and values.size() = " << values.size () << ".");
3359  numEntries = theNumEntries; // first side effect
3360 
// Nothing is copied when the row is not valid on this process.
3361  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3362  if (staticGraph_->isLocallyIndexed ()) {
// Local indices are stored directly: straight copy.
3363  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3364  auto curVals = getValuesViewHost(rowinfo);
3365 
3366  for (size_t j = 0; j < theNumEntries; ++j) {
3367  values[j] = curVals[j];
3368  indices[j] = curLclInds(j);
3369  }
3370  }
3371  else if (staticGraph_->isGloballyIndexed ()) {
// Global indices are stored: translate each one through the column Map.
3372  // Don't call getColMap(), because it touches RCP's reference count.
3373  const map_type& colMap = * (staticGraph_->colMap_);
3374  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3375  auto curVals = getValuesViewHost(rowinfo);
3376 
3377  for (size_t j = 0; j < theNumEntries; ++j) {
3378  values[j] = curVals[j];
3379  indices[j] = colMap.getLocalElement (curGblInds(j));
3380  }
3381  }
3382  }
3383  }
3384 
3385 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated Teuchos::ArrayView overload: copy one row's local column
// indices and values into caller-provided arrays.
//
// localRow   - local index of the row to copy.
// indices    - output; receives the local column indices.
// values     - output; receives the matching values.
// numEntries - output; set to the number of entries in the row.
//
// Reports std::runtime_error when the matrix has no column Map, or when
// either output array is shorter than the row's entry count.
3386  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3387  void
3389  getLocalRowCopy (LocalOrdinal localRow,
3390  const Teuchos::ArrayView<LocalOrdinal>& indices,
3391  const Teuchos::ArrayView<Scalar>& values,
3392  size_t& numEntries) const
3393  {
3394  using Teuchos::ArrayView;
3395  using Teuchos::av_reinterpret_cast;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3396  const char tfecfFuncName[] = "getLocalRowCopy: ";
3397 
3398  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3399  (! this->hasColMap (), std::runtime_error,
3400  "The matrix does not have a column Map yet. This means we don't have "
3401  "local indices for columns yet, so it doesn't make sense to call this "
3402  "method. If the matrix doesn't have a column Map yet, you should call "
3403  "fillComplete on it first.");
3404 
3405  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3406  const size_t theNumEntries = rowinfo.numEntries;
3407  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3408  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3409  static_cast<size_t> (values.size ()) < theNumEntries,
3410  std::runtime_error, "Row with local index " << localRow << " has " <<
3411  theNumEntries << " entry/ies, but indices.size() = " <<
3412  indices.size () << " and values.size() = " << values.size () << ".");
3413  numEntries = theNumEntries; // first side effect
3414 
// Nothing is copied when the row is not valid on this process.
3415  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3416  if (staticGraph_->isLocallyIndexed ()) {
// Local indices are stored directly: straight copy.
3417  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3418  auto curVals = getValuesViewHost(rowinfo);
3419 
3420  for (size_t j = 0; j < theNumEntries; ++j) {
3421  values[j] = curVals[j];
3422  indices[j] = curLclInds(j);
3423  }
3424  }
3425  else if (staticGraph_->isGloballyIndexed ()) {
// Global indices are stored: translate each one through the column Map.
3426  // Don't call getColMap(), because it touches RCP's reference count.
3427  const map_type& colMap = * (staticGraph_->colMap_);
3428  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3429  auto curVals = getValuesViewHost(rowinfo);
3430 
3431  for (size_t j = 0; j < theNumEntries; ++j) {
3432  values[j] = curVals[j];
3433  indices[j] = colMap.getLocalElement (curGblInds(j));
3434  }
3435  }
3436  }
3437  }
3438 #endif
3439 
// Copy one row's global column indices and values into caller-provided
// host views (Kokkos-view overload).
//
// NOTE(review): the declarator lines carrying the class qualifier, the
// method name and the first parameter are absent from this listing; the
// name is taken from tfecfFuncName and the first parameter (globalRow)
// from its uses in the body.
//
// indices    - output; receives the global column indices.
// values     - output; receives the matching values.
// numEntries - output; set to the number of entries in the row.
//
// Reports std::runtime_error when either output view is smaller than
// the row's entry count.
3440 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3441 void
3444  nonconst_global_inds_host_view_type &indices,
3445  nonconst_values_host_view_type &values,
3446  size_t& numEntries) const
3447  {
3448  using Teuchos::ArrayView;
3449  using Teuchos::av_reinterpret_cast;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3450  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3451 
3452  const RowInfo rowinfo =
3453  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3454  const size_t theNumEntries = rowinfo.numEntries;
3455  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3456  static_cast<size_t> (indices.size ()) < theNumEntries ||
3457  static_cast<size_t> (values.size ()) < theNumEntries,
3458  std::runtime_error, "Row with global index " << globalRow << " has "
3459  << theNumEntries << " entry/ies, but indices.size() = " <<
3460  indices.size () << " and values.size() = " << values.size () << ".");
3461  numEntries = theNumEntries; // first side effect
3462 
// Nothing is copied when the row is not valid on this process.
3463  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3464  if (staticGraph_->isLocallyIndexed ()) {
// Local indices are stored: translate each one to a global index
// through the column Map.
3465  const map_type& colMap = * (staticGraph_->colMap_);
3466  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3467  auto curVals = getValuesViewHost(rowinfo);
3468 
3469  for (size_t j = 0; j < theNumEntries; ++j) {
3470  values[j] = curVals[j];
3471  indices[j] = colMap.getGlobalElement (curLclInds(j));
3472  }
3473  }
3474  else if (staticGraph_->isGloballyIndexed ()) {
// Global indices are stored directly: straight copy.
3475  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3476  auto curVals = getValuesViewHost(rowinfo);
3477 
3478  for (size_t j = 0; j < theNumEntries; ++j) {
3479  values[j] = curVals[j];
3480  indices[j] = curGblInds(j);
3481  }
3482  }
3483  }
3484  }
3485 
3486 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated Teuchos::ArrayView overload: copy one row's global column
// indices and values into caller-provided arrays.
//
// globalRow  - global index of the row to copy.
// indices    - output; receives the global column indices.
// values     - output; receives the matching values.
// numEntries - output; set to the number of entries in the row.
//
// Reports std::runtime_error when either output array is shorter than
// the row's entry count.
3487  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3488  void
3490  getGlobalRowCopy (GlobalOrdinal globalRow,
3491  const Teuchos::ArrayView<GlobalOrdinal>& indices,
3492  const Teuchos::ArrayView<Scalar>& values,
3493  size_t& numEntries) const
3494  {
3495  using Teuchos::ArrayView;
3496  using Teuchos::av_reinterpret_cast;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3497  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3498 
3499  const RowInfo rowinfo =
3500  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3501  const size_t theNumEntries = rowinfo.numEntries;
3502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3503  static_cast<size_t> (indices.size ()) < theNumEntries ||
3504  static_cast<size_t> (values.size ()) < theNumEntries,
3505  std::runtime_error, "Row with global index " << globalRow << " has "
3506  << theNumEntries << " entry/ies, but indices.size() = " <<
3507  indices.size () << " and values.size() = " << values.size () << ".");
3508  numEntries = theNumEntries; // first side effect
3509 
// Nothing is copied when the row is not valid on this process.
3510  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3511  if (staticGraph_->isLocallyIndexed ()) {
// Local indices are stored: translate each one to a global index
// through the column Map.
3512  const map_type& colMap = * (staticGraph_->colMap_);
3513  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3514  auto curVals = getValuesViewHost(rowinfo);
3515 
3516  for (size_t j = 0; j < theNumEntries; ++j) {
3517  values[j] = curVals[j];
3518  indices[j] = colMap.getGlobalElement (curLclInds(j));
3519  }
3520  }
3521  else if (staticGraph_->isGloballyIndexed ()) {
// Global indices are stored directly: straight copy.
3522  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3523  auto curVals = getValuesViewHost(rowinfo);
3524 
3525  for (size_t j = 0; j < theNumEntries; ++j) {
3526  values[j] = curVals[j];
3527  indices[j] = curGblInds(j);
3528  }
3529  }
3530  }
3531  }
3532 #endif
3533 
// Get read-only host views of one row's local column indices and values
// (Kokkos-view overload).
//
// localRow - local index of the row.
// indices  - output; set to a ReadOnly host subview of the graph's
//            lclIndsUnpacked_wdv covering the row's numEntries entries.
// values   - output; set to the matching ReadOnly host subview of
//            valuesUnpacked_wdv.
//
// Both outputs are set to empty views when the row is invalid on this
// process or has no entries.  Reports std::runtime_error when the matrix
// is globally indexed.
3534  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3535  void
3537  getLocalRowView(LocalOrdinal localRow,
3538  local_inds_host_view_type &indices,
3539  values_host_view_type &values) const
3540  {
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3541  const char tfecfFuncName[] = "getLocalRowView: ";
3543  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3544  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3545  "its indices as global indices, so you cannot get a view with local "
3546  "column indices. If the matrix has a column Map, you may call "
3547  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3548  "a view with global column indices by calling getGlobalRowCopy().");
3549 
3550  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3551  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3552  rowInfo.numEntries > 0) {
// Indices and values share the same offset and length, so entry k of
// one view corresponds to entry k of the other.
3553  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3554  rowInfo.offset1D,
3555  rowInfo.numEntries,
3556  Access::ReadOnly);
3557  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3558  rowInfo.numEntries,
3559  Access::ReadOnly);
3560  }
3561  else {
3562  // This does the right thing (reports an empty row) if the input
3563  // row is invalid.
3564  indices = local_inds_host_view_type();
3565  values = values_host_view_type();
3566  }
3567 
// Debug-only consistency checks on the sizes of the returned views.
3568 #ifdef HAVE_TPETRA_DEBUG
3569  const char suffix[] = ". This should never happen. Please report this "
3570  "bug to the Tpetra developers.";
3571  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3572  (static_cast<size_t> (indices.size ()) !=
3573  static_cast<size_t> (values.size ()), std::logic_error,
3574  "At the end of this method, for local row " << localRow << ", "
3575  "indices.size() = " << indices.size () << " != values.size () = "
3576  << values.size () << suffix);
3577  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3578  (static_cast<size_t> (indices.size ()) !=
3579  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3580  "At the end of this method, for local row " << localRow << ", "
3581  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3582  << rowInfo.numEntries << suffix);
3583  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3584  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3585  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3586  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3587  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3588  expectedNumEntries << suffix);
3589 #endif // HAVE_TPETRA_DEBUG
3590  }
3591 
3592 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated Teuchos::ArrayView overload: get views of one row's local
// column indices and values.
//
// localRow - local index of the row.
// indices  - output; first set to Teuchos::null, then (for a valid,
//            non-empty row) to the first numEntries entries of the
//            graph's local view of the row.
// values   - output; first set to Teuchos::null, then (for a valid,
//            non-empty row) to the first numEntries entries of
//            getView(rowinfo), reinterpreted as Scalar.
//
// Reports std::runtime_error when the matrix is globally indexed.
3593  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3594  void
3596  getLocalRowView (LocalOrdinal localRow,
3597  Teuchos::ArrayView<const LocalOrdinal>& indices,
3598  Teuchos::ArrayView<const Scalar>& values) const
3599  {
3600  using Teuchos::ArrayView;
3601  using Teuchos::av_reinterpret_cast;
3602  typedef LocalOrdinal LO;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3603  const char tfecfFuncName[] = "getLocalRowView: ";
3604 
3605  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3606  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3607  "its indices as global indices, so you cannot get a view with local "
3608  "column indices. If the matrix has a column Map, you may call "
3609  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3610  "a view with global column indices by calling getGlobalRowCopy().");
// Start from empty outputs so an invalid or empty row reports nothing.
3611  indices = Teuchos::null;
3612  values = Teuchos::null;
3613  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3614  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3615  rowinfo.numEntries > 0) {
3616  ArrayView<const LO> indTmp = staticGraph_->getLocalView (rowinfo);
3617  ArrayView<const Scalar> valTmp =
3618  av_reinterpret_cast<const Scalar> (this->getView (rowinfo));
// Trim both views to the entries actually in use.
3619  indices = indTmp (0, rowinfo.numEntries);
3620  values = valTmp (0, rowinfo.numEntries);
3621  }
3622 
// Debug-only consistency checks on the sizes of the returned views.
3623 #ifdef HAVE_TPETRA_DEBUG
3624  const char suffix[] = ". This should never happen. Please report this "
3625  "bug to the Tpetra developers.";
3626  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3627  (static_cast<size_t> (indices.size ()) !=
3628  static_cast<size_t> (values.size ()), std::logic_error,
3629  "At the end of this method, for local row " << localRow << ", "
3630  "indices.size() = " << indices.size () << " != values.size () = "
3631  << values.size () << suffix);
3632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3633  (static_cast<size_t> (indices.size ()) !=
3634  static_cast<size_t> (rowinfo.numEntries), std::logic_error,
3635  "At the end of this method, for local row " << localRow << ", "
3636  "indices.size() = " << indices.size () << " != rowinfo.numEntries = "
3637  << rowinfo.numEntries << suffix);
3638  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3640  (rowinfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3641  "of this method, for local row " << localRow << ", rowinfo.numEntries = "
3642  << rowinfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3643  expectedNumEntries << suffix);
3644 #endif // HAVE_TPETRA_DEBUG
3645  }
3646 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3647 
3648 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated raw-pointer overload: get pointers to one row's values and
// local column indices.
//
// lclRow - local index of the row.
// numEnt - output; number of entries in the row (0 for an invalid row).
// val    - output; pointer to the row's values (nullptr for an invalid
//          row).
// ind    - output; pointer to the row's local column indices (nullptr
//          for an invalid row).
//
// Returns an error code:
//   * OrdinalTraits<LO>::invalid() when the matrix has no graph or the
//     graph is globally indexed (the outputs are left untouched);
//   * 1 when lclRow is not a valid row on the calling process;
//   * 0 on success.
//
// The returned pointers come from host views that are locals of this
// function, so they do not keep the underlying storage alive (see the
// KDDKDD remarks below).
3649  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3650  LocalOrdinal
3652  getLocalRowView (const LocalOrdinal lclRow,
3653  LocalOrdinal& numEnt,
3654  const impl_scalar_type*& val,
3655  const LocalOrdinal*& ind) const
3656  {
3657  typedef LocalOrdinal LO;
3658 
3659  // Don't call getCrsGraph(), because that modifies an RCP reference
3660  // count, which is not thread safe. Checking whether an RCP is
3661  // null does NOT modify its reference count, and is therefore
3662  // thread safe. Note that isGloballyIndexed() calls
3663  // getCrsGraph(), so we have to go to the graph directly.
3664  if (staticGraph_.is_null () || staticGraph_->isGloballyIndexed ()) {
3665  return Tpetra::Details::OrdinalTraits<LO>::invalid ();
3666  }
3667  else {
3668  const RowInfo rowInfo = staticGraph_->getRowInfo (lclRow);
3669  if (rowInfo.localRow == Tpetra::Details::OrdinalTraits<size_t>::invalid ()) {
3670  numEnt = 0; // no valid entries in this row on the calling process
3671  val = nullptr;
3672  ind = nullptr;
3673  // First argument (lclRow) invalid, so make 1 the error code.
3674  return static_cast<LO> (1);
3675  }
3676  else {
3677  numEnt = static_cast<LO> (rowInfo.numEntries);
3678  auto lclColInds = staticGraph_->getLocalIndsViewHost (rowInfo);
3679  // KDDKDD UVM Breaks reference counting; unsafe
3680  ind = lclColInds.data ();
3681 
3682  auto values = getValuesViewHost (rowInfo);
3683  // KDDKDD UVM Breaks reference counting; unsafe
3684  val = values.data();
// Success is reported as 0.  Returning the view's extent here would
// make a row with a single allocated slot indistinguishable from the
// "invalid row" code 1 above, and would report every non-empty row as
// a nonzero error code.
3685  return static_cast<LO> (0);
3686  }
3687  }
3688  }
3689 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3690 
3691 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated: same as the raw-pointer getLocalRowView, but hands back
// the values as const Scalar* instead of const impl_scalar_type*.
//
// lclRow     - local index of the row.
// numEnt     - output; forwarded to getLocalRowView.
// lclColInds - output; pointer to the row's local column indices.
// vals       - output; pointer to the row's values, reinterpreted as
//              Scalar.
//
// Returns whatever getLocalRowView returns (kept in errCode).
3692  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3693  LocalOrdinal
3694  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
3695  getLocalRowViewRaw (const LocalOrdinal lclRow,
3696  LocalOrdinal& numEnt,
3697  const LocalOrdinal*& lclColInds,
3698  const Scalar*& vals) const
3699  {
3700  const impl_scalar_type* vals_ist = nullptr;
// Note the argument order: getLocalRowView takes (values, indices),
// whereas this method's own signature is (indices, values).
3701  const LocalOrdinal errCode =
3702  this->getLocalRowView (lclRow, numEnt, vals_ist, lclColInds);
3703  vals = reinterpret_cast<const Scalar*> (vals_ist);
3704  return errCode;
3705  }
3706 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3707 
// Get read-only host views of one row's global column indices and
// values (Kokkos-view overload).
//
// globalRow - global index of the row.
// indices   - output; set to a ReadOnly host subview of the graph's
//             gblInds_wdv covering the row's numEntries entries.
// values    - output; set to the matching ReadOnly host subview of
//             valuesUnpacked_wdv.
//
// Both outputs are set to empty views when the row is invalid on this
// process or has no entries.  Reports std::runtime_error when the matrix
// is locally indexed.
3708  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3709  void
3711  getGlobalRowView (GlobalOrdinal globalRow,
3712  global_inds_host_view_type &indices,
3713  values_host_view_type &values) const
3714  {
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3715  const char tfecfFuncName[] = "getGlobalRowView: ";
3716 
3717  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3718  isLocallyIndexed (), std::runtime_error,
3719  "The matrix is locally indexed, so we cannot return a view of the row "
3720  "with global column indices. Use getGlobalRowCopy() instead.");
3721 
3722  // This does the right thing (reports an empty row) if the input
3723  // row is invalid.
3724  const RowInfo rowInfo =
3725  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3726  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3727  rowInfo.numEntries > 0) {
// Indices and values share the same offset and length, so entry k of
// one view corresponds to entry k of the other.
3728  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3729  rowInfo.numEntries,
3730  Access::ReadOnly);
3731  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3732  rowInfo.numEntries,
3733  Access::ReadOnly);
3734  }
3735  else {
3736  indices = global_inds_host_view_type();
3737  values = values_host_view_type();
3738  }
3739 
// Debug-only consistency checks on the sizes of the returned views.
3740 #ifdef HAVE_TPETRA_DEBUG
3741  const char suffix[] = ". This should never happen. Please report this "
3742  "bug to the Tpetra developers.";
3743  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3744  (static_cast<size_t> (indices.size ()) !=
3745  static_cast<size_t> (values.size ()), std::logic_error,
3746  "At the end of this method, for global row " << globalRow << ", "
3747  "indices.size() = " << indices.size () << " != values.size () = "
3748  << values.size () << suffix);
3749  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3750  (static_cast<size_t> (indices.size ()) !=
3751  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3752  "At the end of this method, for global row " << globalRow << ", "
3753  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3754  << rowInfo.numEntries << suffix);
3755  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3757  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3758  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3759  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3760  " " << expectedNumEntries << suffix);
3761 #endif // HAVE_TPETRA_DEBUG
3762  }
3763 
3764 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated Teuchos::ArrayView overload: get views of one row's global
// column indices and values.
//
// globalRow - global index of the row.
// indices   - output; first set to Teuchos::null, then (for a valid,
//             non-empty row) to the first numEntries entries of the
//             graph's global view of the row.
// values    - output; first set to Teuchos::null, then (for a valid,
//             non-empty row) to the first numEntries entries of
//             getView(rowinfo), reinterpreted as Scalar.
//
// Reports std::runtime_error when the matrix is locally indexed.
3765  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3766  void
3768  getGlobalRowView (GlobalOrdinal globalRow,
3769  Teuchos::ArrayView<const GlobalOrdinal>& indices,
3770  Teuchos::ArrayView<const Scalar>& values) const
3771  {
3772  using Teuchos::ArrayView;
3773  using Teuchos::av_reinterpret_cast;
3774  typedef GlobalOrdinal GO;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3775  const char tfecfFuncName[] = "getGlobalRowView: ";
3776 
3777  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3778  isLocallyIndexed (), std::runtime_error,
3779  "The matrix is locally indexed, so we cannot return a view of the row "
3780  "with global column indices. Use getGlobalRowCopy() instead.");
// Start from empty outputs so an invalid or empty row reports nothing.
3781  indices = Teuchos::null;
3782  values = Teuchos::null;
3783  const RowInfo rowinfo =
3784  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3785  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3786  rowinfo.numEntries > 0) {
3787  ArrayView<const GO> indTmp = staticGraph_->getGlobalView (rowinfo);
3788  ArrayView<const Scalar> valTmp =
3789  av_reinterpret_cast<const Scalar> (this->getView (rowinfo));
// Debug-only guard: the temporaries must be at least numEntries long
// before they are trimmed below.
3790 #ifdef HAVE_TPETRA_DEBUG
3791  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3792  (static_cast<size_t> (indTmp.size ()) < rowinfo.numEntries ||
3793  static_cast<size_t> (valTmp.size ()) < rowinfo.numEntries,
3794  std::logic_error, std::endl << "rowinfo.numEntries not accurate. "
3795  << std::endl << "indTmp.size() = " << indTmp.size ()
3796  << ", valTmp.size() = " << valTmp.size ()
3797  << ", rowinfo.numEntries = " << rowinfo.numEntries << ".");
3798 #endif // HAVE_TPETRA_DEBUG
// Trim both views to the entries actually in use.
3799  indices = indTmp (0, rowinfo.numEntries);
3800  values = valTmp (0, rowinfo.numEntries);
3801  }
3802 
// Debug-only consistency checks on the sizes of the returned views.
3803 #ifdef HAVE_TPETRA_DEBUG
3804  const char suffix[] = ". This should never happen. Please report this "
3805  "bug to the Tpetra developers.";
3806  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3807  (static_cast<size_t> (indices.size ()) !=
3808  static_cast<size_t> (values.size ()), std::logic_error,
3809  "At the end of this method, for global row " << globalRow << ", "
3810  "indices.size() = " << indices.size () << " != values.size () = "
3811  << values.size () << suffix);
3812  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3813  (static_cast<size_t> (indices.size ()) !=
3814  static_cast<size_t> (rowinfo.numEntries), std::logic_error,
3815  "At the end of this method, for global row " << globalRow << ", "
3816  "indices.size() = " << indices.size () << " != rowinfo.numEntries = "
3817  << rowinfo.numEntries << suffix);
3818  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3819  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3820  (rowinfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3821  "of this method, for global row " << globalRow << ", rowinfo.numEntries "
3822  "= " << rowinfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3823  " " << expectedNumEntries << suffix);
3824 #endif // HAVE_TPETRA_DEBUG
3825  }
3826 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3827 
// Multiply the stored matrix values by alpha, in place.
//
// alpha - scaling factor; converted to impl_scalar_type before use.
//
// Reports std::runtime_error when fill is not active.  Does nothing when
// the graph's indices are not allocated, or when the calling process has
// no rows or no entries.
3828  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3829  void
3831  scale (const Scalar& alpha)
3832  {
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3833  const char tfecfFuncName[] = "scale: ";
3834  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3835 
3836  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3837  ! isFillActive (), std::runtime_error,
3838  "Fill must be active before you may call this method. "
3839  "Please call resumeFill() to make fill active.");
3840 
3841  const size_t nlrs = staticGraph_->getNodeNumRows ();
3842  const size_t numEntries = staticGraph_->getNodeNumEntries ();
3843  if (! staticGraph_->indicesAreAllocated () ||
3844  nlrs == 0 || numEntries == 0) {
3845  // do nothing
3846  }
3847  else {
// NOTE(review): this scales valuesPacked_wdv, whereas setAllToScalar
// writes valuesUnpacked_wdv -- confirm that the packed view is the
// intended target while fill is active.
3849  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
// Same view passed as output and input: vals is overwritten with
// theAlpha * vals.
3850  KokkosBlas::scal(vals, theAlpha, vals);
3851 
3852  }
3853  }
3854 
// Overwrite every allocated value slot of the matrix with alpha.
//
// alpha - value to store; converted to impl_scalar_type before use.
//
// Reports std::runtime_error when fill is not active.  Does nothing when
// the graph's indices are not allocated or the calling process has no
// entries.
3855  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3856  void
3858  setAllToScalar (const Scalar& alpha)
3859  {
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3860  const char tfecfFuncName[] = "setAllToScalar: ";
3861  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3862  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3863  ! isFillActive (), std::runtime_error,
3864  "Fill must be active before you may call this method. "
3865  "Please call resumeFill() to make fill active.");
3866 
3867  // replace all values in the matrix
3868  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3869  // however, if there are no valid entries, we can short-circuit
3870  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3871  const size_t numEntries = staticGraph_->getNodeNumEntries();
3872  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3873  // do nothing
3874  }
3875  else {
// The device view is requested with OverwriteAll access and filled with
// theAlpha in a single deep_copy.
3876  Kokkos::deep_copy (valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3877  theAlpha);
3878  }
3879  }
3880 
// Install a complete local CRS structure and its values from device
// views.
//
// rowPointers   - row offsets; passed on to myGraph_->setAllIndices.
// columnIndices - column indices; passed on to myGraph_->setAllIndices.
// values        - matrix values; wrapped and stored as both
//                 valuesPacked_wdv and valuesUnpacked_wdv.
//
// Reports std::invalid_argument when columnIndices and values differ in
// size, std::runtime_error when myGraph_ is null or when
// myGraph_->setAllIndices throws, and std::logic_error when the
// resulting local graph's dimensions do not match the inputs.
// On success, storageStatus_ is set to Details::STORAGE_1D_PACKED and
// checkInternalState() is called.
3881  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3882  void
3884  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3885  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3886  const typename local_matrix_device_type::values_type& values)
3887  {
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3888  const char tfecfFuncName[] = "setAllValues: ";
3889  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3890  (columnIndices.size () != values.size (), std::invalid_argument,
3891  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3892  " = " << values.size () << ".");
3893  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3894  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3895 
// Any std::exception from the graph is re-reported as a
// std::runtime_error that carries the original what() text.
3896  try {
3897  myGraph_->setAllIndices (rowPointers, columnIndices);
3898  }
3899  catch (std::exception &e) {
3900  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3901  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3902  "exception: " << e.what ());
3903  }
3904  // Make sure that myGraph_ now has a local graph. It may not be
3905  // fillComplete yet, so it's important to check. We don't care
3906  // whether setAllIndices() did a shallow copy or a deep copy, so a
3907  // good way to check is to compare dimensions.
3908  auto lclGraph = myGraph_->getLocalGraphDevice ();
3909  const size_t numEnt = lclGraph.entries.extent (0);
3910  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3911  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3912  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3913  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3914  "local graph. Please report this bug to the Tpetra developers.");
3915 
// Packed and unpacked storage are assigned from the same wrapped view.
3916  valuesPacked_wdv = values_wdv_type(values);
3917  valuesUnpacked_wdv = valuesPacked_wdv;
3918 
3919  // FIXME (22 Jun 2016) I would very much like to get rid of
3920  // k_values1D_ at some point. I find it confusing to have all
3921  // these extra references lying around.
3922 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3923 
3924  // Storage MUST be packed, since the interface doesn't give any
3925  // way to indicate any extra space at the end of each row.
3926  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3927 
3928  checkInternalState ();
3929  }
3930 
// Teuchos::ArrayRCP overload of setAllValues.
//
// ptr - row offsets (size_t).
// ind - local column indices.
// val - matrix values.
//
// Each input is copied into a Kokkos view, and the Kokkos-view overload
// of setAllValues is then called with the copies.  Reports
// std::logic_error if the copied offsets view ends up with a different
// extent than the input.
3931  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3932  void
3934  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3935  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3936  const Teuchos::ArrayRCP<Scalar>& val)
3937  {
3938  using Kokkos::Compat::getKokkosViewDeepCopy;
3939  using Teuchos::ArrayRCP;
3940  using Teuchos::av_reinterpret_cast;
3941  typedef device_type DT;
3942  typedef impl_scalar_type IST;
3943  typedef typename local_graph_device_type::row_map_type row_map_type;
3944  //typedef typename row_map_type::non_const_value_type row_offset_type;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3945  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3946 
// NOTE(review): "ptrIn" in the comment below does not match any variable
// here; the copy is stored in ptrNative.
3947  // The row offset type may depend on the execution space. It may
3948  // not necessarily be size_t. If it's not, we need to make a deep
3949  // copy. We need to make a deep copy anyway so that Kokkos can
3950  // own the memory. Regardless, ptrIn gets the copy.
3951  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
// Unmanaged host view wrapping the caller's offsets, used only as the
// source of the copy.
3952  Kokkos::View<const size_t*,
3953  typename row_map_type::array_layout,
3954  Kokkos::HostSpace,
3955  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3956  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3957 
3958  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3959  (ptrNative.extent (0) != ptrSizeT.extent (0),
3960  std::logic_error, "ptrNative.extent(0) = " <<
3961  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3962  << ptrSizeT.extent (0) << ". Please report this bug to the "
3963  "Tpetra developers.");
3964 
// Values are reinterpreted from Scalar to impl_scalar_type before the
// deep copy.
3965  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3966  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3967  this->setAllValues (ptrNative, indIn, valIn);
3968  }
3969 
// Fill `offsets` with the graph's local diagonal offsets, one per local
// row.
//
// offsets - in/out; resized to the local row count if it is shorter,
//           then overwritten with the result of
//           staticGraph_->getLocalDiagOffsets.
//
// Reports std::runtime_error when the matrix has no graph.
3970  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3971  void
3973  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3974  {
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
3975  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3976  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3977  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3978 
3979  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3980  // this method in CrsGraph too, so don't call it (otherwise build
3981  // warnings will show up and annoy users). Instead, copy results
3982  // in and out, if the memory space requires it.
3983 
3984  const size_t lclNumRows = staticGraph_->getNodeNumRows ();
// Only grows the array; an already larger array keeps its size.
3985  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3986  offsets.resize (lclNumRows);
3987  }
3988 
3989  // The input ArrayRCP must always be a host pointer. Thus, if
3990  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3991  // to write to that allocation directly as a Kokkos::View.
3992  typedef typename device_type::memory_space memory_space;
3993  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3994  // It is always syntactically correct to assign a raw host
3995  // pointer to a device View, so this code will compile correctly
3996  // even if this branch never runs.
3997  typedef Kokkos::View<size_t*, device_type,
3998  Kokkos::MemoryUnmanaged> output_type;
3999  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
4000  staticGraph_->getLocalDiagOffsets (offsetsOut);
4001  }
4002  else {
// Memory space is not HostSpace: compute into a temporary device view,
// then deep_copy the result into the caller's host array.
4003  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
4004  staticGraph_->getLocalDiagOffsets (offsetsTmp);
4005  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
4006  Kokkos::MemoryUnmanaged> output_type;
4007  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
4008  Kokkos::deep_copy (offsetsOut, offsetsTmp);
4009  }
4010  }
4011 
// Copy the matrix's local diagonal into a vector (1-argument version).
//
// NOTE(review): the declarator lines carrying the class qualifier, the
// method name and the parameter list are absent from this listing; the
// name is taken from tfecfFuncName and the output parameter (diag) from
// its uses in the body.
//
// Returns early, without touching diag, on processes whose row Map or
// row-Map communicator is null.  Reports std::runtime_error when the
// matrix has no graph or no column Map.
4012  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4013  void
4016  {
4017  using Teuchos::ArrayRCP;
4018  using Teuchos::ArrayView;
4019  using Teuchos::av_reinterpret_cast;
// Picked up by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro.
4020  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
4021  typedef local_ordinal_type LO;
4023 
4024  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4025  staticGraph_.is_null (), std::runtime_error,
4026  "This method requires that the matrix have a graph.");
4027  auto rowMapPtr = this->getRowMap ();
4028  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
4029  // Processes on which the row Map or its communicator is null
4030  // don't participate. Users shouldn't even call this method on
4031  // those processes.
4032  return;
4033  }
4034  auto colMapPtr = this->getColMap ();
4035  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4036  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
4037  "This method requires that the matrix have a column Map.");
4038  const map_type& rowMap = * rowMapPtr;
4039  const map_type& colMap = * colMapPtr;
4040  const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
4041 
4042 #ifdef HAVE_TPETRA_DEBUG
4043  // isCompatible() requires an all-reduce, and thus this check
4044  // should only be done in debug mode.
4045  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4046  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
4047  "The input Vector's Map must be compatible with the CrsMatrix's row "
4048  "Map. You may check this by using Map's isCompatible method: "
4049  "diag.getMap ()->isCompatible (A.getRowMap ());");
4050 #endif // HAVE_TPETRA_DEBUG
4051 
4052  if (this->isFillComplete ()) {
// Fill-complete path: work on the device view of diag and the local
// device matrix.
4053  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
4054  // 1-D subview of the first (and only) column of D_lcl.
4055  const auto D_lcl_1d =
4056  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
4057 
4058  const auto lclRowMap = rowMap.getLocalMap ();
4059  const auto lclColMap = colMap.getLocalMap ();
// The helper's return value is deliberately discarded.
4061  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
4062  lclColMap,
4063  getLocalMatrixDevice ());
4064  }
4065  else {
// Not fill complete: delegate to the helper for that state; its return
// value is also discarded.
4067  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
4068  }
4069  }
4070 
// getLocalDiagCopy (device-View offsets overload): copy the diagonal entries
// into `diag`, locating each row's diagonal through the precomputed
// `offsets` View.
//
// NOTE(review): the first declarator lines (listing lines 4073-4074) are
// missing from this listing; `diag` is presumably the output Vector
// parameter -- confirm against the class declaration.
//
// In debug builds (HAVE_TPETRA_DEBUG) this raises std::runtime_error when
// diag's Map is not compatible with the matrix's row Map.
4071  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4072  void
4075  const Kokkos::View<const size_t*, device_type,
4076  Kokkos::MemoryUnmanaged>& offsets) const
4077  {
4078  typedef LocalOrdinal LO;
4079 
4080 #ifdef HAVE_TPETRA_DEBUG
4081  const char tfecfFuncName[] = "getLocalDiagCopy: ";
4082  const map_type& rowMap = * (this->getRowMap ());
4083  // isCompatible() requires an all-reduce, and thus this check
4084  // should only be done in debug mode.
4085  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4086  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
4087  "The input Vector's Map must be compatible with (in the sense of Map::"
4088  "isCompatible) the CrsMatrix's row Map.");
4089 #endif // HAVE_TPETRA_DEBUG
4090 
4091  // For now, we fill the Vector on the host and sync to device.
4092  // Later, we may write a parallel kernel that works entirely on
4093  // device.
4094  //
4095  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
4096  // we write a device kernel, it will not need to assume UVM.
      //
      // NOTE(review): the statements below take diag's *device* view and call
      // KokkosSparse::getDiagCopy on the local device matrix, so the
      // "fill on host / assumes UVM" remarks above look stale -- confirm
      // and refresh.
4097 
4098  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
4099  const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
4100  // Get 1-D subview of the first (and only) column of D_lcl.
4101  auto D_lcl_1d =
4102  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
4103 
4104  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
4105  getLocalMatrixDevice ());
4106  }
4107 
// getLocalDiagCopy (Teuchos::ArrayView offsets overload): copy the diagonal
// entries into `diag` on the host, using precomputed per-row `offsets`.
//
// NOTE(review): the first declarator lines (listing lines 4110-4111) are
// missing from this listing; `diag` is presumably the output Vector
// parameter -- confirm against the class declaration.
//
// For each local row the result is zero when offsets[row] equals the
// "invalid" size_t sentinel, otherwise the packed value located at
// rowPtrsPacked(row) + offsets[row].
//
// In debug builds (HAVE_TPETRA_DEBUG) this raises std::runtime_error when
// diag's Map is not compatible with the matrix's row Map.
4108  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4109  void
4112  const Teuchos::ArrayView<const size_t>& offsets) const
4113  {
4114  using LO = LocalOrdinal;
4115  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
4116  using IST = impl_scalar_type;
4117 
4118 #ifdef HAVE_TPETRA_DEBUG
4119  const char tfecfFuncName[] = "getLocalDiagCopy: ";
4120  const map_type& rowMap = * (this->getRowMap ());
4121  // isCompatible() requires an all-reduce, and thus this check
4122  // should only be done in debug mode.
4123  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4124  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
4125  "The input Vector's Map must be compatible with (in the sense of Map::"
4126  "isCompatible) the CrsMatrix's row Map.");
4127 #endif // HAVE_TPETRA_DEBUG
4128 
4129  // See #1510. In case diag has already been marked modified on
4130  // device, we need to clear that flag, since the code below works
4131  // on host.
      // (The call that used to do this is commented out on the next line.)
4132  //diag.clear_sync_state ();
4133 
4134  // For now, we fill the Vector on the host and sync to device.
4135  // Later, we may write a parallel kernel that works entirely on
4136  // device.
4137  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
4138  // 1-D subview of the first (and only) column of lclVecHost.
4139  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
4140 
      // Wrap the caller's offsets array in an unmanaged host View; no copy
      // is made, the View aliases offsets.getRawPtr ().
4141  using host_offsets_view_type =
4142  Kokkos::View<const size_t*, Kokkos::HostSpace,
4143  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
4144  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
4145  // Find the diagonal entries and put them in lclVecHost1d.
4146  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
4147  const LO myNumRows = static_cast<LO> (this->getNodeNumRows ());
4148  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
4149 
4150  auto rowPtrsPackedHost = staticGraph_->rowPtrsPacked_host_;
4151  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
      // The loop indexes h_offsets for every row in [0, myNumRows); no size
      // check on offsets is made here.
4152  Kokkos::parallel_for
4153  ("Tpetra::CrsMatrix::getLocalDiagCopy",
4154  range_type (0, myNumRows),
4155  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
4156  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
4157  if (h_offsets[lclRow] != INV) {
4158  auto curRowOffset = rowPtrsPackedHost (lclRow);
4159  lclVecHost1d(lclRow) =
4160  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
4161  }
4162  });
4163  //diag.sync_device ();
4164  }
4165 
4166 
// leftScale: scale the local matrix using the entries of the vector `x`.
//
// NOTE(review): the declarator lines (listing lines 4169-4170) and the
// vec_type alias (listing line 4179) are missing from this listing; the
// method name is taken from tfecfFuncName below, and `x` is presumably a
// const Vector parameter -- confirm against the class declaration.
//
// x's Map must be the same as either the range Map or the row Map of the
// matrix; otherwise std::invalid_argument is raised. The matrix must be
// fill complete; otherwise std::runtime_error is raised.
4167  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4168  void
4171  {
4172  using ::Tpetra::Details::ProfilingRegion;
4173  using Teuchos::ArrayRCP;
4174  using Teuchos::ArrayView;
4175  using Teuchos::null;
4176  using Teuchos::RCP;
4177  using Teuchos::rcp;
4178  using Teuchos::rcpFromRef;
4180  const char tfecfFuncName[] = "leftScale: ";
4181 
4182  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
4183 
      // xp ends up pointing either at x itself (non-owning, via rcpFromRef)
      // or at a temporary row-Map Vector filled from x.
4184  RCP<const vec_type> xp;
4185  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
4186  // Take from Epetra: If we have a non-trivial exporter, we must
4187  // import elements that are permuted or are on other processors.
4188  auto exporter = this->getCrsGraphRef ().getExporter ();
4189  if (exporter.get () != nullptr) {
4190  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
4191  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
4192  xp = tempVec;
4193  }
4194  else {
4195  xp = rcpFromRef (x);
4196  }
4197  }
4198  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
4199  xp = rcpFromRef (x);
4200  }
4201  else {
4202  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4203  (true, std::invalid_argument, "x's Map must be the same as "
4204  "either the row Map or the range Map of the CrsMatrix.");
4205  }
4206 
4207  if (this->isFillComplete()) {
      // Only the first column of xp's device view is used.
      // NOTE(review): the meaning of the two trailing `false` arguments is
      // defined by leftScaleLocalCrsMatrix, which is not visible here.
4208  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
4209  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
4211  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
4212  x_lcl_1d, false, false);
4213  }
4214  else {
4215  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
4216  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4217  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
4218  " fillComplete");
4219  }
4220  }
4221 
// rightScale: scale the local matrix using the entries of the vector `x`.
//
// NOTE(review): the declarator lines (listing lines 4224-4225) and the
// vec_type alias (listing line 4234) are missing from this listing; the
// method name is taken from tfecfFuncName below, and `x` is presumably a
// const Vector parameter -- confirm against the class declaration.
//
// x's Map must be the same as either the domain Map or the column Map of
// the matrix, and the matrix must be fill complete. Both violations raise
// std::runtime_error (note that leftScale reports a bad Map with
// std::invalid_argument instead).
4222  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4223  void
4226  {
4227  using ::Tpetra::Details::ProfilingRegion;
4228  using Teuchos::ArrayRCP;
4229  using Teuchos::ArrayView;
4230  using Teuchos::null;
4231  using Teuchos::RCP;
4232  using Teuchos::rcp;
4233  using Teuchos::rcpFromRef;
4235  const char tfecfFuncName[] = "rightScale: ";
4236 
4237  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
4238 
      // xp ends up pointing either at x itself (non-owning, via rcpFromRef)
      // or at a temporary column-Map Vector filled from x.
4239  RCP<const vec_type> xp;
4240  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
4241  // Take from Epetra: If we have a non-trivial importer, we must
4242  // import elements that are permuted or are on other processors.
4243  auto importer = this->getCrsGraphRef ().getImporter ();
4244  if (importer.get () != nullptr) {
4245  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
4246  tempVec->doImport (x, *importer, REPLACE);
4247  xp = tempVec;
4248  }
4249  else {
4250  xp = rcpFromRef (x);
4251  }
4252  }
4253  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
4254  xp = rcpFromRef (x);
4255  } else {
4256  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4257  (true, std::runtime_error, "x's Map must be the same as "
4258  "either the domain Map or the column Map of the CrsMatrix.");
4259  }
4260 
4261  if (this->isFillComplete()) {
      // Only the first column of xp's device view is used.
      // NOTE(review): the meaning of the two trailing `false` arguments is
      // defined by rightScaleLocalCrsMatrix, which is not visible here.
4262  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
4263  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
4265  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
4266  x_lcl_1d, false, false);
4267  }
4268  else {
4269  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
4270  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4271  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
4272  " fillComplete");
4273  }
4274  }
4275 
// getFrobeniusNorm: return the Frobenius norm of the matrix, i.e. the square
// root of the sum over all processes of |a_ij|^2 for every stored entry.
//
// NOTE(review): the return-type and class-qualifier lines (listing lines
// 4277-4278) are missing from this listing; the value returned is of type
// mag_type.
//
// The cached value frobNorm_ is used when it is not the sentinel -1.
// Otherwise the local sum of squares is accumulated on the host, summed
// across the communicator with reduceAll (so every process must take this
// path together), and the square root is taken. The result is written back
// to frobNorm_ only when the matrix is fill complete.
4276  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4279  getFrobeniusNorm () const
4280  {
4281  using Teuchos::ArrayView;
4282  using Teuchos::outArg;
4283  using Teuchos::REDUCE_SUM;
4284  using Teuchos::reduceAll;
4285 
4286  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
4287  // local part of this computation. It could make sense to put
4288  // this operation in the Kokkos::CrsMatrix.
4289 
4290  // check the cache first
4291  mag_type frobNorm = frobNorm_;
4292  if (frobNorm == -STM::one ()) {
4293  mag_type mySum = STM::zero ();
4294  if (getNodeNumEntries() > 0) {
4295  if (isStorageOptimized ()) {
4296  // "Optimized" storage is packed storage. That means we can
4297  // iterate in one pass through the 1-D values array.
4298  const size_t numEntries = getNodeNumEntries ();
4299  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
4300  for (size_t k = 0; k < numEntries; ++k) {
4301  auto val = values[k];
4302  // Note (etp 06 Jan 2015) We need abs() here for composite types
4303  // (in general, if mag_type is on the left-hand-side, we need
4304  // abs() on the right-hand-side)
4305  const mag_type val_abs = STS::abs (val);
4306  mySum += val_abs * val_abs;
4307  }
4308  }
4309  else {
4310  const LocalOrdinal numRows =
4311  static_cast<LocalOrdinal> (this->getNodeNumRows ());
4312  for (LocalOrdinal r = 0; r < numRows; ++r) {
      // Query the row through staticGraph_ rather than myGraph_: myGraph_
      // is null when the matrix was built on a const graph, and this method
      // only reads the graph.
4313  const RowInfo rowInfo = staticGraph_->getRowInfo (r);
4314  const size_t numEntries = rowInfo.numEntries;
4315  auto A_r = this->getValuesViewHost(rowInfo);
4316  for (size_t k = 0; k < numEntries; ++k) {
4317  const impl_scalar_type val = A_r[k];
4318  const mag_type val_abs = STS::abs (val);
4319  mySum += val_abs * val_abs;
4320  }
4321  }
4322  }
4323  }
4324  mag_type totalSum = STM::zero ();
4325  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
4326  mySum, outArg (totalSum));
4327  frobNorm = STM::sqrt (totalSum);
4328  }
4329  if (isFillComplete ()) {
4330  // Only cache the result if the matrix is fill complete.
4331  // Otherwise, the values might still change. resumeFill clears
4332  // the cache.
4333  frobNorm_ = frobNorm;
4334  }
4335  return frobNorm;
4336  }
4337 
// replaceColMap: replace the matrix's column Map by forwarding to the owned
// graph's replaceColMap.
//
// \param newColMap  The new column Map, passed through unchanged.
//
// Raises std::runtime_error when myGraph_ is null, i.e. when the matrix
// has a const graph that it may not modify.
4338  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4339  void
4341  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
4342  {
4343  const char tfecfFuncName[] = "replaceColMap: ";
4344  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
4345  // Then replacing the column Map might mean that we need to
4346  // reindex the column indices.
4347  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4348  myGraph_.is_null (), std::runtime_error,
4349  "This method does not work if the matrix has a const graph. The whole "
4350  "idea of a const graph is that you are not allowed to change it, but "
4351  "this method necessarily must modify the graph, since the graph owns "
4352  "the matrix's column Map.");
4353  myGraph_->replaceColMap (newColMap);
4354  }
4355 
// reindexColumns: reindex the graph's column indices against a new column
// Map and, if requested, sort each row's indices together with its values.
//
// \param graph        Graph to reindex; when null, the matrix's own
//                     myGraph_ is used instead.
// \param newColMap    New column Map, forwarded to the graph.
// \param newImport    New Import object, forwarded to the graph.
// \param sortEachRow  Whether to sort each row's indices and values jointly
//                     afterwards.
//
// Raises std::invalid_argument when both `graph` and myGraph_ are null.
4356  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4357  void
4359  reindexColumns (crs_graph_type* const graph,
4360  const Teuchos::RCP<const map_type>& newColMap,
4361  const Teuchos::RCP<const import_type>& newImport,
4362  const bool sortEachRow)
4363  {
4364  const char tfecfFuncName[] = "reindexColumns: ";
4365  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4366  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
4367  "The input graph is null, but the matrix does not own its graph.");
4368 
4369  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
4370  const bool sortGraph = false; // we'll sort graph & matrix together below
4371 
4372  theGraph.reindexColumns (newColMap, newImport, sortGraph);
4373 
      // Sorting happens only for a locally indexed graph that does not
      // already report itself as sorted.
4374  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
4375  const LocalOrdinal lclNumRows =
4376  static_cast<LocalOrdinal> (theGraph.getNodeNumRows ());
4377 
4378  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
4379 
4380  const RowInfo rowInfo = theGraph.getRowInfo (row);
4381  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
4382  auto vals = this->getValuesViewHostNonConst (rowInfo);
4383 
      // Sort the first rowInfo.numEntries column indices of this row and
      // apply the same permutation to the row's values.
4384  sort2 (lclColInds.data (),
4385  lclColInds.data () + rowInfo.numEntries,
4386  vals.data ());
4387  }
      // Record on the graph that its indices are now sorted.
4388  theGraph.indicesAreSorted_ = true;
4389  }
4390  }
4391 
// replaceDomainMap: replace the matrix's domain Map by forwarding to the
// owned graph's replaceDomainMap.
//
// \param newDomainMap  The new domain Map, passed through unchanged.
//
// Raises std::runtime_error when myGraph_ is null, i.e. when the matrix
// has a const graph that it may not modify.
4392  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4393  void
4395  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
4396  {
4397  const char tfecfFuncName[] = "replaceDomainMap: ";
4398  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4399  myGraph_.is_null (), std::runtime_error,
4400  "This method does not work if the matrix has a const graph. The whole "
4401  "idea of a const graph is that you are not allowed to change it, but this"
4402  " method necessarily must modify the graph, since the graph owns the "
4403  "matrix's domain Map and Import objects.");
4404  myGraph_->replaceDomainMap (newDomainMap);
4405  }
4406 
// replaceDomainMapAndImporter: replace the matrix's domain Map and Import
// object by forwarding to the owned graph's method of the same name.
//
// \param newDomainMap  The new domain Map, passed through unchanged.
// \param newImporter   The new Import object, passed through by non-const
//                      reference.
//
// Raises std::runtime_error when myGraph_ is null, i.e. when the matrix
// has a const graph that it may not modify.
4407  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4408  void
4410  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
4411  Teuchos::RCP<const import_type>& newImporter)
4412  {
4413  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
4414  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4415  myGraph_.is_null (), std::runtime_error,
4416  "This method does not work if the matrix has a const graph. The whole "
4417  "idea of a const graph is that you are not allowed to change it, but this"
4418  " method necessarily must modify the graph, since the graph owns the "
4419  "matrix's domain Map and Import objects.");
4420  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
4421  }
4422 
// replaceRangeMap: replace the matrix's range Map by forwarding to the
// owned graph's replaceRangeMap.
//
// \param newRangeMap  The new range Map, passed through unchanged.
//
// Raises std::runtime_error when myGraph_ is null, i.e. when the matrix
// has a const graph that it may not modify. The message names the range
// Map and Export objects; it previously repeated the domain Map / Import
// wording of replaceDomainMap.
4423  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4424  void
4426  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
4427  {
4428  const char tfecfFuncName[] = "replaceRangeMap: ";
4429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4430  myGraph_.is_null (), std::runtime_error,
4431  "This method does not work if the matrix has a const graph. The whole "
4432  "idea of a const graph is that you are not allowed to change it, but this"
4433  " method necessarily must modify the graph, since the graph owns the "
4434  "matrix's range Map and Export objects.");
4435  myGraph_->replaceRangeMap (newRangeMap);
4436  }
4437 
// replaceRangeMapAndExporter: replace the matrix's range Map and Export
// object by forwarding to the owned graph's method of the same name.
//
// \param newRangeMap  The new range Map, passed through unchanged.
// \param newExporter  The new Export object, passed through by non-const
//                     reference.
//
// Raises std::runtime_error when myGraph_ is null, i.e. when the matrix
// has a const graph that it may not modify. The message names the range
// Map and Export objects; it previously repeated the domain Map / Import
// wording of replaceDomainMapAndImporter.
4438  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4439  void
4441  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
4442  Teuchos::RCP<const export_type>& newExporter)
4443  {
4444  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
4445  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4446  myGraph_.is_null (), std::runtime_error,
4447  "This method does not work if the matrix has a const graph. The whole "
4448  "idea of a const graph is that you are not allowed to change it, but this"
4449  " method necessarily must modify the graph, since the graph owns the "
4450  "matrix's range Map and Export objects.");
4451  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
4452  }
4453 
// insertNonownedGlobalValues: append column indices and values for global
// row `globalRow` to the nonlocals_ container, for later processing by
// globalAssemble.
//
// \param globalRow  Global row index used as the key into nonlocals_.
// \param indices    Global column indices to append.
// \param values     Values to append; values[k] is read for every
//                   k < indices.size (), and no length check against
//                   `indices` is made here.
//
// Entries are appended as given; duplicates are not merged here.
4454  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4455  void
4457  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4458  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4459  const Teuchos::ArrayView<const Scalar>& values)
4460  {
4461  using Teuchos::Array;
4462  typedef GlobalOrdinal GO;
4463  typedef typename Array<GO>::size_type size_type;
4464 
4465  const size_type numToInsert = indices.size ();
4466  // Add the new data to the list of nonlocals.
4467  // This creates the arrays if they don't exist yet.
4468  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4469  Array<GO>& curRowInds = curRow.first;
4470  Array<Scalar>& curRowVals = curRow.second;
      // Reserve once for the combined size so the loop below does not
      // reallocate repeatedly.
4471  const size_type newCapacity = curRowInds.size () + numToInsert;
4472  curRowInds.reserve (newCapacity);
4473  curRowVals.reserve (newCapacity);
4474  for (size_type k = 0; k < numToInsert; ++k) {
4475  curRowInds.push_back (indices[k]);
4476  curRowVals.push_back (values[k]);
4477  }
4478  }
4479 
// globalAssemble: communicate the entries stored in nonlocals_ (rows
// inserted on this process that it does not own) into the matrix.
//
// NOTE(review): listing lines 4486 and 4495 are missing from this listing;
// ProfilingRegion is used below although no using-declaration for it is
// visible here.
//
// Outline, as implemented below:
//   * raise std::runtime_error if fill is not active;
//   * all-reduce (max) to learn whether any process has nonlocal rows, and
//     return early if none does;
//   * sort and merge each nonlocal row's entries, build a row Map over the
//     nonlocal rows and a temporary matrix holding them;
//   * Export (ADD) the temporary matrix into *this, going through an
//     intermediate one-to-one matrix when the row Map is not one to one;
//   * clear nonlocals_;
//   * all-reduce (min) of the local "Export is complete" flags and raise
//     std::runtime_error if any process reported an incomplete Export.
//
// The method performs collective operations on the matrix's communicator,
// so all of its processes must call it together.
4480  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4481  void
4483  globalAssemble ()
4484  {
4485  using Details::Behavior;
4487  using Teuchos::Comm;
4488  using Teuchos::outArg;
4489  using Teuchos::RCP;
4490  using Teuchos::rcp;
4491  using Teuchos::REDUCE_MAX;
4492  using Teuchos::REDUCE_MIN;
4493  using Teuchos::reduceAll;
4494  using std::endl;
4496  //typedef LocalOrdinal LO;
4497  typedef GlobalOrdinal GO;
4498  typedef typename Teuchos::Array<GO>::size_type size_type;
4499  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4500  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4501 
      // Verbose output is assembled in a local ostringstream and written to
      // std::cerr in a single insertion.
4502  const bool verbose = Behavior::verbose("CrsMatrix");
4503  std::unique_ptr<std::string> prefix;
4504  if (verbose) {
4505  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4506  std::ostringstream os;
4507  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4508  << endl;
4509  std::cerr << os.str();
4510  }
4511  RCP<const Comm<int> > comm = getComm ();
4512 
4513  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4514  (! isFillActive (), std::runtime_error, "Fill must be active before "
4515  "you may call this method.");
4516 
4517  const size_t myNumNonlocalRows = nonlocals_.size ();
4518 
4519  // If no processes have nonlocal rows, then we don't have to do
4520  // anything. Checking this is probably cheaper than constructing
4521  // the Map of nonlocal rows (see below) and noticing that it has
4522  // zero global entries.
4523  {
4524  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4525  int someoneHasNonlocalRows = 0;
4526  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4527  outArg (someoneHasNonlocalRows));
4528  if (someoneHasNonlocalRows == 0) {
4529  return; // no process has nonlocal rows, so nothing to do
4530  }
4531  }
4532 
4533  // 1. Create a list of the "nonlocal" rows on each process. This
4534  // requires iterating over nonlocals_, so while we do this,
4535  // deduplicate the entries and get a count for each nonlocal
4536  // row on this process.
4537  // 2. Construct a new row Map corresponding to those rows. This
4538  // Map is likely overlapping. We know that the Map is not
4539  // empty on all processes, because the above all-reduce and
4540  // return exclude that case.
4541 
4542  RCP<const map_type> nonlocalRowMap;
4543  // Keep this for CrsGraph's constructor, so we can use StaticProfile.
4544  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4545  {
4546  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4547  size_type curPos = 0;
4548  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4549  ++mapIter, ++curPos) {
4550  myNonlocalGblRows[curPos] = mapIter->first;
4551  // Get the values and column indices by reference, since we
4552  // intend to change them in place (that's what "erase" does).
4553  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4554  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4555 
4556  // Sort both arrays jointly, using the column indices as keys,
4557  // then merge them jointly. "Merge" here adds values
4558  // corresponding to the same column indices. The first 2 args
4559  // of merge2 are output arguments that work just like the
4560  // return value of std::unique.
4561  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4562  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4563  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4564  merge2 (gblCols_newEnd, vals_newEnd,
4565  gblCols.begin (), gblCols.end (),
4566  vals.begin (), vals.end ());
4567  gblCols.erase (gblCols_newEnd, gblCols.end ());
4568  vals.erase (vals_newEnd, vals.end ());
4569  numEntPerNonlocalRow[curPos] = gblCols.size ();
4570  }
4571 
4572  // Currently, Map requires that its indexBase be the global min
4573  // of all its global indices. Map won't compute this for us, so
4574  // we must do it. If our process has no nonlocal rows, set the
4575  // "min" to the max possible GO value. This ensures that if
4576  // some process has at least one nonlocal row, then it will pick
4577  // that up as the min. We know that at least one process has a
4578  // nonlocal row, since the all-reduce and return at the top of
4579  // this method excluded that case.
4580  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4581  {
4582  auto iter = std::min_element (myNonlocalGblRows.begin (),
4583  myNonlocalGblRows.end ());
4584  if (iter != myNonlocalGblRows.end ()) {
4585  myMinNonlocalGblRow = *iter;
4586  }
4587  }
4588  GO gblMinNonlocalGblRow = 0;
4589  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4590  outArg (gblMinNonlocalGblRow));
4591  const GO indexBase = gblMinNonlocalGblRow;
4592  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4593  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4594  }
4595 
4596  // 3. Use the values and column indices for each nonlocal row, as
4597  // stored in nonlocals_, to construct a CrsMatrix corresponding
4598  // to nonlocal rows. We may use StaticProfile, since we have
4599  // exact counts of the number of entries in each nonlocal row.
4600 
4601  if (verbose) {
4602  std::ostringstream os;
4603  os << *prefix << "Create nonlocal matrix" << endl;
4604  std::cerr << os.str();
4605  }
4606  RCP<crs_matrix_type> nonlocalMatrix =
4607  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow (),
4608  StaticProfile));
4609  {
      // curPos is only incremented in this loop; its one reader is the
      // commented-out numEnt line below.
4610  size_type curPos = 0;
4611  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4612  ++mapIter, ++curPos) {
4613  const GO gblRow = mapIter->first;
4614  // Get values & column indices by ref, just to avoid copy.
4615  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4616  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4617  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4618  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4619  }
4620  }
4621  // There's no need to fill-complete the nonlocals matrix.
4622  // We just use it as a temporary container for the Export.
4623 
4624  // 4. If the original row Map is one to one, then we can Export
4625  // directly from nonlocalMatrix into this. Otherwise, we have
4626  // to create a temporary matrix with a one-to-one row Map,
4627  // Export into that, then Import from the temporary matrix into
4628  // *this.
4629 
4630  auto origRowMap = this->getRowMap ();
4631  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4632 
4633  int isLocallyComplete = 1; // true by default
4634 
4635  if (origRowMapIsOneToOne) {
4636  if (verbose) {
4637  std::ostringstream os;
4638  os << *prefix << "Original row Map is 1-to-1" << endl;
4639  std::cerr << os.str();
4640  }
4641  export_type exportToOrig (nonlocalRowMap, origRowMap);
4642  if (! exportToOrig.isLocallyComplete ()) {
4643  isLocallyComplete = 0;
4644  }
4645  if (verbose) {
4646  std::ostringstream os;
4647  os << *prefix << "doExport from nonlocalMatrix" << endl;
4648  std::cerr << os.str();
4649  }
4650  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4651  // We're done at this point!
4652  }
4653  else {
4654  if (verbose) {
4655  std::ostringstream os;
4656  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4657  std::cerr << os.str();
4658  }
4659  // If you ask a Map whether it is one to one, it does some
4660  // communication and stashes intermediate results for later use
4661  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4662  // much more than the original cost of calling isOneToOne.
4663  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4664  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4665  if (! exportToOneToOne.isLocallyComplete ()) {
4666  isLocallyComplete = 0;
4667  }
4668 
4669  // Create a temporary matrix with the one-to-one row Map.
4670  //
4671  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4672  // each row, to avoid reallocation during the Export operation.
4673  if (verbose) {
4674  std::ostringstream os;
4675  os << *prefix << "Create & doExport into 1-to-1 matrix"
4676  << endl;
4677  std::cerr << os.str();
4678  }
4679  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4680  // Export from matrix of nonlocals into the temp one-to-one matrix.
4681  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4682  Tpetra::ADD);
4683 
4684  // We don't need the matrix of nonlocals anymore, so get rid of
4685  // it, to keep the memory high-water mark down.
4686  if (verbose) {
4687  std::ostringstream os;
4688  os << *prefix << "Free nonlocalMatrix" << endl;
4689  std::cerr << os.str();
4690  }
4691  nonlocalMatrix = Teuchos::null;
4692 
4693  // Import from the one-to-one matrix to the original matrix.
4694  if (verbose) {
4695  std::ostringstream os;
4696  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4697  std::cerr << os.str();
4698  }
4699  import_type importToOrig (oneToOneRowMap, origRowMap);
4700  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4701  }
4702 
4703  // It's safe now to clear out nonlocals_, since we've already
4704  // committed side effects to *this. The standard idiom for
4705  // clearing a Container like std::map, is to swap it with an empty
4706  // Container and let the swapped Container fall out of scope.
4707  if (verbose) {
4708  std::ostringstream os;
4709  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4710  std::cerr << os.str();
4711  }
4712  decltype (nonlocals_) newNonlocals;
4713  std::swap (nonlocals_, newNonlocals);
4714 
4715  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4716  // don't like throwing an exception here. A local return value
4717  // would likely be more useful to users. However, if users find
4718  // themselves exercising nonlocal inserts often, then they are
4719  // probably novice users who need the help. See GitHub Issues
4720  // #603 and #601 (esp. the latter) for discussion.
4721 
      // Note that this check uses the plain exception macro rather than the
      // _CLASS_FUNC variant used elsewhere in this method.
4722  int isGloballyComplete = 0; // output argument of reduceAll
4723  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4724  outArg (isGloballyComplete));
4725  TEUCHOS_TEST_FOR_EXCEPTION
4726  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4727  "you called insertGlobalValues with a global row index which is not in "
4728  "the matrix's row Map on any process in its communicator.");
4729  }
4730 
// resumeFill: put the matrix back into a state where its entries may be
// modified.
//
// \param params  Parameter list, forwarded to the graph's resumeFill when
//                the matrix does not have a static graph.
//
// Clears the matrix's global constants and sets fillComplete_ to false.
4731  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4732  void
4734  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4735  {
4736  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4737  myGraph_->resumeFill (params);
4738  }
4739  clearGlobalConstants ();
4740  fillComplete_ = false;
4741  }
4742 
// Intentionally empty method.
//
// NOTE(review): the declarator lines (listing lines 4745-4746) are missing
// from this listing; judging by the comment in the body this is presumably
// computeGlobalConstants () -- confirm against the class declaration.
4743  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4744  void
4747  {
4748  // This method doesn't do anything. The analogous method in
4749  // CrsGraph does actually compute something.
4750  //
4751  // Oddly enough, clearGlobalConstants() clears frobNorm_ (by
4752  // setting it to -1), but computeGlobalConstants() does _not_
4753  // compute the Frobenius norm; this is done on demand in
4754  // getFrobeniusNorm(), and the result is cached there.
4755  }
4756 
// haveGlobalConstants: report whether global constants are available.
// The matrix keeps no flag of its own; the answer is whatever the graph
// returned by getCrsGraphRef () reports.
4757  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4758  bool
4760  haveGlobalConstants() const {
4761  return getCrsGraphRef ().haveGlobalConstants ();
4762  }
4763 
// Invalidate the cached Frobenius norm by setting frobNorm_ to -1.
//
// NOTE(review): the declarator lines (listing lines 4766-4767), including
// the opening brace, are missing from this listing; judging by the comment
// in the body this is presumably clearGlobalConstants () -- confirm against
// the class declaration.
4764  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4765  void
4768  // We use -1 to indicate that the Frobenius norm needs to be
4769  // recomputed, since the values might change between now and the
4770  // next fillComplete call.
4771  //
4772  // Oddly enough, clearGlobalConstants() clears frobNorm_, but
4773  // computeGlobalConstants() does _not_ compute the Frobenius norm;
4774  // this is done on demand in getFrobeniusNorm(), and the result is
4775  // cached there.
4776  frobNorm_ = -STM::one ();
4777  }
4778 
// fillComplete (params): convenience overload that chooses the domain and
// range Maps and then calls the three-argument fillComplete.
//
// \param params  Parameter list, forwarded unchanged.
//
// If the matrix has a static graph that is already fill complete, the
// graph's domain and range Maps are used. Otherwise the graph's row Map
// serves as both the domain and the range Map.
//
// Raises std::logic_error if getCrsGraph () returns null.
4779  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4780  void
4782  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4783  {
4784  const char tfecfFuncName[] = "fillComplete(params): ";
4785 
4786  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4787  (this->getCrsGraph ().is_null (), std::logic_error,
4788  "getCrsGraph() returns null. This should not happen at this point. "
4789  "Please report this bug to the Tpetra developers.");
4790 
4791  const crs_graph_type& graph = this->getCrsGraphRef ();
4792  if (this->isStaticGraph () && graph.isFillComplete ()) {
4793  // If this matrix's graph is fill complete and the user did not
4794  // supply a domain or range Map, use the graph's domain and
4795  // range Maps.
4796  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4797  }
4798  else { // assume that user's row Map is the domain and range Map
4799  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4800  Teuchos::RCP<const map_type> domainMap = rangeMap;
4801  this->fillComplete (domainMap, rangeMap, params);
4802  }
4803  }
4804 
4805  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4806  void
4808  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4809  const Teuchos::RCP<const map_type>& rangeMap,
4810  const Teuchos::RCP<Teuchos::ParameterList>& params)
4811  {
4812  using Details::Behavior;
4814  using Teuchos::ArrayRCP;
4815  using Teuchos::RCP;
4816  using Teuchos::rcp;
4817  using std::endl;
4818  const char tfecfFuncName[] = "fillComplete: ";
4819  ProfilingRegion regionFillComplete
4820  ("Tpetra::CrsMatrix::fillComplete");
4821  const bool verbose = Behavior::verbose("CrsMatrix");
4822  std::unique_ptr<std::string> prefix;
4823  if (verbose) {
4824  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4825  std::ostringstream os;
4826  os << *prefix << endl;
4827  std::cerr << os.str ();
4828  }
4829  Details::ProfilingRegion region(
4830  "Tpetra::CrsMatrix::fillCompete",
4831  "fillCompete");
4832 
4833  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4834  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4835  "Matrix fill state must be active (isFillActive() "
4836  "must be true) before you may call fillComplete().");
4837  const int numProcs = this->getComm ()->getSize ();
4838 
4839  //
4840  // Read parameters from the input ParameterList.
4841  //
4842  {
4843  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4844 
4845  // If true, the caller promises that no process did nonlocal
4846  // changes since the last call to fillComplete.
4847  bool assertNoNonlocalInserts = false;
4848  // If true, makeColMap sorts remote GIDs (within each remote
4849  // process' group).
4850  bool sortGhosts = true;
4851 
4852  if (! params.is_null ()) {
4853  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4854  assertNoNonlocalInserts);
4855  if (params->isParameter ("sort column map ghost gids")) {
4856  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4857  }
4858  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4859  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4860  }
4861  }
4862  // We also don't need to do global assembly if there is only one
4863  // process in the communicator.
4864  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4865  // This parameter only matters if this matrix owns its graph.
4866  if (! this->myGraph_.is_null ()) {
4867  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4868  }
4869 
4870  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4871  if (this->hasColMap ()) { // use local indices
4872  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4873  }
4874  else { // no column Map, so use global indices
4875  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4876  }
4877  }
4878  // Global assemble, if we need to. This call only costs a single
4879  // all-reduce if we didn't need global assembly after all.
4880  if (needGlobalAssemble) {
4881  this->globalAssemble ();
4882  }
4883  else {
4884  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4885  (numProcs == 1 && nonlocals_.size() > 0,
4886  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4887  "An invalid entry (i.e., with row index not in the row Map) must have "
4888  "been submitted to the CrsMatrix.");
4889  }
4890  }
4891  if (this->isStaticGraph ()) {
4892  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4893  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4894  // checks below only in debug mode. It would be nicer to do a
4895  // local check, then propagate the error state in a deferred
4896  // way, whenever communication happens. That would reduce the
4897  // cost of checking, to the point where it may make sense to
4898  // enable it even in release mode.
4899 #ifdef HAVE_TPETRA_DEBUG
4900  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4901  // input Maps incurs a penalty of two all-reduces for the
4902  // otherwise optimal const graph case.
4903  //
4904  // We could turn these (max) 2 all-reduces into (max) 1, by
4905  // fusing them. We could do this by adding a "locallySameAs"
4906  // method to Map, which would return one of four states:
4907  //
4908  // a. Certainly globally the same
4909  // b. Certainly globally not the same
4910  // c. Locally the same
4911  // d. Locally not the same
4912  //
4913  // The first two states don't require further communication.
4914  // The latter two states require an all-reduce to communicate
4915  // globally, but we only need one all-reduce, since we only need
4916  // to check whether at least one of the Maps is wrong.
4917  const bool domainMapsMatch =
4918  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4919  const bool rangeMapsMatch =
4920  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4921 
4922  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4923  (! domainMapsMatch, std::runtime_error,
4924  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4925  "The graph cannot be changed because it was given to the CrsMatrix "
4926  "constructor as const. You can fix this by passing in the graph's "
4927  "domain Map and range Map to the matrix's fillComplete call.");
4928 
4929  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4930  (! rangeMapsMatch, std::runtime_error,
4931  "The CrsMatrix's range Map does not match the graph's range Map. "
4932  "The graph cannot be changed because it was given to the CrsMatrix "
4933  "constructor as const. You can fix this by passing in the graph's "
4934  "domain Map and range Map to the matrix's fillComplete call.");
4935 #endif // HAVE_TPETRA_DEBUG
4936 
4937  // The matrix does _not_ own the graph, and the graph's
4938  // structure is already fixed, so just fill the local matrix.
4939  this->fillLocalMatrix (params);
4940  }
4941  else {
4942  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4943  // Set the graph's domain and range Maps. This will clear the
4944  // Import if the domain Map has changed (is a different
4945  // pointer), and the Export if the range Map has changed (is a
4946  // different pointer).
4947  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4948 
4949  // Make the graph's column Map, if necessary.
4950  Teuchos::Array<int> remotePIDs (0);
4951  const bool mustBuildColMap = ! this->hasColMap ();
4952  if (mustBuildColMap) {
4953  this->myGraph_->makeColMap (remotePIDs);
4954  }
4955 
4956  // Make indices local, if necessary. The method won't do
4957  // anything if the graph is already locally indexed.
4958  const std::pair<size_t, std::string> makeIndicesLocalResult =
4959  this->myGraph_->makeIndicesLocal(verbose);
4960  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4961  // the error state to makeImportExport or
4962  // computeGlobalConstants, which may do all-reduces and thus may
4963  // have the opportunity to communicate that error state.
4964  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4965  (makeIndicesLocalResult.first != 0, std::runtime_error,
4966  makeIndicesLocalResult.second);
4967 
4968  const bool sorted = this->myGraph_->isSorted ();
4969  const bool merged = this->myGraph_->isMerged ();
4970  this->sortAndMergeIndicesAndValues (sorted, merged);
4971 
4972  // Make Import and Export objects, if they haven't been made
4973  // already. If we made a column Map above, reuse information
4974  // from that process to avoid communiation in the Import setup.
4975  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4976 
4977  // The matrix _does_ own the graph, so fill the local graph at
4978  // the same time as the local matrix.
4979  this->fillLocalGraphAndMatrix (params);
4980 
4981  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4982  params->get ("compute global constants", true);
4983  if (callGraphComputeGlobalConstants) {
4984  this->myGraph_->computeGlobalConstants ();
4985  }
4986  else {
4987  this->myGraph_->computeLocalConstants ();
4988  }
4989  this->myGraph_->fillComplete_ = true;
4990  this->myGraph_->checkInternalState ();
4991  }
4992 
4993  {
4994  Details::ProfilingRegion region_ccgc(
4995  "Tpetra::CrsMatrix::fillCompete", "callComputeGlobalConstamnts"
4996  );
4997  const bool callComputeGlobalConstants = params.get () == nullptr ||
4998  params->get ("compute global constants", true);
4999  if (callComputeGlobalConstants) {
5000  this->computeGlobalConstants ();
5001  }
5002  }
5003 
5004  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
5005 
5006  this->fillComplete_ = true; // Now we're fill complete!
5007  {
5008  Details::ProfilingRegion region_cis(
5009  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
5010  );
5011  this->checkInternalState ();
5012  }
5013  }
5014 
5015  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5016  void
5018  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
5019  const Teuchos::RCP<const map_type> & rangeMap,
5020  const Teuchos::RCP<const import_type>& importer,
5021  const Teuchos::RCP<const export_type>& exporter,
5022  const Teuchos::RCP<Teuchos::ParameterList> &params)
5023  {
5024 #ifdef HAVE_TPETRA_MMM_TIMINGS
5025  std::string label;
5026  if(!params.is_null())
5027  label = params->get("Timer Label",label);
5028  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
5029  using Teuchos::TimeMonitor;
5030 
5031  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
5032 #endif
5033 
5034  const char tfecfFuncName[] = "expertStaticFillComplete: ";
5035  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
5036  std::runtime_error, "Matrix fill state must be active (isFillActive() "
5037  "must be true) before calling fillComplete().");
5038  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
5039  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
5040 
5041  {
5042 #ifdef HAVE_TPETRA_MMM_TIMINGS
5043  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
5044 #endif
5045  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
5046  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
5047  }
5048 
5049  const bool callComputeGlobalConstants = params.get () == nullptr ||
5050  params->get ("compute global constants", true);
5051  if (callComputeGlobalConstants) {
5052  this->computeGlobalConstants ();
5053  }
5054 
5055  {
5056 #ifdef HAVE_TPETRA_MMM_TIMINGS
5057  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
5058 #endif
5059  // Fill the local graph and matrix
5060  fillLocalGraphAndMatrix (params);
5061  }
5062  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
5063 
5064  // Now we're fill complete!
5065  fillComplete_ = true;
5066 
5067  // Sanity checks at the end.
5068 #ifdef HAVE_TPETRA_DEBUG
5069  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
5070  ": We're at the end of fillComplete(), but isFillActive() is true. "
5071  "Please report this bug to the Tpetra developers.");
5072  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
5073  ": We're at the end of fillComplete(), but isFillActive() is true. "
5074  "Please report this bug to the Tpetra developers.");
5075 #endif // HAVE_TPETRA_DEBUG
5076  {
5077 #ifdef HAVE_TPETRA_MMM_TIMINGS
5078  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
5079 #endif
5080 
5081  checkInternalState();
5082  }
5083  }
5084 
5085  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5087  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
5088  {
5089  impl_scalar_type* rowValueIter = vals;
5090  // beg,end define a half-exclusive interval over which to iterate.
5091  LocalOrdinal* beg = cols;
5092  LocalOrdinal* end = cols + rowLen;
5093  LocalOrdinal* newend = beg;
5094  if (beg != end) {
5095  LocalOrdinal* cur = beg + 1;
5096  impl_scalar_type* vcur = rowValueIter + 1;
5097  impl_scalar_type* vend = rowValueIter;
5098  cur = beg+1;
5099  while (cur != end) {
5100  if (*cur != *newend) {
5101  // new entry; save it
5102  ++newend;
5103  ++vend;
5104  (*newend) = (*cur);
5105  (*vend) = (*vcur);
5106  }
5107  else {
5108  // old entry; merge it
5109  //(*vend) = f (*vend, *vcur);
5110  (*vend) += *vcur;
5111  }
5112  ++cur;
5113  ++vcur;
5114  }
5115  ++newend; // one past the last entry, per typical [beg,end) semantics
5116  }
5117  return newend - beg;
5118  }
5119 
5120  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5121  void
5123  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
5124  {
5125  using ::Tpetra::Details::ProfilingRegion;
5126  typedef LocalOrdinal LO;
5127  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
5128  host_execution_space;
5129  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
5130  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
5131  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
5132 
5133  if (! sorted || ! merged) {
5134  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5135  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
5136  "\"static\" (const) graph, since the matrix does not own the graph.");
5137  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5138  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
5139  "this matrix claims ! isStaticGraph(). "
5140  "Please report this bug to the Tpetra developers.");
5141  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5142  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
5143  "this method if the graph's storage has already been optimized. "
5144  "Please report this bug to the Tpetra developers.");
5145 
5146  crs_graph_type& graph = * (this->myGraph_);
5147  const LO lclNumRows = static_cast<LO> (this->getNodeNumRows ());
5148  size_t totalNumDups = 0;
5149  {
5150  //Accessing host unpacked (4-array CRS) local matrix.
5151  auto rowBegins_ = graph.rowPtrsUnpacked_host_;
5152  auto rowLengths_ = graph.k_numRowEntries_;
5153  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
5154  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
5155  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
5156  [=] (const LO lclRow, size_t& numDups) {
5157  size_t rowBegin = rowBegins_(lclRow);
5158  size_t rowLen = rowLengths_(lclRow);
5159  LO* cols = cols_.data() + rowBegin;
5160  impl_scalar_type* vals = vals_.data() + rowBegin;
5161  if (! sorted) {
5162  sort2 (cols, cols + rowLen, vals);
5163  }
5164  if (! merged) {
5165  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
5166  rowLengths_(lclRow) = newRowLength;
5167  numDups += rowLen - newRowLength;
5168  }
5169  }, totalNumDups);
5170  }
5171  if (! sorted) {
5172  graph.indicesAreSorted_ = true; // we just sorted every row
5173  }
5174  if (! merged) {
5175  graph.noRedundancies_ = true; // we just merged every row
5176  }
5177  }
5178  }
5179 
5180  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5181  void
5185  Scalar alpha,
5186  Scalar beta) const
5187  {
5189  using Teuchos::RCP;
5190  using Teuchos::rcp;
5191  using Teuchos::rcp_const_cast;
5192  using Teuchos::rcpFromRef;
5193  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5194  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
5195 
5196  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
5197  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
5198  // failing only for the Kokkos refactor version of Tpetra. It's a
5199  // good idea regardless to have the bypass.
5200  if (alpha == ZERO) {
5201  if (beta == ZERO) {
5202  Y_in.putScalar (ZERO);
5203  } else if (beta != ONE) {
5204  Y_in.scale (beta);
5205  }
5206  return;
5207  }
5208 
5209  // It's possible that X is a view of Y or vice versa. We don't
5210  // allow this (apply() requires that X and Y not alias one
5211  // another), but it's helpful to detect and work around this case.
5212  // We don't try to to detect the more subtle cases (e.g., one is a
5213  // subview of the other, but their initial pointers differ). We
5214  // only need to do this if this matrix's Import is trivial;
5215  // otherwise, we don't actually apply the operator from X into Y.
5216 
5217  RCP<const import_type> importer = this->getGraph ()->getImporter ();
5218  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
5219 
5220  // If beta == 0, then the output MV will be overwritten; none of
5221  // its entries should be read. (Sparse BLAS semantics say that we
5222  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
5223  // This matters if we need to do an Export operation; see below.
5224  const bool Y_is_overwritten = (beta == ZERO);
5225 
5226  // We treat the case of a replicated MV output specially.
5227  const bool Y_is_replicated =
5228  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
5229 
5230  // This is part of the special case for replicated MV output.
5231  // We'll let each process do its thing, but do an all-reduce at
5232  // the end to sum up the results. Setting beta=0 on all processes
5233  // but Proc 0 makes the math work out for the all-reduce. (This
5234  // assumes that the replicated data is correctly replicated, so
5235  // that the data are the same on all processes.)
5236  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
5237  beta = ZERO;
5238  }
5239 
5240  // Temporary MV for Import operation. After the block of code
5241  // below, this will be an (Imported if necessary) column Map MV
5242  // ready to give to localApply(...).
5243  RCP<const MV> X_colMap;
5244  if (importer.is_null ()) {
5245  if (! X_in.isConstantStride ()) {
5246  // Not all sparse mat-vec kernels can handle an input MV with
5247  // nonconstant stride correctly, so we have to copy it in that
5248  // case into a constant stride MV. To make a constant stride
5249  // copy of X_in, we force creation of the column (== domain)
5250  // Map MV (if it hasn't already been created, else fetch the
5251  // cached copy). This avoids creating a new MV each time.
5252  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
5253  Tpetra::deep_copy (*X_colMapNonConst, X_in);
5254  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
5255  }
5256  else {
5257  // The domain and column Maps are the same, so do the local
5258  // multiply using the domain Map input MV X_in.
5259  X_colMap = rcpFromRef (X_in);
5260  }
5261  }
5262  else { // need to Import source (multi)vector
5263  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
5264 
5265  // We're doing an Import anyway, which will copy the relevant
5266  // elements of the domain Map MV X_in into a separate column Map
5267  // MV. Thus, we don't have to worry whether X_in is constant
5268  // stride.
5269  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
5270 
5271  // Import from the domain Map MV to the column Map MV.
5272  X_colMapNonConst->doImport (X_in, *importer, INSERT);
5273  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
5274  }
5275 
5276  // Temporary MV for doExport (if needed), or for copying a
5277  // nonconstant stride output MV into a constant stride MV. This
5278  // is null if we don't need the temporary MV, that is, if the
5279  // Export is trivial (null).
5280  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
5281 
5282  // If we have a nontrivial Export object, we must perform an
5283  // Export. In that case, the local multiply result will go into
5284  // the row Map multivector. We don't have to make a
5285  // constant-stride version of Y_in in this case, because we had to
5286  // make a constant stride Y_rowMap MV and do an Export anyway.
5287  if (! exporter.is_null ()) {
5288  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
5289  {
5290  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
5291 
5292  // If we're overwriting the output MV Y_in completely (beta ==
5293  // 0), then make sure that it is filled with zeros before we
5294  // do the Export. Otherwise, the ADD combine mode will use
5295  // data in Y_in, which is supposed to be zero.
5296  if (Y_is_overwritten) {
5297  Y_in.putScalar (ZERO);
5298  }
5299  else {
5300  // Scale output MV by beta, so that doExport sums in the
5301  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
5302  Y_in.scale (beta);
5303  }
5304  // Do the Export operation.
5305  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
5306  }
5307  }
5308  else { // Don't do an Export: row Map and range Map are the same.
5309  //
5310  // If Y_in does not have constant stride, or if the column Map
5311  // MV aliases Y_in, then we can't let the kernel write directly
5312  // to Y_in. Instead, we have to use the cached row (== range)
5313  // Map MV as temporary storage.
5314  //
5315  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
5316  // the user passed in the same MultiVector for both X and Y. It
5317  // won't detect whether one MultiVector views the other. We
5318  // should also check the MultiVectors' raw data pointers.
5319  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
5320  // Force creating the MV if it hasn't been created already.
5321  // This will reuse a previously created cached MV.
5322  Y_rowMap = getRowMapMultiVector (Y_in, true);
5323 
5324  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
5325  // since we're overwriting it anyway.
5326  if (beta != ZERO) {
5327  Tpetra::deep_copy (*Y_rowMap, Y_in);
5328  }
5329  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
5330  Tpetra::deep_copy (Y_in, *Y_rowMap);
5331  }
5332  else {
5333  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
5334  }
5335  }
5336 
5337  // If the range Map is a locally replicated Map, sum up
5338  // contributions from each process. We set beta = 0 on all
5339  // processes but Proc 0 initially, so this will handle the scaling
5340  // factor beta correctly.
5341  if (Y_is_replicated) {
5342  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
5343  Y_in.reduce ();
5344  }
5345  }
5346 
5347  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5348  void
5352  const Teuchos::ETransp mode,
5353  Scalar alpha,
5354  Scalar beta) const
5355  {
5357  using Teuchos::null;
5358  using Teuchos::RCP;
5359  using Teuchos::rcp;
5360  using Teuchos::rcp_const_cast;
5361  using Teuchos::rcpFromRef;
5362  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5363 
5364  // Take shortcuts for alpha == 0.
5365  if (alpha == ZERO) {
5366  // Follow the Sparse BLAS convention by ignoring both the matrix
5367  // and X_in, in this case.
5368  if (beta == ZERO) {
5369  // Follow the Sparse BLAS convention by overwriting any Inf or
5370  // NaN values in Y_in, in this case.
5371  Y_in.putScalar (ZERO);
5372  }
5373  else {
5374  Y_in.scale (beta);
5375  }
5376  return;
5377  }
5378 
5379  const size_t numVectors = X_in.getNumVectors ();
5380 
5381  // We don't allow X_in and Y_in to alias one another. It's hard
5382  // to check this, because advanced users could create views from
5383  // raw pointers. However, if X_in and Y_in reference the same
5384  // object, we will do the user a favor by copying X into new
5385  // storage (with a warning). We only need to do this if we have
5386  // trivial importers; otherwise, we don't actually apply the
5387  // operator from X into Y.
5388  RCP<const import_type> importer = this->getGraph ()->getImporter ();
5389  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
5390  // access X indirectly, in case we need to create temporary storage
5391  RCP<const MV> X;
5392 
5393  // some parameters for below
5394  const bool Y_is_replicated = ! Y_in.isDistributed ();
5395  const bool Y_is_overwritten = (beta == ZERO);
5396  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
5397  beta = ZERO;
5398  }
5399 
5400  // The kernels do not allow input or output with nonconstant stride.
5401  if (! X_in.isConstantStride () && importer.is_null ()) {
5402  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
5403  } else {
5404  X = rcpFromRef (X_in); // Reference to X_in
5405  }
5406 
5407  // Set up temporary multivectors for Import and/or Export.
5408  if (importer != Teuchos::null) {
5409  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
5410  importMV_ = null;
5411  }
5412  if (importMV_ == null) {
5413  importMV_ = rcp (new MV (this->getColMap (), numVectors));
5414  }
5415  }
5416  if (exporter != Teuchos::null) {
5417  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
5418  exportMV_ = null;
5419  }
5420  if (exportMV_ == null) {
5421  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
5422  }
5423  }
5424 
5425  // If we have a non-trivial exporter, we must import elements that
5426  // are permuted or are on other processors.
5427  if (! exporter.is_null ()) {
5428  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
5429  exportMV_->doImport (X_in, *exporter, INSERT);
5430  X = exportMV_; // multiply out of exportMV_
5431  }
5432 
5433  // If we have a non-trivial importer, we must export elements that
5434  // are permuted or belong to other processors. We will compute
5435  // solution into the to-be-exported MV; get a view.
5436  if (importer != Teuchos::null) {
5437  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
5438 
5439  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
5440  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
5441  // back and figure out why this helps. importMV_ SHOULD be
5442  // completely overwritten in the localApply(...) call
5443  // below, because beta == ZERO there.
5444  importMV_->putScalar (ZERO);
5445  // Do the local computation.
5446  this->localApply (*X, *importMV_, mode, alpha, ZERO);
5447 
5448  if (Y_is_overwritten) {
5449  Y_in.putScalar (ZERO);
5450  } else {
5451  Y_in.scale (beta);
5452  }
5453  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
5454  }
5455  // otherwise, multiply into Y
5456  else {
5457  // can't multiply in-situ; can't multiply into non-strided multivector
5458  //
5459  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
5460  // the user passed in the same MultiVector for both X and Y. It
5461  // won't detect whether one MultiVector views the other. We
5462  // should also check the MultiVectors' raw data pointers.
5463  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
5464  // Make a deep copy of Y_in, into which to write the multiply result.
5465  MV Y (Y_in, Teuchos::Copy);
5466  this->localApply (*X, Y, mode, alpha, beta);
5467  Tpetra::deep_copy (Y_in, Y);
5468  } else {
5469  this->localApply (*X, Y_in, mode, alpha, beta);
5470  }
5471  }
5472 
5473  // If the range Map is a locally replicated map, sum the
5474  // contributions from each process. (That's why we set beta=0
5475  // above for all processes but Proc 0.)
5476  if (Y_is_replicated) {
5477  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
5478  Y_in.reduce ();
5479  }
5480  }
5481 
5482  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5483  void
5487  const Teuchos::ETransp mode,
5488  const Scalar& alpha,
5489  const Scalar& beta) const
5490  {
5492  using Teuchos::NO_TRANS;
5493  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5494 
5495  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5496  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5497  auto matrix_lcl = getLocalMultiplyOperator();
5498 
5499  const bool debug = ::Tpetra::Details::Behavior::debug ();
5500  if (debug) {
5501  const char tfecfFuncName[] = "localApply: ";
5502  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5503  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5504  "X.getNumVectors() = " << X.getNumVectors () << " != "
5505  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5506  const bool transpose = (mode != Teuchos::NO_TRANS);
5507  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5508  (! transpose && X.getLocalLength () !=
5509  getColMap ()->getNodeNumElements (), std::runtime_error,
5510  "NO_TRANS case: X has the wrong number of local rows. "
5511  "X.getLocalLength() = " << X.getLocalLength () << " != "
5512  "getColMap()->getNodeNumElements() = " <<
5513  getColMap ()->getNodeNumElements () << ".");
5514  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5515  (! transpose && Y.getLocalLength () !=
5516  getRowMap ()->getNodeNumElements (), std::runtime_error,
5517  "NO_TRANS case: Y has the wrong number of local rows. "
5518  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5519  "getRowMap()->getNodeNumElements() = " <<
5520  getRowMap ()->getNodeNumElements () << ".");
5521  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5522  (transpose && X.getLocalLength () !=
5523  getRowMap ()->getNodeNumElements (), std::runtime_error,
5524  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5525  "rows. X.getLocalLength() = " << X.getLocalLength ()
5526  << " != getRowMap()->getNodeNumElements() = "
5527  << getRowMap ()->getNodeNumElements () << ".");
5528  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5529  (transpose && Y.getLocalLength () !=
5530  getColMap ()->getNodeNumElements (), std::runtime_error,
5531  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5532  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5533  << " != getColMap()->getNodeNumElements() = "
5534  << getColMap ()->getNodeNumElements () << ".");
5535  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5536  (! isFillComplete (), std::runtime_error, "The matrix is not "
5537  "fill complete. You must call fillComplete() (possibly with "
5538  "domain and range Map arguments) without an intervening "
5539  "resumeFill() call before you may call this method.");
5540  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5541  (! X.isConstantStride () || ! Y.isConstantStride (),
5542  std::runtime_error, "X and Y must be constant stride.");
5543  // If the two pointers are null, then they don't alias one
5544  // another, even though they are equal.
5545  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5546  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr,
5547  std::runtime_error, "X and Y may not alias one another.");
5548  }
5549 
5550  LocalOrdinal nrows = getNodeNumRows();
5551  LocalOrdinal maxRowImbalance = 0;
5552  if(nrows != 0)
5553  maxRowImbalance = getNodeMaxNumRowEntries() - (getNodeNumEntries() / nrows);
5554 
5555  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5556  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5557  else
5558  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5559  }
5560 
5561  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5562  void
5566  Teuchos::ETransp mode,
5567  Scalar alpha,
5568  Scalar beta) const
5569  {
5571  const char fnName[] = "Tpetra::CrsMatrix::apply";
5572 
5573  TEUCHOS_TEST_FOR_EXCEPTION
5574  (! isFillComplete (), std::runtime_error,
5575  fnName << ": Cannot call apply() until fillComplete() "
5576  "has been called.");
5577 
5578  if (mode == Teuchos::NO_TRANS) {
5579  ProfilingRegion regionNonTranspose (fnName);
5580  this->applyNonTranspose (X, Y, alpha, beta);
5581  }
5582  else {
5583  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5584 
5585  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
5586  //when bets==0. This was not the case with transpose in a multithreaded
5587  //environment where a multiplication with subsequent atomic_adds is used
5588  //since 0 is effectively not special cased. Doing the explicit set to zero here
5589  //This catches cases where Y is nan or inf.
5590  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
5591  if (beta == ZERO) {
5592  Y.putScalar (ZERO);
5593  }
5594  this->applyTranspose (X, Y, mode, alpha, beta);
5595  }
5596  }
5597 
5598 
// Return a new CrsMatrix with value type T, built on this matrix's graph
// (this->getCrsGraph ()), with this matrix's values converted into it.
//
// The source matrix must be fill complete; otherwise the check below
// reports std::runtime_error. The returned matrix has fillComplete called
// on it with this matrix's domain and range Maps.
//
// NOTE(review): this listing omits the line with the CrsMatrix<...>::
// qualifier and the line that brings copyConvert into scope -- confirm
// against the full file.
5599  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5600  template<class T>
5601  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5603  convert () const
5604  {
5605  using Teuchos::RCP;
5606  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5607  const char tfecfFuncName[] = "convert: ";
5608 
5609  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5610  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5611  "of the conversion) is not fill complete. You must first call "
5612  "fillComplete() (possibly with the domain and range Map) without an "
5613  "intervening call to resumeFill(), before you may call this method.");
5614 
5615  RCP<output_matrix_type> newMatrix
5616  (new output_matrix_type (this->getCrsGraph ()));
5617  // Copy old values into new values. impl_scalar_type and T may
5618  // differ, so we can't use Kokkos::deep_copy.
5620  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5621  this->getLocalMatrixDevice ().values);
5622  // Since newMatrix has a static (const) graph, the graph already has
5623  // a column Map, and Import and Export objects already exist (if
5624  // applicable). Thus, calling fillComplete is cheap.
5625  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5626 
5627  return newMatrix;
5628  }
5629 
5630 
// Verify internal invariants of the matrix. The checks run only when
// Tpetra::Details::Behavior::debug ("CrsGraph") is true; otherwise this
// function does nothing. Each failed check reports std::logic_error.
//
// NOTE(review): this listing omits the line with the CrsMatrix<...>::
// qualifier.
5631  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5632  void
5634  checkInternalState () const
5635  {
5636  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5637  if (debug) {
5638  const char tfecfFuncName[] = "checkInternalState: ";
5639  const char err[] = "Internal state is not consistent. "
5640  "Please report this bug to the Tpetra developers.";
5641 
5642  // This version of the graph (RCP<const crs_graph_type>) must
5643  // always be nonnull.
5644  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5645  (staticGraph_.is_null (), std::logic_error, err);
5646  // myGraph == null means that the matrix has a const ("static")
5647  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5648  // graph), and then myGraph_ and staticGraph_ must be the same graph.
5649  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5650  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5651  std::logic_error, err);
5652  // if matrix is fill complete, then graph must be fill complete
5653  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5654  (isFillComplete () && ! staticGraph_->isFillComplete (),
5655  std::logic_error, err << " Specifically, the matrix is fill complete, "
5656  "but its graph is NOT fill complete.");
5657  // If the graph's indices are allocated with nonzero allocation size
5658  // and there are local rows, the unpacked values must be allocated too.
5659  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5660  (staticGraph_->indicesAreAllocated () &&
5661  staticGraph_->getNodeAllocationSize() > 0 &&
5662  staticGraph_->getNodeNumRows() > 0 &&
5663  valuesUnpacked_wdv.extent (0) == 0,
5664  std::logic_error, err);
5665  }
5666  }
5667 
// Return a one-line, brace-delimited summary of this matrix: the object
// label (if nonempty), whether it is fill complete, and its global
// dimensions. The global number of entries is included only when the
// matrix is fill complete.
//
// NOTE(review): this listing omits the line with the CrsMatrix<...>::
// qualifier.
5668  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5669  std::string
5671  description () const
5672  {
5673  std::ostringstream os;
5674 
5675  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5676  if (this->getObjectLabel () != "") {
5677  os << "Label: \"" << this->getObjectLabel () << "\", ";
5678  }
5679  if (isFillComplete ()) {
5680  os << "isFillComplete: true"
5681  << ", global dimensions: [" << getGlobalNumRows () << ", "
5682  << getGlobalNumCols () << "]"
5683  << ", global number of entries: " << getGlobalNumEntries ()
5684  << "}";
5685  }
5686  else {
5687  os << "isFillComplete: false"
5688  << ", global dimensions: [" << getGlobalNumRows () << ", "
5689  << getGlobalNumCols () << "]}";
5690  }
5691  return os.str ();
5692  }
5693 
// Print a description of this matrix to `out`, with the amount of detail
// chosen by `verbLevel` (VERB_DEFAULT is treated as VERB_LOW; VERB_NONE
// prints nothing).
//
// Summary information is printed by rank 0 only. From VERB_MEDIUM upward,
// the Maps are described and each rank prints its own data in rank order,
// with comm->barrier () calls between ranks; presumably every process in
// the communicator must therefore call this with the same verbosity --
// confirm against the class documentation.
//
// NOTE(review): this listing omits the line with the CrsMatrix<...>::
// qualifier.
5694  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5695  void
5697  describe (Teuchos::FancyOStream &out,
5698  const Teuchos::EVerbosityLevel verbLevel) const
5699  {
5700  using std::endl;
5701  using std::setw;
5702  using Teuchos::ArrayView;
5703  using Teuchos::Comm;
5704  using Teuchos::RCP;
5705  using Teuchos::TypeNameTraits;
5706  using Teuchos::VERB_DEFAULT;
5707  using Teuchos::VERB_NONE;
5708  using Teuchos::VERB_LOW;
5709  using Teuchos::VERB_MEDIUM;
5710  using Teuchos::VERB_HIGH;
5711  using Teuchos::VERB_EXTREME;
5712 
5713  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5714 
5715  if (vl == VERB_NONE) {
5716  return; // Don't print anything at all
5717  }
5718 
5719  // By convention, describe() always begins with a tab.
5720  Teuchos::OSTab tab0 (out);
5721 
5722  RCP<const Comm<int> > comm = this->getComm();
5723  const int myRank = comm->getRank();
5724  const int numProcs = comm->getSize();
// Column width for the per-row table below: the number of decimal digits
// needed for the global row count, but at least 11, plus 2 for spacing.
5725  size_t width = 1;
5726  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5727  ++width;
5728  }
5729  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5730 
5731  // none: print nothing
5732  // low: print O(1) info from node 0
5733  // medium: print O(P) info, num entries per process
5734  // high: print O(N) info, num entries per row
5735  // extreme: print O(NNZ) info: print indices and values
5736  //
5737  // for medium and higher, print constituent objects at specified verbLevel
5738  if (myRank == 0) {
5739  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5740  }
5741  Teuchos::OSTab tab1 (out);
5742 
// O(1) summary, printed by rank 0 only.
5743  if (myRank == 0) {
5744  if (this->getObjectLabel () != "") {
5745  out << "Label: \"" << this->getObjectLabel () << "\", ";
5746  }
5747  {
5748  out << "Template parameters:" << endl;
5749  Teuchos::OSTab tab2 (out);
5750  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5751  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5752  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5753  << "Node: " << TypeNameTraits<Node>::name () << endl;
5754  }
5755  if (isFillComplete()) {
5756  out << "isFillComplete: true" << endl
5757  << "Global dimensions: [" << getGlobalNumRows () << ", "
5758  << getGlobalNumCols () << "]" << endl
5759  << "Global number of entries: " << getGlobalNumEntries () << endl
5760  << endl << "Global max number of entries in a row: "
5761  << getGlobalMaxNumRowEntries () << endl;
5762  }
5763  else {
5764  out << "isFillComplete: false" << endl
5765  << "Global dimensions: [" << getGlobalNumRows () << ", "
5766  << getGlobalNumCols () << "]" << endl;
5767  }
5768  }
5769 
5770  if (vl < VERB_MEDIUM) {
5771  return; // all done!
5772  }
5773 
5774  // Describe the row Map.
5775  if (myRank == 0) {
5776  out << endl << "Row Map:" << endl;
5777  }
5778  if (getRowMap ().is_null ()) {
5779  if (myRank == 0) {
5780  out << "null" << endl;
5781  }
5782  }
5783  else {
5784  if (myRank == 0) {
5785  out << endl;
5786  }
5787  getRowMap ()->describe (out, vl);
5788  }
5789 
5790  // Describe the column Map. A Map that is the same object (RCP
5791  // equality) as one already described is only named, not re-described.
5791  if (myRank == 0) {
5792  out << "Column Map: ";
5793  }
5794  if (getColMap ().is_null ()) {
5795  if (myRank == 0) {
5796  out << "null" << endl;
5797  }
5798  } else if (getColMap () == getRowMap ()) {
5799  if (myRank == 0) {
5800  out << "same as row Map" << endl;
5801  }
5802  } else {
5803  if (myRank == 0) {
5804  out << endl;
5805  }
5806  getColMap ()->describe (out, vl);
5807  }
5808 
5809  // Describe the domain Map.
5810  if (myRank == 0) {
5811  out << "Domain Map: ";
5812  }
5813  if (getDomainMap ().is_null ()) {
5814  if (myRank == 0) {
5815  out << "null" << endl;
5816  }
5817  } else if (getDomainMap () == getRowMap ()) {
5818  if (myRank == 0) {
5819  out << "same as row Map" << endl;
5820  }
5821  } else if (getDomainMap () == getColMap ()) {
5822  if (myRank == 0) {
5823  out << "same as column Map" << endl;
5824  }
5825  } else {
5826  if (myRank == 0) {
5827  out << endl;
5828  }
5829  getDomainMap ()->describe (out, vl);
5830  }
5831 
5832  // Describe the range Map.
5833  if (myRank == 0) {
5834  out << "Range Map: ";
5835  }
5836  if (getRangeMap ().is_null ()) {
5837  if (myRank == 0) {
5838  out << "null" << endl;
5839  }
5840  } else if (getRangeMap () == getDomainMap ()) {
5841  if (myRank == 0) {
5842  out << "same as domain Map" << endl;
5843  }
5844  } else if (getRangeMap () == getRowMap ()) {
5845  if (myRank == 0) {
5846  out << "same as row Map" << endl;
5847  }
5848  } else {
5849  if (myRank == 0) {
5850  out << endl;
5851  }
5852  getRangeMap ()->describe (out, vl);
5853  }
5854 
5855  // O(P) data: each rank prints its own entry counts, in rank order.
5856  for (int curRank = 0; curRank < numProcs; ++curRank) {
5857  if (myRank == curRank) {
5858  out << "Process rank: " << curRank << endl;
5859  Teuchos::OSTab tab2 (out);
5860  if (! staticGraph_->indicesAreAllocated ()) {
5861  out << "Graph indices not allocated" << endl;
5862  }
5863  else {
5864  out << "Number of allocated entries: "
5865  << staticGraph_->getNodeAllocationSize () << endl;
5866  }
5867  out << "Number of entries: " << getNodeNumEntries () << endl
5868  << "Max number of entries per row: " << getNodeMaxNumRowEntries ()
5869  << endl;
5870  }
5871  // Give output time to complete by executing some barriers.
5872  comm->barrier ();
5873  comm->barrier ();
5874  comm->barrier ();
5875  }
5876 
5877  if (vl < VERB_HIGH) {
5878  return; // all done!
5879  }
5880 
5881  // O(N) and O(NNZ) data: one table row per local matrix row; at
5882  // VERB_EXTREME each row also lists its (global column index, value) pairs.
5882  for (int curRank = 0; curRank < numProcs; ++curRank) {
5883  if (myRank == curRank) {
5884  out << std::setw(width) << "Proc Rank"
5885  << std::setw(width) << "Global Row"
5886  << std::setw(width) << "Num Entries";
5887  if (vl == VERB_EXTREME) {
5888  out << std::setw(width) << "(Index,Value)";
5889  }
5890  out << endl;
5891  for (size_t r = 0; r < getNodeNumRows (); ++r) {
5892  const size_t nE = getNumEntriesInLocalRow(r);
5893  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5894  out << std::setw(width) << myRank
5895  << std::setw(width) << gid
5896  << std::setw(width) << nE;
5897  if (vl == VERB_EXTREME) {
5898  if (isGloballyIndexed()) {
5899  global_inds_host_view_type rowinds;
5900  values_host_view_type rowvals;
5901  getGlobalRowView (gid, rowinds, rowvals);
5902  for (size_t j = 0; j < nE; ++j) {
5903  out << " (" << rowinds[j]
5904  << ", " << rowvals[j]
5905  << ") ";
5906  }
5907  }
5908  else if (isLocallyIndexed()) {
5909  local_inds_host_view_type rowinds;
5910  values_host_view_type rowvals;
5911  getLocalRowView (r, rowinds, rowvals);
// Local column indices are converted to global ones through the column Map.
5912  for (size_t j=0; j < nE; ++j) {
5913  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5914  << ", " << rowvals[j]
5915  << ") ";
5916  }
5917  } // globally or locally indexed
5918  } // vl == VERB_EXTREME
5919  out << endl;
5920  } // for each row r on this process
5921  } // if (myRank == curRank)
5922 
5923  // Give output time to complete
5924  comm->barrier ();
5925  comm->barrier ();
5926  comm->barrier ();
5927  } // for each process rank curRank
5928  }
5929 
// Report whether `source` may be used as the source object for this
// matrix: returns true exactly when `source` can be dynamic_cast to
// row_matrix_type. No size comparison is performed.
//
// NOTE(review): this listing omits the line with the CrsMatrix<...>::
// qualifier.
5930  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5931  bool
5933  checkSizes (const SrcDistObject& source)
5934  {
5935  // It's not clear what kind of compatibility checks on sizes can
5936  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5937  // compatibility.
5938 
5939  // Currently, the source object must be a RowMatrix with the same
5940  // four template parameters as the target CrsMatrix. We might
5941  // relax this requirement later.
5942  const row_matrix_type* srcRowMat =
5943  dynamic_cast<const row_matrix_type*> (&source);
5944  return (srcRowMat != nullptr);
5945  }
5946 
// Apply `padding` to this matrix's unpacked storage (the graph's column
// indices and this matrix's values) by calling Details::padCrsArrays, then
// store the resulting row offsets back into the graph.
//
// padding: per-row padding description, passed through to padCrsArrays.
// verbose: when true, progress messages are written to std::cerr.
//
// Reports std::logic_error if, after padding, the values and column-index
// storage have different lengths.
//
// NOTE(review): this listing omits the lines with the CrsMatrix<...>::
// qualifier and the function name, plus one line at the top of the body
// (presumably a using-declaration) -- confirm against the full file.
5947  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5948  void
5951  const typename crs_graph_type::padding_type& padding,
5952  const bool verbose)
5953  {
5955  using Details::padCrsArrays;
5956  using std::endl;
5957  using LO = local_ordinal_type;
5958  using row_ptrs_type =
5959  typename local_graph_device_type::row_map_type::non_const_type;
5960  using range_policy =
5961  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5962  const char tfecfFuncName[] = "applyCrsPadding";
5963  const char suffix[] =
5964  ". Please report this bug to the Tpetra developers.";
5965  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5966 
5967  std::unique_ptr<std::string> prefix;
5968  if (verbose) {
5969  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5970  std::ostringstream os;
5971  os << *prefix << "padding: ";
5972  padding.print(os);
5973  os << endl;
5974  std::cerr << os.str();
5975  }
// The rank is looked up only in verbose mode; -1 stands for "not verbose"
// or "no Map / communicator available". It is passed on to padCrsArrays.
5976  const int myRank = ! verbose ? -1 : [&] () {
5977  auto map = this->getMap();
5978  if (map.is_null()) {
5979  return -1;
5980  }
5981  auto comm = map->getComm();
5982  if (comm.is_null()) {
5983  return -1;
5984  }
5985  return comm->getRank();
5986  } ();
5987 
5988  // NOTE (mfh 29 Jan 2020) This allocates the values array.
// NOTE(review): the verbose message below says "Call allocateIndices",
// but the call that follows is allocateValues -- confirm the intended
// wording.
5989  if (! myGraph_->indicesAreAllocated()) {
5990  if (verbose) {
5991  std::ostringstream os;
5992  os << *prefix << "Call allocateIndices" << endl;
5993  std::cerr << os.str();
5994  }
5995  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5996  }
5997 
5998  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5999  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
6000  // size needs to increase. That should be the job of
6001  // padCrsArrays.
6002 
6003  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
6004  // would use it directly.
6005 
6006  if (verbose) {
6007  std::ostringstream os;
6008  os << *prefix << "Allocate row_ptrs_beg: "
6009  << myGraph_->rowPtrsUnpacked_host_.extent(0) << endl;
6010  std::cerr << os.str();
6011  }
6012  using Kokkos::view_alloc;
6013  using Kokkos::WithoutInitializing;
// row_ptr_beg: writable copy of the graph's unpacked row offsets.
6014  row_ptrs_type row_ptr_beg(
6015  view_alloc("row_ptr_beg", WithoutInitializing),
6016  myGraph_->rowPtrsUnpacked_dev_.extent(0));
6017  Kokkos::deep_copy(row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
6018 
// N: number of rows, i.e. one less than the number of row offsets
// (or zero when there are no offsets at all).
6019  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
6020  size_t(row_ptr_beg.extent(0) - 1);
6021  if (verbose) {
6022  std::ostringstream os;
6023  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
6024  std::cerr << os.str();
6025  }
// row_ptr_end(i): offset one past the last used entry of row i;
// filled in by one of the two kernels below.
6026  row_ptrs_type row_ptr_end(
6027  view_alloc("row_ptr_end", WithoutInitializing), N);
6028 
6029  row_ptrs_type num_row_entries_d;
6030 
// A nonempty k_numRowEntries_ is taken to mean unpacked storage, in which
// case the per-row counts must be recomputed after padding.
6031  const bool refill_num_row_entries =
6032  myGraph_->k_numRowEntries_.extent(0) != 0;
6033 
6034  if (refill_num_row_entries) { // unpacked storage
6035  // We can't assume correct *this capture until C++17, and it's
6036  // likely more efficient just to capture what we need anyway.
6037  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
6038  myGraph_->k_numRowEntries_);
6039  Kokkos::parallel_for
6040  ("Fill end row pointers", range_policy(0, N),
6041  KOKKOS_LAMBDA (const size_t i) {
6042  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
6043  });
6044  }
6045  else {
6046  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
6047  // storage, we don't need row_ptr_end to be separate allocation;
6048  // could just have it alias row_ptr_beg+1.
6049  Kokkos::parallel_for
6050  ("Fill end row pointers", range_policy(0, N),
6051  KOKKOS_LAMBDA (const size_t i) {
6052  row_ptr_end(i) = row_ptr_beg(i+1);
6053  });
6054  }
6055 
// Pad whichever column-index storage the graph currently uses (global or
// local) together with the values, then check that both have equal length.
6056  if (myGraph_->isGloballyIndexed()) {
6057  padCrsArrays(row_ptr_beg, row_ptr_end,
6058  myGraph_->gblInds_wdv,
6059  valuesUnpacked_wdv, padding, myRank, verbose);
6060  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
6061  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
6062  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6063  (newValuesLen != newColIndsLen, std::logic_error,
6064  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
6065  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
6066  << suffix);
6067  }
6068  else {
6069  padCrsArrays(row_ptr_beg, row_ptr_end,
6070  myGraph_->lclIndsUnpacked_wdv,
6071  valuesUnpacked_wdv, padding, myRank, verbose);
6072  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
6073  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
6074  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6075  (newValuesLen != newColIndsLen, std::logic_error,
6076  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
6077  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
6078  << suffix);
6079  }
6080 
// Recompute the per-row entry counts from the begin/end offsets and copy
// them back into the graph.
6081  if (refill_num_row_entries) {
6082  Kokkos::parallel_for
6083  ("Fill num entries", range_policy(0, N),
6084  KOKKOS_LAMBDA (const size_t i) {
6085  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
6086  });
6087  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
6088  }
6089 
// NOTE(review): the size assertion below runs only when verbose is true --
// confirm whether it was meant to be unconditional.
6090  if (verbose) {
6091  std::ostringstream os;
6092  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
6093  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
6094  << ", new size: " << row_ptr_beg.extent(0) << endl;
6095  std::cerr << os.str();
6096  TEUCHOS_ASSERT( myGraph_->rowPtrsUnpacked_host_.extent(0) ==
6097  row_ptr_beg.extent(0) );
6098  }
6099  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
6100  }
6101 
6102  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6103  void
6104  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6105  copyAndPermuteStaticGraph(
6106  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
6107  const size_t numSameIDs,
6108  const LocalOrdinal permuteToLIDs[],
6109  const LocalOrdinal permuteFromLIDs[],
6110  const size_t numPermutes)
6111  {
6112  using Details::ProfilingRegion;
6113  using Teuchos::Array;
6114  using Teuchos::ArrayView;
6115  using std::endl;
6116  using LO = LocalOrdinal;
6117  using GO = GlobalOrdinal;
6118  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
6119  const char suffix[] =
6120  " Please report this bug to the Tpetra developers.";
6121  ProfilingRegion regionCAP
6122  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
6123 
6124  const bool debug = Details::Behavior::debug("CrsGraph");
6125  const bool verbose = Details::Behavior::verbose("CrsGraph");
6126  std::unique_ptr<std::string> prefix;
6127  if (verbose) {
6128  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
6129  std::ostringstream os;
6130  os << *prefix << "Start" << endl;
6131  }
6132  const char* const prefix_raw =
6133  verbose ? prefix.get()->c_str() : nullptr;
6134 
6135  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
6136  //
6137  // Copy the first numSame row from source to target (this matrix).
6138  // This involves copying rows corresponding to LIDs [0, numSame-1].
6139  //
6140  const map_type& srcRowMap = * (srcMat.getRowMap ());
6141  nonconst_global_inds_host_view_type rowInds;
6142  nonconst_values_host_view_type rowVals;
6143  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
6144  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
6145  // Global ID for the current row index in the source matrix.
6146  // The first numSameIDs GIDs in the two input lists are the
6147  // same, so sourceGID == targetGID in this case.
6148  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
6149  const GO targetGID = sourceGID;
6150 
6151  ArrayView<const GO>rowIndsConstView;
6152  ArrayView<const Scalar> rowValsConstView;
6153 
6154  if (sourceIsLocallyIndexed) {
6155  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6156  if (rowLength > static_cast<size_t> (rowInds.size())) {
6157  Kokkos::resize(rowInds,rowLength);
6158  Kokkos::resize(rowVals,rowLength);
6159  }
6160  // Resizing invalidates an Array's views, so we must make new
6161  // ones, even if rowLength hasn't changed.
6162  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
6163  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
6164 
6165  // The source matrix is locally indexed, so we have to get a
6166  // copy. Really it's the GIDs that have to be copied (because
6167  // they have to be converted from LIDs).
6168  size_t checkRowLength = 0;
6169  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
6170  rowValsView, checkRowLength);
6171  if (debug) {
6172  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6173  (rowLength != checkRowLength, std::logic_error, "For "
6174  "global row index " << sourceGID << ", the source "
6175  "matrix's getNumEntriesInGlobalRow returns a row length "
6176  "of " << rowLength << ", but getGlobalRowCopy reports "
6177  "a row length of " << checkRowLength << "." << suffix);
6178  }
6179 
6180  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6181  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6182  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6183  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6184  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6185  rowIndsView.data(), rowIndsView.extent(0),
6186  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6187  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6188  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6189  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6190  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6191  // KDDKDD UVM TEMPORARY: KokkosView interface
6192  }
6193  else { // source matrix is globally indexed.
6194  global_inds_host_view_type rowIndsView;
6195  values_host_view_type rowValsView;
6196  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6197  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6198  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6199  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6200  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6201  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6202  rowIndsView.data(), rowIndsView.extent(0),
6203  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6204  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6205  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6206  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6207  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6208  // KDDKDD UVM TEMPORARY: KokkosView interface
6209 
6210  }
6211 
6212  // Applying a permutation to a matrix with a static graph
6213  // means REPLACE-ing entries.
6214  combineGlobalValues(targetGID, rowIndsConstView,
6215  rowValsConstView, REPLACE,
6216  prefix_raw, debug, verbose);
6217  }
6218 
6219  if (verbose) {
6220  std::ostringstream os;
6221  os << *prefix << "Do permutes" << endl;
6222  }
6223 
6224  const map_type& tgtRowMap = * (this->getRowMap ());
6225  for (size_t p = 0; p < numPermutes; ++p) {
6226  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
6227  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
6228 
6229  ArrayView<const GO> rowIndsConstView;
6230  ArrayView<const Scalar> rowValsConstView;
6231 
6232  if (sourceIsLocallyIndexed) {
6233  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6234  if (rowLength > static_cast<size_t> (rowInds.size ())) {
6235  Kokkos::resize(rowInds,rowLength);
6236  Kokkos::resize(rowVals,rowLength);
6237  }
6238  // Resizing invalidates an Array's views, so we must make new
6239  // ones, even if rowLength hasn't changed.
6240  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
6241  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
6242 
6243  // The source matrix is locally indexed, so we have to get a
6244  // copy. Really it's the GIDs that have to be copied (because
6245  // they have to be converted from LIDs).
6246  size_t checkRowLength = 0;
6247  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6248  rowValsView, checkRowLength);
6249  if (debug) {
6250  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6251  (rowLength != checkRowLength, std::logic_error, "For "
6252  "source matrix global row index " << sourceGID << ", "
6253  "getNumEntriesInGlobalRow returns a row length of " <<
6254  rowLength << ", but getGlobalRowCopy a row length of "
6255  << checkRowLength << "." << suffix);
6256  }
6257 
6258  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6259  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6260  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6261  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6262  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6263  rowIndsView.data(), rowIndsView.extent(0),
6264  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6265  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6266  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6267  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6268  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6269  // KDDKDD UVM TEMPORARY: KokkosView interface
6270  }
6271  else {
6272  global_inds_host_view_type rowIndsView;
6273  values_host_view_type rowValsView;
6274  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6275  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6276  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6277  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6278  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6279  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6280  rowIndsView.data(), rowIndsView.extent(0),
6281  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6282  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6283  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6284  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6285  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6286  // KDDKDD UVM TEMPORARY: KokkosView interface
6287  }
6288 
6289  combineGlobalValues(targetGID, rowIndsConstView,
6290  rowValsConstView, REPLACE,
6291  prefix_raw, debug, verbose);
6292  }
6293 
6294  if (verbose) {
6295  std::ostringstream os;
6296  os << *prefix << "Done" << endl;
6297  }
6298  }
6299 
6300  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6301  void
6302  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6303  copyAndPermuteNonStaticGraph(
6304  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
6305  const size_t numSameIDs,
6306  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
6307  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
6308  const size_t numPermutes)
6309  {
6310  using Details::ProfilingRegion;
6311  using Teuchos::Array;
6312  using Teuchos::ArrayView;
6313  using std::endl;
6314  using LO = LocalOrdinal;
6315  using GO = GlobalOrdinal;
6316  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
6317  const char suffix[] =
6318  " Please report this bug to the Tpetra developers.";
6319  ProfilingRegion regionCAP
6320  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
6321 
6322  const bool debug = Details::Behavior::debug("CrsGraph");
6323  const bool verbose = Details::Behavior::verbose("CrsGraph");
6324  std::unique_ptr<std::string> prefix;
6325  if (verbose) {
6326  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
6327  std::ostringstream os;
6328  os << *prefix << "Start" << endl;
6329  }
6330  const char* const prefix_raw =
6331  verbose ? prefix.get()->c_str() : nullptr;
6332 
6333  {
6334  using row_graph_type = RowGraph<LO, GO, Node>;
6335  const row_graph_type& srcGraph = *(srcMat.getGraph());
6336  auto padding =
6337  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
6338  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
6339  applyCrsPadding(*padding, verbose);
6340  }
6341  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
6342  //
6343  // Copy the first numSame row from source to target (this matrix).
6344  // This involves copying rows corresponding to LIDs [0, numSame-1].
6345  //
6346  const map_type& srcRowMap = * (srcMat.getRowMap ());
6347  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
6348  using gids_type = nonconst_global_inds_host_view_type;
6349  using vals_type = nonconst_values_host_view_type;
6350  gids_type rowInds;
6351  vals_type rowVals;
6352  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
6353  // Global ID for the current row index in the source matrix.
6354  // The first numSameIDs GIDs in the two input lists are the
6355  // same, so sourceGID == targetGID in this case.
6356  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
6357  const GO targetGID = sourceGID;
6358 
6359  ArrayView<const GO> rowIndsConstView;
6360  ArrayView<const Scalar> rowValsConstView;
6361 
6362  if (sourceIsLocallyIndexed) {
6363 
6364  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6365  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
6366  Kokkos::resize(rowInds,rowLength);
6367  Kokkos::resize(rowVals,rowLength);
6368  }
6369  // Resizing invalidates an Array's views, so we must make new
6370  // ones, even if rowLength hasn't changed.
6371  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
6372  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
6373 
6374  // The source matrix is locally indexed, so we have to get a
6375  // copy. Really it's the GIDs that have to be copied (because
6376  // they have to be converted from LIDs).
6377  size_t checkRowLength = 0;
6378  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
6379  checkRowLength);
6380  if (debug) {
6381  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6382  (rowLength != checkRowLength, std::logic_error, ": For "
6383  "global row index " << sourceGID << ", the source "
6384  "matrix's getNumEntriesInGlobalRow returns a row length "
6385  "of " << rowLength << ", but getGlobalRowCopy reports "
6386  "a row length of " << checkRowLength << "." << suffix);
6387  }
6388  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6389  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6390  }
6391  else { // source matrix is globally indexed.
6392  global_inds_host_view_type rowIndsView;
6393  values_host_view_type rowValsView;
6394  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6395 
6396  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6397  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6398  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6399  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6400  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6401  rowIndsView.data(), rowIndsView.extent(0),
6402  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6403  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6404  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6405  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6406  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6407  // KDDKDD UVM TEMPORARY: KokkosView interface
6408  }
6409 
6410  // Combine the data into the target matrix.
6411  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6412  rowValsConstView, prefix_raw, debug, verbose);
6413  }
6414 
6415  if (verbose) {
6416  std::ostringstream os;
6417  os << *prefix << "Do permutes" << endl;
6418  }
6419  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
6420  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
6421 
6422  const map_type& tgtRowMap = * (this->getRowMap ());
6423  for (size_t p = 0; p < numPermutes; ++p) {
6424  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
6425  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
6426 
6427  ArrayView<const GO> rowIndsConstView;
6428  ArrayView<const Scalar> rowValsConstView;
6429 
6430  if (sourceIsLocallyIndexed) {
6431  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
6432  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
6433  Kokkos::resize(rowInds,rowLength);
6434  Kokkos::resize(rowVals,rowLength);
6435  }
6436  // Resizing invalidates an Array's views, so we must make new
6437  // ones, even if rowLength hasn't changed.
6438  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
6439  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
6440 
6441  // The source matrix is locally indexed, so we have to get a
6442  // copy. Really it's the GIDs that have to be copied (because
6443  // they have to be converted from LIDs).
6444  size_t checkRowLength = 0;
6445  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6446  rowValsView, checkRowLength);
6447  if (debug) {
6448  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6449  (rowLength != checkRowLength, std::logic_error, "For "
6450  "source matrix global row index " << sourceGID << ", "
6451  "getNumEntriesInGlobalRow returns a row length of " <<
6452  rowLength << ", but getGlobalRowCopy a row length of "
6453  << checkRowLength << "." << suffix);
6454  }
6455  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6456  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6457  }
6458  else {
6459  global_inds_host_view_type rowIndsView;
6460  values_host_view_type rowValsView;
6461  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6462 
6463  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6464  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6465  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6466  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6467  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6468  rowIndsView.data(), rowIndsView.extent(0),
6469  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6470  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6471  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6472  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6473  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6474  // KDDKDD UVM TEMPORARY: KokkosView interface
6475  }
6476 
6477  // Combine the data into the target matrix.
6478  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6479  rowValsConstView, prefix_raw, debug, verbose);
6480  }
6481 
6482  if (verbose) {
6483  std::ostringstream os;
6484  os << *prefix << "Done" << endl;
6485  }
6486  }
6487 
6488  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6489  void
6490  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6491  copyAndPermute(
6492  const SrcDistObject& srcObj,
6493  const size_t numSameIDs,
6494  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6495  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6496  const CombineMode /*CM*/)
6497  {
6498  using Details::Behavior;
6500  using Details::ProfilingRegion;
6501  using std::endl;
6502 
6503  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6504  const char tfecfFuncName[] = "copyAndPermute: ";
6505  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6506 
6507  const bool verbose = Behavior::verbose("CrsMatrix");
6508  std::unique_ptr<std::string> prefix;
6509  if (verbose) {
6510  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6511  std::ostringstream os;
6512  os << *prefix << endl
6513  << *prefix << " numSameIDs: " << numSameIDs << endl
6514  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6515  << endl
6516  << *prefix << " "
6517  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6518  << endl
6519  << *prefix << " "
6520  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6521  << endl
6522  << *prefix << " "
6523  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6524  << endl;
6525  std::cerr << os.str ();
6526  }
6527 
6528  const auto numPermute = permuteToLIDs.extent (0);
6529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6530  (numPermute != permuteFromLIDs.extent (0),
6531  std::invalid_argument, "permuteToLIDs.extent(0) = "
6532  << numPermute << "!= permuteFromLIDs.extent(0) = "
6533  << permuteFromLIDs.extent (0) << ".");
6534 
6535  // This dynamic cast should succeed, because we've already tested
6536  // it in checkSizes().
6537  using RMT = RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
6538  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6539  if (isStaticGraph ()) {
6540  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6541  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6542  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6543  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6544 
6545  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6546  permuteToLIDs_h.data(),
6547  permuteFromLIDs_h.data(),
6548  numPermute);
6549  }
6550  else {
6551  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6552  permuteFromLIDs, numPermute);
6553  }
6554 
6555  if (verbose) {
6556  std::ostringstream os;
6557  os << *prefix << "Done" << endl;
6558  std::cerr << os.str();
6559  }
6560  }
6561 
6562  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6563  void
6564  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6565  packAndPrepare
6566  (const SrcDistObject& source,
6567  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6568  Kokkos::DualView<char*, buffer_device_type>& exports,
6569  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6570  size_t& constantNumPackets)
6571  {
6572  using Details::Behavior;
6574  using Details::ProfilingRegion;
6575  using Teuchos::outArg;
6576  using Teuchos::REDUCE_MAX;
6577  using Teuchos::reduceAll;
6578  using std::endl;
6579  typedef LocalOrdinal LO;
6580  typedef GlobalOrdinal GO;
6581  const char tfecfFuncName[] = "packAndPrepare: ";
6582  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6583 
6584  const bool debug = Behavior::debug("CrsMatrix");
6585  const bool verbose = Behavior::verbose("CrsMatrix");
6586 
6587  // Processes on which the communicator is null should not participate.
6588  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6589  if (pComm.is_null ()) {
6590  return;
6591  }
6592  const Teuchos::Comm<int>& comm = *pComm;
6593  const int myRank = comm.getSize ();
6594 
6595  std::unique_ptr<std::string> prefix;
6596  if (verbose) {
6597  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6598  std::ostringstream os;
6599  os << *prefix << "Start" << endl
6600  << *prefix << " "
6601  << dualViewStatusToString (exportLIDs, "exportLIDs")
6602  << endl
6603  << *prefix << " "
6604  << dualViewStatusToString (exports, "exports")
6605  << endl
6606  << *prefix << " "
6607  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6608  << endl;
6609  std::cerr << os.str ();
6610  }
6611 
6612  // Attempt to cast the source object to CrsMatrix. If successful,
6613  // use the source object's packNew() method to pack its data for
6614  // communication. Otherwise, attempt to cast to RowMatrix; if
6615  // successful, use the source object's pack() method. Otherwise,
6616  // the source object doesn't have the right type.
6617  //
6618  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6619  // RowMatrix to have the same Node type. Unfortunately, we don't
6620  // have a way to ask if the RowMatrix is "a RowMatrix with any
6621  // Node type," since RowMatrix doesn't have a base class. A
6622  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6623  // not currently exist, would satisfy this requirement.
6624  //
6625  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6626  // type doesn't technically need to match the target object's
6627  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6628  // and GO need not be the same, as long as there is no overflow of
6629  // the indices. However, checking for index overflow is global
6630  // and therefore undesirable.
6631 
6632  std::ostringstream msg; // for collecting error messages
6633  int lclBad = 0; // to be set below
6634 
6635  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6636  const crs_matrix_type* srcCrsMat =
6637  dynamic_cast<const crs_matrix_type*> (&source);
6638  if (srcCrsMat != nullptr) {
6639  if (verbose) {
6640  std::ostringstream os;
6641  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6642  "calling packNew" << endl;
6643  std::cerr << os.str ();
6644  }
6645  try {
6646  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6647  constantNumPackets);
6648  }
6649  catch (std::exception& e) {
6650  lclBad = 1;
6651  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6652  }
6653  }
6654  else {
6655  using Kokkos::HostSpace;
6656  using Kokkos::subview;
6657  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6658  using range_type = Kokkos::pair<size_t, size_t>;
6659 
6660  if (verbose) {
6661  std::ostringstream os;
6662  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6663  << endl;
6664  std::cerr << os.str ();
6665  }
6666 
6667  const row_matrix_type* srcRowMat =
6668  dynamic_cast<const row_matrix_type*> (&source);
6669  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6670  (srcRowMat == nullptr, std::invalid_argument,
6671  "The source object of the Import or Export operation is neither a "
6672  "CrsMatrix (with the same template parameters as the target object), "
6673  "nor a RowMatrix (with the same first four template parameters as the "
6674  "target object).");
6675 
6676  // For the RowMatrix case, we need to convert from
6677  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6678  // so terribly efficient, since packing a non-CrsMatrix
6679  // RowMatrix for Import/Export into a CrsMatrix is not a
6680  // critical case. Thus, we may allocate Teuchos::Array objects
6681  // here and copy to and from Kokkos::*View.
6682 
6683  // View exportLIDs's host data as a Teuchos::ArrayView.
6684  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6685  auto exportLIDs_h = exportLIDs.view_host ();
6686  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6687  exportLIDs_h.size ());
6688 
6689  // pack() will allocate exports_a as needed. We'll copy back
6690  // into exports (after (re)allocating exports if needed) below.
6691  Teuchos::Array<char> exports_a;
6692 
6693  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6694  // need to sync, since we're doing write-only access, but we do
6695  // need to mark the DualView as modified on host.
6696 
6697  numPacketsPerLID.clear_sync_state (); // write-only access
6698  numPacketsPerLID.modify_host ();
6699  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6700  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6701  numPacketsPerLID_h.size ());
6702 
6703  // Invoke RowMatrix's legacy pack() interface, using above
6704  // Teuchos::Array* objects.
6705  try {
6706  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6707  constantNumPackets);
6708  }
6709  catch (std::exception& e) {
6710  lclBad = 1;
6711  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6712  }
6713 
6714  // Allocate 'exports', and copy exports_a back into it.
6715  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6716  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6717  const std::string oldLabel = exports.d_view.label ();
6718  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6719  exports = exports_type (newLabel, newAllocSize);
6720  }
6721  // It's safe to assume that we're working on host anyway, so
6722  // just keep exports sync'd to host.
6723  // ignore current device contents
6724  exports.modify_host();
6725 
6726  auto exports_h = exports.view_host ();
6727  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6728 
6729  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6730  // exports_a into a nonowning Kokkos::View first before copying.
6731  typedef typename exports_type::t_host::execution_space HES;
6732  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6733  Kokkos::View<const char*, host_device_type>
6734  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6735  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6736  }
6737 
6738  if (debug) {
6739  int gblBad = 0; // output argument; to be set below
6740  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6741  if (gblBad != 0) {
6742  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6743  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6744  (true, std::logic_error, "packNew() or pack() threw an exception on "
6745  "one or more participating processes.");
6746  }
6747  }
6748  else {
6749  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6750  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6751  "or more participating processes. Here is this process' error "
6752  "message: " << msg.str ());
6753  }
6754 
6755  if (verbose) {
6756  std::ostringstream os;
6757  os << *prefix << "packAndPrepare: Done!" << endl
6758  << *prefix << " "
6759  << dualViewStatusToString (exportLIDs, "exportLIDs")
6760  << endl
6761  << *prefix << " "
6762  << dualViewStatusToString (exports, "exports")
6763  << endl
6764  << *prefix << " "
6765  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6766  << endl;
6767  std::cerr << os.str ();
6768  }
6769  }
6770 
6771  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6772  size_t
6773  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6774  packRow (char exports[],
6775  const size_t offset,
6776  const size_t numEnt,
6777  const GlobalOrdinal gidsIn[],
6778  const impl_scalar_type valsIn[],
6779  const size_t numBytesPerValue) const
6780  {
6781  using Kokkos::View;
6782  using Kokkos::subview;
6784  typedef LocalOrdinal LO;
6785  typedef GlobalOrdinal GO;
6786  typedef impl_scalar_type ST;
6787 
6788  if (numEnt == 0) {
6789  // Empty rows always take zero bytes, to ensure sparsity.
6790  return 0;
6791  }
6792 
6793  const GO gid = 0; // packValueCount wants this
6794  const LO numEntLO = static_cast<size_t> (numEnt);
6795 
6796  const size_t numEntBeg = offset;
6797  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6798  const size_t gidsBeg = numEntBeg + numEntLen;
6799  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6800  const size_t valsBeg = gidsBeg + gidsLen;
6801  const size_t valsLen = numEnt * numBytesPerValue;
6802 
6803  char* const numEntOut = exports + numEntBeg;
6804  char* const gidsOut = exports + gidsBeg;
6805  char* const valsOut = exports + valsBeg;
6806 
6807  size_t numBytesOut = 0;
6808  int errorCode = 0;
6809  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6810 
6811  {
6812  Kokkos::pair<int, size_t> p;
6813  p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6814  errorCode += p.first;
6815  numBytesOut += p.second;
6816 
6817  p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6818  errorCode += p.first;
6819  numBytesOut += p.second;
6820  }
6821 
6822  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6823  TEUCHOS_TEST_FOR_EXCEPTION
6824  (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6825  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6826  << expectedNumBytes << ".");
6827  TEUCHOS_TEST_FOR_EXCEPTION
6828  (errorCode != 0, std::runtime_error, "packRow: "
6829  "PackTraits::packArray returned a nonzero error code");
6830 
6831  return numBytesOut;
6832  }
6833 
6834  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6835  size_t
6836  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6837  unpackRow (GlobalOrdinal gidsOut[],
6838  impl_scalar_type valsOut[],
6839  const char imports[],
6840  const size_t offset,
6841  const size_t numBytes,
6842  const size_t numEnt,
6843  const size_t numBytesPerValue)
6844  {
6845  using Kokkos::View;
6846  using Kokkos::subview;
6848  typedef LocalOrdinal LO;
6849  typedef GlobalOrdinal GO;
6850  typedef impl_scalar_type ST;
6851 
6852  Details::ProfilingRegion region_upack_row(
6853  "Tpetra::CrsMatrix::unpackRow",
6854  "Import/Export"
6855  );
6856 
6857  if (numBytes == 0) {
6858  // Rows with zero bytes should always have zero entries.
6859  if (numEnt != 0) {
6860  const int myRank = this->getMap ()->getComm ()->getRank ();
6861  TEUCHOS_TEST_FOR_EXCEPTION
6862  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6863  "unpackRow: The number of bytes to unpack numBytes=0, but the "
6864  "number of entries to unpack (as reported by numPacketsPerLID) "
6865  "for this row numEnt=" << numEnt << " != 0.");
6866  }
6867  return 0;
6868  }
6869 
6870  if (numEnt == 0 && numBytes != 0) {
6871  const int myRank = this->getMap ()->getComm ()->getRank ();
6872  TEUCHOS_TEST_FOR_EXCEPTION
6873  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6874  "unpackRow: The number of entries to unpack (as reported by "
6875  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6876  "numBytes=" << numBytes << " != 0.");
6877  }
6878 
6879  const GO gid = 0; // packValueCount wants this
6880  const LO lid = 0; // packValueCount wants this
6881 
6882  const size_t numEntBeg = offset;
6883  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6884  const size_t gidsBeg = numEntBeg + numEntLen;
6885  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6886  const size_t valsBeg = gidsBeg + gidsLen;
6887  const size_t valsLen = numEnt * numBytesPerValue;
6888 
6889  const char* const numEntIn = imports + numEntBeg;
6890  const char* const gidsIn = imports + gidsBeg;
6891  const char* const valsIn = imports + valsBeg;
6892 
6893  size_t numBytesOut = 0;
6894  int errorCode = 0;
6895  LO numEntOut;
6896  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6897  if (static_cast<size_t> (numEntOut) != numEnt ||
6898  numEntOut == static_cast<LO> (0)) {
6899  const int myRank = this->getMap ()->getComm ()->getRank ();
6900  std::ostringstream os;
6901  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6902  bool firstErrorCondition = false;
6903  if (static_cast<size_t> (numEntOut) != numEnt) {
6904  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6905  << " does not equal number of entries unpacked from imports "
6906  "buffer numEntOut=" << numEntOut << ".";
6907  firstErrorCondition = true;
6908  }
6909  if (numEntOut == static_cast<LO> (0)) {
6910  if (firstErrorCondition) {
6911  os << " Also, ";
6912  }
6913  os << "Number of entries unpacked from imports buffer numEntOut=0, "
6914  "but number of bytes to unpack for this row numBytes=" << numBytes
6915  << " != 0. This should never happen, since packRow should only "
6916  "ever pack rows with a nonzero number of entries. In this case, "
6917  "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6918  << ".";
6919  }
6920  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6921  }
6922 
6923  {
6924  Kokkos::pair<int, size_t> p;
6925  p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6926  errorCode += p.first;
6927  numBytesOut += p.second;
6928 
6929  p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6930  errorCode += p.first;
6931  numBytesOut += p.second;
6932  }
6933 
6934  TEUCHOS_TEST_FOR_EXCEPTION
6935  (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6936  << numBytesOut << " != numBytes = " << numBytes << ".");
6937 
6938  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6939  TEUCHOS_TEST_FOR_EXCEPTION
6940  (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6941  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6942  << expectedNumBytes << ".");
6943 
6944  TEUCHOS_TEST_FOR_EXCEPTION
6945  (errorCode != 0, std::runtime_error, "unpackRow: "
6946  "PackTraits::unpackArray returned a nonzero error code");
6947 
6948  return numBytesOut;
6949  }
6950 
6951  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6952  void
6953  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6954  allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6955  size_t& totalNumEntries,
6956  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6957  {
6958  using Details::Behavior;
6960  using std::endl;
6961  typedef impl_scalar_type IST;
6962  typedef LocalOrdinal LO;
6963  typedef GlobalOrdinal GO;
6964  //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6965 
6966  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6967  // output to std::cerr on every MPI process. This is unwise for
6968  // runs with large numbers of MPI processes.
6969  const bool verbose = Behavior::verbose("CrsMatrix");
6970  std::unique_ptr<std::string> prefix;
6971  if (verbose) {
6972  prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6973  std::ostringstream os;
6974  os << *prefix << "Before:"
6975  << endl
6976  << *prefix << " "
6977  << dualViewStatusToString (exports, "exports")
6978  << endl
6979  << *prefix << " "
6980  << dualViewStatusToString (exportLIDs, "exportLIDs")
6981  << endl;
6982  std::cerr << os.str ();
6983  }
6984 
6985  // The number of export LIDs must fit in LocalOrdinal, assuming
6986  // that the LIDs are distinct and valid on the calling process.
6987  const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6988 
6989  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6990  auto exportLIDs_h = exportLIDs.view_host ();
6991 
6992  // Count the total number of matrix entries to send.
6993  totalNumEntries = 0;
6994  for (LO i = 0; i < numExportLIDs; ++i) {
6995  const LO lclRow = exportLIDs_h[i];
6996  size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6997  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6998  // indices as an error. Just consider them nonowned for now.
6999  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
7000  curNumEntries = 0;
7001  }
7002  totalNumEntries += curNumEntries;
7003  }
7004 
7005  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
7006  // if sizeof(IST) is a meaningful representation of the amount of
7007  // data in a Scalar instance. (LO and GO are always built-in
7008  // integer types.)
7009  //
7010  // Allocate the exports array. It does NOT need padding for
7011  // alignment, since we use memcpy to write to / read from send /
7012  // receive buffers.
7013  const size_t allocSize =
7014  static_cast<size_t> (numExportLIDs) * sizeof (LO) +
7015  totalNumEntries * (sizeof (IST) + sizeof (GO));
7016  if (static_cast<size_t> (exports.extent (0)) < allocSize) {
7017  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
7018 
7019  const std::string oldLabel = exports.d_view.label ();
7020  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
7021  exports = exports_type (newLabel, allocSize);
7022  }
7023 
7024  if (verbose) {
7025  std::ostringstream os;
7026  os << *prefix << "After:"
7027  << endl
7028  << *prefix << " "
7029  << dualViewStatusToString (exports, "exports")
7030  << endl
7031  << *prefix << " "
7032  << dualViewStatusToString (exportLIDs, "exportLIDs")
7033  << endl;
7034  std::cerr << os.str ();
7035  }
7036  }
7037 
7038  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7039  void
7041  packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
7042  Kokkos::DualView<char*, buffer_device_type>& exports,
7043  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
7044  size_t& constantNumPackets) const
7045  {
7046  // The call to packNew in packAndPrepare catches and handles any exceptions.
7047  Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
7048  if (this->isStaticGraph ()) {
7050  packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
7051  constantNumPackets);
7052  }
7053  else {
7054  this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
7055  constantNumPackets);
7056  }
7057  }
7058 
7059  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7060  void
7062  packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
7063  Kokkos::DualView<char*, buffer_device_type>& exports,
7064  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
7065  size_t& constantNumPackets) const
7066  {
7067  using Details::Behavior;
7069  using Details::PackTraits;
7071  using Kokkos::View;
7072  using std::endl;
7073  using LO = LocalOrdinal;
7074  using GO = GlobalOrdinal;
7075  using ST = impl_scalar_type;
7076  const char tfecfFuncName[] = "packNonStaticNew: ";
7077 
7078  const bool verbose = Behavior::verbose("CrsMatrix");
7079  std::unique_ptr<std::string> prefix;
7080  if (verbose) {
7081  prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
7082  std::ostringstream os;
7083  os << *prefix << "Start" << endl;
7084  std::cerr << os.str ();
7085  }
7086 
7087  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
7088  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7089  (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
7090  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
7091  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
7092  << ".");
7093 
7094  // Setting this to zero tells the caller to expect a possibly
7095  // different ("nonconstant") number of packets per local index
7096  // (i.e., a possibly different number of entries per row).
7097  constantNumPackets = 0;
7098 
7099  // The pack buffer 'exports' enters this method possibly
7100  // unallocated. Do the first two parts of "Count, allocate, fill,
7101  // compute."
7102  size_t totalNumEntries = 0;
7103  this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
7104  const size_t bufSize = static_cast<size_t> (exports.extent (0));
7105 
7106  // Write-only host access
7107  exports.clear_sync_state();
7108  exports.modify_host();
7109  auto exports_h = exports.view_host ();
7110  if (verbose) {
7111  std::ostringstream os;
7112  os << *prefix << "After marking exports as modified on host, "
7113  << dualViewStatusToString (exports, "exports") << endl;
7114  std::cerr << os.str ();
7115  }
7116 
7117  // Read-only host access
7118  auto exportLIDs_h = exportLIDs.view_host ();
7119 
7120  // Write-only host access
7121  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
7122  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
7123  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
7124 
7125  // Compute the number of "packets" (in this case, bytes) per
7126  // export LID (in this case, local index of the row to send), and
7127  // actually pack the data.
7128  auto maxRowNumEnt = this->getNodeMaxNumRowEntries();
7129 
7130 
7131  // Temporary buffer for global column indices.
7132  typename global_inds_host_view_type::non_const_type gidsIn_k;
7133  if (this->isLocallyIndexed()) { // Need storage for Global IDs
7134  gidsIn_k =
7135  typename global_inds_host_view_type::non_const_type("packGids",
7136  maxRowNumEnt);
7137  }
7138 
7139  size_t offset = 0; // current index into 'exports' array.
7140  for (size_t i = 0; i < numExportLIDs; ++i) {
7141  const LO lclRow = exportLIDs_h[i];
7142 
7143  size_t numBytes;
7144  size_t numEnt;
7145  numEnt = this->getNumEntriesInLocalRow (lclRow);
7146 
7147  // Only pack this row's data if it has a nonzero number of
7148  // entries. We can do this because receiving processes get the
7149  // number of packets, and will know that zero packets means zero
7150  // entries.
7151  if (numEnt == 0) {
7152  numPacketsPerLID_h[i] = 0;
7153  continue;
7154  }
7155 
7156  if (this->isLocallyIndexed ()) {
7157  typename global_inds_host_view_type::non_const_type gidsIn;
7158  values_host_view_type valsIn;
7159  // If the matrix is locally indexed on the calling process, we
7160  // have to use its column Map (which it _must_ have in this
7161  // case) to convert to global indices.
7162  local_inds_host_view_type lidsIn;
7163  this->getLocalRowView (lclRow, lidsIn, valsIn);
7164  const map_type& colMap = * (this->getColMap ());
7165  for (size_t k = 0; k < numEnt; ++k) {
7166  gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
7167  }
7168  gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
7169 
7170  const size_t numBytesPerValue =
7171  PackTraits<ST>::packValueCount (valsIn[0]);
7172  numBytes = this->packRow (exports_h.data (), offset, numEnt,
7173  gidsIn.data (), valsIn.data (),
7174  numBytesPerValue);
7175  }
7176  else if (this->isGloballyIndexed ()) {
7177  global_inds_host_view_type gidsIn;
7178  values_host_view_type valsIn;
7179  // If the matrix is globally indexed on the calling process,
7180  // then we can use the column indices directly. However, we
7181  // have to get the global row index. The calling process must
7182  // have a row Map, since otherwise it shouldn't be participating
7183  // in packing operations.
7184  const map_type& rowMap = * (this->getRowMap ());
7185  const GO gblRow = rowMap.getGlobalElement (lclRow);
7186  this->getGlobalRowView (gblRow, gidsIn, valsIn);
7187 
7188  const size_t numBytesPerValue =
7189  PackTraits<ST>::packValueCount (valsIn[0]);
7190  numBytes = this->packRow (exports_h.data (), offset, numEnt,
7191  gidsIn.data (), valsIn.data (),
7192  numBytesPerValue);
7193  }
7194  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
7195  // nor locally indexed, then it has no entries. Therefore,
7196  // there is nothing to pack. No worries!
7197 
7198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7199  (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
7200  "First invalid offset into 'exports' pack buffer at index i = " << i
7201  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
7202  bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
7203  ".");
7204  // numPacketsPerLID_h[i] is the number of "packets" in the
7205  // current local row i. Packet=char (really "byte") so use the
7206  // number of bytes of the packed data for that row.
7207  numPacketsPerLID_h[i] = numBytes;
7208  offset += numBytes;
7209  }
7210 
7211  if (verbose) {
7212  std::ostringstream os;
7213  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
7214  << *prefix << " "
7215  << dualViewStatusToString (exports, "exports")
7216  << endl
7217  << *prefix << " "
7218  << dualViewStatusToString (exportLIDs, "exportLIDs")
7219  << endl;
7220  std::cerr << os.str ();
7221  }
7222  }
7223 
7224  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7225  LocalOrdinal
7226  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7227  combineGlobalValuesRaw(const LocalOrdinal lclRow,
7228  const LocalOrdinal numEnt,
7229  const impl_scalar_type vals[],
7230  const GlobalOrdinal cols[],
7231  const Tpetra::CombineMode combMode,
7232  const char* const prefix,
7233  const bool debug,
7234  const bool verbose)
7235  {
7236  using GO = GlobalOrdinal;
7237 
7238  // mfh 23 Mar 2017: This branch is not thread safe in a debug
7239  // build, due to use of Teuchos::ArrayView; see #229.
7240  const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
7241  Teuchos::ArrayView<const GO> cols_av
7242  (numEnt == 0 ? nullptr : cols, numEnt);
7243  Teuchos::ArrayView<const Scalar> vals_av
7244  (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
7245 
7246  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
7247  // combine modes. combineGlobalValues throws on error; it does
7248  // not return an error code. Thus, if it returns, it succeeded.
7249  combineGlobalValues(gblRow, cols_av, vals_av, combMode,
7250  prefix, debug, verbose);
7251  return numEnt;
7252  }
7253 
7254  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7255  void
7256  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7257  combineGlobalValues(
7258  const GlobalOrdinal globalRowIndex,
7259  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
7260  const Teuchos::ArrayView<const Scalar>& values,
7261  const Tpetra::CombineMode combineMode,
7262  const char* const prefix,
7263  const bool debug,
7264  const bool verbose)
7265  {
7266  const char tfecfFuncName[] = "combineGlobalValues: ";
7267 
7268  if (isStaticGraph ()) {
7269  // INSERT doesn't make sense for a static graph, since you
7270  // aren't allowed to change the structure of the graph.
7271  // However, all the other combine modes work.
7272  if (combineMode == ADD) {
7273  sumIntoGlobalValues (globalRowIndex, columnIndices, values);
7274  }
7275  else if (combineMode == REPLACE) {
7276  replaceGlobalValues (globalRowIndex, columnIndices, values);
7277  }
7278  else if (combineMode == ABSMAX) {
7279  using ::Tpetra::Details::AbsMax;
7280  AbsMax<Scalar> f;
7281  this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
7282  columnIndices,
7283  values, f);
7284  }
7285  else if (combineMode == INSERT) {
7286  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7287  (isStaticGraph() && combineMode == INSERT,
7288  std::invalid_argument, "INSERT combine mode is forbidden "
7289  "if the matrix has a static (const) graph (i.e., was "
7290  "constructed with the CrsMatrix constructor that takes a "
7291  "const CrsGraph pointer).");
7292  }
7293  else {
7294  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7295  (true, std::logic_error, "Invalid combine mode; should "
7296  "never get here! "
7297  "Please report this bug to the Tpetra developers.");
7298  }
7299  }
7300  else { // The matrix has a dynamic graph.
7301  if (combineMode == ADD || combineMode == INSERT) {
7302  // For a dynamic graph, all incoming column indices are
7303  // inserted into the target graph. Duplicate indices will
7304  // have their values summed. In this context, ADD and INSERT
7305  // are equivalent. We need to call insertGlobalValues()
7306  // anyway if the column indices don't yet exist in this row,
7307  // so we just call insertGlobalValues() for both cases.
7308  insertGlobalValuesFilteredChecked(globalRowIndex,
7309  columnIndices, values, prefix, debug, verbose);
7310  }
7311  // FIXME (mfh 14 Mar 2012):
7312  //
7313  // Implementing ABSMAX or REPLACE for a dynamic graph would
7314  // require modifying assembly to attach a possibly different
7315  // combine mode to each inserted (i, j, A_ij) entry. For
7316  // example, consider two different Export operations to the same
7317  // target CrsMatrix, the first with ABSMAX combine mode and the
7318  // second with REPLACE. This isn't a common use case, so we
7319  // won't mess with it for now.
7320  else if (combineMode == ABSMAX) {
7321  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7322  ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
7323  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
7324  "implemented.");
7325  }
7326  else if (combineMode == REPLACE) {
7327  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7328  ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
7329  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
7330  "implemented.");
7331  }
7332  else {
7333  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
7334  true, std::logic_error, "Should never get here! Please report this "
7335  "bug to the Tpetra developers.");
7336  }
7337  }
7338  }
7339 
7340  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7341  void
7344  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
7345  Kokkos::DualView<char*, buffer_device_type> imports,
7346  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7347  const size_t constantNumPackets,
7348  const CombineMode combineMode)
7349  {
7350  using Details::Behavior;
7353  using std::endl;
7354  const char tfecfFuncName[] = "unpackAndCombine: ";
7355  ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
7356 
7357  const bool debug = Behavior::debug("CrsMatrix");
7358  const bool verbose = Behavior::verbose("CrsMatrix");
7359  constexpr int numValidModes = 5;
7360  const CombineMode validModes[numValidModes] =
7361  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
7362  const char* validModeNames[numValidModes] =
7363  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
7364 
7365  std::unique_ptr<std::string> prefix;
7366  if (verbose) {
7367  prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
7368  std::ostringstream os;
7369  os << *prefix << "Start:" << endl
7370  << *prefix << " "
7371  << dualViewStatusToString (importLIDs, "importLIDs")
7372  << endl
7373  << *prefix << " "
7374  << dualViewStatusToString (imports, "imports")
7375  << endl
7376  << *prefix << " "
7377  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7378  << endl
7379  << *prefix << " constantNumPackets: " << constantNumPackets
7380  << endl
7381  << *prefix << " combineMode: " << combineModeToString (combineMode)
7382  << endl;
7383  std::cerr << os.str ();
7384  }
7385 
7386  if (debug) {
7387  if (std::find (validModes, validModes+numValidModes, combineMode) ==
7388  validModes+numValidModes) {
7389  std::ostringstream os;
7390  os << "Invalid combine mode. Valid modes are {";
7391  for (int k = 0; k < numValidModes; ++k) {
7392  os << validModeNames[k];
7393  if (k < numValidModes - 1) {
7394  os << ", ";
7395  }
7396  }
7397  os << "}.";
7398  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7399  (true, std::invalid_argument, os.str ());
7400  }
7401  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7402  (importLIDs.extent(0) != numPacketsPerLID.extent(0),
7403  std::invalid_argument, "importLIDs.extent(0)="
7404  << importLIDs.extent(0)
7405  << " != numPacketsPerLID.extent(0)="
7406  << numPacketsPerLID.extent(0) << ".");
7407  }
7408 
7409  if (combineMode == ZERO) {
7410  return; // nothing to do
7411  }
7412 
7413  if (debug) {
7414  using Teuchos::reduceAll;
7415  std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
7416  int lclBad = 0;
7417  try {
7418  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7419  constantNumPackets, combineMode,
7420  verbose);
7421  } catch (std::exception& e) {
7422  lclBad = 1;
7423  *msg << e.what ();
7424  }
7425  int gblBad = 0;
7426  const Teuchos::Comm<int>& comm = * (this->getComm ());
7427  reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
7428  lclBad, Teuchos::outArg (gblBad));
7429  if (gblBad != 0) {
7430  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
7431  // initialized in a debug build. Thus, we get the process
7432  // rank again here. This is an error message, so the small
7433  // run-time cost doesn't matter. See #1887.
7434  std::ostringstream os;
7435  os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
7436  msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
7437  ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
7438  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7439  (true, std::logic_error, std::endl << "unpackAndCombineImpl "
7440  "threw an exception on one or more participating processes: "
7441  << endl << msg->str ());
7442  }
7443  }
7444  else {
7445  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7446  constantNumPackets, combineMode,
7447  verbose);
7448  }
7449 
7450  if (verbose) {
7451  std::ostringstream os;
7452  os << *prefix << "Done!" << endl
7453  << *prefix << " "
7454  << dualViewStatusToString (importLIDs, "importLIDs")
7455  << endl
7456  << *prefix << " "
7457  << dualViewStatusToString (imports, "imports")
7458  << endl
7459  << *prefix << " "
7460  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7461  << endl;
7462  std::cerr << os.str ();
7463  }
7464  }
7465 
7466  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7467  void
7470  const Kokkos::DualView<const local_ordinal_type*,
7471  buffer_device_type>& importLIDs,
7472  Kokkos::DualView<char*, buffer_device_type> imports,
7473  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7474  const size_t constantNumPackets,
7475  const CombineMode combineMode,
7476  const bool verbose)
7477  {
7478  Details::ProfilingRegion region_unpack_and_combine_impl(
7479  "Tpetra::CrsMatrix::unpackAndCombineImpl",
7480  "Import/Export"
7481  );
7482  using std::endl;
7483  const char tfecfFuncName[] = "unpackAndCombineImpl";
7484  std::unique_ptr<std::string> prefix;
7485  if (verbose) {
7486  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7487  std::ostringstream os;
7488  os << *prefix << "isStaticGraph(): "
7489  << (isStaticGraph() ? "true" : "false")
7490  << ", importLIDs.extent(0): "
7491  << importLIDs.extent(0)
7492  << ", imports.extent(0): "
7493  << imports.extent(0)
7494  << ", numPacketsPerLID.extent(0): "
7495  << numPacketsPerLID.extent(0)
7496  << endl;
7497  std::cerr << os.str();
7498  }
7499 
7500  if (isStaticGraph ()) {
7501  using Details::unpackCrsMatrixAndCombineNew;
7502  unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7503  importLIDs, constantNumPackets,
7504  combineMode);
7505  }
7506  else {
7507  {
7508  using padding_type = typename crs_graph_type::padding_type;
7509  std::unique_ptr<padding_type> padding;
7510  try {
7511  padding = myGraph_->computePaddingForCrsMatrixUnpack(
7512  importLIDs, imports, numPacketsPerLID, verbose);
7513  }
7514  catch (std::exception& e) {
7515  const auto rowMap = getRowMap();
7516  const auto comm = rowMap.is_null() ? Teuchos::null :
7517  rowMap->getComm();
7518  const int myRank = comm.is_null() ? -1 : comm->getRank();
7519  TEUCHOS_TEST_FOR_EXCEPTION
7520  (true, std::runtime_error, "Proc " << myRank << ": "
7521  "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7522  "threw an exception: " << e.what());
7523  }
7524  if (verbose) {
7525  std::ostringstream os;
7526  os << *prefix << "Call applyCrsPadding" << endl;
7527  std::cerr << os.str();
7528  }
7529  applyCrsPadding(*padding, verbose);
7530  }
7531  if (verbose) {
7532  std::ostringstream os;
7533  os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7534  std::cerr << os.str();
7535  }
7536  unpackAndCombineImplNonStatic(importLIDs, imports,
7537  numPacketsPerLID,
7538  constantNumPackets,
7539  combineMode);
7540  }
7541 
7542  if (verbose) {
7543  std::ostringstream os;
7544  os << *prefix << "Done" << endl;
7545  std::cerr << os.str();
7546  }
7547  }
7548 
7549  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7550  void
7551  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7552  unpackAndCombineImplNonStatic(
7553  const Kokkos::DualView<const local_ordinal_type*,
7554  buffer_device_type>& importLIDs,
7555  Kokkos::DualView<char*, buffer_device_type> imports,
7556  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7557  const size_t constantNumPackets,
7558  const CombineMode combineMode)
7559  {
7560  using Kokkos::View;
7561  using Kokkos::subview;
7562  using Kokkos::MemoryUnmanaged;
7563  using Details::Behavior;
7566  using Details::PackTraits;
7567  using Details::ScalarViewTraits;
7568  using std::endl;
7569  using LO = LocalOrdinal;
7570  using GO = GlobalOrdinal;
7571  using ST = impl_scalar_type;
7572  using size_type = typename Teuchos::ArrayView<LO>::size_type;
7573  using HES =
7574  typename View<int*, device_type>::HostMirror::execution_space;
7575  using pair_type = std::pair<typename View<int*, HES>::size_type,
7576  typename View<int*, HES>::size_type>;
7577  using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7578  using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
7579  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7580 
7581  const bool debug = Behavior::debug("CrsMatrix");
7582  const bool verbose = Behavior::verbose("CrsMatrix");
7583  std::unique_ptr<std::string> prefix;
7584  if (verbose) {
7585  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7586  std::ostringstream os;
7587  os << *prefix << endl; // we've already printed DualViews' statuses
7588  std::cerr << os.str ();
7589  }
7590  const char* const prefix_raw =
7591  verbose ? prefix.get()->c_str() : nullptr;
7592 
7593  const size_type numImportLIDs = importLIDs.extent (0);
7594  if (combineMode == ZERO || numImportLIDs == 0) {
7595  return; // nothing to do; no need to combine entries
7596  }
7597 
7598  Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7599  "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7600  "Import/Export"
7601  );
7602 
7603  // We're unpacking on host. This is read-only host access.
7604  if (imports.need_sync_host()) {
7605  imports.sync_host ();
7606  }
7607  auto imports_h = imports.view_host();
7608 
7609  // Read-only host access.
7610  if (numPacketsPerLID.need_sync_host()) {
7611  numPacketsPerLID.sync_host ();
7612  }
7613  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7614 
7615  TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7616  auto importLIDs_h = importLIDs.view_host();
7617 
7618  size_t numBytesPerValue;
7619  {
7620  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7621  // with run-time size? We already assume that all entries in both the
7622  // source and target matrices have the same size. If the calling process
7623  // owns at least one entry in either matrix, we can use that entry to set
7624  // the size. However, it is possible that the calling process owns no
7625  // entries. In that case, we're in trouble. One way to fix this would be
7626  // for each row's data to contain the run-time size. This is only
7627  // necessary if the size is not a compile-time constant.
7628  Scalar val;
7629  numBytesPerValue = PackTraits<ST>::packValueCount (val);
7630  }
7631 
7632  // Determine the maximum number of entries in any one row
7633  size_t offset = 0;
7634  size_t maxRowNumEnt = 0;
7635  for (size_type i = 0; i < numImportLIDs; ++i) {
7636  const size_t numBytes = numPacketsPerLID_h[i];
7637  if (numBytes == 0) {
7638  continue; // empty buffer for that row means that the row is empty
7639  }
7640  // We need to unpack a nonzero number of entries for this row.
7641  if (debug) {
7642  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7643  (offset + numBytes > size_t(imports_h.extent (0)),
7644  std::logic_error, ": At local row index importLIDs_h[i="
7645  << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7646  << ") + numBytes (=" << numBytes << ") > "
7647  "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7648  }
7649  LO numEntLO = 0;
7650 
7651  if (debug) {
7652  const size_t theNumBytes =
7653  PackTraits<LO>::packValueCount (numEntLO);
7654  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7655  (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7656  << theNumBytes << " > numBytes = " << numBytes << ".");
7657  }
7658  const char* const inBuf = imports_h.data () + offset;
7659  const size_t actualNumBytes =
7660  PackTraits<LO>::unpackValue (numEntLO, inBuf);
7661 
7662  if (debug) {
7663  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7664  (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7665  << ", actualNumBytes=" << actualNumBytes
7666  << " > numBytes=" << numBytes << ".");
7667  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7668  (numEntLO == 0, std::logic_error, ": At local row index "
7669  "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7670  "the number of entries read from the packed data is "
7671  "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7672  << " != 0.");
7673  }
7674 
7675  maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7676  offset += numBytes;
7677  }
7678 
7679  // Temporary space to cache incoming global column indices and
7680  // values. Column indices come in as global indices, in case the
7681  // source object's column Map differs from the target object's
7682  // (this's) column Map.
7683  View<GO*, HES> gblColInds;
7684  View<LO*, HES> lclColInds;
7685  View<ST*, HES> vals;
7686  {
7687  GO gid = 0;
7688  LO lid = 0;
7689  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7690  // with run-time size? We already assume that all entries in both the
7691  // source and target matrices have the same size. If the calling process
7692  // owns at least one entry in either matrix, we can use that entry to set
7693  // the size. However, it is possible that the calling process owns no
7694  // entries. In that case, we're in trouble. One way to fix this would be
7695  // for each row's data to contain the run-time size. This is only
7696  // necessary if the size is not a compile-time constant.
7697  Scalar val;
7698  gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7699  gid, maxRowNumEnt, "gids");
7700  lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7701  lid, maxRowNumEnt, "lids");
7702  vals = ScalarViewTraits<ST, HES>::allocateArray(
7703  val, maxRowNumEnt, "vals");
7704  }
7705 
7706  offset = 0;
7707  for (size_type i = 0; i < numImportLIDs; ++i) {
7708  const size_t numBytes = numPacketsPerLID_h[i];
7709  if (numBytes == 0) {
7710  continue; // empty buffer for that row means that the row is empty
7711  }
7712  LO numEntLO = 0;
7713  const char* const inBuf = imports_h.data () + offset;
7714  (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7715 
7716  const size_t numEnt = static_cast<size_t>(numEntLO);;
7717  const LO lclRow = importLIDs_h[i];
7718 
7719  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7720  vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7721 
7722  const size_t numBytesOut =
7723  unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7724  offset, numBytes, numEnt, numBytesPerValue);
7725  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7726  (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7727  << ", numBytes=" << numBytes << " != numBytesOut="
7728  << numBytesOut << ".");
7729 
7730  const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7731  const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7732  combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7733  combineMode, prefix_raw, debug, verbose);
7734  // Don't update offset until current LID has succeeded.
7735  offset += numBytes;
7736  } // for each import LID i
7737 
7738  if (verbose) {
7739  std::ostringstream os;
7740  os << *prefix << "Done" << endl;
7741  std::cerr << os.str();
7742  }
7743  }
7744 
7745  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7746  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7748  getColumnMapMultiVector (const MV& X_domainMap,
7749  const bool force) const
7750  {
7751  using Teuchos::null;
7752  using Teuchos::RCP;
7753  using Teuchos::rcp;
7754 
7755  TEUCHOS_TEST_FOR_EXCEPTION(
7756  ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7757  "MapMultiVector: You may only call this method if the matrix has a "
7758  "column Map. If the matrix does not yet have a column Map, you should "
7759  "first call fillComplete (with domain and range Map if necessary).");
7760 
7761  // If the graph is not fill complete, then the Import object (if
7762  // one should exist) hasn't been constructed yet.
7763  TEUCHOS_TEST_FOR_EXCEPTION(
7764  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7765  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7766  "this matrix's graph is fill complete.");
7767 
7768  const size_t numVecs = X_domainMap.getNumVectors ();
7769  RCP<const import_type> importer = this->getGraph ()->getImporter ();
7770  RCP<const map_type> colMap = this->getColMap ();
7771 
7772  RCP<MV> X_colMap; // null by default
7773 
7774  // If the Import object is trivial (null), then we don't need a
7775  // separate column Map multivector. Just return null in that
7776  // case. The caller is responsible for knowing not to use the
7777  // returned null pointer.
7778  //
7779  // If the Import is nontrivial, then we do need a separate
7780  // column Map multivector for the Import operation. Check in
7781  // that case if we have to (re)create the column Map
7782  // multivector.
7783  if (! importer.is_null () || force) {
7784  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7785  X_colMap = rcp (new MV (colMap, numVecs));
7786 
7787  // Cache the newly created multivector for later reuse.
7788  importMV_ = X_colMap;
7789  }
7790  else { // Yay, we can reuse the cached multivector!
7791  X_colMap = importMV_;
7792  // mfh 09 Jan 2013: We don't have to fill with zeros first,
7793  // because the Import uses INSERT combine mode, which overwrites
7794  // existing entries.
7795  //
7796  //X_colMap->putScalar (ZERO);
7797  }
7798  }
7799  return X_colMap;
7800  }
7801 
7802  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7803  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7806  const bool force) const
7807  {
7808  using Teuchos::null;
7809  using Teuchos::RCP;
7810  using Teuchos::rcp;
7811 
7812  // If the graph is not fill complete, then the Export object (if
7813  // one should exist) hasn't been constructed yet.
7814  TEUCHOS_TEST_FOR_EXCEPTION(
7815  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7816  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7817  "matrix's graph is fill complete.");
7818 
7819  const size_t numVecs = Y_rangeMap.getNumVectors ();
7820  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7821  // Every version of the constructor takes either a row Map, or a
7822  // graph (all of whose constructors take a row Map). Thus, the
7823  // matrix always has a row Map.
7824  RCP<const map_type> rowMap = this->getRowMap ();
7825 
7826  RCP<MV> Y_rowMap; // null by default
7827 
7828  // If the Export object is trivial (null), then we don't need a
7829  // separate row Map multivector. Just return null in that case.
7830  // The caller is responsible for knowing not to use the returned
7831  // null pointer.
7832  //
7833  // If the Export is nontrivial, then we do need a separate row
7834  // Map multivector for the Export operation. Check in that case
7835  // if we have to (re)create the row Map multivector.
7836  if (! exporter.is_null () || force) {
7837  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7838  Y_rowMap = rcp (new MV (rowMap, numVecs));
7839  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7840  }
7841  else { // Yay, we can reuse the cached multivector!
7842  Y_rowMap = exportMV_;
7843  }
7844  }
7845  return Y_rowMap;
7846  }
7847 
7848  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7849  void
7851  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7852  {
7853  TEUCHOS_TEST_FOR_EXCEPTION(
7854  myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7855  "removeEmptyProcessesInPlace: This method does not work when the matrix "
7856  "was created with a constant graph (that is, when it was created using "
7857  "the version of its constructor that takes an RCP<const CrsGraph>). "
7858  "This is because the matrix is not allowed to modify the graph in that "
7859  "case, but removing empty processes requires modifying the graph.");
7860  myGraph_->removeEmptyProcessesInPlace (newMap);
7861  // Even though CrsMatrix's row Map (as returned by getRowMap())
7862  // comes from its CrsGraph, CrsMatrix still implements DistObject,
7863  // so we also have to change the DistObject's Map.
7864  this->map_ = this->getRowMap ();
7865  // In the nonconst graph case, staticGraph_ is just a const
7866  // pointer to myGraph_. This assignment is probably redundant,
7867  // but it doesn't hurt.
7868  staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7869  }
7870 
7871  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7872  Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7874  add (const Scalar& alpha,
7876  const Scalar& beta,
7877  const Teuchos::RCP<const map_type>& domainMap,
7878  const Teuchos::RCP<const map_type>& rangeMap,
7879  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7880  {
7881  using Teuchos::Array;
7882  using Teuchos::ArrayView;
7883  using Teuchos::ParameterList;
7884  using Teuchos::RCP;
7885  using Teuchos::rcp;
7886  using Teuchos::rcp_implicit_cast;
7887  using Teuchos::sublist;
7888  using std::endl;
7889  using LO = local_ordinal_type;
7890  using GO = global_ordinal_type;
7891  using crs_matrix_type =
7893  const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7894 
7895  const bool debug = Details::Behavior::debug("CrsMatrix");
7896  const bool verbose = Details::Behavior::verbose("CrsMatrix");
7897  std::unique_ptr<std::string> prefix;
7898  if (verbose) {
7899  prefix = this->createPrefix("CrsMatrix", "add");
7900  std::ostringstream os;
7901  os << *prefix << "Start" << endl;
7902  std::cerr << os.str ();
7903  }
7904 
7905  const crs_matrix_type& B = *this; // a convenient abbreviation
7906  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7907  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7908 
7909  // If the user didn't supply a domain or range Map, then try to
7910  // get one from B first (if it has them), then from A (if it has
7911  // them). If we don't have any domain or range Maps, scold the
7912  // user.
7913  RCP<const map_type> A_domainMap = A.getDomainMap ();
7914  RCP<const map_type> A_rangeMap = A.getRangeMap ();
7915  RCP<const map_type> B_domainMap = B.getDomainMap ();
7916  RCP<const map_type> B_rangeMap = B.getRangeMap ();
7917 
7918  RCP<const map_type> theDomainMap = domainMap;
7919  RCP<const map_type> theRangeMap = rangeMap;
7920 
7921  if (domainMap.is_null ()) {
7922  if (B_domainMap.is_null ()) {
7923  TEUCHOS_TEST_FOR_EXCEPTION(
7924  A_domainMap.is_null (), std::invalid_argument,
7925  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7926  "then you must supply a nonnull domain Map to this method.");
7927  theDomainMap = A_domainMap;
7928  } else {
7929  theDomainMap = B_domainMap;
7930  }
7931  }
7932  if (rangeMap.is_null ()) {
7933  if (B_rangeMap.is_null ()) {
7934  TEUCHOS_TEST_FOR_EXCEPTION(
7935  A_rangeMap.is_null (), std::invalid_argument,
7936  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7937  "then you must supply a nonnull range Map to this method.");
7938  theRangeMap = A_rangeMap;
7939  } else {
7940  theRangeMap = B_rangeMap;
7941  }
7942  }
7943 
7944  if (debug) {
7945  // In debug mode, check that A and B have matching domain and
7946  // range Maps, if they have domain and range Maps at all. (If
7947  // they aren't fill complete, then they may not yet have them.)
7948  if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7949  if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7950  TEUCHOS_TEST_FOR_EXCEPTION
7951  (! B_domainMap->isSameAs(*A_domainMap),
7952  std::invalid_argument,
7953  errPfx << "The input RowMatrix A must have a domain Map "
7954  "which is the same as (isSameAs) this RowMatrix's "
7955  "domain Map.");
7956  TEUCHOS_TEST_FOR_EXCEPTION
7957  (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7958  errPfx << "The input RowMatrix A must have a range Map "
7959  "which is the same as (isSameAs) this RowMatrix's range "
7960  "Map.");
7961  TEUCHOS_TEST_FOR_EXCEPTION
7962  (! domainMap.is_null() &&
7963  ! domainMap->isSameAs(*B_domainMap),
7964  std::invalid_argument,
7965  errPfx << "The input domain Map must be the same as "
7966  "(isSameAs) this RowMatrix's domain Map.");
7967  TEUCHOS_TEST_FOR_EXCEPTION
7968  (! rangeMap.is_null() &&
7969  ! rangeMap->isSameAs(*B_rangeMap),
7970  std::invalid_argument,
7971  errPfx << "The input range Map must be the same as "
7972  "(isSameAs) this RowMatrix's range Map.");
7973  }
7974  }
7975  else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7976  TEUCHOS_TEST_FOR_EXCEPTION
7977  (! domainMap.is_null() &&
7978  ! domainMap->isSameAs(*B_domainMap),
7979  std::invalid_argument,
7980  errPfx << "The input domain Map must be the same as "
7981  "(isSameAs) this RowMatrix's domain Map.");
7982  TEUCHOS_TEST_FOR_EXCEPTION
7983  (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7984  std::invalid_argument,
7985  errPfx << "The input range Map must be the same as "
7986  "(isSameAs) this RowMatrix's range Map.");
7987  }
7988  else {
7989  TEUCHOS_TEST_FOR_EXCEPTION
7990  (domainMap.is_null() || rangeMap.is_null(),
7991  std::invalid_argument, errPfx << "If neither A nor B "
7992  "have a domain and range Map, then you must supply a "
7993  "nonnull domain and range Map to this method.");
7994  }
7995  }
7996 
7997  // What parameters do we pass to C's constructor? Do we call
7998  // fillComplete on C after filling it? And if so, what parameters
7999  // do we pass to C's fillComplete call?
8000  bool callFillComplete = true;
8001  RCP<ParameterList> constructorSublist;
8002  RCP<ParameterList> fillCompleteSublist;
8003  if (! params.is_null()) {
8004  callFillComplete =
8005  params->get("Call fillComplete", callFillComplete);
8006  constructorSublist = sublist(params, "Constructor parameters");
8007  fillCompleteSublist = sublist(params, "fillComplete parameters");
8008  }
8009 
8010  RCP<const map_type> A_rowMap = A.getRowMap ();
8011  RCP<const map_type> B_rowMap = B.getRowMap ();
8012  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
8013  RCP<crs_matrix_type> C; // The result matrix.
8014 
8015  // If A and B's row Maps are the same, we can compute an upper
8016  // bound on the number of entries in each row of C, before
8017  // actually computing the sum. A reasonable upper bound is the
8018  // sum of the two entry counts in each row. If we choose this as
8019  // the actual per-row upper bound, we can use static profile.
8020  if (A_rowMap->isSameAs (*B_rowMap)) {
8021  const LO localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
8022  Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
8023 
8024  // Get the number of entries in each row of A.
8025  if (alpha != ZERO) {
8026  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
8027  const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
8028  C_maxNumEntriesPerRow[localRow] += A_numEntries;
8029  }
8030  }
8031  // Get the number of entries in each row of B.
8032  if (beta != ZERO) {
8033  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
8034  const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
8035  C_maxNumEntriesPerRow[localRow] += B_numEntries;
8036  }
8037  }
8038  // Construct the result matrix C.
8039  if (constructorSublist.is_null ()) {
8040  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
8041  StaticProfile));
8042  } else {
8043  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
8044  StaticProfile, constructorSublist));
8045  }
8046  // Since A and B have the same row Maps, we could add them
8047  // together all at once and merge values before we call
8048  // insertGlobalValues. However, we don't really need to, since
8049  // we've already allocated enough space in each row of C for C
8050  // to do the merge itself.
8051  }
8052  else { // the row Maps of A and B are not the same
8053  // Construct the result matrix C.
8054  // true: !A_rowMap->isSameAs (*B_rowMap)
8055  TEUCHOS_TEST_FOR_EXCEPTION
8056  (true, std::invalid_argument, errPfx << "The row maps must "
8057  "be the same for statically allocated matrices, to ensure "
8058  "that there is sufficient space to do the addition.");
8059  }
8060 
8061  TEUCHOS_TEST_FOR_EXCEPTION
8062  (C.is_null (), std::logic_error,
8063  errPfx << "C should not be null at this point. "
8064  "Please report this bug to the Tpetra developers.");
8065 
8066  if (verbose) {
8067  std::ostringstream os;
8068  os << *prefix << "Compute C = alpha*A + beta*B" << endl;
8069  std::cerr << os.str ();
8070  }
8071  using gids_type = nonconst_global_inds_host_view_type;
8072  using vals_type = nonconst_values_host_view_type;
8073  gids_type ind;
8074  vals_type val;
8075 
8076  if (alpha != ZERO) {
8077  const LO A_localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
8078  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
8079  size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
8080  const GO globalRow = A_rowMap->getGlobalElement (localRow);
8081  if (A_numEntries > static_cast<size_t> (ind.size ())) {
8082  Kokkos::resize(ind,A_numEntries);
8083  Kokkos::resize(val,A_numEntries);
8084  }
8085  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
8086  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
8087  A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
8088 
8089  if (alpha != ONE) {
8090  for (size_t k = 0; k < A_numEntries; ++k) {
8091  valView[k] *= alpha;
8092  }
8093  }
8094  C->insertGlobalValues (globalRow, A_numEntries,
8095  reinterpret_cast<Scalar *>(valView.data()),
8096  indView.data());
8097  }
8098  }
8099 
8100  if (beta != ZERO) {
8101  const LO B_localNumRows = static_cast<LO> (B_rowMap->getNodeNumElements ());
8102  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
8103  size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
8104  const GO globalRow = B_rowMap->getGlobalElement (localRow);
8105  if (B_numEntries > static_cast<size_t> (ind.size ())) {
8106  Kokkos::resize(ind,B_numEntries);
8107  Kokkos::resize(val,B_numEntries);
8108  }
8109  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
8110  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
8111  B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
8112 
8113  if (beta != ONE) {
8114  for (size_t k = 0; k < B_numEntries; ++k) {
8115  valView[k] *= beta;
8116  }
8117  }
8118  C->insertGlobalValues (globalRow, B_numEntries,
8119  reinterpret_cast<Scalar *>(valView.data()),
8120  indView.data());
8121  }
8122  }
8123 
8124  if (callFillComplete) {
8125  if (verbose) {
8126  std::ostringstream os;
8127  os << *prefix << "Call fillComplete on C" << endl;
8128  std::cerr << os.str ();
8129  }
8130  if (fillCompleteSublist.is_null ()) {
8131  C->fillComplete (theDomainMap, theRangeMap);
8132  } else {
8133  C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
8134  }
8135  }
8136  else if (verbose) {
8137  std::ostringstream os;
8138  os << *prefix << "Do NOT call fillComplete on C" << endl;
8139  std::cerr << os.str ();
8140  }
8141 
8142  if (verbose) {
8143  std::ostringstream os;
8144  os << *prefix << "Done" << endl;
8145  std::cerr << os.str ();
8146  }
8147  return rcp_implicit_cast<row_matrix_type> (C);
8148  }
8149 
8150 
8151 
8152  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
8153  void
8156  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
8157  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
8158  const Teuchos::RCP<const map_type>& domainMap,
8159  const Teuchos::RCP<const map_type>& rangeMap,
8160  const Teuchos::RCP<Teuchos::ParameterList>& params) const
8161  {
8162  using Details::Behavior;
8167  using Teuchos::ArrayRCP;
8168  using Teuchos::ArrayView;
8169  using Teuchos::Comm;
8170  using Teuchos::ParameterList;
8171  using Teuchos::RCP;
8172  using std::endl;
8173  typedef LocalOrdinal LO;
8174  typedef GlobalOrdinal GO;
8175  typedef node_type NT;
8176  typedef CrsMatrix<Scalar, LO, GO, NT> this_type;
8177  typedef Vector<int, LO, GO, NT> IntVectorType;
8178  using Teuchos::as;
8179 
8180  const bool debug = Behavior::debug("CrsMatrix");
8181  const bool verbose = Behavior::verbose("CrsMatrix");
8182  int MyPID = getComm ()->getRank ();
8183 
8184  std::unique_ptr<std::string> verbosePrefix;
8185  if (verbose) {
8186  verbosePrefix =
8187  this->createPrefix("CrsMatrix", "transferAndFillComplete");
8188  std::ostringstream os;
8189  os << "Start" << endl;
8190  std::cerr << os.str();
8191  }
8192 
8193  //
8194  // Get the caller's parameters
8195  //
8196  bool isMM = false; // optimize for matrix-matrix ops.
8197  bool reverseMode = false; // Are we in reverse mode?
8198  bool restrictComm = false; // Do we need to restrict the communicator?
8199 
8200  int mm_optimization_core_count =
8201  Behavior::TAFC_OptimizationCoreCount();
8202  RCP<ParameterList> matrixparams; // parameters for the destination matrix
8203  bool overrideAllreduce = false;
8204  if (! params.is_null ()) {
8205  matrixparams = sublist (params, "CrsMatrix");
8206  reverseMode = params->get ("Reverse Mode", reverseMode);
8207  restrictComm = params->get ("Restrict Communicator", restrictComm);
8208  auto & slist = params->sublist("matrixmatrix: kernel params",false);
8209  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
8210  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
8211 
8212  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
8213  if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
8214  if(reverseMode) isMM = false;
8215  }
8216 
8217  // Only used in the sparse matrix-matrix multiply (isMM) case.
8218  std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
8219  int mismatch = 0;
8220  int reduced_mismatch = 0;
8221  if (isMM && !overrideAllreduce) {
8222 
8223  // Test for pathological matrix transfer
8224  const bool source_vals = ! getGraph ()->getImporter ().is_null();
8225  const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
8226  rowTransfer.getRemoteLIDs ().size() == 0);
8227  mismatch = (source_vals != target_vals) ? 1 : 0;
8228  iallreduceRequest =
8229  ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
8230  Teuchos::REDUCE_MAX, * (getComm ()));
8231  }
8232 
8233 #ifdef HAVE_TPETRA_MMM_TIMINGS
8234  using Teuchos::TimeMonitor;
8235  std::string label;
8236  if(!params.is_null())
8237  label = params->get("Timer Label",label);
8238  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
8239  std::string tlstr;
8240  {
8241  std::ostringstream os;
8242  if(isMM) os<<":MMOpt";
8243  else os<<":MMLegacy";
8244  tlstr = os.str();
8245  }
8246 
8247  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
8248 #endif
8249 
8250  // Make sure that the input argument rowTransfer is either an
8251  // Import or an Export. Import and Export are the only two
8252  // subclasses of Transfer that we defined, but users might
8253  // (unwisely, for now at least) decide to implement their own
8254  // subclasses. Exclude this possibility.
8255  const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
8256  const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
8257  TEUCHOS_TEST_FOR_EXCEPTION(
8258  xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
8259  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
8260  "argument must be either an Import or an Export, and its template "
8261  "parameters must match the corresponding template parameters of the "
8262  "CrsMatrix.");
8263 
8264  // Make sure that the input argument domainTransfer is either an
8265  // Import or an Export. Import and Export are the only two
8266  // subclasses of Transfer that we defined, but users might
8267  // (unwisely, for now at least) decide to implement their own
8268  // subclasses. Exclude this possibility.
8269  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
8270  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
8271 
8272  if(! domainTransfer.is_null()) {
8273  TEUCHOS_TEST_FOR_EXCEPTION(
8274  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
8275  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
8276  "argument must be either an Import or an Export, and its template "
8277  "parameters must match the corresponding template parameters of the "
8278  "CrsMatrix.");
8279 
8280  TEUCHOS_TEST_FOR_EXCEPTION(
8281  ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
8282  (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
8283  ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
8284  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
8285  "arguments must be of the same type (either Import or Export).");
8286 
8287  TEUCHOS_TEST_FOR_EXCEPTION(
8288  ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
8289  (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
8290  ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
8291  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
8292  "arguments must be of the same type (either Import or Export).");
8293  } // domainTransfer != null
8294 
8295 
8296  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
8297  // if the source Map is not distributed but the target Map is?
8298  const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
8299 
8300  // Get the new domain and range Maps. We need some of them for
8301  // error checking, now that we have the reverseMode parameter.
8302  RCP<const map_type> MyRowMap = reverseMode ?
8303  rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
8304  RCP<const map_type> MyColMap; // create this below
8305  RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
8306  domainMap : getDomainMap ();
8307  RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
8308  rangeMap : getRangeMap ();
8309  RCP<const map_type> BaseRowMap = MyRowMap;
8310  RCP<const map_type> BaseDomainMap = MyDomainMap;
8311 
8312  // If the user gave us a nonnull destMat, then check whether it's
8313  // "pristine." That means that it has no entries.
8314  //
8315  // FIXME (mfh 15 May 2014) If this is not true on all processes,
8316  // then this exception test may hang. It would be better to
8317  // forward an error flag to the next communication phase.
8318  if (! destMat.is_null ()) {
8319  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
8320  // whether a graph or matrix has no entries on the calling
8321  // process, is that it is neither locally nor globally indexed.
8322  // This may change eventually with the Kokkos refactor version
8323  // of Tpetra, so it would be better just to check the quantity
8324  // of interest directly. Note that with the Kokkos refactor
8325  // version of Tpetra, asking for the total number of entries in
8326  // a graph or matrix that is not fill complete might require
8327  // computation (kernel launch), since it is not thread scalable
8328  // to update a count every time an entry is inserted.
8329  const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
8330  ! destMat->getGraph ()->isGloballyIndexed ();
8331  TEUCHOS_TEST_FOR_EXCEPTION(
8332  ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
8333  "transferAndFillComplete: The input argument 'destMat' is only allowed "
8334  "to be nonnull, if its graph is empty (neither locally nor globally "
8335  "indexed).");
8336  // FIXME (mfh 15 May 2014) At some point, we want to change
8337  // graphs and matrices so that their DistObject Map
8338  // (this->getMap()) may differ from their row Map. This will
8339  // make redistribution for 2-D distributions more efficient. I
8340  // hesitate to change this check, because I'm not sure how much
8341  // the code here depends on getMap() and getRowMap() being the
8342  // same.
8343  TEUCHOS_TEST_FOR_EXCEPTION(
8344  ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
8345  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
8346  "input argument 'destMat' is not the same as the (row) Map specified "
8347  "by the input argument 'rowTransfer'.");
8348  TEUCHOS_TEST_FOR_EXCEPTION(
8349  ! destMat->checkSizes (*this), std::invalid_argument,
8350  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
8351  "destination matrix, but checkSizes() indicates that it is not a legal "
8352  "legal target for redistribution from the source matrix (*this). This "
8353  "may mean that they do not have the same dimensions.");
8354  }
8355 
8356  // If forward mode (the default), then *this's (row) Map must be
8357  // the same as the source Map of the Transfer. If reverse mode,
8358  // then *this's (row) Map must be the same as the target Map of
8359  // the Transfer.
8360  //
8361  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
8362  // and matrices so that their DistObject Map (this->getMap()) may
8363  // differ from their row Map. This will make redistribution for
8364  // 2-D distributions more efficient. I hesitate to change this
8365  // check, because I'm not sure how much the code here depends on
8366  // getMap() and getRowMap() being the same.
8367  TEUCHOS_TEST_FOR_EXCEPTION(
8368  ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
8369  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
8370  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
8371  TEUCHOS_TEST_FOR_EXCEPTION(
8372  ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
8373  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
8374  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
8375 
8376  // checks for domainTransfer
8377  TEUCHOS_TEST_FOR_EXCEPTION(
8378  ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
8379  std::invalid_argument,
8380  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
8381  "argument must be the same as the rebalanced domain map 'domainMap'");
8382 
8383  TEUCHOS_TEST_FOR_EXCEPTION(
8384  ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
8385  std::invalid_argument,
8386  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
8387  "argument must be the same as the rebalanced domain map 'domainMap'");
8388 
8389  // The basic algorithm here is:
8390  //
8391  // 1. Call the moral equivalent of "Distor.do" to handle the import.
8392  // 2. Copy all the Imported and Copy/Permuted data into the raw
8393  // CrsMatrix / CrsGraphData pointers, still using GIDs.
8394  // 3. Call an optimized version of MakeColMap that avoids the
8395  // Directory lookups (since the importer knows who owns all the
8396  // GIDs) AND reindexes to LIDs.
8397  // 4. Call expertStaticFillComplete()
8398 
8399  // Get information from the Importer
8400  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
8401  ArrayView<const LO> ExportLIDs = reverseMode ?
8402  rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
8403  ArrayView<const LO> RemoteLIDs = reverseMode ?
8404  rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs ();
8405  ArrayView<const LO> PermuteToLIDs = reverseMode ?
8406  rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs ();
8407  ArrayView<const LO> PermuteFromLIDs = reverseMode ?
8408  rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs ();
8409  Distributor& Distor = rowTransfer.getDistributor ();
8410 
8411  // Owning PIDs
8412  Teuchos::Array<int> SourcePids;
8413  Teuchos::Array<int> TargetPids;
8414 
8415  // Temp variables for sub-communicators
8416  RCP<const map_type> ReducedRowMap, ReducedColMap,
8417  ReducedDomainMap, ReducedRangeMap;
8418  RCP<const Comm<int> > ReducedComm;
8419 
8420  // If the user gave us a null destMat, then construct the new
8421  // destination matrix. We will replace its column Map later.
8422  if (destMat.is_null ()) {
8423  destMat = rcp (new this_type (MyRowMap, 0, StaticProfile, matrixparams));
8424  }
8425 
8426  /***************************************************/
8427  /***** 1) First communicator restriction phase ****/
8428  /***************************************************/
8429  if (restrictComm) {
8430  ReducedRowMap = MyRowMap->removeEmptyProcesses ();
8431  ReducedComm = ReducedRowMap.is_null () ?
8432  Teuchos::null :
8433  ReducedRowMap->getComm ();
8434  destMat->removeEmptyProcessesInPlace (ReducedRowMap);
8435 
8436  ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
8437  ReducedRowMap :
8438  MyDomainMap->replaceCommWithSubset (ReducedComm);
8439  ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
8440  ReducedRowMap :
8441  MyRangeMap->replaceCommWithSubset (ReducedComm);
8442 
8443  // Reset the "my" maps
8444  MyRowMap = ReducedRowMap;
8445  MyDomainMap = ReducedDomainMap;
8446  MyRangeMap = ReducedRangeMap;
8447 
8448  // Update my PID, if we've restricted the communicator
8449  if (! ReducedComm.is_null ()) {
8450  MyPID = ReducedComm->getRank ();
8451  }
8452  else {
8453  MyPID = -2; // For debugging
8454  }
8455  }
8456  else {
8457  ReducedComm = MyRowMap->getComm ();
8458  }
8459 
8460 
8461 
8462  /***************************************************/
8463  /***** 2) From Tpetra::DistObject::doTransfer() ****/
8464  /***************************************************/
8465  // Get the owning PIDs
8466  RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8467 
8468  // Check whether the domain Map of the source matrix and the base domain Map are the same.
8469  bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8470 
8471  if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8472  // Same domain map as source matrix
8473  //
8474  // NOTE: This won't work for restrictComm (because the Import
8475  // doesn't know the restricted PIDs), though writing an
8476  // optimized version for that case would be easy (Import an
8477  // IntVector of the new PIDs). Might want to add this later.
8478  Import_Util::getPids (*MyImporter, SourcePids, false);
8479  }
8480  else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8481  // Same domain map as source matrix (restricted communicator)
8482  // We need one import from the domain to the column map
8483  IntVectorType SourceDomain_pids(getDomainMap (),true);
8484  IntVectorType SourceCol_pids(getColMap());
8485  // SourceDomain_pids contains the restricted pids
8486  SourceDomain_pids.putScalar(MyPID);
8487 
8488  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8489  SourcePids.resize (getColMap ()->getNodeNumElements ());
8490  SourceCol_pids.get1dCopy (SourcePids ());
8491  }
8492  else if (MyImporter.is_null ()) {
8493  // Matrix has no off-process entries
8494  SourcePids.resize (getColMap ()->getNodeNumElements ());
8495  SourcePids.assign (getColMap ()->getNodeNumElements (), MyPID);
8496  }
8497  else if ( ! MyImporter.is_null () &&
8498  ! domainTransfer.is_null () ) {
8499  // general implementation for rectangular matrices with
8500  // domain map different than SourceMatrix domain map.
8501  // User has to provide a DomainTransfer object. We need
8502  // two communications (import/export).
8503 
8504  // TargetDomain_pids lives on the rebalanced new domain map
8505  IntVectorType TargetDomain_pids (domainMap);
8506  TargetDomain_pids.putScalar (MyPID);
8507 
8508  // SourceDomain_pids lives on the non-rebalanced old domain map
8509  IntVectorType SourceDomain_pids (getDomainMap ());
8510 
8511  // SourceCol_pids lives on the non-rebalanced old column map
8512  IntVectorType SourceCol_pids (getColMap ());
8513 
8514  if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8515  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8516  }
8517  else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8518  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8519  }
8520  else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8521  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8522  }
8523  else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8524  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8525  }
8526  else {
8527  TEUCHOS_TEST_FOR_EXCEPTION(
8528  true, std::logic_error, "Tpetra::CrsMatrix::"
8529  "transferAndFillComplete: Should never get here! "
8530  "Please report this bug to a Tpetra developer.");
8531  }
8532  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8533  SourcePids.resize (getColMap ()->getNodeNumElements ());
8534  SourceCol_pids.get1dCopy (SourcePids ());
8535  }
8536  else if ( ! MyImporter.is_null () &&
8537  BaseDomainMap->isSameAs (*BaseRowMap) &&
8538  getDomainMap ()->isSameAs (*getRowMap ())) {
8539  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8540 
8541  IntVectorType TargetRow_pids (domainMap);
8542  IntVectorType SourceRow_pids (getRowMap ());
8543  IntVectorType SourceCol_pids (getColMap ());
8544 
8545  TargetRow_pids.putScalar (MyPID);
8546  if (! reverseMode && xferAsImport != nullptr) {
8547  SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8548  }
8549  else if (reverseMode && xferAsExport != nullptr) {
8550  SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8551  }
8552  else if (! reverseMode && xferAsExport != nullptr) {
8553  SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8554  }
8555  else if (reverseMode && xferAsImport != nullptr) {
8556  SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8557  }
8558  else {
8559  TEUCHOS_TEST_FOR_EXCEPTION(
8560  true, std::logic_error, "Tpetra::CrsMatrix::"
8561  "transferAndFillComplete: Should never get here! "
8562  "Please report this bug to a Tpetra developer.");
8563  }
8564 
8565  SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8566  SourcePids.resize (getColMap ()->getNodeNumElements ());
8567  SourceCol_pids.get1dCopy (SourcePids ());
8568  }
8569  else {
8570  TEUCHOS_TEST_FOR_EXCEPTION(
8571  true, std::invalid_argument, "Tpetra::CrsMatrix::"
8572  "transferAndFillComplete: This method only allows either domainMap == "
8573  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8574  "getDomainMap () == getRowMap ()).");
8575  }
8576 
8577  // Tpetra-specific stuff
8578  size_t constantNumPackets = destMat->constantNumberOfPackets ();
8579  if (constantNumPackets == 0) {
8580  destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8581  RemoteLIDs.size ());
8582  }
8583  else {
8584  // There are a constant number of packets per element. We
8585  // already know (from the number of "remote" (incoming)
8586  // elements) how many incoming elements we expect, so we can
8587  // resize the buffer accordingly.
8588  const size_t rbufLen = RemoteLIDs.size() * constantNumPackets;
8589  destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8590  }
8591 
8592  // Pack & Prepare w/ owning PIDs
8593  if (debug) {
8594  using Teuchos::outArg;
8595  using Teuchos::REDUCE_MAX;
8596  using Teuchos::reduceAll;
8597  using std::cerr;
8598  using std::endl;
8599  RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8600  const int myRank = comm->getRank ();
8601 
8602  std::ostringstream errStrm;
8603  int lclErr = 0;
8604  int gblErr = 0;
8605 
8606  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8607  try {
8608  // packAndPrepare* methods modify numExportPacketsPerLID_.
8609  destMat->numExportPacketsPerLID_.modify_host ();
8610  numExportPacketsPerLID =
8611  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8612  }
8613  catch (std::exception& e) {
8614  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8615  << e.what () << std::endl;
8616  lclErr = 1;
8617  }
8618  catch (...) {
8619  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8620  "an exception not a subclass of std::exception" << std::endl;
8621  lclErr = 1;
8622  }
8623 
8624  if (! comm.is_null ()) {
8625  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8626  }
8627  if (gblErr != 0) {
8628  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8629  TEUCHOS_TEST_FOR_EXCEPTION(
8630  true, std::runtime_error, "getArrayViewFromDualView threw an "
8631  "exception on at least one process.");
8632  }
8633 
8634  if (verbose) {
8635  std::ostringstream os;
8636  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8637  << std::endl;
8638  std::cerr << os.str ();
8639  }
8640  try {
8642  destMat->exports_,
8643  numExportPacketsPerLID,
8644  ExportLIDs,
8645  SourcePids,
8646  constantNumPackets);
8647  }
8648  catch (std::exception& e) {
8649  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8650  << e.what () << std::endl;
8651  lclErr = 1;
8652  }
8653  catch (...) {
8654  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8655  "an exception not a subclass of std::exception" << std::endl;
8656  lclErr = 1;
8657  }
8658 
8659  if (verbose) {
8660  std::ostringstream os;
8661  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8662  << std::endl;
8663  std::cerr << os.str ();
8664  }
8665 
8666  if (! comm.is_null ()) {
8667  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8668  }
8669  if (gblErr != 0) {
8670  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8671  TEUCHOS_TEST_FOR_EXCEPTION(
8672  true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8673  "exception on at least one process.");
8674  }
8675  }
8676  else {
8677  // packAndPrepare* methods modify numExportPacketsPerLID_.
8678  destMat->numExportPacketsPerLID_.modify_host ();
8679  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8680  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8681  if (verbose) {
8682  std::ostringstream os;
8683  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8684  << std::endl;
8685  std::cerr << os.str ();
8686  }
8688  destMat->exports_,
8689  numExportPacketsPerLID,
8690  ExportLIDs,
8691  SourcePids,
8692  constantNumPackets);
8693  if (verbose) {
8694  std::ostringstream os;
8695  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8696  << std::endl;
8697  std::cerr << os.str ();
8698  }
8699  }
8700 
8701  // Do the exchange of remote data.
8702  if (! communication_needed) {
8703  if (verbose) {
8704  std::ostringstream os;
8705  os << *verbosePrefix << "Communication not needed" << std::endl;
8706  std::cerr << os.str ();
8707  }
8708  }
8709  else {
8710  if (reverseMode) {
8711  if (constantNumPackets == 0) { // variable number of packets per LID
8712  if (verbose) {
8713  std::ostringstream os;
8714  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8715  << std::endl;
8716  std::cerr << os.str ();
8717  }
8718  // Make sure that host has the latest version, since we're
8719  // using the version on host. If host has the latest
8720  // version, syncing to host does nothing.
8721  destMat->numExportPacketsPerLID_.sync_host ();
8722  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8723  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8724  destMat->numImportPacketsPerLID_.sync_host ();
8725  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8726  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8727 
8728  if (verbose) {
8729  std::ostringstream os;
8730  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8731  << std::endl;
8732  std::cerr << os.str ();
8733  }
8734  Distor.doReversePostsAndWaits (numExportPacketsPerLID, 1,
8735  numImportPacketsPerLID);
8736  if (verbose) {
8737  std::ostringstream os;
8738  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8739  << std::endl;
8740  std::cerr << os.str ();
8741  }
8742 
8743  size_t totalImportPackets = 0;
8744  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8745  totalImportPackets += numImportPacketsPerLID[i];
8746  }
8747 
8748  // Reallocation MUST go before setting the modified flag,
8749  // because it may clear out the flags.
8750  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8751  verbosePrefix.get ());
8752  destMat->imports_.modify_host ();
8753  Teuchos::ArrayView<char> hostImports =
8754  getArrayViewFromDualView (destMat->imports_);
8755  // This is a legacy host pack/unpack path, so use the host
8756  // version of exports_.
8757  destMat->exports_.sync_host ();
8758  Teuchos::ArrayView<const char> hostExports =
8759  getArrayViewFromDualView (destMat->exports_);
8760  if (verbose) {
8761  std::ostringstream os;
8762  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
8763  << std::endl;
8764  std::cerr << os.str ();
8765  }
8766  Distor.doReversePostsAndWaits (hostExports,
8767  numExportPacketsPerLID,
8768  hostImports,
8769  numImportPacketsPerLID);
8770  if (verbose) {
8771  std::ostringstream os;
8772  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
8773  << std::endl;
8774  std::cerr << os.str ();
8775  }
8776  }
8777  else { // constant number of packets per LID
8778  if (verbose) {
8779  std::ostringstream os;
8780  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8781  << std::endl;
8782  std::cerr << os.str ();
8783  }
8784  destMat->imports_.modify_host ();
8785  Teuchos::ArrayView<char> hostImports =
8786  getArrayViewFromDualView (destMat->imports_);
8787  // This is a legacy host pack/unpack path, so use the host
8788  // version of exports_.
8789  destMat->exports_.sync_host ();
8790  Teuchos::ArrayView<const char> hostExports =
8791  getArrayViewFromDualView (destMat->exports_);
8792  if (verbose) {
8793  std::ostringstream os;
8794  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8795  << std::endl;
8796  std::cerr << os.str ();
8797  }
8798  Distor.doReversePostsAndWaits (hostExports,
8799  constantNumPackets,
8800  hostImports);
8801  if (verbose) {
8802  std::ostringstream os;
8803  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8804  << std::endl;
8805  std::cerr << os.str ();
8806  }
8807  }
8808  }
8809  else { // forward mode (the default)
8810  if (constantNumPackets == 0) { // variable number of packets per LID
8811  if (verbose) {
8812  std::ostringstream os;
8813  os << *verbosePrefix << "Forward mode, variable # packets / LID"
8814  << std::endl;
8815  std::cerr << os.str ();
8816  }
8817  // Make sure that host has the latest version, since we're
8818  // using the version on host. If host has the latest
8819  // version, syncing to host does nothing.
8820  destMat->numExportPacketsPerLID_.sync_host ();
8821  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8822  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8823  destMat->numImportPacketsPerLID_.sync_host ();
8824  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8825  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8826  if (verbose) {
8827  std::ostringstream os;
8828  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8829  << std::endl;
8830  std::cerr << os.str ();
8831  }
8832  Distor.doPostsAndWaits (numExportPacketsPerLID, 1,
8833  numImportPacketsPerLID);
8834  if (verbose) {
8835  std::ostringstream os;
8836  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8837  << std::endl;
8838  std::cerr << os.str ();
8839  }
8840 
8841  size_t totalImportPackets = 0;
8842  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8843  totalImportPackets += numImportPacketsPerLID[i];
8844  }
8845 
8846  // Reallocation MUST go before setting the modified flag,
8847  // because it may clear out the flags.
8848  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8849  verbosePrefix.get ());
8850  destMat->imports_.modify_host ();
8851  Teuchos::ArrayView<char> hostImports =
8852  getArrayViewFromDualView (destMat->imports_);
8853  // This is a legacy host pack/unpack path, so use the host
8854  // version of exports_.
8855  destMat->exports_.sync_host ();
8856  Teuchos::ArrayView<const char> hostExports =
8857  getArrayViewFromDualView (destMat->exports_);
8858  if (verbose) {
8859  std::ostringstream os;
8860  os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8861  << std::endl;
8862  std::cerr << os.str ();
8863  }
8864  Distor.doPostsAndWaits (hostExports,
8865  numExportPacketsPerLID,
8866  hostImports,
8867  numImportPacketsPerLID);
8868  if (verbose) {
8869  std::ostringstream os;
8870  os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8871  << std::endl;
8872  std::cerr << os.str ();
8873  }
8874  }
8875  else { // constant number of packets per LID
8876  if (verbose) {
8877  std::ostringstream os;
8878  os << *verbosePrefix << "Forward mode, constant # packets / LID"
8879  << std::endl;
8880  std::cerr << os.str ();
8881  }
8882  destMat->imports_.modify_host ();
8883  Teuchos::ArrayView<char> hostImports =
8884  getArrayViewFromDualView (destMat->imports_);
8885  // This is a legacy host pack/unpack path, so use the host
8886  // version of exports_.
8887  destMat->exports_.sync_host ();
8888  Teuchos::ArrayView<const char> hostExports =
8889  getArrayViewFromDualView (destMat->exports_);
8890  if (verbose) {
8891  std::ostringstream os;
8892  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8893  << std::endl;
8894  std::cerr << os.str ();
8895  }
8896  Distor.doPostsAndWaits (hostExports,
8897  constantNumPackets,
8898  hostImports);
8899  if (verbose) {
8900  std::ostringstream os;
8901  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8902  << std::endl;
8903  std::cerr << os.str ();
8904  }
8905  }
8906  }
8907  }
8908 
8909  /*********************************************************************/
8910  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8911  /*********************************************************************/
8912 
8913  // Backwards compatibility measure. We'll use this again below.
8914  destMat->numImportPacketsPerLID_.sync_host ();
8915  Teuchos::ArrayView<const size_t> numImportPacketsPerLID =
8916  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8917  destMat->imports_.sync_host ();
8918  Teuchos::ArrayView<const char> hostImports =
8919  getArrayViewFromDualView (destMat->imports_);
8920 
8921  if (verbose) {
8922  std::ostringstream os;
8923  os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount"
8924  << std::endl;
8925  std::cerr << os.str ();
8926  }
8927  size_t mynnz =
8929  RemoteLIDs,
8930  hostImports,
8931  numImportPacketsPerLID,
8932  constantNumPackets,
8933  INSERT,
8934  NumSameIDs,
8935  PermuteToLIDs,
8936  PermuteFromLIDs);
8937  if (verbose) {
8938  std::ostringstream os;
8939  os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned "
8940  << mynnz << std::endl;
8941  std::cerr << os.str ();
8942  }
8943  size_t N = BaseRowMap->getNodeNumElements ();
8944 
8945  // Allocations
8946  ArrayRCP<size_t> CSR_rowptr(N+1);
8947  ArrayRCP<GO> CSR_colind_GID;
8948  ArrayRCP<LO> CSR_colind_LID;
8949  ArrayRCP<Scalar> CSR_vals;
8950  CSR_colind_GID.resize (mynnz);
8951  CSR_vals.resize (mynnz);
8952 
8953  // If LO and GO are the same, we can reuse memory when
8954  // converting the column indices from global to local indices.
8955  if (typeid (LO) == typeid (GO)) {
8956  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8957  }
8958  else {
8959  CSR_colind_LID.resize (mynnz);
8960  }
8961 
8962  if (verbose) {
8963  std::ostringstream os;
8964  os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays"
8965  << std::endl;
8966  std::cerr << os.str ();
8967  }
8968  // FIXME (mfh 15 May 2014) Why can't we abstract this out as an
8969  // unpackAndCombine method on a "CrsArrays" object? This passing
8970  // in a huge list of arrays is icky. Can't we have a bit of an
8971  // abstraction? Implementing a concrete DistObject subclass only
8972  // takes five methods.
8974  RemoteLIDs,
8975  hostImports,
8976  numImportPacketsPerLID,
8977  constantNumPackets,
8978  INSERT,
8979  NumSameIDs,
8980  PermuteToLIDs,
8981  PermuteFromLIDs,
8982  N,
8983  mynnz,
8984  MyPID,
8985  CSR_rowptr (),
8986  CSR_colind_GID (),
8987  Teuchos::av_reinterpret_cast<impl_scalar_type> (CSR_vals ()),
8988  SourcePids (),
8989  TargetPids);
8990 
8991  /**************************************************************/
8992  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8993  /**************************************************************/
8994  // Call an optimized version of makeColMap that avoids the
8995  // Directory lookups (since the Import object knows who owns all
8996  // the GIDs).
8997  Teuchos::Array<int> RemotePids;
8998  if (verbose) {
8999  std::ostringstream os;
9000  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
9001  << std::endl;
9002  std::cerr << os.str ();
9003  }
9004  Import_Util::lowCommunicationMakeColMapAndReindex (CSR_rowptr (),
9005  CSR_colind_LID (),
9006  CSR_colind_GID (),
9007  BaseDomainMap,
9008  TargetPids,
9009  RemotePids,
9010  MyColMap);
9011 
9012  if (verbose) {
9013  std::ostringstream os;
9014  os << *verbosePrefix << "restrictComm="
9015  << (restrictComm ? "true" : "false") << std::endl;
9016  std::cerr << os.str ();
9017  }
9018 
9019  /*******************************************************/
9020  /**** 4) Second communicator restriction phase ****/
9021  /*******************************************************/
9022  if (restrictComm) {
9023  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
9024  ReducedRowMap :
9025  MyColMap->replaceCommWithSubset (ReducedComm);
9026  MyColMap = ReducedColMap; // Reset the "my" maps
9027  }
9028 
9029  // Replace the col map
9030  if (verbose) {
9031  std::ostringstream os;
9032  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
9033  std::cerr << os.str ();
9034  }
9035  destMat->replaceColMap (MyColMap);
9036 
9037  // Short circuit if the processor is no longer in the communicator
9038  //
9039  // NOTE: Epetra modifies all "removed" processes so they
9040  // have a dummy (serial) Map that doesn't touch the original
9041  // communicator. Duplicating that here might be a good idea.
9042  if (ReducedComm.is_null ()) {
9043  if (verbose) {
9044  std::ostringstream os;
9045  os << *verbosePrefix << "I am no longer in the communicator; "
9046  "returning" << std::endl;
9047  std::cerr << os.str ();
9048  }
9049  return;
9050  }
9051 
9052  /***************************************************/
9053  /**** 5) Sort ****/
9054  /***************************************************/
9055  if ((! reverseMode && xferAsImport != nullptr) ||
9056  (reverseMode && xferAsExport != nullptr)) {
9057  if (verbose) {
9058  std::ostringstream os;
9059  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
9060  std::cerr << os.str ();
9061  }
9062  Import_Util::sortCrsEntries (CSR_rowptr (),
9063  CSR_colind_LID (),
9064  CSR_vals ());
9065  }
9066  else if ((! reverseMode && xferAsExport != nullptr) ||
9067  (reverseMode && xferAsImport != nullptr)) {
9068  if (verbose) {
9069  std::ostringstream os;
9070  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
9071  << endl;
9072  std::cerr << os.str();
9073  }
9074  Import_Util::sortAndMergeCrsEntries (CSR_rowptr (),
9075  CSR_colind_LID (),
9076  CSR_vals ());
9077  if (CSR_rowptr[N] != mynnz) {
9078  CSR_colind_LID.resize (CSR_rowptr[N]);
9079  CSR_vals.resize (CSR_rowptr[N]);
9080  }
9081  }
9082  else {
9083  TEUCHOS_TEST_FOR_EXCEPTION(
9084  true, std::logic_error, "Tpetra::CrsMatrix::"
9085  "transferAndFillComplete: Should never get here! "
9086  "Please report this bug to a Tpetra developer.");
9087  }
9088  /***************************************************/
9089  /**** 6) Reset the colmap and the arrays ****/
9090  /***************************************************/
9091 
9092  if (verbose) {
9093  std::ostringstream os;
9094  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
9095  std::cerr << os.str ();
9096  }
9097 
9098  // Call constructor for the new matrix (restricted as needed)
9099  //
9100  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
9101  // refactor version of CrsMatrix, though it reserves the right to
9102  // make a deep copy of the arrays.
9103  destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
9104 
9105  /***************************************************/
9106  /**** 7) Build Importer & Call ESFC ****/
9107  /***************************************************/
9108  // Pre-build the importer using the existing PIDs
9109  Teuchos::ParameterList esfc_params;
9110 
9111  RCP<import_type> MyImport;
9112 
9113  // Fulfill the non-blocking allreduce on reduced_mismatch.
9114  if (iallreduceRequest.get () != nullptr) {
9115  if (verbose) {
9116  std::ostringstream os;
9117  os << *verbosePrefix << "Calling iallreduceRequest->wait()"
9118  << endl;
9119  std::cerr << os.str ();
9120  }
9121  iallreduceRequest->wait ();
9122  if (reduced_mismatch != 0) {
9123  isMM = false;
9124  }
9125  }
9126 
9127  if( isMM ) {
9128 #ifdef HAVE_TPETRA_MMM_TIMINGS
9129  Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
9130 #endif
9131  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
9132 
9133  if (verbose) {
9134  std::ostringstream os;
9135  os << *verbosePrefix << "Calling getAllValues" << endl;
9136  std::cerr << os.str ();
9137  }
9138 
9139  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
9140  Teuchos::ArrayRCP<int> type3PIDs;
9141  Teuchos::ArrayRCP<const size_t> rowptr;
9142  Teuchos::ArrayRCP<const LO> colind;
9143  Teuchos::ArrayRCP<const Scalar> vals;
9144  {
9145 #ifdef HAVE_TPETRA_MMM_TIMINGS
9146  TimeMonitor tm_getAllValues (*TimeMonitor::getNewTimer(prefix + std::string("isMMgetAllValues")));
9147 #endif
9148  getAllValues(rowptr,colind,vals);
9149  }
9150 
9151  if (verbose) {
9152  std::ostringstream os;
9153  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
9154  std::cerr << os.str ();
9155  }
9156 
9157  {
9158 #ifdef HAVE_TPETRA_MMM_TIMINGS
9159  TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
9160 #endif
9161  Import_Util::reverseNeighborDiscovery(*this,
9162  rowptr,
9163  colind,
9164  rowTransfer,
9165  MyImporter,
9166  MyDomainMap,
9167  type3PIDs,
9168  type3LIDs,
9169  ReducedComm);
9170  }
9171 
9172  if (verbose) {
9173  std::ostringstream os;
9174  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
9175  std::cerr << os.str ();
9176  }
9177 
9178  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
9179  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
9180 
9181  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
9182  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
9183 
9184  const int numCols = getGraph()->getColMap()->getNodeNumElements(); // may be dup
9185  // from EpetraExt_MMHelpers.cpp: build_type2_exports
9186  std::vector<bool> IsOwned(numCols,true);
9187  std::vector<int> SentTo(numCols,-1);
9188  if (! MyImporter.is_null ()) {
9189  for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
9190  IsOwned[rlid]=false;
9191  }
9192  }
9193 
9194  std::vector<std::pair<int,GO> > usrtg;
9195  usrtg.reserve(TEPID2.size());
9196 
9197  {
9198  const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
9199  for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
9200  const LO row = TELID2[i];
9201  const int pid = TEPID2[i];
9202  for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
9203  const int col = colind[j];
9204  if (IsOwned[col] && SentTo[col] != pid) {
9205  SentTo[col] = pid;
9206  GO gid = colMap.getGlobalElement (col);
9207  usrtg.push_back (std::pair<int,GO> (pid, gid));
9208  }
9209  }
9210  }
9211  }
9212 
9213 // This sort can _not_ be omitted.
9214  std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
9215  auto eopg = std ::unique(usrtg.begin(),usrtg.end());
9216  // 25 Jul 2018: Could just ignore the entries at and after eopg.
9217  usrtg.erase(eopg,usrtg.end());
9218 
9219  const Array_size_type type2_us_size = usrtg.size();
9220  Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
9221  Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
9222 
9223  int pos=0;
9224  for(auto && p : usrtg) {
9225  EPID2[pos]= p.first;
9226  ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
9227  pos++;
9228  }
9229 
9230  Teuchos::ArrayView<int> EPID3 = type3PIDs();
9231  Teuchos::ArrayView< LO> ELID3 = type3LIDs();
9232  GO InfGID = std::numeric_limits<GO>::max();
9233  int InfPID = INT_MAX;
9234 #ifdef TPETRA_MIN3
9235 # undef TPETRA_MIN3
9236 #endif // TPETRA_MIN3
9237 #define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
9238  int i1=0, i2=0, i3=0;
9239  int Len1 = EPID1.size();
9240  int Len2 = EPID2.size();
9241  int Len3 = EPID3.size();
9242 
9243  int MyLen=Len1+Len2+Len3;
9244  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
9245  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
9246  int iloc = 0; // will be the size of the userExportLID/PIDs
9247 
9248  while(i1 < Len1 || i2 < Len2 || i3 < Len3){
9249  int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
9250  int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
9251  int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
9252 
9253  GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
9254  GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
9255  GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
9256 
9257  int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
9258  GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
9259 #ifdef TPETRA_MIN3
9260 # undef TPETRA_MIN3
9261 #endif // TPETRA_MIN3
9262  bool added_entry=false;
9263 
9264  if(PID1 == MIN_PID && GID1 == MIN_GID){
9265  userExportLIDs[iloc]=ELID1[i1];
9266  userExportPIDs[iloc]=EPID1[i1];
9267  i1++;
9268  added_entry=true;
9269  iloc++;
9270  }
9271  if(PID2 == MIN_PID && GID2 == MIN_GID){
9272  if(!added_entry) {
9273  userExportLIDs[iloc]=ELID2[i2];
9274  userExportPIDs[iloc]=EPID2[i2];
9275  added_entry=true;
9276  iloc++;
9277  }
9278  i2++;
9279  }
9280  if(PID3 == MIN_PID && GID3 == MIN_GID){
9281  if(!added_entry) {
9282  userExportLIDs[iloc]=ELID3[i3];
9283  userExportPIDs[iloc]=EPID3[i3];
9284  iloc++;
9285  }
9286  i3++;
9287  }
9288  }
9289 
9290  if (verbose) {
9291  std::ostringstream os;
9292  os << *verbosePrefix << "Create Import" << std::endl;
9293  std::cerr << os.str ();
9294  }
9295 
9296 #ifdef HAVE_TPETRA_MMM_TIMINGS
9297  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9298 #endif
9299  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9300  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9301  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9302  MyImport = rcp ( new import_type (MyDomainMap,
9303  MyColMap,
9304  RemotePids,
9305  userExportLIDs.view(0,iloc).getConst(),
9306  userExportPIDs.view(0,iloc).getConst(),
9307  plist)
9308  );
9309 
9310  if (verbose) {
9311  std::ostringstream os;
9312  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9313  std::cerr << os.str ();
9314  }
9315 
9316  {
9317 #ifdef HAVE_TPETRA_MMM_TIMINGS
9318  TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9319  esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9320 #endif
9321  if(!params.is_null())
9322  esfc_params.set("compute global constants",params->get("compute global constants",true));
9323  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9324 
9325  }
9326 
9327  } // if(isMM)
9328  else {
9329 #ifdef HAVE_TPETRA_MMM_TIMINGS
9330  TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9331 #endif
9332  if (verbose) {
9333  std::ostringstream os;
9334  os << *verbosePrefix << "Create Import" << std::endl;
9335  std::cerr << os.str ();
9336  }
9337 
9338 #ifdef HAVE_TPETRA_MMM_TIMINGS
9339  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9340 #endif
9341  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9342  mypars->set("Timer Label","notMMFrom_tAFC");
9343  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9344  MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9345 
9346  if (verbose) {
9347  std::ostringstream os;
9348  os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
9349  std::cerr << os.str ();
9350  }
9351 
9352 #ifdef HAVE_TPETRA_MMM_TIMINGS
9353  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9354  esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9355 #else
9356  esfc_params.set("Timer Label",std::string("notMM eSFC"));
9357 #endif
9358 
9359  if (!params.is_null ()) {
9360  esfc_params.set ("compute global constants",
9361  params->get ("compute global constants", true));
9362  }
9363  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9364  MyImport, Teuchos::null,
9365  rcp (new Teuchos::ParameterList (esfc_params)));
9366  }
9367 
9368  if (verbose) {
9369  std::ostringstream os;
9370  os << *verbosePrefix << "Done" << endl;
9371  std::cerr << os.str ();
9372  }
9373  }
9374 
9375 
// Convenience overload for a transfer described by a single Import object.
//
// NOTE(review): the declarator line (method name and the destMatrix
// parameter) is not present in this listing; the name is presumably
// importAndFillComplete -- confirm against the class declaration.
//
// The body does nothing but forward to transferAndFillComplete. The third
// argument of that call is Teuchos::null; the two-Import overload in this
// file passes its domainImporter in that position, so a null here means no
// separate domain transfer object is supplied.
//
// importer   Import passed through as the (row) transfer object.
// domainMap  Forwarded unchanged to transferAndFillComplete.
// rangeMap   Forwarded unchanged to transferAndFillComplete.
// params     Forwarded unchanged to transferAndFillComplete.
9376  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9377  void
9380  const import_type& importer,
9381  const Teuchos::RCP<const map_type>& domainMap,
9382  const Teuchos::RCP<const map_type>& rangeMap,
9383  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9384  {
9385  transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9386  }
9387 
// Convenience overload taking separate row and domain Import objects.
//
// NOTE(review): the declarator line (method name and the destMatrix
// parameter) is not present in this listing; the name is presumably
// importAndFillComplete -- confirm against the class declaration.
//
// The body only forwards to transferAndFillComplete. rowImporter is passed
// as the transfer object, and domainImporter is wrapped with
// Teuchos::rcpFromRef and passed as the third argument.
// NOTE(review): rcpFromRef presumably yields a non-owning RCP, in which case
// domainImporter must stay alive for the duration of the call -- confirm
// against the Teuchos documentation.
//
// rowImporter     Import passed through as the row transfer object.
// domainImporter  Import handed over (by reference wrapper) as the third
//                 argument of transferAndFillComplete.
// domainMap       Forwarded unchanged.
// rangeMap        Forwarded unchanged.
// params          Forwarded unchanged.
9388  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9389  void
9392  const import_type& rowImporter,
9393  const import_type& domainImporter,
9394  const Teuchos::RCP<const map_type>& domainMap,
9395  const Teuchos::RCP<const map_type>& rangeMap,
9396  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9397  {
9398  transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9399  }
9400 
// Convenience overload for a transfer described by a single Export object.
//
// NOTE(review): the declarator line (method name and the destMatrix
// parameter) is not present in this listing; the name is presumably
// exportAndFillComplete -- confirm against the class declaration.
//
// The body does nothing but forward to transferAndFillComplete. The third
// argument of that call is Teuchos::null; the two-Export overload in this
// file passes its domainExporter in that position, so a null here means no
// separate domain transfer object is supplied.
//
// exporter   Export passed through as the (row) transfer object.
// domainMap  Forwarded unchanged to transferAndFillComplete.
// rangeMap   Forwarded unchanged to transferAndFillComplete.
// params     Forwarded unchanged to transferAndFillComplete.
9401  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9402  void
9405  const export_type& exporter,
9406  const Teuchos::RCP<const map_type>& domainMap,
9407  const Teuchos::RCP<const map_type>& rangeMap,
9408  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9409  {
9410  transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9411  }
9412 
// Convenience overload taking separate row and domain Export objects.
//
// NOTE(review): the declarator line (method name and the destMatrix
// parameter) is not present in this listing; the name is presumably
// exportAndFillComplete -- confirm against the class declaration.
//
// The body only forwards to transferAndFillComplete. rowExporter is passed
// as the transfer object, and domainExporter is wrapped with
// Teuchos::rcpFromRef and passed as the third argument.
// NOTE(review): rcpFromRef presumably yields a non-owning RCP, in which case
// domainExporter must stay alive for the duration of the call -- confirm
// against the Teuchos documentation.
//
// rowExporter     Export passed through as the row transfer object.
// domainExporter  Export handed over (by reference wrapper) as the third
//                 argument of transferAndFillComplete.
// domainMap       Forwarded unchanged.
// rangeMap        Forwarded unchanged.
// params          Forwarded unchanged.
9413  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9414  void
9417  const export_type& rowExporter,
9418  const export_type& domainExporter,
9419  const Teuchos::RCP<const map_type>& domainMap,
9420  const Teuchos::RCP<const map_type>& rangeMap,
9421  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9422  {
9423  transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9424  }
9425 
9426 
9427 } // namespace Tpetra
9428 
9429 //
9430 // Explicit instantiation macro
9431 //
9432 // Must be expanded from within the Tpetra namespace!
9433 //
9434 
// TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE)
// Explicitly instantiates the class CrsMatrix<SCALAR,LO,GO,NODE> and, in
// addition, its member template convert<SCALAR>() const, i.e. the conversion
// whose result has the same Scalar type as the matrix itself.
// CrsMatrix and Teuchos::RCP are written unqualified / Teuchos-qualified, so
// the expansion relies on the enclosing namespace to resolve CrsMatrix.
9435 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9436  \
9437  template class CrsMatrix< SCALAR , LO , GO , NODE >; \
9438  template Teuchos::RCP< CrsMatrix< SCALAR , LO , GO , NODE > > \
9439  CrsMatrix< SCALAR , LO , GO , NODE >::convert< SCALAR > () const;
9440 
// TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE)
// Explicitly instantiates the member template
// CrsMatrix<SI,LO,GO,NODE>::convert<SO>() const, whose return type is
// Teuchos::RCP<CrsMatrix<SO,LO,GO,NODE>>. SI is the Scalar type of the matrix
// being converted; SO is the Scalar type of the returned matrix. Note the
// argument order: the output Scalar (SO) comes first.
9441 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9442  \
9443  template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9444  CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9445 
// TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE)
// Expands to a `template<>` declaration (no function body) of
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR,LO,GO,NODE>: the
// overload taking the source matrix, a single Import (`importer`), the
// domain and range Maps, and a ParameterList. It returns
// Teuchos::RCP<CrsMatrix<SCALAR,LO,GO,NODE>>.
// The Import and Map template arguments are spelled through the matrix's own
// local_ordinal_type, global_ordinal_type and node_type members rather than
// through LO, GO and NODE directly.
9446 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9447  template<> \
9448  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9449  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9450  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9451  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9452  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9453  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9454  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9455  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9456  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9457  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9458  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9459  const Teuchos::RCP<Teuchos::ParameterList>& params);
9460 
// TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
// Expands to a `template<>` declaration (no function body) of
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR,LO,GO,NODE>: the
// overload taking the source matrix, two Import objects (`rowImporter` and
// `domainImporter`), the domain and range Maps, and a ParameterList. It
// returns Teuchos::RCP<CrsMatrix<SCALAR,LO,GO,NODE>>.
// The Import and Map template arguments are spelled through the matrix's own
// local_ordinal_type, global_ordinal_type and node_type members.
9461 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9462  template<> \
9463  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9464  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9465  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9466  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9467  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9468  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9469  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9470  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9471  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9472  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9473  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9474  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9475  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9476  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9477  const Teuchos::RCP<Teuchos::ParameterList>& params);
9478 
9479 
// TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE)
// Expands to a `template<>` declaration (no function body) of
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR,LO,GO,NODE>: the
// overload taking the source matrix, a single Export (`exporter`), the
// domain and range Maps, and a ParameterList. It returns
// Teuchos::RCP<CrsMatrix<SCALAR,LO,GO,NODE>>.
// The Export and Map template arguments are spelled through the matrix's own
// local_ordinal_type, global_ordinal_type and node_type members rather than
// through LO, GO and NODE directly.
9480 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9481  template<> \
9482  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9483  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9484  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9485  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9486  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9487  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9488  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9489  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9490  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9491  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9492  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9493  const Teuchos::RCP<Teuchos::ParameterList>& params);
9494 
// TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
// Expands to a `template<>` declaration (no function body) of
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR,LO,GO,NODE>: the
// overload taking the source matrix, two Export objects (`rowExporter` and
// `domainExporter`), the domain and range Maps, and a ParameterList. It
// returns Teuchos::RCP<CrsMatrix<SCALAR,LO,GO,NODE>>.
// The Export and Map template arguments are spelled through the matrix's own
// local_ordinal_type, global_ordinal_type and node_type members.
9495 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9496  template<> \
9497  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9498  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9499  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9500  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9501  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9502  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9503  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9504  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9505  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9506  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9507  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9508  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9509  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9510  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9511  const Teuchos::RCP<Teuchos::ParameterList>& params);
9512 
9513 
// TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO, NODE)
// Umbrella macro for one (SCALAR, LO, GO, NODE) combination. It expands, in
// this order: TPETRA_CRSMATRIX_MATRIX_INSTANT, then the single-transfer
// import and export "and fill complete" macros, then their two-transfer
// (_TWO) counterparts. TPETRA_CRSMATRIX_CONVERT_INSTANT is not part of this
// expansion.
9514 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9515  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9516  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9517  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9518  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9519  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9520 
9521 #endif // TPETRA_CRSMATRIX_DEF_HPP
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular,...
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular,...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Functions for manipulating CRS arrays.
Declaration of a function that prints strings from each process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::iallreduce.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
KOKKOS_FUNCTION size_t packRow(const LocalMapType &col_map, const Kokkos::View< Packet *, BufferDeviceType > &exports, const InputLidsType &lids_in, const InputPidsType &pids_in, const size_t offset, const size_t num_ent, const bool pack_pids)
Packs a single row of the CrsGraph.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
KOKKOS_FUNCTION int unpackRow(const Kokkos::View< GO *, Device, Kokkos::MemoryUnmanaged > &gids_out, const Kokkos::View< int *, Device, Kokkos::MemoryUnmanaged > &pids_out, const Kokkos::View< const Packet *, BufferDevice > &imports, const size_t offset, const size_t num_ent)
Unpack a single row of a CrsGraph.
Utility functions for packing and unpacking sparse matrix entries.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows KDDKDD UVM Removal: Device view takes place of k_lclInds...
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
size_t getNodeNumRows() const override
Returns the number of graph rows owned on the calling node.
local_inds_dualv_type::t_host getLocalIndsViewHostNonConst(const RowInfo &rowinfo)
Get a ReadWrite locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(m...
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given object.
Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using global indices.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row,...
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
size_t getNodeNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object's data for an Import or Export.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
std::shared_ptr< local_multiply_op_type > getLocalMultiplyOperator() const
The local sparse matrix operator (a wrapper of getLocalMatrixDevice() that supports local matrix-vect...
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
void computeGlobalConstants()
Compute matrix properties that require collectives.
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
bool isFillActive() const
Whether the matrix is not fill complete.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
bool fillComplete_
Whether the matrix is fill complete.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given object.
std::string description() const override
A one-line description of this object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
local_matrix_device_type::values_type getLocalValuesView() const
Get the Kokkos local values.
Scalar scalar_type
The type of each entry in the matrix.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a copy of the diagonal entries of the matrix, stored in the given Vector diag.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
void clearGlobalConstants()
Clear matrix properties that require collectives.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from matrix.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
mag_type frobNorm_
Cached Frobenius norm of the (global) matrix.
bool isFillComplete() const override
Whether the matrix is fill complete.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
ProfileType getProfileType() const
Returns the matrix's profile type, which indicates whether the matrix was allocated with static data structures.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using local row and column indices.
size_t getNodeNumEntries() const override
The local number of entries in this matrix.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
bool isStorageOptimized() const
Returns true if storage has been optimized.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row,...
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using local column indices.
Description of Tpetra's behavior.
static bool debug()
Whether Tpetra is in debug mode.
static bool verbose()
Whether Tpetra is in verbose mode.
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row....
bool isLocallyComplete() const
Is this Export or Import locally complete?
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
bool isDistributed() const
Whether this is a globally distributed object.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
A parallel distribution of indices over processes.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
Accessors for the Teuchos::Comm and Kokkos Node objects.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
local_map_type getLocalMap() const
Get the local Map for Kokkos kernels.
One or more distributed dense vectors.
void reduce()
Sum values of a locally replicated multivector across all processes.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
size_t getLocalLength() const
Local number of rows on the calling process.
size_t getNumVectors() const
Number of columns in the multivector.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y....
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X....
A read-only, row-oriented interface to a sparse matrix.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
Abstract base class for objects that can be the source of an Import or Export operation.
A distributed dense vector.
Implementation details of Tpetra.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries....
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
LO getLocalDiagCopyWithoutOffsetsNotFillComplete(::Tpetra::Vector< SC, LO, GO, NT > &diag, const ::Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool debug=false)
Given a locally indexed, global sparse matrix, extract the matrix's diagonal entries into a Tpetra::V...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length....
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps,...
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator,...
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void sort(View &view, const size_t &size)
Convenience wrapper for std::sort for host-accessible views.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2)
Sort the first array, and apply the resulting permutation to the second array.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
size_t global_size_t
Global size_t object.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember function that creates a one-to-one version of the given Map, in which each global index is owned by only one process.
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ADD
Sum new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ ADD_ASSIGN
Accumulate new values into existing values (may not be supported in all classes)
@ INSERT
Insert new values that don't currently exist.
@ ZERO
Replace old values with zero.
Functor for the ABSMAX CombineMode of Import and Export operations.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.