40#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43#include "TpetraCore_config.h"
44#include "Teuchos_Array.hpp"
45#include "Teuchos_ArrayView.hpp"
46#include "Teuchos_OrdinalTraits.hpp"
54#include "Kokkos_Core.hpp"
86namespace UnpackAndCombineCrsMatrixImpl {
97template<
class ST,
class LO,
class GO>
102 const char imports[],
106 const size_t bytes_per_value)
112 bool unpack_pids =
pids_out.size() > 0;
122 const size_t pids_len = unpack_pids ?
131 const char*
const pids_in = unpack_pids ? imports +
pids_beg :
nullptr;
142 Kokkos::pair<int, size_t>
p;
181template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
186 typedef typename local_matrix_type::value_type ST;
190 typedef typename DT::execution_space XS;
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and "
203 "LocalMatrix::ordinal_type must be the same.");
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
214 size_t bytes_per_value;
216 error_type error_code;
245 void operator()(member_type team_member)
const
248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
251 const LO
batch = team_member.league_rank();
263 const size_t buf_size = imports.size();
283#ifndef KOKKOS_ENABLE_SYCL
285 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
297#ifndef KOKKOS_ENABLE_SYCL
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
337#ifndef KOKKOS_ENABLE_SYCL
339 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
348 Kokkos::parallel_for(
368 if (combine_mode ==
ADD) {
373 (
void)local_matrix.sumIntoValues(
381 }
else if (combine_mode ==
REPLACE) {
386 (
void)local_matrix.replaceValues(
397#ifndef KOKKOS_ENABLE_SYCL
399 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
400 "At row %d, an unknown error occurred during unpack\n", (
int)
lid_no
403 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
408 team_member.team_barrier();
414 auto error_code_h = Kokkos::create_mirror_view_and_copy(
415 Kokkos::HostSpace(), error_code
422struct MaxNumEntTag {};
423struct TotNumEntTag {};
433template<
class LO,
class DT,
class BDT>
436 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
437 typedef Kokkos::View<const size_t*, DT> offsets_type;
438 typedef Kokkos::View<const char*, BDT> input_buffer_type;
444 num_packets_per_lid_type num_packets_per_lid;
445 offsets_type offsets;
446 input_buffer_type imports;
460 const size_t num_bytes = num_packets_per_lid(
i);
463 const char*
const in_buf = imports.data () + offsets(
i);
472 join (
const MaxNumEntTag,
476 if (dst < src) dst = src;
482 const size_t num_bytes = num_packets_per_lid(
i);
485 const char*
const in_buf = imports.data () + offsets(
i);
499template<
class LO,
class DT,
class BDT>
502 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
503 const Kokkos::View<const size_t*, DT>& offsets,
504 const Kokkos::View<const char*, BDT>& imports)
506 typedef typename DT::execution_space XS;
507 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
513 static_cast<LO
> (num_packets_per_lid.extent (0));
514 size_t max_num_ent = 0;
515 Kokkos::parallel_reduce (
"Max num entries in CRS",
528template<
class LO,
class DT,
class BDT>
531 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
532 const Kokkos::View<const size_t*, DT>& offsets,
533 const Kokkos::View<const char*, BDT>& imports)
535 typedef typename DT::execution_space XS;
536 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag>
range_policy;
541 static_cast<LO
> (num_packets_per_lid.extent (0));
542 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
551unpackRowCount(
const char imports[],
566 return static_cast<size_t>(num_ent_LO);
573template<
class View1,
class View2>
581 using LO =
typename View2::value_type;
587 batch_info(
batch, 0) =
static_cast<LO
>(
i);
592 return batch == batch_info.extent(0);
602template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
607 const Kokkos::View<const char*, BufferDeviceType>& imports,
608 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
612 using ST =
typename LocalMatrix::value_type;
615 using XS =
typename DT::execution_space;
617 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
618 "unpackAndCombineIntoCrsMatrix: ";
620 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
629 std::invalid_argument,
630 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
631 "static graph (i.e., was constructed with the CrsMatrix constructor "
632 "that takes a const CrsGraph pointer).");
635 std::invalid_argument,
636 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
637 "(i.e., was constructed with the CrsMatrix constructor that takes a "
638 "const CrsGraph pointer).");
642 std::invalid_argument,
643 prefix <<
"Invalid combine mode; should never get "
644 "here! Please report this bug to the Tpetra developers.");
650 std::invalid_argument,
652 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
666 Kokkos::View<LO*[2], DT> batch_info(
"",
num_batches);
669 Kokkos::parallel_reduce(
670 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0,
num_import_lids),
674 imports.data(), offsets(
i), num_packets_per_lid(
i)
700 const bool atomic = XS::concurrency() != 1;
716 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
718#if defined(KOKKOS_ENABLE_CUDA)
719 constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
721 constexpr bool is_cuda =
false;
723 if (!is_cuda ||
team_size == Teuchos::OrdinalTraits<size_t>::invalid())
732 auto error_code =
f.error();
736 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
740template<
class LocalMatrix,
class BufferDeviceType>
745 const Kokkos::View<const char*, BufferDeviceType>& imports,
746 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
749 using Kokkos::parallel_reduce;
750 typedef typename LocalMatrix::ordinal_type LO;
751 typedef typename LocalMatrix::device_type device_type;
752 typedef typename device_type::execution_space XS;
753 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
754 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> >
range_policy;
766 update +=
static_cast<size_t>(local_matrix.graph.row_map[
lid+1]
767 -local_matrix.graph.row_map[
lid]);
773 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
776 parallel_reduce(range_policy(0, num_items),
777 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
778 const LO lid = permute_from_lids(i);
779 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
780 - local_matrix.graph.row_map[lid]);
787 const size_type np = num_packets_per_lid.extent(0);
788 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
791 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
799template<
class LO,
class DT,
class BDT>
801setupRowPointersForRemotes(
804 const Kokkos::View<const char*, BDT>& imports,
805 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
808 using Kokkos::parallel_reduce;
809 typedef typename DT::execution_space XS;
811 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> >
range_policy;
814 const size_type
N = num_packets_per_lid.extent(0);
821 const size_t num_bytes = num_packets_per_lid(
i);
822 const size_t offset = offsets(
i);
835makeCrsRowPtrFromLengths(
839 using Kokkos::parallel_scan;
840 typedef typename DT::execution_space XS;
841 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
842 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> >
range_policy;
845 KOKKOS_LAMBDA(
const size_t&
i,
size_t& update,
const bool&
final) {
856template<
class LocalMatrix,
class LocalMap>
860 const typename PackTraits<int>::output_array_type& tgt_pids,
862 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
864 const typename PackTraits<int>::input_array_type& src_pids,
865 const LocalMatrix& local_matrix,
866 const LocalMap& local_col_map,
867 const size_t num_same_ids,
870 using Kokkos::parallel_for;
873 typedef typename DT::execution_space XS;
874 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
876 parallel_for(range_policy(0, num_same_ids),
877 KOKKOS_LAMBDA(
const size_t i) {
878 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
880 const LO src_lid =
static_cast<LO
>(i);
881 size_t src_row = local_matrix.graph.row_map(src_lid);
883 const LO tgt_lid =
static_cast<LO
>(i);
884 const size_t tgt_row = tgt_rowptr(tgt_lid);
886 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
887 - local_matrix.graph.row_map(src_lid);
888 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
890 for (
size_t j=local_matrix.graph.row_map(src_lid);
891 j<local_matrix.graph.row_map(src_lid+1); ++j) {
892 LO src_col = local_matrix.graph.entries(j);
893 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
894 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
895 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
901template<
class LocalMatrix,
class LocalMap>
903copyDataFromPermuteIDs(
905 const typename PackTraits<int>::output_array_type& tgt_pids,
907 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
909 const typename PackTraits<int>::input_array_type& src_pids,
912 const LocalMatrix& local_matrix,
913 const LocalMap& local_col_map,
916 using Kokkos::parallel_for;
919 typedef typename DT::execution_space XS;
920 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
921 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
923 const size_type num_permute_to_lids = permute_to_lids.extent(0);
925 parallel_for(range_policy(0, num_permute_to_lids),
926 KOKKOS_LAMBDA(
const size_t i) {
927 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
929 const LO src_lid = permute_from_lids(i);
930 const size_t src_row = local_matrix.graph.row_map(src_lid);
932 const LO tgt_lid = permute_to_lids(i);
933 const size_t tgt_row = tgt_rowptr(tgt_lid);
935 size_t nsr = local_matrix.graph.row_map(src_lid+1)
936 - local_matrix.graph.row_map(src_lid);
937 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
939 for (
size_t j=local_matrix.graph.row_map(src_lid);
940 j<local_matrix.graph.row_map(src_lid+1); ++j) {
941 LO src_col = local_matrix.graph.entries(j);
942 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
943 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
944 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
950template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
952unpackAndCombineIntoCrsArrays2(
954 const typename PackTraits<int>::output_array_type& tgt_pids,
956 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
959 const Kokkos::View<const char*, BufferDeviceType>& imports,
960 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
964 const size_t bytes_per_value)
967 using Kokkos::subview;
968 using Kokkos::MemoryUnmanaged;
969 using Kokkos::parallel_reduce;
970 using Kokkos::atomic_fetch_add;
975 typedef typename LocalMatrix::value_type ST;
976 typedef typename DT::execution_space XS;
977 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
978 typedef typename Kokkos::pair<size_type, size_type> slice;
979 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
981 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
982 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
983 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
985 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
988 const size_type num_import_lids = import_lids.size();
991 parallel_reduce (
"Unpack and combine into CRS",
992 range_policy (0, num_import_lids),
993 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
994 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
995 const size_t num_bytes = num_packets_per_lid(i);
996 const size_t offset = offsets(i);
997 if (num_bytes == 0) {
1001 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1002 if (num_ent == InvalidNum) {
1006 const LO lcl_row = import_lids(i);
1007 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1008 const size_t end_row = start_row + num_ent;
1010 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1011 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1012 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1014 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1015 imports.data(), offset, num_bytes,
1016 num_ent, bytes_per_value);
1019 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1020 const int pid = pids_out(j);
1021 pids_out(j) = (pid != my_pid) ? pid : -1;
1028template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1031 const LocalMatrix & local_matrix,
1032 const LocalMap & local_col_map,
1034 const Kokkos::View<const char*, BufferDeviceType>& imports,
1035 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1041 const typename PackTraits<int>::input_array_type& src_pids,
1042 const typename PackTraits<int>::output_array_type& tgt_pids,
1043 const size_t num_same_ids,
1044 const size_t tgt_num_rows,
1045 const size_t tgt_num_nonzeros,
1046 const int my_tgt_pid,
1047 const size_t bytes_per_value)
1050 using Kokkos::subview;
1051 using Kokkos::parallel_for;
1052 using Kokkos::MemoryUnmanaged;
1056 typedef typename DT::execution_space XS;
1057 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1058 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1059 typedef BufferDeviceType BDT;
1061 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1063 const size_t N = tgt_num_rows;
1067 const int my_pid = my_tgt_pid;
1070 parallel_for(range_policy(0, N+1),
1071 KOKKOS_LAMBDA(
const size_t i) {
1077 parallel_for(range_policy(0, num_same_ids),
1078 KOKKOS_LAMBDA(
const size_t i) {
1079 const LO tgt_lid =
static_cast<LO
>(i);
1080 const LO src_lid =
static_cast<LO
>(i);
1081 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1082 - local_matrix.graph.row_map(src_lid);
1087 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1088 parallel_for(range_policy(0, num_permute_to_lids),
1089 KOKKOS_LAMBDA(
const size_t i) {
1090 const LO tgt_lid = permute_to_lids(i);
1091 const LO src_lid = permute_from_lids(i);
1092 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1093 - local_matrix.graph.row_map(src_lid);
1098 const size_type num_import_lids = import_lids.extent(0);
1099 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1102#ifdef HAVE_TPETRA_DEBUG
1104 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1105 const bool condition =
1106 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1107 TEUCHOS_TEST_FOR_EXCEPTION
1108 (condition, std::logic_error, prefix
1109 <<
"The final offset in bytes " << nth_offset_h
1110 <<
" != imports.size() = " << imports.extent(0)
1111 <<
". Please report this bug to the Tpetra developers.");
1117 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1118 import_lids, imports, num_packets_per_lid, offsets);
1119 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1120 <<
" Error transferring data to target row pointers. "
1121 "Please report this bug to the Tpetra developers.");
1125 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1128 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1131 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1132 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1134 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1135 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1136 local_matrix, local_col_map, my_pid);
1138 if (imports.extent(0) <= 0) {
1142 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1143 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1144 local_matrix, local_col_map, my_pid, bytes_per_value);
1145 TEUCHOS_TEST_FOR_EXCEPTION(
1146 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1147 "should never happen. Please report this bug to the Tpetra developers.");
1193template<
typename ST,
typename LO,
typename GO,
typename Node>
1197 const Teuchos::ArrayView<const char>& imports,
1199 const Teuchos::ArrayView<const LO>&
importLIDs,
1204 typedef typename Node::device_type device_type;
1206 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1207 "Node::device_type and LocalMatrix::device_type must be the same.");
1225 imports.size(),
true,
"imports");
1227 auto local_matrix =
sourceMatrix.getLocalMatrixDevice();
1228 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1239 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1245template<
typename ST,
typename LO,
typename GO,
typename NT>
1247unpackCrsMatrixAndCombineNew(
1249 Kokkos::DualView<
char*,
1251 Kokkos::DualView<
size_t*,
1253 const Kokkos::DualView<
const LO*,
1261 using device_type =
typename crs_matrix_type::device_type;
1262 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1263 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1266 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1267 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1268 "must be the same.");
1273 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1275 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1276 auto import_lids_d = importLIDs.view_device ();
1278 if (imports.need_sync_device()) {
1279 imports.sync_device ();
1281 auto imports_d = imports.view_device ();
1283 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1284 auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1285 typedef decltype (local_col_map) local_map_type;
1287 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1288 local_matrix_device_type,
1291 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1292 import_lids_d, combineMode);
1350template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1354 const Teuchos::ArrayView<const LocalOrdinal> &
importLIDs,
1355 const Teuchos::ArrayView<const char> &imports,
1360 const Teuchos::ArrayView<const LocalOrdinal>&
permuteToLIDs,
1363 using Kokkos::MemoryUnmanaged;
1365 typedef typename Node::device_type DT;
1367 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1378 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1384 auto local_matrix =
sourceMatrix.getLocalMatrixDevice ();
1389 "permute_from_lids");
1392 imports.getRawPtr (),
1393 imports.size (),
true,
1399 "num_packets_per_lid");
1401 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1420template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1424 const Teuchos::ArrayView<const LocalOrdinal>&
importLIDs,
1425 const Teuchos::ArrayView<const char>& imports,
1429 const size_t numSameIDs,
1430 const Teuchos::ArrayView<const LocalOrdinal>&
permuteToLIDs,
1435 const Teuchos::ArrayView<size_t>&
CRS_rowptr,
1436 const Teuchos::ArrayView<GlobalOrdinal>&
CRS_colind,
1438 const Teuchos::ArrayView<const int>&
SourcePids,
1444 using Kokkos::deep_copy;
1446 using Teuchos::ArrayView;
1447 using Teuchos::outArg;
1448 using Teuchos::REDUCE_MAX;
1449 using Teuchos::reduceAll;
1453 typedef typename Node::device_type DT;
1456 typedef typename matrix_type::impl_scalar_type ST;
1459 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1463 std::invalid_argument,
prefix <<
"CRS_rowptr.size() = " <<
1484 auto local_matrix =
sourceMatrix.getLocalMatrixDevice();
1485 auto local_col_map =
sourceMatrix.getColMap()->getLocalMap();
1495 imports.size(),
true,
"imports");
1517#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1518 static_assert (! std::is_same<
1519 typename std::remove_const<
1520 typename std::decay<
1524 std::complex<double> >::value,
1525 "CRS_vals::value_type is std::complex<double>; this should never happen"
1526 ", since std::complex does not work in Kokkos::View objects.");
1531 CRS_vals.size(),
true,
"crs_vals");
1533#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1534 static_assert (! std::is_same<
1535 typename decltype (
crs_vals_d)::non_const_value_type,
1536 std::complex<double> >::value,
1537 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1538 "never happen, since std::complex does not work in Kokkos::View objects.");
1549 size_t bytes_per_value = 0;
1564 if (local_matrix.values.extent(0) > 0) {
1565 const ST&
val = local_matrix.values(0);
1571 Teuchos::reduceAll<int, size_t>(*(
sourceMatrix.getComm()),
1572 Teuchos::REDUCE_MAX,
1574 outArg(bytes_per_value));
1577#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1578 static_assert (! std::is_same<
1579 typename decltype (
crs_vals_d)::non_const_value_type,
1580 std::complex<double> >::value,
1581 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1582 "never happen, since std::complex does not work in Kokkos::View objects.");
1585 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1614#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1616 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1617 const CrsMatrix<ST, LO, GO, NT>&, \
1618 const Teuchos::ArrayView<const char>&, \
1619 const Teuchos::ArrayView<const size_t>&, \
1620 const Teuchos::ArrayView<const LO>&, \
1624 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1625 const CrsMatrix<ST, LO, GO, NT>&, \
1626 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1627 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1628 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1630 const CombineMode); \
1632 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1633 const CrsMatrix<ST, LO, GO, NT> &, \
1634 const Teuchos::ArrayView<const LO>&, \
1635 const Teuchos::ArrayView<const char>&, \
1636 const Teuchos::ArrayView<const size_t>&, \
1638 const CombineMode, \
1640 const Teuchos::ArrayView<const LO>&, \
1641 const Teuchos::ArrayView<const LO>&, \
1645 const Teuchos::ArrayView<size_t>&, \
1646 const Teuchos::ArrayView<GO>&, \
1647 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1648 const Teuchos::ArrayView<const int>&, \
1649 Teuchos::Array<int>&); \
1651 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1652 const CrsMatrix<ST, LO, GO, NT> &, \
1653 const Teuchos::ArrayView<const LO> &, \
1654 const Teuchos::ArrayView<const char> &, \
1655 const Teuchos::ArrayView<const size_t>&, \
1659 const Teuchos::ArrayView<const LO>&, \
1660 const Teuchos::ArrayView<const LO>&);
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Struct that holds views of the contents of a CrsMatrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Unpacks and combines a single row of the CrsMatrix.
int error() const
Host function for getting the error.