67 using ExecutionSpace =
typename DeviceType::execution_space;
68 using TeamPolicy = Kokkos::TeamPolicy<ExecutionSpace>;
69 using TeamMember =
typename TeamPolicy::member_type;
71 using IntegralViewType = Kokkos::View<typename RankExpander<Scalar, integralViewRank>::value_type, DeviceType>;
72 IntegralViewType integralView_;
79 int leftComponentSpan_;
80 int rightComponentSpan_;
81 int numTensorComponents_;
82 int leftFieldOrdinalOffset_;
83 int rightFieldOrdinalOffset_;
84 bool forceNonSpecialized_;
86 size_t fad_size_output_ = 0;
88 Kokkos::Array<int, 7> offsetsForComponentOrdinal_;
92 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldBounds_;
93 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldBounds_;
94 Kokkos::Array<int,Parameters::MaxTensorComponents> pointBounds_;
96 Kokkos::Array<int,Parameters::MaxTensorComponents> leftFieldRelativeEnumerationSpans_;
97 Kokkos::Array<int,Parameters::MaxTensorComponents> rightFieldRelativeEnumerationSpans_;
128 INTREPID2_TEST_FOR_EXCEPTION(numTensorComponents_ != rightComponent_.numTensorComponents(), std::invalid_argument,
"Left and right components must have matching number of tensorial components");
136 for (
int r=0;
r<numTensorComponents_;
r++)
138 leftFieldBounds_[
r] = leftComponent_.getTensorComponent(
r).extent_int(
FIELD_DIM);
139 maxFieldsLeft_ = std::max(maxFieldsLeft_, leftFieldBounds_[
r]);
140 rightFieldBounds_[
r] = rightComponent_.getTensorComponent(
r).extent_int(
FIELD_DIM);
141 maxFieldsRight_ = std::max(maxFieldsRight_, rightFieldBounds_[
r]);
142 pointBounds_[
r] = leftComponent_.getTensorComponent(
r).extent_int(
POINT_DIM);
143 maxPointCount_ = std::max(maxPointCount_, pointBounds_[
r]);
166 const int R = numTensorComponents_ - 1;
173 for (
int r=
R-1;
r>0;
r--)
186 template<
size_t maxComponents,
size_t numComponents = maxComponents>
188 int incrementArgument( Kokkos::Array<int,maxComponents> &
arguments,
189 const Kokkos::Array<int,maxComponents> &
bounds)
const
191 if (numComponents == 0)
197 int r =
static_cast<int>(numComponents - 1);
212 const Kokkos::Array<int,Parameters::MaxTensorComponents> &
bounds,
213 const int &numComponents)
const
215 if (numComponents == 0)
return -1;
216 int r =
static_cast<int>(numComponents - 1);
227 template<
size_t maxComponents,
size_t numComponents = maxComponents>
229 int nextIncrementResult(
const Kokkos::Array<int,maxComponents> &
arguments,
230 const Kokkos::Array<int,maxComponents> &
bounds)
const
232 if (numComponents == 0)
238 int r =
static_cast<int>(numComponents - 1);
239 while (arguments[r] + 1 >= bounds[r])
249 KOKKOS_INLINE_FUNCTION
251 const Kokkos::Array<int,Parameters::MaxTensorComponents> &
bounds,
252 const int &numComponents)
const
254 if (numComponents == 0)
return -1;
255 int r = numComponents - 1;
264 template<
size_t maxComponents,
size_t numComponents = maxComponents>
266 int relativeEnumerationIndex(
const Kokkos::Array<int,maxComponents> &
arguments,
267 const Kokkos::Array<int,maxComponents> &
bounds,
271 if (numComponents == 0)
277 int enumerationIndex = 0;
278 for (
size_t r=numComponents-1; r>
static_cast<size_t>(startIndex); r--)
280 enumerationIndex += arguments[r];
281 enumerationIndex *= bounds[r-1];
283 enumerationIndex += arguments[startIndex];
284 return enumerationIndex;
298 KOKKOS_INLINE_FUNCTION
301 constexpr int numTensorComponents = 3;
303 Kokkos::Array<int,numTensorComponents>
pointBounds;
306 for (
unsigned r=0;
r<numTensorComponents;
r++)
317 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>
scratchView;
318 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>
pointWeights;
320 if (fad_size_output_ > 0) {
322 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (
teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
332 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (
teamMember.team_shmem(), composedTransform_.extent_int(1));
344 constexpr int R = numTensorComponents - 1;
366 const int maxFields = (maxFieldsLeft_ > maxFieldsRight_) ? maxFieldsLeft_ : maxFieldsRight_;
372 const int leftRank = leftTensorComponent_x.rank();
373 leftFields_x(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_x(fieldOrdinal,pointOrdinal) : leftTensorComponent_x(fieldOrdinal,pointOrdinal,a_component);
377 const int leftRank = leftTensorComponent_y.rank();
378 leftFields_y(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_y(fieldOrdinal,pointOrdinal) : leftTensorComponent_y(fieldOrdinal,pointOrdinal,a_component);
382 const int leftRank = leftTensorComponent_z.rank();
383 leftFields_z(fieldOrdinal,pointOrdinal) = (leftRank == 2) ? leftTensorComponent_z(fieldOrdinal,pointOrdinal) : leftTensorComponent_z(fieldOrdinal,pointOrdinal,a_component);
387 const int rightRank = rightTensorComponent_x.rank();
388 rightFields_x(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_x(fieldOrdinal,pointOrdinal) : rightTensorComponent_x(fieldOrdinal,pointOrdinal,b_component);
392 const int rightRank = rightTensorComponent_y.rank();
393 rightFields_y(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_y(fieldOrdinal,pointOrdinal) : rightTensorComponent_y(fieldOrdinal,pointOrdinal,b_component);
397 const int rightRank = rightTensorComponent_z.rank();
398 rightFields_z(fieldOrdinal,pointOrdinal) = (rightRank == 2) ? rightTensorComponent_z(fieldOrdinal,pointOrdinal) : rightTensorComponent_z(fieldOrdinal,pointOrdinal,b_component);
402 if (composedTransform_.underlyingMatchesLogical())
406 pointWeights(pointOrdinal) = composedTransformView(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
411 Kokkos::parallel_for(Kokkos::TeamThreadRange(
teamMember,0,composedTransform_.extent_int(1)), [&] (
const int&
pointOrdinal) {
412 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
441 for (
int i=0;
i<numTensorComponents;
i++)
480 for (
int iy=0;
iy<leftFieldBounds_[1];
iy++)
487 for (
int jy=0;
jy<rightFieldBounds_[1];
jy++)
507 for (
int ix=0;
ix<leftFieldBounds_[0];
ix++)
512 for (
int iy=0;
iy<leftFieldBounds_[1];
iy++)
518 for (
int jx=0;
jx<rightFieldBounds_[0];
jx++)
523 for (
int jy=0;
jy<rightFieldBounds_[1];
jy++)
580 template<
size_t numTensorComponents>
582 void run(
const TeamMember &
teamMember )
const
584 Kokkos::Array<int,numTensorComponents>
pointBounds;
587 for (
unsigned r=0;
r<numTensorComponents;
r++)
594 const int cellDataOrdinal = teamMember.league_rank();
595 const int numThreads = teamMember.team_size();
596 const int scratchViewSize = offsetsForComponentOrdinal_[0];
597 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> scratchView;
598 Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> pointWeights;
599 Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged> leftFields, rightFields;
600 if (fad_size_output_ > 0) {
601 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize * numThreads, fad_size_output_);
602 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1), fad_size_output_);
603 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_, fad_size_output_);
604 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_, fad_size_output_);
607 scratchView = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), scratchViewSize*numThreads);
608 pointWeights = Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged> (teamMember.team_shmem(), composedTransform_.extent_int(1));
609 leftFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsLeft_, maxPointCount_);
610 rightFields = Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>(teamMember.team_shmem(), numTensorComponents, maxFieldsRight_, maxPointCount_);
616 constexpr int R = numTensorComponents - 1;
618 for (
int a_component=0; a_component < leftComponentSpan_; a_component++)
620 const int a = a_offset_ + a_component;
621 for (
int b_component=0; b_component < rightComponentSpan_; b_component++)
623 const int b = b_offset_ + b_component;
625 const Data<Scalar,DeviceType> & leftFinalComponent = leftComponent_.getTensorComponent(R);
626 const Data<Scalar,DeviceType> & rightFinalComponent = rightComponent_.getTensorComponent(R);
628 const int numLeftFieldsFinal = leftFinalComponent.extent_int(0);
629 const int numRightFieldsFinal = rightFinalComponent.extent_int(0);
631 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numTensorComponents), [&] (
const int& r) {
632 const Data<Scalar,DeviceType> & leftTensorComponent = leftComponent_.getTensorComponent(r);
633 const Data<Scalar,DeviceType> & rightTensorComponent = rightComponent_.getTensorComponent(r);
634 const int leftFieldCount = leftTensorComponent.extent_int(0);
635 const int pointCount = leftTensorComponent.extent_int(1);
636 const int leftRank = leftTensorComponent.rank();
637 const int rightFieldCount = rightTensorComponent.extent_int(0);
638 const int rightRank = rightTensorComponent.rank();
639 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsLeft_; fieldOrdinal++)
642 const int fieldAddress = (fieldOrdinal < leftFieldCount) ? fieldOrdinal : leftFieldCount - 1;
643 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
645 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
646 leftFields(r,fieldAddress,pointAddress) = (leftRank == 2) ? leftTensorComponent(fieldAddress,pointAddress) : leftTensorComponent(fieldAddress,pointAddress,a_component);
649 for (
int fieldOrdinal=0; fieldOrdinal<maxFieldsRight_; fieldOrdinal++)
652 const int fieldAddress = (fieldOrdinal < rightFieldCount) ? fieldOrdinal : rightFieldCount - 1;
653 for (
int pointOrdinal=0; pointOrdinal<maxPointCount_; pointOrdinal++)
655 const int pointAddress = (pointOrdinal < pointCount) ? pointOrdinal : pointCount - 1;
656 rightFields(r,fieldAddress,pointAddress) = (rightRank == 2) ? rightTensorComponent(fieldAddress,pointAddress) : rightTensorComponent(fieldAddress,pointAddress,b_component);
661 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,composedTransform_.extent_int(1)), [&] (
const int& pointOrdinal) {
662 pointWeights(pointOrdinal) = composedTransform_(cellDataOrdinal,pointOrdinal,a,b) * cellMeasures_(cellDataOrdinal,pointOrdinal);
667 teamMember.team_barrier();
669 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,0,numLeftFieldsFinal * numRightFieldsFinal), [&] (
const int& leftRightFieldEnumeration) {
670 const int scratchOffsetForThread = teamMember.team_rank() * scratchViewSize;
671 const int i_R = leftRightFieldEnumeration % numLeftFieldsFinal;
672 const int j_R = leftRightFieldEnumeration / numLeftFieldsFinal;
676 Kokkos::Array<int,numTensorComponents> leftFieldArguments;
677 Kokkos::Array<int,numTensorComponents> rightFieldArguments;
678 rightFieldArguments[R] = j_R;
679 leftFieldArguments[R] = i_R;
682 for (
int i=scratchOffsetForThread; i<scratchOffsetForThread+scratchViewSize; i++)
684 scratchView(i) = 0.0;
686 Kokkos::Array<int,numTensorComponents> pointArguments;
687 for (
unsigned i=0; i<numTensorComponents; i++)
689 pointArguments[i] = 0;
696 const int pointBounds_R = pointBounds[R];
697 int & pointArgument_R = pointArguments[R];
698 for (pointArgument_R=0; pointArgument_R < pointBounds_R; pointArgument_R++)
703 G_R = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[R]);
707 const int leftFieldIndex = i_R + leftFieldOrdinalOffset_;
708 const int rightFieldIndex = j_R + rightFieldOrdinalOffset_;
710 if (integralViewRank == 3)
713 G_R = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
718 G_R = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
722 const int & pointIndex_R = pointArguments[R];
724 const Scalar & leftValue = leftFields(R,i_R,pointIndex_R);
725 const Scalar & rightValue = rightFields(R,j_R,pointIndex_R);
727 int pointEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(pointArguments, pointBounds, 0);
729 *G_R += leftValue * pointWeights(pointEnumerationIndex) * rightValue;
734 const int r_next = nextIncrementResult<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
735 const int r_min = (r_next >= 0) ? r_next : 0;
737 for (
int s=R-1; s>=r_min; s--)
739 const int & pointIndex_s = pointArguments[s];
742 for (
int s1=s; s1<R; s1++)
744 leftFieldArguments[s1] = 0;
748 const int & i_s = leftFieldArguments[s];
749 const int & j_s = rightFieldArguments[s];
752 const int & rightEnumerationSpan_s = rightFieldRelativeEnumerationSpans_[s];
753 const int & rightEnumerationSpan_sp = rightFieldRelativeEnumerationSpans_[s+1];
757 const int leftEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s);
760 const int leftEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(leftFieldArguments, leftFieldBounds, s+1);
762 const Scalar & leftValue = leftFields(s,i_s,pointIndex_s);
764 for (
int s1=s; s1<R; s1++)
766 rightFieldArguments[s1] = 0;
771 const int rightEnumerationIndex_s = relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s);
774 const int rightEnumerationIndex_sp = (s+1 == R) ? 0 : relativeEnumerationIndex<numTensorComponents,R>(rightFieldArguments, rightFieldBounds, s+1);
776 const Scalar & rightValue = rightFields(s,j_s,pointIndex_s);
778 const int G_s_index = leftEnumerationIndex_s * rightEnumerationSpan_s + rightEnumerationIndex_s;
784 const int G_sp_index = leftEnumerationIndex_sp * rightEnumerationSpan_sp + rightEnumerationIndex_sp;
786 const Scalar & G_sp = scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s+1] + G_sp_index);
791 G_s = &scratchView(scratchOffsetForThread + offsetsForComponentOrdinal_[s] + G_s_index);
796 int leftEnumerationIndex = relativeEnumerationIndex<numTensorComponents>( leftFieldArguments, leftFieldBounds, 0);
797 int rightEnumerationIndex = relativeEnumerationIndex<numTensorComponents>(rightFieldArguments, rightFieldBounds, 0);
798 const int leftFieldIndex = leftEnumerationIndex + leftFieldOrdinalOffset_;
799 const int rightFieldIndex = rightEnumerationIndex + rightFieldOrdinalOffset_;
801 if (integralViewRank == 3)
804 G_s = &integralView_.access(cellDataOrdinal,leftFieldIndex,rightFieldIndex);
809 G_s = &integralView_.access(leftFieldIndex,rightFieldIndex,0);
813 *G_s += leftValue * G_sp * rightValue;
818 sRight = incrementArgument<numTensorComponents,R>(rightFieldArguments, rightFieldBounds);
822 sLeft = incrementArgument<numTensorComponents,R>(leftFieldArguments, leftFieldBounds);
829 const int endIndex = scratchOffsetForThread + offsetsForComponentOrdinal_[r_min];
830 for (
int i=scratchOffsetForThread; i<endIndex; i++)
832 scratchView(i) = 0.0;
839 r = incrementArgument<numTensorComponents,numTensorComponents-1>(pointArguments, pointBounds);
847 KOKKOS_INLINE_FUNCTION
848 void operator()(
const TeamMember & teamMember )
const
850 switch (numTensorComponents_)
852 case 1: run<1>(teamMember);
break;
853 case 2: run<2>(teamMember);
break;
855 if (forceNonSpecialized_)
860 case 4: run<4>(teamMember);
break;
861 case 5: run<5>(teamMember);
break;
862 case 6: run<6>(teamMember);
break;
863 case 7: run<7>(teamMember);
break;
864#ifdef INTREPID2_HAVE_DEBUG
866 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(
true,std::invalid_argument,
"Unsupported component count");
877 const int R = numTensorComponents_ - 1;
902 Kokkos::Array<int,Parameters::MaxTensorComponents>
pointArguments;
903 for (
int i=0;
i<numTensorComponents_;
i++)
940 r = incrementArgument(
pointArguments, pointBounds_, numTensorComponents_);
953 const int R = numTensorComponents_ - 1;
964 if (fad_size_output_ > 0)
966 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(offsetsForComponentOrdinal_[0] *
team_size, fad_size_output_);
967 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1), fad_size_output_);
968 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_, fad_size_output_);
969 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_, fad_size_output_);
973 shmem_size += Kokkos::View<Scalar*, DeviceType>::shmem_size(offsetsForComponentOrdinal_[0] *
team_size);
974 shmem_size += Kokkos::View<Scalar*, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(composedTransform_.extent_int(1));
975 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsLeft_, maxPointCount_);
976 shmem_size += Kokkos::View<Scalar***, DeviceType, Kokkos::MemoryUnmanaged>::shmem_size(numTensorComponents_, maxFieldsRight_, maxPointCount_);
1840 double* approximateFlops)
1842 using ExecutionSpace =
typename DeviceType::execution_space;
1844 if (approximateFlops != NULL)
1846 *approximateFlops = 0;
1858 const int numFieldsLeft = vectorDataLeft.
numFields();
1859 const int numFieldsRight = vectorDataRight.
numFields();
1860 const int spaceDim = vectorDataLeft.
spaceDim();
1862 INTREPID2_TEST_FOR_EXCEPTION(vectorDataLeft.
spaceDim() != vectorDataRight.
spaceDim(), std::invalid_argument,
"vectorDataLeft and vectorDataRight must agree on the space dimension");
1868 int numTensorComponentsLeft = -1;
1869 const auto &refVectorLeft = vectorDataLeft.
vectorData();
1870 int numFamiliesLeft = refVectorLeft.numFamilies();
1871 int numVectorComponentsLeft = refVectorLeft.numComponents();
1872 Kokkos::Array<int,7> maxFieldsForComponentLeft {0,0,0,0,0,0,0};
1873 Kokkos::Array<int,7> maxFieldsForComponentRight {0,0,0,0,0,0,0};
1874 for (
int familyOrdinal=0; familyOrdinal<numFamiliesLeft; familyOrdinal++)
1876 for (
int vectorComponent=0; vectorComponent<numVectorComponentsLeft; vectorComponent++)
1881 if (numTensorComponentsLeft == -1)
1885 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != tensorData.
numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataLeft must have the same number of tensor components as every other");
1886 for (
int r=0; r<numTensorComponentsLeft; r++)
1893 int numTensorComponentsRight = -1;
1894 const auto &refVectorRight = vectorDataRight.
vectorData();
1895 int numFamiliesRight = refVectorRight.numFamilies();
1896 int numVectorComponentsRight = refVectorRight.numComponents();
1897 for (
int familyOrdinal=0; familyOrdinal<numFamiliesRight; familyOrdinal++)
1899 for (
int vectorComponent=0; vectorComponent<numVectorComponentsRight; vectorComponent++)
1901 const auto &tensorData = refVectorRight.getComponent(familyOrdinal,vectorComponent);
1902 if (tensorData.numTensorComponents() > 0)
1904 if (numTensorComponentsRight == -1)
1906 numTensorComponentsRight = tensorData.numTensorComponents();
1908 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsRight != tensorData.numTensorComponents(), std::invalid_argument,
"Each valid entry in vectorDataRight must have the same number of tensor components as every other");
1909 for (
int r=0; r<numTensorComponentsRight; r++)
1911 maxFieldsForComponentRight[r] = std::max(tensorData.getTensorComponent(r).extent_int(0), maxFieldsForComponentRight[r]);
1916 INTREPID2_TEST_FOR_EXCEPTION(numVectorComponentsLeft != numVectorComponentsRight, std::invalid_argument,
"Left and right vector entries must have the same number of tensorial components");
1919 if ((numPointTensorComponents == numTensorComponentsLeft) && vectorDataLeft.
axisAligned() && vectorDataRight.
axisAligned())
1928 Kokkos::Array<int,Parameters::MaxTensorComponents> pointDimensions;
1929 for (
int r=0; r<numPointTensorComponents; r++)
1935 Kokkos::Array< Kokkos::Array<ScalarView<Scalar,DeviceType>, Parameters::MaxTensorComponents>, Parameters::MaxTensorComponents> componentIntegrals;
1937 for (
int r=0; r<numPointTensorComponents; r++)
1939 for (
int d=0; d<spaceDim; d++)
1962 int leftComponentCount = leftComponent.
extent_int(0);
1963 int rightComponentCount = rightComponent.
extent_int(0);
1965 const bool allocateFadStorage = !std::is_pod<Scalar>::value;
1966 if (allocateFadStorage)
1969 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount, fad_size_output);
1973 componentIntegrals[r][d] = ScalarView<Scalar,DeviceType>(
"componentIntegrals for tensor component " + std::to_string(r) +
", in dimension " + std::to_string(d), leftComponentCount, rightComponentCount);
1976 Kokkos::deep_copy(componentIntegrals[r][d], 0.0);
1978 const int & numPoints = pointDimensions[r];
1979 const int leftComponentRank = leftComponent.
rank();
1980 const int rightComponentRank = rightComponent.
rank();
1982 const int leftComponentDimSpan = leftComponent.
extent_int(2);
1983 const int rightComponentDimSpan = rightComponent.
extent_int(2);
1984 INTREPID2_TEST_FOR_EXCEPTION(( leftComponentDimSpan != rightComponentDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
1990 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<2>>({0,0},{leftComponentCount,rightComponentCount});
1992 for (
int a=0; a<leftComponentDimSpan; a++)
1994 Kokkos::parallel_for(
"compute componentIntegrals", policy,
1995 KOKKOS_LAMBDA (
const int &i,
const int &j) {
1996 Scalar refSpaceIntegral = 0.0;
1997 for (
int ptOrdinal=0; ptOrdinal<numPoints; ptOrdinal++)
1999 const Scalar & leftValue = ( leftComponentRank == 2) ? leftComponent(i,ptOrdinal) : leftComponent(i,ptOrdinal,a);
2000 const Scalar & rightValue = (rightComponentRank == 2) ? rightComponent(j,ptOrdinal) : rightComponent(j,ptOrdinal,a);
2001 refSpaceIntegral += leftValue * rightValue * quadratureWeights(ptOrdinal);
2003 componentIntegrals[r][d](i,j) = refSpaceIntegral;
2007 if (approximateFlops != NULL)
2009 *approximateFlops += leftComponentCount*rightComponentCount*numPoints*(3);
2013 ExecutionSpace().fence();
2016 Kokkos::View<Scalar**, DeviceType> integralView2;
2017 Kokkos::View<Scalar***, DeviceType> integralView3;
2018 if (integralViewRank == 3)
2027 const int leftFamilyCount = vectorDataLeft.
vectorData().numFamilies();
2028 const int rightFamilyCount = vectorDataRight.
vectorData().numFamilies();
2030 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2032 bool haveLaunchedContributionToLeftFamily =
false;
2035 int leftFieldOffset = vectorDataLeft.
vectorData().familyFieldOrdinalOffset(leftFamilyOrdinal);
2037 const int leftVectorComponentCount = vectorDataLeft.
vectorData().numComponents();
2038 for (
int leftVectorComponentOrdinal = 0; leftVectorComponentOrdinal < leftVectorComponentCount; leftVectorComponentOrdinal++)
2040 const auto & leftComponent = vectorDataLeft.
vectorData().getComponent(leftFamilyOrdinal, leftVectorComponentOrdinal);
2041 if (!leftComponent.isValid())
2046 const int leftDimSpan = leftComponent.extent_int(2);
2048 const int leftComponentFieldCount = leftComponent.extent_int(0);
2050 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2052 bool haveLaunchedContributionToRightFamily =
false;
2055 int rightFieldOffset = vectorDataRight.
vectorData().familyFieldOrdinalOffset(rightFamilyOrdinal);
2057 const int rightVectorComponentCount = vectorDataRight.
vectorData().numComponents();
2058 for (
int rightVectorComponentOrdinal = 0; rightVectorComponentOrdinal < rightVectorComponentCount; rightVectorComponentOrdinal++)
2060 const auto & rightComponent = vectorDataRight.
vectorData().getComponent(rightFamilyOrdinal, rightVectorComponentOrdinal);
2061 if (!rightComponent.isValid())
2066 const int rightDimSpan = rightComponent.extent_int(2);
2068 const int rightComponentFieldCount = rightComponent.extent_int(0);
2071 if ((a_offset + leftDimSpan <= b_offset) || (b_offset + rightDimSpan <= a_offset))
2073 b_offset += rightDimSpan;
2079 INTREPID2_TEST_FOR_EXCEPTION(( a_offset != b_offset), std::logic_error,
"left and right dimension offsets should match.");
2080 INTREPID2_TEST_FOR_EXCEPTION(( leftDimSpan != rightDimSpan), std::invalid_argument,
"left and right components must span the same number of dimensions.");
2082 const int d_start = a_offset;
2083 const int d_end = d_start + leftDimSpan;
2085 if (haveLaunchedContributionToLeftFamily && haveLaunchedContributionToRightFamily)
2088 ExecutionSpace().fence();
2090 haveLaunchedContributionToLeftFamily =
true;
2091 haveLaunchedContributionToRightFamily =
true;
2093 Kokkos::Array<int,3> upperBounds {cellDataExtent,leftComponentFieldCount,rightComponentFieldCount};
2094 Kokkos::Array<int,3> lowerBounds {0,0,0};
2095 auto policy = Kokkos::MDRangePolicy<ExecutionSpace,Kokkos::Rank<3>>(lowerBounds, upperBounds);
2097 Kokkos::parallel_for(
"compute field integrals", policy,
2098 KOKKOS_LAMBDA (
const int &cellDataOrdinal,
const int &leftFieldOrdinal,
const int &rightFieldOrdinal) {
2099 const Scalar & cellMeasureWeight = cellMeasures.
getTensorComponent(0)(cellDataOrdinal);
2107 Scalar integralSum = 0.0;
2108 for (
int d=d_start; d<d_end; d++)
2110 const Scalar & transformLeft_d = vectorDataLeft.
transformWeight(cellDataOrdinal,0,d,d);
2111 const Scalar & transformRight_d = vectorDataRight.
transformWeight(cellDataOrdinal,0,d,d);
2113 const Scalar & leftRightTransform_d = transformLeft_d * transformRight_d;
2116 Scalar integral_d = 1.0;
2118 for (
int r=0; r<numPointTensorComponents; r++)
2120 integral_d *= componentIntegrals[r][d](leftTensorIterator.
argument(r),rightTensorIterator.
argument(r));
2123 integralSum += leftRightTransform_d * integral_d;
2126 const int i = leftFieldOrdinal + leftFieldOffset;
2127 const int j = rightFieldOrdinal + rightFieldOffset;
2129 if (integralViewRank == 3)
2131 integralView3(cellDataOrdinal,i,j) += cellMeasureWeight * integralSum;
2135 integralView2(i,j) += cellMeasureWeight * integralSum;
2140 b_offset += rightDimSpan;
2144 a_offset += leftDimSpan;
2148 if (approximateFlops != NULL)
2151 *approximateFlops += (2 + spaceDim * (3 + numPointTensorComponents)) * cellDataExtent * numFieldsLeft * numFieldsRight;
2159 const bool transposeLeft =
true;
2160 const bool transposeRight =
false;
2164 composedTransform.
storeMatMat(transposeLeft, leftTransform, transposeRight, rightTransform);
2170 if (approximateFlops != NULL)
2175 const int leftFamilyCount = vectorDataLeft. vectorData().numFamilies();
2176 const int rightFamilyCount = vectorDataRight.
vectorData().numFamilies();
2177 const int leftComponentCount = vectorDataLeft. vectorData().numComponents();
2178 const int rightComponentCount = vectorDataRight.
vectorData().numComponents();
2180 int leftFieldOrdinalOffset = 0;
2181 for (
int leftFamilyOrdinal=0; leftFamilyOrdinal<leftFamilyCount; leftFamilyOrdinal++)
2186 bool haveLaunchedContributionToCurrentFamilyLeft =
false;
2187 for (
int leftComponentOrdinal=0; leftComponentOrdinal<leftComponentCount; leftComponentOrdinal++)
2193 a_offset += vectorDataLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2197 int rightFieldOrdinalOffset = 0;
2198 for (
int rightFamilyOrdinal=0; rightFamilyOrdinal<rightFamilyCount; rightFamilyOrdinal++)
2202 bool haveLaunchedContributionToCurrentFamilyRight =
false;
2204 for (
int rightComponentOrdinal=0; rightComponentOrdinal<rightComponentCount; rightComponentOrdinal++)
2207 if (!rightComponent.
isValid())
2210 b_offset += vectorDataRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2214 INTREPID2_TEST_FOR_EXCEPTION_DEVICE_SAFE(leftComponent.
numTensorComponents() != rightComponent.
numTensorComponents(), std::invalid_argument,
"left TensorData and right TensorData have different number of tensor components. This is not supported.");
2216 const int vectorSize = getVectorSizeForHierarchicalParallelism<Scalar>();
2217 Kokkos::TeamPolicy<ExecutionSpace> policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,Kokkos::AUTO(),vectorSize);
2226 bool forceNonSpecialized =
false;
2227 bool usePointCacheForRank3Tensor =
true;
2231 if (haveLaunchedContributionToCurrentFamilyLeft && haveLaunchedContributionToCurrentFamilyRight)
2233 ExecutionSpace().fence();
2235 haveLaunchedContributionToCurrentFamilyLeft =
true;
2236 haveLaunchedContributionToCurrentFamilyRight =
true;
2238 if (integralViewRank == 2)
2242 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2244 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2245 const int teamSize = functor.teamSize(recommendedTeamSize);
2247 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2249 Kokkos::parallel_for( policy , functor,
"F_IntegratePointValueCache rank 2");
2251 if (approximateFlops != NULL)
2253 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2258 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 2>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2260 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2261 const int teamSize = functor.teamSize(recommendedTeamSize);
2263 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2265 Kokkos::parallel_for( policy , functor,
"F_Integrate rank 2");
2267 if (approximateFlops != NULL)
2269 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2273 else if (integralViewRank == 3)
2277 auto functor =
Impl::F_IntegratePointValueCache<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset);
2279 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2280 const int teamSize = functor.teamSize(recommendedTeamSize);
2282 policy = Kokkos::TeamPolicy<ExecutionSpace>(cellDataExtent,teamSize,vectorSize);
2284 Kokkos::parallel_for( policy , functor,
"F_IntegratePointValueCache rank 3");
2286 if (approximateFlops != NULL)
2288 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2293 auto functor =
Impl::F_Integrate<Scalar, DeviceType, 3>(integrals, leftComponent, composedTransform, rightComponent, cellMeasures, a_offset, b_offset, leftFieldOrdinalOffset, rightFieldOrdinalOffset, forceNonSpecialized);
2295 const int recommendedTeamSize = policy.team_size_recommended(functor,Kokkos::ParallelForTag());
2296 const int teamSize = functor.teamSize(recommendedTeamSize);
2298 policy = Kokkos::TeamPolicy<DeviceType>(cellDataExtent,teamSize,vectorSize);
2300 Kokkos::parallel_for( policy , functor,
"F_Integrate rank 3");
2302 if (approximateFlops != NULL)
2304 *approximateFlops += functor.approximateFlopCountPerCell() * integrals.
getDataExtent(0);
2308 b_offset += vectorDataRight.
vectorData().numDimsForComponent(rightComponentOrdinal);
2310 rightFieldOrdinalOffset += vectorDataRight.
vectorData().numFieldsInFamily(rightFamilyOrdinal);
2312 a_offset += vectorDataLeft.
vectorData().numDimsForComponent(leftComponentOrdinal);
2314 leftFieldOrdinalOffset += vectorDataLeft.
vectorData().numFieldsInFamily(leftFamilyOrdinal);
2321 ExecutionSpace().fence();