Tpetra parallel linear algebra  Version of the Day
Kokkos_MV_GEMM.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos: Node API and Parallel Node Kernels
6 // Copyright (2008) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_MV_GEMM_HPP
45 #define KOKKOS_MV_GEMM_HPP
46 
47 // Note this code lives only temporarily in TpetraCore. As soon as
48 // GEMM kernels exist in the TpetraKernels subpackage, and thus a
49 // dependency on Teuchos can be eliminated, the code will move to
50 // TpetraKernels.
51 
52 #include <Teuchos_BLAS.hpp>
53 #include <Kokkos_Blas2_MV.hpp>
54 #include "Tpetra_Details_gemm.hpp"
55 
56 namespace Teuchos {
57 
58  // mfh 11 Nov 2014: The DeviceGEMM specializations below need to be
59  // able to use Teuchos::BLAS::{GEMM, GEMV}. We provide just enough
60  // of a specialization for Kokkos::complex<{float, double}> to make
61  // DeviceGEMM work. They just defer to BLAS<int,
62  // std::complex<{float, double}> > via reinterpret_cast. Please
63  // feel free to expand these specializations if you need to.
64 
65  template<>
66  class BLAS<int, ::Kokkos::complex<float> > {
67  public:
68  typedef float mag_type;
69  typedef ::Kokkos::complex<float> val_type;
70  typedef std::complex<float> impl_type;
71 
72  BLAS () {}
73  BLAS (const BLAS<int, val_type>&) {}
74  virtual ~BLAS () {}
75 
76  // void ROTG (val_type* da, val_type* db, mag_type* c, val_type* s) const;
77  // void ROT (const int n, val_type* dx, const int incx, val_type* dy, const int incy, RealType* c, val_type* s) const;
78  // RealType ASUM (const int n, const val_type* x, const int incx) const;
79  //void AXPY (const int n, const val_type alpha, const val_type* x, const int incx, val_type* y, const int incy) const;
80  //void COPY (const int n, const val_type* x, const int incx, val_type* y, const int incy) const;
81  //val_type DOT(const int n, const val_type* x, const int incx, const val_type* y, const int incy) const;
82  //RealType NRM2(const int n, const val_type* x, const int incx) const;
83  //void SCAL(const int n, const val_type alpha, val_type* x, const int incx) const;
84  //int IAMAX(const int n, const val_type* x, const int incx) const;
85 
86  void
87  GEMV (ETransp trans, const int m, const int n, const val_type alpha,
88  const val_type* A, const int lda, const val_type* x, const int incx,
89  const val_type beta, val_type* y, const int incy) const
90  {
91  BLAS<int, impl_type> blas;
92  blas.GEMV (trans, m, n, static_cast<impl_type> (alpha),
93  reinterpret_cast<const impl_type*> (A), lda,
94  reinterpret_cast<const impl_type*> (x), incx,
95  static_cast<impl_type> (beta),
96  reinterpret_cast<impl_type*> (y), incy);
97  }
98 
99  //void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const val_type* A, const int lda, val_type* x, const int incx) const;
100  //void GER(const int m, const int n, const val_type alpha, const val_type* x, const int incx, const val_type* y, const int incy, val_type* A, const int lda) const;
101 
102  void
103  GEMM (ETransp transa, ETransp transb, const int m, const int n, const int k,
104  const val_type alpha, const val_type* A, const int lda,
105  const val_type* B, const int ldb, const val_type beta, val_type* C,
106  const int ldc) const
107  {
108  BLAS<int, impl_type> blas;
109  blas.GEMM (transa, transb, m, n, k,
110  static_cast<impl_type> (alpha),
111  reinterpret_cast<const impl_type*> (A), lda,
112  reinterpret_cast<const impl_type*> (B), ldb,
113  static_cast<impl_type> (beta),
114  reinterpret_cast<impl_type*> (C), ldc);
115  }
116 
117  //void SYMM(ESide side, EUplo uplo, const int m, const int n, const val_type alpha, const val_type* A, const int lda, const val_type *B, const int ldb, const val_type beta, val_type *C, const int ldc) const;
118  //void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const val_type alpha, const val_type* A, const int lda, const val_type beta, val_type* C, const int ldc) const;
119  //void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
120  //void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
121  };
122 
123  template<>
124  class BLAS<int, ::Kokkos::complex<double> > {
125  public:
126  typedef double mag_type;
127  typedef ::Kokkos::complex<double> val_type;
128  typedef std::complex<double> impl_type;
129 
130  BLAS () {}
131  BLAS (const BLAS<int, val_type>&) {}
132  virtual ~BLAS () {}
133 
134  // void ROTG (val_type* da, val_type* db, mag_type* c, val_type* s) const;
135  // void ROT (const int n, val_type* dx, const int incx, val_type* dy, const int incy, RealType* c, val_type* s) const;
136  // RealType ASUM (const int n, const val_type* x, const int incx) const;
137  //void AXPY (const int n, const val_type alpha, const val_type* x, const int incx, val_type* y, const int incy) const;
138  //void COPY (const int n, const val_type* x, const int incx, val_type* y, const int incy) const;
139  //val_type DOT(const int n, const val_type* x, const int incx, const val_type* y, const int incy) const;
140  //RealType NRM2(const int n, const val_type* x, const int incx) const;
141  //void SCAL(const int n, const val_type alpha, val_type* x, const int incx) const;
142  //int IAMAX(const int n, const val_type* x, const int incx) const;
143 
144  void
145  GEMV (ETransp trans, const int m, const int n, const val_type alpha,
146  const val_type* A, const int lda, const val_type* x, const int incx,
147  const val_type beta, val_type* y, const int incy) const
148  {
149  BLAS<int, impl_type> blas;
150  blas.GEMV (trans, m, n, static_cast<impl_type> (alpha),
151  reinterpret_cast<const impl_type*> (A), lda,
152  reinterpret_cast<const impl_type*> (x), incx,
153  static_cast<impl_type> (beta),
154  reinterpret_cast<impl_type*> (y), incy);
155  }
156 
157  //void TRMV(EUplo uplo, ETransp trans, EDiag diag, const int n, const val_type* A, const int lda, val_type* x, const int incx) const;
158  //void GER(const int m, const int n, const val_type alpha, const val_type* x, const int incx, const val_type* y, const int incy, val_type* A, const int lda) const;
159 
160  void
161  GEMM (ETransp transa, ETransp transb, const int m, const int n, const int k,
162  const val_type alpha, const val_type* A, const int lda,
163  const val_type* B, const int ldb, const val_type beta, val_type* C,
164  const int ldc) const
165  {
166  BLAS<int, impl_type> blas;
167  blas.GEMM (transa, transb, m, n, k,
168  static_cast<impl_type> (alpha),
169  reinterpret_cast<const impl_type*> (A), lda,
170  reinterpret_cast<const impl_type*> (B), ldb,
171  static_cast<impl_type> (beta),
172  reinterpret_cast<impl_type*> (C), ldc);
173  }
174 
175  //void SYMM(ESide side, EUplo uplo, const int m, const int n, const val_type alpha, const val_type* A, const int lda, const val_type *B, const int ldb, const val_type beta, val_type *C, const int ldc) const;
176  //void SYRK(EUplo uplo, ETransp trans, const int n, const int k, const val_type alpha, const val_type* A, const int lda, const val_type beta, val_type* C, const int ldc) const;
177  //void TRMM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
178  //void TRSM(ESide side, EUplo uplo, ETransp transa, EDiag diag, const int m, const int n, const val_type alpha, const val_type* A, const int lda, val_type* B, const int ldb) const;
179  };
180 
181 } // namespace Teuchos
182 
183 
184 namespace Kokkos {
185  namespace Impl {
186 
/// \brief Column stride ("leading dimension") of a 2-D View.
///
/// \param A A View-like object that provides stride(size_t*),
///   dimension_0(), and dimension_1().
///
/// \return Entry 1 of the strides reported by A.stride() if A has
///   more than one column; otherwise A's number of rows.
template<class ViewType>
size_t getStride2DView (ViewType A) {
  size_t viewStrides[8];
  A.stride (viewStrides);

  // With at most one column, fall back to the number of rows
  // rather than using the reported column stride.
  if (A.dimension_1 () > 1) {
    return viewStrides[1];
  }
  return A.dimension_0 ();
}
193  }
194 
201  template <typename Scalar, typename DeviceType>
202  struct DeviceGEMM {
203  public:
204  static void
205  GEMM (const Teuchos::ETransp transA,
206  const Teuchos::ETransp transB,
207  const Scalar& alpha,
208  const View<const Scalar**, LayoutLeft, DeviceType>& A,
209  const View<const Scalar**, LayoutLeft, DeviceType>& B,
210  const Scalar& beta,
211  const View<Scalar**, LayoutLeft, DeviceType>& C)
212  {
213  const int n = static_cast<int> (C.dimension_1 ());
214 
215  // For some BLAS implementations (e.g., MKL), GEMM when B has
216  // one column may be signficantly less efficient than GEMV.
217  if (n == 1 && transB == Teuchos::NO_TRANS) {
218  const int lda = static_cast<int> (Impl::getStride2DView (A));
219  Teuchos::BLAS<int,Scalar> blas;
220  blas.GEMV (transA, A.dimension_0 (), A.dimension_1 (),
221  alpha, A.ptr_on_device (), lda,
222  B.ptr_on_device (), static_cast<int> (1),
223  beta, C.ptr_on_device (), static_cast<int> (1));
224  }
225  else {
226  const char ctransA = (transA == Teuchos::CONJ_TRANS ? 'C' :
227  (transA == Teuchos::TRANS ? 'T' : 'N'));
228  const char ctransB = (transB == Teuchos::CONJ_TRANS ? 'C' :
229  (transB == Teuchos::TRANS ? 'T' : 'N'));
230  ::Tpetra::Details::Blas::gemm (ctransA, ctransB, alpha, A, B, beta, C);
231  }
232  }
233  };
234 
235  // FIXME (mfh 10 May 2016) Temporary work-around for #243.
236  // Don't call MKL for this case.
237 #ifdef HAVE_KOKKOSKERNELS_MKL
238  template <typename DeviceType>
239  struct DeviceGEMM<double, DeviceType> {
240  public:
241  static void
242  GEMM (const Teuchos::ETransp transA,
243  const Teuchos::ETransp transB,
244  const double& alpha,
245  const View<const double**, LayoutLeft, DeviceType>& A,
246  const View<const double**, LayoutLeft, DeviceType>& B,
247  const double& beta,
248  const View<double**, LayoutLeft, DeviceType>& C)
249  {
250  const int n = static_cast<int> (C.dimension_1 ());
251 
252  // For some BLAS implementations (e.g., MKL), GEMM when B has
253  // one column may be signficantly less efficient than GEMV.
254  if (n == 1 && transB == Teuchos::NO_TRANS) {
255  char trans = 'N';
256  if (transA == Teuchos::TRANS) {
257  trans = 'T';
258  }
259  else if (transA == Teuchos::CONJ_TRANS) {
260  trans = 'C';
261  }
262  auto B_0 = Kokkos::subview (B, Kokkos::ALL (), 0);
263  auto C_0 = Kokkos::subview (C, Kokkos::ALL (), 0);
264  KokkosBlas::gemv (&trans, alpha, A, B_0, beta, C_0);
265  }
266  else {
267  const char ctransA = (transA == Teuchos::CONJ_TRANS ? 'C' :
268  (transA == Teuchos::TRANS ? 'T' : 'N'));
269  const char ctransB = (transB == Teuchos::CONJ_TRANS ? 'C' :
270  (transB == Teuchos::TRANS ? 'T' : 'N'));
271  ::Tpetra::Details::Blas::gemm (ctransA, ctransB,
272  alpha, A, B, beta, C);
273  }
274  }
275  };
276 #endif // HAVE_KOKKOSKERNELS_MKL
277 
278 #ifdef KOKKOS_HAVE_CUDA
279  template <typename Scalar>
280  struct DeviceGEMM<Scalar, Cuda> {
281  public:
282  static void
283  GEMM (const Teuchos::ETransp transA,
284  const Teuchos::ETransp transB,
285  const Scalar& alpha,
286  const View<const Scalar**, LayoutLeft, Cuda>& A,
287  const View<const Scalar**,LayoutLeft, Cuda>& B,
288  const Scalar& beta,
289  const View<Scalar**,LayoutLeft,Cuda>& C)
290  {
291  TEUCHOS_TEST_FOR_EXCEPTION
292  (true, std::logic_error, "DeviceGEMM: Kokkos::Cuda has no support "
293  "for GEMM operations over Scalar=" << Teuchos::typeName(alpha) << ".");
294  }
295  };
296 
297  template <>
298  struct DeviceGEMM<float,Cuda> {
299  public:
300  static void
301  GEMM (const Teuchos::ETransp transA,
302  const Teuchos::ETransp transB,
303  const float alpha,
304  const View<const float**,LayoutLeft,Cuda>& A,
305  const View<const float**,LayoutLeft,Cuda>& B,
306  const float beta,
307  const View<float**,LayoutLeft,Cuda>& C)
308  {
309  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' : 'T');
310  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
311 
312  ::Tpetra::Details::Blas::gemm (ctransA, ctransB,
313  alpha, A, B, beta, C);
314  }
315  };
316 
317  template <>
318  struct DeviceGEMM<double,Cuda> {
319  public:
320  static void
321  GEMM (const Teuchos::ETransp transA,
322  const Teuchos::ETransp transB,
323  const double alpha,
324  const View<const double**, LayoutLeft, Cuda>& A,
325  const View<const double**, LayoutLeft, Cuda>& B,
326  const double beta,
327  const View<double**, LayoutLeft, Cuda>& C)
328  {
329  const char ctransA = (transA == Teuchos::NO_TRANS ? 'N' : 'T');
330  const char ctransB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
331 
332  ::Tpetra::Details::Blas::gemm (ctransA, ctransB,
333  alpha, A, B, beta, C);
334  }
335  };
336 #endif // KOKKOS_HAVE_CUDA
337 
338 } // namespace Kokkos
339 #endif // KOKKOS_MV_GEMM_HPP
340 
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void GEMM(const char transA[], const char transB[], const CoefficientType &alpha, const ViewType1 &A, const ViewType2 &B, const CoefficientType &beta, const ViewType3 &C)
Small dense matrix-matrix multiply: C := alpha*A*B + beta*C
Class that provides GEMM for a particular Kokkos Device.
Declaration and definition of Tpetra::Details::Blas::gemm, an implementation detail of Tpetra::MultiVector.