30#if defined(SIRIUS_GPU)
32#include "core/acc/acc_lapack.hpp"
34#if defined(SIRIUS_MAGMA)
37#if defined(SIRIUS_GPU) and defined(SIRIUS_CUDA)
57#define linalg_msg_wrong_type "[" + std::string(__func__) + "] wrong type of linear algebra library: " + to_string(la_)
59const std::string linalg_msg_no_scalapack =
"not compiled with ScaLAPACK";
77 inline void axpy(
int n, T
const* alpha, T
const* x,
int incx, T* y,
int incy);
86 inline void gemm(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, T
const* alpha, T
const* A, ftn_int lda,
92 inline void gemm(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, T
const* alpha,
94 ftn_int ib, ftn_int jb, T
const* beta,
dmatrix<T>& C, ftn_int ic, ftn_int jc);
103 inline void hemm(
char side,
char uplo, ftn_int m, ftn_int n, T
const* alpha, T
const* A, ftn_len lda,
104 T
const* B, ftn_len ldb, T
const* beta, T* C, ftn_len ldc);
106 template <
typename T>
107 inline void trmm(
char side,
char uplo,
char transa, ftn_int m, ftn_int n, T
const* aplha, T
const* A, ftn_int lda,
115 inline void ger(ftn_int m, ftn_int n, T
const* alpha, T
const* x, ftn_int incx, T
const* y, ftn_int incy, T* A, ftn_int lda,
123 template <
typename T>
124 inline int potrf(ftn_int n, T* A, ftn_int lda, ftn_int
const* desca =
nullptr)
const;
127 template <
typename T>
128 inline int getrf(ftn_int m, ftn_int n, T* A, ftn_int lda, ftn_int* ipiv)
const;
131 template <
typename T>
132 inline int getrf(ftn_int m, ftn_int n,
dmatrix<T>& A, ftn_int ia, ftn_int ja, ftn_int* ipiv)
const;
134 template <
typename T>
135 inline int getrs(
char trans, ftn_int n, ftn_int nrhs, T
const* A, ftn_int lda, ftn_int* ipiv, T* B,
139 template <
typename T>
140 inline int sytrf(ftn_int n, T* A, ftn_int lda, ftn_int* ipiv)
const;
143 template <
typename T>
144 inline int sytrs(ftn_int n, ftn_int nrhs, T* A, ftn_int lda, ftn_int* ipiv, T* b, ftn_int ldb)
const;
151 template <
typename T>
152 inline int trtri(ftn_int n, T* A, ftn_int lda, ftn_int
const* desca =
nullptr)
const;
154 template <
typename T>
155 inline int getri(ftn_int n, T* A, ftn_int lda, ftn_int* ipiv)
const;
158 template <
typename T>
159 inline int sytri(ftn_int n, T* A, ftn_int lda, ftn_int* ipiv)
const;
162 template <
typename T>
165 std::vector<int> ipiv(n);
166 int info = this->
getrf(n, n, A.at(sddk::memory_t::host), A.ld(), &ipiv[0]);
168 std::printf(
"getrf returned %i\n", info);
172 info = this->getri(n, A.at(sddk::memory_t::host), A.ld(), &ipiv[0]);
174 std::printf(
"getri returned %i\n", info);
179 template <
typename T>
182 std::vector<int> ipiv(n);
183 int info = this->
sytrf(n, A.at(sddk::memory_t::host), A.ld(), &ipiv[0]);
185 std::printf(
"sytrf returned %i\n", info);
189 info = this->
sytri(n, A.at(sddk::memory_t::host), A.ld(), &ipiv[0]);
191 std::printf(
"sytri returned %i\n", info);
196 template <
typename T>
197 inline bool sysolve(ftn_int n, sddk::matrix<T> &A, sddk::mdarray<T, 1> &b)
const
199 std::vector<int> ipiv(n);
200 int info = this->
sytrf(n, A.at(sddk::memory_t::host), A.ld(), ipiv.data());
201 if (info)
return false;
203 info = this->
sytrs(n, 1, A.at(sddk::memory_t::host), A.ld(), ipiv.data(), b.at(sddk::memory_t::host), b.ld());
213 template <
typename T>
214 inline int gtsv(ftn_int n, ftn_int nrhs, T* dl, T* d, T* du, T* b, ftn_int ldb)
const;
217 template <
typename T>
218 inline int gesv(ftn_int n, ftn_int nrhs, T* A, ftn_int lda, T* B, ftn_int ldb)
const;
234 template <
typename T>
236 ftn_int ic, ftn_int jc)
const;
239 template <
typename T>
241 ftn_int ic, ftn_int jc)
const;
244 template <
typename T>
245 inline std::tuple<ftn_double, ftn_double, ftn_double> lartg(T f, T g)
const;
247 template <
typename T>
248 inline void geqrf(ftn_int m, ftn_int n,
dmatrix<T>& A, ftn_int ia, ftn_int ja);
257#if defined(SIRIUS_SCALAPACK)
260 ftn_double_complex z;
262 FORTRAN(pzgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), &z, &z, &lwork,
264 lwork =
static_cast<int>(z.real() + 1);
265 std::vector<ftn_double_complex> work(lwork);
266 std::vector<ftn_double_complex> tau(std::max(m, n));
267 FORTRAN(pzgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), tau.data(),
268 work.data(), &lwork, &info);
270 throw std::runtime_error(linalg_msg_no_scalapack);
275 if (A.comm().size() != 1) {
276 throw std::runtime_error(
"[geqrf] can't use lapack for distributed matrix; use scalapck instead");
279 ftn_double_complex z;
281 ftn_int lda = A.ld();
282 FORTRAN(zgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, &z, &z, &lwork, &info);
283 lwork =
static_cast<int>(z.real() + 1);
284 std::vector<ftn_double_complex> work(lwork);
285 std::vector<ftn_double_complex> tau(std::max(m, n));
286 FORTRAN(zgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, tau.data(), work.data(), &lwork, &info);
290 throw std::runtime_error(linalg_msg_wrong_type);
298wrap::geqrf<ftn_double>(ftn_int m, ftn_int n, dmatrix<ftn_double>& A, ftn_int ia, ftn_int ja)
302#if defined(SIRIUS_SCALAPACK)
307 FORTRAN(pdgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), &z, &z, &lwork,
309 lwork =
static_cast<int>(z + 1);
310 std::vector<ftn_double> work(lwork);
311 std::vector<ftn_double> tau(std::max(m, n));
312 FORTRAN(pdgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), tau.data(),
313 work.data(), &lwork, &info);
315 throw std::runtime_error(linalg_msg_no_scalapack);
320 if (A.comm().size() != 1) {
321 throw std::runtime_error(
"[geqrf] can't use lapack for distributed matrix; use scalapck instead");
326 ftn_int lda = A.ld();
327 FORTRAN(dgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, &z, &z, &lwork, &info);
328 lwork =
static_cast<int>(z + 1);
329 std::vector<ftn_double> work(lwork);
330 std::vector<ftn_double> tau(std::max(m, n));
331 FORTRAN(dgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, tau.data(), work.data(), &lwork, &info);
335 throw std::runtime_error(linalg_msg_wrong_type);
343wrap::geqrf<ftn_complex>(ftn_int m, ftn_int n, dmatrix<ftn_complex>& A, ftn_int ia, ftn_int ja)
347#if defined(SIRIUS_SCALAPACK)
352 FORTRAN(pcgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), &z, &z, &lwork,
354 lwork =
static_cast<int>(z.real() + 1);
355 std::vector<ftn_complex> work(lwork);
356 std::vector<ftn_complex> tau(std::max(m, n));
357 FORTRAN(pcgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), tau.data(),
358 work.data(), &lwork, &info);
360 throw std::runtime_error(linalg_msg_no_scalapack);
365 if (A.comm().size() != 1) {
366 throw std::runtime_error(
"[geqrf] can't use lapack for distributed matrix; use scalapck instead");
371 ftn_int lda = A.ld();
372 FORTRAN(cgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, &z, &z, &lwork, &info);
373 lwork =
static_cast<int>(z.real() + 1);
374 std::vector<ftn_complex> work(lwork);
375 std::vector<ftn_complex> tau(std::max(m, n));
376 FORTRAN(cgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, tau.data(), work.data(), &lwork, &info);
380 throw std::runtime_error(linalg_msg_wrong_type);
388wrap::geqrf<ftn_single>(ftn_int m, ftn_int n, dmatrix<ftn_single>& A, ftn_int ia, ftn_int ja)
392#if defined(SIRIUS_SCALAPACK)
397 FORTRAN(psgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), &z, &z, &lwork,
399 lwork =
static_cast<int>(z + 1);
400 std::vector<ftn_single> work(lwork);
401 std::vector<ftn_single> tau(std::max(m, n));
402 FORTRAN(psgeqrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), tau.data(),
403 work.data(), &lwork, &info);
405 throw std::runtime_error(linalg_msg_no_scalapack);
410 if (A.comm().size() != 1) {
411 throw std::runtime_error(
"[geqrf] can't use lapack for distributed matrix; use scalapck instead");
416 ftn_int lda = A.ld();
417 FORTRAN(sgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, &z, &z, &lwork, &info);
418 lwork =
static_cast<int>(z + 1);
419 std::vector<ftn_single> work(lwork);
420 std::vector<ftn_single> tau(std::max(m, n));
421 FORTRAN(sgeqrf)(&m, &n, A.at(sddk::memory_t::host, ia, ja), &lda, tau.data(), work.data(), &lwork, &info);
425 throw std::runtime_error(linalg_msg_wrong_type);
433wrap::axpy(
int n, ftn_double_complex
const* alpha, ftn_double_complex
const* x,
int incx, ftn_double_complex* y,
442 FORTRAN(zaxpy)(&n, alpha, x, &incx, y, &incy);
445#if defined(SIRIUS_GPU)
447 acc::blas::zaxpy(n,
reinterpret_cast<const acc_complex_double_t*
>(alpha),
448 reinterpret_cast<const acc_complex_double_t*
>(x), incx,
449 reinterpret_cast<acc_complex_double_t*
>(y), incy);
454 throw std::runtime_error(linalg_msg_wrong_type);
462wrap::gemm<ftn_single>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, ftn_single
const* alpha,
463 ftn_single
const* A, ftn_int lda, ftn_single
const* B, ftn_int ldb, ftn_single
const* beta,
464 ftn_single* C, ftn_int ldc, acc::stream_id sid)
const
475 (&transa, &transb, &m, &n, &k,
const_cast<float*
>(alpha),
const_cast<float*
>(A), &lda,
476 const_cast<float*
>(B), &ldb,
const_cast<float*
>(beta), C, &ldc, (ftn_len)1, (ftn_len)1);
480#if defined(SIRIUS_GPU)
481 acc::blas::sgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, sid());
483 throw std::runtime_error(
"not compiled with GPU blas support!");
488#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
489 acc::blas::xt::sgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
491 throw std::runtime_error(
"not compiled with cublasxt");
496 splablas::sgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
500 throw std::runtime_error(linalg_msg_wrong_type);
507inline void wrap::gemm<ftn_double>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, ftn_double
const* alpha,
508 ftn_double
const* A, ftn_int lda, ftn_double
const* B, ftn_int ldb,
509 ftn_double
const* beta, ftn_double* C, ftn_int ldc, acc::stream_id sid)
const
519 FORTRAN(dgemm)(&transa, &transb, &m, &n, &k,
const_cast<double*
>(alpha),
const_cast<double*
>(A), &lda,
520 const_cast<double*
>(B), &ldb,
const_cast<double*
>(beta), C, &ldc, (ftn_len)1, (ftn_len)1);
524#if defined(SIRIUS_GPU)
525 acc::blas::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, sid());
527 throw std::runtime_error(
"not compiled with GPU blas support!");
532#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
533 acc::blas::xt::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
535 throw std::runtime_error(
"not compiled with cublasxt");
541 splablas::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
545 throw std::runtime_error(linalg_msg_wrong_type);
552inline void wrap::gemm<ftn_complex>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, ftn_complex
const* alpha,
553 ftn_complex
const* A, ftn_int lda, ftn_complex
const* B, ftn_int ldb, ftn_complex
const *beta,
554 ftn_complex* C, ftn_int ldc, acc::stream_id sid)
const
565 (&transa, &transb, &m, &n, &k,
const_cast<ftn_complex*
>(alpha),
const_cast<ftn_complex*
>(A), &lda,
566 const_cast<ftn_complex*
>(B), &ldb,
const_cast<ftn_complex*
>(beta), C, &ldc, (ftn_len)1, (ftn_len)1);
570#if defined(SIRIUS_GPU)
571 acc::blas::cgemm(transa, transb, m, n, k,
reinterpret_cast<acc_complex_float_t const*
>(alpha),
572 reinterpret_cast<acc_complex_float_t const*
>(A), lda,
573 reinterpret_cast<acc_complex_float_t const*
>(B), ldb,
574 reinterpret_cast<acc_complex_float_t const*
>(beta),
575 reinterpret_cast<acc_complex_float_t*
>(C), ldc, sid());
577 throw std::runtime_error(
"not compiled with GPU blas support!");
582#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
583 acc::blas::xt::cgemm(transa, transb, m, n, k,
reinterpret_cast<acc_complex_float_t const*
>(alpha),
584 reinterpret_cast<acc_complex_float_t const*
>(A), lda,
585 reinterpret_cast<acc_complex_float_t const*
>(B), ldb,
586 reinterpret_cast<acc_complex_float_t const*
>(beta),
587 reinterpret_cast<acc_complex_float_t*
>(C), ldc);
589 throw std::runtime_error(
"not compiled with cublasxt");
594 splablas::cgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
598 throw std::runtime_error(linalg_msg_wrong_type);
605inline void wrap::gemm<ftn_double_complex>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k,
606 ftn_double_complex
const* alpha, ftn_double_complex
const* A, ftn_int lda,
607 ftn_double_complex
const* B, ftn_int ldb, ftn_double_complex
const *beta,
608 ftn_double_complex* C, ftn_int ldc, acc::stream_id sid)
const
618 FORTRAN(zgemm)(&transa, &transb, &m, &n, &k,
const_cast<ftn_double_complex*
>(alpha),
619 const_cast<ftn_double_complex*
>(A), &lda,
const_cast<ftn_double_complex*
>(B), &ldb,
620 const_cast<ftn_double_complex*
>(beta), C, &ldc, (ftn_len)1, (ftn_len)1);
624#if defined(SIRIUS_GPU)
625 acc::blas::zgemm(transa, transb, m, n, k,
reinterpret_cast<acc_complex_double_t const*
>(alpha),
626 reinterpret_cast<acc_complex_double_t const*
>(A), lda,
reinterpret_cast<acc_complex_double_t const*
>(B),
627 ldb,
reinterpret_cast<acc_complex_double_t const*
>(beta),
628 reinterpret_cast<acc_complex_double_t*
>(C), ldc, sid());
630 throw std::runtime_error(
"not compiled with GPU blas support!");
636#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
637 acc::blas::xt::zgemm(transa, transb, m, n, k,
reinterpret_cast<acc_complex_double_t const*
>(alpha),
638 reinterpret_cast<acc_complex_double_t const*
>(A), lda,
639 reinterpret_cast<acc_complex_double_t const*
>(B), ldb,
640 reinterpret_cast<acc_complex_double_t const*
>(beta),
641 reinterpret_cast<acc_complex_double_t*
>(C), ldc);
643 throw std::runtime_error(
"not compiled with cublasxt");
649 splablas::zgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
653 throw std::runtime_error(linalg_msg_wrong_type);
661wrap::gemm<ftn_single>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, ftn_single
const* alpha,
662 dmatrix<ftn_single>
const& A, ftn_int ia, ftn_int ja, dmatrix<ftn_single>
const& B,
663 ftn_int ib, ftn_int jb, ftn_single
const* beta, dmatrix<ftn_single>& C, ftn_int ic, ftn_int jc)
667#if defined(SIRIUS_SCALAPACK)
675 FORTRAN(psgemm)(&transa, &transb, &m, &n, &k, alpha, A.at(sddk::memory_t::host), &ia, &ja, A.descriptor(),
676 B.at(sddk::memory_t::host), &ib, &jb, B.descriptor(), beta, C.at(sddk::memory_t::host), &ic, &jc, C.descriptor(),
677 (ftn_len)1, (ftn_len)1);
679 throw std::runtime_error(linalg_msg_no_scalapack);
684 throw std::runtime_error(linalg_msg_wrong_type);
692wrap::gemm<ftn_double>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, ftn_double
const* alpha,
693 dmatrix<ftn_double>
const& A, ftn_int ia, ftn_int ja, dmatrix<ftn_double>
const& B,
694 ftn_int ib, ftn_int jb, ftn_double
const* beta, dmatrix<ftn_double>& C, ftn_int ic, ftn_int jc)
698#if defined(SIRIUS_SCALAPACK)
706 FORTRAN(pdgemm)(&transa, &transb, &m, &n, &k, alpha, A.at(sddk::memory_t::host), &ia, &ja, A.descriptor(),
707 B.at(sddk::memory_t::host), &ib, &jb, B.descriptor(), beta, C.at(sddk::memory_t::host), &ic, &jc, C.descriptor(),
708 (ftn_len)1, (ftn_len)1);
710 throw std::runtime_error(linalg_msg_no_scalapack);
715 throw std::runtime_error(linalg_msg_wrong_type);
723wrap::gemm<ftn_complex>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k, ftn_complex
const* alpha,
724 dmatrix<ftn_complex>
const& A, ftn_int ia, ftn_int ja, dmatrix<ftn_complex>
const& B,
725 ftn_int ib, ftn_int jb, ftn_complex
const* beta, dmatrix<ftn_complex>& C, ftn_int ic, ftn_int jc)
729#if defined(SIRIUS_SCALAPACK)
737 FORTRAN(pcgemm)(&transa, &transb, &m, &n, &k, alpha, A.at(sddk::memory_t::host), &ia, &ja, A.descriptor(),
738 B.at(sddk::memory_t::host), &ib, &jb, B.descriptor(), beta, C.at(sddk::memory_t::host), &ic, &jc, C.descriptor(),
739 (ftn_len)1, (ftn_len)1);
741 throw std::runtime_error(linalg_msg_no_scalapack);
746 throw std::runtime_error(linalg_msg_wrong_type);
754wrap::gemm<ftn_double_complex>(
char transa,
char transb, ftn_int m, ftn_int n, ftn_int k,
755 ftn_double_complex
const* alpha, dmatrix<ftn_double_complex>
const& A,
756 ftn_int ia, ftn_int ja, dmatrix<ftn_double_complex>
const& B,
757 ftn_int ib, ftn_int jb, ftn_double_complex
const* beta,
758 dmatrix<ftn_double_complex>& C, ftn_int ic, ftn_int jc)
762#if defined(SIRIUS_SCALAPACK)
770 FORTRAN(pzgemm)(&transa, &transb, &m, &n, &k, alpha, A.at(sddk::memory_t::host), &ia, &ja, A.descriptor(),
771 B.at(sddk::memory_t::host), &ib, &jb, B.descriptor(), beta, C.at(sddk::memory_t::host), &ic, &jc, C.descriptor(),
772 (ftn_len)1, (ftn_len)1);
774 throw std::runtime_error(linalg_msg_no_scalapack);
779 throw std::runtime_error(linalg_msg_wrong_type);
787wrap::hemm<ftn_complex>(
char side,
char uplo, ftn_int m, ftn_int n, ftn_complex
const* alpha, ftn_complex
const* A,
788 ftn_len lda, ftn_complex
const* B, ftn_len ldb, ftn_complex
const* beta, ftn_complex* C, ftn_len ldc)
798 (&side, &uplo, &m, &n,
const_cast<ftn_complex*
>(alpha),
const_cast<ftn_complex*
>(A), &lda,
799 const_cast<ftn_complex*
>(B), &ldb,
const_cast<ftn_complex*
>(beta), C, &ldc, (ftn_len)1, (ftn_len)1);
803 throw std::runtime_error(linalg_msg_wrong_type);
811wrap::hemm<ftn_double_complex>(
char side,
char uplo, ftn_int m, ftn_int n, ftn_double_complex
const* alpha,
812 ftn_double_complex
const* A, ftn_len lda, ftn_double_complex
const* B, ftn_len ldb,
813 ftn_double_complex
const* beta, ftn_double_complex* C, ftn_len ldc)
822 FORTRAN(zhemm)(&side, &uplo, &m, &n,
const_cast<ftn_double_complex*
>(alpha),
823 const_cast<ftn_double_complex*
>(A), &lda,
const_cast<ftn_double_complex*
>(B), &ldb,
824 const_cast<ftn_double_complex*
>(beta), C, &ldc, (ftn_len)1, (ftn_len)1);
828 throw std::runtime_error(linalg_msg_wrong_type);
835inline void wrap::ger<ftn_single>(ftn_int m, ftn_int n, ftn_single
const* alpha, ftn_single
const* x, ftn_int incx,
836 ftn_single
const* y, ftn_int incy, ftn_single* A, ftn_int lda, acc::stream_id sid)
const
840 FORTRAN(sger)(&m, &n,
const_cast<ftn_single*
>(alpha),
const_cast<ftn_single*
>(x), &incx,
841 const_cast<ftn_single*
>(y), &incy, A, &lda);
845#if defined(SIRIUS_GPU)
846 acc::blas::sger(m, n, alpha, x, incx, y, incy, A, lda, sid());
848 throw std::runtime_error(
"not compiled with GPU blas support!");
853 throw std::runtime_error(
"(s,c)ger is not implemented in cublasxt");
857 throw std::runtime_error(linalg_msg_wrong_type);
864inline void wrap::ger<ftn_double>(ftn_int m, ftn_int n, ftn_double
const* alpha, ftn_double
const* x, ftn_int incx,
865 ftn_double
const* y, ftn_int incy, ftn_double* A, ftn_int lda, acc::stream_id sid)
const
869 FORTRAN(dger)(&m, &n,
const_cast<ftn_double*
>(alpha),
const_cast<ftn_double*
>(x), &incx,
870 const_cast<ftn_double*
>(y), &incy, A, &lda);
874#if defined(SIRIUS_GPU)
875 acc::blas::dger(m, n, alpha, x, incx, y, incy, A, lda, sid());
877 throw std::runtime_error(
"not compiled with GPU blas support!");
882 throw std::runtime_error(
"(d,z)ger is not implemented in cublasxt");
886 throw std::runtime_error(linalg_msg_wrong_type);
893inline void wrap::trmm<ftn_double>(
char side,
char uplo,
char transa, ftn_int m, ftn_int n, ftn_double
const* alpha,
894 ftn_double
const* A, ftn_int lda, ftn_double* B, ftn_int ldb, acc::stream_id sid)
const
898 FORTRAN(dtrmm)(&side, &uplo, &transa,
"N", &m, &n,
const_cast<ftn_double*
>(alpha),
899 const_cast<ftn_double*
>(A), &lda, B, &ldb, (ftn_len)1, (ftn_len)1, (ftn_len)1, (ftn_len)1);
903#if defined(SIRIUS_GPU)
904 acc::blas::dtrmm(side, uplo, transa,
'N', m, n, alpha, A, lda, B, ldb, sid());
906 throw std::runtime_error(
"not compiled with GPU blas support!");
911#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
912 acc::blas::xt::dtrmm(side, uplo, transa,
'N', m, n, alpha, A, lda, B, ldb);
914 throw std::runtime_error(
"not compiled with cublasxt");
919 throw std::runtime_error(linalg_msg_wrong_type);
926inline void wrap::trmm<ftn_single>(
char side,
char uplo,
char transa, ftn_int m, ftn_int n, ftn_single
const* alpha,
927 ftn_single
const* A, ftn_int lda, ftn_single* B, ftn_int ldb, acc::stream_id sid)
const
931 FORTRAN(strmm)(&side, &uplo, &transa,
"N", &m, &n,
const_cast<ftn_single*
>(alpha),
932 const_cast<ftn_single*
>(A), &lda, B, &ldb, (ftn_len)1, (ftn_len)1, (ftn_len)1, (ftn_len)1);
936#if defined(SIRIUS_GPU)
937 acc::blas::strmm(side, uplo, transa,
'N', m, n, alpha, A, lda, B, ldb, sid());
939 throw std::runtime_error(
"not compiled with GPU blas support!");
944#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
945 acc::blas::xt::strmm(side, uplo, transa,
'N', m, n, alpha, A, lda, B, ldb);
947 throw std::runtime_error(
"not compiled with cublasxt");
952 throw std::runtime_error(linalg_msg_wrong_type);
959inline void wrap::trmm<ftn_double_complex>(
char side,
char uplo,
char transa, ftn_int m, ftn_int n,
960 ftn_double_complex
const* alpha, ftn_double_complex
const* A,
961 ftn_int lda, ftn_double_complex* B, ftn_int ldb, acc::stream_id sid)
const
965 FORTRAN(ztrmm)(&side, &uplo, &transa,
"N", &m, &n,
const_cast<ftn_double_complex*
>(alpha),
966 const_cast<ftn_double_complex*
>(A), &lda, B, &ldb, (ftn_len)1, (ftn_len)1,
967 (ftn_len)1, (ftn_len)1);
971#if defined(SIRIUS_GPU)
972 acc::blas::ztrmm(side, uplo, transa,
'N', m, n,
reinterpret_cast<acc_complex_double_t const*
>(alpha),
973 reinterpret_cast<acc_complex_double_t const*
>(A), lda,
974 reinterpret_cast<acc_complex_double_t*
>(B), ldb, sid());
976 throw std::runtime_error(
"not compiled with GPU blas support!");
981#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
982 acc::blas::xt::ztrmm(side, uplo, transa,
'N', m, n,
reinterpret_cast<acc_complex_double_t const*
>(alpha),
983 reinterpret_cast<acc_complex_double_t const*
>(A), lda,
reinterpret_cast<acc_complex_double_t*
>(B), ldb);
985 throw std::runtime_error(
"not compiled with cublasxt");
990 throw std::runtime_error(linalg_msg_wrong_type);
997inline void wrap::trmm<ftn_complex>(
char side,
char uplo,
char transa, ftn_int m, ftn_int n,
998 ftn_complex
const* alpha, ftn_complex
const* A,
999 ftn_int lda, ftn_complex* B, ftn_int ldb, acc::stream_id sid)
const
1004 (&side, &uplo, &transa,
"N", &m, &n,
const_cast<ftn_complex*
>(alpha),
const_cast<ftn_complex*
>(A), &lda, B,
1005 &ldb, (ftn_len)1, (ftn_len)1, (ftn_len)1, (ftn_len)1);
1009#if defined(SIRIUS_GPU)
1010 acc::blas::ctrmm(side, uplo, transa,
'N', m, n,
reinterpret_cast<acc_complex_float_t const*
>(alpha),
1011 reinterpret_cast<acc_complex_float_t const*
>(A), lda,
1012 reinterpret_cast<acc_complex_float_t*
>(B), ldb, sid());
1014 throw std::runtime_error(
"not compiled with GPU blas support!");
1019#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1020 acc::blas::xt::ctrmm(side, uplo, transa,
'N', m, n,
reinterpret_cast<acc_complex_float_t const*
>(alpha),
1021 reinterpret_cast<acc_complex_float_t const*
>(A), lda,
1022 reinterpret_cast<acc_complex_float_t*
>(B), ldb);
1024 throw std::runtime_error(
"not compiled with cublasxt");
1029 throw std::runtime_error(linalg_msg_wrong_type);
1036inline int wrap::potrf<ftn_double>(ftn_int n, ftn_double* A, ftn_int lda, ftn_int
const* desca)
const
1041 FORTRAN(dpotrf)(
"U", &n, A, &lda, &info, (ftn_len)1);
1046#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1047 return magma::dpotrf(
'U', n, A, lda);
1049 throw std::runtime_error(
"not compiled with magma");
1054#if defined(SIRIUS_SCALAPACK)
1055 assert(desca !=
nullptr);
1059 FORTRAN(pdpotrf)(
"U", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1);
1062 throw std::runtime_error(linalg_msg_no_scalapack);
1067#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1068 acc::cusolver::potrf<ftn_double>(n, A, lda);
1070 throw std::runtime_error(
"not compiled with CUDA");
1075 throw std::runtime_error(linalg_msg_wrong_type);
1083inline int wrap::potrf<ftn_single>(ftn_int n, ftn_single* A, ftn_int lda, ftn_int
const* desca)
const
1088 FORTRAN(spotrf)(
"U", &n, A, &lda, &info, (ftn_len)1);
1093#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1094 return magma::spotrf(
'U', n, A, lda);
1096 throw std::runtime_error(
"not compiled with magma");
1101#if defined(SIRIUS_SCALAPACK)
1102 assert(desca !=
nullptr);
1106 FORTRAN(pspotrf)(
"U", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1);
1109 throw std::runtime_error(linalg_msg_no_scalapack);
1114#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1115 acc::cusolver::potrf<ftn_single>(n, A, lda);
1117 throw std::runtime_error(
"not compiled with CUDA");
1122 throw std::runtime_error(linalg_msg_wrong_type);
1130inline int wrap::potrf<ftn_double_complex>(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int
const* desca)
const
1135 FORTRAN(zpotrf)(
"U", &n, A, &lda, &info, (ftn_len)1);
1140#if defined(SIRIUS_SCALAPACK)
1141 assert(desca !=
nullptr);
1145 FORTRAN(pzpotrf)(
"U", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1);
1148 throw std::runtime_error(linalg_msg_no_scalapack);
1153#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1154 return magma::zpotrf(
'U', n,
reinterpret_cast<magmaDoubleComplex*
>(A), lda);
1156 throw std::runtime_error(
"not compiled with magma");
1161#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1162 acc::cusolver::potrf<ftn_double_complex>(n, A, lda);
1164 throw std::runtime_error(
"not compiled with CUDA");
1169 throw std::runtime_error(linalg_msg_wrong_type);
1177inline int wrap::potrf<ftn_complex>(ftn_int n, ftn_complex* A, ftn_int lda, ftn_int
const* desca)
const
1182 FORTRAN(cpotrf)(
"U", &n, A, &lda, &info, (ftn_len)1);
1187#if defined(SIRIUS_SCALAPACK)
1188 assert(desca !=
nullptr);
1192 FORTRAN(pcpotrf)(
"U", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1);
1195 throw std::runtime_error(linalg_msg_no_scalapack);
1200#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1201 return magma::cpotrf(
'U', n,
reinterpret_cast<magmaFloatComplex*
>(A), lda);
1203 throw std::runtime_error(
"not compiled with magma");
1208#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1209 acc::cusolver::potrf<ftn_complex>(n, A, lda);
1211 throw std::runtime_error(
"not compiled with CUDA");
1216 throw std::runtime_error(linalg_msg_wrong_type);
1224inline int wrap::trtri<ftn_double>(ftn_int n, ftn_double* A, ftn_int lda, ftn_int
const* desca)
const
1229 FORTRAN(dtrtri)(
"U",
"N", &n, A, &lda, &info, (ftn_len)1, (ftn_len)1);
1234#if defined(SIRIUS_SCALAPACK)
1235 assert(desca !=
nullptr);
1239 FORTRAN(pdtrtri)(
"U",
"N", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1, (ftn_len)1);
1242 throw std::runtime_error(linalg_msg_no_scalapack);
1247#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1248 return magma::dtrtri(
'U', n, A, lda);
1250 throw std::runtime_error(
"not compiled with magma");
1255#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1256 acc::cusolver::trtri<ftn_double>(n, A, lda);
1258 throw std::runtime_error(
"not compiled with CUDA");
1263 throw std::runtime_error(linalg_msg_wrong_type);
1271inline int wrap::trtri<ftn_single>(ftn_int n, ftn_single* A, ftn_int lda, ftn_int
const* desca)
const
1276 FORTRAN(strtri)(
"U",
"N", &n, A, &lda, &info, (ftn_len)1, (ftn_len)1);
1281#if defined(SIRIUS_SCALAPACK)
1282 assert(desca !=
nullptr);
1286 FORTRAN(pstrtri)(
"U",
"N", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1, (ftn_len)1);
1289 throw std::runtime_error(linalg_msg_no_scalapack);
1294#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1295 return magma::strtri(
'U', n, A, lda);
1297 throw std::runtime_error(
"not compiled with magma");
1302#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1303 acc::cusolver::trtri<ftn_single>(n, A, lda);
1305 throw std::runtime_error(
"not compiled with CUDA");
1310 throw std::runtime_error(linalg_msg_wrong_type);
1318inline int wrap::trtri<ftn_double_complex>(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int
const* desca)
const
1323 FORTRAN(ztrtri)(
"U",
"N", &n, A, &lda, &info, (ftn_len)1, (ftn_len)1);
1328#if defined(SIRIUS_SCALAPACK)
1329 assert(desca !=
nullptr);
1333 FORTRAN(pztrtri)(
"U",
"N", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1, (ftn_len)1);
1336 throw std::runtime_error(linalg_msg_no_scalapack);
1341#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1342 return magma::ztrtri(
'U', n,
reinterpret_cast<magmaDoubleComplex*
>(A), lda);
1344 throw std::runtime_error(
"not compiled with magma");
1349#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1350 acc::cusolver::trtri<ftn_double_complex>(n, A, lda);
1352 throw std::runtime_error(
"not compiled with CUDA");
1357 throw std::runtime_error(linalg_msg_wrong_type);
1365inline int wrap::trtri<ftn_complex>(ftn_int n, ftn_complex* A, ftn_int lda, ftn_int
const* desca)
const
1370 FORTRAN(ctrtri)(
"U",
"N", &n, A, &lda, &info, (ftn_len)1, (ftn_len)1);
1375#if defined(SIRIUS_SCALAPACK)
1376 assert(desca !=
nullptr);
1380 FORTRAN(pctrtri)(
"U",
"N", &n, A, &ia, &ja,
const_cast<ftn_int*
>(desca), &info, (ftn_len)1, (ftn_len)1);
1383 throw std::runtime_error(linalg_msg_no_scalapack);
1388#if defined(SIRIUS_GPU) && defined(SIRIUS_MAGMA)
1389 return magma::ctrtri(
'U', n,
reinterpret_cast<magmaFloatComplex*
>(A), lda);
1391 throw std::runtime_error(
"not compiled with magma");
1396#if defined(SIRIUS_GPU) && defined(SIRIUS_CUDA)
1397 acc::cusolver::trtri<ftn_complex>(n, A, lda);
1399 throw std::runtime_error(
"not compiled with CUDA");
1404 throw std::runtime_error(linalg_msg_wrong_type);
1412inline int wrap::gtsv<ftn_double>(ftn_int n, ftn_int nrhs, ftn_double* dl, ftn_double* d, ftn_double* du,
1413 ftn_double* b, ftn_int ldb)
const
1418 FORTRAN(dgtsv)(&n, &nrhs, dl, d,
du, b, &ldb, &info);
1423 throw std::runtime_error(linalg_msg_wrong_type);
1431inline int wrap::gtsv<ftn_double_complex>(ftn_int n, ftn_int nrhs, ftn_double_complex* dl, ftn_double_complex* d,
1432 ftn_double_complex* du, ftn_double_complex* b, ftn_int ldb)
const
1437 FORTRAN(zgtsv)(&n, &nrhs, dl, d,
du, b, &ldb, &info);
1442 throw std::runtime_error(linalg_msg_wrong_type);
1450inline int wrap::gesv<ftn_double>(ftn_int n, ftn_int nrhs, ftn_double* A, ftn_int lda, ftn_double* B, ftn_int ldb)
const
1455 std::vector<ftn_int> ipiv(n);
1456 FORTRAN(dgesv)(&n, &nrhs, A, &lda, &ipiv[0], B, &ldb, &info);
1461 throw std::runtime_error(linalg_msg_wrong_type);
1469inline int wrap::gesv<ftn_double_complex>(ftn_int n, ftn_int nrhs, ftn_double_complex* A, ftn_int lda,
1470 ftn_double_complex* B, ftn_int ldb)
const
1475 std::vector<ftn_int> ipiv(n);
1476 FORTRAN(zgesv)(&n, &nrhs, A, &lda, &ipiv[0], B, &ldb, &info);
1481 throw std::runtime_error(linalg_msg_wrong_type);
1490inline int wrap::getrf<ftn_double>(ftn_int m, ftn_int n, ftn_double* A, ftn_int lda, ftn_int* ipiv)
const
1495 FORTRAN(dgetrf)(&m, &n, A, &lda, ipiv, &info);
1500 throw std::runtime_error(linalg_msg_wrong_type);
1509inline int wrap::getrf<ftn_double_complex>(ftn_int m, ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int* ipiv)
const
1514 FORTRAN(zgetrf)(&m, &n, A, &lda, ipiv, &info);
1519 throw std::runtime_error(linalg_msg_wrong_type);
1527inline int wrap::getrf<ftn_double_complex>(ftn_int m, ftn_int n, dmatrix<ftn_double_complex>& A,
1528 ftn_int ia, ftn_int ja, ftn_int* ipiv)
const
1532#if defined (SIRIUS_SCALAPACK)
1536 FORTRAN(pzgetrf)(&m, &n, A.at(sddk::memory_t::host), &ia, &ja,
const_cast<int*
>(A.descriptor()), ipiv, &info);
1539 throw std::runtime_error(linalg_msg_no_scalapack);
1544 throw std::runtime_error(linalg_msg_wrong_type);
1553wrap::getrs<ftn_double_complex>(
char trans, ftn_int n, ftn_int nrhs,
const ftn_double_complex* A, ftn_int lda,
1554 ftn_int* ipiv, ftn_double_complex* B, ftn_int ldb)
const
1559 FORTRAN(
zgetrs)(&trans, &n, &nrhs,
const_cast<ftn_double_complex*
>(A), &lda, ipiv, B, &ldb, &info);
1563#if defined(SIRIUS_GPU)
1565 return acc::lapack::getrs(trans, n, nrhs,
reinterpret_cast<const acc_complex_double_t*
>(A), lda, ipiv,
1566 reinterpret_cast<acc_complex_double_t*
>(B), ldb);
1571 throw std::runtime_error(linalg_msg_wrong_type);
1580wrap::tranc<ftn_complex>(ftn_int m, ftn_int n, dmatrix<ftn_complex>& A, ftn_int ia, ftn_int ja, dmatrix<ftn_complex>& C,
1581 ftn_int ic, ftn_int jc)
const
1585#if defined(SIRIUS_SCALAPACK)
1589 auto A_ptr = (A.num_rows_local() * A.num_cols_local() > 0) ? A.at(sddk::memory_t::host) :
nullptr;
1590 auto C_ptr = (C.num_rows_local() * C.num_cols_local() > 0) ? C.at(sddk::memory_t::host) :
nullptr;
1592 FORTRAN(pctranc)(&m, &n,
const_cast<ftn_complex*
>(&constant<ftn_complex>::one()),
1593 A_ptr, &ia, &ja, A.descriptor(),
1594 const_cast<ftn_complex*
>(&constant<ftn_complex>::zero()),
1595 C_ptr, &ic, &jc, C.descriptor());
1597 throw std::runtime_error(linalg_msg_no_scalapack);
1602 throw std::runtime_error(linalg_msg_wrong_type);
1609inline void wrap::tranu<ftn_double_complex>(ftn_int m, ftn_int n, dmatrix<ftn_double_complex>& A,
1610 ftn_int ia, ftn_int ja, dmatrix<ftn_double_complex>& C, ftn_int ic, ftn_int jc)
const
1614#if defined(SIRIUS_SCALAPACK)
1618 auto A_ptr = (A.num_rows_local() * A.num_cols_local() > 0) ? A.at(sddk::memory_t::host) :
nullptr;
1619 auto C_ptr = (C.num_rows_local() * C.num_cols_local() > 0) ? C.at(sddk::memory_t::host) :
nullptr;
1621 FORTRAN(pztranu)(&m, &n,
const_cast<ftn_double_complex*
>(&constant<ftn_double_complex>::one()),
1622 A_ptr, &ia, &ja, A.descriptor(),
1623 const_cast<ftn_double_complex*
>(&constant<ftn_double_complex>::zero()),
1624 C_ptr, &ic, &jc, C.descriptor());
1626 throw std::runtime_error(linalg_msg_no_scalapack);
1631 throw std::runtime_error(linalg_msg_wrong_type);
1638inline void wrap::tranc<ftn_double_complex>(ftn_int m, ftn_int n, dmatrix<ftn_double_complex>& A,
1639 ftn_int ia, ftn_int ja, dmatrix<ftn_double_complex>& C, ftn_int ic, ftn_int jc)
const
1643#if defined(SIRIUS_SCALAPACK)
1647 auto A_ptr = (A.num_rows_local() * A.num_cols_local() > 0) ? A.at(sddk::memory_t::host) :
nullptr;
1648 auto C_ptr = (C.num_rows_local() * C.num_cols_local() > 0) ? C.at(sddk::memory_t::host) :
nullptr;
1650 FORTRAN(pztranc)(&m, &n,
const_cast<ftn_double_complex*
>(&constant<ftn_double_complex>::one()),
1651 A_ptr, &ia, &ja, A.descriptor(),
1652 const_cast<ftn_double_complex*
>(&constant<ftn_double_complex>::zero()),
1653 C_ptr, &ic, &jc, C.descriptor());
1655 throw std::runtime_error(linalg_msg_no_scalapack);
1660 throw std::runtime_error(linalg_msg_wrong_type);
1667inline void wrap::tranc<ftn_single>(ftn_int m, ftn_int n, dmatrix<ftn_single>& A, ftn_int ia, ftn_int ja,
1668 dmatrix<ftn_single>& C, ftn_int ic, ftn_int jc)
const
1672#if defined(SIRIUS_SCALAPACK)
1676 auto A_ptr = (A.num_rows_local() * A.num_cols_local() > 0) ? A.at(sddk::memory_t::host) :
nullptr;
1677 auto C_ptr = (C.num_rows_local() * C.num_cols_local() > 0) ? C.at(sddk::memory_t::host) :
nullptr;
1679 FORTRAN(pstran)(&m, &n,
const_cast<ftn_single*
>(&constant<ftn_single>::one()), A_ptr,
1680 &ia, &ja, A.descriptor(),
const_cast<ftn_single*
>(&constant<ftn_single>::zero()),
1681 C_ptr, &ic, &jc, C.descriptor());
1683 throw std::runtime_error(linalg_msg_no_scalapack);
1688 throw std::runtime_error(linalg_msg_wrong_type);
1695inline void wrap::tranu<ftn_double>(ftn_int m, ftn_int n, dmatrix<ftn_double>& A, ftn_int ia, ftn_int ja,
1696 dmatrix<ftn_double>& C, ftn_int ic, ftn_int jc)
const
1700#if defined(SIRIUS_SCALAPACK)
1704 auto A_ptr = (A.num_rows_local() * A.num_cols_local() > 0) ? A.at(sddk::memory_t::host) :
nullptr;
1705 auto C_ptr = (C.num_rows_local() * C.num_cols_local() > 0) ? C.at(sddk::memory_t::host) :
nullptr;
1707 FORTRAN(pdtran)(&m, &n,
const_cast<ftn_double*
>(&constant<ftn_double>::one()), A_ptr,
1708 &ia, &ja, A.descriptor(),
const_cast<ftn_double*
>(&constant<ftn_double>::zero()),
1709 C_ptr, &ic, &jc, C.descriptor());
1711 throw std::runtime_error(linalg_msg_no_scalapack);
1716 throw std::runtime_error(linalg_msg_wrong_type);
1723inline void wrap::tranc<ftn_double>(ftn_int m, ftn_int n, dmatrix<ftn_double>& A, ftn_int ia, ftn_int ja,
1724 dmatrix<ftn_double>& C, ftn_int ic, ftn_int jc)
const
1728#if defined(SIRIUS_SCALAPACK)
1732 auto A_ptr = (A.num_rows_local() * A.num_cols_local() > 0) ? A.at(sddk::memory_t::host) :
nullptr;
1733 auto C_ptr = (C.num_rows_local() * C.num_cols_local() > 0) ? C.at(sddk::memory_t::host) :
nullptr;
1735 FORTRAN(pdtran)(&m, &n,
const_cast<ftn_double*
>(&constant<ftn_double>::one()), A_ptr,
1736 &ia, &ja, A.descriptor(),
const_cast<ftn_double*
>(&constant<ftn_double>::zero()),
1737 C_ptr, &ic, &jc, C.descriptor());
1739 throw std::runtime_error(linalg_msg_no_scalapack);
1744 throw std::runtime_error(linalg_msg_wrong_type);
1752inline int wrap::getri<ftn_double>(ftn_int n, ftn_double* A, ftn_int lda, ftn_int* ipiv)
const
1756 ftn_int nb = linalg_base::ilaenv(1,
"dgetri",
"U", n, -1, -1, -1);
1757 ftn_int lwork = n * nb;
1758 std::vector<ftn_double> work(lwork);
1761 FORTRAN(dgetri)(&n, A, &lda, ipiv, &work[0], &lwork, &info);
1766 throw std::runtime_error(linalg_msg_wrong_type);
1775inline int wrap::getri<ftn_double_complex>(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int* ipiv)
const
1779 ftn_int nb = linalg_base::ilaenv(1,
"zgetri",
"U", n, -1, -1, -1);
1780 ftn_int lwork = n * nb;
1781 std::vector<ftn_double_complex> work(lwork);
1784 FORTRAN(zgetri)(&n, A, &lda, ipiv, &work[0], &lwork, &info);
1789 throw std::runtime_error(linalg_msg_wrong_type);
1797inline int wrap::sytrf<ftn_double_complex>(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int* ipiv)
const
1801 ftn_int nb = linalg_base::ilaenv(1,
"zhetrf",
"U", n, -1, -1, -1);
1802 ftn_int lwork = n * nb;
1803 std::vector<ftn_double_complex> work(lwork);
1806 FORTRAN(zhetrf)(
"U", &n, A, &lda, ipiv, &work[0], &lwork, &info, (ftn_len)1);
1811 throw std::runtime_error(linalg_msg_wrong_type);
1819inline int wrap::sytrf<ftn_double>(ftn_int n, ftn_double* A, ftn_int lda, ftn_int* ipiv)
const
1823 ftn_int nb = linalg_base::ilaenv(1,
"dsytrf",
"U", n, -1, -1, -1);
1824 ftn_int lwork = n * nb;
1825 std::vector<ftn_double> work(lwork);
1828 FORTRAN(dsytrf)(
"U", &n, A, &lda, ipiv, &work[0], &lwork, &info, (ftn_len)1);
1833 throw std::runtime_error(linalg_msg_wrong_type);
1841inline int wrap::sytri<ftn_double>(ftn_int n, ftn_double* A, ftn_int lda, ftn_int* ipiv)
const
1845 std::vector<ftn_double> work(n);
1847 FORTRAN(dsytri)(
"U", &n, A, &lda, ipiv, &work[0], &info, (ftn_len)1);
1852 throw std::runtime_error(linalg_msg_wrong_type);
1860inline int wrap::sytrs<ftn_double>(ftn_int n, ftn_int nrhs, ftn_double* A, ftn_int lda, ftn_int* ipiv, ftn_double* b, ftn_int ldb)
const
1865 FORTRAN(dsytrs)(
"U", &n, &nrhs, A, &lda, ipiv, b, &ldb, &info, (ftn_len)1);
1870 throw std::runtime_error(linalg_msg_wrong_type);
1878inline int wrap::sytri<ftn_double_complex>(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int* ipiv)
const
1882 std::vector<ftn_double_complex> work(n);
1884 FORTRAN(zhetri)(
"U", &n, A, &lda, ipiv, &work[0], &info, (ftn_len)1);
1888 throw std::runtime_error(linalg_msg_wrong_type);
1896inline std::tuple<ftn_double, ftn_double, ftn_double> wrap::lartg(ftn_double f, ftn_double g)
const
1900 ftn_double cs, sn, r;
1901 FORTRAN(dlartg)(&f, &g, &cs, &sn, &r);
1902 return std::make_tuple(cs, sn, r);
1905 throw std::runtime_error(linalg_msg_wrong_type);
1911template <
typename T>
1912inline void check_hermitian(std::string
const& name, sddk::matrix<T>
const& mtrx,
int n = -1)
1914 assert(mtrx.size(0) == mtrx.size(1));
1916 double maxdiff = 0.0;
1921 n =
static_cast<int>(mtrx.size(0));
1924 for (
int i = 0; i < n; i++) {
1925 for (
int j = 0; j < n; j++) {
1926 double diff = std::abs(mtrx(i, j) -
std::conj(mtrx(j, i)));
1927 if (diff > maxdiff) {
1935 if (maxdiff > 1e-10) {
1936 std::stringstream s;
1937 s << name <<
" is not a symmetric or hermitian matrix" << std::endl
1938 <<
" maximum error: i, j : " << i0 <<
" " << j0 <<
" diff : " << maxdiff;
1944template <
typename T>
1945inline real_type<T> check_hermitian(dmatrix<T>& mtrx__,
int n__)
1947 real_type<T> max_diff{0};
1948 if (mtrx__.comm().size() != 1) {
1949 dmatrix<T> tmp(n__, n__, mtrx__.blacs_grid(), mtrx__.bs_row(), mtrx__.bs_col());
1951 for (
int i = 0; i < tmp.num_cols_local(); i++) {
1952 for (
int j = 0; j < tmp.num_rows_local(); j++) {
1953 max_diff = std::max(max_diff, std::abs(mtrx__(j, i) - tmp(j, i)));
1956 mtrx__.blacs_grid().comm().template allreduce<real_type<T>, mpi::op_t::max>(&max_diff, 1);
1958 for (
int i = 0; i < n__; i++) {
1959 for (
int j = 0; j < n__; j++) {
1960 max_diff = std::max(max_diff, std::abs(mtrx__(j, i) -
std::conj(mtrx__(i, j))));
1967template <
typename T>
1968inline double check_identity(dmatrix<T>& mtrx__,
int n__)
1970 real_type<T> max_diff{0};
1971 for (
int i = 0; i < mtrx__.num_cols_local(); i++) {
1972 int icol = mtrx__.icol(i);
1974 for (
int j = 0; j < mtrx__.num_rows_local(); j++) {
1975 int jrow = mtrx__.irow(j);
1978 max_diff = std::max(max_diff, std::abs(mtrx__(j, i) -
static_cast<real_type<T>
>(1.0)));
1980 max_diff = std::max(max_diff, std::abs(mtrx__(j, i)));
1986 mtrx__.comm().template allreduce<real_type<T>, mpi::op_t::max>(&max_diff, 1);
1990template <
typename T>
1991inline double check_diagonal(dmatrix<T>& mtrx__,
int n__, sddk::mdarray<double, 1>
const& diag__)
1994 for (
int i = 0; i < mtrx__.num_cols_local(); i++) {
1995 int icol = mtrx__.icol(i);
1997 for (
int j = 0; j < mtrx__.num_rows_local(); j++) {
1998 int jrow = mtrx__.irow(j);
2001 max_diff = std::max(max_diff, std::abs(mtrx__(j, i) - diag__[icol]));
2003 max_diff = std::max(max_diff, std::abs(mtrx__(j, i)));
2009 mtrx__.comm().template allreduce<double, mpi::op_t::max>(&max_diff, 1);
2017template <
typename T>
2021 if (!(kind__ == 0 || kind__ == 1)) {
2022 RTE_THROW(
"wrong 'kind' parameter");
2024 char c1 = kind__ == 0 ?
'N' :
'C';
2025 char c2 = kind__ == 0 ?
'C' :
'N';
2026 if (A__.comm().size() != 1) {
2041 U__.at(sddk::memory_t::host), U__.
ld(), A__.at(sddk::memory_t::host), A__.
ld(), &
constant<T>::zero(),
2042 tmp.at(sddk::memory_t::host), tmp.
ld());
2046 tmp.at(sddk::memory_t::host), tmp.
ld(), U__.at(sddk::memory_t::host), U__.
ld(), &
constant<T>::zero(),
2047 A__.at(sddk::memory_t::host), A__.
ld());
Interface to accelerators API.
Blas functions for execution on GPUs.
Interface to some BLAS/LAPACK functions.
Helper class to wrap stream id (integer number).
int bs_row() const
Row blocking factor.
int bs_col() const
Column blocking factor.
int gesv(ftn_int n, ftn_int nrhs, T *A, ftn_int lda, T *B, ftn_int ldb) const
Compute the solution to system of linear equations A * X = B for general matrix.
int sytri(ftn_int n, T *A, ftn_int lda, ftn_int *ipiv) const
Inversion of factorized symmetric triangular matrix.
void geinv(ftn_int n, sddk::matrix< T > &A) const
Invert a general matrix.
void gemm(char transa, char transb, ftn_int m, ftn_int n, ftn_int k, T const *alpha, dmatrix< T > const &A, ftn_int ia, ftn_int ja, dmatrix< T > const &B, ftn_int ib, ftn_int jb, T const *beta, dmatrix< T > &C, ftn_int ic, ftn_int jc)
Distributed general matrix-matrix multiplication.
void tranc(ftn_int m, ftn_int n, dmatrix< T > &A, ftn_int ia, ftn_int ja, dmatrix< T > &C, ftn_int ic, ftn_int jc) const
Conjugate transpose matrix.
int sytrf(ftn_int n, T *A, ftn_int lda, ftn_int *ipiv) const
U*D*U^H factorization of hermitian or symmetric matrix.
void tranu(ftn_int m, ftn_int n, dmatrix< T > &A, ftn_int ia, ftn_int ja, dmatrix< T > &C, ftn_int ic, ftn_int jc) const
Transpose matrix without conjugation.
int trtri(ftn_int n, T *A, ftn_int lda, ftn_int const *desca=nullptr) const
Inversion of a triangular matrix.
int gtsv(ftn_int n, ftn_int nrhs, T *dl, T *d, T *du, T *b, ftn_int ldb) const
Compute the solution to system of linear equations A * X = B for general tri-diagonal matrix.
int sytrs(ftn_int n, ftn_int nrhs, T *A, ftn_int lda, ftn_int *ipiv, T *b, ftn_int ldb) const
solve Ax=b in place of b where A is factorized with sytrf.
void hemm(char side, char uplo, ftn_int m, ftn_int n, T const *alpha, T const *A, ftn_len lda, T const *B, ftn_len ldb, T const *beta, T *C, ftn_len ldc)
Hermitian matrix times a general matrix or vice versa.
int getrf(ftn_int m, ftn_int n, T *A, ftn_int lda, ftn_int *ipiv) const
LU factorization of general matrix.
void gemm(char transa, char transb, ftn_int m, ftn_int n, ftn_int k, T const *alpha, T const *A, ftn_int lda, T const *B, ftn_int ldb, T const *beta, T *C, ftn_int ldc, acc::stream_id sid=acc::stream_id(-1)) const
General matrix-matrix multiplication.
void axpy(int n, T const *alpha, T const *x, int incx, T *y, int incy)
vector addition
int potrf(ftn_int n, T *A, ftn_int lda, ftn_int const *desca=nullptr) const
Cholesky factorization.
int getrf(ftn_int m, ftn_int n, dmatrix< T > &A, ftn_int ia, ftn_int ja, ftn_int *ipiv) const
LU factorization of general matrix.
uint32_t ld() const
Return leading dimension size.
Interface to CUDA eigen-solver library.
Contains definition and implementation of distributed matrix class.
bool is_set_device_id()
check if device id has been set properly
Interface to SPLA library.
Interface to some of the MAGMA functions.
Memory management functions and classes.
void zgetrs(rocblas_handle handle, char trans, int n, int nrhs, acc_complex_double_t *A, int lda, const int *devIpiv, acc_complex_double_t *B, int ldb)
Linear Solvers.
int num_devices()
Get the number of devices.
int get_device_id()
Get current device ID.
lib_t
Type of linear algebra backend library.
@ cublasxt
cuBlasXt (cuBlas with CPU pointers and large matrices support)
@ scalapack
CPU ScaLAPACK.
@ spla
SPLA library. Can take CPU and device pointers.
@ magma
MAGMA with CPU pointers.
@ gpublas
GPU BLAS (cuBlas or ROCblas)
void unitary_similarity_transform(int kind__, dmatrix< T > &A__, dmatrix< T > const &U__, int n__)
int get_device_id(int num_devices__)
Get GPU device id associated with the current rank.
Namespace of the SIRIUS library.
auto conj(double x__)
Return complex conjugate of a number. For a real value this is the number itself.