Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 119 additions & 1 deletion source/source_base/kernels/dsp/dsp_connector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,66 @@ void zgemm_mth_(const char* transa,
free_ht(bet);
} // zgemm that needn't malloc_ht or free_ht

/// @brief Double-precision complex GEMM on the DSP, with C staged through a malloc_ht buffer.
///
/// Evaluates C = alpha * op(A) * op(B) + beta * C in column-major order by calling
/// mt_hthread_zgemm. alpha, beta and C are copied into buffers obtained from
/// malloc_ht(..., cluster_id) before the call; C is copied back to the caller afterwards.
/// A and B are handed to the kernel exactly as received (they are not staged here, unlike
/// in cgemm_pack_mth_).
///
/// @param transa,transb  BLAS transpose flags, translated with convertBLASTranspose
/// @param m,n,k          GEMM dimensions (op(A) is m x k, op(B) is k x n, C is m x n)
/// @param alpha,beta     scalar factors
/// @param a,lda          matrix A and its leading dimension
/// @param b,ldb          matrix B and its leading dimension
/// @param c,ldc          matrix C (read and overwritten) and its leading dimension;
///                       only the (n-1)*ldc + m elements spanned by the m x n matrix are touched
/// @param cluster_id     forwarded to malloc_ht and mt_hthread_zgemm
void zgemm_pack_mth_(const char* transa,
                     const char* transb,
                     const int* m,
                     const int* n,
                     const int* k,
                     const std::complex<double>* alpha,
                     const std::complex<double>* a,
                     const int* lda,
                     const std::complex<double>* b,
                     const int* ldb,
                     const std::complex<double>* beta,
                     std::complex<double>* c,
                     const int* ldc,
                     int cluster_id)
{
    using zdouble = std::complex<double>;

    // BLAS quick return: an empty C leaves nothing to compute or copy, and it keeps
    // non-positive sizes from wrapping around in the size_t arithmetic below.
    if (*m <= 0 || *n <= 0)
    {
        return;
    }

    // Column-major footprint of an m x n matrix with stride ldc: n-1 full strides plus the
    // m rows of the last column. Using ldc * n instead would read/write past the end of a
    // sub-matrix view whose final column is not followed by ldc - m spare elements.
    const size_t c_elems = (static_cast<size_t>(*n) - 1) * static_cast<size_t>(*ldc) + static_cast<size_t>(*m);
    const size_t c_bytes = c_elems * sizeof(zdouble);

    zdouble* C_dsp = static_cast<zdouble*>(malloc_ht(c_bytes, cluster_id));
    zdouble* alp = static_cast<zdouble*>(malloc_ht(sizeof(zdouble), cluster_id));
    zdouble* bet = static_cast<zdouble*>(malloc_ht(sizeof(zdouble), cluster_id));

    // C is an in/out operand (beta * C), so its current contents must be staged too.
    memcpy(C_dsp, c, c_bytes);
    *alp = *alpha;
    *bet = *beta;

    mt_hthread_zgemm(CBLAS_ORDER::CblasColMajor,
                     convertBLASTranspose(transa),
                     convertBLASTranspose(transb),
                     *m,
                     *n,
                     *k,
                     alp,
                     a,
                     *lda,
                     b,
                     *ldb,
                     bet,
                     C_dsp,
                     *ldc,
                     cluster_id);

    // Hand the result back to the caller's buffer.
    memcpy(c, C_dsp, c_bytes);

    free_ht(C_dsp);
    free_ht(alp);
    free_ht(bet);
}

void cgemm_mth_(const char* transa,
const char* transb,
const int* m,
Expand Down Expand Up @@ -443,6 +503,64 @@ void cgemm_mth_(const char* transa,
free_ht(bet);
} // cgemm that needn't malloc_ht or free_ht

/// @brief Single-precision complex GEMM on the DSP, with all operands staged through
///        malloc_ht buffers.
///
/// Evaluates C = alpha * op(A) * op(B) + beta * C in column-major order by calling
/// mt_hthread_cgemm. A, B, C, alpha and beta are copied into buffers obtained from
/// malloc_ht(..., cluster_id); after the call C is copied back to the caller and every
/// staging buffer is released with free_ht.
///
/// @param transa,transb  BLAS transpose flags ('N'/'n' means not transposed), translated
///                       with convertBLASTranspose
/// @param m,n,k          GEMM dimensions (op(A) is m x k, op(B) is k x n, C is m x n)
/// @param alpha,beta     scalar factors
/// @param a,lda          matrix A and its leading dimension
/// @param b,ldb          matrix B and its leading dimension
/// @param c,ldc          matrix C (read and overwritten) and its leading dimension
/// @param cluster_id     forwarded to malloc_ht and mt_hthread_cgemm
///
/// Only the elements actually spanned by each matrix, (cols-1)*ld + rows, are copied.
void cgemm_pack_mth_(const char* transa,
                     const char* transb,
                     const int* m,
                     const int* n,
                     const int* k,
                     const std::complex<float>* alpha,
                     const std::complex<float>* a,
                     const int* lda,
                     const std::complex<float>* b,
                     const int* ldb,
                     const std::complex<float>* beta,
                     std::complex<float>* c,
                     const int* ldc,
                     int cluster_id)
{
    using cfloat = std::complex<float>;

    // BLAS quick return: an empty C leaves nothing to compute or copy, and it keeps
    // non-positive sizes from wrapping around in the size_t arithmetic below.
    if (*m <= 0 || *n <= 0)
    {
        return;
    }

    // Elements spanned by a column-major rows x cols matrix with leading dimension ld:
    // cols-1 full strides plus the rows of the last column. Using ld * cols instead would
    // run past the end of a sub-matrix view that has no spare storage after its last column.
    const auto span_elems = [](const int rows, const int cols, const int ld) -> size_t {
        if (rows <= 0 || cols <= 0)
        {
            return 0;
        }
        return (static_cast<size_t>(cols) - 1) * static_cast<size_t>(ld) + static_cast<size_t>(rows);
    };

    // Never ask malloc_ht for zero bytes (possible for A/B when k == 0).
    const auto stage_alloc = [cluster_id](const size_t elems) -> cfloat* {
        return static_cast<cfloat*>(malloc_ht((elems > 0 ? elems : 1) * sizeof(cfloat), cluster_id));
    };

    const bool transa_not = (transa[0] == 'N' || transa[0] == 'n');
    const bool transb_not = (transb[0] == 'N' || transb[0] == 'n');

    // A is stored m x k when not transposed and k x m otherwise; B likewise with (k, n).
    const size_t a_elems = transa_not ? span_elems(*m, *k, *lda) : span_elems(*k, *m, *lda);
    const size_t b_elems = transb_not ? span_elems(*k, *n, *ldb) : span_elems(*n, *k, *ldb);
    const size_t c_elems = span_elems(*m, *n, *ldc);

    cfloat* A_dsp = stage_alloc(a_elems);
    cfloat* B_dsp = stage_alloc(b_elems);
    cfloat* C_dsp = stage_alloc(c_elems);
    cfloat* alp = stage_alloc(1);
    cfloat* bet = stage_alloc(1);

    if (a_elems > 0)
    {
        memcpy(A_dsp, a, a_elems * sizeof(cfloat));
    }
    if (b_elems > 0)
    {
        memcpy(B_dsp, b, b_elems * sizeof(cfloat));
    }
    // C is an in/out operand (beta * C), so its current contents must be staged too.
    memcpy(C_dsp, c, c_elems * sizeof(cfloat));
    *alp = *alpha;
    *bet = *beta;

    mt_hthread_cgemm(CBLAS_ORDER::CblasColMajor,
                     convertBLASTranspose(transa),
                     convertBLASTranspose(transb),
                     *m,
                     *n,
                     *k,
                     static_cast<const void*>(alp),
                     static_cast<const void*>(A_dsp),
                     *lda,
                     static_cast<const void*>(B_dsp),
                     *ldb,
                     static_cast<const void*>(bet),
                     static_cast<void*>(C_dsp),
                     *ldc,
                     cluster_id);

    // Hand the result back to the caller's buffer.
    memcpy(c, C_dsp, c_elems * sizeof(cfloat));

    free_ht(A_dsp);
    free_ht(B_dsp);
    free_ht(C_dsp);
    free_ht(alp);
    free_ht(bet);
}

void sgemv_mth_(const char* transa,
const int* m,
const int* n,
Expand Down Expand Up @@ -570,4 +688,4 @@ void cgemv_mth_(const char* transa,
free_ht(alp);
free_ht(bet);
}
} // namespace mtfunc
} // namespace mtfunc
60 changes: 46 additions & 14 deletions source/source_base/kernels/dsp/dsp_connector.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,21 @@ void zgemm_mt_(const char* transa,
int cluster_id);

void cgemm_mt_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<float>* alpha,
const std::complex<float>* a,
const int* lda,
const std::complex<float>* b,
const int* ldb,
const std::complex<float>* beta,
std::complex<float>* c,
const int* ldc,
int cluster_id);
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<float>* alpha,
const std::complex<float>* a,
const int* lda,
const std::complex<float>* b,
const int* ldb,
const std::complex<float>* beta,
std::complex<float>* c,
const int* ldc,
int cluster_id);



void sgemv_mt_(const char* transa,
const int* m,
Expand Down Expand Up @@ -173,6 +175,21 @@ void zgemm_mth_(const char* transa,
const int* ldc,
int cluster_id);

/// @brief Double-precision complex GEMM on the DSP that stages C through a buffer
///        obtained from malloc_ht and copies the result back to @p c.
///
/// Takes BLAS-style zgemm arguments (column-major; all sizes and leading dimensions are
/// passed by pointer) plus @p cluster_id, which the definition forwards to malloc_ht and
/// mt_hthread_zgemm. In the definition A and B are passed to the kernel as given; only
/// alpha, beta and C are copied into malloc_ht buffers.
void zgemm_pack_mth_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<double>* alpha,
const std::complex<double>* a,
const int* lda,
const std::complex<double>* b,
const int* ldb,
const std::complex<double>* beta,
std::complex<double>* c,
const int* ldc,
int cluster_id);

void cgemm_mth_(const char* transa,
const char* transb,
const int* m,
Expand All @@ -188,6 +205,21 @@ void cgemm_mth_(const char* transa,
const int* ldc,
int cluster_id);

/// @brief Single-precision complex GEMM on the DSP that stages A, B and C through
///        buffers obtained from malloc_ht and copies the result back to @p c.
///
/// Takes BLAS-style cgemm arguments (column-major; all sizes and leading dimensions are
/// passed by pointer) plus @p cluster_id, which the definition forwards to malloc_ht and
/// mt_hthread_cgemm. alpha and beta are copied into malloc_ht buffers as well.
void cgemm_pack_mth_(const char* transa,
const char* transb,
const int* m,
const int* n,
const int* k,
const std::complex<float>* alpha,
const std::complex<float>* a,
const int* lda,
const std::complex<float>* b,
const int* ldb,
const std::complex<float>* beta,
std::complex<float>* c,
const int* ldc,
int cluster_id);

void sgemv_mth_(const char* transa,
const int* m,
const int* n,
Expand Down Expand Up @@ -282,4 +314,4 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv
} // namespace mtfunc

#endif
#endif
#endif
2 changes: 1 addition & 1 deletion source/source_base/math_bspline.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class Bspline
int norder; // the order of bezier base; norder >= 0
double Dx; // Dx: the interval of control node
double xi; // xi: the starting point
double *bezier; // bezier[n] = Bk[n]
double * bezier = nullptr; // bezier[n] = Bk[n]

public:
Bspline();
Expand Down
6 changes: 3 additions & 3 deletions source/source_base/math_chebyshev.h
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class Chebyshev
std::complex<REAL>* coefc_cpu = nullptr; //[CPU] expansion coefficient of each order

FFTW<REAL> fftw; // use for fftw
REAL* polytrace; //[CPU] w_n = \sum_i v^+ * T_n(A) * v, only
REAL* polytrace = nullptr; //[CPU] w_n = \sum_i v^+ * T_n(A) * v, only

bool getcoef_real; // coef_real has been calculated
bool getcoef_complex; // coef_complex has been calculated
Expand Down Expand Up @@ -248,7 +248,7 @@ class FFTW<double>
FFTW(const int norder2_in);
~FFTW();
void execute_fftw();
double* dcoef; //[norder2]
double* dcoef = nullptr; //[norder2]
fftw_complex* ccoef = nullptr;
fftw_plan coef_plan;
};
Expand All @@ -261,7 +261,7 @@ class FFTW<float>
FFTW(const int norder2_in);
~FFTW();
void execute_fftw();
float* dcoef; //[norder2]
float* dcoef = nullptr; //[norder2]
fftwf_complex* ccoef = nullptr;
fftwf_plan coef_plan;
};
Expand Down
8 changes: 4 additions & 4 deletions source/source_base/mcd.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ typedef struct ChunkS
#else
long long id; // 64bit allocation ID
#endif
char *function; //creating function
char *file; //file function is in
void *ptr; //pointer to allocation
char * function = nullptr; //creating function
char * file = nullptr; //file function is in
void * ptr = nullptr; //pointer to allocation
struct ChunkS *next, //next chunk (null if nonw)
*prev; //previous chunk (null if nonw)
}Chunk;
Expand Down Expand Up @@ -706,7 +706,7 @@ int MCD_sscanf(const char *str,const char *fmt,char*fun,char*file,int line,...)
void scan_args(const char *fmt,va_list argptr,char*fun,char*file,int line)
{
char **ptr;
void *dummy; // clear up the unused warning
void * dummy = nullptr; // clear up the unused warning

for(;*fmt;fmt++) {
if(*fmt!='%')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ class BFCAllocator : public Allocator {
}

private:
BFCAllocator* allocator_; // The parent allocator
BFCAllocator* allocator_ = nullptr; // The parent allocator
};

using free_chunk_set_t = std::set<ChunkHandle, ChunkComparator>;
Expand Down
19 changes: 19 additions & 0 deletions source/source_base/module_device/memory_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,19 @@ struct resize_memory_op_mt<FPTYPE, base_device::DEVICE_CPU>
}
};

/// CPU specialization of set_memory_op_mt: byte-wise fill of `arr` with `var`,
/// with each thread started by ModuleBase::OMP_PARALLEL filling its own slice.
template <typename FPTYPE>
struct set_memory_op_mt<FPTYPE, base_device::DEVICE_CPU>
{
    void operator()(FPTYPE* arr, const int var, const size_t size)
    {
        // Block size handed to BLOCK_TASK_DIST_1D, expressed in elements (4096 bytes' worth).
        const size_t block_elems = static_cast<size_t>(4096) / sizeof(FPTYPE);

        const auto fill_slice = [&](int num_thread, int thread_id) {
            // Start index and element count of this thread's slice, set by BLOCK_TASK_DIST_1D.
            int first = 0;
            int count = 0;
            ModuleBase::BLOCK_TASK_DIST_1D(num_thread, thread_id, size, block_elems, first, count);
            memset(arr + first, var, sizeof(FPTYPE) * count);
        };

        ModuleBase::OMP_PARALLEL(fill_slice);
    }
};

template <typename FPTYPE>
struct delete_memory_op_mt<FPTYPE, base_device::DEVICE_CPU>
{
Expand All @@ -487,6 +500,12 @@ template struct resize_memory_op_mt<double, base_device::DEVICE_CPU>;
template struct resize_memory_op_mt<std::complex<float>, base_device::DEVICE_CPU>;
template struct resize_memory_op_mt<std::complex<double>, base_device::DEVICE_CPU>;

template struct set_memory_op_mt<int, base_device::DEVICE_CPU>;
template struct set_memory_op_mt<float, base_device::DEVICE_CPU>;
template struct set_memory_op_mt<double, base_device::DEVICE_CPU>;
template struct set_memory_op_mt<std::complex<float>, base_device::DEVICE_CPU>;
template struct set_memory_op_mt<std::complex<double>, base_device::DEVICE_CPU>;

template struct delete_memory_op_mt<int, base_device::DEVICE_CPU>;
template struct delete_memory_op_mt<float, base_device::DEVICE_CPU>;
template struct delete_memory_op_mt<double, base_device::DEVICE_CPU>;
Expand Down
14 changes: 14 additions & 0 deletions source/source_base/module_device/memory_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,20 @@ struct resize_memory_op_mt
void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
};

template <typename FPTYPE, typename Device>
struct set_memory_op_mt
{
/// @brief memset-style fill for DSP memory allocated by the mt allocator.
///
/// Input Parameters
/// \param var : the constant byte value written to every byte of the array
///              (the CPU specialization passes it straight to memset)
/// \param size : array size, counted in FPTYPE elements rather than bytes
///
/// Output Parameters
/// \param arr : output array whose bytes are set to the input value
void operator()(FPTYPE* arr, const int var, const size_t size);
};

template <typename FPTYPE, typename Device>
struct delete_memory_op_mt
{
Expand Down
16 changes: 12 additions & 4 deletions source/source_base/module_external/blas_connector_matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ void BlasConnector::gemm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::cgemm_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::cgemm_pack_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// cgemm_mth_ for raw dsp mth;
// cgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
else if (device_type == base_device::AbacusDevice_t::GpuDevice)
Expand Down Expand Up @@ -158,7 +160,9 @@ void BlasConnector::gemm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::zgemm_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::zgemm_pack_mth_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// zgemm_mth_ for raw dsp mth;
// zgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
else if (device_type == base_device::AbacusDevice_t::GpuDevice)
Expand Down Expand Up @@ -277,7 +281,9 @@ void BlasConnector::gemm_cm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::cgemm_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::cgemm_pack_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// cgemm_mth_ for raw dsp mth;
// cgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
#ifdef __CUDA
Expand Down Expand Up @@ -328,7 +334,9 @@ void BlasConnector::gemm_cm(const char transa,
#ifdef __DSP
else if (device_type == base_device::AbacusDevice_t::DspDevice)
{
mtfunc::zgemm_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
mtfunc::zgemm_pack_mth_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc, GlobalV::MY_RANK % PARAM.inp.dsp_count);
// zgemm_mth_ for raw dsp mth;
// zgemm_pack_mth_ for dsp mth with memcpy to DSP buffer
}
#endif
#ifdef __CUDA
Expand Down
2 changes: 1 addition & 1 deletion source/source_basis/module_ao/ORB_nonlocal.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Numerical_Nonlocal
const int& nproj_in,
const Numerical_Nonlocal_Lm* ps_orbital_in);

Numerical_Nonlocal_Lm* Proj; ///< length: nproj(only store radial function )
Numerical_Nonlocal_Lm* Proj = nullptr; ///< length: nproj(only store radial function )

const double& get_rcut_max() const { return rcut_max; }
const int& get_nproj() const { return nproj; }
Expand Down
Loading
Loading