Misc Improvements to GPU Package

- Optimizations for molecular systems
  - Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communication for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
W. Michael Brown
2023-03-05 21:03:12 -08:00
parent 142876a59b
commit 37f22c8627
151 changed files with 1085 additions and 617 deletions

View File

@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
template <class numtyp, class acctyp>
int AnswerT::bytes_per_atom() const {
int bytes=11*sizeof(acctyp);
int bytes=10*sizeof(acctyp);
if (_rot)
bytes+=4*sizeof(acctyp);
bytes+=3*sizeof(acctyp);
if (_charge)
bytes+=sizeof(acctyp);
return bytes;
@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {
bool success=true;
_ans_fields=4;
_ans_fields=3;
if (_rot)
_ans_fields+=4;
_ans_fields+=3;
// --------------------------- Device allocations
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
@ -134,11 +134,11 @@ void AnswerT::clear() {
template <class numtyp, class acctyp>
double AnswerT::host_memory_usage() const {
int atom_bytes=4;
int atom_bytes=3;
if (_charge)
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
atom_bytes+=3;
int ans_bytes=atom_bytes+_ev_fields;
return ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(Answer<numtyp,acctyp>);
@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
if (csize>0)
engv.update_host(_ev_stride*csize,true);
if (_rot)
force.update_host(_inum*4*2,true);
force.update_host(_inum*3*2,true);
else
force.update_host(_inum*4,true);
force.update_host(_inum*3,true);
time_answer.stop();
#ifndef GERYON_OCL_FLUSH
@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
template <class numtyp, class acctyp>
void AnswerT::get_answers(double **f, double **tor) {
if (_ilist==nullptr) {
typedef struct { double x,y,z; } vec3d;
typedef struct { acctyp x,y,z,w; } vec4d_t;
auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
auto fp=reinterpret_cast<double*>(&(f[0][0]));
#if (LAL_USE_OMP == 1)
#pragma omp parallel
@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
#if (LAL_USE_OMP == 1)
const int nthreads = omp_get_num_threads();
const int tid = omp_get_thread_num();
const int idelta = _inum / nthreads + 1;
const int idelta = _inum*3 / nthreads + 1;
const int ifrom = tid * idelta;
const int ito = std::min(ifrom + idelta, _inum);
const int ito = std::min(ifrom + idelta, _inum*3);
#else
const int ifrom = 0;
const int ito = _inum;
const int ito = _inum*3;
#endif
for (int i=ifrom; i<ito; i++) {
fp[i].x+=forcep[i].x;
fp[i].y+=forcep[i].y;
fp[i].z+=forcep[i].z;
}
for (int i=ifrom; i<ito; i++)
fp[i]+=force[i];
if (_rot) {
auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
for (int i=ifrom; i<ito; i++) {
torp[i].x+=torquep[i].x;
torp[i].y+=torquep[i].y;
torp[i].z+=torquep[i].z;
}
auto torp=reinterpret_cast<double*>(&(tor[0][0]));
auto torquep=&(force[_inum*3]);
for (int i=ifrom; i<ito; i++)
torp[i]+=torquep[i];
}
}
} else {
@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
const int idelta = _inum / nthreads + 1;
const int ifrom = tid * idelta;
const int ito = std::min(ifrom + idelta, _inum);
int fl=ifrom*4;
int fl=ifrom*3;
#else
const int ifrom = 0;
const int ito = _inum;
@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
f[ii][0]+=force[fl];
f[ii][1]+=force[fl+1];
f[ii][2]+=force[fl+2];
fl+=4;
fl+=3;
}
if (_rot) {
fl=_inum*4 + ifrom*4;
fl=_inum*3 + ifrom*3;
for (int i=ifrom; i<ito; i++) {
int ii=_ilist[i];
tor[ii][0]+=force[fl];
tor[ii][1]+=force[fl+1];
tor[ii][2]+=force[fl+2];
fl+=4;
fl+=3;
}
}
}