Misc Improvements to GPU Package
- Optimizations for molecular systems
- Improved kernel performance and greater CPU overlap
- Reduced GPU-to-CPU communications for discrete devices
- Switch classic Intel makefiles to use LLVM-based compilers
- Prefetch optimizations supported for OpenCL
- Optimized data repack for quaternions
This commit is contained in:
@ -28,9 +28,9 @@ AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false),
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int AnswerT::bytes_per_atom() const {
|
||||
int bytes=11*sizeof(acctyp);
|
||||
int bytes=10*sizeof(acctyp);
|
||||
if (_rot)
|
||||
bytes+=4*sizeof(acctyp);
|
||||
bytes+=3*sizeof(acctyp);
|
||||
if (_charge)
|
||||
bytes+=sizeof(acctyp);
|
||||
return bytes;
|
||||
@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) {
|
||||
|
||||
bool success=true;
|
||||
|
||||
_ans_fields=4;
|
||||
_ans_fields=3;
|
||||
if (_rot)
|
||||
_ans_fields+=4;
|
||||
_ans_fields+=3;
|
||||
|
||||
// --------------------------- Device allocations
|
||||
success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY,
|
||||
@ -134,11 +134,11 @@ void AnswerT::clear() {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double AnswerT::host_memory_usage() const {
|
||||
int atom_bytes=4;
|
||||
int atom_bytes=3;
|
||||
if (_charge)
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
atom_bytes+=3;
|
||||
int ans_bytes=atom_bytes+_ev_fields;
|
||||
return ans_bytes*(_max_local)*sizeof(acctyp)+
|
||||
sizeof(Answer<numtyp,acctyp>);
|
||||
@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
if (csize>0)
|
||||
engv.update_host(_ev_stride*csize,true);
|
||||
if (_rot)
|
||||
force.update_host(_inum*4*2,true);
|
||||
force.update_host(_inum*3*2,true);
|
||||
else
|
||||
force.update_host(_inum*4,true);
|
||||
force.update_host(_inum*3,true);
|
||||
time_answer.stop();
|
||||
|
||||
#ifndef GERYON_OCL_FLUSH
|
||||
@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::get_answers(double **f, double **tor) {
|
||||
if (_ilist==nullptr) {
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { acctyp x,y,z,w; } vec4d_t;
|
||||
auto fp=reinterpret_cast<vec3d*>(&(f[0][0]));
|
||||
auto forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
|
||||
auto fp=reinterpret_cast<double*>(&(f[0][0]));
|
||||
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel
|
||||
@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
#if (LAL_USE_OMP == 1)
|
||||
const int nthreads = omp_get_num_threads();
|
||||
const int tid = omp_get_thread_num();
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int idelta = _inum*3 / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
const int ito = std::min(ifrom + idelta, _inum*3);
|
||||
#else
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
const int ito = _inum*3;
|
||||
#endif
|
||||
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
fp[i].x+=forcep[i].x;
|
||||
fp[i].y+=forcep[i].y;
|
||||
fp[i].z+=forcep[i].z;
|
||||
}
|
||||
for (int i=ifrom; i<ito; i++)
|
||||
fp[i]+=force[i];
|
||||
if (_rot) {
|
||||
auto torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
|
||||
auto torquep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
torp[i].x+=torquep[i].x;
|
||||
torp[i].y+=torquep[i].y;
|
||||
torp[i].z+=torquep[i].z;
|
||||
}
|
||||
auto torp=reinterpret_cast<double*>(&(tor[0][0]));
|
||||
auto torquep=&(force[_inum*3]);
|
||||
for (int i=ifrom; i<ito; i++)
|
||||
torp[i]+=torquep[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -344,7 +335,7 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
int fl=ifrom*4;
|
||||
int fl=ifrom*3;
|
||||
#else
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
@ -356,16 +347,16 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
f[ii][0]+=force[fl];
|
||||
f[ii][1]+=force[fl+1];
|
||||
f[ii][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
fl+=3;
|
||||
}
|
||||
if (_rot) {
|
||||
fl=_inum*4 + ifrom*4;
|
||||
fl=_inum*3 + ifrom*3;
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=force[fl];
|
||||
tor[ii][1]+=force[fl+1];
|
||||
tor[ii][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
fl+=3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user