git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
@ -29,9 +29,8 @@ __win_sort _win_sort;
|
||||
#endif
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
|
||||
_vflag(false),_inum(0),_ilist(NULL),
|
||||
_newton(false) {
|
||||
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
|
||||
_max_gpu_bytes(0) {
|
||||
#ifndef USE_OPENCL
|
||||
sort_config.op = CUDPP_ADD;
|
||||
sort_config.datatype = CUDPP_UINT;
|
||||
@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const {
|
||||
int id_space=0;
|
||||
if (_gpu_nbor)
|
||||
id_space=2;
|
||||
int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
|
||||
int bytes=4*sizeof(numtyp)+id_space;
|
||||
if (_rot)
|
||||
bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
|
||||
bytes+=4*sizeof(numtyp);
|
||||
if (_charge)
|
||||
bytes+=sizeof(numtyp);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUAtomT::alloc(const int inum, const int nall) {
|
||||
bool PairGPUAtomT::alloc(const int nall) {
|
||||
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
|
||||
if (_newton)
|
||||
_max_local=_max_atoms;
|
||||
else
|
||||
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
|
||||
|
||||
bool success=true;
|
||||
|
||||
int ans_elements=4;
|
||||
if (_rot)
|
||||
ans_elements+=4;
|
||||
|
||||
// Ignore host/device transfers?
|
||||
bool cpuview=false;
|
||||
if (dev->device_type()==UCL_CPU)
|
||||
@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
|
||||
success=success && (host_x.alloc(_max_atoms*4,*dev,
|
||||
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
|
||||
#endif
|
||||
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
|
||||
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
|
||||
// Buffer for casting only if different precisions
|
||||
if (_charge)
|
||||
success=success && (host_q.alloc(_max_atoms,*dev,
|
||||
@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
|
||||
|
||||
|
||||
// --------------------------- Device allocations
|
||||
_gpu_bytes=0;
|
||||
int gpu_bytes=0;
|
||||
if (cpuview) {
|
||||
#ifdef GPU_CAST
|
||||
assert(0==1);
|
||||
#else
|
||||
dev_x.view(host_x);
|
||||
#endif
|
||||
dev_engv.view(host_engv);
|
||||
dev_ans.view(host_ans);
|
||||
if (_rot)
|
||||
dev_quat.view(host_quat);
|
||||
if (_charge)
|
||||
@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
|
||||
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
|
||||
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
||||
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
|
||||
#else
|
||||
success=success && (UCL_SUCCESS==
|
||||
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
|
||||
#endif
|
||||
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
success=success && (dev_ans.alloc(ans_elements*_max_local,
|
||||
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
if (_charge) {
|
||||
success=success && (dev_q.alloc(_max_atoms,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_q.row_bytes();
|
||||
gpu_bytes+=dev_q.row_bytes();
|
||||
}
|
||||
if (_rot) {
|
||||
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
|
||||
UCL_READ_ONLY)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_quat.row_bytes();
|
||||
gpu_bytes+=dev_quat.row_bytes();
|
||||
}
|
||||
}
|
||||
if (_gpu_nbor) {
|
||||
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
|
||||
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
|
||||
if (_bonds) {
|
||||
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
|
||||
_gpu_bytes+=dev_tag.row_bytes();
|
||||
gpu_bytes+=dev_tag.row_bytes();
|
||||
}
|
||||
}
|
||||
|
||||
_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
|
||||
gpu_bytes+=dev_x.row_bytes();
|
||||
if (gpu_bytes>_max_gpu_bytes)
|
||||
_max_gpu_bytes=gpu_bytes;
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
|
||||
const bool rot, UCL_Device &devi, const bool gpu_nbor,
|
||||
bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
|
||||
const bool gpu_nbor, const bool bonds) {
|
||||
bool realloc=false;
|
||||
if (charge && _charge==false) {
|
||||
_charge=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (rot && _rot==false) {
|
||||
_rot=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (gpu_nbor && _gpu_nbor==false) {
|
||||
_gpu_nbor=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (bonds && _bonds==false) {
|
||||
_bonds=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (realloc) {
|
||||
_other=_charge || _rot;
|
||||
int max_atoms=_max_atoms;
|
||||
clear_resize();
|
||||
return alloc(max_atoms);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
|
||||
UCL_Device &devi, const bool gpu_nbor,
|
||||
const bool bonds) {
|
||||
clear();
|
||||
|
||||
bool success=true;
|
||||
_x_avail=false;
|
||||
_q_avail=false;
|
||||
_quat_avail=false;
|
||||
_resized=false;
|
||||
_gpu_nbor=gpu_nbor;
|
||||
_bonds=bonds;
|
||||
_charge=charge;
|
||||
@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
|
||||
_other=_charge || _rot;
|
||||
dev=&devi;
|
||||
|
||||
_e_fields=1;
|
||||
if (_charge)
|
||||
_e_fields++;
|
||||
_ev_fields=6+_e_fields;
|
||||
|
||||
// Initialize atom and nbor data
|
||||
int ef_inum=inum;
|
||||
if (ef_inum==0)
|
||||
ef_inum=1000;
|
||||
int ef_nall=nall;
|
||||
if (ef_nall<=ef_inum)
|
||||
ef_nall=ef_inum*2;
|
||||
if (ef_nall==0)
|
||||
ef_nall=2000;
|
||||
|
||||
// Initialize timers for the selected device
|
||||
time_pos.init(*dev);
|
||||
time_other.init(*dev);
|
||||
time_answer.init(*dev);
|
||||
time_q.init(*dev);
|
||||
time_quat.init(*dev);
|
||||
time_pos.zero();
|
||||
time_other.zero();
|
||||
time_answer.zero();
|
||||
time_q.zero();
|
||||
time_quat.zero();
|
||||
_time_cast=0.0;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
compile_kernels(*dev);
|
||||
#endif
|
||||
|
||||
return success && alloc(ef_inum,ef_nall);
|
||||
return success && alloc(ef_nall);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() {
|
||||
dev_quat.clear();
|
||||
host_quat.clear();
|
||||
}
|
||||
dev_ans.clear();
|
||||
dev_engv.clear();
|
||||
#ifndef GPU_CAST
|
||||
host_x.clear();
|
||||
#else
|
||||
host_x_cast.clear();
|
||||
host_type_cast.clear();
|
||||
#endif
|
||||
host_ans.clear();
|
||||
host_engv.clear();
|
||||
dev_cell_id.clear();
|
||||
dev_particle_id.clear();
|
||||
dev_tag.clear();
|
||||
@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::clear() {
|
||||
_gpu_bytes=0;
|
||||
_max_gpu_bytes=0;
|
||||
if (!_allocated)
|
||||
return;
|
||||
|
||||
time_pos.clear();
|
||||
time_other.clear();
|
||||
time_answer.clear();
|
||||
time_q.clear();
|
||||
time_quat.clear();
|
||||
clear_resize();
|
||||
_inum=0;
|
||||
_eflag=false;
|
||||
_vflag=false;
|
||||
|
||||
#ifdef GPU_CAST
|
||||
if (_compiled) {
|
||||
@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const {
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
int ans_bytes=atom_bytes+_ev_fields;
|
||||
return _max_atoms*atom_bytes*sizeof(numtyp)+
|
||||
ans_bytes*(_max_local)*sizeof(acctyp)+
|
||||
sizeof(PairGPUAtom<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom) {
|
||||
time_answer.start();
|
||||
_eflag=eflag;
|
||||
_vflag=vflag;
|
||||
_ef_atom=ef_atom;
|
||||
_vf_atom=vf_atom;
|
||||
|
||||
int csize=_ev_fields;
|
||||
if (!eflag)
|
||||
csize-=_e_fields;
|
||||
if (!vflag)
|
||||
csize-=6;
|
||||
|
||||
if (csize>0)
|
||||
ucl_copy(host_engv,dev_engv,_inum*csize,true);
|
||||
if (_rot)
|
||||
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
|
||||
else
|
||||
ucl_copy(host_ans,dev_ans,_inum*4,true);
|
||||
time_answer.stop();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom,
|
||||
int *ilist) {
|
||||
_ilist=ilist;
|
||||
copy_answers(eflag,vflag,ef_atom,vf_atom);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
|
||||
double *virial) {
|
||||
if (_eflag==false && _vflag==false)
|
||||
return 0.0;
|
||||
|
||||
double evdwl=0.0;
|
||||
if (_gpu_nbor) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]*=0.5;
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int ii=_ilist[i];
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]*=0.5;
|
||||
}
|
||||
|
||||
evdwl*=0.5;
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
|
||||
double *virial, double &ecoul) {
|
||||
if (_eflag==false && _vflag==false) {
|
||||
ecoul=0.0;
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if (_charge==false)
|
||||
return energy_virial(eatom,vatom,virial);
|
||||
|
||||
double evdwl=0.0;
|
||||
double _ecoul=0.0;
|
||||
if (_gpu_nbor) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]*=0.5;
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int ii=_ilist[i];
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]*=0.5;
|
||||
}
|
||||
|
||||
evdwl*=0.5;
|
||||
ecoul+=_ecoul*0.5;
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::get_answers(double **f, double **tor) {
|
||||
acctyp *ap=host_ans.begin();
|
||||
if (_gpu_nbor) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
f[i][0]+=*ap;
|
||||
ap++;
|
||||
f[i][1]+=*ap;
|
||||
ap++;
|
||||
f[i][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
tor[i][0]+=*ap;
|
||||
ap++;
|
||||
tor[i][1]+=*ap;
|
||||
ap++;
|
||||
tor[i][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=_ilist[i];
|
||||
f[ii][0]+=*ap;
|
||||
ap++;
|
||||
f[ii][1]+=*ap;
|
||||
ap++;
|
||||
f[ii][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=*ap;
|
||||
ap++;
|
||||
tor[ii][1]+=*ap;
|
||||
ap++;
|
||||
tor[ii][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort arrays for neighbor list calculation
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
|
||||
|
||||
Reference in New Issue
Block a user