/*************************************************************************** answer.cpp ------------------- W. Michael Brown (ORNL) Class for data management of forces, torques, energies, and virials __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) __________________________________________________________________________ begin : email : brownw@ornl.gov ***************************************************************************/ #include "lal_answer.h" #if (LAL_USE_OMP == 1) #include #endif namespace LAMMPS_AL { #define AnswerT Answer template AnswerT::Answer() : _allocated(false),_eflag(false),_vflag(false), _inum(0),_ilist(nullptr),_newton(false) { } template int AnswerT::bytes_per_atom() const { int bytes=11*sizeof(acctyp); if (_rot) bytes+=4*sizeof(acctyp); if (_charge) bytes+=sizeof(acctyp); return bytes; } template bool AnswerT::alloc(const int inum) { _max_local=static_cast(static_cast(inum)*1.10); bool success=true; _ans_fields=4; if (_rot) _ans_fields+=4; // --------------------------- Device allocations success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); success=success && (force.alloc(_ans_fields*_max_local,*dev,UCL_READ_ONLY, UCL_READ_WRITE)==UCL_SUCCESS); _gpu_bytes=engv.device.row_bytes()+force.device.row_bytes(); _allocated=true; return success; } template bool AnswerT::init(const int inum, const bool charge, const bool rot, UCL_Device &devi) { clear(); bool success=true; _charge=charge; _rot=rot; _other=_charge || _rot; dev=&devi; _e_fields=1; if (_charge) _e_fields++; _ev_fields=6+_e_fields; // Initialize atom and nbor data int ef_inum=inum; if (ef_inum==0) ef_inum=1000; // Initialize timers for the selected device time_answer.init(*dev); time_answer.zero(); _time_cast=0.0; _time_cpu_idle=0.0; success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE, UCL_WRITE_ONLY)==UCL_SUCCESS); if (success) error_flag.zero(); return success && alloc(ef_inum); } template bool AnswerT::add_fields(const bool charge, const bool rot) { bool realloc=false; if (charge && _charge==false) { _charge=true; _e_fields++; _ev_fields++; realloc=true; } if (rot && _rot==false) { _rot=true; realloc=true; } if (realloc) { _other=_charge || _rot; int inum=_max_local; force.clear(); engv.clear(); _allocated=false; return alloc(inum); } return true; } template void AnswerT::clear() { _gpu_bytes=0; error_flag.clear(); if (!_allocated) return; _allocated=false; force.clear(); engv.clear(); time_answer.clear(); _inum=0; _ilist=nullptr; _eflag=false; _vflag=false; } template double AnswerT::host_memory_usage() const { int atom_bytes=4; if (_charge) atom_bytes+=1; if (_rot) atom_bytes+=4; int ans_bytes=atom_bytes+_ev_fields; return ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(Answer); } template void AnswerT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom, const int red_blocks) { time_answer.start(); _eflag=eflag; _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; #ifdef LAL_NO_BLOCK_REDUCE _ev_stride=_inum; #else if (ef_atom || vf_atom) _ev_stride=_inum; else _ev_stride=red_blocks; #endif int csize=_ev_fields; if (!eflag) csize-=_e_fields; if (!vflag) csize-=6; if (csize>0) engv.update_host(_ev_stride*csize,true); if (_rot) force.update_host(_inum*4*2,true); else force.update_host(_inum*4,true); time_answer.stop(); #ifndef GERYON_OCL_FLUSH force.flush(); #endif } template void AnswerT::copy_answers(const bool eflag, const bool vflag, const bool ef_atom, const bool vf_atom, int *ilist, const int red_blocks) { _ilist=ilist; copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks); } template double AnswerT::energy_virial(double *eatom, double **vatom, double *virial) { if (_eflag==false && _vflag==false) return 0.0; double evdwl=0.0; int vstart=0; if (_eflag) { #if (LAL_USE_OMP_SIMD == 1) #pragma omp simd reduction(+:evdwl) #endif for (int i=0; i<_ev_stride; i++) evdwl+=engv[i]; if (_ef_atom) { if (_ilist==nullptr) { for (int i=0; i<_ev_stride; i++) eatom[i]+=engv[i]; } else { for (int i=0; i<_ev_stride; i++) eatom[_ilist[i]]+=engv[i]; } } vstart=_ev_stride; } if (_vflag) { int iend=vstart+_ev_stride; for (int j=0; j<6; j++) { for (int i=vstart; i double AnswerT::energy_virial(double *eatom, double **vatom, double *virial, double &ecoul) { if (_eflag==false && _vflag==false) return 0.0; if (_charge==false) return energy_virial(eatom,vatom,virial); double evdwl=0.0; int vstart=0, iend=_ev_stride; if (_eflag) { iend=_ev_stride*2; #if (LAL_USE_OMP_SIMD == 1) #pragma omp simd reduction(+:evdwl) #endif for (int i=0; i<_ev_stride; i++) evdwl+=engv[i]; double ecv=0.0; #if (LAL_USE_OMP_SIMD == 1) #pragma omp simd reduction(+:ecv) #endif for (int i=_ev_stride; i void AnswerT::get_answers(double **f, double **tor) { if (_ilist==nullptr) { typedef struct { double x,y,z; } vec3d; typedef struct { acctyp x,y,z,w; } vec4d_t; vec3d *fp=reinterpret_cast(&(f[0][0])); vec4d_t *forcep=reinterpret_cast(&(force[0])); #if (LAL_USE_OMP == 1) #pragma omp parallel #endif { #if (LAL_USE_OMP == 1) const int nthreads = omp_get_num_threads(); const int tid = omp_get_thread_num(); const int idelta = _inum / nthreads + 1; const int ifrom = tid * idelta; const int ito = std::min(ifrom + idelta, _inum); #else const int ifrom = 0; const int ito = _inum; #endif for (int i=ifrom; i(&(tor[0][0])); vec4d_t *torquep=reinterpret_cast(&(force[_inum*4])); for (int i=ifrom; i void AnswerT::cq(const int cq_index) { engv.cq(dev->cq(cq_index)); force.cq(dev->cq(cq_index)); time_answer.clear(); time_answer.init(*dev,dev->cq(cq_index)); time_answer.zero(); } template class Answer; }