diff --git a/doc/src/package.rst b/doc/src/package.rst index 0ced387539..76bf20a97f 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -319,7 +319,7 @@ CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH, THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, -PPPM_MAX_SPLINE. +PPPM_MAX_SPLINE, NBOR_PREFETCH. CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX (NVIDIA) or OpenCL extensions (Intel) should be used for horizontal diff --git a/lib/gpu/Makefile.oneapi b/lib/gpu/Makefile.oneapi index 9d11a0c4b0..32800676aa 100644 --- a/lib/gpu/Makefile.oneapi +++ b/lib/gpu/Makefile.oneapi @@ -12,13 +12,12 @@ EXTRAMAKE = Makefile.lammps.opencl LMP_INC = -DLAMMPS_SMALLBIG OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/ -CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -fp-model fast=2 -no-prec-div \ - -qoverride-limits -OCL_CPP = mpiicpc -std=c++11 -diag-disable=10441 -DMPICH_IGNORE_CXX_SEEK \ +CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math +OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \ $(LMP_INC) $(OCL_INC) $(CPP_OPT) OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL OCL_PREC = -D_SINGLE_DOUBLE -OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT +OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DGERYON_NO_OCL_MARKERS BIN_DIR = ./ OBJ_DIR = ./ diff --git a/lib/gpu/Makefile.oneapi_prof b/lib/gpu/Makefile.oneapi_prof new file mode 100644 index 0000000000..1e21597373 --- /dev/null +++ b/lib/gpu/Makefile.oneapi_prof @@ -0,0 +1,28 @@ +# /* ---------------------------------------------------------------------- +# Linux Makefile for Intel oneAPI - Mixed precision (with timing enabled) +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.opencl + +# this setting should match LAMMPS Makefile +# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_INC = -I$(ONEAPI_ROOT)/compiler/latest/linux/include/sycl/ +CPP_OPT = -xHost -O2 -qopenmp -qopenmp-simd -ffast-math -freciprocal-math +OCL_CPP = mpiicpc -cxx=icpx -std=c++11 -DMPICH_IGNORE_CXX_SEEK \ + $(LMP_INC) $(OCL_INC) $(CPP_OPT) +OCL_LINK = -L$(ONEAPI_ROOT)/compiler/latest/linux/lib -lOpenCL +OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -DMPI_GERYON -DCUDA_PROXY -DGERYON_NUMA_FISSION -DUCL_NO_EXIT + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile diff --git a/lib/gpu/README b/lib/gpu/README index 51b21960ae..b720aa65cb 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -266,6 +266,7 @@ LAL_SERIALIZE_INIT Force serialization of initialization and compilation for multiple MPI tasks sharing the same accelerator. Some accelerator API implementations have had issues with temporary file conflicts in the past. +LAL_DISABLE_PREFETCH Disable prefetch in kernels GERYON_FORCE_SHARED_MAIN_MEM_ON Should only be used for builds where the accelerator is guaranteed to share physical main memory with the host (e.g. integrated diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 1b2e5b8c77..e63a1f56b2 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -429,7 +429,7 @@ void UCL_Device::clear() { CU_SAFE_CALL_NS(cuCtxSetCurrent(_old_context)); CU_SAFE_CALL_NS(cuDevicePrimaryCtxRelease(_cu_device)); #else - cuCtxDestroy(_context)); + cuCtxDestroy(_context); #endif } _device=-1; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f572d3ebd0..82a42cff6c 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -113,7 +113,7 @@ _texture( q_tex,int2); dufld[5]=red_acc[5][tid]; \ } \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii int AnswerT::bytes_per_atom() const { - int bytes=11*sizeof(acctyp); + int bytes=10*sizeof(acctyp); if (_rot) - bytes+=4*sizeof(acctyp); + bytes+=3*sizeof(acctyp); if (_charge) bytes+=sizeof(acctyp); return bytes; @@ -42,9 +42,9 @@ bool AnswerT::alloc(const int inum) { bool success=true; - _ans_fields=4; + _ans_fields=3; if (_rot) - _ans_fields+=4; + _ans_fields+=3; // --------------------------- Device allocations success=success && (engv.alloc(_ev_fields*_max_local,*dev,UCL_READ_ONLY, @@ -134,11 +134,11 @@ void AnswerT::clear() { template double AnswerT::host_memory_usage() const { - int atom_bytes=4; + int atom_bytes=3; if (_charge) atom_bytes+=1; if (_rot) - atom_bytes+=4; + atom_bytes+=3; int ans_bytes=atom_bytes+_ev_fields; return ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(Answer); @@ -169,9 +169,9 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag, if (csize>0) engv.update_host(_ev_stride*csize,true); if (_rot) - force.update_host(_inum*4*2,true); + force.update_host(_inum*3*2,true); else - force.update_host(_inum*4,true); + force.update_host(_inum*3,true); time_answer.stop(); #ifndef GERYON_OCL_FLUSH @@ -298,10 +298,7 @@ double AnswerT::energy_virial(double *eatom, double **vatom, template void AnswerT::get_answers(double **f, double **tor) { if (_ilist==nullptr) { - typedef struct { double x,y,z; } vec3d; - typedef struct { acctyp x,y,z,w; } vec4d_t; - auto fp=reinterpret_cast(&(f[0][0])); - auto forcep=reinterpret_cast(&(force[0])); + auto fp=reinterpret_cast(&(f[0][0])); #if (LAL_USE_OMP == 1) #pragma omp parallel @@ -310,27 +307,21 @@ void AnswerT::get_answers(double **f, double **tor) { #if (LAL_USE_OMP == 1) const int nthreads = omp_get_num_threads(); const int tid = omp_get_thread_num(); - const int idelta = _inum / nthreads + 1; + const int idelta = _inum*3 / nthreads + 1; const int ifrom = tid * idelta; - const int ito = std::min(ifrom + idelta, _inum); + const int ito = std::min(ifrom + idelta, _inum*3); #else const int ifrom = 0; - const int ito = _inum; + const int ito = _inum*3; #endif - for (int i=ifrom; i(&(tor[0][0])); - auto torquep=reinterpret_cast(&(force[_inum*4])); - for (int i=ifrom; i(&(tor[0][0])); + auto torquep=&(force[_inum*3]); + for (int i=ifrom; i void AtomT::compile_kernels(UCL_Device &dev) { std::string flags = ""; atom_program=new UCL_Program(dev); - atom_program->load_string(atom,flags,nullptr,screen); + atom_program->load_string(atom,flags.c_str(),nullptr,stderr); k_cast_x.set_function(*atom_program,"kernel_cast_x"); _compiled=true; } diff --git a/lib/gpu/lal_atom.cu b/lib/gpu/lal_atom.cu index 287d72803c..1418459301 100644 --- a/lib/gpu/lal_atom.cu +++ b/lib/gpu/lal_atom.cu @@ -18,7 +18,7 @@ #endif __kernel void kernel_cast_x(__global numtyp4 *restrict x_type, - const __global numtyp *restrict x, + const __global double *restrict x, const __global int *restrict type, const int nall) { int ii=GLOBAL_ID_X; diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 771c2a3571..081a1ae048 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -52,6 +52,12 @@ using namespace ucl_cudadr; namespace LAMMPS_AL { +struct EllipsoidBonus { + double shape[3]; + double quat[4]; + int ilocal; +}; + template class Atom { public: @@ -306,8 +312,8 @@ class Atom { if (_x_avail==false) { double t=MPI_Wtime(); #ifdef GPU_CAST - memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); - memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); + memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int)); #else vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); vec4d_t *xp=reinterpret_cast(&(x[0])); @@ -351,6 +357,24 @@ class Atom { add_x_data(host_ptr,host_type); } + // Cast mu data to write buffer (stored in quat) + template + inline void cast_mu_data(cpytyp *host_ptr) { + if (_quat_avail==false) { + double t=MPI_Wtime(); + if (sizeof(numtyp)==sizeof(double)) + memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } + // Cast charges to write buffer template inline void cast_q_data(cpytyp *host_ptr) { @@ -384,22 +408,24 @@ class Atom { } // Cast quaternions to write buffer - template - inline void cast_quat_data(cpytyp *host_ptr) { + inline void cast_quat_data(const int *ellipsoid, + const EllipsoidBonus *bonus) { if (_quat_avail==false) { double t=MPI_Wtime(); - if (_host_view) { - quat.host.view((numtyp*)host_ptr,_nall*4,*dev); - quat.device.view(quat.host); - } else if (sizeof(numtyp)==sizeof(double)) - memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); - else - #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) - #pragma omp parallel for simd schedule(static) - #elif (LAL_USE_OMP_SIMD == 1) - #pragma omp simd - #endif - for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall; i++) { + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i*4] = bonus[qi].quat[0]; + quat[i*4+1] = bonus[qi].quat[1]; + quat[i*4+2] = bonus[qi].quat[2]; + quat[i*4+3] = bonus[qi].quat[3]; + } + } _time_cast+=MPI_Wtime()-t; } } @@ -419,10 +445,6 @@ class Atom { inline void cast_v_data(double **host_ptr, const tagint *host_tag) { if (_v_avail==false) { double t=MPI_Wtime(); - #ifdef GPU_CAST - memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); - memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int)); - #else vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); vec4d_t *vp=reinterpret_cast(&(v[0])); #if (LAL_USE_OMP == 1) @@ -434,7 +456,6 @@ class Atom { vp[i].z=host_p[i].z; vp[i].w=host_tag[i]; } - #endif _time_cast+=MPI_Wtime()-t; } } @@ -444,16 +465,7 @@ class Atom { inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { time_vel.start(); if (_v_avail==false) { - #ifdef GPU_CAST - v_cast.update_device(_nall*3,true); - tag_cast.update_device(_nall,true); - int block_size=64; - int GX=static_cast(ceil(static_cast(_nall)/block_size)); - k_cast_x.set_size(GX,block_size); - k_cast_x.run(&v, &v_cast, &tag_cast, &_nall); - #else v.update_device(_nall*4,true); - #endif _v_avail=true; } time_vel.stop(); @@ -519,7 +531,7 @@ class Atom { UCL_Vector extra; #ifdef GPU_CAST - UCL_Vector x_cast; + UCL_Vector x_cast; UCL_Vector type_cast; #endif diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 09d7386461..0821a33b06 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -143,10 +143,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); _max_tep_size=static_cast(static_cast(ef_nall)*1.10); - _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _tep.alloc(_max_tep_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _max_fieldp_size = _max_tep_size; - _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _fieldp.alloc(_max_fieldp_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _max_thetai_size = 0; @@ -387,7 +387,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, if (inum_full>_max_tep_size) { _max_tep_size=static_cast(static_cast(inum_full)*1.10); - _tep.resize(_max_tep_size*4); + _tep.resize(_max_tep_size*3); } *tep_ptr=_tep.host.begin(); @@ -403,7 +403,7 @@ void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, // copy tep from device to host - _tep.update_host(_max_tep_size*4,false); + _tep.update_host(_max_tep_size*3,false); } // --------------------------------------------------------------------------- @@ -429,7 +429,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - _fieldp.update_host(_max_fieldp_size*8,false); + _fieldp.update_host(_max_fieldp_size*6,false); } // --------------------------------------------------------------------------- @@ -456,7 +456,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // NOTE: move this step to update_fieldp() to delay device-host transfer // after umutual1 and self are done on the GPU // *fieldp_ptr=_fieldp.host.begin(); - // _fieldp.update_host(_max_fieldp_size*8,false); + // _fieldp.update_host(_max_fieldp_size*6,false); } // --------------------------------------------------------------------------- @@ -732,7 +732,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, device->add_ans_object(ans); // copy tep from device to host - _tep.update_host(_max_tep_size*4,false); + _tep.update_host(_max_tep_size*3,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 6ef1c40ca7..7f09e100f1 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -233,7 +233,7 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - atom->cast_quat_data(host_mu[0]); + atom->cast_mu_data(host_mu[0]); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); atom->add_q_data(); @@ -297,12 +297,12 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, if (!success) return nullptr; atom->cast_q_data(host_q); - atom->cast_quat_data(host_mu[0]); + atom->cast_mu_data(host_mu[0]); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - atom->cast_quat_data(host_mu[0]); + atom->cast_mu_data(host_mu[0]); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 0bc20615a1..bc383de18f 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -375,7 +375,8 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat) { + bool &success, const int *ellipsoid, + const EllipsoidBonus *bonus) { acc_timers(); int eflag, vflag; if (eflag_in) eflag=2; @@ -409,7 +410,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, list=ilist; atom->cast_x_data(host_x,host_type); - atom->cast_quat_data(host_quat[0]); + atom->cast_quat_data(ellipsoid,bonus); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); atom->add_quat_data(); @@ -433,7 +434,8 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_quat) { + const int *ellipsoid, + const EllipsoidBonus *bonus) { acc_timers(); int eflag, vflag; if (eflag_in) eflag=2; @@ -460,11 +462,11 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, sublo, subhi, tag, nspecial, special, success); if (!success) return nullptr; - atom->cast_quat_data(host_quat[0]); + atom->cast_quat_data(ellipsoid,bonus); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); - atom->cast_quat_data(host_quat[0]); + atom->cast_quat_data(ellipsoid,bonus); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index 9885e931ee..618f97da54 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -170,7 +170,8 @@ class BaseEllipsoid { double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double **quat); + const double cpu_time, bool &success, + const int *ellipsoid, const EllipsoidBonus *bonus); /// Pair loop with device neighboring int**compute(const int ago, const int inum_full, const int nall, @@ -179,7 +180,7 @@ class BaseEllipsoid { tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double **host_quat); + const int *ellipsoid, const EllipsoidBonus *bonus); // -------------------------- DEVICE DATA ------------------------- diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index 12f1314c52..b0a9a6a4c1 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -31,7 +31,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom) { @@ -47,7 +47,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp4 f; + acctyp3 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp energy, virial[6]; if (EVFLAG) { @@ -66,6 +66,7 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, numtyp factor_lj; for ( ; nbor gpu_lib_data(19,*gpu,UCL_NOT_PINNED); + UCL_Vector gpu_lib_data(20,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu index 61341964b2..073c7de3d9 100644 --- a/lib/gpu/lal_device.cu +++ b/lib/gpu/lal_device.cu @@ -52,4 +52,5 @@ __kernel void kernel_info(__global int *info) { info[16]=MAX_SHARED_TYPES; info[17]=MAX_BIO_SHARED_TYPES; info[18]=PPPM_MAX_SPLINE; + info[19]=NBOR_PREFETCH; } diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu index cbe68ff692..18326edd3a 100644 --- a/lib/gpu/lal_dipole_lj.cu +++ b/lib/gpu/lal_dipole_lj.cu @@ -211,7 +211,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, @@ -235,7 +235,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp4 f, tor; + acctyp3 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; acctyp energy, e_coul, virial[6]; @@ -257,6 +257,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, int itype=ix.w; for ( ; nbor(bonus)); } int * gb_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double **host_quat) { + const double cpu_time, bool &success, + const int *ellipsoid, const void *bonus) { return GBMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, - cpu_time, success, host_quat); + cpu_time, success, ellipsoid, + static_cast(bonus)); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index 4582f0d411..55b4eddb58 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -34,7 +34,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, const __global numtyp *restrict lshape, const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, __global acctyp *restrict engv, __global int *restrict err_flag, const int eflag, const int vflag, @@ -53,7 +53,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp4 f; + acctyp3 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; acctyp energy, virial[6]; if (EVFLAG) { @@ -75,6 +75,7 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, numtyp factor_lj; for ( ; nbor1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii=0 && iH2>=0) { - compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_); + compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_); } else { m[iO] = ix; m[iO].w = qO; @@ -313,9 +313,9 @@ __kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_, /* ---------------------------------------------------------------------- Compute initial value of force, energy and virial for each local particle. The values calculated on oxygens use the virtual charge position (m) and - they are stored in a separate array (ansO) for further distribution + they are stored in a separate array (ansO) for further distribution in a separate kernel. For some hydrogens located on the boundary - of the local region, oxygens are non-local and the contribution + of the local region, oxygens are non-local and the contribution of oxygen is calculated separately in this kernel for them . ---------------------------------------------------------------------- */ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, @@ -325,7 +325,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict sp_lj, const __global int * dev_nbor, const __global int * dev_packed, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, @@ -344,7 +344,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, int n_stride; local_allocate_store_charge(); - acctyp4 f, fO; + acctyp3 f; + acctyp4 fO; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; acctyp energy, e_coul, virial[6], vO[6]; @@ -386,6 +387,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } for ( ; nbor 0 + tagint special_preload[SPECIAL_DATA_PRELOAD_SIZE]; + for (int i = 0, j = 0; (i < n3) && (j < SPECIAL_DATA_PRELOAD_SIZE); i+=UNROLL_FACTOR_SPECIAL, j++) { + special_preload[j] = special[ii + i*nt]; + } +#endif - int offset=ii; - for (int i=0; i=n1) - which++; - if (i>=n2) - which++; - nbor=nbor ^ (which << SBBITS); - *list=nbor; + for ( ; list 0 + if ((c == 0) && (j < SPECIAL_DATA_PRELOAD_SIZE)) { + special_data[c] = special_preload[j]; + } + else +#endif + special_data[c] = special[ii + (i+c)*nt]; + } } - offset+=nt; + + for (int k=0; k= n1) { + which[k]++; + } + } + for (int k=0; k= n2) { + which[k]++; + } + which[k] <<= SBBITS; + } + for (int c = 0; c < UNROLL_FACTOR_SPECIAL; c++) { + if (i + c < n3) { + for (int l=0; l=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // +// NBOR_PREFETCH +// Definition: Control use of prefetch for neighbor indices +// 0 = No prefetch +// 1 = Prefetch using standard API +// 2 = Prefetch using Intel intrinsics +// Restrictions: NBOR_PREFETCH forced to 0 when LAL_DISABLE_PREFETCH +// is defined in library build //*************************************************************************/ // ------------------------------------------------------------------------- @@ -101,6 +108,7 @@ #if defined(NV_KERNEL) || defined(USE_HIP) #include "lal_pre_cuda_hip.h" +#define ucl_prefetch(p) #define ucl_pow pow #endif @@ -169,7 +177,7 @@ #define ucl_abs fabs #define ucl_erfc erfc -#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) +#if (FAST_MATH > 0) && !defined(_DOUBLE_DOUBLE) #define ucl_exp native_exp #define ucl_pow pow @@ -285,6 +293,55 @@ #define simd_size() SIMD_SIZE #endif +// ------------------------------------------------------------------------- +// OPENCL KERNEL MACROS - PREFETCH +// ------------------------------------------------------------------------- + +#if (NBOR_PREFETCH == 0) +#define ucl_prefetch(p) +#endif + +#if (NBOR_PREFETCH == 1) +inline void ucl_prefetch(const __global int *p) { + prefetch(p, 1); +} +#endif + +#if (NBOR_PREFETCH == 2) +// Load message caching control +enum LSC_LDCC { + LSC_LDCC_DEFAULT, + LSC_LDCC_L1UC_L3UC, //1 Override to L1 uncached and L3 uncached + LSC_LDCC_L1UC_L3C, //1 Override to L1 uncached and L3 cached + LSC_LDCC_L1C_L3UC, //1 Override to L1 cached and L3 uncached + LSC_LDCC_L1C_L3C, //1 Override to L1 cached and L3 cached + LSC_LDCC_L1S_L3UC, //1 Override to L1 streaming load and L3 uncached + LSC_LDCC_L1S_L3C, //1 Override to L1 streaming load and L3 cached + LSC_LDCC_L1IAR_L3C, //1 Override to L1 invalidate-after-read, and L3 cached +}; + +void __builtin_IB_lsc_prefetch_global_uint(const __global uint *base, + int elemOff, + enum LSC_LDCC cacheOpt); //D32V1 + +inline void ucl_prefetch(const __global int *p) { + __builtin_IB_lsc_prefetch_global_uint((const __global uint *)p, 0, + LSC_LDCC_L1C_L3UC); +} +#endif + +struct _lgpu_float3 { + float x; float y; float z; +}; +struct _lgpu_double3 { + double x; double y; double z; +}; +#ifdef _SINGLE_SINGLE +#define acctyp3 struct _lgpu_float3 +#else +#define acctyp3 struct _lgpu_double3 +#endif + // ------------------------------------------------------------------------- // END OPENCL DEFINITIONS // ------------------------------------------------------------------------- @@ -301,6 +358,9 @@ #define numtyp4 double4 #define acctyp double #define acctyp2 double2 +#ifndef acctyp3 +#define acctyp3 double3 +#endif #define acctyp4 double4 #endif @@ -310,6 +370,9 @@ #define numtyp4 float4 #define acctyp double #define acctyp2 double2 +#ifndef acctyp3 +#define acctyp3 double3 +#endif #define acctyp4 double4 #endif @@ -319,6 +382,9 @@ #define numtyp4 float4 #define acctyp float #define acctyp2 float2 +#ifndef acctyp3 +#define acctyp3 float3 +#endif #define acctyp4 float4 #endif diff --git a/lib/gpu/lal_re_squared.cu b/lib/gpu/lal_re_squared.cu index c69a338749..318bdfdd69 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -32,6 +32,9 @@ ucl_inline numtyp det_prime(const numtyp m[9], const numtyp m2[9]) return ans; } +#ifdef INTEL_OCL +__attribute__((intel_reqd_sub_group_size(16))) +#endif __kernel void k_resquared(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, @@ -41,7 +44,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, const int ntypes, const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, const int astride, __global acctyp *restrict engv, __global int *restrict err_flag, @@ -62,7 +65,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; const numtyp cr60=ucl_cbrt((numtyp)60.0); - acctyp4 f, tor; + acctyp3 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; acctyp energy, virial[6]; @@ -122,6 +125,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, numtyp factor_lj; for ( ; nbor(bonus)); } int * re_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double **host_quat) { + const double cpu_time, bool &success, + const int *ellipsoid, const void *bonus) { return REMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, - cpu_time, success, host_quat); + cpu_time, success, ellipsoid, + static_cast(bonus)); } // --------------------------------------------------------------------------- @@ -135,4 +139,3 @@ int * re_gpu_compute(const int ago, const int inum_full, const int nall, double re_gpu_bytes() { return REMF.host_memory_usage(); } - diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index ca1b08facd..b3347fcb18 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -86,7 +86,7 @@ ap1+=astride; \ } \ } \ - acctyp4 old=ans[ii]; \ + acctyp3 old=ans[ii]; \ old.x+=f.x; \ old.y+=f.y; \ old.z+=f.z; \ @@ -131,7 +131,7 @@ ap1+=astride; \ } \ } \ - acctyp4 old=ans[ii]; \ + acctyp3 old=ans[ii]; \ old.x+=f.x; \ old.y+=f.y; \ old.z+=f.z; \ @@ -154,7 +154,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, const int ntypes, const __global int *dev_nbor, const int stride, - __global acctyp4 *restrict ans, + __global acctyp3 *restrict ans, const int astride, __global acctyp *restrict engv, __global int *restrict err_flag, @@ -180,7 +180,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, const numtyp solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp4 f, tor; + acctyp3 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; acctyp energy, virial[6]; @@ -216,6 +216,7 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, numtyp factor_lj; for ( ; nbor1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (offset==0 && iimolecular != Atom::ATOMIC)) { + PairHybrid *ph = reinterpret_cast(force->pair_match("^hybrid",0)); + if (ph) { + for (int isub=0; isub < ph->nstyles; ++isub) { + if (force->pair_match("gpu",0,isub)) overlap_topo = 1; + } + } else { + if (force->pair_match("gpu",0)) overlap_topo = 1; + } + } + if (overlap_topo) neighbor->set_overlap_topo(1); + if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) if (neighbor->exclude_setting() != 0) error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds"); diff --git a/src/GPU/fix_nve_asphere_gpu.cpp b/src/GPU/fix_nve_asphere_gpu.cpp index 06d1d7a7ca..481f44bb63 100644 --- a/src/GPU/fix_nve_asphere_gpu.cpp +++ b/src/GPU/fix_nve_asphere_gpu.cpp @@ -243,12 +243,7 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/) // update angular momentum by 1/2 step if (igroup == 0) { #if (LAL_USE_OMP_SIMD == 1) - // Workaround for compiler bug - #ifdef __INTEL_COMPILER - #pragma simd - #else - #pragma omp simd - #endif + #pragma omp simd #endif for (int i = ifrom; i < ito; i++) { double *quat = bonus[ellipsoid[i]].quat; @@ -257,12 +252,7 @@ void FixNVEAsphereGPU::initial_integrate(int /*vflag*/) } } else { #if (LAL_USE_OMP_SIMD == 1) - // Workaround for compiler bug - #ifdef __INTEL_COMPILER - #pragma simd - #else - #pragma omp simd - #endif + #pragma omp simd #endif for (int i = ifrom; i < ito; i++) { if (mask[i] & groupbit) { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fd423486fd..1221db66b1 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -155,6 +155,15 @@ PairAmoebaGPU::~PairAmoebaGPU() amoeba_gpu_clear(); } +/* ---------------------------------------------------------------------- */ + +void PairAmoebaGPU::compute(int eflag, int vflag) +{ + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); + PairAmoeba::compute(eflag, vflag); +} + /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index be53f7ef50..75f0d26336 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -28,6 +28,7 @@ class PairAmoebaGPU : public PairAmoeba { public: PairAmoebaGPU(LAMMPS *lmp); ~PairAmoebaGPU() override; + void compute(int, int) override; void init_style() override; double memory_usage() override; diff --git a/src/GPU/pair_beck_gpu.cpp b/src/GPU/pair_beck_gpu.cpp index 3c21a99105..8d057fd317 100644 --- a/src/GPU/pair_beck_gpu.cpp +++ b/src/GPU/pair_beck_gpu.cpp @@ -109,6 +109,8 @@ void PairBeckGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_born_coul_long_cs_gpu.cpp b/src/GPU/pair_born_coul_long_cs_gpu.cpp index 788a46e2cb..798caeb97a 100644 --- a/src/GPU/pair_born_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp @@ -129,6 +129,8 @@ void PairBornCoulLongCSGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_born_coul_long_gpu.cpp b/src/GPU/pair_born_coul_long_gpu.cpp index 629f716fd6..ca12f03070 100644 --- a/src/GPU/pair_born_coul_long_gpu.cpp +++ b/src/GPU/pair_born_coul_long_gpu.cpp @@ -123,6 +123,8 @@ void PairBornCoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp index 214a9575be..9858015622 100644 --- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp @@ -117,6 +117,8 @@ void PairBornCoulWolfCSGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_born_coul_wolf_gpu.cpp b/src/GPU/pair_born_coul_wolf_gpu.cpp index 02a671adc9..ce9956d232 100644 --- a/src/GPU/pair_born_coul_wolf_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_gpu.cpp @@ -114,6 +114,8 @@ void PairBornCoulWolfGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_born_gpu.cpp b/src/GPU/pair_born_gpu.cpp index 905278cdb7..9499cd7307 100644 --- a/src/GPU/pair_born_gpu.cpp +++ b/src/GPU/pair_born_gpu.cpp @@ -109,6 +109,8 @@ void PairBornGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp index 125ffbfbbd..b6e1e8fbed 100644 --- a/src/GPU/pair_buck_coul_cut_gpu.cpp +++ b/src/GPU/pair_buck_coul_cut_gpu.cpp @@ -111,6 +111,8 @@ void PairBuckCoulCutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp index ca90b3e869..adae92d1ac 100644 --- a/src/GPU/pair_buck_coul_long_gpu.cpp +++ b/src/GPU/pair_buck_coul_long_gpu.cpp @@ -120,6 +120,8 @@ void PairBuckCoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp index d6dcdf30bc..4e11a2ec2d 100644 --- a/src/GPU/pair_buck_gpu.cpp +++ b/src/GPU/pair_buck_gpu.cpp @@ -107,6 +107,8 @@ void PairBuckGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_colloid_gpu.cpp b/src/GPU/pair_colloid_gpu.cpp index c0e85907bb..510c4ef12f 100644 --- a/src/GPU/pair_colloid_gpu.cpp +++ b/src/GPU/pair_colloid_gpu.cpp @@ -109,6 +109,8 @@ void PairColloidGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_coul_cut_gpu.cpp b/src/GPU/pair_coul_cut_gpu.cpp index d48ee1cb7b..240ed2f91e 100644 --- a/src/GPU/pair_coul_cut_gpu.cpp +++ b/src/GPU/pair_coul_cut_gpu.cpp @@ -108,6 +108,8 @@ void PairCoulCutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_coul_debye_gpu.cpp b/src/GPU/pair_coul_debye_gpu.cpp index ed9781c016..7d1fe8d546 100644 --- a/src/GPU/pair_coul_debye_gpu.cpp +++ b/src/GPU/pair_coul_debye_gpu.cpp @@ -109,6 +109,8 @@ void PairCoulDebyeGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_coul_dsf_gpu.cpp b/src/GPU/pair_coul_dsf_gpu.cpp index a4837ed8cb..bf207caf60 100644 --- a/src/GPU/pair_coul_dsf_gpu.cpp +++ b/src/GPU/pair_coul_dsf_gpu.cpp @@ -118,6 +118,8 @@ void PairCoulDSFGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_coul_long_cs_gpu.cpp b/src/GPU/pair_coul_long_cs_gpu.cpp index 921a294721..79c4c4ab7c 100644 --- a/src/GPU/pair_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_coul_long_cs_gpu.cpp @@ -123,6 +123,8 @@ void PairCoulLongCSGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_coul_long_gpu.cpp b/src/GPU/pair_coul_long_gpu.cpp index 0b773882b2..7ecb052f69 100644 --- a/src/GPU/pair_coul_long_gpu.cpp +++ b/src/GPU/pair_coul_long_gpu.cpp @@ -117,6 +117,8 @@ void PairCoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp index 716978deac..e4657cf2eb 100644 --- a/src/GPU/pair_dpd_gpu.cpp +++ b/src/GPU/pair_dpd_gpu.cpp @@ -256,6 +256,8 @@ void PairDPDGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp index 029bf7245e..4a7b05fd2c 100644 --- a/src/GPU/pair_dpd_tstat_gpu.cpp +++ b/src/GPU/pair_dpd_tstat_gpu.cpp @@ -272,6 +272,8 @@ void PairDPDTstatGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp index d1d73e415c..4b7693e989 100644 --- a/src/GPU/pair_eam_alloy_gpu.cpp +++ b/src/GPU/pair_eam_alloy_gpu.cpp @@ -138,6 +138,8 @@ void PairEAMAlloyGPU::compute(int eflag, int vflag) eam_alloy_gpu_compute_force(nullptr, eflag, vflag, eflag_atom, vflag_atom); else eam_alloy_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp index c1a4c74d52..9da4e20a6f 100644 --- a/src/GPU/pair_eam_fs_gpu.cpp +++ b/src/GPU/pair_eam_fs_gpu.cpp @@ -138,6 +138,8 @@ void PairEAMFSGPU::compute(int eflag, int vflag) eam_fs_gpu_compute_force(nullptr, eflag, vflag, eflag_atom, vflag_atom); else eam_fs_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index 17af6cfb22..4cb7c7f749 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -136,6 +136,8 @@ void PairEAMGPU::compute(int eflag, int vflag) eam_gpu_compute_force(nullptr, eflag, vflag, eflag_atom, vflag_atom); else eam_gpu_compute_force(ilist, eflag, vflag, eflag_atom, vflag_atom); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_gauss_gpu.cpp b/src/GPU/pair_gauss_gpu.cpp index 17b9e9a650..e6e4ccae1b 100644 --- a/src/GPU/pair_gauss_gpu.cpp +++ b/src/GPU/pair_gauss_gpu.cpp @@ -106,6 +106,8 @@ void PairGaussGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp index 5f12b1eaf4..c0b0c2ecb0 100644 --- a/src/GPU/pair_gayberne_gpu.cpp +++ b/src/GPU/pair_gayberne_gpu.cpp @@ -35,33 +35,39 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, const double mu, - double **shape, double **well, double **cutsq, double **sigma, double **epsilon, - double *host_lshape, int **form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, const int maxspecial, +int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, + const double mu, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, double *host_lshape, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void gb_gpu_clear(); -int **gb_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double **host_quat); -int *gb_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat); +int **gb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const int *ellipsoid, + const void *bonus); +int *gb_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, const int *ellipsoid, + const void *bonus); double gb_gpu_bytes(); enum { SPHERE_SPHERE, SPHERE_ELLIPSE, ELLIPSE_SPHERE, ELLIPSE_ELLIPSE }; /* ---------------------------------------------------------------------- */ -PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), gpu_mode(GPU_FORCE) +PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), + gpu_mode(GPU_FORCE) { - quat_nmax = 0; reinitflag = 0; - quat = nullptr; suffix_flag |= Suffix::GPU; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -74,7 +80,6 @@ PairGayBerneGPU::~PairGayBerneGPU() { gb_gpu_clear(); cpu_time = 0.0; - memory->destroy(quat); } /* ---------------------------------------------------------------------- */ @@ -89,21 +94,8 @@ void PairGayBerneGPU::compute(int eflag, int vflag) bool success = true; int *ilist, *numneigh, **firstneigh; - if (nall > quat_nmax) { - quat_nmax = static_cast(1.1 * nall); - memory->grow(quat, quat_nmax, 4, "pair:quat"); - } AtomVecEllipsoid::Bonus *bonus = avec->bonus; int *ellipsoid = atom->ellipsoid; - for (int i = 0; i < nall; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i][0] = bonus[qi].quat[0]; - quat[i][1] = bonus[qi].quat[1]; - quat[i][2] = bonus[qi].quat[2]; - quat[i][3] = bonus[qi].quat[3]; - } - } if (gpu_mode != GPU_FORCE) { double sublo[3], subhi[3]; @@ -119,19 +111,24 @@ void PairGayBerneGPU::compute(int eflag, int vflag) } inum = atom->nlocal; firstneigh = - gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, - atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, success, quat); + gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, + subhi, atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, success, + ellipsoid, bonus); } else { inum = list->inum; numneigh = list->numneigh; firstneigh = list->firstneigh; - ilist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, list->ilist, numneigh, - firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, quat); + ilist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + list->ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, cpu_time, + success, ellipsoid, bonus); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); @@ -185,14 +182,13 @@ void PairGayBerneGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - gb_gpu_init(atom->ntypes + 1, gamma, upsilon, mu, shape2, well, cutsq, sigma, epsilon, lshape, - form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); + gb_gpu_init(atom->ntypes + 1, gamma, upsilon, mu, shape2, well, cutsq, + sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, + force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success, error, world); if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); - quat_nmax = static_cast(1.1 * (atom->nlocal + atom->nghost)); - memory->grow(quat, quat_nmax, 4, "pair:quat"); } /* ---------------------------------------------------------------------- */ @@ -200,12 +196,13 @@ void PairGayBerneGPU::init_style() double PairGayBerneGPU::memory_usage() { double bytes = Pair::memory_usage(); - return bytes + memory->usage(quat, quat_nmax) + gb_gpu_bytes(); + return bytes + gb_gpu_bytes(); } /* ---------------------------------------------------------------------- */ -void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, +void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag, + int /* vflag */, int *ilist, int *numneigh, int **firstneigh) { int i, j, ii, jj, jnum, itype, jtype; diff --git a/src/GPU/pair_gayberne_gpu.h b/src/GPU/pair_gayberne_gpu.h index 89d21b9046..1ce760352c 100644 --- a/src/GPU/pair_gayberne_gpu.h +++ b/src/GPU/pair_gayberne_gpu.h @@ -38,8 +38,6 @@ class PairGayBerneGPU : public PairGayBerne { private: int gpu_mode; double cpu_time; - int quat_nmax; - double **quat; }; } // namespace LAMMPS_NS diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 9d286d5db7..8287a7c09d 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -172,6 +172,15 @@ PairHippoGPU::~PairHippoGPU() hippo_gpu_clear(); } +/* ---------------------------------------------------------------------- */ + +void PairAmoebaGPU::compute(int eflag, int vflag) +{ + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); + PairAmoeba::compute(eflag, vflag); +} + /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index d160446d77..50f362bafc 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -28,6 +28,7 @@ class PairHippoGPU : public PairAmoeba { public: PairHippoGPU(LAMMPS *lmp); ~PairHippoGPU() override; + void compute(int, int) override; void init_style() override; double memory_usage() override; diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp index 5b1dd47340..f2371b14ef 100644 --- a/src/GPU/pair_lj96_cut_gpu.cpp +++ b/src/GPU/pair_lj96_cut_gpu.cpp @@ -106,6 +106,8 @@ void PairLJ96CutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp index d894d6acf1..dbaef3b929 100644 --- a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp @@ -101,6 +101,8 @@ void PairLJCharmmCoulCharmmGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp index 5153ea0b37..87d4896bde 100644 --- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp @@ -122,6 +122,8 @@ void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_class2_coul_long_gpu.cpp b/src/GPU/pair_lj_class2_coul_long_gpu.cpp index 2de9586596..90a4682e8f 100644 --- a/src/GPU/pair_lj_class2_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_class2_coul_long_gpu.cpp @@ -120,6 +120,8 @@ void PairLJClass2CoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_class2_gpu.cpp b/src/GPU/pair_lj_class2_gpu.cpp index 7d7edb773d..9668c1d63a 100644 --- a/src/GPU/pair_lj_class2_gpu.cpp +++ b/src/GPU/pair_lj_class2_gpu.cpp @@ -106,6 +106,8 @@ void PairLJClass2GPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cubic_gpu.cpp b/src/GPU/pair_lj_cubic_gpu.cpp index 4a1316a00a..bec2465b84 100644 --- a/src/GPU/pair_lj_cubic_gpu.cpp +++ b/src/GPU/pair_lj_cubic_gpu.cpp @@ -111,6 +111,8 @@ void PairLJCubicGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp index 7bae62ff02..45f98d3ce8 100644 --- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp @@ -109,6 +109,8 @@ void PairLJCutCoulCutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp index 9c598a7572..86732defb5 100644 --- a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp @@ -112,6 +112,8 @@ void PairLJCutCoulDebyeGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp index 90c8b556dc..08d90b8b57 100644 --- a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp @@ -119,6 +119,8 @@ void PairLJCutCoulDSFGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp index 5094bdc7c9..c70fe555d0 100644 --- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp @@ -122,6 +122,8 @@ void PairLJCutCoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp index c1aaa6323a..aa1fa45ec2 100644 --- a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp @@ -112,6 +112,8 @@ void PairLJCutCoulMSMGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp index cac0582138..b71e526bf2 100644 --- a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp @@ -113,6 +113,8 @@ void PairLJCutDipoleCutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp index 9489a43389..df1a2d78ba 100644 --- a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp @@ -125,6 +125,8 @@ void PairLJCutDipoleLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp index 422990e1cb..46dd67dc94 100644 --- a/src/GPU/pair_lj_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_gpu.cpp @@ -109,6 +109,8 @@ void PairLJCutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp index 3830e5dd06..d7eaf4b006 100644 --- a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp @@ -131,6 +131,8 @@ void PairLJCutTIP4PLongGPU::compute(int eflag, int vflag) success, atom->q, atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_lj_expand_coul_long_gpu.cpp b/src/GPU/pair_lj_expand_coul_long_gpu.cpp index c9ffd0ac23..35cb18122a 100644 --- a/src/GPU/pair_lj_expand_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_expand_coul_long_gpu.cpp @@ -123,6 +123,8 @@ void PairLJExpandCoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp index 8d7dcf2c21..1e1eac603b 100644 --- a/src/GPU/pair_lj_expand_gpu.cpp +++ b/src/GPU/pair_lj_expand_gpu.cpp @@ -107,6 +107,8 @@ void PairLJExpandGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_gromacs_gpu.cpp b/src/GPU/pair_lj_gromacs_gpu.cpp index 424bce480c..8bb901f961 100644 --- a/src/GPU/pair_lj_gromacs_gpu.cpp +++ b/src/GPU/pair_lj_gromacs_gpu.cpp @@ -108,6 +108,8 @@ void PairLJGromacsGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp index 9bd5dc4749..4d8fbb5139 100644 --- a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp +++ b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp @@ -112,6 +112,8 @@ void PairLJSFDipoleSFGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp index 8ae295282e..5451f4a4f4 100644 --- a/src/GPU/pair_lj_smooth_gpu.cpp +++ b/src/GPU/pair_lj_smooth_gpu.cpp @@ -113,6 +113,8 @@ void PairLJSmoothGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_lj_spica_coul_long_gpu.cpp b/src/GPU/pair_lj_spica_coul_long_gpu.cpp index b315b8cc57..4317c04220 100644 --- a/src/GPU/pair_lj_spica_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_spica_coul_long_gpu.cpp @@ -125,6 +125,8 @@ void PairLJSPICACoulLongGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); if (evflag) { diff --git a/src/GPU/pair_lj_spica_gpu.cpp b/src/GPU/pair_lj_spica_gpu.cpp index 71756a8c26..d531e27284 100644 --- a/src/GPU/pair_lj_spica_gpu.cpp +++ b/src/GPU/pair_lj_spica_gpu.cpp @@ -110,6 +110,8 @@ void PairLJSPICAGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); if (evflag) { diff --git a/src/GPU/pair_mie_cut_gpu.cpp b/src/GPU/pair_mie_cut_gpu.cpp index 075546588a..0dabf9f3e2 100644 --- a/src/GPU/pair_mie_cut_gpu.cpp +++ b/src/GPU/pair_mie_cut_gpu.cpp @@ -107,6 +107,8 @@ void PairMIECutGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp index b0ac2cce14..570027c1d8 100644 --- a/src/GPU/pair_morse_gpu.cpp +++ b/src/GPU/pair_morse_gpu.cpp @@ -105,6 +105,8 @@ void PairMorseGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp index c0e700c5e6..8c1d1cec17 100644 --- a/src/GPU/pair_resquared_gpu.cpp +++ b/src/GPU/pair_resquared_gpu.cpp @@ -35,21 +35,28 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, double **sigma, - double **epsilon, int **form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int re_gpu_init(const int ntypes, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); void re_gpu_clear(); -int **re_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double **host_quat); -int *re_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat); +int **re_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const int *ellipsoid, + const void *bonus); +int *re_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, const int *ellipsoid, + const void *bonus); double re_gpu_bytes(); enum { SPHERE_SPHERE, SPHERE_ELLIPSE, ELLIPSE_SPHERE, ELLIPSE_ELLIPSE }; @@ -61,8 +68,6 @@ PairRESquaredGPU::PairRESquaredGPU(LAMMPS *lmp) : PairRESquared(lmp), gpu_mode(G reinitflag = 0; avec = dynamic_cast(atom->style_match("ellipsoid")); if (!avec) error->all(FLERR, "Pair resquared/gpu requires atom style ellipsoid"); - quat_nmax = 0; - quat = nullptr; suffix_flag |= Suffix::GPU; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -75,7 +80,6 @@ PairRESquaredGPU::~PairRESquaredGPU() { re_gpu_clear(); cpu_time = 0.0; - memory->destroy(quat); } /* ---------------------------------------------------------------------- */ @@ -90,21 +94,8 @@ void PairRESquaredGPU::compute(int eflag, int vflag) bool success = true; int *ilist, *numneigh, **firstneigh; - if (nall > quat_nmax) { - quat_nmax = static_cast(1.1 * nall); - memory->grow(quat, quat_nmax, 4, "pair:quat"); - } AtomVecEllipsoid::Bonus *bonus = avec->bonus; int *ellipsoid = atom->ellipsoid; - for (int i = 0; i < nall; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i][0] = bonus[qi].quat[0]; - quat[i][1] = bonus[qi].quat[1]; - quat[i][2] = bonus[qi].quat[2]; - quat[i][3] = bonus[qi].quat[3]; - } - } if (gpu_mode != GPU_FORCE) { double sublo[3], subhi[3]; @@ -120,19 +111,24 @@ void PairRESquaredGPU::compute(int eflag, int vflag) } inum = atom->nlocal; firstneigh = - re_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, - atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, success, quat); + re_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, + subhi, atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success, ellipsoid, + bonus); } else { inum = list->inum; numneigh = list->numneigh; firstneigh = list->firstneigh; - ilist = re_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, list->ilist, numneigh, - firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, quat); + ilist = re_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + list->ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, cpu_time, + success, ellipsoid, bonus); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); @@ -184,14 +180,13 @@ void PairRESquaredGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - re_gpu_init(atom->ntypes + 1, shape1, well, cutsq, sigma, epsilon, form, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, mnf, - maxspecial, cell_size, gpu_mode, screen); + re_gpu_init(atom->ntypes + 1, shape1, well, cutsq, sigma, epsilon, form, + lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, + atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, + gpu_mode, screen); GPU_EXTRA::check_flag(success, error, world); if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); - quat_nmax = static_cast(1.1 * (atom->nlocal + atom->nghost)); - memory->grow(quat, quat_nmax, 4, "pair:quat"); } /* ---------------------------------------------------------------------- */ @@ -199,7 +194,7 @@ void PairRESquaredGPU::init_style() double PairRESquaredGPU::memory_usage() { double bytes = Pair::memory_usage(); - return bytes + memory->usage(quat, quat_nmax) + re_gpu_bytes(); + return bytes + re_gpu_bytes(); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_resquared_gpu.h b/src/GPU/pair_resquared_gpu.h index 6d79952c39..825655a61d 100644 --- a/src/GPU/pair_resquared_gpu.h +++ b/src/GPU/pair_resquared_gpu.h @@ -38,8 +38,6 @@ class PairRESquaredGPU : public PairRESquared { private: int gpu_mode; double cpu_time; - int quat_nmax; - double **quat; }; } // namespace LAMMPS_NS diff --git a/src/GPU/pair_soft_gpu.cpp b/src/GPU/pair_soft_gpu.cpp index 973e82c13a..9d406d1eaa 100644 --- a/src/GPU/pair_soft_gpu.cpp +++ b/src/GPU/pair_soft_gpu.cpp @@ -108,6 +108,8 @@ void PairSoftGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_sw_gpu.cpp b/src/GPU/pair_sw_gpu.cpp index 67c52e0602..7645218a85 100644 --- a/src/GPU/pair_sw_gpu.cpp +++ b/src/GPU/pair_sw_gpu.cpp @@ -114,6 +114,8 @@ void PairSWGPU::compute(int eflag, int vflag) success); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp index 6615710b8a..ec927a7845 100644 --- a/src/GPU/pair_table_gpu.cpp +++ b/src/GPU/pair_table_gpu.cpp @@ -107,6 +107,8 @@ void PairTableGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_tersoff_gpu.cpp b/src/GPU/pair_tersoff_gpu.cpp index 9f0c8fa883..8610a3880c 100644 --- a/src/GPU/pair_tersoff_gpu.cpp +++ b/src/GPU/pair_tersoff_gpu.cpp @@ -118,6 +118,8 @@ void PairTersoffGPU::compute(int eflag, int vflag) cpu_time, success); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_tersoff_mod_gpu.cpp b/src/GPU/pair_tersoff_mod_gpu.cpp index 15bfc9a85e..1bb09c1403 100644 --- a/src/GPU/pair_tersoff_mod_gpu.cpp +++ b/src/GPU/pair_tersoff_mod_gpu.cpp @@ -117,6 +117,8 @@ void PairTersoffMODGPU::compute(int eflag, int vflag) host_start, cpu_time, success); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_tersoff_zbl_gpu.cpp b/src/GPU/pair_tersoff_zbl_gpu.cpp index 68b0d9dfa7..8d5e05ce4c 100644 --- a/src/GPU/pair_tersoff_zbl_gpu.cpp +++ b/src/GPU/pair_tersoff_zbl_gpu.cpp @@ -121,6 +121,8 @@ void PairTersoffZBLGPU::compute(int eflag, int vflag) host_start, cpu_time, success); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_ufm_gpu.cpp b/src/GPU/pair_ufm_gpu.cpp index 099bfe1e63..d1c099f9fb 100644 --- a/src/GPU/pair_ufm_gpu.cpp +++ b/src/GPU/pair_ufm_gpu.cpp @@ -111,6 +111,8 @@ void PairUFMGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_vashishta_gpu.cpp b/src/GPU/pair_vashishta_gpu.cpp index 0fb0491ad3..38ad2b3c57 100644 --- a/src/GPU/pair_vashishta_gpu.cpp +++ b/src/GPU/pair_vashishta_gpu.cpp @@ -116,6 +116,8 @@ void PairVashishtaGPU::compute(int eflag, int vflag) cpu_time, success); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_yukawa_colloid_gpu.cpp b/src/GPU/pair_yukawa_colloid_gpu.cpp index 8701a9ee80..c1e785380d 100644 --- a/src/GPU/pair_yukawa_colloid_gpu.cpp +++ b/src/GPU/pair_yukawa_colloid_gpu.cpp @@ -108,6 +108,8 @@ void PairYukawaColloidGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp index e2caef9515..b27361e32d 100644 --- a/src/GPU/pair_yukawa_gpu.cpp +++ b/src/GPU/pair_yukawa_gpu.cpp @@ -106,6 +106,8 @@ void PairYukawaGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/GPU/pair_zbl_gpu.cpp b/src/GPU/pair_zbl_gpu.cpp index cbb2c198f7..a1fb3e4663 100644 --- a/src/GPU/pair_zbl_gpu.cpp +++ b/src/GPU/pair_zbl_gpu.cpp @@ -108,6 +108,8 @@ void PairZBLGPU::compute(int eflag, int vflag) } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); if (host_start < inum) { cpu_time = platform::walltime(); cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); diff --git a/src/MAKE/OPTIONS/Makefile.oneapi b/src/MAKE/OPTIONS/Makefile.oneapi index d34f0900c6..7f450d5340 100644 --- a/src/MAKE/OPTIONS/Makefile.oneapi +++ b/src/MAKE/OPTIONS/Makefile.oneapi @@ -6,16 +6,16 @@ SHELL = /bin/sh # compiler/linker settings # specify flags and libraries needed for your compiler -CC = mpiicpc -std=c++11 -OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -qopenmp -qopenmp-simd -qno-offload -ansi-alias -restrict \ +CC = mpiicpc -cxx=icpx -std=c++11 +OPTFLAGS = -xHost -O2 -ffast-math -freciprocal-math +CCFLAGS = -qopenmp-simd -qopenmp -ansi-alias \ -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) \ -I$(MKLROOT)/include SHFLAGS = -fPIC DEPFLAGS = -M -LINK = mpiicpc -std=c++11 -LINKFLAGS = -qopenmp -qopenmp-simd $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/ +LINK = mpiicpc -cxx=icpx -std=c++11 +LINKFLAGS = -qopenmp-simd -qopenmp $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/ LIB = -ltbbmalloc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core SIZE = size diff --git a/src/neighbor.cpp b/src/neighbor.cpp index 05371c8259..90e0a81fd0 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -291,6 +291,10 @@ Neighbor::~Neighbor() void Neighbor::init() { + #ifdef LMP_GPU + overlap_topo = 0; + #endif + int i,j,n; ncalls = ndanger = 0; @@ -2434,7 +2438,13 @@ void Neighbor::build(int topoflag) // build topology lists for bonds/angles/etc + #ifdef LMP_GPU + if (overlap_topo == 0) { + if ((atom->molecular != Atom::ATOMIC) && topoflag) build_topology(); + } + #else if ((atom->molecular != Atom::ATOMIC) && topoflag) build_topology(); + #endif } /* ---------------------------------------------------------------------- @@ -2817,6 +2827,17 @@ int Neighbor::exclude_setting() return exclude; } +/* ---------------------------------------------------------------------- + If nonzero, call build_topology from GPU styles instead to overlap comp +------------------------------------------------------------------------- */ + +#ifdef LMP_GPU +void Neighbor::set_overlap_topo(int s) +{ + overlap_topo = s; +} +#endif + /* ---------------------------------------------------------------------- check if any of the old requested neighbor lists are full ------------------------------------------------------------------------- */ diff --git a/src/neighbor.h b/src/neighbor.h index 241f44be06..dd638880c7 100644 --- a/src/neighbor.h +++ b/src/neighbor.h @@ -155,6 +155,12 @@ class Neighbor : protected Pointers { void exclusion_group_group_delete(int, int); // rm a group-group exclusion int exclude_setting(); // return exclude value to accelerator pkg + // Option to call build_topology from gpu styles instead for overlapped comp + #ifdef LMP_GPU + void set_overlap_topo(int); + int overlap_topo; // 0 for default/old non-overlap mode + #endif + // find a neighbor list based on requestor NeighList *find_list(void *, const int id = 0) const; // find a neighbor request based on requestor