diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h
index 3135594dc3..66ca6ab527 100644
--- a/lib/gpu/geryon/ocl_mat.h
+++ b/lib/gpu/geryon/ocl_mat.h
@@ -54,6 +54,6 @@ namespace ucl_opencl {
 #include "ucl_print.h"
 #undef UCL_PRINT_ALLOW
 
-} // namespace ucl_cudart
+} // namespace ucl_opencl
 
 #endif
diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp
index e103699d40..0ddd24d21e 100644
--- a/lib/gpu/lal_base_dpd.cpp
+++ b/lib/gpu/lal_base_dpd.cpp
@@ -56,7 +56,8 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
                           const int max_nbors, const int maxspecial,
                           const double cell_size, const double gpu_split,
                           FILE *_screen, const void *pair_program,
-                          const char *k_name, const int onetype) {
+                          const char *k_name, const int onetype,
+                          const int extra_fields) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -75,7 +76,8 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
   bool charge = false;
   bool rot = false;
   bool vel = true;
-  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
+  _extra_fields = extra_fields;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4);
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h
index 9eb56993af..6e9451d72a 100644
--- a/lib/gpu/lal_base_dpd.h
+++ b/lib/gpu/lal_base_dpd.h
@@ -53,7 +53,7 @@ class BaseDPD {
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
                   const void *pair_program, const char *k_name,
-                  const int onetype=0);
+                  const int onetype=0, const int extra_fields=0);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead();
@@ -167,7 +167,6 @@ class BaseDPD {
   /// Atom Data
   Atom<numtyp,acctyp> *atom;
 
-
   // ------------------------ FORCE/ENERGY DATA -----------------------
 
   Answer<numtyp,acctyp> *ans;
@@ -197,6 +196,8 @@ class BaseDPD {
   numtyp _dtinvsqrt;
   int _seed, _timestep;
 
+  int _extra_fields;
+
  protected:
   bool _compiled;
   int _block_size, _threads_per_atom, _onetype;
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 70ba373a65..e9ef2294b2 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -364,6 +364,12 @@ int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu,
   } else
     _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size);
 
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  _use_old_nbor_build = 0;
+  #else
+  _use_old_nbor_build = 1;
+  #endif
+
   return flag;
 }
 
@@ -510,9 +516,13 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
     gpu_nbor=1;
   else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
     gpu_nbor=2;
+
+  // NOTE: enforce the hybrid mode (binning on the CPU)
+  //       when not using sorting on the device
   #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT)
   if (gpu_nbor==1) gpu_nbor=2;
   #endif
+  // or when the device supports subgroups
   #ifndef LAL_USE_OLD_NEIGHBOR
   if (gpu_nbor==1) gpu_nbor=2;
   #endif
@@ -886,19 +896,31 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
   }
   if (times[5] > 0.0)
     fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
-  fprintf(screen,"Average split:   %.4f.\n",avg_split);
-  fprintf(screen,"Lanes / atom:    %d.\n",threads_per_atom);
-  fprintf(screen,"Vector width:    %d.\n", simd_size());
-  fprintf(screen,"Prefetch mode:   ");
-  if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n");
-  else if (_nbor_prefetch==1) fprintf(screen,"API.\n");
-  else fprintf(screen,"None.\n");
-  fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
   if (nbor.gpu_nbor()==2)
     fprintf(screen,"CPU Neighbor:    %.4f s.\n",times[8]/_replica_size);
   fprintf(screen,"CPU Cast/Pack:   %.4f s.\n",times[4]/_replica_size);
   fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
   fprintf(screen,"CPU Idle_Time:   %.4f s.\n",times[7]/_replica_size);
+  fprintf(screen,"Average split:   %.4f.\n",avg_split);
+  fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
+  fprintf(screen,"Prefetch mode:   ");
+  if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n");
+  else if (_nbor_prefetch==1) fprintf(screen,"API.\n");
+  else fprintf(screen,"None.\n");
+  fprintf(screen,"Vector width:    %d.\n", simd_size());
+  fprintf(screen,"Lanes / atom:    %d.\n",threads_per_atom);
+  fprintf(screen,"Pair block:      %d.\n",_block_pair);
+  fprintf(screen,"Neigh block:     %d.\n",_block_nbor_build);
+  if (nbor.gpu_nbor()==2) {
+    fprintf(screen,"Neigh mode:      Hybrid (binning on host)");
+    if (_use_old_nbor_build == 1) fprintf(screen," - legacy\n");
+    else fprintf(screen," with subgroup support\n");
+  } else if (nbor.gpu_nbor()==1) {
+    fprintf(screen,"Neigh mode:      Device");
+    if (_use_old_nbor_build == 1) fprintf(screen," - legacy\n");
+    else fprintf(screen," - with subgroup support\n");
+  } else if (nbor.gpu_nbor()==0)
+    fprintf(screen,"Neigh mode:      Host\n");
   fprintf(screen,"-------------------------------------");
   fprintf(screen,"--------------------------------\n\n");
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index ba693e551a..d6b52484f1 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -347,6 +347,7 @@ class Device {
   int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
   int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;
   int _nbor_prefetch;
+  int _use_old_nbor_build;
 
   UCL_Program *dev_program;
   UCL_Kernel k_zero, k_info;
diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h
index bc4fce40a5..ace5a26339 100644
--- a/lib/gpu/lal_lj_coul_long.h
+++ b/lib/gpu/lal_lj_coul_long.h
@@ -78,7 +78,7 @@ class LJCoulLong : public BaseCharge<numtyp, acctyp> {
 
   numtyp _cut_coulsq, _qqrd2e, _g_ewald;
 
- private:
+protected:
   bool _allocated;
   int loop(const int eflag, const int vflag);
 };
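Note on the per-type coefficient layout relied on by the new lal_*_soft files that follow: init() packs four ntypes x ntypes host tables at a time into device vectors of quartets (lj1.x/.y/.z/.w and lj3.x/.y/.z/.w) via Atom::type_pack4, with the pair (itype,jtype) flattened at itype*lj_types+jtype and lj_types possibly padded up to max_shared_types for the fast kernels. The standalone sketch below only illustrates that assumed layout; the struct, function, and array names here are illustrative and not part of the library.

// Illustrative sketch of the assumed quartet packing (not library code).
#include <cstdio>
#include <vector>

struct Quad { double x, y, z, w; };   // stands in for numtyp4 on the host

// Pack four ntypes x ntypes host tables into one flat lj_types x lj_types
// table of quartets, mirroring the layout the init() calls rely on.
static std::vector<Quad> pack4(int ntypes, int lj_types,
                               const std::vector<std::vector<double>> &a,
                               const std::vector<std::vector<double>> &b,
                               const std::vector<std::vector<double>> &c,
                               const std::vector<std::vector<double>> &d) {
  std::vector<Quad> out(lj_types * lj_types, Quad{0.0, 0.0, 0.0, 0.0});
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++)
      out[i * lj_types + j] = Quad{a[i][j], b[i][j], c[i][j], d[i][j]};
  return out;
}

int main() {
  const int ntypes = 3, lj_types = 4;   // lj_types may be padded for the fast kernels
  std::vector<std::vector<double>> lj1(ntypes, std::vector<double>(ntypes, 1.0));
  std::vector<std::vector<double>> lj2 = lj1, cut_ljsq = lj1, cut_coulsq = lj1;
  std::vector<Quad> packed = pack4(ntypes, lj_types, lj1, lj2, cut_ljsq, cut_coulsq);
  printf("quartet (1,2): %g %g %g %g\n",
         packed[1 * lj_types + 2].x, packed[1 * lj_types + 2].y,
         packed[1 * lj_types + 2].z, packed[1 * lj_types + 2].w);
  return 0;
}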
diff --git a/lib/gpu/lal_lj_coul_long_soft.cpp b/lib/gpu/lal_lj_coul_long_soft.cpp
new file mode 100644
index 0000000000..2b90333bae
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_long_soft.cpp
@@ -0,0 +1,174 @@
+/***************************************************************************
+                            lj_coul_long_soft.cpp
+                             -------------------
+                           Trung Nguyen (U Chicago)
+
+  Class for acceleration of the lj/cut/coul/long/soft pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndtrung@uchicago.edu
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "lj_coul_long_soft_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_coul_long_soft=0;
+#else
+#include "lj_coul_long_soft_cubin.h"
+#endif
+
+#include "lal_lj_coul_long_soft.h"
+#include <cassert>
+namespace LAMMPS_AL {
+#define LJCoulLongSoftT LJCoulLongSoft<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+LJCoulLongSoftT::LJCoulLongSoft() : BaseCharge<numtyp,acctyp>(),
+                                    _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+LJCoulLongSoftT::~LJCoulLongSoft() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int LJCoulLongSoftT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int LJCoulLongSoftT::init(const int ntypes,
+                          double **host_cutsq, double **host_lj1,
+                          double **host_lj2, double **host_lj3,
+                          double **host_lj4, double **host_offset, double **host_epsilon,
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen,
+                          double **host_cut_ljsq, const double host_cut_coulsq,
+                          double *host_special_coul, const double qqrd2e,
+                          const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj_coul_long_soft,"k_lj_coul_long_soft");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_cut_ljsq);
+
+  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset, host_epsilon);
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_coulsq=host_cut_coulsq;
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void LJCoulLongSoftT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                             double **host_lj2, double **host_lj3, double **host_lj4,
+                             double **host_offset, double **host_epsilon, double **host_cut_ljsq) {
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq, host_cut_ljsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset, host_epsilon);
+}
+
+template <class numtyp, class acctyp>
+void LJCoulLongSoftT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  lj1.clear();
+  lj3.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double LJCoulLongSoftT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(LJCoulLongSoft<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int LJCoulLongSoftT::loop(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                          &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q, &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+  return GX;
+}
+
+template class LJCoulLongSoft<PRECISION,ACC_PRECISION>;
+}
diff --git a/lib/gpu/lal_lj_coul_long_soft.cu b/lib/gpu/lal_lj_coul_long_soft.cu
new file mode 100644
index 0000000000..242b52fb79
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_long_soft.cu
@@ -0,0 +1,288 @@
+// **************************************************************************
+//                            lj_coul_long_soft.cu
+//                             -------------------
+//                           Trung Nguyen (U Chicago)
+//
+//  Device code for acceleration of the lj/cut/coul/long/soft pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : ndtrung@uchicago.edu
+// ***************************************************************************
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+#else
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_lj_coul_long_soft(const __global numtyp4 *restrict x_,
+                                  const __global numtyp4 *restrict lj1,
+                                  const __global numtyp4 *restrict lj3,
+                                  const int lj_types,
+                                  const __global numtyp *restrict sp_lj_in,
+                                  const __global int *dev_nbor,
+                                  const __global int *dev_packed,
+                                  __global acctyp4 *restrict ans,
+                                  __global acctyp *restrict engv,
+                                  const int eflag, const int vflag, const int inum,
+                                  const int nbor_pitch,
+                                  const __global numtyp *restrict q_,
+                                  const numtyp cut_coulsq, const numtyp qqrd2e,
+                                  const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp sp_lj[8];
+  int n_stride;
+  local_allocate_store_charge();
+
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }
+
+  if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_long_soft.h b/lib/gpu/lal_lj_coul_long_soft.h
new file mode 100644
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_long_soft.h
+#ifndef LAL_LJ_COUL_LONG_SOFT_H
+#define LAL_LJ_COUL_LONG_SOFT_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class LJCoulLongSoft : public BaseCharge<numtyp, acctyp> {
+ public:
+  LJCoulLongSoft();
+  ~LJCoulLongSoft();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double **host_epsilon, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+
+  /// Send updated coeffs from host to device (to be compatible with fix adapt)
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_epsilon, double **host_cut_ljsq);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = epsilon
+  UCL_D_Vec<numtyp4> lj3;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _cut_coulsq, _qqrd2e, _g_ewald;
+
+protected:
+  bool _allocated;
+  int loop(const int eflag, const int vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_lj_coul_long_soft_ext.cpp b/lib/gpu/lal_lj_coul_long_soft_ext.cpp
new file mode 100644
index 0000000000..efccbd6ab8
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_long_soft_ext.cpp
@@ -0,0 +1,151 @@
+/***************************************************************************
+                          lj_coul_long_soft_ext.cpp
+                             -------------------
+                           Trung Nguyen (U Chicago)
+
+  Functions for LAMMPS access to lj/cut/coul/long/soft acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndtrung@uchicago.edu
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_lj_coul_long_soft.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static LJCoulLongSoft<PRECISION,ACC_PRECISION> LJCLSMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int ljcls_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                   double **host_lj2, double **host_lj3, double **host_lj4,
+                   double **offset, double **epsilon, double *special_lj, const int inum,
+                   const int nall, const int max_nbors, const int maxspecial,
+                   const double cell_size, int &gpu_mode, FILE *screen,
+                   double **host_cut_ljsq, double host_cut_coulsq,
+                   double *host_special_coul, const double qqrd2e,
+                   const double g_ewald) {
+  LJCLSMF.clear();
+  gpu_mode=LJCLSMF.device->gpu_mode();
+  double gpu_split=LJCLSMF.device->particle_split();
+  int first_gpu=LJCLSMF.device->first_device();
+  int last_gpu=LJCLSMF.device->last_device();
+  int world_me=LJCLSMF.device->world_me();
+  int gpu_rank=LJCLSMF.device->gpu_rank();
+  int procs_per_gpu=LJCLSMF.device->procs_per_gpu();
+
+  LJCLSMF.device->init_message(screen,"lj/cut/coul/long/soft",first_gpu,last_gpu);
+
+  bool message=false;
+  if (LJCLSMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCLSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, epsilon, special_lj, inum, nall, max_nbors, maxspecial,
+                         cell_size, gpu_split, screen, host_cut_ljsq,
+                         host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
+  LJCLSMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCLSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                           offset, epsilon, special_lj, inum, nall, max_nbors,
+                           maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                           host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
+    LJCLSMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    LJCLSMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void ljcls_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                      double **host_lj2, double **host_lj3, double **host_lj4,
+                      double **offset, double **epsilon, double **host_cut_ljsq) {
+  int world_me=LJCLSMF.device->world_me();
+  int gpu_rank=LJCLSMF.device->gpu_rank();
+  int procs_per_gpu=LJCLSMF.device->procs_per_gpu();
+
+  if (world_me==0)
+    LJCLSMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                   offset, epsilon, host_cut_ljsq);
+  LJCLSMF.device->world_barrier();
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      LJCLSMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                     offset, epsilon, host_cut_ljsq);
+    LJCLSMF.device->gpu_barrier();
+  }
+}
+
+void ljcls_gpu_clear() {
+  LJCLSMF.clear();
+}
+
+int** ljcls_gpu_compute_n(const int ago, const int inum_full,
+                          const int nall, double **host_x, int *host_type,
+                          double *sublo, double *subhi, tagint *tag, int **nspecial,
+                          tagint **special, const bool eflag, const bool vflag,
+                          const bool eatom, const bool vatom, int &host_start,
+                          int **ilist, int **jnum, const double cpu_time,
+                          bool &success, double *host_q, double *boxlo,
+                          double *prd) {
+  return LJCLSMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                         subhi, tag, nspecial, special, eflag, vflag, eatom,
+                         vatom, host_start, ilist, jnum, cpu_time, success,
+                         host_q, boxlo, prd);
+}
+
+void ljcls_gpu_compute(const int ago, const int inum_full, const int nall,
+                       double **host_x, int *host_type, int *ilist, int *numj,
+                       int **firstneigh, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       const double cpu_time, bool &success, double *host_q,
+                       const int nlocal, double *boxlo, double *prd) {
+  LJCLSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                  firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                  host_q,nlocal,boxlo,prd);
+}
+
+double ljcls_gpu_bytes() {
+  return LJCLSMF.host_memory_usage();
+}
+
diff --git a/lib/gpu/lal_lj_coul_soft.cpp b/lib/gpu/lal_lj_coul_soft.cpp
new file mode 100644
index 0000000000..96bcf41aa3
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_soft.cpp
@@ -0,0 +1,157 @@
+/***************************************************************************
+                               lj_coul_soft.cpp
+                             -------------------
+                           Trung Nguyen (U Chicago)
+
+  Class for acceleration of the lj/cut/coul/cut/soft pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndtrung@uchicago.edu
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "lj_coul_soft_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_coul_soft=0;
+#else
+#include "lj_coul_soft_cubin.h"
+#endif
+
+#include "lal_lj_coul_soft.h"
+#include <cassert>
+namespace LAMMPS_AL {
+#define LJCoulSoftT LJCoulSoft<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+LJCoulSoftT::LJCoulSoft() : BaseCharge<numtyp,acctyp>(),
+                            _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+LJCoulSoftT::~LJCoulSoft() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int LJCoulSoftT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int LJCoulSoftT::init(const int ntypes,
+                      double **host_cutsq, double **host_lj1,
+                      double **host_lj2, double **host_lj3,
+                      double **host_lj4, double **host_offset, double **host_epsilon,
+                      double *host_special_lj, const int nlocal,
+                      const int nall, const int max_nbors,
+                      const int maxspecial, const double cell_size,
+                      const double gpu_split, FILE *_screen,
+                      double **host_cut_ljsq, double **host_cut_coulsq,
+                      double *host_special_coul, const double qqrd2e) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,lj_coul_soft,"k_lj_coul_soft");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cut_ljsq, host_cut_coulsq);
+
+  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset, host_epsilon);
+
+  cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq);
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _qqrd2e=qqrd2e;
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
+                   sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void LJCoulSoftT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  lj1.clear();
+  lj3.clear();
+  cutsq.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double LJCoulSoftT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(LJCoulSoft<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int LJCoulSoftT::loop(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+  if (shared_types) {
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                          &cutsq, &_qqrd2e, &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj,
+                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                     &this->ans->force, &this->ans->engv,
+                     &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                     &cutsq, &_qqrd2e, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+  return GX;
+}
+
+template class LJCoulSoft<PRECISION,ACC_PRECISION>;
+}
diff --git a/lib/gpu/lal_lj_coul_soft.cu b/lib/gpu/lal_lj_coul_soft.cu
new file mode 100644
index 0000000000..beea2ded43
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_soft.cu
@@ -0,0 +1,275 @@
+// **************************************************************************
+//                               lj_coul_soft.cu
+//                             -------------------
+//                           Trung Nguyen (U Chicago)
+//
+//  Device code for acceleration of the lj/cut/coul/cut/soft pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : ndtrung@uchicago.edu
+// ***************************************************************************
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+#else
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_lj_coul_soft(const __global numtyp4 *restrict x_,
+                             const __global numtyp4 *restrict lj1,
+                             const __global numtyp4 *restrict lj3,
+                             const int lj_types,
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global int *dev_nbor,
+                             const __global int *dev_packed,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv,
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch,
+                             const __global numtyp *restrict q_,
+                             const __global numtyp *restrict cutsq,
+                             const numtyp qqrd2e, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp sp_lj[8];
+  int n_stride;
+  local_allocate_store_charge();
+
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }
+
+  if (ii<inum) {
diff --git a/lib/gpu/lal_lj_coul_soft.h b/lib/gpu/lal_lj_coul_soft.h
new file mode 100644
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_soft.h
+#ifndef LAL_LJ_COUL_SOFT_H
+#define LAL_LJ_COUL_SOFT_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class LJCoulSoft : public BaseCharge<numtyp, acctyp> {
+ public:
+  LJCoulSoft();
+  ~LJCoulSoft();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, double **host_epsilon, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           double **host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = epsilon
+  UCL_D_Vec<numtyp4> lj3;
+  /// cutsq
+  UCL_D_Vec<numtyp> cutsq;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _qqrd2e;
+
+ private:
+  bool _allocated;
+  int loop(const int eflag, const int vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_lj_coul_soft_ext.cpp b/lib/gpu/lal_lj_coul_soft_ext.cpp
new file mode 100644
index 0000000000..8c10db14f6
--- /dev/null
+++ b/lib/gpu/lal_lj_coul_soft_ext.cpp
@@ -0,0 +1,128 @@
+/***************************************************************************
+                             lj_coul_soft_ext.cpp
+                             -------------------
+                           Trung Nguyen (U Chicago)
+
+  Functions for LAMMPS access to lj/cut/coul/cut/soft acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndtrung@uchicago.edu
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_lj_coul_soft.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static LJCoulSoft<PRECISION,ACC_PRECISION> LJCSMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double **offset, double **epsilon, double *special_lj, const int inum,
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double **host_cut_ljsq, double **host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e) {
+  LJCSMF.clear();
+  gpu_mode=LJCSMF.device->gpu_mode();
+  double gpu_split=LJCSMF.device->particle_split();
+  int first_gpu=LJCSMF.device->first_device();
+  int last_gpu=LJCSMF.device->last_device();
+  int world_me=LJCSMF.device->world_me();
+  int gpu_rank=LJCSMF.device->gpu_rank();
+  int procs_per_gpu=LJCSMF.device->procs_per_gpu();
+
+  LJCSMF.device->init_message(screen,"lj/cut/coul/cut/soft",first_gpu,last_gpu);
+
+  bool message=false;
+  if (LJCSMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=LJCSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, epsilon, special_lj, inum, nall, max_nbors,
+                        maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e);
+
+  LJCSMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJCSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                          host_lj4, offset, epsilon, special_lj, inum, nall, max_nbors,
+                          maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                          host_cut_coulsq, host_special_coul, qqrd2e);
+
+    LJCSMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    LJCSMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void ljcs_gpu_clear() {
+  LJCSMF.clear();
+}
+
+int** ljcs_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         tagint **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum, const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return LJCSMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
+}
+
+void ljcs_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_q,
+                      const int nlocal, double *boxlo, double *prd) {
+  LJCSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
+                 vflag,eatom,vatom,host_start,cpu_time,success,host_q,
+                 nlocal,boxlo,prd);
+}
+
+double ljcs_gpu_bytes() {
+  return LJCSMF.host_memory_usage();
+}
+
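The device kernels above and the cpu_compute() fallbacks in the host pair styles that follow evaluate the same soft-core expressions: denc = sqrt(lj4 + r^2) for the screened Coulomb term and denlj = lj3 + r^2*(r^4/lj2) for the Lennard-Jones term (the long-range variant additionally applies the erfc screening). The standalone sketch below reproduces those formulas for a single pair of the cut/cut variant; the coefficient values in main() are placeholders chosen so the expressions reduce to plain LJ plus Coulomb, and the helper names are illustrative, not part of the patch.

// Standalone sketch of the soft-core pair math used in cpu_compute()
// (lj/cut/coul/cut/soft form); coefficient values below are placeholders.
#include <cmath>
#include <cstdio>

struct PairCoeff {
  double lj1, lj2, lj3, lj4;   // per-pair coefficient tables, named as in the patch
  double epsilon, offset;
  double cut_ljsq, cut_coulsq;
};

// Returns F/r (so that F_x = fpair*delx) and fills evdwl/ecoul,
// mirroring the expressions in PairLJCutCoulCutSoftGPU::cpu_compute().
static double pair_soft(const PairCoeff &c, double rsq, double qiqj, double qqrd2e,
                        double factor_lj, double factor_coul,
                        double &evdwl, double &ecoul) {
  double forcecoul = 0.0, forcelj = 0.0;
  if (rsq < c.cut_coulsq) {
    const double denc = std::sqrt(c.lj4 + rsq);
    forcecoul = qqrd2e * c.lj1 * qiqj / (denc*denc*denc);
    ecoul = factor_coul * qqrd2e * c.lj1 * qiqj / denc;
  }
  if (rsq < c.cut_ljsq) {
    const double r4sig6 = rsq*rsq / c.lj2;
    const double denlj = c.lj3 + rsq*r4sig6;
    forcelj = c.lj1 * c.epsilon *
      (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj));
    evdwl = factor_lj * (c.lj1 * 4.0 * c.epsilon *
                         (1.0/(denlj*denlj) - 1.0/denlj) - c.offset);
  }
  return factor_coul*forcecoul + factor_lj*forcelj;
}

int main() {
  // Placeholder values (lj3 = lj4 = 0) reduce the forms to plain LJ plus Coulomb.
  PairCoeff c{1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 9.0, 9.0};
  double evdwl = 0.0, ecoul = 0.0;
  double fpair = pair_soft(c, 1.21, 1.0, 1.0, 1.0, 1.0, evdwl, ecoul);
  printf("fpair=%g evdwl=%g ecoul=%g\n", fpair, evdwl, ecoul);
  return 0;
}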
diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp
new file mode 100644
index 0000000000..7b0d115d96
--- /dev/null
+++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp
@@ -0,0 +1,249 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "pair_lj_cut_coul_cut_soft_gpu.h"
+
+#include "atom.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "gpu_extra.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+#include "suffix.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+// External functions from cuda library for atom decomposition
+
+int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2,
+                  double **host_lj3, double **host_lj4, double **offset, double **epsilon,
+                  double *special_lj, const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen,
+                  double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul,
+                  const double qqrd2e);
+void ljcs_gpu_clear();
+int **ljcs_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x,
+                         int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         tagint **special, const bool eflag, const bool vflag, const bool eatom,
+                         const bool vatom, int &host_start, int **ilist, int **jnum,
+                         const double cpu_time, bool &success, double *host_q, double *boxlo,
+                         double *prd);
+void ljcs_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type,
+                      int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start, const double cpu_time,
+                      bool &success, double *host_q, const int nlocal, double *boxlo, double *prd);
+double ljcs_gpu_bytes();
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulCutSoftGPU::PairLJCutCoulCutSoftGPU(LAMMPS *lmp) :
+    PairLJCutCoulCutSoft(lmp), gpu_mode(GPU_FORCE)
+{
+  respa_enable = 0;
+  reinitflag = 0;
+  cpu_time = 0.0;
+  suffix_flag |= Suffix::GPU;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairLJCutCoulCutSoftGPU::~PairLJCutCoulCutSoftGPU()
+{
+  ljcs_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutSoftGPU::compute(int eflag, int vflag)
+{
+  ev_init(eflag, vflag);
+
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+  if (gpu_mode != GPU_FORCE) {
+    double sublo[3], subhi[3];
+    if (domain->triclinic == 0) {
+      sublo[0] = domain->sublo[0];
+      sublo[1] = domain->sublo[1];
+      sublo[2] = domain->sublo[2];
+      subhi[0] = domain->subhi[0];
+      subhi[1] = domain->subhi[1];
+      subhi[2] = domain->subhi[2];
+    } else {
+      domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
+    }
+    inum = atom->nlocal;
+    firstneigh = ljcs_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi,
+                                    atom->tag, atom->nspecial, atom->special, eflag, vflag,
+                                    eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time,
+                                    success, atom->q, domain->boxlo, domain->prd);
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    ljcs_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh,
+                     eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q,
+                     atom->nlocal, domain->boxlo, domain->prd);
+  }
+  if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+
+  if (host_start < inum) {
+    cpu_time = platform::walltime();
+    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time = platform::walltime() - cpu_time;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutSoftGPU::init_style()
+{
+  if (!atom->q_flag) error->all(FLERR, "Pair style lj/cut/coul/cut/soft/gpu requires atom attribute q");
+
+  // Repeat cutsq calculation because done after call to init_style
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cut *= cut;
+        if (cut > maxcut) maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  int maxspecial = 0;
+  if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
+  int mnf = 5e-2 * neighbor->oneatom;
+  int success =
+      ljcs_gpu_init(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, force->special_lj,
+                    atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode,
+                    screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e);
+  GPU_EXTRA::check_flag(success, error, world);
+
+  if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulCutSoftGPU::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  return bytes + ljcs_gpu_bytes();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist,
+                                          int *numneigh, int **firstneigh)
+{
+  int i, j, ii, jj, jnum, itype, jtype, itable;
+  double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair;
+  double r, r2inv, r6inv, forcecoul, forcelj, factor_coul, factor_lj;
+  double denc, denlj, r4sig6;
+  int *jlist;
+  double rsq;
+
+  evdwl = ecoul = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx * delx + dely * dely + delz * delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+
+        if (rsq < cut_coulsq[itype][jtype]) {
+          denc = sqrt(lj4[itype][jtype] + rsq);
+          forcecoul = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / (denc*denc*denc);
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r4sig6 = rsq*rsq / lj2[itype][jtype];
+          denlj = lj3[itype][jtype] + rsq*r4sig6;
+          forcelj = lj1[itype][jtype] * epsilon[itype][jtype] *
+            (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj));
+        } else forcelj = 0.0;
+
+        fpair = factor_coul*forcecoul + factor_lj*forcelj;
+
+        f[i][0] += delx * fpair;
+        f[i][1] += dely * fpair;
+        f[i][2] += delz * fpair;
+
+        if (eflag) {
+          if (rsq < cut_coulsq[itype][jtype])
+            ecoul = factor_coul * qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc;
+          else
+            ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] *
+              (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else
+            evdwl = 0.0;
+        }
+
+        if (evflag) ev_tally_full(i, evdwl, ecoul, fpair, delx, dely, delz);
+      }
+    }
+  }
+}
diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h
new file mode 100644
index 0000000000..fcd5439f7a
--- /dev/null
+++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h
@@ -0,0 +1,45 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(lj/cut/coul/cut/soft/gpu,PairLJCutCoulCutSoftGPU);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_SOFT_GPU_H
+#define LMP_PAIR_LJ_CUT_COUL_CUT_SOFT_GPU_H
+
+#include "pair_lj_cut_coul_cut_soft.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulCutSoftGPU : public PairLJCutCoulCutSoft {
+ public:
+  PairLJCutCoulCutSoftGPU(LAMMPS *lmp);
+  ~PairLJCutCoulCutSoftGPU() override;
+  void cpu_compute(int, int, int, int, int *, int *, int **);
+  void compute(int, int) override;
+  void init_style() override;
+  double memory_usage() override;
+
+  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
+
+ private:
+  int gpu_mode;
+  double cpu_time;
+};
+
+}    // namespace LAMMPS_NS
+#endif
+#endif
diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp
new file mode 100644
index 0000000000..45d26f5155
--- /dev/null
+++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp
@@ -0,0 +1,297 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "pair_lj_cut_coul_long_soft_gpu.h"
+
+#include "atom.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "gpu_extra.h"
+#include "kspace.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+#include "suffix.h"
+
+#include <cmath>
+
+#define EWALD_F 1.12837917
+#define EWALD_P 0.3275911
+#define A1 0.254829592
+#define A2 -0.284496736
+#define A3 1.421413741
+#define A4 -1.453152027
+#define A5 1.061405429
+
+using namespace LAMMPS_NS;
+
+// External functions from cuda library for atom decomposition
+
+int ljcls_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2,
+                   double **host_lj3, double **host_lj4, double **offset, double **epsilon,
+                   double *special_lj, const int nlocal, const int nall, const int max_nbors,
+                   const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen,
+                   double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul,
+                   const double qqrd2e, const double g_ewald);
+void ljcls_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2,
+                      double **host_lj3, double **host_lj4, double **offset, double **epsilon,
+                      double **host_lj_cutsq);
+void ljcls_gpu_clear();
+int **ljcls_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x,
+                          int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial,
+                          tagint **special, const bool eflag, const bool vflag, const bool eatom,
+                          const bool vatom, int &host_start, int **ilist, int **jnum,
+                          const double cpu_time, bool &success, double *host_q, double *boxlo,
+                          double *prd);
+void ljcls_gpu_compute(const int ago, const int inum, const int nall, double **host_x,
+                       int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag,
+                       const bool vflag, const bool eatom, const bool vatom, int &host_start,
+                       const double cpu_time, bool &success, double *host_q, const int nlocal,
+                       double *boxlo, double *prd);
+double ljcls_gpu_bytes();
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulLongSoftGPU::PairLJCutCoulLongSoftGPU(LAMMPS *lmp) :
+    PairLJCutCoulLongSoft(lmp), gpu_mode(GPU_FORCE)
+{
+  respa_enable = 0;
+  cpu_time = 0.0;
+  suffix_flag |= Suffix::GPU;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairLJCutCoulLongSoftGPU::~PairLJCutCoulLongSoftGPU()
+{
+  ljcls_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongSoftGPU::compute(int eflag, int vflag)
+{
+  ev_init(eflag, vflag);
+
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+  if (gpu_mode != GPU_FORCE) {
+    double sublo[3], subhi[3];
+    if (domain->triclinic == 0) {
+      sublo[0] = domain->sublo[0];
+      sublo[1] = domain->sublo[1];
+      sublo[2] = domain->sublo[2];
+      subhi[0] = domain->subhi[0];
+      subhi[1] = domain->subhi[1];
+      subhi[2] = domain->subhi[2];
+    } else {
+      domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
+    }
+    inum = atom->nlocal;
+    firstneigh = ljcls_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi,
+                                     atom->tag, atom->nspecial, atom->special, eflag, vflag,
+                                     eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time,
+                                     success, atom->q, domain->boxlo, domain->prd);
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    ljcls_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh,
+                      eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q,
+                      atom->nlocal, domain->boxlo, domain->prd);
+  }
+  if (!success) error->one(FLERR, "Insufficient memory on accelerator");
+
+  if (host_start < inum) {
+    cpu_time = platform::walltime();
+    cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh);
+    cpu_time = platform::walltime() - cpu_time;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulLongSoftGPU::init_style()
+{
+  cut_respa = nullptr;
+
+  if (!atom->q_flag) error->all(FLERR, "Pair style lj/cut/coul/long/soft/gpu requires atom attribute q");
+
+  // Repeat cutsq calculation because done after call to init_style
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cut *= cut;
+        if (cut > maxcut) maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+  double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  cut_coulsq = cut_coul * cut_coul;
+
+  // insure use of KSpace long-range solver, set g_ewald
+
+  if (force->kspace == nullptr) error->all(FLERR, "Pair style requires a KSpace style");
+  g_ewald = force->kspace->g_ewald;
+
+  // setup force tables
+
+  if (ncoultablebits) init_tables(cut_coul, cut_respa);
+
+  int maxspecial = 0;
+  if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial;
+  int mnf = 5e-2 * neighbor->oneatom;
+  int success =
+      ljcls_gpu_init(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, force->special_lj,
+                     atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode,
+                     screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald);
+  GPU_EXTRA::check_flag(success, error, world);
+
+  if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongSoftGPU::reinit()
+{
+  Pair::reinit();
+
+  ljcls_gpu_reinit(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, cut_ljsq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulLongSoftGPU::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  return bytes + ljcls_gpu_bytes();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist,
+                                           int *numneigh, int **firstneigh)
+{
+  int i, j, ii, jj, jnum, itype, jtype, itable;
+  double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair;
+  double r, r2inv, r6inv, forcecoul, forcelj, factor_coul, factor_lj;
+  double denc, denlj, r4sig6;
+  double grij, expm2, prefactor, t, erfc;
+  int *jlist;
+  double rsq;
+
+  evdwl = ecoul = 0.0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *q = atom->q;
+  int *type = atom->type;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  // loop over neighbors of my atoms
+
+  for (ii = start; ii < inum; ii++) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx * delx + dely * dely + delz * delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0 / rsq;
+
+        if (rsq < cut_coulsq) {
+          r = sqrt(rsq);
+          grij = g_ewald * r;
+          expm2 = exp(-grij * grij);
+          t = 1.0 / (1.0 + EWALD_P * grij);
+          erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
+
+          denc = sqrt(lj4[itype][jtype] + rsq);
+          prefactor = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / (denc*denc*denc);
+
+          forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+          if (factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
+        } else
+          forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r4sig6 = rsq*rsq / lj2[itype][jtype];
+          denlj = lj3[itype][jtype] + rsq*r4sig6;
+          forcelj = lj1[itype][jtype] * epsilon[itype][jtype] *
+            (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj));
+        } else
+          forcelj = 0.0;
+
+        fpair = (forcecoul + factor_lj * forcelj) * r2inv;
+
+        f[i][0] += delx * fpair;
+        f[i][1] += dely * fpair;
+        f[i][2] += delz * fpair;
+
+        if (eflag) {
+          if (rsq < cut_coulsq) {
+            prefactor = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc;
+            ecoul = prefactor*erfc;
+          } else
+            ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] *
+              (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else
+            evdwl = 0.0;
+        }
+
+        if (evflag) ev_tally_full(i, evdwl, ecoul, fpair, delx, dely, delz);
+      }
+    }
+  }
+}
diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.h b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h
new file mode 100644
index 0000000000..d4ff0e375f
--- /dev/null
+++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(lj/cut/coul/long/soft/gpu,PairLJCutCoulLongSoftGPU);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_SOFT_GPU_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_SOFT_GPU_H
+
+#include "pair_lj_cut_coul_long_soft.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulLongSoftGPU : public PairLJCutCoulLongSoft {
+ public:
+  PairLJCutCoulLongSoftGPU(LAMMPS *lmp);
+  ~PairLJCutCoulLongSoftGPU() override;
+  void cpu_compute(int, int, int, int, int *, int *, int **);
+  void compute(int, int) override;
+  void init_style() override;
+  void reinit() override;
+  double memory_usage() override;
+
+  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
+
+ private:
+  int gpu_mode;
+  double cpu_time;
+};
+
+}    // namespace LAMMPS_NS
+#endif
+#endif
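For reference, the real-space Ewald term in PairLJCutCoulLongSoftGPU::cpu_compute() above uses the usual polynomial approximation of erfc built from EWALD_P and A1-A5. The self-contained check below compares that approximation against std::erfc; it is an illustrative test, not part of the patch.

// Standalone check of the erfc approximation used in cpu_compute().
#include <cmath>
#include <cstdio>

static double erfc_approx(double grij) {
  const double EWALD_P = 0.3275911;
  const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741;
  const double A4 = -1.453152027, A5 = 1.061405429;
  const double expm2 = std::exp(-grij * grij);
  const double t = 1.0 / (1.0 + EWALD_P * grij);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
}

int main() {
  for (double x = 0.5; x <= 3.0; x += 0.5)
    printf("x=%.1f  approx=%.8f  std::erfc=%.8f\n", x, erfc_approx(x), std::erfc(x));
  return 0;
}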