diff --git a/lib/gpu/lal_lj_smooth.cpp b/lib/gpu/lal_lj_smooth.cpp new file mode 100644 index 0000000000..5c75539660 --- /dev/null +++ b/lib/gpu/lal_lj_smooth.cpp @@ -0,0 +1,184 @@ +/*************************************************************************** + lj_smooth.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/smooth pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_smooth_cl.h" +#elif defined(USE_CUDART) +const char *lj=0; +#else +#include "lj_smooth_cubin.h" +#endif + +#include "lal_lj_smooth.h" +#include +namespace LAMMPS_AL { +#define LJSMOOTHT LJSMOOTH + +extern Device device; + +template +LJSMOOTHT::LJSMOOTH() : BaseAtomic(), _allocated(false) { +} + +template +LJSMOOTHT::~LJSMOOTH() { + clear(); +} + +template +int LJSMOOTHT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJSMOOTHT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_inner_sq) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_smooth,"k_lj_smooth"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, cut_inner_sq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, cut_inner); + + ljsw.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,ljsw,host_write,host_ljsw1,host_ljsw2, + host_ljsw3,host_ljsw4); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+ljsw.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJSMOOTHT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_inner_sq) { + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, cut_inner_sq); + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, cut_inner); + this->atom->type_pack4(ntypes,_lj_types,ljsw,host_write,host_ljsw1,host_ljsw2, + host_ljsw3,host_ljsw4); +} + +template +void LJSMOOTHT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + ljsw.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJSMOOTHT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJSMOOTH); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +void LJSMOOTHT::loop(const bool _eflag, const bool _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; + + if (_vflag) + vflag=1; + else + vflag=0; + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX,BX); + this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); +} + +template class LJSMOOTH; +} diff --git a/lib/gpu/lal_lj_smooth.h b/lib/gpu/lal_lj_smooth.h new file mode 100644 index 0000000000..98dbad87b7 --- /dev/null +++ b/lib/gpu/lal_lj_smooth.h @@ -0,0 +1,92 @@ +/*************************************************************************** + lj_smooth.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the lj/smooth pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#ifndef LAL_LJ_SMOOTH_H +#define LAL_LJ_SMOOTH_H + +#include "lal_base_atomic.h" + +namespace LAMMPS_AL { + +template +class LJSMOOTH : public BaseAtomic { + public: + LJSMOOTH(); + ~LJSMOOTH(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_inner_sq); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_inner_sq); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cut_inner_sq + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = cut_inner + UCL_D_Vec lj3; + /// ljsw.x = ljsw1, ljsw.y = ljsw2, ljsw.z = ljsw3, ljsw.w = ljsw4 + UCL_D_Vec ljsw; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + void loop(const bool _eflag, const bool _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_smooth_ext.cpp b/lib/gpu/lal_lj_smooth_ext.cpp new file mode 100644 index 0000000000..92624bf7fa --- /dev/null +++ b/lib/gpu/lal_lj_smooth_ext.cpp @@ -0,0 +1,146 @@ +/*************************************************************************** + lj_smooth_ext.cpp + ------------------- + W. Michael Brown (ORNL) + + Functions for LAMMPS access to lj/smooth acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : brownw@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_smooth.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJSMOOTH LJSMTMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { + LJSMTMF.clear(); + gpu_mode=LJSMTMF.device->gpu_mode(); + double gpu_split=LJSMTMF.device->particle_split(); + int first_gpu=LJSMTMF.device->first_device(); + int last_gpu=LJSMTMF.device->last_device(); + int world_me=LJSMTMF.device->world_me(); + int gpu_rank=LJSMTMF.device->gpu_rank(); + int procs_per_gpu=LJSMTMF.device->procs_per_gpu(); + + LJSMTMF.device->init_message(screen,"lj/smooth",first_gpu,last_gpu); + + bool message=false; + if (LJSMTMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); + + LJSMTMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJSMTMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { + int world_me=LJSMTMF.device->world_me(); + int gpu_rank=LJSMTMF.device->gpu_rank(); + int procs_per_gpu=LJSMTMF.device->procs_per_gpu(); + + if (world_me==0) + LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); + LJSMTMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + +void ljsmt_gpu_clear() { + LJSMTMF.clear(); +} + +int ** ljsmt_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJSMTMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void ljsmt_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJSMTMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double ljsmt_gpu_bytes() { + return LJSMTMF.host_memory_usage(); +} + + diff --git a/src/GPU/pair_lj_smooth_gpu.cpp b/src/GPU/pair_lj_smooth_gpu.cpp index 4ea5cce92e..882055e84d 100644 --- a/src/GPU/pair_lj_smooth_gpu.cpp +++ b/src/GPU/pair_lj_smooth_gpu.cpp @@ -40,34 +40,40 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, +int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + const double cell_size, int &gpu_mode, FILE *screen, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_innersq); -void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, +void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double **offset); + double **offset, + double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_innersq); -void ljl_gpu_clear(); -int ** ljl_gpu_compute_n(const int ago, const int inum, +void ljsmt_gpu_clear(); +int ** ljsmt_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void ljl_gpu_compute(const int ago, const int inum, const int nall, +void ljsmt_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); -double ljl_gpu_bytes(); +double ljsmt_gpu_bytes(); /* ---------------------------------------------------------------------- */ -PairLJSmoothGPU::PairLJSmoothGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_FORCE) +PairLJSmoothGPU::PairLJSmoothGPU(LAMMPS *lmp) : PairLJSmooth(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; cpu_time = 0.0; @@ -81,7 +87,7 @@ PairLJSmoothGPU::PairLJSmoothGPU(LAMMPS *lmp) : PairLJCut(lmp), gpu_mode(GPU_FOR PairLJSmoothGPU::~PairLJSmoothGPU() { - ljl_gpu_clear(); + ljsmt_gpu_clear(); } /* ---------------------------------------------------------------------- */ @@ -108,7 +114,7 @@ void PairLJSmoothGPU::compute(int eflag, int vflag) domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; - firstneigh = ljl_gpu_compute_n(neighbor->ago, inum, nall, + firstneigh = ljsmt_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, @@ -119,7 +125,7 @@ void PairLJSmoothGPU::compute(int eflag, int vflag) ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; - ljl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ljsmt_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success); } @@ -140,10 +146,10 @@ void PairLJSmoothGPU::compute(int eflag, int vflag) void PairLJSmoothGPU::init_style() { - cut_respa = nullptr; + //cut_respa = nullptr; if (force->newton_pair) - error->all(FLERR,"Cannot use newton pair with lj/cut/gpu pair style"); + error->all(FLERR,"Cannot use newton pair with lj/smooth/gpu pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -165,10 +171,11 @@ void PairLJSmoothGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + int success = ljsmt_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + cell_size, gpu_mode, screen, ljsw1, ljsw2, + ljsw3, ljsw4, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { @@ -184,7 +191,7 @@ void PairLJSmoothGPU::reinit() { Pair::reinit(); - ljl_gpu_reinit(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset); + ljsmt_gpu_reinit(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, ljsw1, ljsw2, ljsw3, ljsw4, cut_inner, cut_inner_sq); } /* ---------------------------------------------------------------------- */ @@ -192,7 +199,7 @@ void PairLJSmoothGPU::reinit() double PairLJSmoothGPU::memory_usage() { double bytes = Pair::memory_usage(); - return bytes + ljl_gpu_bytes(); + return bytes + ljsmt_gpu_bytes(); } /* ---------------------------------------------------------------------- */ @@ -202,6 +209,7 @@ void PairLJSmoothGPU::cpu_compute(int start, int inum, int eflag, int /* vflag * int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; + double r,t,tsq,fskin; int *jlist; double **x = atom->x; @@ -233,8 +241,18 @@ void PairLJSmoothGPU::cpu_compute(int start, int inum, int eflag, int /* vflag * if (rsq < cutsq[itype][jtype]) { r2inv = 1.0/rsq; - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + if (rsq < cut_inner_sq[itype][jtype]) { + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv-lj2[itype][jtype]); + } else { + r = sqrt(rsq); + t = r - cut_inner[itype][jtype]; + tsq = t*t; + fskin = ljsw1[itype][jtype] + ljsw2[itype][jtype]*t + + ljsw3[itype][jtype]*tsq + ljsw4[itype][jtype]*tsq*t; + forcelj = fskin*r; + } + fpair = factor_lj*forcelj*r2inv; f[i][0] += delx*fpair; @@ -242,8 +260,13 @@ void PairLJSmoothGPU::cpu_compute(int start, int inum, int eflag, int /* vflag * f[i][2] += delz*fpair; if (eflag) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - - offset[itype][jtype]; + if (rsq < cut_inner_sq[itype][jtype]) + evdwl = r6inv * (lj3[itype][jtype]*r6inv - + lj4[itype][jtype]) - offset[itype][jtype]; + else + evdwl = ljsw0[itype][jtype] - ljsw1[itype][jtype]*t - + ljsw2[itype][jtype]*tsq/2.0 - ljsw3[itype][jtype]*tsq*t/3.0 - + ljsw4[itype][jtype]*tsq*tsq/4.0 - offset[itype][jtype]; evdwl *= factor_lj; } diff --git a/src/GPU/pair_lj_smooth_gpu.h b/src/GPU/pair_lj_smooth_gpu.h index fc245918d0..414ce4c2d2 100644 --- a/src/GPU/pair_lj_smooth_gpu.h +++ b/src/GPU/pair_lj_smooth_gpu.h @@ -20,11 +20,11 @@ PairStyle(lj/smooth/gpu,PairLJSmoothGPU) #ifndef LMP_PAIR_LJ_SMOOTH_GPU_H #define LMP_PAIR_LJ_SMOOTH_GPU_H -#include "pair_lj_cut.h" +#include "pair_lj_smooth.h" namespace LAMMPS_NS { -class PairLJSmoothGPU : public PairLJCut { +class PairLJSmoothGPU : public PairLJSmooth { public: PairLJSmoothGPU(LAMMPS *lmp); ~PairLJSmoothGPU(); @@ -52,7 +52,7 @@ E: Insufficient memory on accelerator There is insufficient memory on one of the devices specified for the gpu package -E: Cannot use newton pair with lj/cut/gpu pair style +E: Cannot use newton pair with lj/smooth/gpu pair style Self-explanatory.