diff --git a/lib/gpu/lal_base_sphere.cpp b/lib/gpu/lal_base_sphere.cpp new file mode 100644 index 0000000000..a6cd6cdf0c --- /dev/null +++ b/lib/gpu/lal_base_sphere.cpp @@ -0,0 +1,331 @@ +/*************************************************************************** + base_sphere.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Base class for pair styles needing per-particle data for position, + radius, angular velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#include "lal_base_sphere.h" +namespace LAMMPS_AL { +#define BaseSphereT BaseSphere + +extern Device global_device; + +template +BaseSphereT::BaseSphere() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif +} + +template +BaseSphereT::~BaseSphere() { + delete ans; + delete nbor; + k_pair_fast.clear(); + k_pair.clear(); + if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif +} + +template +int BaseSphereT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseSphereT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_atom(); + + bool charge = false; + bool rot = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + compile_kernels(*ucl_device,pair_program,k_name); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + + hd_balancer.init(device,gpu_nbor,gpu_split); + + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return 0; +} + +template +void BaseSphereT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); +} + +template +void BaseSphereT::clear_atomic() { + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); +} + +template +int * BaseSphereT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +template +void BaseSphereT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +template +void BaseSphereT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, tagint *tag, + const int nlocal, + double *boxlo, double *prd) { + acc_timers(); + int eflags = eflag ? 1 : 0; + int vflags = vflag ? 1 : 0; + if (eatom) eflags=2; + if (vatom) vflags=2; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflags) eflags=2; + if (vflags) vflags=2; + #endif + + set_kernel(eflags, vflags); + if (inum_full==0) { + host_start=0; + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + + const int red_blocks=loop(eflags, vflags); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +template +int **BaseSphereT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *boxlo, double *prd) { + acc_timers(); + int eflags = eflag ? 1 : 0; + int vflags = vflag ? 1 : 0; + if (eatom) eflags=2; + if (vatom) vflags=2; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflags) eflags=2; + if (vflags) vflags=2; + #endif + + set_kernel(eflags, vflags); + if (inum_full==0) { + host_start=0; + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return nullptr; + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + const int red_blocks=loop(eflags, vflags); + ans->copy_answers(eflag,vflag,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseSphereT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseSphere); +} + +template +void BaseSphereT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *k) { + if (_compiled) + return; + + std::string s_fast=std::string(k)+"_fast"; + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_fast.set_function(*pair_program,s_fast.c_str()); + k_pair.set_function(*pair_program,k); + pos_tex.get_texture(*pair_program,"pos_tex"); + + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif +} + +template class BaseSphere; +} \ No newline at end of file diff --git a/lib/gpu/lal_base_sphere.h b/lib/gpu/lal_base_sphere.h new file mode 100644 index 0000000000..182546f28b --- /dev/null +++ b/lib/gpu/lal_base_sphere.h @@ -0,0 +1,176 @@ +/*************************************************************************** + base_sphere.h + ------------------- + Trung Dac Nguyen (ORNL) + + Base class for pair styles needing per-particle data for position, + radius, angular velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : nguyentd@ornl.gov + ***************************************************************************/ + +#ifndef LAL_BASE_SPHERE_H +#define LAL_BASE_SPHERE_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseSphere { + public: + BaseSphere(); + virtual ~BaseSphere(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const void *pair_program, const char *k_name); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(const int add_kernels=0); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + const int nlocal, double *boxlo, double *prd); + + /// Pair loop with device neighboring + int **compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *boxlo, double *prd); + + // -------------------------- DEVICE DATA ------------------------- + + Device *device; + UCL_Device *ucl_device; + UCL_Timer time_pair; + Balance hd_balancer; + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + Atom *atom; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; + inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + + protected: + bool _compiled; + int _block_size, _threads_per_atom; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + + virtual int loop(const int eflag, const int vflag) = 0; +}; + +} +#endif \ No newline at end of file diff --git a/lib/gpu/lal_gran_hooke.cpp b/lib/gpu/lal_gran_hooke.cpp new file mode 100644 index 0000000000..e5d56de5d3 --- /dev/null +++ b/lib/gpu/lal_gran_hooke.cpp @@ -0,0 +1,391 @@ +/*************************************************************************** + * gran_hooke.cpp + * ------------------- + * Trung Dac Nguyen (ORNL) + * + * Class for acceleration of the gran/hooke pair style. + * + * __________________________________________________________________________ + * This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + * __________________________________________________________________________ + * + * begin : + * email : nguyentd@ornl.gov + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "gran_hooke_cl.h" +#elif defined(USE_CUDART) +const char *gran_hooke=0; +#else +#include "gran_hooke_cubin.h" +#endif + +#include "lal_gran_hooke.h" +#include +using namespace LAMMPS_AL; +#define GranHookeT GranHooke + +extern Device device; + +template +GranHookeT::GranHooke() : BaseSphere(), + _allocated(false) { +} + +template +GranHookeT::~GranHooke() { + clear(); +} + +template +int GranHookeT::init(const int ntypes, + double host_k_n, double host_k_t, + double host_gamman, double host_gammat, + double host_xmu, double host_dt, const int host_dampflag, + double *host_special_lj, int *host_mask, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen) { + int onetype=0; + int success; + success = this->init_atomic(nlocal, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen, + gran_hooke, "k_gran_hooke"); + if (success != 0) + return success; + + _shared_view = this->ucl_device->shared_memory() && sizeof(numtyp)==sizeof(double); + + + // allocate rad + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_rad_size=static_cast(static_cast(ef_nall)*1.10); + + if (!_shared_view) + { + c_rad.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + c_rmass.alloc(_max_rad_size,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + c_omega.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + c_vel.alloc(_max_rad_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + } + + rad_tex.get_texture(*(this->pair_program),"rad_tex"); + rad_tex.bind_float(c_rad,1); + + rmass_tex.get_texture(*(this->pair_program),"rmass_tex"); + rmass_tex.bind_float(c_rmass,1); + + omega_tex.get_texture(*(this->pair_program),"omega_tex"); + omega_tex.bind_float(c_omega,4); + + vel_tex.get_texture(*(this->pair_program),"vel_tex"); + vel_tex.bind_float(c_vel,4); + + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {//TODO + lj_types=max_shared_types; + shared_types=true; + } + //shared_types=false;//TEST + _k_n=host_k_n; + _k_t=host_k_t; + _gamman=host_gamman; + _gammat=host_gammat; + _xmu=host_xmu; + _dampflag=host_dampflag; + _dt=host_dt; + + UCL_H_Vec h_mask; + _mask.alloc(_max_rad_size,*(this->ucl_device),UCL_READ_ONLY); + h_mask.view(host_mask,_max_rad_size,*(this->ucl_device)); + ucl_copy(_mask,h_mask,false); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=sp_lj.row_bytes() + _mask.row_bytes(); + + return 0; +} + +template +void GranHookeT::clear() { + if (!_allocated) + return; + + _allocated = false; + sp_lj.clear(); + _mask.clear(); + c_rad.clear(); + c_omega.clear(); + c_rmass.clear(); + c_vel.clear(); + this->clear_atomic(); +} + +template +double GranHookeT::host_memory_usage() const { + return this->host_memory_usage_atomic() + + sizeof(GranHooke); +} + +template +int GranHookeT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors) + + sizeof(numtyp4)*2 + sizeof(numtyp2) + sizeof(int) + + sizeof(numtyp)*3 + sizeof(numtyp)*3*max_nbors; +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int GranHookeT::loop(const int _eflag, const int _vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX = this->block_size(); + //int GX = static_cast(ceil(static_cast(this->ans->inum()) / BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_fast.set_size(GX, BX); + this->k_pair_fast.run(&this->atom->x, &c_rad, &c_vel, + &c_omega, &sp_lj, &c_rmass, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_mask, &this->ans->force, + &this->ans->engv, &_eflag, &_vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &_k_n, &_k_t, &_gamman, &_gammat, &_xmu, + &_dampflag, &freeze_group_bit, + &_limit_damping, &_dt); + + } else { + this->k_pair.set_size(GX, BX); + this->k_pair.run(&this->atom->x, &c_rad, &c_vel, + &c_omega, &sp_lj, &c_rmass, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_mask, &this->ans->force, + &this->ans->engv, &_eflag, &_vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &_k_n, &_k_t, &_gamman, &_gammat, &_xmu, + &_dampflag, &freeze_group_bit, + &_limit_damping, &_dt); + } + this->time_pair.stop(); + return GX; +} + +template +void GranHookeT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, tagint *tag, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + const int nlocal, double *boxlo, double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + _limit_damping = limit_damping; + // ------------------- Resize data array -------------------------- + if (nall>_max_rad_size) { + _max_rad_size=static_cast(static_cast(nall)*1.10); + if (!_shared_view) { + c_rad.resize(_max_rad_size); + rad_tex.bind_float(c_rad,1); + + c_omega.resize(_max_rad_size); + omega_tex.bind_float(c_omega,4); + + c_vel.resize(_max_rad_size); + vel_tex.bind_float(c_vel,4); + + c_rmass.resize(_max_rad_size); + rmass_tex.bind_float(c_rmass,1); + } + } + + // ---------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return; + } + + // load balance, returning the atom count on the device (inum) + int ago=this->hd_balancer.ago_first(f_ago); + //this->hd_balancer.balance(cpu_time); + //int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); + this->ans->inum(inum); + host_start=inum; + + + // ----------------------------------------------------------------- + + if (ago==0) { + this->reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + this->atom->cast_x_data(host_x,host_type); + this->cast_vel_data(host_v); + this->cast_rad_data(host_rad); + this->cast_omega_data(host_omega); + this->cast_rmass_data(host_rmass); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + this->add_vel_data(); + this->add_rad_data(); + this->add_rmass_data(); + this->add_omega_data(); + + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); + this->device->add_ans_object(this->ans); + this->hd_balancer.stop_timer(); + +} + +template +int** GranHookeT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + double *boxlo, double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + _limit_damping= limit_damping; + + // ------------------- Resize data array -------------------------- + if (nall>_max_rad_size) { + _max_rad_size=static_cast(static_cast(nall)*1.10); + if (!_shared_view) { + c_rad.resize(_max_rad_size); + rad_tex.bind_float(c_rad,1); + + c_omega.resize(_max_rad_size); + omega_tex.bind_float(c_omega,4); + + c_vel.resize(_max_rad_size); + vel_tex.bind_float(c_vel,4); + + c_rmass.resize(_max_rad_size); + rmass_tex.bind_float(c_rmass,1); + } + } + + // ----------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return nullptr; + } + + // ---------------------------------------------------------------- + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return nullptr; + } + + // load balance, returning the atom count on the device (inum) + this->hd_balancer.balance(cpu_time); + int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + this->ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return nullptr; + this->cast_vel_data(host_v); + this->cast_rad_data(host_rad); + this->cast_omega_data(host_omega); + this->cast_rmass_data(host_rmass); + this->hd_balancer.start_timer(); + } else { + this->atom->cast_x_data(host_x,host_type); + this->cast_vel_data(host_v); + this->cast_rad_data(host_rad); + this->cast_omega_data(host_omega); + this->cast_rmass_data(host_rmass); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + } + this->add_vel_data(); + this->add_rad_data(); + this->add_rmass_data(); + this->add_omega_data(); + *ilist=this->nbor->host_ilist.begin(); + *jnum=this->nbor->host_acc.begin(); + + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); + this->hd_balancer.stop_timer(); + + return this->nbor->host_jlist.begin()-host_start; +} +// Instantiate the class +template class GranHooke; diff --git a/lib/gpu/lal_gran_hooke.cu b/lib/gpu/lal_gran_hooke.cu new file mode 100644 index 0000000000..82286ad2fc --- /dev/null +++ b/lib/gpu/lal_gran_hooke.cu @@ -0,0 +1,491 @@ +// ************************************************************************** +// gran_hooke.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the gran/hooke pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : nguyentd@ornl.gov +// *************************************************************************** + + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +// 确保类型定义可见 +#ifndef numtyp +#define numtyp float +#endif +#ifndef numtyp4 +#define numtyp4 float4 +#endif +#ifndef acctyp +#define acctyp float +#endif +#ifndef acctyp3 +#define acctyp3 float3 +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( rad_tex,float); +_texture( vel_tex,float4); +_texture( omega_tex,float4); +_texture( ramss_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( rad_tex,int2); +_texture_2d( vel_tex,int4); +_texture_2d( omega_tex,int4); +_texture( ramss_tex,int2); +#endif + +#if (__CUDACC_VER_MAJOR__ >= 11) +#define rad_tex rad_ +#define vel_tex v_ +#define omega_tex omega_ +#endif + +#else +#define pos_tex x_ +#define rad_tex rad_ +#define vel_tex v_ +#define omega_tex omega_ +#define ramss_tex mass_rigid_ +#define vel_rad_tex v_rad_ +#define omega_ramss_tex omega_ramss_ +#endif + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) + +#define store_answers_t(f, tor, energy, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ + } \ + if (EVFLAG && vflag) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ + } \ + } \ + } \ + if (offset==0 && ii +class GranHooke : public BaseSphere { + public: + GranHooke(); + ~GranHooke(); + + typedef struct { double x,y,z; } vec3d; + typedef struct { numtyp x,y,z,w; } vec4d_t; + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, + double host_k_n, double host_k_t, + double host_gamman, double host_gammat, + double host_xmu, double host_dt, const int host_dampflag, + double *host_special_lj, int *host_mask, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *screen); + + inline void cast_rad_data(double* rad) { + int nall = this->atom->nall(); + if (_shared_view) { + c_rad.host.view((numtyp*)rad,nall,*(this->ucl_device)); + c_rad.device.view(c_rad.host); + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(c_rad.host.begin(),rad,nall*sizeof(numtyp)); + else + for (int i=0; iatom->nall(),true); + } + + inline void cast_rmass_data(double* rmass) { + int nall = this->atom->nall(); + if (_shared_view) { + c_rmass.host.view((numtyp*)rmass,nall,*(this->ucl_device)); + c_rmass.device.view(c_rmass.host); + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(c_rmass.host.begin(),rmass,nall*sizeof(numtyp)); + else + for (int i=0; iatom->nall(),true); + } + + /*inline void cast_omega_data(double** omega) { + int nall = this->atom->nall(); + if (_shared_view) { + c_omega.host.view((numtyp*)omega,nall,*(this->ucl_device)); + c_omega.device.view(c_omega.host); + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(c_omega.host.begin(),omega,nall*sizeof(numtyp)); + else + for (int i=0; iatom->nall(),true); + }*/ + + inline void cast_omega_data(double** omega) { + int nall = this->atom->nall(); + #ifdef GPU_CAST + memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int)); + #else + vec3d *host_p=reinterpret_cast(&(omega[0][0])); + vec4d_t *vp=reinterpret_cast(&(c_omega[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif + for (int i=0; iatom->nall()*4,true); + } + + inline void cast_vel_data(double** vel) { + int nall = this->atom->nall(); + #ifdef GPU_CAST + memcpy(x_cast.host.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(type_cast.host.begin(),host_type,_nall*sizeof(int)); + #else + vec3d *host_p=reinterpret_cast(&(vel[0][0])); + vec4d_t *vp=reinterpret_cast(&(c_vel[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif + for (int i=0; iatom->nall()*4,true); + } + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, tagint *tag, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + const int nlocal, double *boxlo, double *prd); + + int **compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + double *boxlo, double *prd); + + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = k_n, coeff.y = k_t, coeff.z = cutsq, coeff.w = gamman + UCL_D_Vec coeff; + + numtyp _k_n; + numtyp _k_t; + numtyp _gamman; + numtyp _gammat; + numtyp _xmu; + numtyp _dt; + numtyp _dampflag; + int _limit_damping; + /// coeff2.x = gammat, coeff2.y = xmu + UCL_D_Vec coeff2; + /// Special LJ values + UCL_D_Vec sp_lj; + + UCL_D_Vec _mask; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector c_rad; + + UCL_Vector c_omega; + + UCL_Vector c_vel; + + UCL_Vector c_rmass; + +// --------------------------- TEXTURES ----------------------------- + UCL_Texture rad_tex; + + UCL_Texture omega_tex; + + UCL_Texture vel_tex; + + UCL_Texture rmass_tex; + + numtyp freeze_group_bit; + + int _max_rad_size; + + private: + bool _shared_view; + bool _allocated; + int loop(const int _eflag, const int _vflag); +}; + +} + +#endif \ No newline at end of file diff --git a/lib/gpu/lal_gran_hooke_ext.cpp b/lib/gpu/lal_gran_hooke_ext.cpp new file mode 100644 index 0000000000..641916cda4 --- /dev/null +++ b/lib/gpu/lal_gran_hooke_ext.cpp @@ -0,0 +1,134 @@ +/*************************************************************************** + * gran_hooke_ext.cpp + * ------------------- + * Trung Dac Nguyen (ORNL) + * + * Functions for LAMMPS access to gran/hooke acceleration routines. + * + * __________________________________________________________________________ + * This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + * __________________________________________________________________________ + * + * begin : + * email : nguyentd@ornl.gov + ***************************************************************************/ + +#include +#include +#include + +#include "lal_gran_hooke.h" + +using namespace std; +using namespace LAMMPS_AL; + +static GranHooke GHM; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int gran_hooke_gpu_init(const int ntypes, double **cutsq, + const double host_k_n, const double host_k_t, + const double host_gamman, const double host_gammat, + const double host_xmu, const double host_dt, const int host_dampflag, + double *special_lj, int *host_mask, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { + GHM.clear(); + gpu_mode=GHM.device->gpu_mode(); + double gpu_split=GHM.device->particle_split(); + int first_gpu=GHM.device->first_device(); + int last_gpu=GHM.device->last_device(); + int world_me=GHM.device->world_me(); + int gpu_rank=GHM.device->gpu_rank(); + int procs_per_gpu=GHM.device->procs_per_gpu(); + + GHM.device->init_message(screen,"gran/hooke",first_gpu,last_gpu); + + bool message=false; + if (GHM.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=GHM.init(ntypes, host_k_n, host_k_t, host_gamman, + host_gammat, host_xmu, host_dt, host_dampflag, special_lj, + host_mask, inum, nall, max_nbors, maxspecial, cell_size, + gpu_split, screen); + + GHM.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + GHM.estimate_gpu_overhead(); + return init_ok; +} + +void gran_hooke_gpu_clear() { + GHM.clear(); +} + +int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + double *boxlo, double *prd) { + return GHM.compute(ago, inum_full, nall, host_x, host_type, sublo, subhi, + tag, nspecial, special, eflag, vflag, eatom, vatom, + host_start, ilist, jnum, cpu_time, success, host_v, host_rad, + host_omega, host_rmass, limit_damping, boxlo, prd); +} + +void gran_hooke_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, tagint *tag, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + const int nlocal, double *boxlo, double *prd) { + GHM.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, + success, tag, host_v, host_rad, host_omega, host_rmass, limit_damping, nlocal, boxlo, prd); +} + +double gran_hooke_gpu_bytes() { + return GHM.host_memory_usage(); +} \ No newline at end of file diff --git a/src/GPU/pair_gran_hooke_gpu.cpp b/src/GPU/pair_gran_hooke_gpu.cpp new file mode 100644 index 0000000000..f8b36351c4 --- /dev/null +++ b/src/GPU/pair_gran_hooke_gpu.cpp @@ -0,0 +1,477 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (ORNL) +------------------------------------------------------------------------- */ + +#include "pair_gran_hooke_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" +#include "memory.h" +#include "comm.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int gran_hooke_gpu_init(const int ntypes, double **cutsq, + const double host_k_n, const double host_k_t, + const double host_gamman, const double host_gammat, + const double host_xmu, const double host_dt, const int host_dampflag, + double *special_lj, int *host_mask, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); +void gran_hooke_gpu_clear(); +int ** gran_hooke_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + double *boxlo, double *prd); +void gran_hooke_gpu_compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, tagint *tag, + double **host_v, double *host_rad, + double **host_omega, double *host_rmass, int limit_damping, + const int nlocal, double *boxlo, double *prd); +double gran_hooke_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairGranHookeGPU::PairGranHookeGPU(LAMMPS *lmp) : + PairGranHooke(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairGranHookeGPU::~PairGranHookeGPU() +{ + gran_hooke_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairGranHookeGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + int shearupdate = 1; + if (update->setupflag) shearupdate = 0; + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + + + inum = atom->nlocal; + firstneigh = gran_hooke_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, + sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v,atom->radius, atom->omega, atom->rmass, + limit_damping, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + gran_hooke_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, + firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, + cpu_time, success, atom->tag, atom->v, atom->radius, atom->omega, atom->rmass, + limit_damping, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairGranHookeGPU::init_style() +{ + if (!atom->radius_flag || !atom->rmass_flag || !atom->omega_flag) + error->all(FLERR, "Pair style gran/hooke/history/gpu requires atom attributes radius, rmass, omega"); + if (comm->ghost_velocity == 0) + error->all(FLERR, "Pair gran/h* requires ghost atoms store velocity"); + + if (gpu_mode == GPU_FORCE) + { + if (history) + neighbor->add_request(this, NeighConst::REQ_SIZE | NeighConst::REQ_HISTORY); + else + neighbor->add_request(this, NeighConst::REQ_SIZE); + } + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cut *= cut; + if (cut > maxcut) maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + //dt = update->dt; + + if (history && (fix_history == nullptr)) { + auto cmd = fmt::format("NEIGH_HISTORY_HH{} all NEIGH_HISTORY {}", instance_me, size_history); + fix_history = dynamic_cast( + modify->replace_fix("NEIGH_HISTORY_HH_DUMMY" + std::to_string(instance_me), cmd, 1)); + fix_history->pair = this; + } + + // check for FixFreeze and set freeze_group_bit + + auto fixlist = modify->get_fix_by_style("^freeze"); + if (fixlist.size() == 0) + freeze_group_bit = 0; + else if (fixlist.size() > 1) + error->all(FLERR, "Only one fix freeze command at a time allowed"); + else + freeze_group_bit = fixlist.front()->groupbit; + + // check for FixRigid so can extract rigid body masses + + fix_rigid = nullptr; + for (const auto &ifix : modify->get_fix_list()) { + if (ifix->rigid_flag) { + if (fix_rigid) + error->all(FLERR, "Only one fix rigid command at a time allowed"); + else + fix_rigid = ifix; + } + } + + // check for FixPour and FixDeposit so can extract particle radii + + auto pours = modify->get_fix_by_style("^pour"); + auto deps = modify->get_fix_by_style("^deposit"); + + // set maxrad_dynamic and maxrad_frozen for each type + // include future FixPour and FixDeposit particles as dynamic + + int itype; + for (int i = 1; i <= atom->ntypes; i++) { + onerad_dynamic[i] = onerad_frozen[i] = 0.0; + for (auto &ipour : pours) { + itype = i; + double maxrad = *((double *) ipour->extract("radius", itype)); + if (maxrad > 0.0) onerad_dynamic[i] = maxrad; + } + for (auto &idep : deps) { + itype = i; + double maxrad = *((double *) idep->extract("radius", itype)); + if (maxrad > 0.0) onerad_dynamic[i] = maxrad; + } + } + + double *radius = atom->radius; + int *mask = atom->mask; + int *type = atom->type; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & freeze_group_bit) + onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]], radius[i]); + else + onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]], radius[i]); + } + + MPI_Allreduce(&onerad_dynamic[1], &maxrad_dynamic[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world); + MPI_Allreduce(&onerad_frozen[1], &maxrad_frozen[1], atom->ntypes, MPI_DOUBLE, MPI_MAX, world); + + // set fix which stores history info + + if (history) { + fix_history = dynamic_cast( + modify->get_fix_by_id("NEIGH_HISTORY_HH" + std::to_string(instance_me))); + if (!fix_history) error->all(FLERR, "Could not find pair fix neigh history ID"); + } + + int success = gran_hooke_gpu_init(atom->ntypes + 1, cutsq, kn, kt, gamman, gammat, xmu, update->dt, + dampflag, force->special_lj, atom->mask, + atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + +} + +void PairGranHookeGPU::cpu_compute(int start, int inum, int eflag, int vflag, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum; + double xtmp, ytmp, ztmp, delx, dely, delz, fx, fy, fz; + double radi, radj, radsum, rsq, r, rinv, rsqinv, factor_lj; + double vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3; + double wr1, wr2, wr3; + double vtr1, vtr2, vtr3, vrel; + double mi, mj, meff, damp, ccel, tor1, tor2, tor3; + double fn,fs,ft,fs1,fs2,fs3; + double shrmag, rsht; + int *jlist; + int *touch, **firsttouch; + double *shear, *allshear, **firstshear; + + ev_init(eflag, vflag); + + int shearupdate = 1; + if (update->setupflag) shearupdate = 0; + + // update rigid body info for owned & ghost atoms if using FixRigid masses + // body[i] = which body atom I is in, -1 if none + // mass_body = mass of each rigid body + + if (fix_rigid && neighbor->ago == 0) { + int tmp; + int *body = (int *) fix_rigid->extract("body", tmp); + auto mass_body = (double *) fix_rigid->extract("masstotal", tmp); + if (atom->nmax > nmax) { + memory->destroy(mass_rigid); + nmax = atom->nmax; + memory->create(mass_rigid, nmax, "pair:mass_rigid"); + } + int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) + if (body[i] >= 0) + mass_rigid[i] = mass_body[body[i]]; + else + mass_rigid[i] = 0.0; + comm->forward_comm(this); + } + + double **x = atom->x; + double **v = atom->v; + double **f = atom->f; + double **omega = atom->omega; + double **torque = atom->torque; + double *radius = atom->radius; + double *rmass = atom->rmass; + int *mask = atom->mask; + int nlocal = atom->nlocal; + int newton_pair = force->newton_pair; + double *special_lj = force->special_lj; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + firsttouch = fix_history->firstflag; + firstshear = fix_history->firstvalue; + + // loop over neighbors of my atoms + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + radi = radius[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + if (factor_lj == 0) continue; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + radj = radius[j]; + radsum = radi + radj; + + if (rsq < radsum*radsum) { + r = sqrt(rsq); + rinv = 1.0/r; + rsqinv = 1.0/rsq; + + // relative translational velocity + + vr1 = v[i][0] - v[j][0]; + vr2 = v[i][1] - v[j][1]; + vr3 = v[i][2] - v[j][2]; + + // normal component + + vnnr = vr1*delx + vr2*dely + vr3*delz; + vn1 = delx*vnnr * rsqinv; + vn2 = dely*vnnr * rsqinv; + vn3 = delz*vnnr * rsqinv; + + // tangential component + + vt1 = vr1 - vn1; + vt2 = vr2 - vn2; + vt3 = vr3 - vn3; + + // relative rotational velocity + + wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv; + wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv; + wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv; + + // meff = effective mass of pair of particles + // if I or J part of rigid body, use body mass + // if I or J is frozen, meff is other particle + + mi = rmass[i]; + mj = rmass[j]; + if (fix_rigid) { + if (mass_rigid[i] > 0.0) mi = mass_rigid[i]; + if (mass_rigid[j] > 0.0) mj = mass_rigid[j]; + } + + meff = mi*mj / (mi+mj); + if (mask[i] & freeze_group_bit) meff = mj; + if (mask[j] & freeze_group_bit) meff = mi; + + // normal forces = Hookian contact + normal velocity damping + + damp = meff*gamman*vnnr*rsqinv; + ccel = kn*(radsum-r)*rinv - damp; + if (limit_damping && (ccel < 0.0)) ccel = 0.0; + + // relative velocities + + vtr1 = vt1 - (delz*wr2-dely*wr3); + vtr2 = vt2 - (delx*wr3-delz*wr1); + vtr3 = vt3 - (dely*wr1-delx*wr2); + vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3; + vrel = sqrt(vrel); + + // force normalization + + fn = xmu * fabs(ccel*r); + fs = meff*gammat*vrel; + if (vrel != 0.0) ft = MIN(fn,fs) / vrel; + else ft = 0.0; + + // tangential force due to tangential velocity damping + + fs1 = -ft*vtr1; + fs2 = -ft*vtr2; + fs3 = -ft*vtr3; + + // forces & torques + + fx = delx*ccel + fs1; + fy = dely*ccel + fs2; + fz = delz*ccel + fs3; + fx *= factor_lj; + fy *= factor_lj; + fz *= factor_lj; + f[i][0] += fx; + f[i][1] += fy; + f[i][2] += fz; + + tor1 = rinv * (dely*fs3 - delz*fs2); + tor2 = rinv * (delz*fs1 - delx*fs3); + tor3 = rinv * (delx*fs2 - dely*fs1); + tor1 *= factor_lj; + tor2 *= factor_lj; + tor3 *= factor_lj; + torque[i][0] -= radi*tor1; + torque[i][1] -= radi*tor2; + torque[i][2] -= radi*tor3; + + if (newton_pair || j < nlocal) { + f[j][0] -= fx; + f[j][1] -= fy; + f[j][2] -= fz; + torque[j][0] -= radj*tor1; + torque[j][1] -= radj*tor2; + torque[j][2] -= radj*tor3; + } + + if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair, + 0.0,0.0,fx,fy,fz,delx,dely,delz); + } + } + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + +/* ---------------------------------------------------------------------- */ + +double PairGranHookeGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + gran_hooke_gpu_bytes(); +} \ No newline at end of file diff --git a/src/GPU/pair_gran_hooke_gpu.h b/src/GPU/pair_gran_hooke_gpu.h new file mode 100644 index 0000000000..1bf5b33000 --- /dev/null +++ b/src/GPU/pair_gran_hooke_gpu.h @@ -0,0 +1,46 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(gran/hooke/gpu,PairGranHookeGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_GRAN_HOOKE_GPU_H +#define LMP_PAIR_GRAN_HOOKE_GPU_H + +#include "pair_gran_hooke.h" +#include "fix_neigh_history.h" + +namespace LAMMPS_NS { + +class PairGranHookeGPU : public PairGranHooke { + public: + PairGranHookeGPU(LAMMPS *lmp); + ~PairGranHookeGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif \ No newline at end of file