diff --git a/lib/gpu/lal_base_sph.cpp b/lib/gpu/lal_base_sph.cpp new file mode 100644 index 0000000000..c6876a7dcd --- /dev/null +++ b/lib/gpu/lal_base_sph.cpp @@ -0,0 +1,362 @@ +/*************************************************************************** + base_sph.cpp + ------------------- + Trung Dac Nguyen (ORNL) + + Base class for SPH pair styles needing per-particle data for position, + velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include "lal_base_sph.h" +namespace LAMMPS_AL { +#define BaseSPHT BaseSPH + +extern Device global_device; + +template +BaseSPHT::BaseSPH() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif +} + +template +BaseSPHT::~BaseSPH() { + delete ans; + delete nbor; + k_pair_fast.clear(); + k_pair.clear(); + if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif +} + +template +int BaseSPHT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseSPHT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name, const int onetype, + const int extra_fields) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_atom(); + + bool charge = false; + bool rot = false; + bool vel = true; + _extra_fields = extra_fields; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + vel_tex.bind_float(atom->v,4); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return success; +} + +template +void BaseSPHT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +} + +template +void BaseSPHT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseSPHT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void BaseSPHT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void BaseSPHT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const int nlocal) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_v_data(host_v,tag); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int** BaseSPHT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return nullptr; + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_v_data(host_v,tag); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseSPHT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseSPH); +} + +template +void BaseSPHT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) + return; + + _onetype=onetype; + + std::string s_fast=std::string(kname)+"_fast"; + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_fast.set_function(*pair_program,s_fast.c_str()); + k_pair.set_function(*pair_program,kname); + pos_tex.get_texture(*pair_program,"pos_tex"); + vel_tex.get_texture(*pair_program,"vel_tex"); + + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +template class BaseSPH; +} diff --git a/lib/gpu/lal_base_sph.h b/lib/gpu/lal_base_sph.h new file mode 100644 index 0000000000..950462e998 --- /dev/null +++ b/lib/gpu/lal_base_sph.h @@ -0,0 +1,209 @@ +/*************************************************************************** + base_sph.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Base class for SPH pair styles needing per-particle data for position, + velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_BASE_SPH_H +#define LAL_BASE_DPD_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseSPH { + public: + BaseSPH(); + virtual ~BaseSPH(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const void *pair_program, const char *k_name, + const int onetype=0, const int extra_fields=0); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + vel_tex.bind_float(atom->v,4); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **v, const int nlocal); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **v); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; + inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture vel_tex; + + // ------------------------- COMMON VARS ---------------------------- + + protected: + bool _compiled; + int _block_size, _threads_per_atom, _onetype, _extra_fields; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int onetype); + virtual int loop(const int eflag, const int vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp index 85c144e240..dbe44bf30a 100644 --- a/lib/gpu/lal_sph_lj.cpp +++ b/lib/gpu/lal_sph_lj.cpp @@ -29,7 +29,7 @@ namespace LAMMPS_AL { extern Device device; template -SPHLJT::SPHLJ() : BaseDPD(), _allocated(false) { +SPHLJT::SPHLJ() : BaseSPH(), _allocated(false) { _max_drhoE_size = 0; } @@ -46,8 +46,8 @@ int SPHLJT::bytes_per_atom(const int max_nbors) const { template int SPHLJT::init(const int ntypes, double **host_cutsq, double **host_cut, - double **host_viscosity, const int dimension, - double *host_special_lj, + double **host_viscosity, double* host_mass, + const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -70,7 +70,7 @@ int SPHLJT::init(const int ntypes, int success; int extra_fields = 4; // round up to accomodate quadruples of numtyp values - // rho, cv, mass + // rho, cv success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,sph_lj,"k_sph_lj",onetype,extra_fields); if (success!=0) @@ -96,6 +96,12 @@ int SPHLJT::init(const int ntypes, this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, host_cut, host_cutsq); + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + UCL_H_Vec dview; sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); dview.view(host_special_lj,4,*(this->ucl_device)); @@ -124,6 +130,7 @@ void SPHLJT::clear() { _allocated=false; coeff.clear(); + mass.clear(); drhoE.clear(); sp_lj.clear(); this->clear_atomic(); @@ -168,7 +175,7 @@ int SPHLJT::loop(const int eflag, const int vflag) { v.x = rho[i]; v.y = esph[i]; v.z = cv[i]; - v.w = mass[i]; + v.w = 0; pextra[idx] = v; } this->atom->add_extra_data(); @@ -184,13 +191,13 @@ int SPHLJT::loop(const int eflag, const int vflag) { this->time_pair.start(); if (shared_types) { this->k_pair_sel->set_size(GX,BX); - this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &sp_lj, + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &mass, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &mass, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); @@ -205,12 +212,10 @@ int SPHLJT::loop(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template -void SPHLJT::get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double* host_mass) { +void SPHLJT::get_extra_data(double *host_rho, double *host_esph, double *host_cv) { rho = host_rho; esph = host_esph; cv = host_cv; - mass = host_mass; } template class SPHLJ; diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu index ab41477d2f..b105632330 100644 --- a/lib/gpu/lal_sph_lj.cu +++ b/lib/gpu/lal_sph_lj.cu @@ -47,7 +47,7 @@ _texture_2d( vel_tex,int4); } \ } \ if (offset==0 && ii -class SPHLJ : public BaseDPD { +class SPHLJ : public BaseSPH { public: SPHLJ(); ~SPHLJ(); @@ -38,7 +38,8 @@ class SPHLJ : public BaseDPD { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double** host_cut, double **host_viscosity, const int dimension, + double** host_cut, double **host_viscosity, double *host_mass, + const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -54,7 +55,7 @@ class SPHLJ : public BaseDPD { double host_memory_usage() const; void get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double* host_mass); + double *host_cv); /// copy drho and desph from device to host void update_drhoE(void **drhoE_ptr); @@ -62,7 +63,10 @@ class SPHLJ : public BaseDPD { // --------------------------- TYPE DATA -------------------------- /// coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq - UCL_D_Vec coeff; + UCL_D_Vec coeff; + + /// per-type coeffs + UCL_D_Vec mass; /// Special LJ values UCL_D_Vec sp_lj; @@ -80,7 +84,7 @@ class SPHLJ : public BaseDPD { int _dimension; /// pointer to host data - double *rho, *esph, *cv, *mass; + double *rho, *esph, *cv; private: bool _allocated; diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp index 0b8ea0e263..488f7b1200 100644 --- a/lib/gpu/lal_sph_lj_ext.cpp +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -28,7 +28,7 @@ static SPHLJ SPHLJMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, const int dimension, + double **host_viscosity, double* host_mass, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen) { @@ -54,8 +54,8 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, int init_ok=0; if (world_me==0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, dimension, - special_lj, inum, nall, max_nbors, maxspecial, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SPHLJMF.device->world_barrier(); @@ -72,8 +72,8 @@ int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, dimension, - special_lj, inum, nall, max_nbors, maxspecial, + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SPHLJMF.device->serialize_init(); @@ -93,38 +93,31 @@ void sph_lj_gpu_clear() { } int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { return SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_v, dtinvsqrt, seed, timestep, boxlo, prd); + subhi, host_tag, nspecial, special, eflag, vflag, + eatom, vatom, host_start, ilist, jnum, cpu_time, success, + host_v); } void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, - double **host_v, const int nlocal, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, - tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); + host_tag, host_v, nlocal); } -void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double *host_mass) { - SPHLJMF.get_extra_data(host_rho, host_esph, host_cv, host_mass); +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, double *host_cv) { + SPHLJMF.get_extra_data(host_rho, host_esph, host_cv); } void sph_lj_gpu_update_drhoE(void **drhoE_ptr) { diff --git a/lib/gpu/lal_sph_taitwater.cpp b/lib/gpu/lal_sph_taitwater.cpp index 84a25aab5c..7a584d435e 100644 --- a/lib/gpu/lal_sph_taitwater.cpp +++ b/lib/gpu/lal_sph_taitwater.cpp @@ -3,7 +3,7 @@ ------------------- Trung Dac Nguyen (U Chicago) - Class for acceleration of the sph_taitwater pair style. + Class for acceleration of the sph/taitwater pair style. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -29,7 +29,7 @@ namespace LAMMPS_AL { extern Device device; template -SPHTaitwaterT::SPHTaitwater() : BaseDPD(), _allocated(false) { +SPHTaitwaterT::SPHTaitwater() : BaseSPH(), _allocated(false) { _max_drhoE_size = 0; } @@ -46,8 +46,8 @@ int SPHTaitwaterT::bytes_per_atom(const int max_nbors) const { template int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, double **host_cut, double **host_viscosity, - double* host_rho0, double* host_soundspeed, - double* host_B, const int dimension, + double* host_mass, double* host_rho0, + double* host_soundspeed, double* host_B, const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, @@ -70,7 +70,7 @@ int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, int success; int extra_fields = 4; // round up to accomodate quadruples of numtyp values - // rho, mass + // rho success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, gpu_split,_screen,sph_taitwater,"k_sph_taitwater", onetype,extra_fields); @@ -99,10 +99,10 @@ int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, UCL_H_Vec dview_coeff2(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) { - dview_coeff2[i].x = host_rho0[i]; - dview_coeff2[i].y = host_soundspeed[i]; - dview_coeff2[i].z = host_B[i]; - dview_coeff2[i].w = 0; + dview_coeff2[i].x = host_mass[i]; + dview_coeff2[i].y = host_rho0[i]; + dview_coeff2[i].z = host_soundspeed[i]; + dview_coeff2[i].w = host_B[i]; } coeff2.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(coeff2,dview_coeff2,false); @@ -122,7 +122,7 @@ int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _dimension = dimension; - + _allocated=true; this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); return 0; @@ -178,7 +178,7 @@ int SPHTaitwaterT::loop(const int eflag, const int vflag) { int idx = n+i*nstride; numtyp4 v; v.x = rho[i]; - v.y = mass[i]; + v.y = 0; v.z = 0; v.w = 0; pextra[idx] = v; @@ -217,9 +217,8 @@ int SPHTaitwaterT::loop(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template -void SPHTaitwaterT::get_extra_data(double *host_rho, double* host_mass) { +void SPHTaitwaterT::get_extra_data(double *host_rho) { rho = host_rho; - mass = host_mass; } template class SPHTaitwater; diff --git a/lib/gpu/lal_sph_taitwater.cu b/lib/gpu/lal_sph_taitwater.cu index d07defc42b..cc32e4a3c9 100644 --- a/lib/gpu/lal_sph_taitwater.cu +++ b/lib/gpu/lal_sph_taitwater.cu @@ -47,7 +47,7 @@ _texture_2d( vel_tex,int4); } \ } \ if (offset==0 && ii -class SPHTaitwater : public BaseDPD { +class SPHTaitwater : public BaseSPH { public: SPHTaitwater(); ~SPHTaitwater(); @@ -38,9 +38,9 @@ class SPHTaitwater : public BaseDPD { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, double **host_cutsq, - double** host_cut, double **host_viscosity, - double* host_rho0, double* host_soundspeed, - double* host_B, const int dimension, double *host_special_lj, + double** host_cut, double **host_viscosity, double *host_mass, + double* host_rho0, double* host_soundspeed, double* host_B, + const int dimension, double *host_special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen); @@ -55,7 +55,7 @@ class SPHTaitwater : public BaseDPD { /// Total host memory used by library for pair style double host_memory_usage() const; - void get_extra_data(double *host_rho, double* host_mass); + void get_extra_data(double *host_rho); /// copy drho and desph from device to host void update_drhoE(void **drhoE_ptr); @@ -84,7 +84,7 @@ class SPHTaitwater : public BaseDPD { int _dimension; /// pointer to host data - double *rho, *mass; + double *rho; private: bool _allocated; diff --git a/lib/gpu/lal_sph_taitwater_ext.cpp b/lib/gpu/lal_sph_taitwater_ext.cpp index e5b1d439b8..9d125a6395 100644 --- a/lib/gpu/lal_sph_taitwater_ext.cpp +++ b/lib/gpu/lal_sph_taitwater_ext.cpp @@ -3,7 +3,7 @@ ------------------- Trung Dac Nguyen (U Chicago) - Functions for LAMMPS access to sph lj acceleration routines. + Functions for LAMMPS access to sph taitwater acceleration routines. __________________________________________________________________________ This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -28,8 +28,8 @@ static SPHTaitwater SPHTaitwaterMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, double* host_rho0, - double* host_soundspeed, double* host_B, + double **host_viscosity, double* host_mass, + double* host_rho0, double* host_soundspeed, double* host_B, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, @@ -56,7 +56,7 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, int init_ok=0; if (world_me==0) - init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, + init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, host_rho0, host_soundspeed, host_B, dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -75,7 +75,7 @@ int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, + init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, host_rho0, host_soundspeed, host_B, dimension, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); @@ -98,36 +98,30 @@ void sph_taitwater_gpu_clear() { int ** sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, + double *subhi, tagint *host_tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + double **host_v) { return SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, sublo, - subhi, tag, nspecial, special, eflag, vflag, eatom, + subhi, host_tag, nspecial, special, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, - host_v, dtinvsqrt, seed, timestep, boxlo, prd); + host_v); } void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, - double **host_v, const int nlocal, double *boxlo, double *prd) { - double dtinvsqrt = 1.0; - int seed = 0; - int timestep = 0; + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, - tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); + host_tag, host_v, nlocal); } -void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass) { - SPHTaitwaterMF.get_extra_data(host_rho, host_mass); +void sph_taitwater_gpu_get_extra_data(double *host_rho) { + SPHTaitwaterMF.get_extra_data(host_rho); } void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr) { diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp index 33366f09a6..942a3c33bd 100644 --- a/src/GPU/pair_sph_lj_gpu.cpp +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -35,24 +35,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, const int dimension, double *special_lj, + double **host_viscosity, double* host_mass, + const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void sph_lj_gpu_clear(); -int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double **host_v, - double *boxlo, double *prd); -void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, double **host_v, - const int nlocal, double *boxlo, double *prd); +int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal); void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, - double *host_cv, double *host_mass); + double *host_cv); void sph_lj_gpu_update_drhoE(void **drhoE_ptr); double sph_lj_gpu_bytes(); @@ -92,8 +95,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) double *rho = atom->rho; double *esph = atom->esph; double *cv = atom->cv; - double *mass = atom->mass; - sph_lj_gpu_get_extra_data(rho, esph, cv, mass); + sph_lj_gpu_get_extra_data(rho, esph, cv); if (gpu_mode != GPU_FORCE) { double sublo[3], subhi[3]; @@ -112,7 +114,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, - cpu_time, success, atom->v, domain->boxlo, domain->prd); + cpu_time, success, atom->v); } else { inum = list->inum; ilist = list->ilist; @@ -121,7 +123,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag) sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->tag, atom->v, atom->nlocal, domain->boxlo, domain->prd); + atom->tag, atom->v, atom->nlocal); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); @@ -182,8 +184,9 @@ void PairSPHLJGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, domain->dimension, - force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, + sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, atom->mass, + domain->dimension, force->special_lj, atom->nlocal, + atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success, error, world); diff --git a/src/GPU/pair_sph_taitwater_gpu.cpp b/src/GPU/pair_sph_taitwater_gpu.cpp index b98a89c3ff..37a1b0feb5 100644 --- a/src/GPU/pair_sph_taitwater_gpu.cpp +++ b/src/GPU/pair_sph_taitwater_gpu.cpp @@ -35,24 +35,26 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, - double **host_viscosity, double* host_rho0, + double **host_viscosity, double* host_mass, double* host_rho0, double* host_soundspeed, double* host_B, const int dimension, double *special_lj, const int inum, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void sph_taitwater_gpu_clear(); -int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double **host_v, - double *boxlo, double *prd); -void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, - int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, tagint *tag, double **host_v, - const int nlocal, double *boxlo, double *prd); -void sph_taitwater_gpu_get_extra_data(double *host_rho, double *host_mass); +int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const int nlocal); +void sph_taitwater_gpu_get_extra_data(double *host_rho); void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr); double sph_taitwater_gpu_bytes(); @@ -90,8 +92,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) int *ilist, *numneigh, **firstneigh; double *rho = atom->rho; - double *mass = atom->mass; - sph_taitwater_gpu_get_extra_data(rho, mass); + sph_taitwater_gpu_get_extra_data(rho); if (gpu_mode != GPU_FORCE) { double sublo[3], subhi[3]; @@ -109,7 +110,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) firstneigh = sph_taitwater_gpu_compute_n( neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, - cpu_time, success, atom->v, domain->boxlo, domain->prd); + cpu_time, success, atom->v); } else { inum = list->inum; ilist = list->ilist; @@ -117,7 +118,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag) firstneigh = list->firstneigh; sph_taitwater_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, - atom->tag, atom->v, atom->nlocal, domain->boxlo, domain->prd); + atom->tag, atom->v, atom->nlocal); } if (!success) error->one(FLERR, "Insufficient memory on accelerator"); @@ -178,7 +179,7 @@ void PairSPHTaitwaterGPU::init_style() if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; int mnf = 5e-2 * neighbor->oneatom; int success = - sph_taitwater_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, + sph_taitwater_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, atom->mass, rho0, soundspeed, B, domain->dimension, force->special_lj, atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen);