diff --git a/lib/gpu/lal_edpd.cpp b/lib/gpu/lal_edpd.cpp new file mode 100644 index 0000000000..2a32d01abb --- /dev/null +++ b/lib/gpu/lal_edpd.cpp @@ -0,0 +1,295 @@ +/*************************************************************************** + edpd.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the edpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "edpd_cl.h" +#elif defined(USE_CUDART) +const char *edpd=0; +#else +#include "edpd_cubin.h" +#endif + +#include "lal_edpd.h" +#include +namespace LAMMPS_AL { +#define EDPDT EDPD + +extern Device device; + +template +EDPDT::EDPD() : BaseDPD(), _allocated(false) { + _max_q_size = 0; +} + +template +EDPDT::~EDPD() { + clear(); +} + +template +int EDPDT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int EDPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_cut, + double **host_power, double **host_kappa, + double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *host_special_lj, + const int power_flag, const int kappa_flag, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // T and cv + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,edpd,"k_edpd",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma, + host_cut); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_power,host_kappa, + host_powerT,host_cutT); + + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + + if (host_sc) { + UCL_H_Vec dview(lj_types*lj_types,*(this->ucl_device),UCL_WRITE_ONLY);; + sc.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + int n = 0; + for (int i = 1; i < ntypes; i++) + for (int j = 1; j < ntypes; j++) { + dview[n].x = host_sc[i][j][0]; + dview[n].y = host_sc[i][j][1]; + dview[n].z = host_sc[i][j][2]; + dview[n].w = host_sc[i][j][3]; + n++; + } + ucl_copy(sc,dview,false); + } + + if (host_kc) { + UCL_H_Vec dview(lj_types*lj_types,*(this->ucl_device),UCL_WRITE_ONLY);; + kc.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + int n = 0; + for (int i = 1; i < ntypes; i++) + for (int j = 1; j < ntypes; j++) { + dview[n].x = host_kc[i][j][0]; + dview[n].y = host_kc[i][j][1]; + dview[n].z = host_kc[i][j][2]; + dview[n].w = host_kc[i][j][3]; + n++; + } + ucl_copy(kc,dview,false); + } + + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); + + double special_sqrt[4]; + special_sqrt[0] = sqrt(host_special_lj[0]); + special_sqrt[1] = sqrt(host_special_lj[1]); + special_sqrt[2] = sqrt(host_special_lj[2]); + special_sqrt[3] = sqrt(host_special_lj[3]); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(special_sqrt,4,*(this->ucl_device)); + ucl_copy(sp_sqrt,dview,false); + + _power_flag = power_flag; + _kappa_flag = kappa_flag; + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_q_size=static_cast(static_cast(ef_nall)*1.10); + Q.alloc(_max_q_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+Q.row_bytes()+ + sc.row_bytes()+kc.row_bytes()+mass.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes()+sp_sqrt.row_bytes(); + return 0; +} + +template +void EDPDT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + sc.clear(); + kc.clear(); + Q.clear(); + mass.clear(); + cutsq.clear(); + sp_lj.clear(); + sp_sqrt.clear(); + this->clear_atomic(); +} + +template +double EDPDT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(EDPD); +} + +template +void EDPDT::update_flux(void **flux_ptr) { + *flux_ptr=Q.host.begin(); + Q.update_host(_max_q_size,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int EDPDT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize Q array if necessary + if (nall > _max_q_size) { + _max_q_size=static_cast(static_cast(nall)*1.10); + Q.resize(_max_q_size); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = edpd_temp[i]; + v.y = edpd_cv[i]; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &mass, + &sc, &kc, &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &Q, &eflag, &vflag, + &_power_flag, &_kappa_flag, &ainum, &nbor_pitch, + &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &mass, + &sc, &kc, &_lj_types, &sp_lj, &sp_sqrt, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &Q, &eflag, &vflag, + &_power_flag, &_kappa_flag, &ainum, &nbor_pitch, + &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +template +void EDPDT::update_coeff(int ntypes, double **host_a0, double **host_gamma, + double **host_sigma, double **host_cut) +{ + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + this->atom->type_pack4(ntypes,_lj_types,coeff,host_write,host_a0,host_gamma, + host_sigma,host_cut); +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void EDPDT::get_extra_data(double *host_T, double *host_cv) { + edpd_temp = host_T; + edpd_cv = host_cv; +} + +template class EDPD; +} diff --git a/lib/gpu/lal_edpd.cu b/lib/gpu/lal_edpd.cu new file mode 100644 index 0000000000..9662d15aea --- /dev/null +++ b/lib/gpu/lal_edpd.cu @@ -0,0 +1,619 @@ +// ************************************************************************** +// edpd.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the edpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#define EPSILON (numtyp)1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 /* Full period 32 bit LCG */ +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add /* Prime period 3666320093 */ +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = ucl_sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_heatflux(Qi, ii, inum, tid, t_per_atom, offset, Q) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, Qi); \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add1(t_per_atom,Qi); \ + } \ + if (offset==0 && ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp coeff2x = coeff2[mtype].x; //power[itype][jtype] + numtyp coeff2y = coeff2[mtype].y; //kappa[itype][jtype] + numtyp coeff2z = coeff2[mtype].z; //powerT[itype][jtype] + numtyp coeff2w = coeff2[mtype].w; //cutT[itype][jtype] + numtyp power_d = coeff2x; + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += sc[mtype].x*T_pow.x + sc[mtype].y*T_pow.y + + sc[mtype].z*T_pow.z + sc[mtype].w*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow(wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ * wr *randnum * dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kc[mtype].x*T_pow.x + kc[mtype].y*T_pow.y + + kc[mtype].z*T_pow.z + kc[mtype].w*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti)/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + store_heatflux(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + +__kernel void k_edpd_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict mass, + const __global numtyp4 *restrict sc_in, + const __global numtyp4 *restrict kc_in, + const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_sqrt_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict Q, + const int eflag, const int vflag, + const int power_flag, const int kappa_flag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 sc[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 kc[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + __local numtyp sp_sqrt[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + sp_sqrt[tid]=sp_sqrt_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp power_d = coeff2x; // power[itype][jtype] + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += scx*T_pow.x + scy*T_pow.y + scz*T_pow.z + scw*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow((numtyp)wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ* wr *randnum * dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force *= rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kcx*T_pow.x + kcy*T_pow.y + kcz*T_pow.z + kcw*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti )/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + + } + } // for nbor + } // if ii + + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); + store_heatflux(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + diff --git a/lib/gpu/lal_edpd.h b/lib/gpu/lal_edpd.h new file mode 100644 index 0000000000..b7935bf2f4 --- /dev/null +++ b/lib/gpu/lal_edpd.h @@ -0,0 +1,106 @@ +/*************************************************************************** + edpd.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the dpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_DPD_H +#define LAL_DPD_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class EDPD : public BaseDPD { + public: + EDPD(); + ~EDPD(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_a0, + double **host_gamma, double **host_cut, double **host_power, + double **host_kappa, double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *host_special_lj, const int power_flag, const int kappa_flag, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + /// Update coeff if needed (tstat only) + void update_coeff(int ntypes, double **host_a0, double **host_gamma, + double **host_sigma, double **host_cut); + + void get_extra_data(double *host_T, double *host_cv); + + /// copy Q (flux) from device to host + void update_flux(void **flux_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = a0, coeff.y = gamma, coeff.z = cut + UCL_D_Vec coeff; + /// coeff2.x = power, coeff2.y = kappa, coeff2.z = powerT, coeff2.w = cutT + UCL_D_Vec coeff2; + + UCL_D_Vec kc, sc; + UCL_D_Vec cutsq; + + /// per-type array + UCL_D_Vec mass; + + /// Special LJ values + UCL_D_Vec sp_lj, sp_sqrt; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector Q; + int _max_q_size; + + int _power_flag, _kappa_flag; + + /// pointer to host data + double *edpd_temp, *edpd_cv; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_edpd_ext.cpp b/lib/gpu/lal_edpd_ext.cpp new file mode 100644 index 0000000000..a9f60c3941 --- /dev/null +++ b/lib/gpu/lal_edpd_ext.cpp @@ -0,0 +1,142 @@ +/*************************************************************************** + edpd_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to edpd acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_edpd.h" + +using namespace std; +using namespace LAMMPS_AL; + +static EDPD EDPDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int edpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, + double **host_gamma, double **host_cut, double **host_power, + double **host_kappa, double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *special_lj, const int power_flag, const int kappa_flag, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + EDPDMF.clear(); + gpu_mode=EDPDMF.device->gpu_mode(); + double gpu_split=EDPDMF.device->particle_split(); + int first_gpu=EDPDMF.device->first_device(); + int last_gpu=EDPDMF.device->last_device(); + int world_me=EDPDMF.device->world_me(); + int gpu_rank=EDPDMF.device->gpu_rank(); + int procs_per_gpu=EDPDMF.device->procs_per_gpu(); + + EDPDMF.device->init_message(screen,"edpd",first_gpu,last_gpu); + + bool message=false; + if (EDPDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=EDPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_cut, + host_power, host_kappa, host_powerT, + host_cutT, host_sc, host_kc, host_mass, + special_lj, power_flag, kappa_flag, + inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + EDPDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + EDPDMF.estimate_gpu_overhead(); + return init_ok; +} + +void edpd_gpu_clear() { + EDPDMF.clear(); +} + +int ** edpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return EDPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void edpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + EDPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void edpd_gpu_get_extra_data(double *host_T, double *host_cv) { + EDPDMF.get_extra_data(host_T, host_cv); +} + +void edpd_gpu_update_flux(void **flux_ptr) { + EDPDMF.update_flux(flux_ptr); +} + +double edpd_gpu_bytes() { + return EDPDMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_mdpd.cpp b/lib/gpu/lal_mdpd.cpp new file mode 100644 index 0000000000..16cf926df8 --- /dev/null +++ b/lib/gpu/lal_mdpd.cpp @@ -0,0 +1,218 @@ +/*************************************************************************** + mdpd.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the mdpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "mdpd_cl.h" +#elif defined(USE_CUDART) +const char *mdpd=0; +#else +#include "mdpd_cubin.h" +#endif + +#include "lal_mdpd.h" +#include +namespace LAMMPS_AL { +#define MDPDT MDPD + +extern Device device; + +template +MDPDT::MDPD() : BaseDPD(), _allocated(false) { +} + +template +MDPDT::~MDPD() { + clear(); +} + +template +int MDPDT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int MDPDT::init(const int ntypes, + double **host_cutsq, double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, + double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,mdpd,"k_mdpd",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_A_att,host_B_rep, + host_gamma,host_sigma); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_cut,host_cut_r, + host_cutsq); + + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); + + double special_sqrt[4]; + special_sqrt[0] = sqrt(host_special_lj[0]); + special_sqrt[1] = sqrt(host_special_lj[1]); + special_sqrt[2] = sqrt(host_special_lj[2]); + special_sqrt[3] = sqrt(host_special_lj[3]); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(special_sqrt,4,*(this->ucl_device)); + ucl_copy(sp_sqrt,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes()+sp_sqrt.row_bytes(); + return 0; +} + +template +void MDPDT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + cutsq.clear(); + sp_lj.clear(); + sp_sqrt.clear(); + this->clear_atomic(); +} + +template +double MDPDT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(MDPD); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int MDPDT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = mdpd_rho[i]; + v.y = 0; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &_lj_types, &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void MDPDT::get_extra_data(double *host_rho) { + mdpd_rho = host_rho; +} + +template class MDPD; +} diff --git a/lib/gpu/lal_mdpd.cu b/lib/gpu/lal_mdpd.cu new file mode 100644 index 0000000000..6230cb2496 --- /dev/null +++ b/lib/gpu/lal_mdpd.cu @@ -0,0 +1,475 @@ +// ************************************************************************** +// mdpd.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the mdpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : December 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#define EPSILON (numtyp)1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 /* Full period 32 bit LCG */ +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add /* Prime period 3666320093 */ +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = ucl_sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) < (B) ? (B) : (A)) + +// coeff.x = A_att, coeff.y = B_rep, coeff.z = gamma, coeff.w = sigma +// coeff2.x = cut, coeff2.y = cut_r, coeff2.z = cutsq + +__kernel void k_mdpd(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global numtyp *restrict sp_sqrt, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_pair(); + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = A_att * wc + B_rep*(rhoi+rhoj)*wc_r + // drag force = -gamma * wr^2 * (delx dot delv) / r + // random force = sigma * wr * rnd * dtinvsqrt; + + numtyp force = A_attij*wc + B_repij*(rhoi+rhoj)*wc_r; + force -= gammaij*wr*wr*dot*rinv; + force += sigmaij*wr*randnum*dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + // unshifted eng of conservative term: + // eng shifted to 0.0 at cutoff + numtyp e = (numtyp)0.5*A_attij*cutij * wr*wr + (numtyp)0.5*B_repij*cut_rij*(rhoi+rhoj)*wc_r*wc_r; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); +} + +__kernel void k_mdpd_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_sqrt_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + __local numtyp sp_sqrt[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + sp_sqrt[tid]=sp_sqrt_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = A_att * wc + B_rep*(rhoi+rhoj)*wc_r + // drag force = -gamma * wr^2 * (delx dot delv) / r + // random force = sigma * wr * rnd * dtinvsqrt; + + numtyp force = A_attij*wc + B_repij*(rhoi+rhoj)*wc_r; + force -= gammaij*wr*wr*dot*rinv; + force += sigmaij*wr*randnum*dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force*=rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + // unshifted eng of conservative term: + // eng shifted to 0.0 at cutoff + numtyp e = (numtyp)0.5*A_attij*cutij * wr*wr + (numtyp)0.5*B_repij*cut_rij*(rhoi+rhoj)*wc_r*wc_r; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); +} + diff --git a/lib/gpu/lal_mdpd.h b/lib/gpu/lal_mdpd.h new file mode 100644 index 0000000000..ca702dba73 --- /dev/null +++ b/lib/gpu/lal_mdpd.h @@ -0,0 +1,88 @@ +/*************************************************************************** + mdpd.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the mdpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_DPD_H +#define LAL_DPD_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class MDPD : public BaseDPD { + public: + MDPD(); + ~MDPD(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = A_att, coeff.x = B_rep, coeff.z = gamma, coeff.w = sigma + UCL_D_Vec coeff; + /// coeff2.x = cut, coeff2.y = cut_r, coeff2.z = cutsq + UCL_D_Vec coeff2; + + UCL_D_Vec cutsq; + + /// Special LJ values + UCL_D_Vec sp_lj, sp_sqrt; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// pointer to host data + double *mdpd_rho; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_mdpd_ext.cpp b/lib/gpu/lal_mdpd_ext.cpp new file mode 100644 index 0000000000..def6adb1f6 --- /dev/null +++ b/lib/gpu/lal_mdpd_ext.cpp @@ -0,0 +1,133 @@ +/*************************************************************************** + mdpd_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to mdpd acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_mdpd.h" + +using namespace std; +using namespace LAMMPS_AL; + +static MDPD MDPDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int mdpd_gpu_init(const int ntypes, double **cutsq, + double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, + double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + MDPDMF.clear(); + gpu_mode=MDPDMF.device->gpu_mode(); + double gpu_split=MDPDMF.device->particle_split(); + int first_gpu=MDPDMF.device->first_device(); + int last_gpu=MDPDMF.device->last_device(); + int world_me=MDPDMF.device->world_me(); + int gpu_rank=MDPDMF.device->gpu_rank(); + int procs_per_gpu=MDPDMF.device->procs_per_gpu(); + + MDPDMF.device->init_message(screen,"mdpd",first_gpu,last_gpu); + + bool message=false; + if (MDPDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=MDPDMF.init(ntypes, cutsq, host_A_att, host_B_rep, host_gamma, host_sigma, + host_cut, host_cut_r, special_lj, inum, nall, max_nbors, + maxspecial, cell_size, gpu_split, screen); + + MDPDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + MDPDMF.estimate_gpu_overhead(); + return init_ok; +} + +void mdpd_gpu_clear() { + MDPDMF.clear(); +} + +int ** mdpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return MDPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void mdpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + MDPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void mdpd_gpu_get_extra_data(double *host_rho) { + MDPDMF.get_extra_data(host_rho); +} + +double mdpd_gpu_bytes() { + return MDPDMF.host_memory_usage(); +} + + diff --git a/src/GPU/pair_edpd_gpu.cpp b/src/GPU/pair_edpd_gpu.cpp new file mode 100644 index 0000000000..5bee0cadb8 --- /dev/null +++ b/src/GPU/pair_edpd_gpu.cpp @@ -0,0 +1,195 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_edpd_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int edpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma, + double **host_cut, double **host_power, double **host_kappa, + double **host_powerT, double** host_cutT, double*** host_sc, double ***host_kc, + double *host_mass, double *special_lj, const int power_flag, const int kappa_flag, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); +void edpd_gpu_clear(); +int **edpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, double *boxlo, + double *prd); +void edpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, const int nlocal, + double *boxlo, double *prd); +void edpd_gpu_get_extra_data(double *host_T, double *host_cv); +void edpd_gpu_update_flux(void **flux_ptr); +double edpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +/* ---------------------------------------------------------------------- */ + +PairEDPDGPU::PairEDPDGPU(LAMMPS *lmp) : PairEDPD(lmp), gpu_mode(GPU_FORCE) +{ + flux_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairEDPDGPU::~PairEDPDGPU() +{ + edpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairEDPDGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0 / sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *T = atom->edpd_temp; + double *cv = atom->edpd_cv; + edpd_gpu_get_extra_data(T, cv); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = edpd_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, dtinvsqrt, seed, update->ntimestep, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + edpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->tag, + atom->v, dtinvsqrt, seed, update->ntimestep, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the heat flux from device + + double *Q = atom->edpd_flux; + edpd_gpu_update_flux(&flux_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto flux_ptr = (float *)flux_pinned; + for (int i = 0; i < nlocal; i++) + Q[i] = flux_ptr[i]; + + } else { + auto flux_ptr = (double *)flux_pinned; + for (int i = 0; i < nlocal; i++) + Q[i] = flux_ptr[i]; + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairEDPDGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + edpd_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, cut, power, kappa, + powerT, cutT, sc, kc, atom->mass, force->special_lj, + power_flag, kappa_flag, atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairEDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + edpd_gpu_bytes(); +} diff --git a/src/GPU/pair_edpd_gpu.h b/src/GPU/pair_edpd_gpu.h new file mode 100644 index 0000000000..75495b2ca4 --- /dev/null +++ b/src/GPU/pair_edpd_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(edpd/gpu,PairEDPDGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_EDPD_GPU_H +#define LMP_PAIR_EDPD_GPU_H + +#include "pair_edpd.h" + +namespace LAMMPS_NS { + +class PairEDPDGPU : public PairEDPD { + public: + PairEDPDGPU(LAMMPS *lmp); + ~PairEDPDGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *flux_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_mdpd_gpu.cpp b/src/GPU/pair_mdpd_gpu.cpp new file mode 100644 index 0000000000..bebe1e9736 --- /dev/null +++ b/src/GPU/pair_mdpd_gpu.cpp @@ -0,0 +1,171 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_mdpd_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int mdpd_gpu_init(const int ntypes, double **cutsq, double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, double **host_cut, double **host_cut_r, + double *special_lj, const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); +void mdpd_gpu_clear(); +int **mdpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, double *boxlo, + double *prd); +void mdpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, const int nlocal, + double *boxlo, double *prd); +void mdpd_gpu_get_extra_data(double *host_rho); +double mdpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +/* ---------------------------------------------------------------------- */ + +PairMDPDGPU::PairMDPDGPU(LAMMPS *lmp) : PairMDPD(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairMDPDGPU::~PairMDPDGPU() +{ + mdpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMDPDGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0 / sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + mdpd_gpu_get_extra_data(rho); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = mdpd_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, dtinvsqrt, seed, update->ntimestep, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + mdpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->tag, + atom->v, dtinvsqrt, seed, update->ntimestep, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairMDPDGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + mdpd_gpu_init(atom->ntypes + 1, cutsq, A_att, B_rep, gamma, sigma, + cut, cut_r, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairMDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + mdpd_gpu_bytes(); +} diff --git a/src/GPU/pair_mdpd_gpu.h b/src/GPU/pair_mdpd_gpu.h new file mode 100644 index 0000000000..5f27c4014e --- /dev/null +++ b/src/GPU/pair_mdpd_gpu.h @@ -0,0 +1,45 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(mdpd/gpu,PairMDPDGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_MDPD_GPU_H +#define LMP_PAIR_MDPD_GPU_H + +#include "pair_mdpd.h" + +namespace LAMMPS_NS { + +class PairMDPDGPU : public PairMDPD { + public: + PairMDPDGPU(LAMMPS *lmp); + ~PairMDPDGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif