diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 54b0d2a6a1..7291a1491b 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -47,6 +47,10 @@ action pair_coul_dsf_gpu.cpp action pair_coul_dsf_gpu.h action pair_coul_long_gpu.cpp pair_coul_long.cpp action pair_coul_long_gpu.h pair_coul_long.cpp +action pair_dpd_gpu.cpp +action pair_dpd_gpu.h +action pair_dpd_tstat_gpu.cpp +action pair_dpd_tstat_gpu.h action pair_lj_cut_dipole_cut_gpu.cpp pair_lj_cut_dipole_cut.cpp action pair_lj_cut_dipole_cut_gpu.h pair_lj_cut_dipole_cut.cpp action pair_lj_sf_dipole_sf_gpu.cpp pair_lj_sf_dipole_sf.cpp diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 66bb72846d..a386067289 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -46,7 +46,7 @@ extern double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul); static const char cite_gpu_package[] = - "GPU package (short-range and long-range):\n\n" + "GPU package (short-range, long-range and three-body potentials):\n\n" "@Article{Brown11,\n" " author = {W. M. Brown, P. Wang, S. J. Plimpton, A. N. Tharrington},\n" " title = {Implementing Molecular Dynamics on Hybrid High Performance Computers - Short Range Forces},\n" @@ -62,6 +62,14 @@ static const char cite_gpu_package[] = " year = 2012,\n" " volume = 183,\n" " pages = {449--459}\n" + "}\n\n" + "@Article{Brown13,\n" + " author = {W. M. Brown, Y. Masako},\n" + " title = {Implementing Molecular Dynamics on Hybrid High Performance Computers – Three-Body Potentials},\n" + " journal = {Comp.~Phys.~Comm.},\n" + " year = 2013,\n" + " volume = 184,\n" + " pages = {2785--2793}\n" "}\n\n"; /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp new file mode 100644 index 0000000000..b08144c4b5 --- /dev/null +++ b/src/GPU/pair_dpd_gpu.cpp @@ -0,0 +1,409 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (ORNL) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_dpd_gpu.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "random_mars.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "gpu_extra.h" + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, bool tstat_only, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void dpd_gpu_clear(); +int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd); +void dpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd); +double dpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 // Full period 32 bit LCG +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add // Prime period 3666320093 +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define numtyp double +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define numtyp double +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define numtyp double +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +/* ---------------------------------------------------------------------- */ + +PairDPDGPU::PairDPDGPU(LAMMPS *lmp) : PairDPD(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairDPDGPU::~PairDPDGPU() +{ + dpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0/sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + inum = atom->nlocal; + firstneigh = dpd_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->v, dtinvsqrt, seed, + update->ntimestep, + domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + dpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, + atom->tag, atom->v, dtinvsqrt, seed, + update->ntimestep, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + if (host_startnewton_pair) + error->all(FLERR,"Cannot use newton pair with dpd/gpu pair style"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i,j); + mcut *= mcut; + if (mcut > maxcut) + maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + int success = dpd_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, + cut, force->special_lj, false, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + dpd_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { + int i,j,ii,jj,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double vxtmp,vytmp,vztmp,delvx,delvy,delvz; + double rsq,r,rinv,dot,wd,randnum,factor_dpd; + int *jlist; + tagint itag,jtag; + + double **x = atom->x; + double **v = atom->v; + double **f = atom->f; + int *type = atom->type; + tagint *tag = atom->tag; + double *special_lj = force->special_lj; + double dtinvsqrt = 1.0/sqrt(update->dt); + int timestep = (int)update->ntimestep; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + vxtmp = v[i][0]; + vytmp = v[i][1]; + vztmp = v[i][2]; + itype = type[i]; + itag = tag[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_dpd = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + jtag = tag[j]; + + if (rsq < cutsq[itype][jtype]) { + r = sqrt(rsq); + if (r < EPSILON) continue; // r can be 0.0 in DPD systems + rinv = 1.0/r; + delvx = vxtmp - v[j][0]; + delvy = vytmp - v[j][1]; + delvz = vztmp - v[j][2]; + dot = delx*delvx + dely*delvy + delz*delvz; + wd = 1.0 - r/cut[itype][jtype]; + + unsigned int tag1=itag, tag2=jtag; + if (tag1 > tag2) { + tag1 = jtag; tag2 = itag; + } + + randnum = 0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = a0 * wd + // drag force = -gamma * wd^2 * (delx dot delv) / r + // random force = sigma * wd * rnd * dtinvsqrt; + + fpair = a0[itype][jtype]*wd; + fpair -= gamma[itype][jtype]*wd*wd*dot*rinv; + fpair += sigma[itype][jtype]*wd*randnum*dtinvsqrt; + fpair *= factor_dpd*rinv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + // unshifted eng of conservative term: + // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); + // eng shifted to 0.0 at cutoff + evdwl = 0.5*a0[itype][jtype]*cut[itype][jtype] * wd*wd; + evdwl *= factor_dpd; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_dpd_gpu.h b/src/GPU/pair_dpd_gpu.h new file mode 100644 index 0000000000..e3e5fc508a --- /dev/null +++ b/src/GPU/pair_dpd_gpu.h @@ -0,0 +1,59 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(dpd/gpu,PairDPDGPU) + +#else + +#ifndef LMP_PAIR_DPD_GPU_H +#define LMP_PAIR_DPD_GPU_H + +#include "pair_dpd.h" + +namespace LAMMPS_NS { + +class PairDPDGPU : public PairDPD { + public: + PairDPDGPU(LAMMPS *lmp); + ~PairDPDGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Cannot use newton pair with lj/cut/gpu pair style + +Self-explanatory. + +*/ diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp new file mode 100644 index 0000000000..b9c8bb905b --- /dev/null +++ b/src/GPU/pair_dpd_tstat_gpu.cpp @@ -0,0 +1,417 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (ORNL) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_dpd_tstat_gpu.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "random_mars.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "gpu_extra.h" + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, bool tstat_only, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void dpd_gpu_clear(); +int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd); +void dpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd); +void dpd_gpu_update_coeff(int ntypes, double **host_a0, double **host_gamma, + double **host_sigma, double **host_cut); +double dpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 // Full period 32 bit LCG +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add // Prime period 3666320093 +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define numtyp double +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define numtyp double +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define numtyp double +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +/* ---------------------------------------------------------------------- */ + +PairDPDTstatGPU::PairDPDTstatGPU(LAMMPS *lmp) : PairDPDTstat(lmp), + gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairDPDTstatGPU::~PairDPDTstatGPU() +{ + dpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDTstatGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + // adjust sigma if target T is changing + + if (t_start != t_stop) { + double delta = update->ntimestep - update->beginstep; + if (delta != 0.0) delta /= update->endstep - update->beginstep; + temperature = t_start + delta * (t_stop-t_start); + double boltz = force->boltz; + for (int i = 1; i <= atom->ntypes; i++) + for (int j = i; j <= atom->ntypes; j++) + sigma[i][j] = sigma[j][i] = sqrt(2.0*boltz*temperature*gamma[i][j]); + + dpd_gpu_update_coeff(atom->ntypes+1, a0, gamma, sigma, cut); + } + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0/sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + inum = atom->nlocal; + firstneigh = dpd_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->v, dtinvsqrt, seed, + update->ntimestep, + domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + dpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, + atom->tag, atom->v, dtinvsqrt, seed, + update->ntimestep, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + if (host_startnewton_pair) + error->all(FLERR,"Cannot use newton pair with dpd/tstat/gpu pair style"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i,j); + mcut *= mcut; + if (mcut > maxcut) + maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + int success = dpd_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, + cut, force->special_lj, true, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairDPDTstatGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + dpd_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairDPDTstatGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { + int i,j,ii,jj,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double vxtmp,vytmp,vztmp,delvx,delvy,delvz; + double rsq,r,rinv,dot,wd,randnum,factor_dpd; + int *jlist; + tagint itag,jtag; + + double **x = atom->x; + double **v = atom->v; + double **f = atom->f; + int *type = atom->type; + tagint *tag = atom->tag; + double *special_lj = force->special_lj; + double dtinvsqrt = 1.0/sqrt(update->dt); + int timestep = (int)update->ntimestep; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + vxtmp = v[i][0]; + vytmp = v[i][1]; + vztmp = v[i][2]; + itype = type[i]; + itag = tag[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_dpd = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + jtag = tag[j]; + + if (rsq < cutsq[itype][jtype]) { + r = sqrt(rsq); + if (r < EPSILON) continue; // r can be 0.0 in DPD systems + rinv = 1.0/r; + delvx = vxtmp - v[j][0]; + delvy = vytmp - v[j][1]; + delvz = vztmp - v[j][2]; + dot = delx*delvx + dely*delvy + delz*delvz; + wd = 1.0 - r/cut[itype][jtype]; + + unsigned int tag1=itag, tag2=jtag; + if (tag1 > tag2) { + tag1 = jtag; tag2 = itag; + } + + randnum = 0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = a0 * wd + // drag force = -gamma * wd^2 * (delx dot delv) / r + // random force = sigma * wd * rnd * dtinvsqrt; + + fpair = -gamma[itype][jtype]*wd*wd*dot*rinv; + fpair += sigma[itype][jtype]*wd*randnum*dtinvsqrt; + fpair *= factor_dpd*rinv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (evflag) ev_tally_full(i,0.0,0.0,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_dpd_tstat_gpu.h b/src/GPU/pair_dpd_tstat_gpu.h new file mode 100644 index 0000000000..ce4f07494e --- /dev/null +++ b/src/GPU/pair_dpd_tstat_gpu.h @@ -0,0 +1,59 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(dpd/tstat/gpu,PairDPDTstatGPU) + +#else + +#ifndef LMP_PAIR_DPD_TSTAT_GPU_H +#define LMP_PAIR_DPD_TSTAT_GPU_H + +#include "pair_dpd_tstat.h" + +namespace LAMMPS_NS { + +class PairDPDTstatGPU : public PairDPDTstat { + public: + PairDPDTstatGPU(LAMMPS *lmp); + ~PairDPDTstatGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Cannot use newton pair with lj/cut/gpu pair style + +Self-explanatory. + +*/